LoopVectorize.cpp (LLVM 13.0.0git)
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
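// As an illustration only (a conceptual sketch, not code from this pass): with
// a vectorization factor (VF) of 4, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// is rewritten so that one 'wide' iteration does the work of four scalar ones,
// with any leftover iterations handled by the scalar epilogue loop:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)
//     a[i .. i+3] = b[i .. i+3] + <1, 1, 1, 1>;  // one SIMD operation
//   for (; i < n; ++i)                           // scalar epilogue
//     a[i] = b[i] + 1;
//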
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 // of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 // widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 // of vectorization. It decides on the optimal vector width, which
26 // can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46 // Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52 // Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
81 #include "llvm/Analysis/CFG.h"
87 #include "llvm/Analysis/LoopInfo.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
134 #include "llvm/Support/MathExtras.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
169  "llvm.loop.vectorize.followup_vectorized";
171  "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
179  "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180  cl::desc("Enable vectorization of epilogue loops."));
181 
183  "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184  cl::desc("When epilogue vectorization is enabled, and a value greater than "
185  "1 is specified, forces the given VF for all applicable epilogue "
186  "loops."));
187 
189  "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190  cl::desc("Only loops with vectorization factor equal to or larger than "
191  "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
196  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197  cl::desc("Loops with a constant trip count that is smaller than this "
198  "value are vectorized only if no scalar iteration overheads "
199  "are incurred."));
200 
202  "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203  cl::desc("The maximum allowed number of runtime memory checks with a "
204  "vectorize(enable) pragma."));
205 
206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
207 // that predication is preferred, and this lists all options. I.e., the
208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
209 // and predicate the instructions accordingly. If tail-folding fails, there are
210 // different fallback strategies depending on these values:
211 namespace PreferPredicateTy {
212  enum Option {
213  ScalarEpilogue = 0,
214  PredicateElseScalarEpilogue,
215  PredicateOrDontVectorize
216  };
217 } // namespace PreferPredicateTy
218 
220  "prefer-predicate-over-epilogue",
222  cl::Hidden,
223  cl::desc("Tail-folding and predication preferences over creating a scalar "
224  "epilogue loop."),
226  "scalar-epilogue",
227  "Don't tail-predicate loops, create scalar epilogue"),
229  "predicate-else-scalar-epilogue",
230  "prefer tail-folding, create scalar epilogue if tail "
231  "folding fails."),
233  "predicate-dont-vectorize",
234  "prefers tail-folding, don't attempt vectorization if "
235  "tail-folding fails.")));
236 
238  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
239  cl::desc("Maximize bandwidth when selecting vectorization factor which "
240  "will be determined by the smallest type in loop."));
241 
243  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245 
246 /// An interleave-group may need masking if it resides in a block that needs
247 /// predication, or in order to mask away gaps.
249  "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
250  cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
251 
253  "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
254  cl::desc("We don't interleave loops with a estimated constant trip count "
255  "below this number"));
256 
258  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259  cl::desc("A flag that overrides the target's number of scalar registers."));
260 
262  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263  cl::desc("A flag that overrides the target's number of vector registers."));
264 
266  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267  cl::desc("A flag that overrides the target's max interleave factor for "
268  "scalar loops."));
269 
271  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272  cl::desc("A flag that overrides the target's max interleave factor for "
273  "vectorized loops."));
274 
276  "force-target-instruction-cost", cl::init(0), cl::Hidden,
277  cl::desc("A flag that overrides the target's expected cost for "
278  "an instruction to a single constant value. Mostly "
279  "useful for getting consistent testing."));
280 
282  "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283  cl::desc(
284  "Pretend that scalable vectors are supported, even if the target does "
285  "not support them. This flag should only be used for testing."));
286 
288  "small-loop-cost", cl::init(20), cl::Hidden,
289  cl::desc(
290  "The cost of a loop that is considered 'small' by the interleaver."));
291 
293  "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294  cl::desc("Enable the use of the block frequency analysis to access PGO "
295  "heuristics minimizing code growth in cold regions and being more "
296  "aggressive in hot regions."));
297 
298 // Runtime interleave loops for load/store throughput.
300  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301  cl::desc(
302  "Enable runtime interleaving until load/store ports are saturated"));
303 
304 /// Interleave small loops with scalar reductions.
306  "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307  cl::desc("Enable interleaving for loops with small iteration counts that "
308  "contain scalar reductions to expose ILP."));
309 
310 /// The number of stores in a loop that are allowed to need predication.
312  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
313  cl::desc("Max number of stores to be predicated behind an if."));
314 
316  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
317  cl::desc("Count the induction variable only once when interleaving"));
318 
320  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
321  cl::desc("Enable if predication of stores during vectorization."));
322 
324  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
325  cl::desc("The maximum interleave count to use when interleaving a scalar "
326  "reduction in a nested loop."));
327 
328 static cl::opt<bool>
329  PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
330  cl::Hidden,
331  cl::desc("Prefer in-loop vector reductions, "
332  "overriding the targets preference."));
333 
335  "enable-strict-reductions", cl::init(false), cl::Hidden,
336  cl::desc("Enable the vectorisation of loops with in-order (strict) "
337  "FP reductions"));
338 
340  "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
341  cl::desc(
342  "Prefer predicating a reduction operation over an after loop select."));
343 
345  "enable-vplan-native-path", cl::init(false), cl::Hidden,
346  cl::desc("Enable VPlan-native vectorization path with "
347  "support for outer loop vectorization."));
348 
349 // FIXME: Remove this switch once we have divergence analysis. Currently we
350 // assume divergent non-backedge branches when this switch is true.
352  "enable-vplan-predication", cl::init(false), cl::Hidden,
353  cl::desc("Enable VPlan-native vectorization path predicator with "
354  "support for outer loop vectorization."));
355 
356 // This flag enables the stress testing of the VPlan H-CFG construction in the
357 // VPlan-native vectorization path. It must be used in conjunction with
358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
359 // verification of the H-CFGs built.
361  "vplan-build-stress-test", cl::init(false), cl::Hidden,
362  cl::desc(
363  "Build VPlan for every supported loop nest in the function and bail "
364  "out right after the build (stress test the VPlan H-CFG construction "
365  "in the VPlan-native vectorization path)."));
366 
368  "interleave-loops", cl::init(true), cl::Hidden,
369  cl::desc("Enable loop interleaving in Loop vectorization passes"));
371  "vectorize-loops", cl::init(true), cl::Hidden,
372  cl::desc("Run the Loop vectorization passes"));
373 
375  "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
376  cl::desc("Use dot format instead of plain text when dumping VPlans"));
377 
378 /// A helper function that returns the type of loaded or stored value.
380  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
381  "Expected Load or Store instruction");
382  if (auto *LI = dyn_cast<LoadInst>(I))
383  return LI->getType();
384  return cast<StoreInst>(I)->getValueOperand()->getType();
385 }
386 
387 /// A helper function that returns true if the given type is irregular. The
388 /// type is irregular if its allocated size doesn't equal the store size of an
389 /// element of the corresponding vector type.
390 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
391  // Determine if an array of N elements of type Ty is "bitcast compatible"
392  // with a <N x Ty> vector.
393  // This is only true if there is no padding between the array elements.
394  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
395 }
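// A sketch of what this returns under a common x86-64 data layout (the exact
// numbers are an assumption about that layout, not something this file
// guarantees):
//
//   hasIrregularType(i32,      DL) -> false  (size 32 bits, alloc size 32 bits)
//   hasIrregularType(i1,       DL) -> true   (size 1 bit,   alloc size 8 bits)
//   hasIrregularType(x86_fp80, DL) -> true   (size 80 bits, alloc size 128 bits)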
396 
397 /// A helper function that returns the reciprocal of the block probability of
398 /// predicated blocks. If we return X, we are assuming the predicated block
399 /// will execute once for every X iterations of the loop header.
400 ///
401 /// TODO: We should use actual block probability here, if available. Currently,
402 /// we always assume predicated blocks have a 50% chance of executing.
403 static unsigned getReciprocalPredBlockProb() { return 2; }
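// A simplified sketch of how this value is used by the cost model (the real
// computation has more inputs): if executing a predicated block costs C per
// loop iteration, the model charges roughly C / getReciprocalPredBlockProb(),
// i.e. C / 2, reflecting the assumption that the block runs on about half of
// the header's iterations.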
404 
405 /// A helper function that returns an integer or floating-point constant with
406 /// value C.
407 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
408  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
409  : ConstantFP::get(Ty, C);
410 }
411 
412 /// Returns "best known" trip count for the specified loop \p L as defined by
413 /// the following procedure:
414 /// 1) Returns exact trip count if it is known.
415 /// 2) Returns expected trip count according to profile data if any.
416 /// 3) Returns upper bound estimate if it is known.
417 /// 4) Returns None if all of the above failed.
419  // Check if exact trip count is known.
420  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
421  return ExpectedTC;
422 
423  // Check if there is an expected trip count available from profile data.
425  if (auto EstimatedTC = getLoopEstimatedTripCount(L))
426  return EstimatedTC;
427 
428  // Check if upper bound estimate is known.
429  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
430  return ExpectedTC;
431 
432  return None;
433 }
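// A hypothetical example of the fallback order above (not taken from this
// file): for a loop "for (i = 0; i < n; ++i)" with unknown 'n', step (1)
// fails; if branch-weight profile data implies roughly 8 iterations per
// entry, step (2) returns 8; failing that, a known range on 'n' may give a
// small constant maximum trip count for step (3); otherwise None is returned.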
434 
435 // Forward declare GeneratedRTChecks.
436 class GeneratedRTChecks;
437 
438 namespace llvm {
439 
440 /// InnerLoopVectorizer vectorizes loops which contain only one basic
441 /// block to a specified vectorization factor (VF).
442 /// This class performs the widening of scalars into vectors, or multiple
443 /// scalars. This class also implements the following features:
444 /// * It inserts an epilogue loop for handling loops that don't have iteration
445 /// counts that are known to be a multiple of the vectorization factor.
446 /// * It handles the code generation for reduction variables.
447 /// * Scalarization (implementation using scalars) of un-vectorizable
448 /// instructions.
449 /// InnerLoopVectorizer does not perform any vectorization-legality
450 /// checks, and relies on the caller to check for the different legality
451 /// aspects. The InnerLoopVectorizer relies on the
452 /// LoopVectorizationLegality class to provide information about the induction
453 /// and reduction variables that were found to a given vectorization factor.
455 public:
458  const TargetLibraryInfo *TLI,
461  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
464  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
465  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
466  Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
467  PSI(PSI), RTChecks(RTChecks) {
468  // Query this against the original loop and save it here because the profile
469  // of the original loop header may change as the transformation happens.
472  }
473 
474  virtual ~InnerLoopVectorizer() = default;
475 
476  /// Create a new empty loop that will contain vectorized instructions later
477  /// on, while the old loop will be used as the scalar remainder. Control flow
478  /// is generated around the vectorized (and scalar epilogue) loops consisting
479  /// of various checks and bypasses. Return the pre-header block of the new
480  /// loop.
481  /// In the case of epilogue vectorization, this function is overridden to
482  /// handle the more complex control flow around the loops.
484 
485  /// Widen a single instruction within the innermost loop.
487  VPTransformState &State);
488 
489  /// Widen a single call instruction within the innermost loop.
490  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
491  VPTransformState &State);
492 
493  /// Widen a single select instruction within the innermost loop.
495  bool InvariantCond, VPTransformState &State);
496 
497  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
498  void fixVectorizedLoop(VPTransformState &State);
499 
500  // Return true if any runtime check is added.
502 
503  /// A type for vectorized values in the new loop. Each value from the
504  /// original loop, when vectorized, is represented by UF vector values in the
505  /// new unrolled loop, where UF is the unroll factor.
507 
508  /// Vectorize a single GetElementPtrInst based on information gathered and
509  /// decisions taken during planning.
510  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
511  unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
512  SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
513 
514  /// Vectorize a single PHINode in a block. This method handles the induction
515  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
516  /// arbitrary length vectors.
518  VPWidenPHIRecipe *PhiR, VPTransformState &State);
519 
520  /// A helper function to scalarize a single Instruction in the innermost loop.
521  /// Generates a sequence of scalar instances for each lane between \p MinLane
522  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
523  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
524  /// Instr's operands.
526  const VPIteration &Instance, bool IfPredicateInstr,
527  VPTransformState &State);
528 
529  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
530  /// is provided, the integer induction variable will first be truncated to
531  /// the corresponding type.
532  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
533  VPValue *Def, VPValue *CastDef,
534  VPTransformState &State);
535 
536  /// Construct the vector value of a scalarized value \p V one lane at a time.
537  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
538  VPTransformState &State);
539 
540  /// Try to vectorize interleaved access group \p Group with the base address
541  /// given in \p Addr, optionally masking the vector operations if \p
542  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
543  /// values in the vectorized loop.
545  ArrayRef<VPValue *> VPDefs,
546  VPTransformState &State, VPValue *Addr,
547  ArrayRef<VPValue *> StoredValues,
548  VPValue *BlockInMask = nullptr);
549 
550  /// Vectorize Load and Store instructions with the base address given in \p
551  /// Addr, optionally masking the vector operations if \p BlockInMask is
552  /// non-null. Use \p State to translate given VPValues to IR values in the
553  /// vectorized loop.
555  VPValue *Def, VPValue *Addr,
556  VPValue *StoredValue, VPValue *BlockInMask);
557 
558  /// Set the debug location in the builder using the debug location in
559  /// the instruction.
560  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
561 
562  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
564 
565  /// Create a broadcast instruction. This method generates a broadcast
566  /// instruction (shuffle) for loop invariant values and for the induction
567  /// value. If this is the induction variable then we extend it to N, N+1, ...
568  /// this is needed because each iteration in the loop corresponds to a SIMD
569  /// element.
570  virtual Value *getBroadcastInstrs(Value *V);
571 
572 protected:
574 
575  /// A small list of PHINodes.
577 
578  /// A type for scalarized values in the new loop. Each value from the
579  /// original loop, when scalarized, is represented by UF x VF scalar values
580  /// in the new unrolled loop, where UF is the unroll factor and VF is the
581  /// vectorization factor.
583 
584  /// Set up the values of the IVs correctly when exiting the vector loop.
585  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
586  Value *CountRoundDown, Value *EndValue,
587  BasicBlock *MiddleBlock);
588 
589  /// Create a new induction variable inside L.
590  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
591  Value *Step, Instruction *DL);
592 
593  /// Handle all cross-iteration phis in the header.
595 
596  /// Fix a first-order recurrence. This is the second phase of vectorizing
597  /// this phi node.
599 
600  /// Fix a reduction cross-iteration phi. This is the second phase of
601  /// vectorizing this phi node.
603 
604  /// Clear NSW/NUW flags from reduction instructions if necessary.
606  VPTransformState &State);
607 
608  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
609  /// means we need to add the appropriate incoming value from the middle
610  /// block as exiting edges from the scalar epilogue loop (if present) are
611  /// already in place, and we exit the vector loop exclusively to the middle
612  /// block.
613  void fixLCSSAPHIs(VPTransformState &State);
614 
615  /// Iteratively sink the scalarized operands of a predicated instruction into
616  /// the block that was created for it.
617  void sinkScalarOperands(Instruction *PredInst);
618 
619  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
620  /// represented as.
622 
623  /// This function adds
624  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
625  /// to each vector element of Val. The sequence starts at StartIndex.
626  /// \p Opcode is relevant for FP induction variable.
627  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
628  Instruction::BinaryOps Opcode =
629  Instruction::BinaryOpsEnd);
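  // A small worked example (illustrative only): for Val = <0, 0, 0, 0>,
  // StartIdx = 2 and Step = 3, the added sequence is
  //   <2*3, 3*3, 4*3, 5*3> = <6, 9, 12, 15>,
  // so the result is <6, 9, 12, 15>. For FP inductions, \p Opcode selects the
  // operation used to combine the sequence with Val.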
630 
631  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
632  /// variable on which to base the steps, \p Step is the size of the step, and
633  /// \p EntryVal is the value from the original loop that maps to the steps.
634  /// Note that \p EntryVal doesn't have to be an induction variable - it
635  /// can also be a truncate instruction.
636  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
638  VPValue *CastDef, VPTransformState &State);
639 
640  /// Create a vector induction phi node based on an existing scalar one. \p
641  /// EntryVal is the value from the original loop that maps to the vector phi
642  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
643  /// truncate instruction, instead of widening the original IV, we widen a
644  /// version of the IV truncated to \p EntryVal's type.
646  Value *Step, Value *Start,
647  Instruction *EntryVal, VPValue *Def,
648  VPValue *CastDef,
649  VPTransformState &State);
650 
651  /// Returns true if an instruction \p I should be scalarized instead of
652  /// vectorized for the chosen vectorization factor.
654 
655  /// Returns true if we should generate a scalar version of \p IV.
656  bool needsScalarInduction(Instruction *IV) const;
657 
658  /// If there is a cast involved in the induction variable \p ID, which should
659  /// be ignored in the vectorized loop body, this function records the
660  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
661  /// cast. We had already proved that the casted Phi is equal to the uncasted
662  /// Phi in the vectorized loop (under a runtime guard), and therefore
663  /// there is no need to vectorize the cast - the same value can be used in the
664  /// vector loop for both the Phi and the cast.
665  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
666  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
667  ///
668  /// \p EntryVal is the value from the original loop that maps to the vector
669  /// phi node and is used to distinguish what is the IV currently being
670  /// processed - original one (if \p EntryVal is a phi corresponding to the
671  /// original IV) or the "newly-created" one based on the proof mentioned above
672  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
673  /// latter case \p EntryVal is a TruncInst and we must not record anything for
674  /// that IV, but it's error-prone to expect callers of this routine to care
675  /// about that, hence this explicit parameter.
677  const InductionDescriptor &ID, const Instruction *EntryVal,
678  Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
679  unsigned Part, unsigned Lane = UINT_MAX);
680 
681  /// Generate a shuffle sequence that will reverse the vector Vec.
682  virtual Value *reverseVector(Value *Vec);
683 
684  /// Returns (and creates if needed) the original loop trip count.
685  Value *getOrCreateTripCount(Loop *NewLoop);
686 
687  /// Returns (and creates if needed) the trip count of the widened loop.
689 
690  /// Returns a bitcasted value to the requested vector type.
691  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
693  const DataLayout &DL);
694 
695  /// Emit a bypass check to see if the vector trip count is zero, including if
696  /// it overflows.
698 
699  /// Emit a bypass check to see if all of the SCEV assumptions we've
700  /// had to make are correct. Returns the block containing the checks or
701  /// nullptr if no checks have been added.
703 
704  /// Emit bypass checks to check any memory assumptions we may have made.
705  /// Returns the block containing the checks or nullptr if no checks have been
706  /// added.
708 
709  /// Compute the transformed value of Index at offset StartValue using step
710  /// StepValue.
711  /// For integer induction, returns StartValue + Index * StepValue.
712  /// For pointer induction, returns StartValue[Index * StepValue].
713  /// FIXME: The newly created binary instructions should contain nsw/nuw
714  /// flags, which can be found from the original scalar operations.
716  const DataLayout &DL,
717  const InductionDescriptor &ID) const;
718 
719  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
720  /// vector loop preheader, middle block and scalar preheader. Also
721  /// allocate a loop object for the new vector loop and return it.
723 
724  /// Create new phi nodes for the induction variables to resume iteration count
725  /// in the scalar epilogue, from where the vectorized loop left off (given by
726  /// \p VectorTripCount).
727  /// In cases where the loop skeleton is more complicated (eg. epilogue
728  /// vectorization) and the resume values can come from an additional bypass
729  /// block, the \p AdditionalBypass pair provides information about the bypass
730  /// block and the end value on the edge from bypass to this loop.
733  std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
734 
735  /// Complete the loop skeleton by adding debug MDs, creating appropriate
736  /// conditional branches in the middle block, preparing the builder and
737  /// running the verifier. Take in the vector loop \p L as argument, and return
738  /// the preheader of the completed vector loop.
739  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
740 
741  /// Add additional metadata to \p To that was not present on \p Orig.
742  ///
743  /// Currently this is used to add the noalias annotations based on the
744  /// inserted memchecks. Use this for instructions that are *cloned* into the
745  /// vector loop.
746  void addNewMetadata(Instruction *To, const Instruction *Orig);
747 
748  /// Add metadata from one instruction to another.
749  ///
750  /// This includes both the original MDs from \p From and additional ones (\see
751  /// addNewMetadata). Use this for *newly created* instructions in the vector
752  /// loop.
754 
755  /// Similar to the previous function but it adds the metadata to a
756  /// vector of instructions.
758 
759  /// Allow subclasses to override and print debug traces before/after vplan
760  /// execution, when trace information is requested.
761  virtual void printDebugTracesAtStart(){};
762  virtual void printDebugTracesAtEnd(){};
763 
764  /// The original loop.
765  Loop *OrigLoop;
766 
767  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
768  /// dynamic knowledge to simplify SCEV expressions and converts them to a
769  /// more usable form.
771 
772  /// Loop Info.
774 
775  /// Dominator Tree.
777 
778  /// Alias Analysis.
780 
781  /// Target Library Info.
783 
784  /// Target Transform Info.
786 
787  /// Assumption Cache.
789 
790  /// Interface to emit optimization remarks.
792 
793  /// LoopVersioning. It's only set up (non-null) if memchecks were
794  /// used.
795  ///
796  /// This is currently only used to add no-alias metadata based on the
797  /// memchecks. The actually versioning is performed manually.
798  std::unique_ptr<LoopVersioning> LVer;
799 
800  /// The vectorization SIMD factor to use. Each vector will have this many
801  /// vector elements.
803 
804  /// The vectorization unroll factor to use. Each scalar is vectorized to this
805  /// many different vector instructions.
806  unsigned UF;
807 
808  /// The builder that we use
810 
811  // --- Vectorization state ---
812 
813  /// The vector-loop preheader.
815 
816  /// The scalar-loop preheader.
818 
819  /// Middle Block between the vector and the scalar.
821 
822  /// The (unique) ExitBlock of the scalar loop. Note that
823  /// there can be multiple exiting edges reaching this block.
825 
826  /// The vector loop body.
828 
829  /// The scalar loop body.
831 
832  /// A list of all bypass blocks. The first block is the entry of the loop.
834 
835  /// The new Induction variable which was added to the new block.
836  PHINode *Induction = nullptr;
837 
838  /// The induction variable of the old basic block.
839  PHINode *OldInduction = nullptr;
840 
841  /// Store instructions that were predicated.
843 
844  /// Trip count of the original loop.
845  Value *TripCount = nullptr;
846 
847  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
848  Value *VectorTripCount = nullptr;
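  // For example (made-up numbers): with TripCount = 103, VF = 4 and UF = 2,
  // the vector loop covers VectorTripCount = 103 - (103 % 8) = 96 iterations
  // of the original loop, and the remaining 7 iterations run in the scalar
  // epilogue.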
849 
850  /// The legality analysis.
852 
854  /// The profitability analysis.
855 
856  // Record whether runtime checks are added.
857  bool AddedSafetyChecks = false;
858 
859  // Holds the end values for each induction variable. We save the end values
860  // so we can later fix-up the external users of the induction variables.
862 
863  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
864  // fixed up at the end of vector code generation.
866 
867  /// BFI and PSI are used to check for profile guided size optimizations.
870 
871  // Whether this loop should be optimized for size based on profile guided size
872  // optimizations.
874 
875  /// Structure to hold information about generated runtime checks, responsible
876  /// for cleaning the checks, if vectorization turns out unprofitable.
878 };
879 
881 public:
884  const TargetLibraryInfo *TLI,
886  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
891  ElementCount::getFixed(1), UnrollFactor, LVL, CM,
892  BFI, PSI, Check) {}
893 
894 private:
895  Value *getBroadcastInstrs(Value *V) override;
896  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
897  Instruction::BinaryOps Opcode =
898  Instruction::BinaryOpsEnd) override;
899  Value *reverseVector(Value *Vec) override;
900 };
901 
902 /// Encapsulate information regarding vectorization of a loop and its epilogue.
903 /// This information is meant to be updated and used across two stages of
904 /// epilogue vectorization.
907  unsigned MainLoopUF = 0;
909  unsigned EpilogueUF = 0;
914  Value *TripCount = nullptr;
915  Value *VectorTripCount = nullptr;
916 
917  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
918  unsigned EUF)
919  : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
920  EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
921  assert(EUF == 1 &&
922  "A high UF for the epilogue loop is likely not beneficial.");
923  }
924 };
925 
926 /// An extension of the inner loop vectorizer that creates a skeleton for a
927 /// vectorized loop that has its epilogue (residual) also vectorized.
928 /// The idea is to run the vplan on a given loop twice, firstly to set up the
929 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
930 /// from the first step and vectorize the epilogue. This is achieved by
931 /// deriving two concrete strategy classes from this base class and invoking
932 /// them in succession from the loop vectorizer planner.
934 public:
942  GeneratedRTChecks &Checks)
944  EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
945  Checks),
946  EPI(EPI) {}
947 
948  // Override this function to handle the more complex control flow around the
949  // three loops.
952  }
953 
954  /// The interface for creating a vectorized skeleton using one of two
955  /// different strategies, each corresponding to one execution of the vplan
956  /// as described above.
958 
959  /// Holds and updates state information required to vectorize the main loop
960  /// and its epilogue in two separate passes. This setup helps us avoid
961  /// regenerating and recomputing runtime safety checks. It also helps us to
962  /// shorten the iteration-count-check path length for the cases where the
963  /// iteration count of the loop is so small that the main vector loop is
964  /// completely skipped.
966 };
967 
968 /// A specialized derived class of inner loop vectorizer that performs
969 /// vectorization of *main* loops in the process of vectorizing loops and their
970 /// epilogues.
972 public:
982  EPI, LVL, CM, BFI, PSI, Check) {}
983  /// Implements the interface for creating a vectorized skeleton using the
984  /// *main loop* strategy (ie the first pass of vplan execution).
986 
987 protected:
988  /// Emits an iteration count bypass check once for the main loop (when \p
989  /// ForEpilogue is false) and once for the epilogue loop (when \p
990  /// ForEpilogue is true).
992  bool ForEpilogue);
993  void printDebugTracesAtStart() override;
994  void printDebugTracesAtEnd() override;
995 };
996 
997 // A specialized derived class of inner loop vectorizer that performs
998 // vectorization of *epilogue* loops in the process of vectorizing loops and
999 // their epilogues.
1001 public:
1009  GeneratedRTChecks &Checks)
1011  EPI, LVL, CM, BFI, PSI, Checks) {}
1012  /// Implements the interface for creating a vectorized skeleton using the
1013  /// *epilogue loop* strategy (ie the second pass of vplan execution).
1015 
1016 protected:
1017  /// Emits an iteration count bypass check after the main vector loop has
1018  /// finished to see if there are any iterations left to execute by either
1019  /// the vector epilogue or the scalar epilogue.
1020  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1021  BasicBlock *Bypass,
1022  BasicBlock *Insert);
1023  void printDebugTracesAtStart() override;
1024  void printDebugTracesAtEnd() override;
1025 };
1026 } // end namespace llvm
1027 
1028 /// Look for a meaningful debug location on the instruction or its
1029 /// operands.
1031  if (!I)
1032  return I;
1033 
1034  DebugLoc Empty;
1035  if (I->getDebugLoc() != Empty)
1036  return I;
1037 
1038  for (Use &Op : I->operands()) {
1039  if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1040  if (OpInst->getDebugLoc() != Empty)
1041  return OpInst;
1042  }
1043 
1044  return I;
1045 }
1046 
1048  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1049  const DILocation *DIL = Inst->getDebugLoc();
1050  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1051  !isa<DbgInfoIntrinsic>(Inst)) {
1052  assert(!VF.isScalable() && "scalable vectors not yet supported.");
1053  auto NewDIL =
1055  if (NewDIL)
1056  B.SetCurrentDebugLocation(NewDIL.getValue());
1057  else
1058  LLVM_DEBUG(dbgs()
1059  << "Failed to create new discriminator: "
1060  << DIL->getFilename() << " Line: " << DIL->getLine());
1061  }
1062  else
1063  B.SetCurrentDebugLocation(DIL);
1064  } else
1065  B.SetCurrentDebugLocation(DebugLoc());
1066 }
1067 
1068 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1069 /// is passed, the message relates to that particular instruction.
1070 #ifndef NDEBUG
1072  const StringRef DebugMsg,
1073  Instruction *I) {
1074  dbgs() << "LV: " << Prefix << DebugMsg;
1075  if (I != nullptr)
1076  dbgs() << " " << *I;
1077  else
1078  dbgs() << '.';
1079  dbgs() << '\n';
1080 }
1081 #endif
1082 
1083 /// Create an analysis remark that explains why vectorization failed
1084 ///
1085 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
1086 /// RemarkName is the identifier for the remark. If \p I is passed it is an
1087 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
1088 /// the location of the remark. \return the remark object that can be
1089 /// streamed to.
1091  StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1092  Value *CodeRegion = TheLoop->getHeader();
1093  DebugLoc DL = TheLoop->getStartLoc();
1094 
1095  if (I) {
1096  CodeRegion = I->getParent();
1097  // If there is no debug location attached to the instruction, revert back to
1098  // using the loop's.
1099  if (I->getDebugLoc())
1100  DL = I->getDebugLoc();
1101  }
1102 
1103  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1104 }
1105 
1106 /// Return a value for Step multiplied by VF.
1108  assert(isa<ConstantInt>(Step) && "Expected an integer step");
1109  Constant *StepVal = ConstantInt::get(
1110  Step->getType(),
1111  cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1112  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1113 }
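// A sketch of the expected results (assuming the usual ElementCount
// semantics, where a scalable VF of <vscale x 4> has a known minimum of 4):
//   createStepForVF(B, Step = 2, VF = 4)            -> the constant 8
//   createStepForVF(B, Step = 2, VF = <vscale x 4>) -> vscale * 8 (via CreateVScale)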
1114 
1115 namespace llvm {
1116 
1117 /// Return the runtime value for VF.
1120  return VF.isScalable() ? B.CreateVScale(EC) : EC;
1121 }
1122 
1124  const StringRef OREMsg, const StringRef ORETag,
1125  OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1126  Instruction *I) {
1127  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1128  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1129  ORE->emit(
1130  createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1131  << "loop not vectorized: " << OREMsg);
1132 }
1133 
1134 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1135  OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1136  Instruction *I) {
1138  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1139  ORE->emit(
1140  createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1141  << Msg);
1142 }
1143 
1144 } // end namespace llvm
1145 
1146 #ifndef NDEBUG
1147 /// \return string containing a file name and a line # for the given loop.
1148 static std::string getDebugLocString(const Loop *L) {
1149  std::string Result;
1150  if (L) {
1151  raw_string_ostream OS(Result);
1152  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1153  LoopDbgLoc.print(OS);
1154  else
1155  // Just print the module name.
1156  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1157  OS.flush();
1158  }
1159  return Result;
1160 }
1161 #endif
1162 
1164  const Instruction *Orig) {
1165  // If the loop was versioned with memchecks, add the corresponding no-alias
1166  // metadata.
1167  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1168  LVer->annotateInstWithNoAlias(To, Orig);
1169 }
1170 
1172  Instruction *From) {
1173  propagateMetadata(To, From);
1174  addNewMetadata(To, From);
1175 }
1176 
1178  Instruction *From) {
1179  for (Value *V : To) {
1180  if (Instruction *I = dyn_cast<Instruction>(V))
1181  addMetadata(I, From);
1182  }
1183 }
1184 
1185 namespace llvm {
1186 
1187 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1188 // lowered.
1189 enum ScalarEpilogueLowering {
1190 
1191  // The default: allowing scalar epilogues.
1192  CM_ScalarEpilogueAllowed,
1193 
1194  // Vectorization with OptForSize: don't allow epilogues.
1195  CM_ScalarEpilogueNotAllowedOptSize,
1196 
1197  // A special case of vectorization with OptForSize: loops with a very small
1198  // trip count are considered for vectorization under OptForSize, thereby
1199  // making sure the cost of their loop body is dominant, free of runtime
1200  // guards and scalar iteration overheads.
1201  CM_ScalarEpilogueNotAllowedLowTripLoop,
1202 
1203  // Loop hint predicate indicating an epilogue is undesired.
1204  CM_ScalarEpilogueNotNeededUsePredicate,
1205 
1206  // Directive indicating we must either tail fold or not vectorize
1207  CM_ScalarEpilogueNotAllowedUsePredicate
1208 };
1209 
1210 /// LoopVectorizationCostModel - estimates the expected speedups due to
1211 /// vectorization.
1212 /// In many cases vectorization is not profitable. This can happen because of
1213 /// a number of reasons. In this class we mainly attempt to predict the
1214 /// expected speedup/slowdowns due to the supported instruction set. We use the
1215 /// TargetTransformInfo to query the different backends for the cost of
1216 /// different operations.
1218 public:
1222  const TargetTransformInfo &TTI,
1223  const TargetLibraryInfo *TLI, DemandedBits *DB,
1226  const LoopVectorizeHints *Hints,
1227  InterleavedAccessInfo &IAI)
1228  : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1229  TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1230  Hints(Hints), InterleaveInfo(IAI) {}
1231 
1232  /// \return An upper bound for the vectorization factor, or None if
1233  /// vectorization and interleaving should be avoided up front.
1234  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1235 
1236  /// \return True if runtime checks are required for vectorization, and false
1237  /// otherwise.
1238  bool runtimeChecksRequired();
1239 
1240  /// \return The most profitable vectorization factor and the cost of that VF.
1241  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1242  /// then this vectorization factor will be selected if vectorization is
1243  /// possible.
1244  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1246  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1247  const LoopVectorizationPlanner &LVP);
1248 
1249  /// Setup cost-based decisions for user vectorization factor.
1251  collectUniformsAndScalars(UserVF);
1252  collectInstsToScalarize(UserVF);
1253  }
1254 
1255  /// \return The size (in bits) of the smallest and widest types in the code
1256  /// that needs to be vectorized. We ignore values that remain scalar such as
1257  /// 64 bit loop indices.
1258  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1259 
1260  /// \return The desired interleave count.
1261  /// If interleave count has been specified by metadata it will be returned.
1262  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1263  /// are the selected vectorization factor and the cost of the selected VF.
1264  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1265 
1266  /// A memory access instruction may be vectorized in more than one way; the
1267  /// form of the instruction after vectorization depends on its cost.
1268  /// This function takes cost-based decisions for Load/Store instructions
1269  /// and collects them in a map. This decisions map is used for building
1270  /// the lists of loop-uniform and loop-scalar instructions.
1271  /// The calculated cost is saved with widening decision in order to
1272  /// avoid redundant calculations.
1273  void setCostBasedWideningDecision(ElementCount VF);
1274 
1275  /// A struct that represents some properties of the register usage
1276  /// of a loop.
1277  struct RegisterUsage {
1278  /// Holds the number of loop invariant values that are used in the loop.
1279  /// The key is ClassID of target-provided register class.
1281  /// Holds the maximum number of concurrent live intervals in the loop.
1282  /// The key is ClassID of target-provided register class.
1284  };
1285 
1286  /// \return Returns information about the register usages of the loop for the
1287  /// given vectorization factors.
1289  calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1290 
1291  /// Collect values we want to ignore in the cost model.
1292  void collectValuesToIgnore();
1293 
1294  /// Split reductions into those that happen in the loop, and those that happen
1295  /// outside. In loop reductions are collected into InLoopReductionChains.
1296  void collectInLoopReductions();
1297 
1298  /// \returns The smallest bitwidth each instruction can be represented with.
1299  /// The vector equivalents of these instructions should be truncated to this
1300  /// type.
1302  return MinBWs;
1303  }
1304 
1305  /// \returns True if it is more profitable to scalarize instruction \p I for
1306  /// vectorization factor \p VF.
1308  assert(VF.isVector() &&
1309  "Profitable to scalarize relevant only for VF > 1.");
1310 
1311  // Cost model is not run in the VPlan-native path - return conservative
1312  // result until this changes.
1314  return false;
1315 
1316  auto Scalars = InstsToScalarize.find(VF);
1317  assert(Scalars != InstsToScalarize.end() &&
1318  "VF not yet analyzed for scalarization profitability");
1319  return Scalars->second.find(I) != Scalars->second.end();
1320  }
1321 
1322  /// Returns true if \p I is known to be uniform after vectorization.
1324  if (VF.isScalar())
1325  return true;
1326 
1327  // Cost model is not run in the VPlan-native path - return conservative
1328  // result until this changes.
1330  return false;
1331 
1332  auto UniformsPerVF = Uniforms.find(VF);
1333  assert(UniformsPerVF != Uniforms.end() &&
1334  "VF not yet analyzed for uniformity");
1335  return UniformsPerVF->second.count(I);
1336  }
1337 
1338  /// Returns true if \p I is known to be scalar after vectorization.
1340  if (VF.isScalar())
1341  return true;
1342 
1343  // Cost model is not run in the VPlan-native path - return conservative
1344  // result until this changes.
1346  return false;
1347 
1348  auto ScalarsPerVF = Scalars.find(VF);
1349  assert(ScalarsPerVF != Scalars.end() &&
1350  "Scalar values are not calculated for VF");
1351  return ScalarsPerVF->second.count(I);
1352  }
1353 
1354  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1355  /// for vectorization factor \p VF.
1357  return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1358  !isProfitableToScalarize(I, VF) &&
1359  !isScalarAfterVectorization(I, VF);
1360  }
1361 
1362  /// Decision that was taken during cost calculation for memory instruction.
1363  enum InstWidening {
1364  CM_Unknown,
1365  CM_Widen, // For consecutive accesses with stride +1.
1366  CM_Widen_Reverse, // For consecutive accesses with stride -1.
1367  CM_Interleave,
1368  CM_GatherScatter,
1369  CM_Scalarize
1370  };
1371 
1372  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1373  /// instruction \p I and vector width \p VF.
1376  assert(VF.isVector() && "Expected VF >=2");
1377  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1378  }
1379 
1380  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1381  /// interleaving group \p Grp and vector width \p VF.
1385  assert(VF.isVector() && "Expected VF >=2");
1386  /// Broadcast this decision to all instructions inside the group.
1387  /// But the cost will be assigned to one instruction only.
1388  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1389  if (auto *I = Grp->getMember(i)) {
1390  if (Grp->getInsertPos() == I)
1391  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1392  else
1393  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1394  }
1395  }
1396  }
1397 
1398  /// Return the cost model decision for the given instruction \p I and vector
1399  /// width \p VF. Return CM_Unknown if this instruction did not pass
1400  /// through the cost modeling.
1402  assert(VF.isVector() && "Expected VF to be a vector VF");
1403  // Cost model is not run in the VPlan-native path - return conservative
1404  // result until this changes.
1406  return CM_GatherScatter;
1407 
1408  std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1409  auto Itr = WideningDecisions.find(InstOnVF);
1410  if (Itr == WideningDecisions.end())
1411  return CM_Unknown;
1412  return Itr->second.first;
1413  }
1414 
1415  /// Return the vectorization cost for the given instruction \p I and vector
1416  /// width \p VF.
1418  assert(VF.isVector() && "Expected VF >=2");
1419  std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1420  assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1421  "The cost is not calculated");
1422  return WideningDecisions[InstOnVF].second;
1423  }
1424 
1425  /// Return True if instruction \p I is an optimizable truncate whose operand
1426  /// is an induction variable. Such a truncate will be removed by adding a new
1427  /// induction variable with the destination type.
1429  // If the instruction is not a truncate, return false.
1430  auto *Trunc = dyn_cast<TruncInst>(I);
1431  if (!Trunc)
1432  return false;
1433 
1434  // Get the source and destination types of the truncate.
1435  Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1436  Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1437 
1438  // If the truncate is free for the given types, return false. Replacing a
1439  // free truncate with an induction variable would add an induction variable
1440  // update instruction to each iteration of the loop. We exclude from this
1441  // check the primary induction variable since it will need an update
1442  // instruction regardless.
1443  Value *Op = Trunc->getOperand(0);
1444  if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1445  return false;
1446 
1447  // If the truncated value is not an induction variable, return false.
1448  return Legal->isInductionPhi(Op);
1449  }
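  // An illustrative case (hypothetical IR, not from this file): given
  //   %iv    = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  //   %trunc = trunc i64 %iv to i32
  // the truncate is optimizable when %iv is an induction PHI, unless the
  // i64 -> i32 truncate is free for the target and %iv is not the primary
  // induction (in which case widening %trunc directly is already cheap).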
1450 
1451  /// Collects the instructions to scalarize for each predicated instruction in
1452  /// the loop.
1453  void collectInstsToScalarize(ElementCount VF);
1454 
1455  /// Collect Uniform and Scalar values for the given \p VF.
1456  /// The sets depend on CM decision for Load/Store instructions
1457  /// that may be vectorized as interleave, gather-scatter or scalarized.
1459  // Do the analysis once.
1460  if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1461  return;
1462  setCostBasedWideningDecision(VF);
1463  collectLoopUniforms(VF);
1464  collectLoopScalars(VF);
1465  }
1466 
1467  /// Returns true if the target machine supports masked store operation
1468  /// for the given \p DataType and kind of access to \p Ptr.
1469  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1470  return Legal->isConsecutivePtr(Ptr) &&
1471  TTI.isLegalMaskedStore(DataType, Alignment);
1472  }
1473 
1474  /// Returns true if the target machine supports masked load operation
1475  /// for the given \p DataType and kind of access to \p Ptr.
1476  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1477  return Legal->isConsecutivePtr(Ptr) &&
1478  TTI.isLegalMaskedLoad(DataType, Alignment);
1479  }
1480 
1481  /// Returns true if the target machine supports masked scatter operation
1482  /// for the given \p DataType.
1483  bool isLegalMaskedScatter(Type *DataType, Align Alignment) const {
1484  return TTI.isLegalMaskedScatter(DataType, Alignment);
1485  }
1486 
1487  /// Returns true if the target machine supports masked gather operation
1488  /// for the given \p DataType.
1489  bool isLegalMaskedGather(Type *DataType, Align Alignment) const {
1490  return TTI.isLegalMaskedGather(DataType, Alignment);
1491  }
1492 
1493  /// Returns true if the target machine can represent \p V as a masked gather
1494  /// or scatter operation.
1496  bool LI = isa<LoadInst>(V);
1497  bool SI = isa<StoreInst>(V);
1498  if (!LI && !SI)
1499  return false;
1500  auto *Ty = getMemInstValueType(V);
1502  return (LI && isLegalMaskedGather(Ty, Align)) ||
1503  (SI && isLegalMaskedScatter(Ty, Align));
1504  }
1505 
1506  /// Returns true if the target machine supports all of the reduction
1507  /// variables found for the given VF.
1509  return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1510  RecurrenceDescriptor RdxDesc = Reduction.second;
1511  return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1512  }));
1513  }
1514 
1515  /// Returns true if \p I is an instruction that will be scalarized with
1516  /// predication. Such instructions include conditional stores and
1517  /// instructions that may divide by zero.
1518  /// If a non-zero VF has been calculated, we check if \p I will be
1519  /// scalarized with predication for that VF.
1520  bool
1521  isScalarWithPredication(Instruction *I,
1523 
1524  // Returns true if \p I is an instruction that will be predicated either
1525  // through scalar predication or masked load/store or masked gather/scatter.
1526  // Superset of instructions that return true for isScalarWithPredication.
1528  if (!blockNeedsPredication(I->getParent()))
1529  return false;
1530  // Loads and stores that need some form of masked operation are predicated
1531  // instructions.
1532  if (isa<LoadInst>(I) || isa<StoreInst>(I))
1533  return Legal->isMaskRequired(I);
1534  return isScalarWithPredication(I, VF);
1535  }
1536 
1537  /// Returns true if \p I is a memory instruction with consecutive memory
1538  /// access that can be widened.
1539  bool
1540  memoryInstructionCanBeWidened(Instruction *I,
1542 
1543  /// Returns true if \p I is a memory instruction in an interleaved-group
1544  /// of memory accesses that can be vectorized with wide vector loads/stores
1545  /// and shuffles.
1546  bool
1547  interleavedAccessCanBeWidened(Instruction *I,
1549 
1550  /// Check if \p Instr belongs to any interleaved access group.
1552  return InterleaveInfo.isInterleaved(Instr);
1553  }
1554 
1555  /// Get the interleaved access group that \p Instr belongs to.
1558  return InterleaveInfo.getInterleaveGroup(Instr);
1559  }
1560 
1561  /// Returns true if we're required to use a scalar epilogue for at least
1562  /// the final iteration of the original loop.
1563  bool requiresScalarEpilogue() const {
1564  if (!isScalarEpilogueAllowed())
1565  return false;
1566  // If we might exit from anywhere but the latch, must run the exiting
1567  // iteration in scalar form.
1568  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1569  return true;
1570  return InterleaveInfo.requiresScalarEpilogue();
1571  }
1572 
1573  /// Returns true if a scalar epilogue is allowed; it may be disallowed due
1574  /// to optsize or a loop hint annotation.
1576  return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1577  }
1578 
1579  /// Returns true if all loop blocks should be masked to fold the loop tail.
1580  bool foldTailByMasking() const { return FoldTailByMasking; }
1581 
1582  bool blockNeedsPredication(BasicBlock *BB) const {
1583  return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1584  }
1585 
1586  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1587  /// nodes to the chain of instructions representing the reductions. Uses a
1588  /// MapVector to ensure deterministic iteration order.
1589  using ReductionChainMap =
1591 
1592  /// Return the chain of instructions representing an inloop reduction.
1594  return InLoopReductionChains;
1595  }
1596 
1597  /// Returns true if the Phi is part of an inloop reduction.
1598  bool isInLoopReduction(PHINode *Phi) const {
1599  return InLoopReductionChains.count(Phi);
1600  }
1601 
1602  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1603  /// with factor VF. Return the cost of the instruction, including
1604  /// scalarization overhead if it's needed.
1605  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1606 
1607  /// Estimate cost of a call instruction CI if it were vectorized with factor
1608  /// VF. Return the cost of the instruction, including scalarization overhead
1609  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1610  /// scalarized -
1611  /// i.e. either a vector version isn't available, or it is too expensive.
1612  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1613  bool &NeedToScalarize) const;
1614 
1615  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1616  /// that of B.
1617  bool isMoreProfitable(const VectorizationFactor &A,
1618  const VectorizationFactor &B) const;
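
  // Worked example with made-up costs: if expectedCost(VF = 4) were 12 and
  // expectedCost(VF = 8) were 20, the per-lane costs would be 12/4 = 3.0 and
  // 20/8 = 2.5, so the VF = 8 factor would be considered more profitable.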
1619 
1620  /// Invalidates decisions already taken by the cost model.
1622  WideningDecisions.clear();
1623  Uniforms.clear();
1624  Scalars.clear();
1625  }
1626 
1627 private:
1628  unsigned NumPredStores = 0;
1629 
1630  /// \return An upper bound for the vectorization factor, a power-of-2 larger
1631  /// than zero. One is returned if vectorization should best be avoided due
1632  /// to cost.
1633  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1634  ElementCount UserVF);
1635 
1636  /// \return the maximized element count based on the target's vector
1637  /// registers and the loop trip-count, but limited to a maximum safe VF.
1638  /// This is a helper function of computeFeasibleMaxVF.
1639  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1640  /// issue that occurred on one of the buildbots which cannot be reproduced
1641  /// without having access to the proprietary compiler (see comments on
1642  /// D98509). The issue is currently under investigation and this workaround
1643  /// will be removed as soon as possible.
1644  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1645  unsigned SmallestType,
1646  unsigned WidestType,
1647  const ElementCount &MaxSafeVF);
1648 
1649  /// \return the maximum legal scalable VF, based on the safe max number
1650  /// of elements.
1651  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1652 
1653  /// The vectorization cost is a combination of the cost itself and a boolean
1654  /// indicating whether any of the contributing operations will actually
1655  /// operate on vector values after type legalization in the backend. If this
1656  /// latter value is false, then all operations will be scalarized (i.e. no
1657  /// vectorization has actually taken place).
1660  using VectorizationCostTy = std::pair<InstructionCost, bool>;
1661 
1662  /// Returns the expected execution cost. The unit of the cost does
1663  /// not matter because we use the 'cost' units to compare different
1664  /// vector widths. The cost that is returned is *not* normalized by
1665  /// the factor width.
1666  VectorizationCostTy expectedCost(ElementCount VF);
1667 
1668  /// Returns the execution time cost of an instruction for a given vector
1669  /// width. Vector width of one means scalar.
1670  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1671 
1672  /// The cost-computation logic from getInstructionCost which provides
1673  /// the vector type as an output parameter.
1674  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1675  Type *&VectorTy);
1676 
1677  /// Return the cost of instructions in an inloop reduction pattern, if I is
1678  /// part of that pattern.
1679  InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1680  Type *VectorTy,
1682 
1683  /// Calculate vectorization cost of memory instruction \p I.
1684  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1685 
1686  /// The cost computation for scalarized memory instruction.
1687  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1688 
1689  /// The cost computation for interleaving group of memory instructions.
1690  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1691 
1692  /// The cost computation for Gather/Scatter instruction.
1693  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1694 
1695  /// The cost computation for widening instruction \p I with consecutive
1696  /// memory access.
1697  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1698 
1699  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1700  /// Load: scalar load + broadcast.
1701  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1702  /// element)
1703  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
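
  // Illustrative example (not from the source): in
  //
  //   for (int i = 0; i < n; ++i)
  //     B[i] = *p + i;          // p is loop-invariant
  //
  // the load of *p is a uniform memory op, so its cost is modelled as one
  // scalar load plus a broadcast rather than VF scalar loads.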
1704 
1705  /// Estimate the overhead of scalarizing an instruction. This is a
1706  /// convenience wrapper for the type-based getScalarizationOverhead API.
1707  InstructionCost getScalarizationOverhead(Instruction *I,
1708  ElementCount VF) const;
1709 
1710  /// Returns whether the instruction is a load or store and will be emitted
1711  /// as a vector operation.
1712  bool isConsecutiveLoadOrStore(Instruction *I);
1713 
1714  /// Returns true if an artificially high cost for emulated masked memrefs
1715  /// should be used.
1716  bool useEmulatedMaskMemRefHack(Instruction *I);
1717 
1718  /// Map of scalar integer values to the smallest bitwidth they can be legally
1719  /// represented as. The vector equivalents of these values should be truncated
1720  /// to this type.
1722 
1723  /// A type representing the costs for instructions if they were to be
1724  /// scalarized rather than vectorized. The entries are Instruction-Cost
1725  /// pairs.
1726  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1727 
1728  /// A set containing all BasicBlocks that are known to be present after
1729  /// vectorization as predicated blocks.
1730  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1731 
1732  /// Records whether it is allowed to have the original scalar loop execute at
1733  /// least once. This may be needed as a fallback loop in case runtime
1734  /// aliasing/dependence checks fail, or to handle the tail/remainder
1735  /// iterations when the trip count is unknown or isn't a multiple of the VF,
1736  /// or as a peel-loop to handle gaps in interleave-groups.
1737  /// Under optsize and when the trip count is very small we don't allow any
1738  /// iterations to execute in the scalar loop.
1739  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1740 
1741  /// All blocks of loop are to be masked to fold tail of scalar iterations.
1742  bool FoldTailByMasking = false;
1743 
1744  /// A map holding scalar costs for different vectorization factors. The
1745  /// presence of a cost for an instruction in the mapping indicates that the
1746  /// instruction will be scalarized when vectorizing with the associated
1747  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1748  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1749 
1750  /// Holds the instructions known to be uniform after vectorization.
1751  /// The data is collected per VF.
1753 
1754  /// Holds the instructions known to be scalar after vectorization.
1755  /// The data is collected per VF.
1757 
1758  /// Holds the instructions (address computations) that are forced to be
1759  /// scalarized.
1761 
1762  /// PHINodes of the reductions that should be expanded in-loop along with
1763  /// their associated chains of reduction operations, in program order from top
1764  /// (PHI) to bottom.
1765  ReductionChainMap InLoopReductionChains;
1766 
1767  /// A Map of inloop reduction operations and their immediate chain operand.
1768  /// FIXME: This can be removed once reductions can be costed correctly in
1769  /// vplan. This was added to allow quick lookup to the inloop operations,
1770  /// without having to loop through InLoopReductionChains.
1771  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1772 
1773  /// Returns the expected difference in cost from scalarizing the expression
1774  /// feeding a predicated instruction \p PredInst. The instructions to
1775  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1776  /// non-negative return value implies the expression will be scalarized.
1777  /// Currently, only single-use chains are considered for scalarization.
1778  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1779  ElementCount VF);
1780 
1781  /// Collect the instructions that are uniform after vectorization. An
1782  /// instruction is uniform if we represent it with a single scalar value in
1783  /// the vectorized loop corresponding to each vector iteration. Examples of
1784  /// uniform instructions include pointer operands of consecutive or
1785  /// interleaved memory accesses. Note that although uniformity implies an
1786  /// instruction will be scalar, the reverse is not true. In general, a
1787  /// scalarized instruction will be represented by VF scalar values in the
1788  /// vectorized loop, each corresponding to an iteration of the original
1789  /// scalar loop.
1790  void collectLoopUniforms(ElementCount VF);
1791 
1792  /// Collect the instructions that are scalar after vectorization. An
1793  /// instruction is scalar if it is known to be uniform or will be scalarized
1794  /// during vectorization. Non-uniform scalarized instructions will be
1795  /// represented by VF values in the vectorized loop, each corresponding to an
1796  /// iteration of the original scalar loop.
1797  void collectLoopScalars(ElementCount VF);
1798 
1799  /// Keeps cost model vectorization decision and cost for instructions.
1800  /// Right now it is used for memory instructions only.
1802  std::pair<InstWidening, InstructionCost>>;
1803 
1804  DecisionList WideningDecisions;
1805 
1806  /// Returns true if \p V is expected to be vectorized and it needs to be
1807  /// extracted.
1808  bool needsExtract(Value *V, ElementCount VF) const {
1809  Instruction *I = dyn_cast<Instruction>(V);
1810  if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1811  TheLoop->isLoopInvariant(I))
1812  return false;
1813 
1814  // Assume we can vectorize V (and hence we need extraction) if the
1815  // scalars are not computed yet. This can happen because it is called
1816  // via getScalarizationOverhead from setCostBasedWideningDecision, before
1817  // the scalars are collected. That should be a safe assumption in most
1818  // cases, because we check if the operands have vectorizable types
1819  // beforehand in LoopVectorizationLegality.
1820  return Scalars.find(VF) == Scalars.end() ||
1821  !isScalarAfterVectorization(I, VF);
1822  };
1823 
1824  /// Returns a range containing only operands needing to be extracted.
1825  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1826  ElementCount VF) const {
1828  Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1829  }
1830 
1831  /// Determines if we have the infrastructure to vectorize loop \p L and its
1832  /// epilogue, assuming the main loop is vectorized by \p VF.
1833  bool isCandidateForEpilogueVectorization(const Loop &L,
1834  const ElementCount VF) const;
1835 
1836  /// Returns true if epilogue vectorization is considered profitable, and
1837  /// false otherwise.
1838  /// \p VF is the vectorization factor chosen for the original loop.
1839  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1840 
1841 public:
1842  /// The loop that we evaluate.
1844 
1845  /// Predicated scalar evolution analysis.
1847 
1848  /// Loop Info analysis.
1850 
1851  /// Vectorization legality.
1853 
1854  /// Vector target information.
1856 
1857  /// Target Library Info.
1859 
1860  /// Demanded bits analysis.
1862 
1863  /// Assumption cache.
1865 
1866  /// Interface to emit optimization remarks.
1868 
1870 
1871  /// Loop Vectorize Hint.
1873 
1874  /// The interleave access information contains groups of interleaved accesses
1875  /// with the same stride and close to each other.
1877 
1878  /// Values to ignore in the cost model.
1880 
1881  /// Values to ignore in the cost model when VF > 1.
1883 
1884  /// Profitable vector factors.
1886 };
1887 } // end namespace llvm
1888 
1889 /// Helper struct to manage generating runtime checks for vectorization.
1890 ///
1891  /// The runtime checks are created up-front in temporary blocks to allow better
1892  /// cost estimation, and are un-linked from the existing IR. After deciding to
1893 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1894 /// temporary blocks are completely removed.
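///
/// For illustration (example loop not taken from this file): in
///
///   void f(int *A, int *B, int n) {
///     for (int i = 0; i < n; ++i)
///       A[i] = B[i] + 1;
///   }
///
/// A and B may alias, so a memory runtime check comparing the two accessed
/// address ranges is generated; the vector loop is only entered when the
/// ranges do not overlap, otherwise execution falls back to the scalar loop.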
1896  /// Basic block which contains the generated SCEV checks, if any.
1897  BasicBlock *SCEVCheckBlock = nullptr;
1898 
1899  /// The value representing the result of the generated SCEV checks. If it is
1900  /// nullptr, either no SCEV checks have been generated or they have been used.
1901  Value *SCEVCheckCond = nullptr;
1902 
1903  /// Basic block which contains the generated memory runtime checks, if any.
1904  BasicBlock *MemCheckBlock = nullptr;
1905 
1906  /// The value representing the result of the generated memory runtime checks.
1907  /// If it is nullptr, either no memory runtime checks have been generated or
1908  /// they have been used.
1909  Instruction *MemRuntimeCheckCond = nullptr;
1910 
1911  DominatorTree *DT;
1912  LoopInfo *LI;
1913 
1914  SCEVExpander SCEVExp;
1915  SCEVExpander MemCheckExp;
1916 
1917 public:
1919  const DataLayout &DL)
1920  : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1921  MemCheckExp(SE, DL, "scev.check") {}
1922 
1923  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1924  /// accurately estimate the cost of the runtime checks. The blocks are
1925  /// un-linked from the IR and are added back during vector code generation. If
1926  /// there is no vector code generation, the check blocks are removed
1927  /// completely.
1928  void Create(Loop *L, const LoopAccessInfo &LAI,
1929  const SCEVUnionPredicate &UnionPred) {
1930 
1931  BasicBlock *LoopHeader = L->getHeader();
1932  BasicBlock *Preheader = L->getLoopPreheader();
1933 
1934  // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1935  // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1936  // may be used by SCEVExpander. The blocks will be un-linked from their
1937  // predecessors and removed from LI & DT at the end of the function.
1938  if (!UnionPred.isAlwaysTrue()) {
1939  SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1940  nullptr, "vector.scevcheck");
1941 
1942  SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1943  &UnionPred, SCEVCheckBlock->getTerminator());
1944  }
1945 
1946  const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1947  if (RtPtrChecking.Need) {
1948  auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1949  MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1950  "vector.memcheck");
1951 
1952  std::tie(std::ignore, MemRuntimeCheckCond) =
1953  addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1954  RtPtrChecking.getChecks(), MemCheckExp);
1955  assert(MemRuntimeCheckCond &&
1956  "no RT checks generated although RtPtrChecking "
1957  "claimed checks are required");
1958  }
1959 
1960  if (!MemCheckBlock && !SCEVCheckBlock)
1961  return;
1962 
1963  // Unhook the temporary blocks with the checks and update various places
1964  // accordingly.
1965  if (SCEVCheckBlock)
1966  SCEVCheckBlock->replaceAllUsesWith(Preheader);
1967  if (MemCheckBlock)
1968  MemCheckBlock->replaceAllUsesWith(Preheader);
1969 
1970  if (SCEVCheckBlock) {
1971  SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1972  new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1973  Preheader->getTerminator()->eraseFromParent();
1974  }
1975  if (MemCheckBlock) {
1976  MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1977  new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1978  Preheader->getTerminator()->eraseFromParent();
1979  }
1980 
1981  DT->changeImmediateDominator(LoopHeader, Preheader);
1982  if (MemCheckBlock) {
1983  DT->eraseNode(MemCheckBlock);
1984  LI->removeBlock(MemCheckBlock);
1985  }
1986  if (SCEVCheckBlock) {
1987  DT->eraseNode(SCEVCheckBlock);
1988  LI->removeBlock(SCEVCheckBlock);
1989  }
1990  }
1991 
1992  /// Remove the created SCEV & memory runtime check blocks & instructions, if
1993  /// unused.
1995  SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
1996  SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
1997  if (!SCEVCheckCond)
1998  SCEVCleaner.markResultUsed();
1999 
2000  if (!MemRuntimeCheckCond)
2001  MemCheckCleaner.markResultUsed();
2002 
2003  if (MemRuntimeCheckCond) {
2004  auto &SE = *MemCheckExp.getSE();
2005  // Memory runtime check generation creates compares that use expanded
2006  // values. Remove them before running the SCEVExpanderCleaners.
2007  for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2008  if (MemCheckExp.isInsertedInstruction(&I))
2009  continue;
2010  SE.forgetValue(&I);
2011  SE.eraseValueFromMap(&I);
2012  I.eraseFromParent();
2013  }
2014  }
2015  MemCheckCleaner.cleanup();
2016  SCEVCleaner.cleanup();
2017 
2018  if (SCEVCheckCond)
2019  SCEVCheckBlock->eraseFromParent();
2020  if (MemRuntimeCheckCond)
2021  MemCheckBlock->eraseFromParent();
2022  }
2023 
2024  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2025  /// adjusts the branches to branch to the vector preheader or \p Bypass,
2026  /// depending on the generated condition.
2030  if (!SCEVCheckCond)
2031  return nullptr;
2032  if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2033  if (C->isZero())
2034  return nullptr;
2035 
2036  auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2037 
2038  BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2039  // Create new preheader for vector loop.
2040  if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2041  PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2042 
2043  SCEVCheckBlock->getTerminator()->eraseFromParent();
2044  SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2045  Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2046  SCEVCheckBlock);
2047 
2048  DT->addNewBlock(SCEVCheckBlock, Pred);
2050 
2052  SCEVCheckBlock->getTerminator(),
2053  BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2054  // Mark the check as used, to prevent it from being removed during cleanup.
2055  SCEVCheckCond = nullptr;
2056  return SCEVCheckBlock;
2057  }
2058 
2059  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2060  /// the branches to branch to the vector preheader or \p Bypass, depending on
2061  /// the generated condition.
2064  // Check if we generated code that checks at runtime whether arrays overlap.
2065  if (!MemRuntimeCheckCond)
2066  return nullptr;
2067 
2068  auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2070  MemCheckBlock);
2071 
2072  DT->addNewBlock(MemCheckBlock, Pred);
2074  MemCheckBlock->moveBefore(LoopVectorPreHeader);
2075 
2076  if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2077  PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2078 
2080  MemCheckBlock->getTerminator(),
2081  BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2082  MemCheckBlock->getTerminator()->setDebugLoc(
2083  Pred->getTerminator()->getDebugLoc());
2084 
2085  // Mark the check as used, to prevent it from being removed during cleanup.
2086  MemRuntimeCheckCond = nullptr;
2087  return MemCheckBlock;
2088  }
2089 };
2090 
2091 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2092 // vectorization. The loop needs to be annotated with #pragma omp simd
2093  // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2094 // vector length information is not provided, vectorization is not considered
2095 // explicit. Interleave hints are not allowed either. These limitations will be
2096 // relaxed in the future.
2097  // Please note that we are currently forced to abuse the pragma 'clang
2098 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2099 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2100 // provides *explicit vectorization hints* (LV can bypass legal checks and
2101 // assume that vectorization is legal). However, both hints are implemented
2102 // using the same metadata (llvm.loop.vectorize, processed by
2103 // LoopVectorizeHints). This will be fixed in the future when the native IR
2104 // representation for pragma 'omp simd' is introduced.
2105 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2107  assert(!OuterLp->isInnermost() && "This is not an outer loop");
2108  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2109 
2110  // Only outer loops with an explicit vectorization hint are supported.
2111  // Unannotated outer loops are ignored.
2113  return false;
2114 
2115  Function *Fn = OuterLp->getHeader()->getParent();
2116  if (!Hints.allowVectorization(Fn, OuterLp,
2117  true /*VectorizeOnlyWhenForced*/)) {
2118  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2119  return false;
2120  }
2121 
2122  if (Hints.getInterleave() > 1) {
2123  // TODO: Interleave support is future work.
2124  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2125  "outer loops.\n");
2126  Hints.emitRemarkWithHints();
2127  return false;
2128  }
2129 
2130  return true;
2131 }
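
// Illustrative user-code annotation (not part of this pass): an outer loop
// that satisfies the conditions above could be written as
//
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < n; ++i)   // explicitly annotated outer loop
//     for (int j = 0; j < m; ++j)
//       A[i][j] += B[j];
//
// With the VPlan-native path enabled, such an outer loop is then considered
// for explicit vectorization with the requested vector length.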
2132 
2136  // Collect inner loops and outer loops without irreducible control flow. For
2137  // now, only collect outer loops that have explicit vectorization hints. If we
2138  // are stress testing the VPlan H-CFG construction, we collect the outermost
2139  // loop of every loop nest.
2140  if (L.isInnermost() || VPlanBuildStressTest ||
2142  LoopBlocksRPO RPOT(&L);
2143  RPOT.perform(LI);
2144  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2145  V.push_back(&L);
2146  // TODO: Collect inner loops inside marked outer loops in case
2147  // vectorization fails for the outer loop. Do not invoke
2148  // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2149  // already known to be reducible. We can use an inherited attribute for
2150  // that.
2151  return;
2152  }
2153  }
2154  for (Loop *InnerL : L)
2155  collectSupportedLoops(*InnerL, LI, ORE, V);
2156 }
2157 
2158 namespace {
2159 
2160 /// The LoopVectorize Pass.
2161 struct LoopVectorize : public FunctionPass {
2162  /// Pass identification, replacement for typeid
2163  static char ID;
2164 
2165  LoopVectorizePass Impl;
2166 
2167  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2168  bool VectorizeOnlyWhenForced = false)
2169  : FunctionPass(ID),
2170  Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2172  }
2173 
2174  bool runOnFunction(Function &F) override {
2175  if (skipFunction(F))
2176  return false;
2177 
2178  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2179  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2180  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2181  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2182  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2183  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2184  auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2185  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2186  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2187  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2188  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2189  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2190  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2191 
2192  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2193  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2194 
2195  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2196  GetLAA, *ORE, PSI).MadeAnyChange;
2197  }
2198 
2199  void getAnalysisUsage(AnalysisUsage &AU) const override {
2211 
2212  // We currently do not preserve loop info/dominator analyses with outer loop
2213  // vectorization. Until this is addressed, mark these analyses as preserved
2214  // only for the non-VPlan-native path.
2215  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2216  if (!EnableVPlanNativePath) {
2219  }
2220 
2224  }
2225 };
2226 
2227 } // end anonymous namespace
2228 
2229 //===----------------------------------------------------------------------===//
2230 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2231 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2232 //===----------------------------------------------------------------------===//
2233 
2234  Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2235  // We need to place the broadcast of invariant variables outside the loop,
2236  // but only if it's proven safe to do so. Otherwise, the broadcast will be
2237  // placed inside the vector loop body.
2238  Instruction *Instr = dyn_cast<Instruction>(V);
2239  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2240  (!Instr ||
2242  // Place the code for broadcasting invariant variables in the new preheader.
2244  if (SafeToHoist)
2246 
2247  // Broadcast the scalar into all locations in the vector.
2248  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2249 
2250  return Shuf;
2251 }
2252 
2254  const InductionDescriptor &II, Value *Step, Value *Start,
2255  Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2256  VPTransformState &State) {
2257  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2258  "Expected either an induction phi-node or a truncate of it!");
2259 
2260  // Construct the initial value of the vector IV in the vector loop preheader
2261  auto CurrIP = Builder.saveIP();
2263  if (isa<TruncInst>(EntryVal)) {
2264  assert(Start->getType()->isIntegerTy() &&
2265  "Truncation requires an integer type");
2266  auto *TruncType = cast<IntegerType>(EntryVal->getType());
2267  Step = Builder.CreateTrunc(Step, TruncType);
2268  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2269  }
2270  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2271  Value *SteppedStart =
2272  getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2273 
2274  // We create vector phi nodes for both integer and floating-point induction
2275  // variables. Here, we determine the kind of arithmetic we will perform.
2276  Instruction::BinaryOps AddOp;
2277  Instruction::BinaryOps MulOp;
2278  if (Step->getType()->isIntegerTy()) {
2279  AddOp = Instruction::Add;
2280  MulOp = Instruction::Mul;
2281  } else {
2282  AddOp = II.getInductionOpcode();
2283  MulOp = Instruction::FMul;
2284  }
2285 
2286  // Multiply the vectorization factor by the step using integer or
2287  // floating-point arithmetic as appropriate.
2288  Type *StepType = Step->getType();
2289  if (Step->getType()->isFloatingPointTy())
2290  StepType = IntegerType::get(StepType->getContext(),
2291  StepType->getScalarSizeInBits());
2292  Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
2293  if (Step->getType()->isFloatingPointTy())
2294  RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
2295  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2296 
2297  // Create a vector splat to use in the induction update.
2298  //
2299  // FIXME: If the step is non-constant, we create the vector splat with
2300  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2301  // handle a constant vector splat.
2302  Value *SplatVF = isa<Constant>(Mul)
2303  ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2305  Builder.restoreIP(CurrIP);
2306 
2307  // We may need to add the step a number of times, depending on the unroll
2308  // factor. The last of those goes into the PHI.
2309  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2311  VecInd->setDebugLoc(EntryVal->getDebugLoc());
2312  Instruction *LastInduction = VecInd;
2313  for (unsigned Part = 0; Part < UF; ++Part) {
2314  State.set(Def, LastInduction, Part);
2315 
2316  if (isa<TruncInst>(EntryVal))
2317  addMetadata(LastInduction, EntryVal);
2318  recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2319  State, Part);
2320 
2321  LastInduction = cast<Instruction>(
2322  Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2323  LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2324  }
2325 
2326  // Move the last step to the end of the latch block. This ensures consistent
2327  // placement of all induction updates.
2328  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2329  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2330  auto *ICmp = cast<Instruction>(Br->getCondition());
2331  LastInduction->moveBefore(ICmp);
2332  LastInduction->setName("vec.ind.next");
2333 
2334  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2335  VecInd->addIncoming(LastInduction, LoopVectorLatch);
2336 }
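
// For intuition only (illustrative shape, assuming fixed VF = 4, UF = 2 and
// an integer IV with step 1), the generated IR looks roughly like:
//
//   vector.body:
//     %vec.ind      = phi <4 x i32> [ <Start, Start+1, Start+2, Start+3>, ... ],
//                                   [ %vec.ind.next, %vector.body ]
//     %step.add     = add <4 x i32> %vec.ind,  <i32 4, i32 4, i32 4, i32 4>
//     %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
//
// where %vec.ind and %step.add provide the per-part values of the widened IV.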
2337 
2339  return Cost->isScalarAfterVectorization(I, VF) ||
2341 }
2342 
2345  return true;
2346  auto isScalarInst = [&](User *U) -> bool {
2347  auto *I = cast<Instruction>(U);
2349  };
2350  return llvm::any_of(IV->users(), isScalarInst);
2351 }
2352 
2354  const InductionDescriptor &ID, const Instruction *EntryVal,
2355  Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2356  unsigned Part, unsigned Lane) {
2357  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2358  "Expected either an induction phi-node or a truncate of it!");
2359 
2360  // This induction variable is not the phi from the original loop but the
2361  // newly-created IV based on the proof that the casted Phi is equal to the
2362  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
2363  // re-uses the same InductionDescriptor that the original IV uses, but we
2364  // don't have to do any recording in this case - that is done when the
2365  // original IV is processed.
2366  if (isa<TruncInst>(EntryVal))
2367  return;
2368 
2369  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2370  if (Casts.empty())
2371  return;
2372  // Only the first Cast instruction in the Casts vector is of interest.
2373  // The rest of the Casts (if they exist) have no uses outside the
2374  // induction update chain itself.
2375  if (Lane < UINT_MAX)
2376  State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2377  else
2378  State.set(CastDef, VectorLoopVal, Part);
2379 }
2380 
2382  TruncInst *Trunc, VPValue *Def,
2383  VPValue *CastDef,
2384  VPTransformState &State) {
2385  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2386  "Primary induction variable must have an integer type");
2387 
2388  auto II = Legal->getInductionVars().find(IV);
2389  assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2390 
2391  auto ID = II->second;
2392  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2393 
2394  // The value from the original loop to which we are mapping the new induction
2395  // variable.
2396  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2397 
2398  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2399 
2400  // Generate code for the induction step. Note that induction steps are
2401  // required to be loop-invariant
2402  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2403  assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2404  "Induction step should be loop invariant");
2405  if (PSE.getSE()->isSCEVable(IV->getType())) {
2406  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2407  return Exp.expandCodeFor(Step, Step->getType(),
2409  }
2410  return cast<SCEVUnknown>(Step)->getValue();
2411  };
2412 
2413  // The scalar value to broadcast. This is derived from the canonical
2414  // induction variable. If a truncation type is given, truncate the canonical
2415  // induction variable and step. Otherwise, derive these values from the
2416  // induction descriptor.
2417  auto CreateScalarIV = [&](Value *&Step) -> Value * {
2418  Value *ScalarIV = Induction;
2419  if (IV != OldInduction) {
2420  ScalarIV = IV->getType()->isIntegerTy()
2422  : Builder.CreateCast(Instruction::SIToFP, Induction,
2423  IV->getType());
2424  ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2425  ScalarIV->setName("offset.idx");
2426  }
2427  if (Trunc) {
2428  auto *TruncType = cast<IntegerType>(Trunc->getType());
2429  assert(Step->getType()->isIntegerTy() &&
2430  "Truncation requires an integer step");
2431  ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2432  Step = Builder.CreateTrunc(Step, TruncType);
2433  }
2434  return ScalarIV;
2435  };
2436 
2437  // Create the vector values from the scalar IV, in the absence of creating a
2438  // vector IV.
2439  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2440  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2441  for (unsigned Part = 0; Part < UF; ++Part) {
2442  assert(!VF.isScalable() && "scalable vectors not yet supported.");
2443  Value *EntryPart =
2444  getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2445  ID.getInductionOpcode());
2446  State.set(Def, EntryPart, Part);
2447  if (Trunc)
2448  addMetadata(EntryPart, Trunc);
2449  recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2450  State, Part);
2451  }
2452  };
2453 
2454  // Fast-math-flags propagate from the original induction instruction.
2456  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2457  Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2458 
2459  // Now do the actual transformations, and start with creating the step value.
2460  Value *Step = CreateStepValue(ID.getStep());
2461  if (VF.isZero() || VF.isScalar()) {
2462  Value *ScalarIV = CreateScalarIV(Step);
2463  CreateSplatIV(ScalarIV, Step);
2464  return;
2465  }
2466 
2467  // Determine if we want a scalar version of the induction variable. This is
2468  // true if the induction variable itself is not widened, or if it has at
2469  // least one user in the loop that is not widened.
2470  auto NeedsScalarIV = needsScalarInduction(EntryVal);
2471  if (!NeedsScalarIV) {
2472  createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2473  State);
2474  return;
2475  }
2476 
2477  // Try to create a new independent vector induction variable. If we can't
2478  // create the phi node, we will splat the scalar induction variable in each
2479  // loop iteration.
2480  if (!shouldScalarizeInstruction(EntryVal)) {
2481  createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2482  State);
2483  Value *ScalarIV = CreateScalarIV(Step);
2484  // Create scalar steps that can be used by instructions we will later
2485  // scalarize. Note that the addition of the scalar steps will not increase
2486  // the number of instructions in the loop in the common case prior to
2487  // InstCombine. We will be trading one vector extract for each scalar step.
2488  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2489  return;
2490  }
2491 
2492  // All IV users are scalar instructions, so only emit a scalar IV, not a
2493  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2494  // predicate used by the masked loads/stores.
2495  Value *ScalarIV = CreateScalarIV(Step);
2496  if (!Cost->isScalarEpilogueAllowed())
2497  CreateSplatIV(ScalarIV, Step);
2498  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2499 }
2500 
2502  Instruction::BinaryOps BinOp) {
2503  // Create and check the types.
2504  auto *ValVTy = cast<VectorType>(Val->getType());
2505  ElementCount VLen = ValVTy->getElementCount();
2506 
2507  Type *STy = Val->getType()->getScalarType();
2508  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2509  "Induction Step must be an integer or FP");
2510  assert(Step->getType() == STy && "Step has wrong type");
2511 
2513 
2514  // Create a vector of consecutive numbers from zero to VF.
2515  VectorType *InitVecValVTy = ValVTy;
2516  Type *InitVecValSTy = STy;
2517  if (STy->isFloatingPointTy()) {
2518  InitVecValSTy =
2520  InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2521  }
2522  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2523 
2524  // Add on StartIdx
2525  Value *StartIdxSplat = Builder.CreateVectorSplat(
2526  VLen, ConstantInt::get(InitVecValSTy, StartIdx));
2527  InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2528 
2529  if (STy->isIntegerTy()) {
2530  Step = Builder.CreateVectorSplat(VLen, Step);
2531  assert(Step->getType() == Val->getType() && "Invalid step vec");
2532  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2533  // which can be found from the original scalar operations.
2534  Step = Builder.CreateMul(InitVec, Step);
2535  return Builder.CreateAdd(Val, Step, "induction");
2536  }
2537 
2538  // Floating point induction.
2539  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2540  "Binary Opcode should be specified for FP induction");
2541  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2542  Step = Builder.CreateVectorSplat(VLen, Step);
2543  Value *MulOp = Builder.CreateFMul(InitVec, Step);
2544  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2545 }
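
// Example for intuition (fixed VF = 4, integer case): with Val = <x, x, x, x>,
// StartIdx = 0 and Step = s, the result is
//
//   <x, x, x, x> + <0, 1, 2, 3> * <s, s, s, s> = <x, x+s, x+2*s, x+3*s>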
2546 
2548  Instruction *EntryVal,
2549  const InductionDescriptor &ID,
2550  VPValue *Def, VPValue *CastDef,
2551  VPTransformState &State) {
2552  // We shouldn't have to build scalar steps if we aren't vectorizing.
2553  assert(VF.isVector() && "VF should be greater than one");
2554  // Get the value type and ensure it and the step have the same integer type.
2555  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2556  assert(ScalarIVTy == Step->getType() &&
2557  "Val and Step should have the same type");
2558 
2559  // We build scalar steps for both integer and floating-point induction
2560  // variables. Here, we determine the kind of arithmetic we will perform.
2561  Instruction::BinaryOps AddOp;
2562  Instruction::BinaryOps MulOp;
2563  if (ScalarIVTy->isIntegerTy()) {
2564  AddOp = Instruction::Add;
2565  MulOp = Instruction::Mul;
2566  } else {
2567  AddOp = ID.getInductionOpcode();
2568  MulOp = Instruction::FMul;
2569  }
2570 
2571  // Determine the number of scalars we need to generate for each unroll
2572  // iteration. If EntryVal is uniform, we only need to generate the first
2573  // lane. Otherwise, we generate all VF values.
2574  bool IsUniform =
2575  Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
2576  unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
2577  // Compute the scalar steps and save the results in State.
2578  Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2579  ScalarIVTy->getScalarSizeInBits());
2580  Type *VecIVTy = nullptr;
2581  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2582  if (!IsUniform && VF.isScalable()) {
2583  VecIVTy = VectorType::get(ScalarIVTy, VF);
2584  UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
2585  SplatStep = Builder.CreateVectorSplat(VF, Step);
2586  SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
2587  }
2588 
2589  for (unsigned Part = 0; Part < UF; ++Part) {
2590  Value *StartIdx0 =
2591  createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2592 
2593  if (!IsUniform && VF.isScalable()) {
2594  auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
2595  auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2596  if (ScalarIVTy->isFloatingPointTy())
2597  InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2598  auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2599  auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2600  State.set(Def, Add, Part);
2601  recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2602  Part);
2603  // It's also useful to record the per-lane values for the known minimum
2604  // number of elements, so we do that below. This improves the code quality
2605  // when trying to extract the first element, for example.
2606  }
2607 
2608  if (ScalarIVTy->isFloatingPointTy())
2609  StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2610 
2611  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2612  Value *StartIdx = Builder.CreateBinOp(
2613  AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2614  // The step returned by `createStepForVF` is a runtime-evaluated value
2615  // when VF is scalable. Otherwise, it should be folded into a Constant.
2616  assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2617  "Expected StartIdx to be folded to a constant when VF is not "
2618  "scalable");
2619  auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2620  auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2621  State.set(Def, Add, VPIteration(Part, Lane));
2622  recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2623  Part, Lane);
2624  }
2625  }
2626 }
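
// Example for intuition (fixed VF = 4, UF = 2, non-uniform integer IV): the
// scalar step produced for unroll part P and lane L is
//
//   ScalarIV + (P * 4 + L) * Step
//
// i.e. part 0 yields ScalarIV + {0,1,2,3} * Step and part 1 yields
// ScalarIV + {4,5,6,7} * Step.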
2627 
2629  const VPIteration &Instance,
2630  VPTransformState &State) {
2631  Value *ScalarInst = State.get(Def, Instance);
2632  Value *VectorValue = State.get(Def, Instance.Part);
2633  VectorValue = Builder.CreateInsertElement(
2634  VectorValue, ScalarInst,
2635  Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2636  State.set(Def, VectorValue, Instance.Part);
2637 }
2638 
2640  assert(Vec->getType()->isVectorTy() && "Invalid type");
2641  return Builder.CreateVectorReverse(Vec, "reverse");
2642 }
2643 
2644 // Return whether we allow using masked interleave-groups (for dealing with
2645 // strided loads/stores that reside in predicated blocks, or for dealing
2646 // with gaps).
2648  // If an override option has been passed in for interleaved accesses, use it.
2651 
2653 }
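
// Illustrative example (loop not taken from the source): a strided load like
//
//   for (int i = 0; i < n; ++i)
//     sum += A[2 * i];
//
// forms a factor-2 interleave group with a gap at member index 1. Widening it
// loads 2 * VF contiguous elements, so the gap lanes must either be masked
// out (when masked interleaved accesses are allowed) or the final iterations
// must run in a scalar epilogue to avoid accessing elements past the end.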
2654 
2655 // Try to vectorize the interleave group that \p Instr belongs to.
2656 //
2657 // E.g. Translate following interleaved load group (factor = 3):
2658 // for (i = 0; i < N; i+=3) {
2659 // R = Pic[i]; // Member of index 0
2660 // G = Pic[i+1]; // Member of index 1
2661 // B = Pic[i+2]; // Member of index 2
2662 // ... // do something to R, G, B
2663 // }
2664 // To:
2665 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2666 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2667 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2668 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2669 //
2670 // Or translate following interleaved store group (factor = 3):
2671 // for (i = 0; i < N; i+=3) {
2672 // ... do something to R, G, B
2673 // Pic[i] = R; // Member of index 0
2674 // Pic[i+1] = G; // Member of index 1
2675 // Pic[i+2] = B; // Member of index 2
2676 // }
2677 // To:
2678 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2679 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2680 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2681 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2682 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2685  VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2686  VPValue *BlockInMask) {
2687  Instruction *Instr = Group->getInsertPos();
2688  const DataLayout &DL = Instr->getModule()->getDataLayout();
2689 
2690  // Prepare for the vector type of the interleaved load/store.
2691  Type *ScalarTy = getMemInstValueType(Instr);
2692  unsigned InterleaveFactor = Group->getFactor();
2693  assert(!VF.isScalable() && "scalable vectors not yet supported.");
2694  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2695 
2696  // Prepare for the new pointers.
2697  SmallVector<Value *, 2> AddrParts;
2698  unsigned Index = Group->getIndex(Instr);
2699 
2700  // TODO: extend the masked interleaved-group support to reversed access.
2701  assert((!BlockInMask || !Group->isReverse()) &&
2702  "Reversed masked interleave-group not supported.");
2703 
2704  // If the group is reverse, adjust the index to refer to the last vector lane
2705  // instead of the first. We adjust the index from the first vector lane,
2706  // rather than directly getting the pointer for lane VF - 1, because the
2707  // pointer operand of the interleaved access is supposed to be uniform. For
2708  // uniform instructions, we're only required to generate a value for the
2709  // first vector lane in each unroll iteration.
2710  if (Group->isReverse())
2711  Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2712 
2713  for (unsigned Part = 0; Part < UF; Part++) {
2714  Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2715  setDebugLocFromInst(Builder, AddrPart);
2716 
2717  // Note that the current instruction could be at any index. We need to adjust
2718  // the address to the member of index 0.
2719  //
2720  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2721  // b = A[i]; // Member of index 0
2722  // The current pointer points to A[i+1]; adjust it to A[i].
2723  //
2724  // E.g. A[i+1] = a; // Member of index 1
2725  // A[i] = b; // Member of index 0
2726  // A[i+2] = c; // Member of index 2 (Current instruction)
2727  // The current pointer points to A[i+2]; adjust it to A[i].
2728 
2729  bool InBounds = false;
2730  if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2731  InBounds = gep->isInBounds();
2732  AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2733  cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2734 
2735  // Cast to the vector pointer type.
2736  unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2737  Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2738  AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2739  }
2740 
2741  setDebugLocFromInst(Builder, Instr);
2742  Value *PoisonVec = PoisonValue::get(VecTy);
2743 
2744  Value *MaskForGaps = nullptr;
2745  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2746  MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2747  assert(MaskForGaps && "Mask for Gaps is required but it is null");
2748  }
2749 
2750  // Vectorize the interleaved load group.
2751  if (isa<LoadInst>(Instr)) {
2752  // For each unroll part, create a wide load for the group.
2753  SmallVector<Value *, 2> NewLoads;
2754  for (unsigned Part = 0; Part < UF; Part++) {
2755  Instruction *NewLoad;
2756  if (BlockInMask || MaskForGaps) {
2758  "masked interleaved groups are not allowed.");
2759  Value *GroupMask = MaskForGaps;
2760  if (BlockInMask) {
2761  Value *BlockInMaskPart = State.get(BlockInMask, Part);
2762  Value *ShuffledMask = Builder.CreateShuffleVector(
2763  BlockInMaskPart,
2764  createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2765  "interleaved.mask");
2766  GroupMask = MaskForGaps
2767  ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2768  MaskForGaps)
2769  : ShuffledMask;
2770  }
2771  NewLoad =
2772  Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2773  GroupMask, PoisonVec, "wide.masked.vec");
2774  }
2775  else
2776  NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2777  Group->getAlign(), "wide.vec");
2778  Group->addMetadata(NewLoad);
2779  NewLoads.push_back(NewLoad);
2780  }
2781 
2782  // For each member in the group, shuffle out the appropriate data from the
2783  // wide loads.
2784  unsigned J = 0;
2785  for (unsigned I = 0; I < InterleaveFactor; ++I) {
2786  Instruction *Member = Group->getMember(I);
2787 
2788  // Skip the gaps in the group.
2789  if (!Member)
2790  continue;
2791 
2792  auto StrideMask =
2793  createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2794  for (unsigned Part = 0; Part < UF; Part++) {
2795  Value *StridedVec = Builder.CreateShuffleVector(
2796  NewLoads[Part], StrideMask, "strided.vec");
2797 
2798  // If this member has different type, cast the result type.
2799  if (Member->getType() != ScalarTy) {
2800  assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2801  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2802  StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2803  }
2804 
2805  if (Group->isReverse())
2806  StridedVec = reverseVector(StridedVec);
2807 
2808  State.set(VPDefs[J], StridedVec, Part);
2809  }
2810  ++J;
2811  }
2812  return;
2813  }
2814 
2815  // The sub vector type for current instruction.
2816  auto *SubVT = VectorType::get(ScalarTy, VF);
2817 
2818  // Vectorize the interleaved store group.
2819  for (unsigned Part = 0; Part < UF; Part++) {
2820  // Collect the stored vector from each member.
2821  SmallVector<Value *, 4> StoredVecs;
2822  for (unsigned i = 0; i < InterleaveFactor; i++) {
2823  // An interleaved store group doesn't allow a gap, so each index has a member.
2824  assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
2825 
2826  Value *StoredVec = State.get(StoredValues[i], Part);
2827 
2828  if (Group->isReverse())
2829  StoredVec = reverseVector(StoredVec);
2830 
2831  // If this member has a different type, cast it to a unified type.
2832 
2833  if (StoredVec->getType() != SubVT)
2834  StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2835 
2836  StoredVecs.push_back(StoredVec);
2837  }
2838 
2839  // Concatenate all vectors into a wide vector.
2840  Value *WideVec = concatenateVectors(Builder, StoredVecs);
2841 
2842  // Interleave the elements in the wide vector.
2844  WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2845  "interleaved.vec");
2846 
2847  Instruction *NewStoreInstr;
2848  if (BlockInMask) {
2849  Value *BlockInMaskPart = State.get(BlockInMask, Part);
2850  Value *ShuffledMask = Builder.CreateShuffleVector(
2851  BlockInMaskPart,
2852  createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2853  "interleaved.mask");
2854  NewStoreInstr = Builder.CreateMaskedStore(
2855  IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2856  }
2857  else
2858  NewStoreInstr =
2859  Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2860 
2861  Group->addMetadata(NewStoreInstr);
2862  }
2863 }
2864 
2866  Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2867  VPValue *StoredValue, VPValue *BlockInMask) {
2868  // Attempt to issue a wide load.
2869  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2870  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2871 
2872  assert((LI || SI) && "Invalid Load/Store instruction");
2873  assert((!SI || StoredValue) && "No stored value provided for widened store");
2874  assert((!LI || !StoredValue) && "Stored value provided for widened load");
2875 
2877  Cost->getWideningDecision(Instr, VF);
2881  "CM decision is not to widen the memory instruction");
2882 
2883  Type *ScalarDataTy = getMemInstValueType(Instr);
2884 
2885  auto *DataTy = VectorType::get(ScalarDataTy, VF);
2886  const Align Alignment = getLoadStoreAlignment(Instr);
2887 
2888  // Determine if the pointer operand of the access is either consecutive or
2889  // reverse consecutive.
2890  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2891  bool ConsecutiveStride =
2892  Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2893  bool CreateGatherScatter =
2895 
2896  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2897  // gather/scatter. Otherwise Decision should have been to Scalarize.
2898  assert((ConsecutiveStride || CreateGatherScatter) &&
2899  "The instruction should be scalarized");
2900  (void)ConsecutiveStride;
2901 
2902  VectorParts BlockInMaskParts(UF);
2903  bool isMaskRequired = BlockInMask;
2904  if (isMaskRequired)
2905  for (unsigned Part = 0; Part < UF; ++Part)
2906  BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2907 
2908  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2909  // Calculate the pointer for the specific unroll-part.
2910  GetElementPtrInst *PartPtr = nullptr;
2911 
2912  bool InBounds = false;
2913  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2914  InBounds = gep->isInBounds();
2915  if (Reverse) {
2916  // If the address is consecutive but reversed, then the
2917  // wide load/store needs to start at the last vector element.
2918  // RunTimeVF = VScale * VF.getKnownMinValue()
2919  // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
2920  Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2921  // NumElt = -Part * RunTimeVF
2922  Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
2923  // LastLane = 1 - RunTimeVF
2924  Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
2925  PartPtr =
2926  cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
2927  PartPtr->setIsInBounds(InBounds);
2928  PartPtr = cast<GetElementPtrInst>(
2929  Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
2930  PartPtr->setIsInBounds(InBounds);
2931  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2932  BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2933  } else {
2934  Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2935  PartPtr = cast<GetElementPtrInst>(
2936  Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2937  PartPtr->setIsInBounds(InBounds);
2938  }
2939 
2940  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2941  return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2942  };
2943 
2944  // Handle Stores:
2945  if (SI) {
2947 
2948  for (unsigned Part = 0; Part < UF; ++Part) {
2949  Instruction *NewSI = nullptr;
2950  Value *StoredVal = State.get(StoredValue, Part);
2951  if (CreateGatherScatter) {
2952  Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2953  Value *VectorGep = State.get(Addr, Part);
2954  NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2955  MaskPart);
2956  } else {
2957  if (Reverse) {
2958  // If we store to reverse consecutive memory locations, then we need
2959  // to reverse the order of elements in the stored value.
2960  StoredVal = reverseVector(StoredVal);
2961  // We don't want to update the value in the map as it might be used in
2962  // another expression. So don't call resetVectorValue(StoredVal).
2963  }
2964  auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2965  if (isMaskRequired)
2966  NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2967  BlockInMaskParts[Part]);
2968  else
2969  NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2970  }
2971  addMetadata(NewSI, SI);
2972  }
2973  return;
2974  }
2975 
2976  // Handle loads.
2977  assert(LI && "Must have a load instruction");
2979  for (unsigned Part = 0; Part < UF; ++Part) {
2980  Value *NewLI;
2981  if (CreateGatherScatter) {
2982  Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2983  Value *VectorGep = State.get(Addr, Part);
2984  NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2985  nullptr, "wide.masked.gather");
2986  addMetadata(NewLI, LI);
2987  } else {
2988  auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2989  if (isMaskRequired)
2990  NewLI = Builder.CreateMaskedLoad(
2991  VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
2992  "wide.masked.load");
2993  else
2994  NewLI =
2995  Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2996 
2997  // Add metadata to the load, but record the reversed shuffle as the vector value.
2998  addMetadata(NewLI, LI);
2999  if (Reverse)
3000  NewLI = reverseVector(NewLI);
3001  }
3002 
3003  State.set(Def, NewLI, Part);
3004  }
3005 }
3006 
3008  VPUser &User,
3009  const VPIteration &Instance,
3010  bool IfPredicateInstr,
3011  VPTransformState &State) {
3012  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3013 
3014  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
3015  // the first lane and part.
3016  if (isa<NoAliasScopeDeclInst>(Instr))
3017  if (!Instance.isFirstIteration())
3018  return;
3019 
3020  setDebugLocFromInst(Builder, Instr);
3021 
3022  // Does this instruction return a value?
3023  bool IsVoidRetTy = Instr->getType()->isVoidTy();
3024 
3025  Instruction *Cloned = Instr->clone();
3026  if (!IsVoidRetTy)
3027  Cloned->setName(Instr->getName() + ".cloned");
3028 
3031  // Replace the operands of the cloned instructions with their scalar
3032  // equivalents in the new loop.
3033  for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
3034  auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
3035  auto InputInstance = Instance;
3036  if (!Operand || !OrigLoop->contains(Operand) ||
3037  (Cost->isUniformAfterVectorization(Operand, State.VF)))
3038  InputInstance.Lane = VPLane::getFirstLane();
3039  auto *NewOp = State.get(User.getOperand(op), InputInstance);
3040  Cloned->setOperand(op, NewOp);
3041  }
3042  addNewMetadata(Cloned, Instr);
3043 
3044  // Place the cloned scalar in the new loop.
3045  Builder.Insert(Cloned);
3046 
3047  State.set(Def, Cloned, Instance);
3048 
3049  // If we just cloned a new assumption, add it the assumption cache.
3050  if (auto *II = dyn_cast<AssumeInst>(Cloned))
3051  AC->registerAssumption(II);
3052 
3053  // End if-block.
3054  if (IfPredicateInstr)
3055  PredicatedInstructions.push_back(Cloned);
3056 }
3057 
3059  Value *End, Value *Step,
3060  Instruction *DL) {
3061  BasicBlock *Header = L->getHeader();
3062  BasicBlock *Latch = L->getLoopLatch();
3063  // As we're just creating this loop, it's possible no latch exists
3064  // yet. If so, use the header as this will be a single block loop.
3065  if (!Latch)
3066  Latch = Header;
3067 
3070  setDebugLocFromInst(Builder, OldInst);
3071  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3072 
3074  setDebugLocFromInst(Builder, OldInst);
3075 
3076  // Create i+1 and fill the PHINode.
3077  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3078  Induction->addIncoming(Start, L->getLoopPreheader());
3079  Induction->addIncoming(Next, Latch);
3080  // Create the compare.
3081  Value *ICmp = Builder.CreateICmpEQ(Next, End);
3082  Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3083 
3084  // Now we have two terminators. Remove the old one from the block.
3085  Latch->getTerminator()->eraseFromParent();
3086 
3087  return Induction;
3088 }
3089 
3091  if (TripCount)
3092  return TripCount;
3093 
3094  assert(L && "Create Trip Count for null loop.");
3096  // Find the loop boundaries.
3097  ScalarEvolution *SE = PSE.getSE();
3098  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3099  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3100  "Invalid loop count");
3101 
3102  Type *IdxTy = Legal->getWidestInductionType();
3103  assert(IdxTy && "No type for induction");
3104 
3105  // The exit count might have the type of i64 while the phi is i32. This can
3106  // happen if we have an induction variable that is sign extended before the
3107  // compare. The only way that we get a backedge taken count is that the
3108  // induction variable was signed and as such will not overflow. In such a case
3109  // truncation is legal.
3110  if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3111  IdxTy->getPrimitiveSizeInBits())
3112  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3113  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3114 
3115  // Get the total trip count from the count by adding 1.
3116  const SCEV *ExitCount = SE->getAddExpr(
3117  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
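 // For example, for a loop `for (i = 0; i != n; ++i)` the backedge-taken
 // count is n - 1, so ExitCount here is simply n.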
3118 
3119  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3120 
3121  // Expand the trip count and place the new instructions in the preheader.
3122  // Notice that the pre-header does not change, only the loop body.
3123  SCEVExpander Exp(*SE, DL, "induction");
3124 
3125  // Count holds the overall loop count (N).
3126  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3127  L->getLoopPreheader()->getTerminator());
3128 
3129  if (TripCount->getType()->isPointerTy())
3130  TripCount =
3131  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3132  L->getLoopPreheader()->getTerminator());
3133 
3134  return TripCount;
3135 }
3136 
3138  if (VectorTripCount)
3139  return VectorTripCount;
3140 
3141  Value *TC = getOrCreateTripCount(L);
3143 
3144  Type *Ty = TC->getType();
3145  // This is where we can make the step a runtime constant.
3147 
3148  // If the tail is to be folded by masking, round the number of iterations N
3149  // up to a multiple of Step instead of rounding down. This is done by first
3150  // adding Step-1 and then rounding down. Note that it's ok if this addition
3151  // overflows: the vector induction variable will eventually wrap to zero given
3152  // that it starts at zero and its Step is a power of two; the loop will then
3153  // exit, with the last early-exit vector comparison also producing all-true.
3154  if (Cost->foldTailByMasking()) {
3155  assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3156  "VF*UF must be a power of 2 when folding tail by masking");
3157  assert(!VF.isScalable() &&
3158  "Tail folding not yet supported for scalable vectors");
3159  TC = Builder.CreateAdd(
3160  TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3161  }
3162 
3163  // Now we need to generate the expression for the part of the loop that the
3164  // vectorized body will execute. This is equal to N - (N % Step) if scalar
3165  // iterations are not required for correctness, or N - Step, otherwise. Step
3166  // is equal to the vectorization factor (number of SIMD elements) times the
3167  // unroll factor (number of SIMD instructions).
3168  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3169 
3170  // There are two cases where we need to ensure (at least) the last iteration
3171  // runs in the scalar remainder loop. Thus, if the step evenly divides
3172  // the trip count, we set the remainder to be equal to the step. If the step
3173  // does not evenly divide the trip count, no adjustment is necessary since
3174  // there will already be scalar iterations. Note that the minimum iterations
3175  // check ensures that N >= Step. The cases are:
3176  // 1) If there is a non-reversed interleaved group that may speculatively
3177  // access memory out-of-bounds.
3178  // 2) If any instruction may follow a conditionally taken exit. That is, if
3179  // the loop contains multiple exiting blocks, or a single exiting block
3180  // which is not the latch.
3181  if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3182  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3183  R = Builder.CreateSelect(IsZero, Step, R);
3184  }
3185 
3186  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
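 // For example, with TC == 20 and VF * UF == 8: n.mod.vf is 4 and n.vec is 16,
 // leaving 4 iterations for the scalar remainder. If a scalar epilogue is
 // required and TC == 16, the select above bumps R from 0 to 8 so that n.vec
 // becomes 8 and the last 8 iterations run in the scalar loop.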
3187 
3188  return VectorTripCount;
3189 }
3190 
3192  const DataLayout &DL) {
3193  // Verify that V is a vector type with same number of elements as DstVTy.
3194  auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3195  unsigned VF = DstFVTy->getNumElements();
3196  auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3197  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3198  Type *SrcElemTy = SrcVecTy->getElementType();
3199  Type *DstElemTy = DstFVTy->getElementType();
3200  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3201  "Vector elements must have same size");
3202 
3203  // Do a direct cast if element types are castable.
3204  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3205  return Builder.CreateBitOrPointerCast(V, DstFVTy);
3206  }
3207  // V cannot be directly cast to the desired vector type.
3208  // May happen when V is a floating point vector but DstVTy is a vector of
3209  // pointers or vice-versa. Handle this using a two-step bitcast using an
3210  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
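 // For example, on a target with 32-bit pointers, casting <4 x float> to
 // <4 x i8*> goes through <4 x i32>: a bitcast from float to i32 followed by
 // an inttoptr from i32 to i8*.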
3211  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3212  "Only one type should be a pointer type");
3213  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3214  "Only one type should be a floating point type");
3215  Type *IntTy =
3216  IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3217  auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3218  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3219  return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3220 }
3221 
3223  BasicBlock *Bypass) {
3224  Value *Count = getOrCreateTripCount(L);
3225  // Reuse existing vector loop preheader for TC checks.
3226  // Note that a new preheader block is generated for the vector loop.
3227  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3228  IRBuilder<> Builder(TCCheckBlock->getTerminator());
3229 
3230  // Generate code to check if the loop's trip count is less than VF * UF, or
3231  // equal to it in case a scalar epilogue is required; this implies that the
3232  // vector trip count is zero. This check also covers the case where adding one
3233  // to the backedge-taken count overflowed leading to an incorrect trip count
3234  // of zero. In this case we will also jump to the scalar loop.
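 // For example, with VF * UF == 8 and no tail folding, the check below
 // branches to the scalar loop whenever the trip count is less than 8 (or
 // less than or equal to 8 when a scalar epilogue is required, since at least
 // one scalar iteration must then run).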
3237 
3238  // If tail is to be folded, vector loop takes care of all iterations.
3239  Value *CheckMinIters = Builder.getFalse();
3240  if (!Cost->foldTailByMasking()) {
3241  Value *Step =
3243  CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3244  }
3245  // Create new preheader for vector loop.
3247  SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3248  "vector.ph");
3249 
3250  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3251  DT->getNode(Bypass)->getIDom()) &&
3252  "TC check is expected to dominate Bypass");
3253 
3254  // Update dominator for Bypass & LoopExit.
3255  DT->changeImmediateDominator(Bypass, TCCheckBlock);
3256  DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3257 
3259  TCCheckBlock->getTerminator(),
3260  BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3261  LoopBypassBlocks.push_back(TCCheckBlock);
3262 }
3263 
3265 
3266  BasicBlock *const SCEVCheckBlock =
3268  if (!SCEVCheckBlock)
3269  return nullptr;
3270 
3271  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3274  "Cannot SCEV check stride or overflow when optimizing for size");
3275 
3276 
3277  // Update dominator only if this is first RT check.
3278  if (LoopBypassBlocks.empty()) {
3279  DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3280  DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3281  }
3282 
3283  LoopBypassBlocks.push_back(SCEVCheckBlock);
3284  AddedSafetyChecks = true;
3285  return SCEVCheckBlock;
3286 }
3287 
3289  BasicBlock *Bypass) {
3290  // VPlan-native path does not do any analysis for runtime checks currently.
3292  return nullptr;
3293 
3294  BasicBlock *const MemCheckBlock =
3296 
3297  // Check if we generated code that checks at runtime whether arrays overlap. We put
3298  // the checks into a separate block to make the more common case of few
3299  // elements faster.
3300  if (!MemCheckBlock)
3301  return nullptr;
3302 
3303  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3305  "Cannot emit memory checks when optimizing for size, unless forced "
3306  "to vectorize.");
3307  ORE->emit([&]() {
3308  return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3309  L->getStartLoc(), L->getHeader())
3310  << "Code-size may be reduced by not forcing "
3311  "vectorization, or by source-code modifications "
3312  "eliminating the need for runtime checks "
3313  "(e.g., adding 'restrict').";
3314  });
3315  }
3316 
3317  LoopBypassBlocks.push_back(MemCheckBlock);
3318 
3319  AddedSafetyChecks = true;
3320 
3321  // We currently don't use LoopVersioning for the actual loop cloning but we
3322  // still use it to add the noalias metadata.
3323  LVer = std::make_unique<LoopVersioning>(
3324  *Legal->getLAI(),
3326  DT, PSE.getSE());
3327  LVer->prepareNoAliasMetadata();
3328  return MemCheckBlock;
3329 }
3330 
3333  const InductionDescriptor &ID) const {
3334 
3335  SCEVExpander Exp(*SE, DL, "induction");
3336  auto Step = ID.getStep();
3337  auto StartValue = ID.getStartValue();
3338  assert(Index->getType() == Step->getType() &&
3339  "Index type does not match StepValue type");
3340 
3341  // Note: the IR at this point is broken. We cannot use SE to create any new
3342  // SCEV and then expand it, hoping that SCEV's simplification will give us
3343  // more optimal code. Unfortunately, attempting to do so on invalid IR may
3344  // lead to various SCEV crashes. So all we can do is use the builder and rely
3345  // on InstCombine for future simplifications. Here we handle some trivial
3346  // cases only.
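 // For example, CreateAdd(0, %x) below simply yields %x and CreateMul(1, %step)
 // yields %step, without creating any new instructions.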
3347  auto CreateAdd = [&B](Value *X, Value *Y) {
3348  assert(X->getType() == Y->getType() && "Types don't match!");
3349  if (auto *CX = dyn_cast<ConstantInt>(X))
3350  if (CX->isZero())
3351  return Y;
3352  if (auto *CY = dyn_cast<ConstantInt>(Y))
3353  if (CY->isZero())
3354  return X;
3355  return B.CreateAdd(X, Y);
3356  };
3357 
3358  auto CreateMul = [&B](Value *X, Value *Y) {
3359  assert(X->getType() == Y->getType() && "Types don't match!");
3360  if (auto *CX = dyn_cast<ConstantInt>(X))
3361  if (CX->isOne())
3362  return Y;
3363  if (auto *CY = dyn_cast<ConstantInt>(Y))
3364  if (CY->isOne())
3365  return X;
3366  return B.CreateMul(X, Y);
3367  };
3368 
3369  // Get a suitable insert point for SCEV expansion. For blocks in the vector
3370  // loop, choose the end of the vector loop header (=LoopVectorBody), because
3371  // the DomTree is not kept up-to-date for additional blocks generated in the
3372  // vector loop. By using the header as insertion point, we guarantee that the
3373  // expanded instructions dominate all their uses.
3374  auto GetInsertPoint = [this, &B]() {
3375  BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3376  if (InsertBB != LoopVectorBody &&
3377  LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3378  return LoopVectorBody->getTerminator();
3379  return &*B.GetInsertPoint();
3380  };
3381 
3382  switch (ID.getKind()) {
3384  assert(Index->getType() == StartValue->getType() &&
3385  "Index type does not match StartValue type");
3386  if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3387  return B.CreateSub(StartValue, Index);
3388  auto *Offset = CreateMul(
3389  Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3390  return CreateAdd(StartValue, Offset);
3391  }
3393  assert(isa<SCEVConstant>(Step) &&
3394  "Expected constant step for pointer induction");
3395  return B.CreateGEP(
3396  StartValue->getType()->getPointerElementType(), StartValue,
3397  CreateMul(Index,
3398  Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3399  }
3401  assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3402  auto InductionBinOp = ID.getInductionBinOp();
3403  assert(InductionBinOp &&
3404  (InductionBinOp->getOpcode() == Instruction::FAdd ||
3405  InductionBinOp->getOpcode() == Instruction::FSub) &&
3406  "Original bin op should be defined for FP induction");
3407 
3408  Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3409  Value *MulExp = B.CreateFMul(StepValue, Index);
3410  return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3411  "induction");
3412  }
3414  return nullptr;
3415  }
3416  llvm_unreachable("invalid enum");
3417 }
3418 
3423  assert(LoopExitBlock && "Must have an exit block");
3424  assert(LoopVectorPreHeader && "Invalid loop structure");
3425 
3426  LoopMiddleBlock =
3428  LI, nullptr, Twine(Prefix) + "middle.block");
3431  nullptr, Twine(Prefix) + "scalar.ph");
3432 
3433  // Set up branch from middle block to the exit and scalar preheader blocks.
3434  // completeLoopSkeleton will update the condition to use an iteration check,
3435  // if required to decide whether to execute the remainder.
3436  BranchInst *BrInst =
3438  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3439  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3441 
3442  // We intentionally don't let SplitBlock update LoopInfo since
3443  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3444  // LoopVectorBody is explicitly added to the correct place a few lines later.
3445  LoopVectorBody =
3447  nullptr, nullptr, Twine(Prefix) + "vector.body");
3448 
3449  // Update dominator for loop exit.
3451 
3452  // Create and register the new vector loop.
3453  Loop *Lp = LI->AllocateLoop();
3454  Loop *ParentLoop = OrigLoop->getParentLoop();
3455 
3456  // Insert the new loop into the loop nest and register the new basic blocks
3457  // before calling any utilities such as SCEV that require valid LoopInfo.
3458  if (ParentLoop) {
3459  ParentLoop->addChildLoop(Lp);
3460  } else {
3461  LI->addTopLevelLoop(Lp);
3462  }
3464  return Lp;
3465 }
3466 
3468  Loop *L, Value *VectorTripCount,
3469  std::pair<BasicBlock *, Value *> AdditionalBypass) {
3470  assert(VectorTripCount && L && "Expected valid arguments");
3471  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3472  (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3473  "Inconsistent information about additional bypass.");
3474  // We are going to resume the execution of the scalar loop.
3475  // Go over all of the induction variables that we found and fix the
3476  // PHIs that are left in the scalar version of the loop.
3477  // The starting values of PHI nodes depend on the counter of the last
3478  // iteration in the vectorized loop.
3479  // If we come from a bypass edge then we need to start from the original
3480  // start value.
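 // Roughly, each induction phi in the scalar loop gets a companion phi in the
 // scalar preheader of the form
 //   %bc.resume.val = phi [ %ind.end, %middle.block ], [ <original start>, <bypass blocks> ]
 // which is what the code below constructs.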
3481  for (auto &InductionEntry : Legal->getInductionVars()) {
3482  PHINode *OrigPhi = InductionEntry.first;
3483  InductionDescriptor II = InductionEntry.second;
3484 
3485  // Create phi nodes to merge from the backedge-taken check block.
3486  PHINode *BCResumeVal =
3487  PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3489  // Copy original phi DL over to the new one.
3490  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3491  Value *&EndValue = IVEndValues[OrigPhi];
3492  Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3493  if (OrigPhi == OldInduction) {
3494  // We know what the end value is.
3495  EndValue = VectorTripCount;
3496  } else {
3498 
3499  // Fast-math-flags propagate from the original induction instruction.
3500  if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3501  B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3502 
3503  Type *StepType = II.getStep()->getType();
3504  Instruction::CastOps CastOp =
3505  CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3506  Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3508  EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3509  EndValue->setName("ind.end");
3510 
3511  // Compute the end value for the additional bypass (if applicable).
3512  if (AdditionalBypass.first) {
3513  B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3514  CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3515  StepType, true);
3516  CRD =
3517  B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3518  EndValueFromAdditionalBypass =
3519  emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3520  EndValueFromAdditionalBypass->setName("ind.end");
3521  }
3522  }
3523  // The new PHI merges the original incoming value, in case of a bypass,
3524  // or the value at the end of the vectorized loop.
3525  BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3526 
3527  // Fix the scalar body counter (PHI node).
3528  // The old induction's phi node in the scalar body needs the truncated
3529  // value.
3530  for (BasicBlock *BB : LoopBypassBlocks)
3531  BCResumeVal->addIncoming(II.getStartValue(), BB);
3532 
3533  if (AdditionalBypass.first)
3534  BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3535  EndValueFromAdditionalBypass);
3536 
3537  OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3538  }
3539 }
3540 
3542  MDNode *OrigLoopID) {
3543  assert(L && "Expected valid loop.");
3544 
3545  // The trip counts should be cached by now.
3546  Value *Count = getOrCreateTripCount(L);
3548 
3549  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3550 
3551  // Add a check in the middle block to see if we have completed
3552  // all of the iterations in the first vector loop.
3553  // If (N - N%VF) == N, then we *don't* need to run the remainder.
3554  // If tail is to be folded, we know we don't need to run the remainder.
3555  if (!Cost->foldTailByMasking()) {
3556  Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3557  Count, VectorTripCount, "cmp.n",
3558  LoopMiddleBlock->getTerminator());
3559 
3560  // Here we use the same DebugLoc as the scalar loop latch terminator instead
3561  // of the corresponding compare because they may have ended up with
3562  // different line numbers and we want to avoid awkward line stepping while
3563  // debugging. Eg. if the compare has got a line number inside the loop.
3564  CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3565  cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3566  }
3567 
3568  // Get ready to start creating new instructions into the vectorized body.
3570  "Inconsistent vector loop preheader");
3572 
3573  Optional<MDNode *> VectorizedLoopID =
3576  if (VectorizedLoopID.hasValue()) {
3577  L->setLoopID(VectorizedLoopID.getValue());
3578 
3579  // Do not setAlreadyVectorized if loop attributes have been defined
3580  // explicitly.
3581  return LoopVectorPreHeader;
3582  }
3583 
3584  // Keep all loop hints from the original loop on the vector loop (we'll
3585  // replace the vectorizer-specific hints below).
3586  if (MDNode *LID = OrigLoop->getLoopID())
3587  L->setLoopID(LID);
3588 
3589  LoopVectorizeHints Hints(L, true, *ORE);
3590  Hints.setAlreadyVectorized();
3591 
3592 #ifdef EXPENSIVE_CHECKS
3594  LI->verify(*DT);
3595 #endif
3596 
3597  return LoopVectorPreHeader;
3598 }
3599 
3601  /*
3602  In this function we generate a new loop. The new loop will contain
3603  the vectorized instructions while the old loop will continue to run the
3604  scalar remainder.
3605 
3606  [ ] <-- loop iteration number check.
3607  / |
3608  / v
3609  | [ ] <-- vector loop bypass (may consist of multiple blocks).
3610  | / |
3611  | / v
3612  || [ ] <-- vector pre header.
3613  |/ |
3614  | v
3615  | [ ] \
3616  | [ ]_| <-- vector loop.
3617  | |
3618  | v
3619  | -[ ] <--- middle-block.
3620  | / |
3621  | / v
3622  -|- >[ ] <--- new preheader.
3623  | |
3624  | v
3625  | [ ] \
3626  | [ ]_| <-- old scalar loop to handle remainder.
3627  \ |
3628  \ v
3629  >[ ] <-- exit block.
3630  ...
3631  */
3632 
3633  // Get the metadata of the original loop before it gets modified.
3634  MDNode *OrigLoopID = OrigLoop->getLoopID();
3635 
3636  // Workaround! Compute the trip count of the original loop and cache it
3637  // before we start modifying the CFG. This code has a systemic problem
3638  // wherein it tries to run analysis over partially constructed IR; this is
3639  // wrong, and not simply for SCEV. The trip count of the original loop
3640  // simply happens to be prone to hitting this in practice. In theory, we
3641  // can hit the same issue for any SCEV, or ValueTracking query done during
3642  // mutation. See PR49900.
3644 
3645  // Create an empty vector loop, and prepare basic blocks for the runtime
3646  // checks.
3647  Loop *Lp = createVectorLoopSkeleton("");
3648 
3649  // Now, compare the new count to zero. If it is zero skip the vector loop and
3650  // jump to the scalar loop. This check also covers the case where the
3651  // backedge-taken count is uint##_max: adding one to it will overflow leading
3652  // to an incorrect trip count of zero. In this (rare) case we will also jump
3653  // to the scalar loop.
3655 
3656  // Generate the code to check any assumptions that we've made for SCEV
3657  // expressions.
3659 
3660  // Generate the code that checks at runtime whether arrays overlap. We put the
3661  // checks into a separate block to make the more common case of few elements
3662  // faster.
3664 
3665  // Some loops have a single integer induction variable, while other loops
3666  // don't. One example is C++ iterators that often have multiple pointer
3667  // induction variables. In the code below we also support a case where we
3668  // don't have a single induction variable.
3669  //
3670  // We try as hard as possible to obtain an induction variable from the
3671  // original loop. However, if we don't find one that:
3672  // - is an integer
3673  // - counts from zero, stepping by one
3674  // - is the size of the widest induction variable type
3675  // then we create a new one.
3677  Type *IdxTy = Legal->getWidestInductionType();
3678  Value *StartIdx = ConstantInt::get(IdxTy, 0);
3679  // The loop step is equal to the vectorization factor (num of SIMD elements)
3680  // times the unroll factor (num of SIMD instructions).
3682  Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3683  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3684  Induction =
3685  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3687 
3688  // Emit phis for the new starting index of the scalar loop.
3689  createInductionResumeValues(Lp, CountRoundDown);
3690 
3691  return completeLoopSkeleton(Lp, OrigLoopID);
3692 }
3693 
3694 // Fix up external users of the induction variable. At this point, we are
3695 // in LCSSA form, with all external PHIs that use the IV having one input value,
3696 // coming from the remainder loop. We need those PHIs to also have a correct
3697 // value for the IV when arriving directly from the middle block.
3699  const InductionDescriptor &II,
3700  Value *CountRoundDown, Value *EndValue,
3701  BasicBlock *MiddleBlock) {
3702  // There are two kinds of external IV usages - those that use the value
3703  // computed in the last iteration (the PHI) and those that use the penultimate
3704  // value (the value that feeds into the phi from the loop latch).
3705  // We allow both, but they, obviously, have different values.
3706 
3707  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3708 
3709  DenseMap<Value *, Value *> MissingVals;
3710 
3711  // An external user of the last iteration's value should see the value that
3712  // the remainder loop uses to initialize its own IV.
3714  for (User *U : PostInc->users()) {
3715  Instruction *UI = cast<Instruction>(U);
3716  if (!OrigLoop->contains(UI)) {
3717  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3718  MissingVals[UI] = EndValue;
3719  }
3720  }
3721 
3722  // An external user of the penultimate value needs to see EndValue - Step.
3723  // The simplest way to get this is to recompute it from the constituent SCEVs,
3724  // that is Start + (Step * (CRD - 1)).
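 // For example, for an IV starting at 0 with step 2 and CountRoundDown == 8,
 // the escaping penultimate value computed below is 0 + 2 * (8 - 1) == 14.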
3725  for (User *U : OrigPhi->users()) {
3726  auto *UI = cast<Instruction>(U);
3727  if (!OrigLoop->contains(UI)) {
3728  const DataLayout &DL =
3730  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3731 
3732  IRBuilder<> B(MiddleBlock->getTerminator());
3733 
3734  // Fast-math-flags propagate from the original induction instruction.
3735  if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3736  B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3737 
3738  Value *CountMinusOne = B.CreateSub(
3739  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3740  Value *CMO =
3741  !II.getStep()->getType()->isIntegerTy()
3742  ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3743  II.getStep()->getType())
3744  : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3745  CMO->setName("cast.cmo");
3746  Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3747  Escape->setName("ind.escape");
3748  MissingVals[UI] = Escape;
3749  }
3750  }
3751 
3752  for (auto &I : MissingVals) {
3753  PHINode *PHI = cast<PHINode>(I.first);
3754  // One corner case we have to handle is two IVs "chasing" each other,
3755  // that is %IV2 = phi [...], [ %IV1, %latch ]
3756  // In this case, if IV1 has an external use, we need to avoid adding both
3757  // "last value of IV1" and "penultimate value of IV2". So, verify that we
3758  // don't already have an incoming value for the middle block.
3759  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3760  PHI->addIncoming(I.second, MiddleBlock);
3761  }
3762 }
3763 
3764 namespace {
3765 
3766 struct CSEDenseMapInfo {
3767  static bool canHandle(const Instruction *I) {
3768  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3769  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3770  }
3771 
3772  static inline Instruction *getEmptyKey() {
3774  }
3775 
3776  static inline Instruction *getTombstoneKey() {
3778  }
3779 
3780  static unsigned getHashValue(const Instruction *I) {
3781  assert(canHandle(I) && "Unknown instruction!");
3782  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3783  I->value_op_end()));
3784  }
3785 
3786  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3787  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3788  LHS == getTombstoneKey() || RHS == getTombstoneKey())
3789  return LHS == RHS;
3790  return LHS->isIdenticalTo(RHS);
3791  }
3792 };
3793 
3794 } // end anonymous namespace
3795 
3796 /// Perform CSE of induction variable instructions.
3797 static void cse(BasicBlock *BB) {
3798  // Perform simple cse.
3800  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3801  Instruction *In = &*I++;
3802 
3803  if (!CSEDenseMapInfo::canHandle(In))
3804  continue;
3805 
3806  // Check if we can replace this instruction with any of the
3807  // visited instructions.
3808  if (Instruction *V = CSEMap.lookup(In)) {
3809  In->replaceAllUsesWith(V);
3810  In->eraseFromParent();
3811  continue;
3812  }
3813 
3814  CSEMap[In] = In;
3815  }
3816 }
3817 
3820  bool &NeedToScalarize) const {
3821  Function *F = CI->getCalledFunction();
3822  Type *ScalarRetTy = CI->getType();
3823  SmallVector<Type *, 4> Tys, ScalarTys;
3824  for (auto &ArgOp : CI->arg_operands())
3825  ScalarTys.push_back(ArgOp->getType());
3826 
3827  // Estimate cost of scalarized vector call. The source operands are assumed
3828  // to be vectors, so we need to extract individual elements from there,
3829  // execute VF scalar calls, and then gather the result into the vector return
3830  // value.
3831  InstructionCost ScalarCallCost =
3832  TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3833  if (VF.isScalar())
3834  return ScalarCallCost;
3835 
3836  // Compute corresponding vector type for return value and arguments.
3837  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3838  for (Type *ScalarTy : ScalarTys)
3839  Tys.push_back(ToVectorTy(ScalarTy, VF));
3840 
3841  // Compute costs of unpacking argument values for the scalar calls and
3842  // packing the return values to a vector.
3843  InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3844 
3846  ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
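 // For example, with VF == 4, a scalar call cost of 10 and a scalarization
 // overhead of 6, the scalarized estimate is 4 * 10 + 6 == 46; a vector
 // library call is only used below if its cost is lower than that.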
3847 
3848  // If we can't emit a vector call for this function, then the currently found
3849  // cost is the cost we need to return.
3850  NeedToScalarize = true;
3851  VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3852  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3853 
3854  if (!TLI || CI->isNoBuiltin() || !VecFunc)
3855  return Cost;
3856 
3857  // If the corresponding vector cost is cheaper, return its cost.
3858  InstructionCost VectorCallCost =
3859  TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3860  if (VectorCallCost < Cost) {
3861  NeedToScalarize = false;
3862  Cost = VectorCallCost;
3863  }
3864  return Cost;
3865 }
3866 
3868  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3869  return Elt;
3870  return VectorType::get(Elt, VF);
3871 }
3872 
3875  ElementCount VF) const {
3877  assert(ID && "Expected intrinsic call!");
3878  Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3879  FastMathFlags FMF;
3880  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3881  FMF = FPMO->getFastMathFlags();
3882 
3885  SmallVector<Type *> ParamTys;
3886  std::transform(FTy->param_begin(), FTy->param_end(),
3887  std::back_inserter(ParamTys),
3888  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3889 
3890  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3891  dyn_cast<IntrinsicInst>(CI));
3892  return TTI.getIntrinsicInstrCost(CostAttrs,
3894 }
3895 
3897  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3898  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3899  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3900 }
3901 
3903  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3904  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3905  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3906 }
3907 
3909  // For every instruction `I` in MinBWs, truncate the operands, create a
3910  // truncated version of `I` and reextend its result. InstCombine runs
3911  // later and will remove any ext/trunc pairs.
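 // For example, an i32 add whose result only ever needs 8 bits is rewritten
 // below as a <VF x i8> add on truncated operands, with the result
 // zero-extended back to <VF x i32> for its users.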
3912  SmallPtrSet<Value *, 4> Erased;
3913  for (const auto &KV : Cost->getMinimalBitwidths()) {
3914  // If the value wasn't vectorized, we must maintain the original scalar
3915  // type. The absence of the value from State indicates that it
3916  // wasn't vectorized.
3917  VPValue *Def = State.Plan->getVPValue(KV.first);
3918  if (!State.hasAnyVectorValue(Def))
3919  continue;
3920  for (unsigned Part = 0; Part < UF; ++Part) {
3921  Value *I = State.get(Def, Part);
3922  if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3923  continue;
3924  Type *OriginalTy = I->getType();
3925  Type *ScalarTruncatedTy =
3926  IntegerType::get(OriginalTy->getContext(), KV.second);
3927  auto *TruncatedTy = FixedVectorType::get(
3928  ScalarTruncatedTy,
3929  cast<FixedVectorType>(OriginalTy)->getNumElements());
3930  if (TruncatedTy == OriginalTy)
3931  continue;
3932 
3933  IRBuilder<> B(cast<Instruction>(I));
3934  auto ShrinkOperand = [&](Value *V) -> Value * {
3935  if (auto *ZI = dyn_cast<ZExtInst>(V))
3936  if (ZI->getSrcTy() == TruncatedTy)
3937  return ZI->getOperand(0);
3938  return B.CreateZExtOrTrunc(V, TruncatedTy);
3939  };
3940 
3941  // The actual instruction modification depends on the instruction type,
3942  // unfortunately.
3943  Value *NewI = nullptr;
3944  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3945  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3946  ShrinkOperand(BO->getOperand(1)));
3947 
3948  // Any wrapping introduced by shrinking this operation shouldn't be
3949  // considered undefined behavior. So, we can't unconditionally copy
3950  // arithmetic wrapping flags to NewI.
3951  cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3952  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3953  NewI =
3954  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3955  ShrinkOperand(CI->getOperand(1)));
3956  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3957  NewI = B.CreateSelect(SI->getCondition(),
3958  ShrinkOperand(SI->getTrueValue()),
3959  ShrinkOperand(SI->getFalseValue()));
3960  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3961  switch (CI->getOpcode()) {
3962  default:
3963  llvm_unreachable("Unhandled cast!");
3964  case Instruction::Trunc:
3965  NewI = ShrinkOperand(CI->getOperand(0));
3966  break;
3967  case Instruction::SExt:
3968  NewI = B.CreateSExtOrTrunc(
3969  CI->getOperand(0),
3970  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3971  break;
3972  case Instruction::ZExt:
3973  NewI = B.CreateZExtOrTrunc(
3974  CI->getOperand(0),
3975  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3976  break;
3977  }
3978  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3979  auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3980  ->getNumElements();
3981  auto *O0 = B.CreateZExtOrTrunc(
3982  SI->getOperand(0),
3983  FixedVectorType::get(ScalarTruncatedTy, Elements0));
3984  auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3985  ->getNumElements();
3986  auto *O1 = B.CreateZExtOrTrunc(
3987  SI->getOperand(1),
3988  FixedVectorType::get(ScalarTruncatedTy, Elements1));
3989 
3990  NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3991  } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3992  // Don't do anything with the operands, just extend the result.
3993  continue;
3994  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3995  auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3996  ->getNumElements();
3997  auto *O0 = B.CreateZExtOrTrunc(
3998  IE->getOperand(0),
3999  FixedVectorType::get(ScalarTruncatedTy, Elements));
4000  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
4001  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
4002  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
4003  auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
4004  ->getNumElements();
4005  auto *O0 = B.CreateZExtOrTrunc(
4006  EE->getOperand(0),
4007  FixedVectorType::get(ScalarTruncatedTy, Elements));
4008  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
4009  } else {
4010  // If we don't know what to do, be conservative and don't do anything.
4011  continue;
4012  }
4013 
4014  // Lastly, extend the result.
4015  NewI->takeName(cast<Instruction>(I));
4016  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
4017  I->replaceAllUsesWith(Res);
4018  cast<Instruction>(I)->eraseFromParent();
4019  Erased.insert(I);
4020  State.reset(Def, Res, Part);
4021  }
4022  }
4023 
4024  // We'll have created a bunch of ZExts that are now parentless. Clean up.
4025  for (const auto &KV : Cost->getMinimalBitwidths()) {
4026  // If the value wasn't vectorized, we must maintain the original scalar
4027  // type. The absence of the value from State indicates that it
4028  // wasn't vectorized.
4029  VPValue *Def = State.Plan->getVPValue(KV.first);
4030  if (!State.hasAnyVectorValue(Def))
4031  continue;
4032  for (unsigned Part = 0; Part < UF; ++Part) {
4033  Value *I = State.get(Def, Part);
4034  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4035  if (Inst && Inst->use_empty()) {
4036  Value *NewI = Inst->getOperand(0);
4037  Inst->eraseFromParent();
4038  State.reset(Def, NewI, Part);
4039  }
4040  }
4041  }
4042 }
4043 
4045  // Insert truncates and extends for any truncated instructions as hints to
4046  // InstCombine.
4047  if (VF.isVector())
4049 
4050  // Fix widened non-induction PHIs by setting up the PHI operands.
4051  if (OrigPHIsToFix.size()) {
4053  "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4054  fixNonInductionPHIs(State);
4055  }
4056 
4057  // At this point every instruction in the original loop is widened to a
4058  // vector form. Now we need to fix the recurrences in the loop. These PHI
4059  // nodes are currently empty because we did not want to introduce cycles.
4060  // This is the second stage of vectorizing recurrences.
4061  fixCrossIterationPHIs(State);
4062 
4063  // Forget the original basic block.
4065 
4066  // Fix-up external users of the induction variables.
4067  for (auto &Entry : Legal->getInductionVars())
4068  fixupIVUsers(Entry.first, Entry.second,
4070  IVEndValues[Entry.first], LoopMiddleBlock);
4071 
4072  fixLCSSAPHIs(State);
4074  sinkScalarOperands(&*PI);
4075 
4076  // Remove redundant induction instructions.
4078 
4079  // Set/update profile weights for the vector and remainder loops as original
4080  // loop iterations are now distributed among them. Note that original loop
4081  // represented by LoopScalarBody becomes remainder loop after vectorization.
4082  //
4083  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4084  // end up getting a slightly roughened result but that should be OK since
4085  // profile is not inherently precise anyway. Note also possible bypass of
4086  // vector code caused by legality checks is ignored, assigning all the weight
4087  // to the vector loop, optimistically.
4088  //
4089  // For scalable vectorization we can't know at compile time how many iterations
4090  // of the loop are handled in one vector iteration, so instead assume a pessimistic
4091  // vscale of '1'.
4095 }
4096 
4098  // In order to support recurrences we need to be able to vectorize Phi nodes.
4099  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4100  // stage #2: We now need to fix the recurrences by adding incoming edges to
4101  // the currently empty PHI nodes. At this point every instruction in the
4102  // original loop is widened to a vector form so we can use them to construct
4103  // the incoming edges.
4104  VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4105  for (VPRecipeBase &R : Header->phis()) {
4106  auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
4107  if (!PhiR)
4108  continue;
4109  auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4110  if (PhiR->getRecurrenceDescriptor()) {
4111  fixReduction(PhiR, State);
4112  } else if (Legal->isFirstOrderRecurrence(OrigPhi))
4113  fixFirstOrderRecurrence(OrigPhi, State);
4114  }
4115 }
4116 
4118  VPTransformState &State) {
4119  // This is the second phase of vectorizing first-order recurrences. An
4120  // overview of the transformation is described below. Suppose we have the
4121  // following loop.
4122  //
4123  // for (int i = 0; i < n; ++i)
4124  // b[i] = a[i] - a[i - 1];
4125  //
4126  // There is a first-order recurrence on "a". For this loop, the shorthand
4127  // scalar IR looks like:
4128  //
4129  // scalar.ph:
4130  // s_init = a[-1]
4131  // br scalar.body
4132  //
4133  // scalar.body:
4134  // i = phi [0, scalar.ph], [i+1, scalar.body]
4135  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4136  // s2 = a[i]
4137  // b[i] = s2 - s1
4138  // br cond, scalar.body, ...
4139  //
4140  // In this example, s1 is a recurrence because its value depends on the
4141  // previous iteration. In the first phase of vectorization, we created a
4142  // temporary value for s1. We now complete the vectorization and produce the
4143  // shorthand vector IR shown below (for VF = 4, UF = 1).
4144  //
4145  // vector.ph:
4146  // v_init = vector(..., ..., ..., a[-1])
4147  // br vector.body
4148  //
4149  // vector.body
4150  // i = phi [0, vector.ph], [i+4, vector.body]
4151  // v1 = phi [v_init, vector.ph], [v2, vector.body]
4152  // v2 = a[i, i+1, i+2, i+3];
4153  // v3 = vector(v1(3), v2(0, 1, 2))
4154  // b[i, i+1, i+2, i+3] = v2 - v3
4155  // br cond, vector.body, middle.block
4156  //
4157  // middle.block:
4158  // x = v2(3)
4159  // br scalar.ph
4160  //
4161  // scalar.ph:
4162  // s_init = phi [x, middle.block], [a[-1], otherwise]
4163  // br scalar.body
4164  //
4165  // After execution completes the vector loop, we extract the next value of
4166  // the recurrence (x) to use as the initial value in the scalar loop.
4167 
4168  // Get the original loop preheader and single loop latch.
4169  auto *Preheader = OrigLoop->getLoopPreheader();
4170  auto *Latch = OrigLoop->getLoopLatch();
4171 
4172  // Get the initial and previous values of the scalar recurrence.
4173  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4174  auto *Previous = Phi->getIncomingValueForBlock(Latch);
4175 
4176  // Create a vector from the initial value.
4177  auto *VectorInit = ScalarInit;
4178  if (VF.isVector()) {
4180  assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4181  VectorInit = Builder.CreateInsertElement(
4182  PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4183  Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4184  }
4185 
4186  VPValue *PhiDef = State.Plan->getVPValue(Phi);
4187  VPValue *PreviousDef = State.Plan->getVPValue(Previous);
4188  // We constructed a temporary phi node in the first phase of vectorization.
4189  // This phi node will eventually be deleted.
4190  Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0)));
4191 
4192  // Create a phi node for the new recurrence. The current value will either be
4193  // the initial value inserted into a vector or loop-varying vector value.
4194  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4195  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4196 
4197  // Get the vectorized previous value of the last part UF - 1. It appears last
4198  // among all unrolled iterations, due to the order of their construction.
4199  Value *PreviousLastPart = State.get(PreviousDef, UF - 1);
4200 
4201  // Find and set the insertion point after the previous value if it is an
4202  // instruction.
4203  BasicBlock::iterator InsertPt;
4204  // Note that the previous value may have been constant-folded so it is not
4205  // guaranteed to be an instruction in the vector loop.
4206  // FIXME: Loop invariant values do not form recurrences. We should deal with
4207  // them earlier.
4208  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4209  InsertPt = LoopVectorBody->getFirstInsertionPt();
4210  else {
4211  Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4212  if (isa<PHINode>(PreviousLastPart))
4213  // If the previous value is a phi node, we should insert after all the phi
4214  // nodes in the block containing the PHI to avoid breaking basic block
4215  // verification. Note that the basic block may be different to
4216  // LoopVectorBody, in case we predicate the loop.
4217  InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4218  else
4219  InsertPt = ++PreviousInst->getIterator();
4220  }
4221  Builder.SetInsertPoint(&*InsertPt);
4222 
4223  // We will construct a vector for the recurrence by combining the values for
4224  // the current and previous iterations. This is the required shuffle mask.
4225  assert(!VF.isScalable());
4226  SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4227  ShuffleMask[0] = VF.getKnownMinValue() - 1;
4228  for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4229  ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
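 // For example, for VF == 4 this builds the mask <3, 4, 5, 6>: the last lane
 // of the incoming (previous-iteration) vector followed by the first three
 // lanes of the current part.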
4230 
4231  // The vector from which to take the initial value for the current iteration
4232  // (actual or unrolled). Initially, this is the vector phi node.
4233  Value *Incoming = VecPhi;
4234 
4235  // Shuffle the current and previous vector and update the vector parts.
4236  for (unsigned Part = 0; Part < UF; ++Part) {
4237  Value *PreviousPart = State.get(PreviousDef, Part);
4238  Value *PhiPart = State.get(PhiDef, Part);
4239  auto *Shuffle =
4240  VF.isVector()
4241  ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4242  : Incoming;
4243  PhiPart->replaceAllUsesWith(Shuffle);
4244  cast<Instruction>(PhiPart)->eraseFromParent();
4245  State.reset(PhiDef, Shuffle, Part);
4246  Incoming = PreviousPart;
4247  }
4248 
4249  // Fix the latch value of the new recurrence in the vector loop.
4250  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4251 
4252  // Extract the last vector element in the middle block. This will be the
4253  // initial value for the recurrence when jumping to the scalar loop.
4254  auto *ExtractForScalar = Incoming;
4255  if (VF.isVector()) {
4257  ExtractForScalar = Builder.CreateExtractElement(
4258  ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4259  "vector.recur.extract");
4260  }
4261  // Extract the second last element in the middle block if the
4262  // Phi is used outside the loop. We need to extract the phi itself
4263  // and not the last element (the phi update in the current iteration). This
4264  // will be the value when jumping to the exit block from the LoopMiddleBlock,
4265  // when the scalar loop is not run at all.
4266  Value *ExtractForPhiUsedOutsideLoop = nullptr;
4267  if (VF.isVector())
4268  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4269  Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4270  "vector.recur.extract.for.phi");
4271  // When loop is unrolled without vectorizing, initialize
4272  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
4273  // `Incoming`. This is analogous to the vectorized case above: extracting the
4274  // second last element when VF > 1.
4275  else if (UF > 1)
4276  ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4277 
4278  // Fix the initial value of the original recurrence in the scalar loop.
4280  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4281  for (auto *BB : predecessors(LoopScalarPreHeader)) {
4282  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4283  Start->addIncoming(Incoming, BB);
4284  }
4285 
4287  Phi->setName("scalar.recur");
4288 
4289  // Finally, fix users of the recurrence outside the loop. The users will need
4290  // either the last value of the scalar recurrence or the last value of the
4291  // vector recurrence we extracted in the middle block. Since the loop is in
4292  // LCSSA form, we just need to find all the phi nodes for the original scalar
4293  // recurrence in the exit block, and then add an edge for the middle block.
4294  // Note that LCSSA does not imply single entry when the original scalar loop
4295  // had multiple exiting edges (as we always run the last iteration in the
4296  // scalar epilogue); in that case, the exiting path through middle will be
4297  // dynamically dead and the value picked for the phi doesn't matter.
4298  for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4299  if (any_of(LCSSAPhi.incoming_values(),
4300  [Phi](Value *V) { return V == Phi; }))
4301  LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4302 }
4303 
4305  return EnableStrictReductions && RdxDesc.isOrdered();
4306 }
4307 
4309  VPTransformState &State) {
4310  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4311  // Get its reduction variable descriptor.
4312  assert(Legal->isReductionVariable(OrigPhi) &&
4313  "Unable to find the reduction variable");
4314  RecurrenceDescriptor RdxDesc = *PhiR->getRecurrenceDescriptor();
4315 
4316  RecurKind RK = RdxDesc.getRecurrenceKind();
4317  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4318  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4319  setDebugLocFromInst(Builder, ReductionStartValue);
4320  bool IsInLoopReductionPhi = Cost->isInLoopReduction(OrigPhi);
4321 
4322  VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4323  // This is the vector-clone of the value that leaves the loop.
4324  Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4325 
4326  // Wrap flags are in general invalid after vectorization, clear them.
4327  clearReductionWrapFlags(RdxDesc, State);
4328 
4329  // Fix the vector-loop phi.
4330 
4331  // Reductions do not have to start at zero. They can start with
4332  // any loop invariant values.
4333  BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4334 
4335  bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
4336  useOrderedReductions(RdxDesc);
4337 
4338  for (unsigned Part = 0; Part < UF; ++Part) {
4339  if (IsOrdered && Part > 0)
4340  break;
4341  Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part);
4342  Value *Val = State.get(PhiR->getBackedgeValue(), Part);
4343  if (IsOrdered)
4344  Val = State.get(PhiR->getBackedgeValue(), UF - 1);
4345 
4346  cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
4347  }
4348 
4349  // Before each round, move the insertion point right between
4350  // the PHIs and the values we are going to write.
4351  // This allows us to write both PHINodes and the extractelement
4352  // instructions.
4354 
4355  setDebugLocFromInst(Builder, LoopExitInst);
4356 
4357  Type *PhiTy = OrigPhi->getType();
4358  // If tail is folded by masking, the vector value to leave the loop should be
4359  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4360  // instead of the former. For an inloop reduction the reduction will already
4361  // be predicated, and does not need to be handled here.
4362  if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4363  for (unsigned Part = 0; Part < UF; ++Part) {
4364  Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4365  Value *Sel = nullptr;
4366  for (User *U : VecLoopExitInst->users()) {
4367  if (isa<SelectInst>(U)) {
4368  assert(!Sel && "Reduction exit feeding two selects");
4369  Sel = U;
4370  } else
4371  assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4372  }
4373  assert(Sel && "Reduction exit feeds no select");
4374  State.reset(LoopExitInstDef, Sel, Part);
4375 
4376  // If the target can create a predicated operator for the reduction at no
4377  // extra cost in the loop (for example a predicated vadd), it can be
4378  // cheaper for the select to remain in the loop than be sunk out of it,
4379  // and so use the select value for the phi instead of the old
4380  // LoopExitValue.
4383  RdxDesc.getOpcode(), PhiTy,
4385  auto *VecRdxPhi =
4386  cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
4387  VecRdxPhi->setIncomingValueForBlock(
4389  }
4390  }
4391  }
4392 
4393  // If the vector reduction can be performed in a smaller type, we truncate
4394  // then extend the loop exit value to enable InstCombine to evaluate the
4395  // entire expression in the smaller type.
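 // For example, an i8 add reduction that was widened to <VF x i32> is
 // truncated here to <VF x i8>, and only the final reduced scalar is extended
 // back to the phi type below.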
4396  if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4397  assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4398  assert(!VF.isScalable() && "scalable vectors not yet supported.");
4399  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4402  VectorParts RdxParts(UF);
4403  for (unsigned Part = 0; Part < UF; ++Part) {
4404  RdxParts[Part] = State.get(LoopExitInstDef, Part);
4405  Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4406  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4407  : Builder.CreateZExt(Trunc, VecTy);
4408  for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4409  UI != RdxParts[Part]->user_end();)
4410  if (*UI != Trunc) {
4411  (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4412  RdxParts[Part] = Extnd;
4413  } else {
4414  ++UI;
4415  }
4416  }
4418  for (unsigned Part = 0; Part < UF; ++Part) {
4419  RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4420  State.reset(LoopExitInstDef, RdxParts[Part], Part);
4421  }
4422  }
4423 
4424  // Reduce all of the unrolled parts into a single vector.
4425  Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4426  unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4427 
4428  // The middle block terminator has already been assigned a DebugLoc here (the
4429  // OrigLoop's single latch terminator). We want the whole middle block to
4430  // appear to execute on this line because: (a) it is all compiler generated,
4431  // (b) these instructions are always executed after evaluating the latch
4432  // conditional branch, and (c) other passes may add new predecessors which
4433  // terminate on this line. This is the easiest way to ensure we don't
4434  // accidentally cause an extra step back into the loop while debugging.
4436  if (IsOrdered)
4437  ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4438  else {
4439  // Floating-point operations should have some FMF to enable the reduction.
4440  IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4441  Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4442  for (unsigned Part = 1; Part < UF; ++Part) {
4443  Value *RdxPart = State.get(LoopExitInstDef, Part);
4444  if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4445  ReducedPartRdx = Builder.CreateBinOp(
4446  (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4447  } else {
4448  ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4449  }
4450  }
4451  }
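// Illustrative sketch (hypothetical IR, not taken from this file): with
// UF = 2 and an integer add reduction, the two unrolled partial sums are
// combined in the middle block with a single binary operation:
//
//   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
//
// Min/max recurrences instead go through createMinMaxOp, which emits the
// corresponding compare-plus-select pattern.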
4452 
4453  // Create the reduction after the loop. Note that inloop reductions create the
4454  // target reduction in the loop using a Reduction recipe.
4455  if (VF.isVector() && !IsInLoopReductionPhi) {
4456  ReducedPartRdx =
4457  createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4458  // If the reduction can be performed in a smaller type, we need to extend
4459  // the reduction to the wider type before we branch to the original loop.
4460  if (PhiTy != RdxDesc.getRecurrenceType())
4461  ReducedPartRdx = RdxDesc.isSigned()
4462  ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4463  : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4464  }
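// Illustrative sketch (hypothetical IR, not taken from this file): for an
// out-of-loop add reduction, the target reduction created above is typically
// a vector-reduce intrinsic in the middle block:
//
//   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
//
// If the recurrence type is narrower than the phi type (e.g. an i8 reduction
// feeding an i32 phi), the call produces the narrow scalar instead and the
// sext/zext back to the phi type is appended here.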
4465 
4466  // Create a phi node that merges control-flow from the backedge-taken check
4467  // block and the middle block.
4468  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4469  LoopScalarPreHeader->getTerminator());
4470  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4471  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4472  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4473 
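// Illustrative sketch (hypothetical IR, not taken from this file; block names
// are made up): the merge phi created above lives in the scalar preheader and
// selects either the original start value (when a bypass check skipped the
// vector loop) or the vector loop's reduced result:
//
//   scalar.ph:
//     %bc.merge.rdx = phi i32 [ %start, %iter.check ],
//                             [ %start, %vector.memcheck ],
//                             [ %rdx, %middle.block ]
//
// The scalar loop's reduction phi is rewired below to start from
// %bc.merge.rdx instead of %start.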
4474  // Now, we need to fix the users of the reduction variable
4475  // inside and outside of the scalar remainder loop.
4476 
4477  // We know that the loop is in LCSSA form. We need to update the PHI nodes
4478  // in the exit blocks. See comment on analogous loop in
4479  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4480  for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4481  if (any_of(LCSSAPhi.incoming_values(),
4482  [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4483  LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4484 
4485  // Fix the scalar loop reduction variable with the incoming reduction sum
4486  // from the vector body and from the backedge value.
4487  int IncomingEdgeBlockIdx =
4488  OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4489  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4490  // Pick the other block.
4491  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4492  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4493  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4494 }
4495 
4496  void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4497  VPTransformState &State) {
4498  RecurKind RK = RdxDesc.getRecurrenceKind();
4499  if (RK != RecurKind::Add && RK != RecurKind::Mul)
4500  return;
4501 
4502  Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4503  assert(LoopExitInstr && "null loop exit instruction");
4504  SmallVector<Instruction *, 8> Worklist;
4505  SmallPtrSet<Instruction *, 8> Visited;
4506  Worklist.push_back(LoopExitInstr);
4507  Visited.insert(LoopExitInstr);
4508 
4509  while (!Worklist.empty()) {
4510  Instruction *Cur = Worklist.pop_back_val();
4511  if (isa<OverflowingBinaryOperator>(Cur))
4512  for (unsigned Part = 0; Part < UF; ++Part) {
4513  Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4514  cast<Instruction>(V)->dropPoisonGeneratingFlags();
4515  }
4516 
4517  for (User *U : Cur->users()) {
4518  Instruction *UI = cast<Instruction>(U);
4519  if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4520  Visited.insert(UI).second)
4521  Worklist.push_back(UI);
4522  }
4523  }
4524 }
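// Illustrative sketch (hypothetical IR, not taken from this file): why the
// wrap flags must be dropped. Vectorizing an add/mul reduction reassociates
// the scalar chain into per-lane partial sums, and an intermediate lane value
// may overflow even when the original scalar sum never does, so keeping the
// flags would introduce poison:
//
//   %sum     = add nsw i32 %sum.phi, %x              ; scalar, no overflow
//   %sum.vec = add <4 x i32> %sum.phi.vec, %x.vec    ; nsw dropped above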
4525 
4526  void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4527  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4528  if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4529  // Some phis were already hand updated by the reduction and recurrence
4530  // code above, leave them alone.
4531  continue;
4532 
4533  auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4534  // Non-instruction incoming values will have only one value.
4535 
4536  VPLane Lane = VPLane::getFirstLane();
4537  if (isa<Instruction>(IncomingValue) &&
4538  !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4539  VF))
4540  Lane = VPLane::getLastLaneForVF(VF);
4541 
4542  // Can be a loop invariant incoming value or the last scalar value to be
4543  // extracted from the vectorized loop.
4544  Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4545  Value *lastIncomingValue =
4546  OrigLoop->isLoopInvariant(IncomingValue)
4547  ? IncomingValue
4548  : State.get(State.Plan->getVPValue(IncomingValue),
4549  VPIteration(UF - 1, Lane));
4550  LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4551  }
4552 }
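// Illustrative sketch (hypothetical IR, not taken from this file; names are
// made up): each LCSSA phi in the exit block gains one extra incoming value
// from the middle block. For a non-uniform value this is the last lane of the
// last unrolled part:
//
//   middle.block:
//     %last = extractelement <4 x i32> %val.part1, i32 3
//   exit:
//     %v.lcssa = phi i32 [ %v, %loop.latch ], [ %last, %middle.block ]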
4553 
4554  void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4555  // The basic block and loop containing the predicated instruction.
4556  auto *PredBB = PredInst->getParent();
4557  auto *VectorLoop = LI->getLoopFor(PredBB);
4558 
4559  // Initialize a worklist with the operands of the predicated instruction.
4560  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4561 
4562  // Holds instructions that we need to analyze again. An instruction may be
4563  // reanalyzed if we don't yet know if we can sink it or not.
4564  SmallVector<Instruction *, 8> InstsToReanalyze;
4565 
4566  // Returns true if a given use occurs in the predicated block. Phi nodes use
4567  // their operands in their corresponding predecessor blocks.