LLVM 8.0.0svn
LoopVectorize.cpp
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR.
12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13 // of instructions in order to estimate the profitability of vectorization.
14 //
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
18 //
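// As an illustrative sketch (names and VF are made up for the example), a
// scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each iteration processes VF elements at
// once (say VF = 4), with any remaining iterations handled by the scalar
// epilogue loop:
//
//   for (int i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];   // one wide SIMD add per iteration
//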
19 // This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 // of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 // widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 // of vectorization. It decides on the optimal vector width, which
27 // can be one, if vectorization is not profitable.
28 //
29 // There is an ongoing development effort to migrate the loop vectorizer to the
30 // VPlan infrastructure and to introduce outer loop vectorization support (see
31 // docs/Proposal/VectorizationPlan.rst and
32 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
33 // purpose, we temporarily introduced the VPlan-native vectorization path: an
34 // alternative vectorization path that is natively implemented on top of the
35 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
36 //
37 //===----------------------------------------------------------------------===//
38 //
39 // The reduction-variable vectorization is based on the paper:
40 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
41 //
42 // Variable uniformity checks are inspired by:
43 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
44 //
45 // The interleaved access vectorization is based on the paper:
46 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
47 // Data for SIMD
48 //
49 // Other ideas/concepts are from:
50 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
51 //
52 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
53 // Vectorizing Compilers.
54 //
55 //===----------------------------------------------------------------------===//
56 
59 #include "VPRecipeBuilder.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/None.h"
69 #include "llvm/ADT/Optional.h"
70 #include "llvm/ADT/STLExtras.h"
71 #include "llvm/ADT/SetVector.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
81 #include "llvm/Analysis/CFG.h"
87 #include "llvm/Analysis/LoopInfo.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugLoc.h"
104 #include "llvm/IR/DerivedTypes.h"
105 #include "llvm/IR/DiagnosticInfo.h"
106 #include "llvm/IR/Dominators.h"
107 #include "llvm/IR/Function.h"
108 #include "llvm/IR/IRBuilder.h"
109 #include "llvm/IR/InstrTypes.h"
110 #include "llvm/IR/Instruction.h"
111 #include "llvm/IR/Instructions.h"
112 #include "llvm/IR/IntrinsicInst.h"
113 #include "llvm/IR/Intrinsics.h"
114 #include "llvm/IR/LLVMContext.h"
115 #include "llvm/IR/Metadata.h"
116 #include "llvm/IR/Module.h"
117 #include "llvm/IR/Operator.h"
118 #include "llvm/IR/Type.h"
119 #include "llvm/IR/Use.h"
120 #include "llvm/IR/User.h"
121 #include "llvm/IR/Value.h"
122 #include "llvm/IR/ValueHandle.h"
123 #include "llvm/IR/Verifier.h"
124 #include "llvm/Pass.h"
125 #include "llvm/Support/Casting.h"
127 #include "llvm/Support/Compiler.h"
128 #include "llvm/Support/Debug.h"
130 #include "llvm/Support/MathExtras.h"
137 #include <algorithm>
138 #include <cassert>
139 #include <cstdint>
140 #include <cstdlib>
141 #include <functional>
142 #include <iterator>
143 #include <limits>
144 #include <memory>
145 #include <string>
146 #include <tuple>
147 #include <utility>
148 #include <vector>
149 
150 using namespace llvm;
151 
152 #define LV_NAME "loop-vectorize"
153 #define DEBUG_TYPE LV_NAME
154 
155 STATISTIC(LoopsVectorized, "Number of loops vectorized");
156 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
157 
158 /// Loops with a known constant trip count below this number are vectorized only
159 /// if no scalar iteration overheads are incurred.
161  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
162  cl::desc("Loops with a constant trip count that is smaller than this "
163  "value are vectorized only if no scalar iteration overheads "
164  "are incurred."));
165 
167  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
168  cl::desc("Maximize bandwidth when selecting vectorization factor which "
169  "will be determined by the smallest type in loop."));
170 
172  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
173  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
174 
176  "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
177  cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
178 
179 /// We don't interleave loops with a known constant trip count below this
180 /// number.
181 static const unsigned TinyTripCountInterleaveThreshold = 128;
182 
184  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
185  cl::desc("A flag that overrides the target's number of scalar registers."));
186 
188  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
189  cl::desc("A flag that overrides the target's number of vector registers."));
190 
192  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
193  cl::desc("A flag that overrides the target's max interleave factor for "
194  "scalar loops."));
195 
197  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
198  cl::desc("A flag that overrides the target's max interleave factor for "
199  "vectorized loops."));
200 
202  "force-target-instruction-cost", cl::init(0), cl::Hidden,
203  cl::desc("A flag that overrides the target's expected cost for "
204  "an instruction to a single constant value. Mostly "
205  "useful for getting consistent testing."));
206 
208  "small-loop-cost", cl::init(20), cl::Hidden,
209  cl::desc(
210  "The cost of a loop that is considered 'small' by the interleaver."));
211 
213  "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
214  cl::desc("Enable the use of the block frequency analysis to access PGO "
215  "heuristics minimizing code growth in cold regions and being more "
216  "aggressive in hot regions."));
217 
218 // Runtime interleave loops for load/store throughput.
220  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
221  cl::desc(
222  "Enable runtime interleaving until load/store ports are saturated"));
223 
224 /// The number of stores in a loop that are allowed to need predication.
226  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
227  cl::desc("Max number of stores to be predicated behind an if."));
228 
230  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
231  cl::desc("Count the induction variable only once when interleaving"));
232 
234  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
235  cl::desc("Enable if predication of stores during vectorization."));
236 
238  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
239  cl::desc("The maximum interleave count to use when interleaving a scalar "
240  "reduction in a nested loop."));
241 
243  "enable-vplan-native-path", cl::init(false), cl::Hidden,
244  cl::desc("Enable VPlan-native vectorization path with "
245  "support for outer loop vectorization."));
246 
247 // This flag enables the stress testing of the VPlan H-CFG construction in the
248 // VPlan-native vectorization path. It must be used in conjunction with
249 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
250 // verification of the H-CFGs built.
252  "vplan-build-stress-test", cl::init(false), cl::Hidden,
253  cl::desc(
254  "Build VPlan for every supported loop nest in the function and bail "
255  "out right after the build (stress test the VPlan H-CFG construction "
256  "in the VPlan-native vectorization path)."));
257 
258 /// A helper function for converting Scalar types to vector types.
259 /// If the incoming type is void, we return void. If the VF is 1, we return
260 /// the scalar type.
261 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
262  if (Scalar->isVoidTy() || VF == 1)
263  return Scalar;
264  return VectorType::get(Scalar, VF);
265 }
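// For example, ToVectorTy(i32, 4) yields <4 x i32>, while ToVectorTy(i32, 1)
// and ToVectorTy(void, VF) return the scalar type unchanged.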
266 
267 /// A helper function that returns the type of loaded or stored value.
269  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
270  "Expected Load or Store instruction");
271  if (auto *LI = dyn_cast<LoadInst>(I))
272  return LI->getType();
273  return cast<StoreInst>(I)->getValueOperand()->getType();
274 }
275 
276 /// A helper function that returns true if the given type is irregular. The
277 /// type is irregular if its allocated size doesn't equal the store size of an
278 /// element of the corresponding vector type at the given vectorization factor.
279 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
280  // Determine if an array of VF elements of type Ty is "bitcast compatible"
281  // with a <VF x Ty> vector.
282  if (VF > 1) {
283  auto *VectorTy = VectorType::get(Ty, VF);
284  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
285  }
286 
287  // If the vectorization factor is one, we just check if an array of type Ty
288  // requires padding between elements.
289  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
290 }
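// For example, under a typical data layout an i32 is regular (its alloc size
// and store size are both 4 bytes), whereas an i24 is irregular: each array
// element occupies 4 bytes but stores only 3, so padding would appear between
// the elements relative to a packed <VF x i24> layout.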
291 
292 /// A helper function that returns the reciprocal of the block probability of
293 /// predicated blocks. If we return X, we are assuming the predicated block
294 /// will execute once for every X iterations of the loop header.
295 ///
296 /// TODO: We should use actual block probability here, if available. Currently,
297 /// we always assume predicated blocks have a 50% chance of executing.
298 static unsigned getReciprocalPredBlockProb() { return 2; }
299 
300 /// A helper function that adds a 'fast' flag to floating-point operations.
302  if (isa<FPMathOperator>(V)) {
303  FastMathFlags Flags;
304  Flags.setFast();
305  cast<Instruction>(V)->setFastMathFlags(Flags);
306  }
307  return V;
308 }
309 
310 /// A helper function that returns an integer or floating-point constant with
311 /// value C.
312 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
313  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
314  : ConstantFP::get(Ty, C);
315 }
316 
317 namespace llvm {
318 
319 /// InnerLoopVectorizer vectorizes loops which contain only one basic
320 /// block to a specified vectorization factor (VF).
321 /// This class performs the widening of scalars into vectors, or multiple
322 /// scalars. This class also implements the following features:
323 /// * It inserts an epilogue loop for handling loops that don't have iteration
324 /// counts that are known to be a multiple of the vectorization factor.
325 /// * It handles the code generation for reduction variables.
326 /// * Scalarization (implementation using scalars) of un-vectorizable
327 /// instructions.
328 /// InnerLoopVectorizer does not perform any vectorization-legality
329 /// checks, and relies on the caller to check for the different legality
330 /// aspects. The InnerLoopVectorizer relies on the
331 /// LoopVectorizationLegality class to provide information about the induction
332  /// and reduction variables that were found for a given vectorization factor.
334 public:
337  const TargetLibraryInfo *TLI,
339  OptimizationRemarkEmitter *ORE, unsigned VecWidth,
340  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
342  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
343  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
344  Builder(PSE.getSE()->getContext()),
345  VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
346  virtual ~InnerLoopVectorizer() = default;
347 
348  /// Create a new empty loop. Unlink the old loop and connect the new one.
349  /// Return the pre-header block of the new loop.
351 
352  /// Widen a single instruction within the innermost loop.
354 
355  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
356  void fixVectorizedLoop();
357 
358  // Return true if any runtime check is added.
360 
361  /// A type for vectorized values in the new loop. Each value from the
362  /// original loop, when vectorized, is represented by UF vector values in the
363  /// new unrolled loop, where UF is the unroll factor.
365 
366  /// Vectorize a single PHINode in a block. This method handles the induction
367  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
368  /// arbitrary length vectors.
369  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
370 
371  /// A helper function to scalarize a single Instruction in the innermost loop.
372  /// Generates a sequence of scalar instances for each lane between \p MinLane
373  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
374  /// inclusive.
375  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
376  bool IfPredicateInstr);
377 
378  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
379  /// is provided, the integer induction variable will first be truncated to
380  /// the corresponding type.
381  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
382 
383  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
384  /// vector or scalar value on-demand if one is not yet available. When
385  /// vectorizing a loop, we visit the definition of an instruction before its
386  /// uses. When visiting the definition, we either vectorize or scalarize the
387  /// instruction, creating an entry for it in the corresponding map. (In some
388  /// cases, such as induction variables, we will create both vector and scalar
389  /// entries.) Then, as we encounter uses of the definition, we derive values
390  /// for each scalar or vector use unless such a value is already available.
391  /// For example, if we scalarize a definition and one of its uses is vector,
392  /// we build the required vector on-demand with an insertelement sequence
393  /// when visiting the use. Otherwise, if the use is scalar, we can use the
394  /// existing scalar definition.
395  ///
396  /// Return a value in the new loop corresponding to \p V from the original
397  /// loop at unroll index \p Part. If the value has already been vectorized,
398  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
399  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
400  /// a new vector value on-demand by inserting the scalar values into a vector
401  /// with an insertelement sequence. If the value has been neither vectorized
402  /// nor scalarized, it must be loop invariant, so we simply broadcast the
403  /// value into a vector.
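 395  /// For example, if %s0..%s3 are the scalar instances of a value at VF == 4
 395  /// (illustrative IR), the vector form is built on demand roughly as:
 395  ///   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
 395  ///   %v1 = insertelement <4 x i32> %v0,   i32 %s1, i32 1
 395  /// and so on for lanes 2 and 3.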
404  Value *getOrCreateVectorValue(Value *V, unsigned Part);
405 
406  /// Return a value in the new loop corresponding to \p V from the original
407  /// loop at unroll and vector indices \p Instance. If the value has been
408  /// vectorized but not scalarized, the necessary extractelement instruction
409  /// will be generated.
410  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
411 
412  /// Construct the vector value of a scalarized value \p V one lane at a time.
413  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
414 
415  /// Try to vectorize the interleaved access group that \p Instr belongs to,
416  /// optionally masking the vector operations if \p BlockInMask is non-null.
418  VectorParts *BlockInMask = nullptr);
419 
420  /// Vectorize Load and Store instructions, optionally masking the vector
421  /// operations if \p BlockInMask is non-null.
423  VectorParts *BlockInMask = nullptr);
424 
425  /// Set the debug location in the builder using the debug location in
426  /// the instruction.
427  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
428 
429  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
430  void fixNonInductionPHIs(void);
431 
432 protected:
434 
435  /// A small list of PHINodes.
437 
438  /// A type for scalarized values in the new loop. Each value from the
439  /// original loop, when scalarized, is represented by UF x VF scalar values
440  /// in the new unrolled loop, where UF is the unroll factor and VF is the
441  /// vectorization factor.
443 
444  /// Set up the values of the IVs correctly when exiting the vector loop.
445  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
446  Value *CountRoundDown, Value *EndValue,
447  BasicBlock *MiddleBlock);
448 
449  /// Create a new induction variable inside L.
450  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
451  Value *Step, Instruction *DL);
452 
453  /// Handle all cross-iteration phis in the header.
454  void fixCrossIterationPHIs();
455 
456  /// Fix a first-order recurrence. This is the second phase of vectorizing
457  /// this phi node.
458  void fixFirstOrderRecurrence(PHINode *Phi);
459 
460  /// Fix a reduction cross-iteration phi. This is the second phase of
461  /// vectorizing this phi node.
462  void fixReduction(PHINode *Phi);
463 
464  /// The loop exit block may have single-value PHI nodes with some
465  /// incoming value. While vectorizing, we only handled real values
466  /// that were defined inside the loop, and we should have one value for
467  /// each predecessor of its parent basic block. See PR14725.
468  void fixLCSSAPHIs();
469 
470  /// Iteratively sink the scalarized operands of a predicated instruction into
471  /// the block that was created for it.
472  void sinkScalarOperands(Instruction *PredInst);
473 
474  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
475  /// represented as.
477 
478  /// Insert the new loop into the loop hierarchy and pass manager
479  /// and update the analysis passes.
480  void updateAnalysis();
481 
482  /// Create a broadcast instruction. This method generates a broadcast
483  /// instruction (shuffle) for loop invariant values and for the induction
484  /// value. If this is the induction variable then we extend it to N, N+1, ...
485  /// this is needed because each iteration in the loop corresponds to a SIMD
486  /// element.
487  virtual Value *getBroadcastInstrs(Value *V);
488 
489  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
490  /// to each vector element of Val. The sequence starts at StartIdx.
491  /// \p Opcode is relevant for FP induction variables.
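 491  /// For example, with Val = <%x, %x, %x, %x>, StartIdx = 0, and Step = %s
 491  /// (illustrative values), the result is conceptually
 491  /// <%x + 0*%s, %x + 1*%s, %x + 2*%s, %x + 3*%s>.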
492  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
493  Instruction::BinaryOps Opcode =
494  Instruction::BinaryOpsEnd);
495 
496  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
497  /// variable on which to base the steps, \p Step is the size of the step, and
498  /// \p EntryVal is the value from the original loop that maps to the steps.
499  /// Note that \p EntryVal doesn't have to be an induction variable - it
500  /// can also be a truncate instruction.
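 500  /// For example, in the common non-uniform case with VF = 4 and UF = 2
 500  /// (illustrative values), this produces the scalar values
 500  /// ScalarIV + {0, 1, ..., 7} * Step, grouped per unroll part.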
501  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
502  const InductionDescriptor &ID);
503 
504  /// Create a vector induction phi node based on an existing scalar one. \p
505  /// EntryVal is the value from the original loop that maps to the vector phi
506  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
507  /// truncate instruction, instead of widening the original IV, we widen a
508  /// version of the IV truncated to \p EntryVal's type.
510  Value *Step, Instruction *EntryVal);
511 
512  /// Returns true if an instruction \p I should be scalarized instead of
513  /// vectorized for the chosen vectorization factor.
515 
516  /// Returns true if we should generate a scalar version of \p IV.
517  bool needsScalarInduction(Instruction *IV) const;
518 
519  /// If there is a cast involved in the induction variable \p ID, which should
520  /// be ignored in the vectorized loop body, this function records the
521  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
522  /// cast. We had already proved that the casted Phi is equal to the uncasted
523  /// Phi in the vectorized loop (under a runtime guard), and therefore
524  /// there is no need to vectorize the cast - the same value can be used in the
525  /// vector loop for both the Phi and the cast.
526  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
527  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
528  ///
529  /// \p EntryVal is the value from the original loop that maps to the vector
530  /// phi node and is used to distinguish what is the IV currently being
531  /// processed - original one (if \p EntryVal is a phi corresponding to the
532  /// original IV) or the "newly-created" one based on the proof mentioned above
533  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
534  /// latter case \p EntryVal is a TruncInst and we must not record anything for
535  /// that IV, but it's error-prone to expect callers of this routine to care
536  /// about that, hence this explicit parameter.
538  const Instruction *EntryVal,
539  Value *VectorLoopValue,
540  unsigned Part,
541  unsigned Lane = UINT_MAX);
542 
543  /// Generate a shuffle sequence that will reverse the vector Vec.
544  virtual Value *reverseVector(Value *Vec);
545 
546  /// Returns (and creates if needed) the original loop trip count.
547  Value *getOrCreateTripCount(Loop *NewLoop);
548 
549  /// Returns (and creates if needed) the trip count of the widened loop.
551 
552  /// Returns a bitcasted value to the requested vector type.
553  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
555  const DataLayout &DL);
556 
557  /// Emit a bypass check to see if the vector trip count is zero, including if
558  /// it overflows.
560 
561  /// Emit a bypass check to see if all of the SCEV assumptions we've
562  /// had to make are correct.
563  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
564 
565  /// Emit bypass checks to check any memory assumptions we may have made.
566  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
567 
568  /// Compute the transformed value of Index at offset StartValue using step
569  /// StepValue.
570  /// For integer induction, returns StartValue + Index * StepValue.
571  /// For pointer induction, returns StartValue[Index * StepValue].
572  /// FIXME: The newly created binary instructions should contain nsw/nuw
573  /// flags, which can be found from the original scalar operations.
575  const DataLayout &DL,
576  const InductionDescriptor &ID) const;
577 
578  /// Add additional metadata to \p To that was not present on \p Orig.
579  ///
580  /// Currently this is used to add the noalias annotations based on the
581  /// inserted memchecks. Use this for instructions that are *cloned* into the
582  /// vector loop.
583  void addNewMetadata(Instruction *To, const Instruction *Orig);
584 
585  /// Add metadata from one instruction to another.
586  ///
587  /// This includes both the original MDs from \p From and additional ones (\see
588  /// addNewMetadata). Use this for *newly created* instructions in the vector
589  /// loop.
591 
592  /// Similar to the previous function but it adds the metadata to a
593  /// vector of instructions.
595 
596  /// The original loop.
598 
599  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
600  /// dynamic knowledge to simplify SCEV expressions and converts them to a
601  /// more usable form.
603 
604  /// Loop Info.
606 
607  /// Dominator Tree.
609 
610  /// Alias Analysis.
612 
613  /// Target Library Info.
615 
616  /// Target Transform Info.
618 
619  /// Assumption Cache.
621 
622  /// Interface to emit optimization remarks.
624 
625  /// LoopVersioning. It's only set up (non-null) if memchecks were
626  /// used.
627  ///
628  /// This is currently only used to add no-alias metadata based on the
629  /// memchecks. The actual versioning is performed manually.
630  std::unique_ptr<LoopVersioning> LVer;
631 
632  /// The vectorization SIMD factor to use. Each vector will have this many
633  /// vector elements.
634  unsigned VF;
635 
636  /// The vectorization unroll factor to use. Each scalar is vectorized to this
637  /// many different vector instructions.
638  unsigned UF;
639 
640  /// The builder that we use.
642 
643  // --- Vectorization state ---
644 
645  /// The vector-loop preheader.
647 
648  /// The scalar-loop preheader.
650 
651  /// Middle Block between the vector and the scalar.
653 
654  /// The ExitBlock of the scalar loop.
656 
657  /// The vector loop body.
659 
660  /// The scalar loop body.
662 
663  /// A list of all bypass blocks. The first block is the entry of the loop.
665 
666  /// The new Induction variable which was added to the new block.
667  PHINode *Induction = nullptr;
668 
669  /// The induction variable of the old basic block.
670  PHINode *OldInduction = nullptr;
671 
672  /// Maps values from the original loop to their corresponding values in the
673  /// vectorized loop. A key value can map to either vector values, scalar
674  /// values or both kinds of values, depending on whether the key was
675  /// vectorized and scalarized.
677 
678  /// Store instructions that were predicated.
680 
681  /// Trip count of the original loop.
682  Value *TripCount = nullptr;
683 
684  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
685  Value *VectorTripCount = nullptr;
686 
687  /// The legality analysis.
689 
690  /// The profitability analysis.
692 
693  // Record whether runtime checks are added.
694  bool AddedSafetyChecks = false;
695 
696  // Holds the end values for each induction variable. We save the end values
697  // so we can later fix-up the external users of the induction variables.
699 
700  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
701  // fixed up at the end of vector code generation.
703 };
704 
706 public:
709  const TargetLibraryInfo *TLI,
711  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
714  : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
715  UnrollFactor, LVL, CM) {}
716 
717 private:
718  Value *getBroadcastInstrs(Value *V) override;
719  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
720  Instruction::BinaryOps Opcode =
721  Instruction::BinaryOpsEnd) override;
722  Value *reverseVector(Value *Vec) override;
723 };
724 
725 } // end namespace llvm
726 
727 /// Look for a meaningful debug location on the instruction or its
728 /// operands.
730  if (!I)
731  return I;
732 
733  DebugLoc Empty;
734  if (I->getDebugLoc() != Empty)
735  return I;
736 
737  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
738  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
739  if (OpInst->getDebugLoc() != Empty)
740  return OpInst;
741  }
742 
743  return I;
744 }
745 
747  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
748  const DILocation *DIL = Inst->getDebugLoc();
749  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
750  !isa<DbgInfoIntrinsic>(Inst))
752  else
754  } else
756 }
757 
758 #ifndef NDEBUG
759 /// \return string containing a file name and a line # for the given loop.
760 static std::string getDebugLocString(const Loop *L) {
761  std::string Result;
762  if (L) {
763  raw_string_ostream OS(Result);
764  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
765  LoopDbgLoc.print(OS);
766  else
767  // Just print the module name.
768  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
769  OS.flush();
770  }
771  return Result;
772 }
773 #endif
774 
776  const Instruction *Orig) {
777  // If the loop was versioned with memchecks, add the corresponding no-alias
778  // metadata.
779  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
780  LVer->annotateInstWithNoAlias(To, Orig);
781 }
782 
784  Instruction *From) {
785  propagateMetadata(To, From);
786  addNewMetadata(To, From);
787 }
788 
790  Instruction *From) {
791  for (Value *V : To) {
792  if (Instruction *I = dyn_cast<Instruction>(V))
793  addMetadata(I, From);
794  }
795 }
796 
797 static void emitMissedWarning(Function *F, Loop *L,
798  const LoopVectorizeHints &LH,
800  LH.emitRemarkWithHints();
801 
803  if (LH.getWidth() != 1)
805  DEBUG_TYPE, "FailedRequestedVectorization",
806  L->getStartLoc(), L->getHeader())
807  << "loop not vectorized: "
808  << "failed explicitly specified loop vectorization");
809  else if (LH.getInterleave() != 1)
811  DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
812  L->getHeader())
813  << "loop not interleaved: "
814  << "failed explicitly specified loop interleaving");
815  }
816 }
817 
818 namespace llvm {
819 
820 /// LoopVectorizationCostModel - estimates the expected speedups due to
821 /// vectorization.
822 /// In many cases vectorization is not profitable. This can happen for a
823 /// number of reasons. In this class we mainly attempt to predict the
824 /// expected speedup/slowdowns due to the supported instruction set. We use the
825 /// TargetTransformInfo to query the different backends for the cost of
826 /// different operations.
828 public:
831  const TargetTransformInfo &TTI,
832  const TargetLibraryInfo *TLI, DemandedBits *DB,
835  const LoopVectorizeHints *Hints,
837  : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
838  AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
839 
840  /// \return An upper bound for the vectorization factor, or None if
841  /// vectorization should be avoided up front.
842  Optional<unsigned> computeMaxVF(bool OptForSize);
843 
844  /// \return The most profitable vectorization factor and the cost of that VF.
845  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
846  /// that vectorization factor is selected, provided vectorization is
847  /// possible.
848  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
849 
850  /// Setup cost-based decisions for user vectorization factor.
851  void selectUserVectorizationFactor(unsigned UserVF) {
852  collectUniformsAndScalars(UserVF);
853  collectInstsToScalarize(UserVF);
854  }
855 
856  /// \return The size (in bits) of the smallest and widest types in the code
857  /// that needs to be vectorized. We ignore values that remain scalar such as
858  /// 64 bit loop indices.
859  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
860 
861  /// \return The desired interleave count.
862  /// If interleave count has been specified by metadata it will be returned.
863  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
864  /// are the selected vectorization factor and the cost of the selected VF.
865  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
866  unsigned LoopCost);
867 
868  /// A memory access instruction may be vectorized in more than one way.
869  /// The form an instruction takes after vectorization depends on its cost.
870  /// This function makes cost-based decisions for Load/Store instructions
871  /// and collects them in a map. This decision map is used for building
872  /// the lists of loop-uniform and loop-scalar instructions.
873  /// The calculated cost is saved with the widening decision in order to
874  /// avoid redundant calculations.
875  void setCostBasedWideningDecision(unsigned VF);
876 
877  /// A struct that represents some properties of the register usage
878  /// of a loop.
879  struct RegisterUsage {
880  /// Holds the number of loop invariant values that are used in the loop.
882 
883  /// Holds the maximum number of concurrent live intervals in the loop.
884  unsigned MaxLocalUsers;
885  };
886 
887  /// \return Returns information about the register usages of the loop for the
888  /// given vectorization factors.
889  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
890 
891  /// Collect values we want to ignore in the cost model.
892  void collectValuesToIgnore();
893 
894  /// \returns The smallest bitwidth each instruction can be represented with.
895  /// The vector equivalents of these instructions should be truncated to this
896  /// type.
898  return MinBWs;
899  }
900 
901  /// \returns True if it is more profitable to scalarize instruction \p I for
902  /// vectorization factor \p VF.
903  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
904  assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
905 
906  // Cost model is not run in the VPlan-native path - return conservative
907  // result until this changes.
909  return false;
910 
911  auto Scalars = InstsToScalarize.find(VF);
912  assert(Scalars != InstsToScalarize.end() &&
913  "VF not yet analyzed for scalarization profitability");
914  return Scalars->second.find(I) != Scalars->second.end();
915  }
916 
917  /// Returns true if \p I is known to be uniform after vectorization.
918  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
919  if (VF == 1)
920  return true;
921 
922  // Cost model is not run in the VPlan-native path - return conservative
923  // result until this changes.
925  return false;
926 
927  auto UniformsPerVF = Uniforms.find(VF);
928  assert(UniformsPerVF != Uniforms.end() &&
929  "VF not yet analyzed for uniformity");
930  return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
931  }
932 
933  /// Returns true if \p I is known to be scalar after vectorization.
934  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
935  if (VF == 1)
936  return true;
937 
938  // Cost model is not run in the VPlan-native path - return conservative
939  // result until this changes.
941  return false;
942 
943  auto ScalarsPerVF = Scalars.find(VF);
944  assert(ScalarsPerVF != Scalars.end() &&
945  "Scalar values are not calculated for VF");
946  return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
947  }
948 
949  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
950  /// for vectorization factor \p VF.
951  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
952  return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
953  !isProfitableToScalarize(I, VF) &&
954  !isScalarAfterVectorization(I, VF);
955  }
956 
957  /// Decision that was taken during cost calculation for a memory instruction.
960  CM_Widen, // For consecutive accesses with stride +1.
961  CM_Widen_Reverse, // For consecutive accesses with stride -1.
964  CM_Scalarize
965  };
966 
967  /// Save vectorization decision \p W and \p Cost taken by the cost model for
968  /// instruction \p I and vector width \p VF.
970  unsigned Cost) {
971  assert(VF >= 2 && "Expected VF >=2");
972  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
973  }
974 
975  /// Save vectorization decision \p W and \p Cost taken by the cost model for
976  /// interleaving group \p Grp and vector width \p VF.
977  void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
978  InstWidening W, unsigned Cost) {
979  assert(VF >= 2 && "Expected VF >=2");
980  /// Broadcast this decision to all instructions inside the group.
981  /// But the cost will be assigned to one instruction only.
982  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
983  if (auto *I = Grp->getMember(i)) {
984  if (Grp->getInsertPos() == I)
985  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
986  else
987  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
988  }
989  }
990  }
991 
992  /// Return the cost model decision for the given instruction \p I and vector
993  /// width \p VF. Return CM_Unknown if this instruction did not pass
994  /// through the cost modeling.
996  assert(VF >= 2 && "Expected VF >=2");
997 
998  // Cost model is not run in the VPlan-native path - return conservative
999  // result until this changes.
1001  return CM_GatherScatter;
1002 
1003  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1004  auto Itr = WideningDecisions.find(InstOnVF);
1005  if (Itr == WideningDecisions.end())
1006  return CM_Unknown;
1007  return Itr->second.first;
1008  }
1009 
1010  /// Return the vectorization cost for the given instruction \p I and vector
1011  /// width \p VF.
1012  unsigned getWideningCost(Instruction *I, unsigned VF) {
1013  assert(VF >= 2 && "Expected VF >=2");
1014  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1015  assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1016  "The cost is not calculated");
1017  return WideningDecisions[InstOnVF].second;
1018  }
1019 
1020  /// Return True if instruction \p I is an optimizable truncate whose operand
1021  /// is an induction variable. Such a truncate will be removed by adding a new
1022  /// induction variable with the destination type.
1023  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1024  // If the instruction is not a truncate, return false.
1025  auto *Trunc = dyn_cast<TruncInst>(I);
1026  if (!Trunc)
1027  return false;
1028 
1029  // Get the source and destination types of the truncate.
1030  Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1031  Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1032 
1033  // If the truncate is free for the given types, return false. Replacing a
1034  // free truncate with an induction variable would add an induction variable
1035  // update instruction to each iteration of the loop. We exclude from this
1036  // check the primary induction variable since it will need an update
1037  // instruction regardless.
1038  Value *Op = Trunc->getOperand(0);
1039  if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1040  return false;
1041 
1042  // If the truncated value is not an induction variable, return false.
1043  return Legal->isInductionPhi(Op);
1044  }
1045 
1046  /// Collects the instructions to scalarize for each predicated instruction in
1047  /// the loop.
1048  void collectInstsToScalarize(unsigned VF);
1049 
1050  /// Collect Uniform and Scalar values for the given \p VF.
1051  /// The sets depend on CM decision for Load/Store instructions
1052  /// that may be vectorized as interleave, gather-scatter or scalarized.
1053  void collectUniformsAndScalars(unsigned VF) {
1054  // Do the analysis once.
1055  if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1056  return;
1057  setCostBasedWideningDecision(VF);
1058  collectLoopUniforms(VF);
1059  collectLoopScalars(VF);
1060  }
1061 
1062  /// Returns true if the target machine supports masked store operation
1063  /// for the given \p DataType and kind of access to \p Ptr.
1065  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1066  }
1067 
1068  /// Returns true if the target machine supports masked load operation
1069  /// for the given \p DataType and kind of access to \p Ptr.
1071  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1072  }
1073 
1074  /// Returns true if the target machine supports masked scatter operation
1075  /// for the given \p DataType.
1077  return TTI.isLegalMaskedScatter(DataType);
1078  }
1079 
1080  /// Returns true if the target machine supports masked gather operation
1081  /// for the given \p DataType.
1083  return TTI.isLegalMaskedGather(DataType);
1084  }
1085 
1086  /// Returns true if the target machine can represent \p V as a masked gather
1087  /// or scatter operation.
1089  bool LI = isa<LoadInst>(V);
1090  bool SI = isa<StoreInst>(V);
1091  if (!LI && !SI)
1092  return false;
1093  auto *Ty = getMemInstValueType(V);
1094  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1095  }
1096 
1097  /// Returns true if \p I is an instruction that will be scalarized with
1098  /// predication. Such instructions include conditional stores and
1099  /// instructions that may divide by zero.
1100  /// If a non-zero VF has been calculated, we check if \p I will be scalarized
1101  /// with predication for that VF.
1102  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1103 
1104  // Returns true if \p I is an instruction that will be predicated either
1105  // through scalar predication or masked load/store or masked gather/scatter.
1106  // Superset of instructions that return true for isScalarWithPredication.
1109  return false;
1110  // Loads and stores that need some form of masked operation are predicated
1111  // instructions.
1112  if (isa<LoadInst>(I) || isa<StoreInst>(I))
1113  return Legal->isMaskRequired(I);
1114  return isScalarWithPredication(I);
1115  }
1116 
1117  /// Returns true if \p I is a memory instruction with consecutive memory
1118  /// access that can be widened.
1119  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1120 
1121  /// Returns true if \p I is a memory instruction in an interleaved-group
1122  /// of memory accesses that can be vectorized with wide vector loads/stores
1123  /// and shuffles.
1124  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1125 
1126  /// Check if \p Instr belongs to any interleaved access group.
1128  return InterleaveInfo.isInterleaved(Instr);
1129  }
1130 
1131  /// Get the interleaved access group that \p Instr belongs to.
1133  return InterleaveInfo.getInterleaveGroup(Instr);
1134  }
1135 
1136  /// Returns true if an interleaved group requires a scalar iteration
1137  /// to handle accesses with gaps.
1138  bool requiresScalarEpilogue() const {
1139  return InterleaveInfo.requiresScalarEpilogue();
1140  }
1141 
1142 private:
1143  unsigned NumPredStores = 0;
1144 
1145  /// \return An upper bound for the vectorization factor, larger than zero.
1146  /// One is returned if vectorization should best be avoided due to cost.
1147  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
1148 
1149  /// The vectorization cost is a combination of the cost itself and a boolean
1150  /// indicating whether any of the contributing operations will actually
1151  /// operate on vector values after type legalization in the backend. If this
1152  /// latter value is false, then all operations will be scalarized (i.e. no
1153  /// vectorization has actually taken place).
1156  using VectorizationCostTy = std::pair<unsigned, bool>;
1157 
1158  /// Returns the expected execution cost. The unit of the cost does
1159  /// not matter because we use the 'cost' units to compare different
1160  /// vector widths. The cost that is returned is *not* normalized by
1161  /// the factor width.
1162  VectorizationCostTy expectedCost(unsigned VF);
1163 
1164  /// Returns the execution time cost of an instruction for a given vector
1165  /// width. Vector width of one means scalar.
1166  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1167 
1168  /// The cost-computation logic from getInstructionCost which provides
1169  /// the vector type as an output parameter.
1170  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1171 
1172  /// Calculate vectorization cost of memory instruction \p I.
1173  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1174 
1175  /// The cost computation for scalarized memory instruction.
1176  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1177 
1178  /// The cost computation for interleaving group of memory instructions.
1179  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1180 
1181  /// The cost computation for Gather/Scatter instruction.
1182  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1183 
1184  /// The cost computation for widening instruction \p I with consecutive
1185  /// memory access.
1186  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1187 
1188  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1189  /// Load: scalar load + broadcast.
1190  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1191  /// element)
1192  /// TODO: Test the extra cost of the extract when a loop-variant value is stored.
1193  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1194 
1195  /// Returns whether the instruction is a load or store and will be emitted
1196  /// as a vector operation.
1197  bool isConsecutiveLoadOrStore(Instruction *I);
1198 
1199  /// Returns true if an artificially high cost for emulated masked memrefs
1200  /// should be used.
1201  bool useEmulatedMaskMemRefHack(Instruction *I);
1202 
1203  /// Create an analysis remark that explains why vectorization failed
1204  ///
1205  /// \p RemarkName is the identifier for the remark. \return the remark object
1206  /// that can be streamed to.
1207  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
1208  return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1209  RemarkName, TheLoop);
1210  }
1211 
1212  /// Map of scalar integer values to the smallest bitwidth they can be legally
1213  /// represented as. The vector equivalents of these values should be truncated
1214  /// to this type.
1216 
1217  /// A type representing the costs for instructions if they were to be
1218  /// scalarized rather than vectorized. The entries are Instruction-Cost
1219  /// pairs.
1221 
1222  /// A set containing all BasicBlocks that are known to be present after
1223  /// vectorization as predicated blocks.
1224  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1225 
1226  /// A map holding scalar costs for different vectorization factors. The
1227  /// presence of a cost for an instruction in the mapping indicates that the
1228  /// instruction will be scalarized when vectorizing with the associated
1229  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1230  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1231 
1232  /// Holds the instructions known to be uniform after vectorization.
1233  /// The data is collected per VF.
1235 
1236  /// Holds the instructions known to be scalar after vectorization.
1237  /// The data is collected per VF.
1239 
1240  /// Holds the instructions (address computations) that are forced to be
1241  /// scalarized.
1243 
1244  /// Returns the expected difference in cost from scalarizing the expression
1245  /// feeding a predicated instruction \p PredInst. The instructions to
1246  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1247  /// non-negative return value implies the expression will be scalarized.
1248  /// Currently, only single-use chains are considered for scalarization.
1249  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1250  unsigned VF);
1251 
1252  /// Collect the instructions that are uniform after vectorization. An
1253  /// instruction is uniform if we represent it with a single scalar value in
1254  /// the vectorized loop corresponding to each vector iteration. Examples of
1255  /// uniform instructions include pointer operands of consecutive or
1256  /// interleaved memory accesses. Note that although uniformity implies an
1257  /// instruction will be scalar, the reverse is not true. In general, a
1258  /// scalarized instruction will be represented by VF scalar values in the
1259  /// vectorized loop, each corresponding to an iteration of the original
1260  /// scalar loop.
1261  void collectLoopUniforms(unsigned VF);
1262 
1263  /// Collect the instructions that are scalar after vectorization. An
1264  /// instruction is scalar if it is known to be uniform or will be scalarized
1265  /// during vectorization. Non-uniform scalarized instructions will be
1266  /// represented by VF values in the vectorized loop, each corresponding to an
1267  /// iteration of the original scalar loop.
1268  void collectLoopScalars(unsigned VF);
1269 
1270  /// Keeps cost model vectorization decision and cost for instructions.
1271  /// Right now it is used for memory instructions only.
1273  std::pair<InstWidening, unsigned>>;
1274 
1275  DecisionList WideningDecisions;
1276 
1277 public:
1278  /// The loop that we evaluate.
1280 
1281  /// Predicated scalar evolution analysis.
1283 
1284  /// Loop Info analysis.
1286 
1287  /// Vectorization legality.
1289 
1290  /// Vector target information.
1292 
1293  /// Target Library Info.
1295 
1296  /// Demanded bits analysis.
1298 
1299  /// Assumption cache.
1301 
1302  /// Interface to emit optimization remarks.
1304 
1306 
1307  /// Loop Vectorize Hint.
1309 
1310  /// The interleave access information contains groups of interleaved accesses
1311  /// with the same stride and close to each other.
1313 
1314  /// Values to ignore in the cost model.
1316 
1317  /// Values to ignore in the cost model when VF > 1.
1319 };
1320 
1321 } // end namespace llvm
1322 
1323 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1324 // vectorization. The loop needs to be annotated with #pragma omp simd
1325 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1326 // vector length information is not provided, vectorization is not considered
1327 // explicit. Interleave hints are not allowed either. These limitations will be
1328 // relaxed in the future.
1329 // Please note that we are currently forced to abuse the pragma 'clang loop
1330 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1331 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1332 // provides *explicit vectorization hints* (LV can bypass legal checks and
1333 // assume that vectorization is legal). However, both hints are implemented
1334 // using the same metadata (llvm.loop.vectorize, processed by
1335 // LoopVectorizeHints). This will be fixed in the future when the native IR
1336 // representation for pragma 'omp simd' is introduced.
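// For example, an outer loop annotated as below (illustrative C source) would
// be considered for explicit outer-loop vectorization:
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < n; ++i)        // outer loop
//     for (int j = 0; j < m; ++j)      // inner loop
//       a[i][j] += b[i][j];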
1337 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1339  assert(!OuterLp->empty() && "This is not an outer loop");
1340  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1341 
1342  // Only outer loops with an explicit vectorization hint are supported.
1343  // Unannotated outer loops are ignored.
1344  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1345  return false;
1346 
1347  Function *Fn = OuterLp->getHeader()->getParent();
1348  if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) {
1349  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1350  return false;
1351  }
1352 
1353  if (!Hints.getWidth()) {
1354  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
1355  emitMissedWarning(Fn, OuterLp, Hints, ORE);
1356  return false;
1357  }
1358 
1359  if (Hints.getInterleave() > 1) {
1360  // TODO: Interleave support is future work.
1361  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1362  "outer loops.\n");
1363  emitMissedWarning(Fn, OuterLp, Hints, ORE);
1364  return false;
1365  }
1366 
1367  return true;
1368 }
1369 
1373  // Collect inner loops and outer loops without irreducible control flow. For
1374  // now, only collect outer loops that have explicit vectorization hints. If we
1375  // are stress testing the VPlan H-CFG construction, we collect the outermost
1376  // loop of every loop nest.
1377  if (L.empty() || VPlanBuildStressTest ||
1379  LoopBlocksRPO RPOT(&L);
1380  RPOT.perform(LI);
1381  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1382  V.push_back(&L);
1383  // TODO: Collect inner loops inside marked outer loops in case
1384  // vectorization fails for the outer loop. Do not invoke
1385  // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1386  // already known to be reducible. We can use an inherited attribute for
1387  // that.
1388  return;
1389  }
1390  }
1391  for (Loop *InnerL : L)
1392  collectSupportedLoops(*InnerL, LI, ORE, V);
1393 }
1394 
1395 namespace {
1396 
1397 /// The LoopVectorize Pass.
1398 struct LoopVectorize : public FunctionPass {
1399  /// Pass identification, replacement for typeid
1400  static char ID;
1401 
1402  LoopVectorizePass Impl;
1403 
1404  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
1405  : FunctionPass(ID) {
1406  Impl.DisableUnrolling = NoUnrolling;
1407  Impl.AlwaysVectorize = AlwaysVectorize;
1409  }
1410 
1411  bool runOnFunction(Function &F) override {
1412  if (skipFunction(F))
1413  return false;
1414 
1415  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1416  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1417  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1418  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1419  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1420  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1421  auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1422  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1423  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1424  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1425  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1426  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1427 
1428  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1429  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1430 
1431  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1432  GetLAA, *ORE);
1433  }
1434 
1435  void getAnalysisUsage(AnalysisUsage &AU) const override {
1446 
1447  // We currently do not preserve loopinfo/dominator analyses with outer loop
1448  // vectorization. Until this is addressed, mark these analyses as preserved
1449  // only for non-VPlan-native path.
1450  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1451  if (!EnableVPlanNativePath) {
1454  }
1455 
1458  }
1459 };
1460 
1461 } // end anonymous namespace
1462 
1463 //===----------------------------------------------------------------------===//
1464 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1465 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1466 //===----------------------------------------------------------------------===//
1467 
1469  // We need to place the broadcast of invariant variables outside the loop,
1470  // but only if it's proven safe to do so. Otherwise, the broadcast will be
1471  // inside the vector loop body.
1472  Instruction *Instr = dyn_cast<Instruction>(V);
1473  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1474  (!Instr ||
1476  // Place the code for broadcasting invariant variables in the new preheader.
1478  if (SafeToHoist)
1480 
1481  // Broadcast the scalar into all locations in the vector.
1482  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1483 
1484  return Shuf;
1485 }
1486 
1488  const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1489  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1490  "Expected either an induction phi-node or a truncate of it!");
1491  Value *Start = II.getStartValue();
1492 
1493  // Construct the initial value of the vector IV in the vector loop preheader
1494  auto CurrIP = Builder.saveIP();
1496  if (isa<TruncInst>(EntryVal)) {
1497  assert(Start->getType()->isIntegerTy() &&
1498  "Truncation requires an integer type");
1499  auto *TruncType = cast<IntegerType>(EntryVal->getType());
1500  Step = Builder.CreateTrunc(Step, TruncType);
1501  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1502  }
1503  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1504  Value *SteppedStart =
1505  getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1506 
1507  // We create vector phi nodes for both integer and floating-point induction
1508  // variables. Here, we determine the kind of arithmetic we will perform.
1509  Instruction::BinaryOps AddOp;
1510  Instruction::BinaryOps MulOp;
1511  if (Step->getType()->isIntegerTy()) {
1512  AddOp = Instruction::Add;
1513  MulOp = Instruction::Mul;
1514  } else {
1515  AddOp = II.getInductionOpcode();
1516  MulOp = Instruction::FMul;
1517  }
1518 
1519  // Multiply the vectorization factor by the step using integer or
1520  // floating-point arithmetic as appropriate.
1521  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1522  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1523 
1524  // Create a vector splat to use in the induction update.
1525  //
1526  // FIXME: If the step is non-constant, we create the vector splat with
1527  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1528  // handle a constant vector splat.
1529  Value *SplatVF = isa<Constant>(Mul)
1530  ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1531  : Builder.CreateVectorSplat(VF, Mul);
1532  Builder.restoreIP(CurrIP);
1533 
1534  // We may need to add the step a number of times, depending on the unroll
1535  // factor. The last of those goes into the PHI.
1536  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1537  &*LoopVectorBody->getFirstInsertionPt());
1538  VecInd->setDebugLoc(EntryVal->getDebugLoc());
1539  Instruction *LastInduction = VecInd;
1540  for (unsigned Part = 0; Part < UF; ++Part) {
1541  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1542 
1543  if (isa<TruncInst>(EntryVal))
1544  addMetadata(LastInduction, EntryVal);
1545  recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1546 
1547  LastInduction = cast<Instruction>(addFastMathFlag(
1548  Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1549  LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1550  }
1551 
1552  // Move the last step to the end of the latch block. This ensures consistent
1553  // placement of all induction updates.
1554  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1555  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1556  auto *ICmp = cast<Instruction>(Br->getCondition());
1557  LastInduction->moveBefore(ICmp);
1558  LastInduction->setName("vec.ind.next");
1559 
1560  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1561  VecInd->addIncoming(LastInduction, LoopVectorLatch);
1562 }
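// Sketch of the result (assuming an integer IV with start S, step C, VF = 4,
// UF = 2; names are illustrative): the preheader computes the splats, and the
// vector body receives
//   %vec.ind      = phi [ <S, S+C, S+2C, S+3C>, %vector.ph ], [ %vec.ind.next, %latch ]
//   %step.add     = add %vec.ind, <4C, 4C, 4C, 4C>      ; value for part 1
//   %vec.ind.next = add %step.add, <4C, 4C, 4C, 4C>     ; feeds the phi
// with the final add moved in front of the latch compare, as done above.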
1563 
1564 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1565  return Cost->isScalarAfterVectorization(I, VF) ||
1566  Cost->isProfitableToScalarize(I, VF);
1567 }
1568 
1569 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1570  if (shouldScalarizeInstruction(IV))
1571  return true;
1572  auto isScalarInst = [&](User *U) -> bool {
1573  auto *I = cast<Instruction>(U);
1574  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1575  };
1576  return llvm::any_of(IV->users(), isScalarInst);
1577 }
1578 
1579 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1580  const InductionDescriptor &ID, const Instruction *EntryVal,
1581  Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1582  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1583  "Expected either an induction phi-node or a truncate of it!");
1584 
1585  // This induction variable is not the phi from the original loop but the
1586  // newly-created IV based on the proof that casted Phi is equal to the
1587  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1588  // re-uses the same InductionDescriptor that original IV uses but we don't
1589  // have to do any recording in this case - that is done when original IV is
1590  // processed.
1591  if (isa<TruncInst>(EntryVal))
1592  return;
1593 
1594  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1595  if (Casts.empty())
1596  return;
1597  // Only the first Cast instruction in the Casts vector is of interest.
1598  // The rest of the Casts (if exist) have no uses outside the
1599  // induction update chain itself.
1600  Instruction *CastInst = *Casts.begin();
1601  if (Lane < UINT_MAX)
1602  VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1603  else
1604  VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1605 }
1606 
1607 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1608  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1609  "Primary induction variable must have an integer type");
1610 
1611  auto II = Legal->getInductionVars()->find(IV);
1612  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1613 
1614  auto ID = II->second;
1615  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1616 
1617  // The scalar value to broadcast. This will be derived from the canonical
1618  // induction variable.
1619  Value *ScalarIV = nullptr;
1620 
1621  // The value from the original loop to which we are mapping the new induction
1622  // variable.
1623  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1624 
1625  // True if we have vectorized the induction variable.
1626  auto VectorizedIV = false;
1627 
1628  // Determine if we want a scalar version of the induction variable. This is
1629  // true if the induction variable itself is not widened, or if it has at
1630  // least one user in the loop that is not widened.
1631  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1632 
1633  // Generate code for the induction step. Note that induction steps are
1634  // required to be loop-invariant
1635  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1636  "Induction step should be loop invariant");
1637  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1638  Value *Step = nullptr;
1639  if (PSE.getSE()->isSCEVable(IV->getType())) {
1640  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1641  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1642  &*Builder.GetInsertPoint());
1643  } else {
1644  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1645  }
1646 
1647  // Try to create a new independent vector induction variable. If we can't
1648  // create the phi node, we will splat the scalar induction variable in each
1649  // loop iteration.
1650  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1651  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1652  VectorizedIV = true;
1653  }
1654 
1655  // If we haven't yet vectorized the induction variable, or if we will create
1656  // a scalar one, we need to define the scalar induction variable and step
1657  // values. If we were given a truncation type, truncate the canonical
1658  // induction variable and step. Otherwise, derive these values from the
1659  // induction descriptor.
1660  if (!VectorizedIV || NeedsScalarIV) {
1661  ScalarIV = Induction;
1662  if (IV != OldInduction) {
1663  ScalarIV = IV->getType()->isIntegerTy()
1664  ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1665  : Builder.CreateCast(Instruction::SIToFP, Induction,
1666  IV->getType());
1667  ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1668  ScalarIV->setName("offset.idx");
1669  }
1670  if (Trunc) {
1671  auto *TruncType = cast<IntegerType>(Trunc->getType());
1672  assert(Step->getType()->isIntegerTy() &&
1673  "Truncation requires an integer step");
1674  ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1675  Step = Builder.CreateTrunc(Step, TruncType);
1676  }
1677  }
1678 
1679  // If we haven't yet vectorized the induction variable, splat the scalar
1680  // induction variable, and build the necessary step vectors.
1681  // TODO: Don't do it unless the vectorized IV is really required.
1682  if (!VectorizedIV) {
1683  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1684  for (unsigned Part = 0; Part < UF; ++Part) {
1685  Value *EntryPart =
1686  getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1687  VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1688  if (Trunc)
1689  addMetadata(EntryPart, Trunc);
1690  recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1691  }
1692  }
1693 
1694  // If an induction variable is only used for counting loop iterations or
1695  // calculating addresses, it doesn't need to be widened. Create scalar steps
1696  // that can be used by instructions we will later scalarize. Note that the
1697  // addition of the scalar steps will not increase the number of instructions
1698  // in the loop in the common case prior to InstCombine. We will be trading
1699  // one vector extract for each scalar step.
1700  if (NeedsScalarIV)
1701  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1702 }
1703 
1704 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1705  Instruction::BinaryOps BinOp) {
1706  // Create and check the types.
1707  assert(Val->getType()->isVectorTy() && "Must be a vector");
1708  int VLen = Val->getType()->getVectorNumElements();
1709 
1710  Type *STy = Val->getType()->getScalarType();
1711  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1712  "Induction Step must be an integer or FP");
1713  assert(Step->getType() == STy && "Step has wrong type");
1714 
1715  SmallVector<Constant *, 8> Indices;
1716 
1717  if (STy->isIntegerTy()) {
1718  // Create a vector of consecutive numbers from zero to VF.
1719  for (int i = 0; i < VLen; ++i)
1720  Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1721 
1722  // Add the consecutive indices to the vector value.
1723  Constant *Cv = ConstantVector::get(Indices);
1724  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1725  Step = Builder.CreateVectorSplat(VLen, Step);
1726  assert(Step->getType() == Val->getType() && "Invalid step vec");
1727  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1728  // which can be found from the original scalar operations.
1729  Step = Builder.CreateMul(Cv, Step);
1730  return Builder.CreateAdd(Val, Step, "induction");
1731  }
1732 
1733  // Floating point induction.
1734  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1735  "Binary Opcode should be specified for FP induction");
1736  // Create a vector of consecutive numbers from zero to VF.
1737  for (int i = 0; i < VLen; ++i)
1738  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1739 
1740  // Add the consecutive indices to the vector value.
1741  Constant *Cv = ConstantVector::get(Indices);
1742 
1743  Step = Builder.CreateVectorSplat(VLen, Step);
1744 
1745  // Floating point operations had to be 'fast' to enable the induction.
1746  FastMathFlags Flags;
1747  Flags.setFast();
1748 
1749  Value *MulOp = Builder.CreateFMul(Cv, Step);
1750  if (isa<Instruction>(MulOp))
1751  // Have to check, MulOp may be a constant
1752  cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1753 
1754  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1755  if (isa<Instruction>(BOp))
1756  cast<Instruction>(BOp)->setFastMathFlags(Flags);
1757  return BOp;
1758 }
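// Worked example (integer case, VF = 4): for Val = <s, s, s, s>, StartIdx = 0
// and Step = c, the code above builds Cv = <0, 1, 2, 3>, multiplies it by the
// splat of c and adds it to Val, yielding <s, s+c, s+2c, s+3c>. The FP path is
// analogous but uses FMul and the induction's FAdd/FSub with fast-math flags.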
1759 
1760 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1761  Instruction *EntryVal,
1762  const InductionDescriptor &ID) {
1763  // We shouldn't have to build scalar steps if we aren't vectorizing.
1764  assert(VF > 1 && "VF should be greater than one");
1765 
1766  // Get the value type and ensure it and the step have the same integer type.
1767  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1768  assert(ScalarIVTy == Step->getType() &&
1769  "Val and Step should have the same type");
1770 
1771  // We build scalar steps for both integer and floating-point induction
1772  // variables. Here, we determine the kind of arithmetic we will perform.
1773  Instruction::BinaryOps AddOp;
1774  Instruction::BinaryOps MulOp;
1775  if (ScalarIVTy->isIntegerTy()) {
1776  AddOp = Instruction::Add;
1777  MulOp = Instruction::Mul;
1778  } else {
1779  AddOp = ID.getInductionOpcode();
1780  MulOp = Instruction::FMul;
1781  }
1782 
1783  // Determine the number of scalars we need to generate for each unroll
1784  // iteration. If EntryVal is uniform, we only need to generate the first
1785  // lane. Otherwise, we generate all VF values.
1786  unsigned Lanes =
1787  Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1788  : VF;
1789  // Compute the scalar steps and save the results in VectorLoopValueMap.
1790  for (unsigned Part = 0; Part < UF; ++Part) {
1791  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1792  auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1793  auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1794  auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1795  VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1796  recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1797  }
1798  }
1799 }
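// Example layout (assuming VF = 4, UF = 2 and a non-uniform EntryVal): the
// scalar map receives eight values, one per (Part, Lane):
//   Part 0: ScalarIV + 0*Step ... ScalarIV + 3*Step
//   Part 1: ScalarIV + 4*Step ... ScalarIV + 7*Step
// If EntryVal is uniform after vectorization, only lane 0 of each part is
// materialized, as computed by the Lanes bound above.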
1800 
1801 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1802  assert(V != Induction && "The new induction variable should not be used.");
1803  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1804  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1805 
1806  // If we have a stride that is replaced by one, do it here. Defer this for
1807  // the VPlan-native path until we start running Legal checks in that path.
1808  if (!EnableVPlanNativePath && Legal->hasStride(V))
1809  V = ConstantInt::get(V->getType(), 1);
1810 
1811  // If we have a vector mapped to this value, return it.
1812  if (VectorLoopValueMap.hasVectorValue(V, Part))
1813  return VectorLoopValueMap.getVectorValue(V, Part);
1814 
1815  // If the value has not been vectorized, check if it has been scalarized
1816  // instead. If it has been scalarized, and we actually need the value in
1817  // vector form, we will construct the vector values on demand.
1818  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1819  Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1820 
1821  // If we've scalarized a value, that value should be an instruction.
1822  auto *I = cast<Instruction>(V);
1823 
1824  // If we aren't vectorizing, we can just copy the scalar map values over to
1825  // the vector map.
1826  if (VF == 1) {
1827  VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1828  return ScalarValue;
1829  }
1830 
1831  // Get the last scalar instruction we generated for V and Part. If the value
1832  // is known to be uniform after vectorization, this corresponds to lane zero
1833  // of the Part unroll iteration. Otherwise, the last instruction is the one
1834  // we created for the last vector lane of the Part unroll iteration.
1835  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1836  auto *LastInst = cast<Instruction>(
1837  VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1838 
1839  // Set the insert point after the last scalarized instruction. This ensures
1840  // the insertelement sequence will directly follow the scalar definitions.
1841  auto OldIP = Builder.saveIP();
1842  auto NewIP = std::next(BasicBlock::iterator(LastInst));
1843  Builder.SetInsertPoint(&*NewIP);
1844 
1845  // However, if we are vectorizing, we need to construct the vector values.
1846  // If the value is known to be uniform after vectorization, we can just
1847  // broadcast the scalar value corresponding to lane zero for each unroll
1848  // iteration. Otherwise, we construct the vector values using insertelement
1849  // instructions. Since the resulting vectors are stored in
1850  // VectorLoopValueMap, we will only generate the insertelements once.
1851  Value *VectorValue = nullptr;
1852  if (Cost->isUniformAfterVectorization(I, VF)) {
1853  VectorValue = getBroadcastInstrs(ScalarValue);
1854  VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
1855  } else {
1856  // Initialize packing with insertelements to start from undef.
1857  Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
1858  VectorLoopValueMap.setVectorValue(V, Part, Undef);
1859  for (unsigned Lane = 0; Lane < VF; ++Lane)
1860  packScalarIntoVectorValue(V, {Part, Lane});
1861  VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
1862  }
1863  Builder.restoreIP(OldIP);
1864  return VectorValue;
1865  }
1866 
1867  // If this scalar is unknown, assume that it is a constant or that it is
1868  // loop invariant. Broadcast V and save the value for future uses.
1869  Value *B = getBroadcastInstrs(V);
1870  VectorLoopValueMap.setVectorValue(V, Part, B);
1871  return B;
1872 }
1873 
1874 Value *
1875 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
1876  const VPIteration &Instance) {
1877  // If the value is not an instruction contained in the loop, it should
1878  // already be scalar.
1879  if (OrigLoop->isLoopInvariant(V))
1880  return V;
1881 
1882  assert(Instance.Lane > 0
1883  ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
1884  : true && "Uniform values only have lane zero");
1885 
1886  // If the value from the original loop has not been vectorized, it is
1887  // represented by UF x VF scalar values in the new loop. Return the requested
1888  // scalar value.
1889  if (VectorLoopValueMap.hasScalarValue(V, Instance))
1890  return VectorLoopValueMap.getScalarValue(V, Instance);
1891 
1892  // If the value has not been scalarized, get its entry in VectorLoopValueMap
1893  // for the given unroll part. If this entry is not a vector type (i.e., the
1894  // vectorization factor is one), there is no need to generate an
1895  // extractelement instruction.
1896  auto *U = getOrCreateVectorValue(V, Instance.Part);
1897  if (!U->getType()->isVectorTy()) {
1898  assert(VF == 1 && "Value not scalarized has non-vector type");
1899  return U;
1900  }
1901 
1902  // Otherwise, the value from the original loop has been vectorized and is
1903  // represented by UF vector values. Extract and return the requested scalar
1904  // value from the appropriate vector lane.
1905  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
1906 }
1907 
1908 void InnerLoopVectorizer::packScalarIntoVectorValue(
1909  Value *V, const VPIteration &Instance) {
1910  assert(V != Induction && "The new induction variable should not be used.");
1911  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
1912  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1913 
1914  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
1915  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
1916  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
1917  Builder.getInt32(Instance.Lane));
1918  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
1919 }
1920 
1921 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
1922  assert(Vec->getType()->isVectorTy() && "Invalid type");
1923  SmallVector<Constant *, 8> ShuffleMask;
1924  for (unsigned i = 0; i < VF; ++i)
1925  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
1926 
1927  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
1928  ConstantVector::get(ShuffleMask),
1929  "reverse");
1930 }
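// For example, with VF = 4 the shuffle mask built above is <3, 2, 1, 0>, so
// reverseVector(<a, b, c, d>) produces <d, c, b, a>.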
1931 
1932 // Try to vectorize the interleave group that \p Instr belongs to.
1933 //
1934 // E.g. Translate following interleaved load group (factor = 3):
1935 // for (i = 0; i < N; i+=3) {
1936 // R = Pic[i]; // Member of index 0
1937 // G = Pic[i+1]; // Member of index 1
1938 // B = Pic[i+2]; // Member of index 2
1939 // ... // do something to R, G, B
1940 // }
1941 // To:
1942 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
1943 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
1944 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
1945 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
1946 //
1947 // Or translate following interleaved store group (factor = 3):
1948 // for (i = 0; i < N; i+=3) {
1949 // ... do something to R, G, B
1950 // Pic[i] = R; // Member of index 0
1951 // Pic[i+1] = G; // Member of index 1
1952 // Pic[i+2] = B; // Member of index 2
1953 // }
1954 // To:
1955 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
1956 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
1957 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
1958 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
1959 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
1960 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
1961  VectorParts *BlockInMask) {
1962  const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
1963  assert(Group && "Fail to get an interleaved access group.");
1964 
1965  // Skip if current instruction is not the insert position.
1966  if (Instr != Group->getInsertPos())
1967  return;
1968 
1969  const DataLayout &DL = Instr->getModule()->getDataLayout();
1970  Value *Ptr = getLoadStorePointerOperand(Instr);
1971 
1972  // Prepare for the vector type of the interleaved load/store.
1973  Type *ScalarTy = getMemInstValueType(Instr);
1974  unsigned InterleaveFactor = Group->getFactor();
1975  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
1976  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
1977 
1978  // Prepare for the new pointers.
1979  setDebugLocFromInst(Builder, Ptr);
1980  SmallVector<Value *, 2> NewPtrs;
1981  unsigned Index = Group->getIndex(Instr);
1982 
1983  VectorParts Mask;
1984  bool IsMaskRequired = BlockInMask;
1985  if (IsMaskRequired) {
1986  Mask = *BlockInMask;
1987  // TODO: extend the masked interleaved-group support to reversed access.
1988  assert(!Group->isReverse() && "Reversed masked interleave-group "
1989  "not supported.");
1990  }
1991 
1992  // If the group is reverse, adjust the index to refer to the last vector lane
1993  // instead of the first. We adjust the index from the first vector lane,
1994  // rather than directly getting the pointer for lane VF - 1, because the
1995  // pointer operand of the interleaved access is supposed to be uniform. For
1996  // uniform instructions, we're only required to generate a value for the
1997  // first vector lane in each unroll iteration.
1998  if (Group->isReverse())
1999  Index += (VF - 1) * Group->getFactor();
2000 
2001  bool InBounds = false;
2002  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2003  InBounds = gep->isInBounds();
2004 
2005  for (unsigned Part = 0; Part < UF; Part++) {
2006  Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2007 
2008  // Notice current instruction could be any index. Need to adjust the address
2009  // to the member of index 0.
2010  //
2011  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2012  // b = A[i]; // Member of index 0
2013  // Current pointer is pointed to A[i+1], adjust it to A[i].
2014  //
2015  // E.g. A[i+1] = a; // Member of index 1
2016  // A[i] = b; // Member of index 0
2017  // A[i+2] = c; // Member of index 2 (Current instruction)
2018  // Current pointer is pointed to A[i+2], adjust it to A[i].
2019  NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
2020  if (InBounds)
2021  cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2022 
2023  // Cast to the vector pointer type.
2024  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2025  }
2026 
2027  setDebugLocFromInst(Builder, Instr);
2028  Value *UndefVec = UndefValue::get(VecTy);
2029 
2030  // Vectorize the interleaved load group.
2031  if (isa<LoadInst>(Instr)) {
2032  // For each unroll part, create a wide load for the group.
2033  SmallVector<Value *, 2> NewLoads;
2034  for (unsigned Part = 0; Part < UF; Part++) {
2035  Instruction *NewLoad;
2036  if (IsMaskRequired) {
2037  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2038  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2039  Value *ShuffledMask = Builder.CreateShuffleVector(
2040  Mask[Part], Undefs, RepMask, "interleaved.mask");
2041  NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2042  ShuffledMask, UndefVec,
2043  "wide.masked.vec");
2044  }
2045  else
2046  NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part],
2047  Group->getAlignment(), "wide.vec");
2048  Group->addMetadata(NewLoad);
2049  NewLoads.push_back(NewLoad);
2050  }
2051 
2052  // For each member in the group, shuffle out the appropriate data from the
2053  // wide loads.
2054  for (unsigned I = 0; I < InterleaveFactor; ++I) {
2055  Instruction *Member = Group->getMember(I);
2056 
2057  // Skip the gaps in the group.
2058  if (!Member)
2059  continue;
2060 
2061  Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2062  for (unsigned Part = 0; Part < UF; Part++) {
2063  Value *StridedVec = Builder.CreateShuffleVector(
2064  NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2065 
2066  // If this member has different type, cast the result type.
2067  if (Member->getType() != ScalarTy) {
2068  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2069  StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2070  }
2071 
2072  if (Group->isReverse())
2073  StridedVec = reverseVector(StridedVec);
2074 
2075  VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2076  }
2077  }
2078  return;
2079  }
2080 
2081  // The sub vector type for current instruction.
2082  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2083 
2084  // Vectorize the interleaved store group.
2085  for (unsigned Part = 0; Part < UF; Part++) {
2086  // Collect the stored vector from each member.
2087  SmallVector<Value *, 4> StoredVecs;
2088  for (unsigned i = 0; i < InterleaveFactor; i++) {
2089  // Interleaved store group doesn't allow a gap, so each index has a member
2090  Instruction *Member = Group->getMember(i);
2091  assert(Member && "Fail to get a member from an interleaved store group");
2092 
2093  Value *StoredVec = getOrCreateVectorValue(
2094  cast<StoreInst>(Member)->getValueOperand(), Part);
2095  if (Group->isReverse())
2096  StoredVec = reverseVector(StoredVec);
2097 
2098  // If this member has different type, cast it to a unified type.
2099 
2100  if (StoredVec->getType() != SubVT)
2101  StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2102 
2103  StoredVecs.push_back(StoredVec);
2104  }
2105 
2106  // Concatenate all vectors into a wide vector.
2107  Value *WideVec = concatenateVectors(Builder, StoredVecs);
2108 
2109  // Interleave the elements in the wide vector.
2110  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2111  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2112  "interleaved.vec");
2113 
2114  Instruction *NewStoreInstr;
2115  if (IsMaskRequired) {
2116  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2117  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2118  Value *ShuffledMask = Builder.CreateShuffleVector(
2119  Mask[Part], Undefs, RepMask, "interleaved.mask");
2120  NewStoreInstr = Builder.CreateMaskedStore(
2121  IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2122  }
2123  else
2124  NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2125  Group->getAlignment());
2126 
2127  Group->addMetadata(NewStoreInstr);
2128  }
2129 }
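// Masked example (assuming InterleaveFactor = 3 and VF = 4; lane names are
// illustrative): the per-iteration block mask <m0, m1, m2, m3> is widened by
// createReplicatedMask to
//   <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>
// so that every member of an interleave group is loaded or stored under the
// predicate of the iteration it belongs to.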
2130 
2131 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2132  VectorParts *BlockInMask) {
2133  // Attempt to issue a wide load.
2134  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2135  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2136 
2137  assert((LI || SI) && "Invalid Load/Store instruction");
2138 
2139  LoopVectorizationCostModel::InstWidening Decision =
2140  Cost->getWideningDecision(Instr, VF);
2141  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2142  "CM decision should be taken at this point");
2143  if (Decision == LoopVectorizationCostModel::CM_Interleave)
2144  return vectorizeInterleaveGroup(Instr);
2145 
2146  Type *ScalarDataTy = getMemInstValueType(Instr);
2147  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2148  Value *Ptr = getLoadStorePointerOperand(Instr);
2149  unsigned Alignment = getLoadStoreAlignment(Instr);
2150  // An alignment of 0 means target abi alignment. We need to use the scalar's
2151  // target abi alignment in such a case.
2152  const DataLayout &DL = Instr->getModule()->getDataLayout();
2153  if (!Alignment)
2154  Alignment = DL.getABITypeAlignment(ScalarDataTy);
2155  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2156 
2157  // Determine if the pointer operand of the access is either consecutive or
2158  // reverse consecutive.
2159  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2160  bool ConsecutiveStride =
2161  Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2162  bool CreateGatherScatter =
2163  (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2164 
2165  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2166  // gather/scatter. Otherwise Decision should have been to Scalarize.
2167  assert((ConsecutiveStride || CreateGatherScatter) &&
2168  "The instruction should be scalarized");
2169 
2170  // Handle consecutive loads/stores.
2171  if (ConsecutiveStride)
2172  Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2173 
2174  VectorParts Mask;
2175  bool isMaskRequired = BlockInMask;
2176  if (isMaskRequired)
2177  Mask = *BlockInMask;
2178 
2179  bool InBounds = false;
2180  if (auto *gep = dyn_cast<GetElementPtrInst>(
2181  getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2182  InBounds = gep->isInBounds();
2183 
2184  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2185  // Calculate the pointer for the specific unroll-part.
2186  GetElementPtrInst *PartPtr = nullptr;
2187 
2188  if (Reverse) {
2189  // If the address is consecutive but reversed, then the
2190  // wide store needs to start at the last vector element.
2191  PartPtr = cast<GetElementPtrInst>(
2192  Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
2193  PartPtr->setIsInBounds(InBounds);
2194  PartPtr = cast<GetElementPtrInst>(
2195  Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
2196  PartPtr->setIsInBounds(InBounds);
2197  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2198  Mask[Part] = reverseVector(Mask[Part]);
2199  } else {
2200  PartPtr = cast<GetElementPtrInst>(
2201  Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
2202  PartPtr->setIsInBounds(InBounds);
2203  }
2204 
2205  return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2206  };
2207 
2208  // Handle Stores:
2209  if (SI) {
2210  setDebugLocFromInst(Builder, SI);
2211 
2212  for (unsigned Part = 0; Part < UF; ++Part) {
2213  Instruction *NewSI = nullptr;
2214  Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2215  if (CreateGatherScatter) {
2216  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2217  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2218  NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2219  MaskPart);
2220  } else {
2221  if (Reverse) {
2222  // If we store to reverse consecutive memory locations, then we need
2223  // to reverse the order of elements in the stored value.
2224  StoredVal = reverseVector(StoredVal);
2225  // We don't want to update the value in the map as it might be used in
2226  // another expression. So don't call resetVectorValue(StoredVal).
2227  }
2228  auto *VecPtr = CreateVecPtr(Part, Ptr);
2229  if (isMaskRequired)
2230  NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2231  Mask[Part]);
2232  else
2233  NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2234  }
2235  addMetadata(NewSI, SI);
2236  }
2237  return;
2238  }
2239 
2240  // Handle loads.
2241  assert(LI && "Must have a load instruction");
2242  setDebugLocFromInst(Builder, LI);
2243  for (unsigned Part = 0; Part < UF; ++Part) {
2244  Value *NewLI;
2245  if (CreateGatherScatter) {
2246  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2247  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2248  NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2249  nullptr, "wide.masked.gather");
2250  addMetadata(NewLI, LI);
2251  } else {
2252  auto *VecPtr = CreateVecPtr(Part, Ptr);
2253  if (isMaskRequired)
2254  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2255  UndefValue::get(DataTy),
2256  "wide.masked.load");
2257  else
2258  NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
2259 
2260  // Add metadata to the load, but setVectorValue to the reverse shuffle.
2261  addMetadata(NewLI, LI);
2262  if (Reverse)
2263  NewLI = reverseVector(NewLI);
2264  }
2265  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2266  }
2267 }
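// Sketch of the reverse-consecutive case (assuming VF = 4 and Part = 0): the
// CreateVecPtr lambda above offsets the scalar pointer by 1 - VF = -3, the
// wide load reads <a[i-3], a[i-2], a[i-1], a[i]>, and reverseVector restores
// the original iteration order before the value is recorded in
// VectorLoopValueMap.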
2268 
2269 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2270  const VPIteration &Instance,
2271  bool IfPredicateInstr) {
2272  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2273 
2274  setDebugLocFromInst(Builder, Instr);
2275 
2276  // Does this instruction return a value?
2277  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2278 
2279  Instruction *Cloned = Instr->clone();
2280  if (!IsVoidRetTy)
2281  Cloned->setName(Instr->getName() + ".cloned");
2282 
2283  // Replace the operands of the cloned instructions with their scalar
2284  // equivalents in the new loop.
2285  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2286  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2287  Cloned->setOperand(op, NewOp);
2288  }
2289  addNewMetadata(Cloned, Instr);
2290 
2291  // Place the cloned scalar in the new loop.
2292  Builder.Insert(Cloned);
2293 
2294  // Add the cloned scalar to the scalar map entry.
2295  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2296 
2297  // If we just cloned a new assumption, add it the assumption cache.
2298  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2299  if (II->getIntrinsicID() == Intrinsic::assume)
2300  AC->registerAssumption(II);
2301 
2302  // End if-block.
2303  if (IfPredicateInstr)
2304  PredicatedInstructions.push_back(Cloned);
2305 }
2306 
2307 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2308  Value *End, Value *Step,
2309  Instruction *DL) {
2310  BasicBlock *Header = L->getHeader();
2311  BasicBlock *Latch = L->getLoopLatch();
2312  // As we're just creating this loop, it's possible no latch exists
2313  // yet. If so, use the header as this will be a single block loop.
2314  if (!Latch)
2315  Latch = Header;
2316 
2317  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2318  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2319  setDebugLocFromInst(Builder, OldInst);
2320  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2321 
2322  Builder.SetInsertPoint(Latch->getTerminator());
2323  setDebugLocFromInst(Builder, OldInst);
2324 
2325  // Create i+1 and fill the PHINode.
2326  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2327  Induction->addIncoming(Start, L->getLoopPreheader());
2328  Induction->addIncoming(Next, Latch);
2329  // Create the compare.
2330  Value *ICmp = Builder.CreateICmpEQ(Next, End);
2331  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2332 
2333  // Now we have two terminators. Remove the old one from the block.
2334  Latch->getTerminator()->eraseFromParent();
2335 
2336  return Induction;
2337 }
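// The generated header IR is roughly (with Start = 0 and Step = VF * UF, as
// at the call site below; the icmp/branch names are illustrative):
//   %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
//   %index.next = add i64 %index, Step
//   %cmp        = icmp eq i64 %index.next, %End
//   br i1 %cmp, label %exit, label %vector.body
// where End is the vector trip count from getOrCreateVectorTripCount.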
2338 
2339 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2340  if (TripCount)
2341  return TripCount;
2342 
2343  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2344  // Find the loop boundaries.
2345  ScalarEvolution *SE = PSE.getSE();
2346  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2347  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2348  "Invalid loop count");
2349 
2350  Type *IdxTy = Legal->getWidestInductionType();
2351  assert(IdxTy && "No type for induction");
2352 
2353  // The exit count might have the type of i64 while the phi is i32. This can
2354  // happen if we have an induction variable that is sign extended before the
2355  // compare. The only way that we get a backedge taken count is that the
2356  // induction variable was signed and as such will not overflow. In such a case
2357  // truncation is legal.
2358  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2359  IdxTy->getPrimitiveSizeInBits())
2360  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2361  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2362 
2363  // Get the total trip count from the count by adding 1.
2364  const SCEV *ExitCount = SE->getAddExpr(
2365  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2366 
2367  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2368 
2369  // Expand the trip count and place the new instructions in the preheader.
2370  // Notice that the pre-header does not change, only the loop body.
2371  SCEVExpander Exp(*SE, DL, "induction");
2372 
2373  // Count holds the overall loop count (N).
2374  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2375  L->getLoopPreheader()->getTerminator());
2376 
2377  if (TripCount->getType()->isPointerTy())
2378  TripCount =
2379  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2380  L->getLoopPreheader()->getTerminator());
2381 
2382  return TripCount;
2383 }
2384 
2385 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2386  if (VectorTripCount)
2387  return VectorTripCount;
2388 
2389  Value *TC = getOrCreateTripCount(L);
2390  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2391 
2392  // Now we need to generate the expression for the part of the loop that the
2393  // vectorized body will execute. This is equal to N - (N % Step) if scalar
2394  // iterations are not required for correctness, or N - Step, otherwise. Step
2395  // is equal to the vectorization factor (number of SIMD elements) times the
2396  // unroll factor (number of SIMD instructions).
2397  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
2398  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2399 
2400  // If there is a non-reversed interleaved group that may speculatively access
2401  // memory out-of-bounds, we need to ensure that there will be at least one
2402  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2403  // the trip count, we set the remainder to be equal to the step. If the step
2404  // does not evenly divide the trip count, no adjustment is necessary since
2405  // there will already be scalar iterations. Note that the minimum iterations
2406  // check ensures that N >= Step.
2407  if (VF > 1 && Cost->requiresScalarEpilogue()) {
2408  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2409  R = Builder.CreateSelect(IsZero, Step, R);
2410  }
2411 
2412  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2413 
2414  return VectorTripCount;
2415 }
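// Worked example: with a trip count of 1003, VF = 4 and UF = 2, Step = 8 and
// R = 1003 % 8 = 3, so n.vec = 1000 and three iterations are left for the
// scalar epilogue. Had the remainder been 0 while a scalar epilogue is
// required, R would be bumped to Step so the epilogue still runs at least
// one iteration.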
2416 
2417 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2418  const DataLayout &DL) {
2419  // Verify that V is a vector type with same number of elements as DstVTy.
2420  unsigned VF = DstVTy->getNumElements();
2421  VectorType *SrcVecTy = cast<VectorType>(V->getType());
2422  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2423  Type *SrcElemTy = SrcVecTy->getElementType();
2424  Type *DstElemTy = DstVTy->getElementType();
2425  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2426  "Vector elements must have same size");
2427 
2428  // Do a direct cast if element types are castable.
2429  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2430  return Builder.CreateBitOrPointerCast(V, DstVTy);
2431  }
2432  // V cannot be directly casted to desired vector type.
2433  // May happen when V is a floating point vector but DstVTy is a vector of
2434  // pointers or vice-versa. Handle this using a two-step bitcast using an
2435  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2436  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2437  "Only one type should be a pointer type");
2438  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2439  "Only one type should be a floating point type");
2440  Type *IntTy =
2441  IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2442  VectorType *VecIntTy = VectorType::get(IntTy, VF);
2443  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2444  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2445 }
2446 
2447 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2448  BasicBlock *Bypass) {
2449  Value *Count = getOrCreateTripCount(L);
2450  BasicBlock *BB = L->getLoopPreheader();
2451  IRBuilder<> Builder(BB->getTerminator());
2452 
2453  // Generate code to check if the loop's trip count is less than VF * UF, or
2454  // equal to it in case a scalar epilogue is required; this implies that the
2455  // vector trip count is zero. This check also covers the case where adding one
2456  // to the backedge-taken count overflowed leading to an incorrect trip count
2457  // of zero. In this case we will also jump to the scalar loop.
2458  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2459  : ICmpInst::ICMP_ULT;
2460  Value *CheckMinIters = Builder.CreateICmp(
2461  P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
2462 
2463  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2464  // Update dominator tree immediately if the generated block is a
2465  // LoopBypassBlock because SCEV expansions to generate loop bypass
2466  // checks may query it before the current function is finished.
2467  DT->addNewBlock(NewBB, BB);
2468  if (L->getParentLoop())
2469  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2470  ReplaceInstWithInst(BB->getTerminator(),
2471  BranchInst::Create(Bypass, NewBB, CheckMinIters));
2472  LoopBypassBlocks.push_back(BB);
2473 }
2474 
2475 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2476  BasicBlock *BB = L->getLoopPreheader();
2477 
2478  // Generate the code to check that the SCEV assumptions that we made.
2479  // We want the new basic block to start at the first instruction in a
2480  // sequence of instructions that form a check.
2481  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2482  "scev.check");
2483  Value *SCEVCheck =
2484  Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2485 
2486  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2487  if (C->isZero())
2488  return;
2489 
2490  // Create a new block containing the stride check.
2491  BB->setName("vector.scevcheck");
2492  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2493  // Update dominator tree immediately if the generated block is a
2494  // LoopBypassBlock because SCEV expansions to generate loop bypass
2495  // checks may query it before the current function is finished.
2496  DT->addNewBlock(NewBB, BB);
2497  if (L->getParentLoop())
2498  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2499  ReplaceInstWithInst(BB->getTerminator(),
2500  BranchInst::Create(Bypass, NewBB, SCEVCheck));
2501  LoopBypassBlocks.push_back(BB);
2502  AddedSafetyChecks = true;
2503 }
2504 
2505 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2506  // VPlan-native path does not do any analysis for runtime checks currently.
2507  if (EnableVPlanNativePath)
2508  return;
2509 
2510  BasicBlock *BB = L->getLoopPreheader();
2511 
2512  // Generate the code that checks in runtime if arrays overlap. We put the
2513  // checks into a separate block to make the more common case of few elements
2514  // faster.
2515  Instruction *FirstCheckInst;
2516  Instruction *MemRuntimeCheck;
2517  std::tie(FirstCheckInst, MemRuntimeCheck) =
2518  Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2519  if (!MemRuntimeCheck)
2520  return;
2521 
2522  // Create a new block containing the memory check.
2523  BB->setName("vector.memcheck");
2524  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2525  // Update dominator tree immediately if the generated block is a
2526  // LoopBypassBlock because SCEV expansions to generate loop bypass
2527  // checks may query it before the current function is finished.
2528  DT->addNewBlock(NewBB, BB);
2529  if (L->getParentLoop())
2530  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2531  ReplaceInstWithInst(BB->getTerminator(),
2532  BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2533  LoopBypassBlocks.push_back(BB);
2534  AddedSafetyChecks = true;
2535 
2536  // We currently don't use LoopVersioning for the actual loop cloning but we
2537  // still use it to add the noalias metadata.
2538  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2539  PSE.getSE());
2540  LVer->prepareNoAliasMetadata();
2541 }
2542 
2543 Value *InnerLoopVectorizer::emitTransformedIndex(
2544  IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2545  const InductionDescriptor &ID) const {
2546 
2547  SCEVExpander Exp(*SE, DL, "induction");
2548  auto Step = ID.getStep();
2549  auto StartValue = ID.getStartValue();
2550  assert(Index->getType() == Step->getType() &&
2551  "Index type does not match StepValue type");
2552 
2553  // Note: the IR at this point is broken. We cannot use SE to create any new
2554  // SCEV and then expand it, hoping that SCEV's simplification will give us
2555  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2556  // lead to various SCEV crashes. So all we can do is to use builder and rely
2557  // on InstCombine for future simplifications. Here we handle some trivial
2558  // cases only.
2559  auto CreateAdd = [&B](Value *X, Value *Y) {
2560  assert(X->getType() == Y->getType() && "Types don't match!");
2561  if (auto *CX = dyn_cast<ConstantInt>(X))
2562  if (CX->isZero())
2563  return Y;
2564  if (auto *CY = dyn_cast<ConstantInt>(Y))
2565  if (CY->isZero())
2566  return X;
2567  return B.CreateAdd(X, Y);
2568  };
2569 
2570  auto CreateMul = [&B](Value *X, Value *Y) {
2571  assert(X->getType() == Y->getType() && "Types don't match!");
2572  if (auto *CX = dyn_cast<ConstantInt>(X))
2573  if (CX->isOne())
2574  return Y;
2575  if (auto *CY = dyn_cast<ConstantInt>(Y))
2576  if (CY->isOne())
2577  return X;
2578  return B.CreateMul(X, Y);
2579  };
2580 
2581  switch (ID.getKind()) {
2582  case InductionDescriptor::IK_IntInduction: {
2583  assert(Index->getType() == StartValue->getType() &&
2584  "Index type does not match StartValue type");
2585  if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2586  return B.CreateSub(StartValue, Index);
2587  auto *Offset = CreateMul(
2588  Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2589  return CreateAdd(StartValue, Offset);
2590  }
2591  case InductionDescriptor::IK_PtrInduction: {
2592  assert(isa<SCEVConstant>(Step) &&
2593  "Expected constant step for pointer induction");
2594  return B.CreateGEP(
2595  nullptr, StartValue,
2596  CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2597  &*B.GetInsertPoint())));
2598  }
2599  case InductionDescriptor::IK_FpInduction: {
2600  assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2601  auto InductionBinOp = ID.getInductionBinOp();
2602  assert(InductionBinOp &&
2603  (InductionBinOp->getOpcode() == Instruction::FAdd ||
2604  InductionBinOp->getOpcode() == Instruction::FSub) &&
2605  "Original bin op should be defined for FP induction");
2606 
2607  Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2608 
2609  // Floating point operations had to be 'fast' to enable the induction.
2610  FastMathFlags Flags;
2611  Flags.setFast();
2612 
2613  Value *MulExp = B.CreateFMul(StepValue, Index);
2614  if (isa<Instruction>(MulExp))
2615  // We have to check, the MulExp may be a constant.
2616  cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2617 
2618  Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2619  "induction");
2620  if (isa<Instruction>(BOp))
2621  cast<Instruction>(BOp)->setFastMathFlags(Flags);
2622 
2623  return BOp;
2624  }
2625  case InductionDescriptor::IK_NoInduction:
2626  return nullptr;
2627  }
2628  llvm_unreachable("invalid enum");
2629 }
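// For example, an IK_IntInduction with start value S and constant step C maps
// Index to S + Index * C (with the multiply and add folded away above when C
// is 1 or Index is 0), while an IK_PtrInduction emits a GEP of S by Index * C.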
2630 
2631 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2632  /*
2633  In this function we generate a new loop. The new loop will contain
2634  the vectorized instructions while the old loop will continue to run the
2635  scalar remainder.
2636 
2637  [ ] <-- loop iteration number check.
2638  / |
2639  / v
2640  | [ ] <-- vector loop bypass (may consist of multiple blocks).
2641  | / |
2642  | / v
2643  || [ ] <-- vector pre header.
2644  |/ |
2645  | v
2646  | [ ] \
2647  | [ ]_| <-- vector loop.
2648  | |
2649  | v
2650  | -[ ] <--- middle-block.
2651  | / |
2652  | / v
2653  -|- >[ ] <--- new preheader.
2654  | |
2655  | v
2656  | [ ] \
2657  | [ ]_| <-- old scalar loop to handle remainder.
2658  \ |
2659  \ v
2660  >[ ] <-- exit block.
2661  ...
2662  */
2663 
2664  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2665  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2666  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2667  assert(VectorPH && "Invalid loop structure");
2668  assert(ExitBlock && "Must have an exit block");
2669 
2670  // Some loops have a single integer induction variable, while other loops
2671  // don't. One example is c++ iterators that often have multiple pointer
2672  // induction variables. In the code below we also support a case where we
2673  // don't have a single induction variable.
2674  //
2675  // We try to obtain an induction variable from the original loop as hard
2676  // as possible. However if we don't find one that:
2677  // - is an integer
2678  // - counts from zero, stepping by one
2679  // - is the size of the widest induction variable type
2680  // then we create a new one.
2682  Type *IdxTy = Legal->getWidestInductionType();
2683 
2684  // Split the single block loop into the two loop structure described above.
2685  BasicBlock *VecBody =
2686  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2687  BasicBlock *MiddleBlock =
2688  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2689  BasicBlock *ScalarPH =
2690  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2691 
2692  // Create and register the new vector loop.
2693  Loop *Lp = LI->AllocateLoop();
2694  Loop *ParentLoop = OrigLoop->getParentLoop();
2695 
2696  // Insert the new loop into the loop nest and register the new basic blocks
2697  // before calling any utilities such as SCEV that require valid LoopInfo.
2698  if (ParentLoop) {
2699  ParentLoop->addChildLoop(Lp);
2700  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2701  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2702  } else {
2703  LI->addTopLevelLoop(Lp);
2704  }
2705  Lp->addBasicBlockToLoop(VecBody, *LI);
2706 
2707  // Find the loop boundaries.
2708  Value *Count = getOrCreateTripCount(Lp);
2709 
2710  Value *StartIdx = ConstantInt::get(IdxTy, 0);
2711 
2712  // Now, compare the new count to zero. If it is zero skip the vector loop and
2713  // jump to the scalar loop. This check also covers the case where the
2714  // backedge-taken count is uint##_max: adding one to it will overflow leading
2715  // to an incorrect trip count of zero. In this (rare) case we will also jump
2716  // to the scalar loop.
2717  emitMinimumIterationCountCheck(Lp, ScalarPH);
2718 
2719  // Generate the code to check any assumptions that we've made for SCEV
2720  // expressions.
2721  emitSCEVChecks(Lp, ScalarPH);
2722 
2723  // Generate the code that checks in runtime if arrays overlap. We put the
2724  // checks into a separate block to make the more common case of few elements
2725  // faster.
2726  emitMemRuntimeChecks(Lp, ScalarPH);
2727 
2728  // Generate the induction variable.
2729  // The loop step is equal to the vectorization factor (num of SIMD elements)
2730  // times the unroll factor (num of SIMD instructions).
2731  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2732  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2733  Induction =
2734  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2735  getDebugLocFromInstOrOperands(OldInduction));
2736 
2737  // We are going to resume the execution of the scalar loop.
2738  // Go over all of the induction variables that we found and fix the
2739  // PHIs that are left in the scalar version of the loop.
2740  // The starting values of PHI nodes depend on the counter of the last
2741  // iteration in the vectorized loop.
2742  // If we come from a bypass edge then we need to start from the original
2743  // start value.
2744 
2745  // This variable saves the new starting index for the scalar loop. It is used
2746  // to test if there are any tail iterations left once the vector loop has
2747  // completed.
2748  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2749  for (auto &InductionEntry : *List) {
2750  PHINode *OrigPhi = InductionEntry.first;
2751  InductionDescriptor II = InductionEntry.second;
2752 
2753  // Create phi nodes to merge from the backedge-taken check block.
2754  PHINode *BCResumeVal = PHINode::Create(
2755  OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2756  // Copy original phi DL over to the new one.
2757  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2758  Value *&EndValue = IVEndValues[OrigPhi];
2759  if (OrigPhi == OldInduction) {
2760  // We know what the end value is.
2761  EndValue = CountRoundDown;
2762  } else {
2764  Type *StepType = II.getStep()->getType();
2765  Instruction::CastOps CastOp =
2766  CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2767  Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2768  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2769  EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2770  EndValue->setName("ind.end");
2771  }
2772 
2773  // The new PHI merges the original incoming value, in case of a bypass,
2774  // or the value at the end of the vectorized loop.
2775  BCResumeVal->addIncoming(EndValue, MiddleBlock);
2776 
2777  // Fix the scalar body counter (PHI node).
2778  unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
2779 
2780  // The old induction's phi node in the scalar body needs the truncated
2781  // value.
2782  for (BasicBlock *BB : LoopBypassBlocks)
2783  BCResumeVal->addIncoming(II.getStartValue(), BB);
2784  OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
2785  }
2786 
2787  // Add a check in the middle block to see if we have completed
2788  // all of the iterations in the first vector loop.
2789  // If (N - N%VF) == N, then we *don't* need to run the remainder.
2790  Value *CmpN =
2791  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
2792  CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
2793  ReplaceInstWithInst(MiddleBlock->getTerminator(),
2794  BranchInst::Create(ExitBlock, ScalarPH, CmpN));
2795 
2796  // Get ready to start creating new instructions into the vectorized body.
2797  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
2798 
2799  // Save the state.
2800  LoopVectorPreHeader = Lp->getLoopPreheader();
2801  LoopScalarPreHeader = ScalarPH;
2802  LoopMiddleBlock = MiddleBlock;
2803  LoopExitBlock = ExitBlock;
2804  LoopVectorBody = VecBody;
2805  LoopScalarBody = OldBasicBlock;
2806 
2807  // Keep all loop hints from the original loop on the vector loop (we'll
2808  // replace the vectorizer-specific hints below).
2809  if (MDNode *LID = OrigLoop->getLoopID())
2810  Lp->setLoopID(LID);
2811 
2812  LoopVectorizeHints Hints(Lp, true, *ORE);
2813  Hints.setAlreadyVectorized();
2814 
2815  return LoopVectorPreHeader;
2816 }
2817 
2818 // Fix up external users of the induction variable. At this point, we are
2819 // in LCSSA form, with all external PHIs that use the IV having one input value,
2820 // coming from the remainder loop. We need those PHIs to also have a correct
2821 // value for the IV when arriving directly from the middle block.
2822 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2823  const InductionDescriptor &II,
2824  Value *CountRoundDown, Value *EndValue,
2825  BasicBlock *MiddleBlock) {
2826  // There are two kinds of external IV usages - those that use the value
2827  // computed in the last iteration (the PHI) and those that use the penultimate
2828  // value (the value that feeds into the phi from the loop latch).
2829  // We allow both, but they, obviously, have different values.
2830 
2831  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
2832 
2833  DenseMap<Value *, Value *> MissingVals;
2834 
2835  // An external user of the last iteration's value should see the value that
2836  // the remainder loop uses to initialize its own IV.
2837  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2838  for (User *U : PostInc->users()) {
2839  Instruction *UI = cast<Instruction>(U);
2840  if (!OrigLoop->contains(UI)) {
2841  assert(isa<PHINode>(UI) && "Expected LCSSA form");
2842  MissingVals[UI] = EndValue;
2843  }
2844  }
2845 
2846  // An external user of the penultimate value need to see EndValue - Step.
2847  // The simplest way to get this is to recompute it from the constituent SCEVs,
2848  // that is Start + (Step * (CRD - 1)).
2849  for (User *U : OrigPhi->users()) {
2850  auto *UI = cast<Instruction>(U);
2851  if (!OrigLoop->contains(UI)) {
2852  const DataLayout &DL =
2853  OrigLoop->getHeader()->getModule()->getDataLayout();
2854  assert(isa<PHINode>(UI) && "Expected LCSSA form");
2855 
2856  IRBuilder<> B(MiddleBlock->getTerminator());
2857  Value *CountMinusOne = B.CreateSub(
2858  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
2859  Value *CMO =
2860  !II.getStep()->getType()->isIntegerTy()
2861  ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
2862  II.getStep()->getType())
2863  : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
2864  CMO->setName("cast.cmo");
2865  Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
2866  Escape->setName("ind.escape");
2867  MissingVals[UI] = Escape;
2868  }
2869  }
2870 
2871  for (auto &I : MissingVals) {
2872  PHINode *PHI = cast<PHINode>(I.first);
2873  // One corner case we have to handle is two IVs "chasing" each-other,
2874  // that is %IV2 = phi [...], [ %IV1, %latch ]
2875  // In this case, if IV1 has an external use, we need to avoid adding both
2876  // "last value of IV1" and "penultimate value of IV2". So, verify that we
2877  // don't already have an incoming value for the middle block.
2878  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2879  PHI->addIncoming(I.second, MiddleBlock);
2880  }
2881 }
2882 
2883 namespace {
2884 
2885 struct CSEDenseMapInfo {
2886  static bool canHandle(const Instruction *I) {
2887  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2888  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2889  }
2890 
2891  static inline Instruction *getEmptyKey() {
2892  return DenseMapInfo<Instruction *>::getEmptyKey();
2893  }
2894 
2895  static inline Instruction *getTombstoneKey() {
2896  return DenseMapInfo<Instruction *>::getTombstoneKey();
2897  }
2898 
2899  static unsigned getHashValue(const Instruction *I) {
2900  assert(canHandle(I) && "Unknown instruction!");
2901  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2902  I->value_op_end()));
2903  }
2904 
2905  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2906  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2907  LHS == getTombstoneKey() || RHS == getTombstoneKey())
2908  return LHS == RHS;
2909  return LHS->isIdenticalTo(RHS);
2910  }
2911 };
2912 
2913 } // end anonymous namespace
2914 
2915 /// Perform CSE of induction variable instructions.
2916 static void cse(BasicBlock *BB) {
2917  // Perform simple CSE.
2918  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2919  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
2920  Instruction *In = &*I++;
2921 
2922  if (!CSEDenseMapInfo::canHandle(In))
2923  continue;
2924 
2925  // Check if we can replace this instruction with any of the
2926  // visited instructions.
2927  if (Instruction *V = CSEMap.lookup(In)) {
2928  In->replaceAllUsesWith(V);
2929  In->eraseFromParent();
2930  continue;
2931  }
2932 
2933  CSEMap[In] = In;
2934  }
2935 }
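// NOTE (editorial sketch, not part of the original source): the effect of
// cse() is to fold per-part duplicates of the handled instruction kinds. In
// shorthand IR (names invented for the example):
//
//   %g1 = getelementptr inbounds i32, i32* %a, i64 %idx
//   %g2 = getelementptr inbounds i32, i32* %a, i64 %idx   ; identical to %g1
//   %x  = extractelement <4 x i32> %v, i32 0
//   %y  = extractelement <4 x i32> %v, i32 0              ; identical to %x
//
// After cse(), %g2 and %y are replaced by %g1 and %x and erased, since
// CSEDenseMapInfo hashes on the opcode and operands and isEqual() compares
// with isIdenticalTo().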
2936 
2937 /// Estimate the overhead of scalarizing an instruction. This is a
2938 /// convenience wrapper for the type-based getScalarizationOverhead API.
2939 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
2940  const TargetTransformInfo &TTI) {
2941  if (VF == 1)
2942  return 0;
2943 
2944  unsigned Cost = 0;
2945  Type *RetTy = ToVectorTy(I->getType(), VF);
2946  if (!RetTy->isVoidTy() &&
2947  (!isa<LoadInst>(I) ||
2948  !TTI.supportsEfficientVectorElementLoadStore()))
2949  Cost += TTI.getScalarizationOverhead(RetTy, true, false);
2950 
2951  if (CallInst *CI = dyn_cast<CallInst>(I)) {
2952  SmallVector<const Value *, 4> Operands(CI->arg_operands());
2953  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
2954  }
2955  else if (!isa<StoreInst>(I) ||
2956  !TTI.supportsEfficientVectorElementLoadStore()) {
2957  SmallVector<const Value *, 4> Operands(I->operand_values());
2958  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
2959  }
2960 
2961  return Cost;
2962 }
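// NOTE (editorial sketch, not part of the original source): for a scalarized
// udiv with a used result and VF = 4, the overhead counted above is roughly
// the cost of inserting the 4 scalar results into the vector return value plus
// extracting 4 lanes from each vector operand. The per-element costs come from
// TTI and are target-dependent; the numbers below are purely illustrative:
//
//   insert-cost per element  = 1  =>  result packing    ~ 4
//   extract-cost per element = 1  =>  operand unpacking ~ 4 per vector operand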
2963 
2964 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
2965 // Return the cost of the instruction, including scalarization overhead if it's
2966 // needed. The flag NeedToScalarize shows whether the call needs to be scalarized -
2967 // i.e. either a vector version isn't available or it is too expensive.
2968 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
2969  const TargetTransformInfo &TTI,
2970  const TargetLibraryInfo *TLI,
2971  bool &NeedToScalarize) {
2972  Function *F = CI->getCalledFunction();
2973  StringRef FnName = CI->getCalledFunction()->getName();
2974  Type *ScalarRetTy = CI->getType();
2975  SmallVector<Type *, 4> Tys, ScalarTys;
2976  for (auto &ArgOp : CI->arg_operands())
2977  ScalarTys.push_back(ArgOp->getType());
2978 
2979  // Estimate cost of scalarized vector call. The source operands are assumed
2980  // to be vectors, so we need to extract individual elements from there,
2981  // execute VF scalar calls, and then gather the result into the vector return
2982  // value.
2983  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
2984  if (VF == 1)
2985  return ScalarCallCost;
2986 
2987  // Compute corresponding vector type for return value and arguments.
2988  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
2989  for (Type *ScalarTy : ScalarTys)
2990  Tys.push_back(ToVectorTy(ScalarTy, VF));
2991 
2992  // Compute costs of unpacking argument values for the scalar calls and
2993  // packing the return values to a vector.
2994  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
2995 
2996  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
2997 
2998  // If we can't emit a vector call for this function, then the currently found
2999  // cost is the cost we need to return.
3000  NeedToScalarize = true;
3001  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3002  return Cost;
3003 
3004  // If the corresponding vector cost is cheaper, return its cost.
3005  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3006  if (VectorCallCost < Cost) {
3007  NeedToScalarize = false;
3008  return VectorCallCost;
3009  }
3010  return Cost;
3011 }
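// NOTE (editorial sketch, not part of the original source): the decision above
// compares, for an assumed VF = 4 and purely illustrative TTI numbers,
//
//   scalarized cost = ScalarCallCost * 4 + ScalarizationCost
//   vector cost     = cost of calling the vector library variant (if any)
//
// e.g. if one scalar call costs 10, packing/unpacking costs 8, and the vector
// variant costs 20, then 20 < 48 and NeedToScalarize is cleared; if TLI has no
// vector variant registered, the scalarized cost of 48 is returned instead.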
3012 
3013 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3014 // factor VF. Return the cost of the instruction, including scalarization
3015 // overhead if it's needed.
3016 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3017  const TargetTransformInfo &TTI,
3018  const TargetLibraryInfo *TLI) {
3019  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3020  assert(ID && "Expected intrinsic call!");
3021 
3022  FastMathFlags FMF;
3023  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3024  FMF = FPMO->getFastMathFlags();
3025 
3026  SmallVector<Value *, 4> Operands(CI->arg_operands());
3027  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3028 }
3029 
3030 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3031  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3032  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3033  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3034 }
3035 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3036  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3037  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3038  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3039 }
3040 
3041 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3042  // For every instruction `I` in MinBWs, truncate the operands, create a
3043  // truncated version of `I` and reextend its result. InstCombine runs
3044  // later and will remove any ext/trunc pairs.
3045  SmallPtrSet<Value *, 4> Erased;
3046  for (const auto &KV : Cost->getMinimalBitwidths()) {
3047  // If the value wasn't vectorized, we must maintain the original scalar
3048  // type. The absence of the value from VectorLoopValueMap indicates that it
3049  // wasn't vectorized.
3050  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3051  continue;
3052  for (unsigned Part = 0; Part < UF; ++Part) {
3053  Value *I = getOrCreateVectorValue(KV.first, Part);
3054  if (Erased.find(I) != Erased.end() || I->use_empty() ||
3055  !isa<Instruction>(I))
3056  continue;
3057  Type *OriginalTy = I->getType();
3058  Type *ScalarTruncatedTy =
3059  IntegerType::get(OriginalTy->getContext(), KV.second);
3060  Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3061  OriginalTy->getVectorNumElements());
3062  if (TruncatedTy == OriginalTy)
3063  continue;
3064 
3065  IRBuilder<> B(cast<Instruction>(I));
3066  auto ShrinkOperand = [&](Value *V) -> Value * {
3067  if (auto *ZI = dyn_cast<ZExtInst>(V))
3068  if (ZI->getSrcTy() == TruncatedTy)
3069  return ZI->getOperand(0);
3070  return B.CreateZExtOrTrunc(V, TruncatedTy);
3071  };
3072 
3073  // The actual instruction modification depends on the instruction type,
3074  // unfortunately.
3075  Value *NewI = nullptr;
3076  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3077  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3078  ShrinkOperand(BO->getOperand(1)));
3079 
3080  // Any wrapping introduced by shrinking this operation shouldn't be
3081  // considered undefined behavior. So, we can't unconditionally copy
3082  // arithmetic wrapping flags to NewI.
3083  cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3084  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3085  NewI =
3086  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3087  ShrinkOperand(CI->getOperand(1)));
3088  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3089  NewI = B.CreateSelect(SI->getCondition(),
3090  ShrinkOperand(SI->getTrueValue()),
3091  ShrinkOperand(SI->getFalseValue()));
3092  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3093  switch (CI->getOpcode()) {
3094  default:
3095  llvm_unreachable("Unhandled cast!");
3096  case Instruction::Trunc:
3097  NewI = ShrinkOperand(CI->getOperand(0));
3098  break;
3099  case Instruction::SExt:
3100  NewI = B.CreateSExtOrTrunc(
3101  CI->getOperand(0),
3102  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3103  break;
3104  case Instruction::ZExt:
3105  NewI = B.CreateZExtOrTrunc(
3106  CI->getOperand(0),
3107  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3108  break;
3109  }
3110  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3111  auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3112  auto *O0 = B.CreateZExtOrTrunc(
3113  SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3114  auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3115  auto *O1 = B.CreateZExtOrTrunc(
3116  SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3117 
3118  NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3119  } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3120  // Don't do anything with the operands, just extend the result.
3121  continue;
3122  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3123  auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3124  auto *O0 = B.CreateZExtOrTrunc(
3125  IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3126  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3127  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3128  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3129  auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3130  auto *O0 = B.CreateZExtOrTrunc(
3131  EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3132  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3133  } else {
3134  // If we don't know what to do, be conservative and don't do anything.
3135  continue;
3136  }
3137 
3138  // Lastly, extend the result.
3139  NewI->takeName(cast<Instruction>(I));
3140  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3141  I->replaceAllUsesWith(Res);
3142  cast<Instruction>(I)->eraseFromParent();
3143  Erased.insert(I);
3144  VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3145  }
3146  }
3147 
3148  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3149  for (const auto &KV : Cost->getMinimalBitwidths()) {
3150  // If the value wasn't vectorized, we must maintain the original scalar
3151  // type. The absence of the value from VectorLoopValueMap indicates that it
3152  // wasn't vectorized.
3153  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3154  continue;
3155  for (unsigned Part = 0; Part < UF; ++Part) {
3156  Value *I = getOrCreateVectorValue(KV.first, Part);
3157  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3158  if (Inst && Inst->use_empty()) {
3159  Value *NewI = Inst->getOperand(0);
3160  Inst->eraseFromParent();
3161  VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3162  }
3163  }
3164  }
3165 }
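// NOTE (editorial sketch, not part of the original source): if the cost model
// proved that a chain of i32 operations only ever carries 8 meaningful bits,
// the loop body (shorthand, VF = 4)
//
//   %a = load <4 x i32>
//   %b = add <4 x i32> %a, %a
//
// is rewritten to
//
//   %a  = load <4 x i32>
//   %at = trunc <4 x i32> %a to <4 x i8>
//   %bt = add <4 x i8> %at, %at
//   %b  = zext <4 x i8> %bt to <4 x i32>
//
// and InstCombine later removes the redundant trunc/zext pairs.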
3166 
3167 void InnerLoopVectorizer::fixVectorizedLoop() {
3168  // Insert truncates and extends for any truncated instructions as hints to
3169  // InstCombine.
3170  if (VF > 1)
3171  truncateToMinimalBitwidths();
3172 
3173  // Fix widened non-induction PHIs by setting up the PHI operands.
3174  if (OrigPHIsToFix.size()) {
3175  assert(EnableVPlanNativePath &&
3176  "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3177  fixNonInductionPHIs();
3178  }
3179 
3180  // At this point every instruction in the original loop is widened to a
3181  // vector form. Now we need to fix the recurrences in the loop. These PHI
3182  // nodes are currently empty because we did not want to introduce cycles.
3183  // This is the second stage of vectorizing recurrences.
3184  fixCrossIterationPHIs();
3185 
3186  // Update the dominator tree.
3187  //
3188  // FIXME: After creating the structure of the new loop, the dominator tree is
3189  // no longer up-to-date, and it remains that way until we update it
3190  // here. An out-of-date dominator tree is problematic for SCEV,
3191  // because SCEVExpander uses it to guide code generation. The
3192  // vectorizer uses SCEVExpander in several places. Instead, we should
3193  // keep the dominator tree up-to-date as we go.
3194  updateAnalysis();
3195 
3196  // Fix-up external users of the induction variables.
3197  for (auto &Entry : *Legal->getInductionVars())
3198  fixupIVUsers(Entry.first, Entry.second,
3199  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3200  IVEndValues[Entry.first], LoopMiddleBlock);
3201 
3202  fixLCSSAPHIs();
3203  for (Instruction *PI : PredicatedInstructions)
3204  sinkScalarOperands(&*PI);
3205 
3206  // Remove redundant induction instructions.
3207  cse(LoopVectorBody);
3208 }
3209 
3210 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3211  // In order to support recurrences we need to be able to vectorize Phi nodes.
3212  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3213  // stage #2: We now need to fix the recurrences by adding incoming edges to
3214  // the currently empty PHI nodes. At this point every instruction in the
3215  // original loop is widened to a vector form so we can use them to construct
3216  // the incoming edges.
3217  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3218  // Handle first-order recurrences and reductions that need to be fixed.
3219  if (Legal->isFirstOrderRecurrence(&Phi))
3220  fixFirstOrderRecurrence(&Phi);
3221  else if (Legal->isReductionVariable(&Phi))
3222  fixReduction(&Phi);
3223  }
3224 }
3225 
3226 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3227  // This is the second phase of vectorizing first-order recurrences. An
3228  // overview of the transformation is described below. Suppose we have the
3229  // following loop.
3230  //
3231  // for (int i = 0; i < n; ++i)
3232  // b[i] = a[i] - a[i - 1];
3233  //
3234  // There is a first-order recurrence on "a". For this loop, the shorthand
3235  // scalar IR looks like:
3236  //
3237  // scalar.ph:
3238  // s_init = a[-1]
3239  // br scalar.body
3240  //
3241  // scalar.body:
3242  // i = phi [0, scalar.ph], [i+1, scalar.body]
3243  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3244  // s2 = a[i]
3245  // b[i] = s2 - s1
3246  // br cond, scalar.body, ...
3247  //
3248  // In this example, s1 is a recurrence because its value depends on the
3249  // previous iteration. In the first phase of vectorization, we created a
3250  // temporary value for s1. We now complete the vectorization and produce the
3251  // shorthand vector IR shown below (for VF = 4, UF = 1).
3252  //
3253  // vector.ph:
3254  // v_init = vector(..., ..., ..., a[-1])
3255  // br vector.body
3256  //
3257  // vector.body
3258  // i = phi [0, vector.ph], [i+4, vector.body]
3259  // v1 = phi [v_init, vector.ph], [v2, vector.body]
3260  // v2 = a[i, i+1, i+2, i+3];
3261  // v3 = vector(v1(3), v2(0, 1, 2))
3262  // b[i, i+1, i+2, i+3] = v2 - v3
3263  // br cond, vector.body, middle.block
3264  //
3265  // middle.block:
3266  // x = v2(3)
3267  // br scalar.ph
3268  //
3269  // scalar.ph:
3270  // s_init = phi [x, middle.block], [a[-1], otherwise]
3271  // br scalar.body
3272  //
3273  // After the vector loop finishes executing, we extract the next value of
3274  // the recurrence (x) to use as the initial value in the scalar loop.
3275 
3276  // Get the original loop preheader and single loop latch.
3277  auto *Preheader = OrigLoop->getLoopPreheader();
3278  auto *Latch = OrigLoop->getLoopLatch();
3279 
3280  // Get the initial and previous values of the scalar recurrence.
3281  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3282  auto *Previous = Phi->getIncomingValueForBlock(Latch);
3283 
3284  // Create a vector from the initial value.
3285  auto *VectorInit = ScalarInit;
3286  if (VF > 1) {
3287  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3288  VectorInit = Builder.CreateInsertElement(
3289  UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3290  Builder.getInt32(VF - 1), "vector.recur.init");
3291  }
3292 
3293  // We constructed a temporary phi node in the first phase of vectorization.
3294  // This phi node will eventually be deleted.
3295  Builder.SetInsertPoint(
3296  cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3297 
3298  // Create a phi node for the new recurrence. The current value will either be
3299  // the initial value inserted into a vector or loop-varying vector value.
3300  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3301  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3302 
3303  // Get the vectorized previous value of the last part UF - 1. It appears last
3304  // among all unrolled iterations, due to the order of their construction.
3305  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3306 
3307  // Set the insertion point after the previous value if it is an instruction.
3308  // Note that the previous value may have been constant-folded so it is not
3309  // guaranteed to be an instruction in the vector loop. Also, if the previous
3310  // value is a phi node, we should insert after all the phi nodes to avoid
3311  // breaking basic block verification.
3312  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3313  isa<PHINode>(PreviousLastPart))
3314  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3315  else
3316  Builder.SetInsertPoint(
3317  &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3318 
3319  // We will construct a vector for the recurrence by combining the values for
3320  // the current and previous iterations. This is the required shuffle mask.
3321  SmallVector<Constant *, 8> ShuffleMask(VF);
3322  ShuffleMask[0] = Builder.getInt32(VF - 1);
3323  for (unsigned I = 1; I < VF; ++I)
3324  ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3325 
3326  // The vector from which to take the initial value for the current iteration
3327  // (actual or unrolled). Initially, this is the vector phi node.
3328  Value *Incoming = VecPhi;
3329 
3330  // Shuffle the current and previous vector and update the vector parts.
3331  for (unsigned Part = 0; Part < UF; ++Part) {
3332  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3333  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3334  auto *Shuffle =
3335  VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3336  ConstantVector::get(ShuffleMask))
3337  : Incoming;
3338  PhiPart->replaceAllUsesWith(Shuffle);
3339  cast<Instruction>(PhiPart)->eraseFromParent();
3340  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3341  Incoming = PreviousPart;
3342  }
3343 
3344  // Fix the latch value of the new recurrence in the vector loop.
3345  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3346 
3347  // Extract the last vector element in the middle block. This will be the
3348  // initial value for the recurrence when jumping to the scalar loop.
3349  auto *ExtractForScalar = Incoming;
3350  if (VF > 1) {
3352  ExtractForScalar = Builder.CreateExtractElement(
3353  ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3354  }
3355  // Extract the second-to-last element in the middle block if the
3356  // Phi is used outside the loop. We need to extract the phi itself
3357  // and not the last element (the phi update in the current iteration). This
3358  // will be the value when jumping to the exit block from the LoopMiddleBlock,
3359  // when the scalar loop is not run at all.
3360  Value *ExtractForPhiUsedOutsideLoop = nullptr;
3361  if (VF > 1)
3362  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3363  Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3364  // When the loop is unrolled without being vectorized, initialize
3365  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3366  // `Incoming`. This is analogous to the vectorized case above: extracting the
3367  // second last element when VF > 1.
3368  else if (UF > 1)
3369  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3370 
3371  // Fix the initial value of the original recurrence in the scalar loop.
3373  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3374  for (auto *BB : predecessors(LoopScalarPreHeader)) {
3375  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3376  Start->addIncoming(Incoming, BB);
3377  }
3378 
3380  Phi->setName("scalar.recur");
3381 
3382  // Finally, fix users of the recurrence outside the loop. The users will need
3383  // either the last value of the scalar recurrence or the last value of the
3384  // vector recurrence we extracted in the middle block. Since the loop is in
3385  // LCSSA form, we just need to find all the phi nodes for the original scalar
3386  // recurrence in the exit block, and then add an edge for the middle block.
3387  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3388  if (LCSSAPhi.getIncomingValue(0) == Phi) {
3389  LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3390  }
3391  }
3392 }
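// NOTE (editorial sketch, not part of the original source): with unrolling
// (e.g. VF = 4, UF = 2) the same splice is applied once per part, each part
// taking its "previous" lane from the part before it (shorthand):
//
//   v3.0 = vector(vphi(3), v2.0(0, 1, 2))   ; part 0 splices with the phi
//   v3.1 = vector(v2.0(3), v2.1(0, 1, 2))   ; part 1 splices with part 0
//
// and the latch value of the vector phi becomes the last part, v2.1.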
3393 
3394 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3395  Constant *Zero = Builder.getInt32(0);
3396 
3397  // Get its reduction variable descriptor.
3398  assert(Legal->isReductionVariable(Phi) &&
3399  "Unable to find the reduction variable");
3400  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3401 
3402  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3403  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3404  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3405  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3406  RdxDesc.getMinMaxRecurrenceKind();
3407  setDebugLocFromInst(Builder, ReductionStartValue);
3408 
3409  // We need to generate a reduction vector from the incoming scalar.
3410  // To do so, we need to generate the 'identity' vector and override
3411  // one of the elements with the incoming scalar reduction. We need
3412  // to do it in the vector-loop preheader.
3414 
3415  // This is the vector-clone of the value that leaves the loop.
3416  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3417 
3418  // Find the reduction identity variable. Zero for addition, or and xor;
3419  // one for multiplication; -1 for and.
3420  Value *Identity;
3421  Value *VectorStart;
3422  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3423  RK == RecurrenceDescriptor::RK_FloatMinMax) {
3424  // MinMax reductions have the start value as their identity.
3425  if (VF == 1) {
3426  VectorStart = Identity = ReductionStartValue;
3427  } else {
3428  VectorStart = Identity =
3429  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3430  }
3431  } else {
3432  // Handle other reduction kinds:
3433  Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3434  RK, VecTy->getScalarType());
3435  if (VF == 1) {
3436  Identity = Iden;
3437  // This vector is the Identity vector where the first element is the
3438  // incoming scalar reduction.
3439  VectorStart = ReductionStartValue;
3440  } else {
3441  Identity = ConstantVector::getSplat(VF, Iden);
3442 
3443  // This vector is the Identity vector where the first element is the
3444  // incoming scalar reduction.
3445  VectorStart =
3446  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3447  }
3448  }
3449 
3450  // Fix the vector-loop phi.
3451 
3452  // Reductions do not have to start at zero. They can start with
3453  // any loop invariant values.
3454  BasicBlock *Latch = OrigLoop->getLoopLatch();
3455  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3456  for (unsigned Part = 0; Part < UF; ++Part) {
3457  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3458  Value *Val = getOrCreateVectorValue(LoopVal, Part);
3459  // Make sure to add the reduction start value only to the
3460  // first unroll part.
3461  Value *StartVal = (Part == 0) ? VectorStart : Identity;
3462  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3463  cast<PHINode>(VecRdxPhi)
3464  ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3465  }
3466 
3467  // Before each round, move the insertion point right between
3468  // the PHIs and the values we are going to write.
3469  // This allows us to write both PHINodes and the extractelement
3470  // instructions.
3472 
3473  setDebugLocFromInst(Builder, LoopExitInst);
3474 
3475  // If the vector reduction can be performed in a smaller type, we truncate
3476  // then extend the loop exit value to enable InstCombine to evaluate the
3477  // entire expression in the smaller type.
3478  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3479  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3482  VectorParts RdxParts(UF);
3483  for (unsigned Part = 0; Part < UF; ++Part) {
3484  RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3485  Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3486  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3487  : Builder.CreateZExt(Trunc, VecTy);
3488  for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3489  UI != RdxParts[Part]->user_end();)
3490  if (*UI != Trunc) {
3491  (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3492  RdxParts[Part] = Extnd;
3493  } else {
3494  ++UI;
3495  }
3496  }
3498  for (unsigned Part = 0; Part < UF; ++Part) {
3499  RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3500  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3501  }
3502  }
3503 
3504  // Reduce all of the unrolled parts into a single vector.
3505  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3506  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3507  setDebugLocFromInst(Builder, ReducedPartRdx);
3508  for (unsigned Part = 1; Part < UF; ++Part) {
3509  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3510  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3511  // Floating point operations had to be 'fast' to enable the reduction.
3512  ReducedPartRdx = addFastMathFlag(
3513  Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3514  ReducedPartRdx, "bin.rdx"));
3515  else
3516  ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3517  RdxPart);
3518  }
3519 
3520  if (VF > 1) {
3521  bool NoNaN = Legal->hasFunNoNaNAttr();
3522  ReducedPartRdx =
3523  createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3524  // If the reduction can be performed in a smaller type, we need to extend
3525  // the reduction to the wider type before we branch to the original loop.
3526  if (Phi->getType() != RdxDesc.getRecurrenceType())
3527  ReducedPartRdx =
3528  RdxDesc.isSigned()
3529  ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3530  : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3531  }
3532 
3533  // Create a phi node that merges control-flow from the backedge-taken check
3534  // block and the middle block.
3535  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3537  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3538  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3539  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3540 
3541  // Now, we need to fix the users of the reduction variable
3542  // inside and outside of the scalar remainder loop.
3543  // We know that the loop is in LCSSA form. We need to update the
3544  // PHI nodes in the exit blocks.
3545  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3546  // All PHINodes need to have a single entry edge, or two if
3547  // we already fixed them.
3548  assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3549 
3550  // We found a reduction value exit-PHI. Update it with the
3551  // incoming bypass edge.
3552  if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3553  LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3554  } // end of the LCSSA phi scan.
3555 
3556  // Fix the scalar loop reduction variable with the incoming reduction sum
3557  // from the vector body and from the backedge value.
3558  int IncomingEdgeBlockIdx =
3560  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3561  // Pick the other block.
3562  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3563  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3564  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3565 }
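// NOTE (editorial sketch, not part of the original source): for an integer add
// reduction `for (i) s += a[i]` with VF = 4 and UF = 2, the steps above amount
// to (shorthand):
//
//   vec.phi.0 = phi [ <s, 0, 0, 0>, preheader ], [ sum.0, latch ]
//   vec.phi.1 = phi [ <0, 0, 0, 0>, preheader ], [ sum.1, latch ]   ; identity
//   ...
//   bin.rdx      = add <4 x i32> sum.0, sum.1     ; combine unrolled parts
//   rdx          = horizontal add of bin.rdx      ; target reduction
//   bc.merge.rdx = phi [ s, bypass ], [ rdx, middle.block ]
//
// and the scalar remainder loop resumes the reduction from bc.merge.rdx.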
3566 
3567 void InnerLoopVectorizer::fixLCSSAPHIs() {
3568  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3569  if (LCSSAPhi.getNumIncomingValues() == 1) {
3570  auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3571  // Non-instruction incoming values will have only one value.
3572  unsigned LastLane = 0;
3573  if (isa<Instruction>(IncomingValue))
3574  LastLane = Cost->isUniformAfterVectorization(
3575  cast<Instruction>(IncomingValue), VF)
3576  ? 0
3577  : VF - 1;
3578  // Can be a loop invariant incoming value or the last scalar value to be
3579  // extracted from the vectorized loop.
3581  Value *lastIncomingValue =
3582  getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3583  LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3584  }
3585  }
3586 }
3587 
3588 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3589  // The basic block and loop containing the predicated instruction.
3590  auto *PredBB = PredInst->getParent();
3591  auto *VectorLoop = LI->getLoopFor(PredBB);
3592 
3593  // Initialize a worklist with the operands of the predicated instruction.
3594  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3595 
3596  // Holds instructions that we need to analyze again. An instruction may be
3597  // reanalyzed if we don't yet know if we can sink it or not.
3598  SmallVector<Instruction *, 8> InstsToReanalyze;
3599 
3600  // Returns true if a given use occurs in the predicated block. Phi nodes use
3601  // their operands in their corresponding predecessor blocks.
3602  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3603  auto *I = cast<Instruction>(U.getUser());
3604  BasicBlock *BB = I->getParent();
3605  if (auto *Phi = dyn_cast<PHINode>(I))
3606  BB = Phi->getIncomingBlock(
3607  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3608  return BB == PredBB;
3609  };
3610 
3611  // Iteratively sink the scalarized operands of the predicated instruction
3612  // into the block we created for it. When an instruction is sunk, its
3613  // operands are then added to the worklist. The algorithm ends after one pass
3614  // through the worklist doesn't sink a single instruction.
3615  bool Changed;
3616  do {
3617  // Add the instructions that need to be reanalyzed to the worklist, and
3618  // reset the changed indicator.
3619  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3620  InstsToReanalyze.clear();
3621  Changed = false;
3622 
3623  while (!Worklist.empty()) {
3624  auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3625 
3626  // We can't sink an instruction if it is a phi node, is already in the
3627  // predicated block, is not in the loop, or may have side effects.
3628  if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3629  !VectorLoop->contains(I) || I->mayHaveSideEffects())
3630  continue;
3631 
3632  // It's legal to sink the instruction if all its uses occur in the
3633  // predicated block. Otherwise, there's nothing to do yet, and we may
3634  // need to reanalyze the instruction.
3635  if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3636  InstsToReanalyze.push_back(I);
3637  continue;
3638  }
3639 
3640  // Move the instruction to the beginning of the predicated block, and add
3641  // its operands to the worklist.
3642  I->moveBefore(&*PredBB->getFirstInsertionPt());
3643  Worklist.insert(I->op_begin(), I->op_end());
3644 
3645  // The sinking may have enabled other instructions to be sunk, so we will
3646  // need to iterate.
3647  Changed = true;
3648  }
3649  } while (Changed);
3650 }
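// NOTE (editorial sketch, not part of the original source): if a division was
// scalarized and predicated, a scalar operand used only by that division can
// be sunk into the predicated block (names invented for the example):
//
//   pred.sdiv.if:
//     %e = extractelement <4 x i32> %divisor, i32 0   ; sunk: only used by %d
//     %d = sdiv i32 %x, %e
//     br label %pred.sdiv.continue
//
// Instructions with uses outside the predicated block, or with side effects,
// are left in place and may be revisited on a later pass over the worklist.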
3651 
3652 void InnerLoopVectorizer::fixNonInductionPHIs() {
3653  for (PHINode *OrigPhi : OrigPHIsToFix) {
3654  PHINode *NewPhi =
3655  cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3656  unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3657 
3658  SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3659  predecessors(OrigPhi->getParent()));
3660  SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3661  predecessors(NewPhi->getParent()));
3662  assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3663  "Scalar and Vector BB should have the same number of predecessors");
3664 
3665  // The insertion point in Builder may be invalidated by the time we get
3666  // here. Force the Builder insertion point to something valid so that we do
3667  // not run into issues during insertion point restore in
3668  // getOrCreateVectorValue calls below.
3669  Builder.SetInsertPoint(NewPhi);
3670 
3671  // The predecessor order is preserved and we can rely on mapping between
3672  // scalar and vector block predecessors.
3673  for (unsigned i = 0; i < NumIncomingValues; ++i) {
3674  BasicBlock *NewPredBB = VectorBBPredecessors[i];
3675 
3676  // When looking up the new scalar/vector values to fix up, use incoming
3677  // values from original phi.
3678  Value *ScIncV =
3679  OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3680 
3681  // Scalar incoming value may need a broadcast
3682  Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3683  NewPhi->addIncoming(NewIncV, NewPredBB);
3684  }
3685  }
3686 }
3687 
3688 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3689  unsigned VF) {
3690  PHINode *P = cast<PHINode>(PN);
3691  if (EnableVPlanNativePath) {
3692  // Currently we enter here in the VPlan-native path for non-induction
3693  // PHIs where all control flow is uniform. We simply widen these PHIs.
3694  // Create a vector phi with no operands - the vector phi operands will be
3695  // set at the end of vector code generation.
3696  Type *VecTy =
3697  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3698  Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3699  VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3700  OrigPHIsToFix.push_back(P);
3701 
3702  return;
3703  }
3704 
3705  assert(PN->getParent() == OrigLoop->getHeader() &&
3706  "Non-header phis should have been handled elsewhere");
3707 
3708  // In order to support recurrences we need to be able to vectorize Phi nodes.
3709  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3710  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3711  // this value when we vectorize all of the instructions that use the PHI.
3712  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3713  for (unsigned Part = 0; Part < UF; ++Part) {
3714  // This is phase one of vectorizing PHIs.
3715  Type *VecTy =
3716  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3717  Value *EntryPart = PHINode::Create(
3718  VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3719  VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3720  }
3721  return;
3722  }
3723 
3725 
3726  // This PHINode must be an induction variable.
3727  // Make sure that we know about it.
3728  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3729 
3731  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3732 
3733  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3734  // which can be found from the original scalar operations.
3735  switch (II.getKind()) {
3736  case InductionDescriptor::IK_NoInduction:
3737  llvm_unreachable("Unknown induction");
3738  case InductionDescriptor::IK_IntInduction:
3739  case InductionDescriptor::IK_FpInduction:
3740  llvm_unreachable("Integer/fp induction is handled elsewhere.");
3741  case InductionDescriptor::IK_PtrInduction: {
3742  // Handle the pointer induction variable case.
3743  assert(P->getType()->isPointerTy() && "Unexpected type.");
3744  // This is the normalized GEP that starts counting at zero.
3745  Value *PtrInd = Induction;
3746  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3747  // Determine the number of scalars we need to generate for each unroll
3748  // iteration. If the instruction is uniform, we only need to generate the
3749  // first lane. Otherwise, we generate all VF values.
3750  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3751  // These are the scalar results. Notice that we don't generate vector GEPs
3752  // because scalar GEPs result in better code.
3753  for (unsigned Part = 0; Part < UF; ++Part) {
3754  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3755  Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3756  Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3757  Value *SclrGep =
3758  emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3759  SclrGep->setName("next.gep");
3760  VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3761  }
3762  }
3763  return;
3764  }
3765  }
3766 }
3767 
3768 /// A helper function for checking whether an integer division-related
3769 /// instruction may divide by zero (in which case it must be predicated if
3770 /// executed conditionally in the scalar code).
3771 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3772 /// Non-zero divisors that are non-compile-time constants will not be
3773 /// converted into multiplication, so we will still end up scalarizing
3774 /// the division, but can do so w/o predication.
3775 static bool mayDivideByZero(Instruction &I) {
3776  assert((I.getOpcode() == Instruction::UDiv ||
3777  I.getOpcode() == Instruction::SDiv ||
3778  I.getOpcode() == Instruction::URem ||
3779  I.getOpcode() == Instruction::SRem) &&
3780  "Unexpected instruction");
3781  Value *Divisor = I.getOperand(1);
3782  auto *CInt = dyn_cast<ConstantInt>(Divisor);
3783  return !CInt || CInt->isZero();
3784 }
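// NOTE (editorial sketch, not part of the original source): for
//
//   if (c[i]) b[i] = x / d[i];   // divisor not a compile-time constant
//   if (c[i]) b[i] = x / 7;      // divisor is a non-zero constant
//
// the first division must stay predicated when vectorized (d[i] may be zero on
// masked-off lanes), while the second can be executed unconditionally.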
3785 
3786 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
3787  switch (I.getOpcode()) {
3788  case Instruction::Br:
3789  case Instruction::PHI:
3790  llvm_unreachable("This instruction is handled by a different recipe.");
3791  case Instruction::GetElementPtr: {
3792  // Construct a vector GEP by widening the operands of the scalar GEP as
3793  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3794  // results in a vector of pointers when at least one operand of the GEP
3795  // is vector-typed. Thus, to keep the representation compact, we only use
3796  // vector-typed operands for loop-varying values.
3797  auto *GEP = cast<GetElementPtrInst>(&I);
3798 
3799  if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
3800  // If we are vectorizing, but the GEP has only loop-invariant operands,
3801  // the GEP we build (by only using vector-typed operands for
3802  // loop-varying values) would be a scalar pointer. Thus, to ensure we
3803  // produce a vector of pointers, we need to either arbitrarily pick an
3804  // operand to broadcast, or broadcast a clone of the original GEP.
3805  // Here, we broadcast a clone of the original.
3806  //
3807  // TODO: If at some point we decide to scalarize instructions having
3808  // loop-invariant operands, this special case will no longer be
3809  // required. We would add the scalarization decision to
3810  // collectLoopScalars() and teach getVectorValue() to broadcast
3811  // the lane-zero scalar value.
3812  auto *Clone = Builder.Insert(GEP->clone());
3813  for (unsigned Part = 0; Part < UF; ++Part) {
3814  Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
3815  VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
3816  addMetadata(EntryPart, GEP);
3817  }
3818  } else {
3819  // If the GEP has at least one loop-varying operand, we are sure to
3820  // produce a vector of pointers. But if we are only unrolling, we want
3821  // to produce a scalar GEP for each unroll part. Thus, the GEP we
3822  // produce with the code below will be scalar (if VF == 1) or vector
3823  // (otherwise). Note that for the unroll-only case, we still maintain
3824  // values in the vector mapping with initVector, as we do for other
3825  // instructions.
3826  for (unsigned Part = 0; Part < UF; ++Part) {
3827  // The pointer operand of the new GEP. If it's loop-invariant, we
3828  // won't broadcast it.
3829  auto *Ptr =
3830  OrigLoop->isLoopInvariant(GEP->getPointerOperand())
3831  ? GEP->getPointerOperand()
3832  : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
3833 
3834  // Collect all the indices for the new GEP. If any index is
3835  // loop-invariant, we won't broadcast it.
3836  SmallVector<Value *, 4> Indices;
3837  for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
3838  if (OrigLoop->isLoopInvariant(U.get()))
3839  Indices.push_back(U.get());
3840  else
3841  Indices.push_back(getOrCreateVectorValue(U.get(), Part));
3842  }
3843 
3844  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
3845  // but it should be a vector, otherwise.
3846  auto *NewGEP = GEP->isInBounds()
3847  ? Builder.CreateInBoundsGEP(Ptr, Indices)
3848  : Builder.CreateGEP(Ptr, Indices);
3849  assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
3850  "NewGEP is not a pointer vector");
3851  VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
3852  addMetadata(NewGEP, GEP);
3853  }
3854  }
3855 
3856  break;
3857  }
3858  case Instruction::UDiv:
3859  case Instruction::SDiv:
3860  case Instruction::SRem:
3861  case Instruction::URem:
3862  case Instruction::Add:
3863  case Instruction::FAdd:
3864  case Instruction::Sub:
3865  case Instruction::FSub:
3866  case Instruction::Mul:
3867  case Instruction::FMul:
3868  case Instruction::FDiv:
3869  case Instruction::FRem:
3870  case Instruction::Shl:
3871  case Instruction::LShr:
3872  case Instruction::AShr:
3873  case Instruction::And:
3874  case Instruction::Or:
3875  case Instruction::Xor: {
3876  // Just widen binops.
3877  auto *BinOp = cast<BinaryOperator>(&I);
3878  setDebugLocFromInst(Builder, BinOp);
3879 
3880  for (unsigned Part = 0; Part < UF; ++Part) {
3881  Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
3882  Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
3883  Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
3884 
3885  if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
3886  VecOp->copyIRFlags(BinOp);
3887 
3888  // Use this vector value for all users of the original instruction.
3889  VectorLoopValueMap.setVectorValue(&I, Part, V);
3890  addMetadata(V, BinOp);
3891  }
3892 
3893  break;
3894  }
3895  case Instruction::Select: {
3896  // Widen selects.
3897  // If the selector is loop invariant we can create a select
3898  // instruction with a scalar condition. Otherwise, use vector-select.
3899  auto *SE = PSE.getSE();
3900  bool InvariantCond =
3901  SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
3902  setDebugLocFromInst(Builder, &I);
3903 
3904  // The condition can be loop invariant but still defined inside the
3905  // loop. This means that we can't just use the original 'cond' value.
3906  // We have to take the 'vectorized' value and pick the first lane.
3907  // Instcombine will make this a no-op.
3908 
3909  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
3910 
3911  for (unsigned Part = 0; Part < UF; ++Part) {
3912  Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
3913  Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
3914  Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
3915  Value *Sel =
3916  Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
3917  VectorLoopValueMap.setVectorValue(&I, Part, Sel);
3918  addMetadata(Sel, &I);
3919  }
3920 
3921  break;
3922  }
3923 
3924  case Instruction::ICmp:
3925  case Instruction::FCmp: {
3926  // Widen compares. Generate vector compares.
3927  bool FCmp = (I.getOpcode() == Instruction::FCmp);
3928  auto *Cmp = dyn_cast<CmpInst>(&I);
3930  for (unsigned Part = 0; Part < UF; ++Part) {
3931  Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
3932  Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
3933  Value *C = nullptr;
3934  if (FCmp) {
3935  // Propagate fast math flags.
3937  Builder.setFastMathFlags(Cmp->getFastMathFlags());
3938  C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
3939  } else {
3940  C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
3941  }
3942  VectorLoopValueMap.setVectorValue(&I, Part, C);
3943  addMetadata(C, &I);
3944  }
3945 
3946  break;
3947  }
3948 
3949  case Instruction::ZExt:
3950  case Instruction::SExt:
3951  case Instruction::FPToUI:
3952  case Instruction::FPToSI:
3953  case Instruction::FPExt:
3954  case Instruction::PtrToInt:
3955  case Instruction::IntToPtr:
3956  case Instruction::SIToFP:
3957  case Instruction::UIToFP:
3958  case Instruction::Trunc:
3959  case Instruction::FPTrunc:
3960  case Instruction::BitCast: {
3961  auto *CI = dyn_cast<CastInst>(&I);
3963 
3964  /// Vectorize casts.
3965  Type *DestTy =
3966  (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
3967 
3968  for (unsigned Part = 0; Part < UF; ++Part) {
3969  Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
3970  Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
3971  VectorLoopValueMap.setVectorValue(&I, Part, Cast);
3972  addMetadata(Cast, &I);
3973  }
3974  break;
3975  }
3976 
3977  case Instruction::Call: {
3978  // Ignore dbg intrinsics.
3979  if (isa<DbgInfoIntrinsic>(I))
3980  break;
3982 
3983  Module *M = I.getParent()->getParent()->getParent();
3984  auto *CI = cast<CallInst>(&I);
3985 
3986  StringRef FnName = CI->getCalledFunction()->getName();
3987  Function *F = CI->getCalledFunction();
3988  Type *RetTy = ToVectorTy(CI->getType(), VF);
3990  for (Value *ArgOperand : CI->arg_operands())
3991  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
3992 
3993  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3994 
3995  // The flag shows whether we use Intrinsic or a usual Call for vectorized
3996  // version of the instruction.
3997  // Is it beneficial to perform intrinsic call compared to lib call?
3998  bool NeedToScalarize;
3999  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
4000  bool UseVectorIntrinsic =
4001  ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
4002  assert((UseVectorIntrinsic || !NeedToScalarize) &&
4003  "Instruction should be scalarized elsewhere.");
4004 
4005  for (unsigned Part = 0; Part < UF; ++Part) {
4006  SmallVector<Value *, 4> Args;
4007  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4008  Value *Arg = CI->getArgOperand(i);
4009  // Some intrinsics have a scalar argument - don't replace it with a
4010  // vector.
4011  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4012  Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4013  Args.push_back(Arg);
4014  }
4015 
4016  Function *VectorF;
4017  if (UseVectorIntrinsic) {
4018  // Use vector version of the intrinsic.
4019  Type *TysForDecl[] = {CI->getType()};
4020  if (VF > 1)
4021  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4022  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4023  } else {
4024  // Use vector version of the library call.
4025  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4026  assert(!VFnName.empty() && "Vector function name is empty.");
4027  VectorF = M->getFunction(VFnName);
4028  if (!VectorF) {
4029  // Generate a declaration
4030  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4031  VectorF =
4032  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4033  VectorF->copyAttributesFrom(F);
4034  }
4035  }
4036  assert(VectorF && "Can't create vector function.");
4037 
4038  SmallVector<OperandBundleDef, 1> OpBundles;
4039  CI->getOperandBundlesAsDefs(OpBundles);
4040  CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4041 
4042  if (isa<FPMathOperator>(V))
4043  V->copyFastMathFlags(CI);
4044 
4045  VectorLoopValueMap.setVectorValue(&I, Part, V);
4046  addMetadata(V, &I);
4047  }
4048 
4049  break;
4050  }
4051 
4052  default:
4053  // This instruction is not vectorized by simple widening.
4054  LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4055  llvm_unreachable("Unhandled instruction!");
4056  } // end of switch.
4057 }
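// NOTE (editorial sketch, not part of the original source): widening a simple
// binary operation with VF = 4 and UF = 2 turns the scalar body
//
//   b[i] = a[i] + 1;
//
// into two vector adds per iteration of the vector loop (shorthand):
//
//   b[i,   ..., i+3] = a[i,   ..., i+3] + <1, 1, 1, 1>   ; part 0
//   b[i+4, ..., i+7] = a[i+4, ..., i+7] + <1, 1, 1, 1>   ; part 1
//
// with IR flags and metadata copied from the original instruction.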
4058 
4059 void InnerLoopVectorizer::updateAnalysis() {
4060  // Forget the original basic block.
4061  PSE.getSE()->forgetLoop(OrigLoop);
4062 
4063  // DT is not kept up-to-date for outer loop vectorization.
4064  if (EnableVPlanNativePath)
4065  return;
4066 
4067  // Update the dominator tree information.
4068  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4069  "Entry does not dominate exit.");
4070 
4077 }
4078 
4079 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4080  // We should not collect Scalars more than once per VF. Right now, this
4081  // function is called from collectUniformsAndScalars(), which already does
4082  // this check. Collecting Scalars for VF=1 does not make any sense.
4083  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4084  "This function should not be visited twice for the same VF");
4085 
4086  SmallSetVector<Instruction *, 8> Worklist;
4087 
4088  // These sets are used to seed the analysis with pointers used by memory
4089  // accesses that will remain scalar.
4090  SmallSetVector<Instruction *, 8> ScalarPtrs;
4091  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4092 
4093  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4094  // The pointer operands of loads and stores will be scalar as long as the
4095  // memory access is not a gather or scatter operation. The value operand of a
4096  // store will remain scalar if the store is scalarized.
4097  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4098  InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4099  assert(WideningDecision != CM_Unknown &&
4100  "Widening decision should be ready at this moment");
4101  if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4102  if (Ptr == Store->getValueOperand())
4103  return WideningDecision == CM_Scalarize;
4104  assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4105  "Ptr is neither a value or pointer operand");
4106  return WideningDecision != CM_GatherScatter;
4107  };
4108 
4109  // A helper that returns true if the given value is a bitcast or
4110  // getelementptr instruction contained in the loop.
4111  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4112  return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4113  isa<GetElementPtrInst>(V)) &&
4114  !TheLoop->isLoopInvariant(V);
4115  };
4116 
4117  // A helper that evaluates a memory access's use of a pointer. If the use
4118  // will be a scalar use, and the pointer is only used by memory accesses, we
4119  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4120  // PossibleNonScalarPtrs.
4121  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4122  // We only care about bitcast and getelementptr instructions contained in
4123  // the loop.
4124  if (!isLoopVaryingBitCastOrGEP(Ptr))
4125  return;
4126 
4127  // If the pointer has already been identified as scalar (e.g., if it was
4128  // also identified as uniform), there's nothing to do.
4129  auto *I = cast<Instruction>(Ptr);
4130  if (Worklist.count(I))
4131  return;
4132 
4133  // If the use of the pointer will be a scalar use, and all users of the
4134  // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4135  // place the pointer in PossibleNonScalarPtrs.
4136  if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4137  return isa<LoadInst>(U) || isa<StoreInst>(U);
4138  }))
4139  ScalarPtrs.insert(I);
4140  else
4141  PossibleNonScalarPtrs.insert(I);
4142  };
4143 
4144  // We seed the scalars analysis with three classes of instructions: (1)
4145  // instructions marked uniform-after-vectorization, (2) bitcast and
4146  // getelementptr instructions used by memory accesses requiring a scalar use,
4147  // and (3) pointer induction variables and their update instructions (we
4148  // currently only scalarize these).
4149  //
4150  // (1) Add to the worklist all instructions that have been identified as
4151  // uniform-after-vectorization.
4152  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4153 
4154  // (2) Add to the worklist all bitcast and getelementptr instructions used by
4155  // memory accesses requiring a scalar use. The pointer operands of loads and
4156  // stores will be scalar as long as the memory access is not a gather or
4157  // scatter operation. The value operand of a store will remain scalar if the
4158  // store is scalarized.
4159  for (auto *BB : TheLoop->blocks())
4160  for (auto &I : *BB) {
4161  if (auto *Load = dyn_cast<LoadInst>(&I)) {
4162  evaluatePtrUse(Load, Load->getPointerOperand());
4163  } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4164  evaluatePtrUse(Store, Store->getPointerOperand());
4165  evaluatePtrUse(Store, Store->getValueOperand());
4166  }
4167  }
4168  for (auto *I : ScalarPtrs)
4169  if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4170  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4171  Worklist.insert(I);
4172  }
4173 
4174  // (3) Add to the worklist all pointer induction variables and their update
4175  // instructions.
4176  //
4177  // TODO: Once we are able to vectorize pointer induction variables we should
4178  // no longer insert them into the worklist here.
4179  auto *Latch = TheLoop->getLoopLatch();
4180  for (auto &Induction : *Legal->getInductionVars()) {
4181  auto *Ind = Induction.first;
4182  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4183  if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4184  continue;
4185  Worklist.insert(Ind);
4186  Worklist.insert(IndUpdate);
4187  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4188  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4189  << "\n");
4190  }
4191 
4192  // Insert the forced scalars.
4193  // FIXME: Currently widenPHIInstruction() often creates a dead vector
4194  // induction variable when the PHI user is scalarized.
4195  auto ForcedScalar = ForcedScalars.find(VF);
4196  if (ForcedScalar != ForcedScalars.end())
4197  for (auto *I : ForcedScalar->second)
4198  Worklist.insert(I);
4199 
4200  // Expand the worklist by looking through any bitcasts and getelementptr
4201  // instructions we've already identified as scalar. This is similar to the
4202  // expansion step in collectLoopUniforms(); however, here we're only
4203  // expanding to include additional bitcasts and getelementptr instructions.
4204  unsigned Idx = 0;
4205  while (Idx != Worklist.size()) {
4206  Instruction *Dst = Worklist[Idx++];
4207  if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4208  continue;
4209  auto *Src = cast<Instruction>(Dst->getOperand(0));
4210  if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4211  auto *J = cast<Instruction>(U);
4212  return !TheLoop->contains(J) || Worklist.count(J) ||
4213  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4214  isScalarUse(J, Src));
4215  })) {
4216  Worklist.insert(Src);
4217  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4218  }
4219  }
4220 
4221  // An induction variable will remain scalar if all users of the induction
4222  // variable and induction variable update remain scalar.
4223  for (auto &Induction : *Legal->getInductionVars()) {
4224  auto *Ind = Induction.first;
4225  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4226 
4227  // We already considered pointer induction variables, so there's no reason
4228  // to look at their users again.
4229  //
4230  // TODO: Once we are able to vectorize pointer induction variables we
4231  // should no longer skip over them here.
4232  if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4233  continue;
4234 
4235  // Determine if all users of the induction variable are scalar after
4236  // vectorization.
4237  auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4238  auto *I = cast<Instruction>(U);
4239  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4240  });
4241  if (!ScalarInd)
4242  continue;
4243 
4244  // Determine if all users of the induction variable update instruction are
4245  // scalar after vectorization.
4246  auto ScalarIndUpdate =
4247  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4248  auto *I = cast<Instruction>(U);
4249  return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4250  });
4251  if (!ScalarIndUpdate)
4252  continue;
4253 
4254  // The induction variable and its update instruction will remain scalar.
4255  Worklist.insert(Ind);
4256  Worklist.insert(IndUpdate);
4257  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4258  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4259  << "\n");
4260  }
4261 
4262  Scalars[VF].insert(Worklist.begin(), Worklist.end());
4263 }
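// NOTE (editorial sketch, not part of the original source): in
//
//   for (i = 0; i < n; ++i)
//     if (c[i]) a[i] = 0;        // conditional store -> scalarized
//
// the store's getelementptr feeds only the (scalarized) store, so it is added
// to ScalarPtrs and ends up in Scalars[VF]; the pointer operand of a
// gather/scatter access, or one that also has non-memory users, would instead
// land in PossibleNonScalarPtrs and remain vector.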
4264 
4267  return false;
4268  switch(I->getOpcode()) {
4269  default:
4270  break;
4271  case Instruction::Load:
4272  case Instruction::Store: {
4273  if (!Legal->isMaskRequired(I))
4274  return false;
4275  auto *Ptr = getLoadStorePointerOperand(I);
4276  auto *Ty = getMemInstValueType(I);
4277  // We have already decided how to vectorize this instruction, get that
4278  // result.
4279  if (VF > 1) {
4280  InstWidening WideningDecision = getWideningDecision(I, VF);
4281  assert(WideningDecision != CM_Unknown &&
4282  "Widening decision should be ready at this moment");
4283  return WideningDecision == CM_Scalarize;
4284  }
4285  return isa<LoadInst>(I) ?
4286  !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4287  : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4288  }
4289  case Instruction::UDiv:
4290  case Instruction::SDiv:
4291  case Instruction::SRem:
4292  case Instruction::URem:
4293  return mayDivideByZero(*I);
4294  }
4295  return false;
4296 }
4297 
4298 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
4299  if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0))
4300  return TTI.enableMaskedInterleavedAccessVectorization();
4301 
4302  // If an override option has been passed in for interleaved accesses, use it.
4303  return EnableMaskedInterleavedMemAccesses;
4304 }
4305 
4306 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4307  unsigned VF) {
4308  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4309  assert(getWideningDecision(I, VF) == CM_Unknown &&
4310  "Decision should not be set yet.");
4311 
4312  if (!Legal->blockNeedsPredication(I->getParent()) ||
4313  !Legal->isMaskRequired(I))
4314  return true;
4315 
4316  if (!useMaskedInterleavedAccesses(TTI))
4317  return false;
4318 
4319  auto *Ty = getMemInstValueType(I);
4320  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4321  : TTI.isLegalMaskedStore(Ty);
4322 }
4323 
4324 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4325  unsigned VF) {
4326  // Get and ensure we have a valid memory instruction.
4327  LoadInst *LI = dyn_cast<LoadInst>(I);
4328  StoreInst *SI = dyn_cast<StoreInst>(I);
4329  assert((LI || SI) && "Invalid memory instruction");
4330 
4331  auto *Ptr = getLoadStorePointerOperand(I);
4332 
4333  // In order to be widened, the pointer should be consecutive, first of all.
4334  if (!Legal->isConsecutivePtr(Ptr))
4335  return false;
4336 
4337  // If the instruction is a store located in a predicated block, it will be
4338  // scalarized.
4339  if (isScalarWithPredication(I))
4340  return false;
4341 
4342  // If the instruction's allocated size doesn't equal its type size, it
4343  // requires padding and will be scalarized.
4344  auto &DL = I->getModule()->getDataLayout();
4345  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4346  if (hasIrregularType(ScalarTy, DL, VF))
4347  return false;
4348 
4349  return true;
4350 }
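// Illustrative sketch (not part of the original source): hasIrregularType
// above rejects types whose in-memory allocation is padded, e.g. on typical
// targets an x86_fp80 value stores fewer bytes than it is allocated, so
// <VF x x86_fp80> is not laid out as VF consecutive scalars and such accesses
// are scalarized rather than widened.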
4351 
4352 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4353  // We should not collect Uniforms more than once per VF. Right now,
4354  // this function is called from collectUniformsAndScalars(), which
4355  // already does this check. Collecting Uniforms for VF=1 does not make any
4356  // sense.
4357 
4358  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4359  "This function should not be visited twice for the same VF");
4360 
4361  // Visit the list of Uniforms. If we do not find any uniform value, we will
4362  // not analyze it again; Uniforms.count(VF) will return 1.
4363  Uniforms[VF].clear();
4364 
4365  // We now know that the loop is vectorizable!
4366  // Collect instructions inside the loop that will remain uniform after
4367  // vectorization.
4368 
4369  // Global values, params and instructions outside of current loop are out of
4370  // scope.
4371  auto isOutOfScope = [&](Value *V) -> bool {
4372  Instruction *I = dyn_cast<Instruction>(V);
4373  return (!I || !TheLoop->contains(I));
4374  };
4375 
4376  SetVector<Instruction *> Worklist;
4377  BasicBlock *Latch = TheLoop->getLoopLatch();
4378 
4379  // Start with the conditional branch. If the branch condition is an
4380  // instruction contained in the loop that is only used by the branch, it is
4381  // uniform.
4382  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4383  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4384  Worklist.insert(Cmp);
4385  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4386  }
4387 
4388  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4389  // are pointers that are treated like consecutive pointers during
4390  // vectorization. The pointer operands of interleaved accesses are an
4391  // example.
4392  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4393 
4394  // Holds pointer operands of instructions that are possibly non-uniform.
4395  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4396 
4397  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4398  InstWidening WideningDecision = getWideningDecision(I, VF);
4399  assert(WideningDecision != CM_Unknown &&
4400  "Widening decision should be ready at this moment");
4401 
4402  return (WideningDecision == CM_Widen ||
4403  WideningDecision == CM_Widen_Reverse ||
4404  WideningDecision == CM_Interleave);
4405  };
4406  // Iterate over the instructions in the loop, and collect all
4407  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4408  // that a consecutive-like pointer operand will be scalarized, we collect it
4409  // in PossibleNonUniformPtrs instead. We use two sets here because a single
4410  // getelementptr instruction can be used by both vectorized and scalarized
4411  // memory instructions. For example, if a loop loads and stores from the same
4412  // location, but the store is conditional, the store will be scalarized, and
4413  // the getelementptr won't remain uniform.
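// Illustrative sketch (not part of the original source), assuming a simple
// C loop:
//   for (int i = 0; i < n; ++i) {
//     int x = a[i];   // widened load; &a[i] looks consecutive-like
//     if (x > 0)
//       a[i] = 0;     // predicated store; may be scalarized
//   }
// The single getelementptr computing &a[i] feeds both accesses, so the
// scalarized store places it in PossibleNonUniformPtrs even though the load
// alone would have placed it in ConsecutiveLikePtrs.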
4414  for (auto *BB : TheLoop->blocks())
4415  for (auto &I : *BB) {
4416  // If there's no pointer operand, there's nothing to do.
4417  auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4418  if (!Ptr)
4419  continue;
4420 
4421  // True if all users of Ptr are memory accesses that have Ptr as their
4422  // pointer operand.
4423  auto UsersAreMemAccesses =
4424  llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4425  return getLoadStorePointerOperand(U) == Ptr;
4426  });
4427 
4428  // Ensure the memory instruction will not be scalarized or used by
4429  // gather/scatter, making its pointer operand non-uniform. If the pointer
4430  // operand is used by any instruction other than a memory access, we
4431  // conservatively assume the pointer operand may be non-uniform.
4432  if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4433  PossibleNonUniformPtrs.insert(Ptr);
4434 
4435  // If the memory instruction will be vectorized and its pointer operand
4436  // is consecutive-like, or interleaving - the pointer operand should
4437  // remain uniform.
4438  else
4439  ConsecutiveLikePtrs.insert(Ptr);
4440  }
4441 
4442  // Add to the Worklist all consecutive and consecutive-like pointers that
4443  // aren't also identified as possibly non-uniform.
4444  for (auto *V : ConsecutiveLikePtrs)
4445  if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4446  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4447  Worklist.insert(V);
4448  }
4449 
4450  // Expand Worklist in topological order: whenever a new instruction
4451  // is added, its users should already be inside Worklist. This ensures
4452  // that a uniform instruction will only be used by uniform instructions.
4453  unsigned idx = 0;
4454  while (idx != Worklist.size()) {
4455  Instruction *I = Worklist[idx++];
4456 
4457  for (auto OV : I->operand_values()) {
4458  // isOutOfScope operands cannot be uniform instructions.
4459  if (isOutOfScope(OV))
4460  continue;
4461  // First order recurrence Phi's should typically be considered
4462  // non-uniform.
4463  auto *OP = dyn_cast<PHINode>(OV);
4464  if (OP && Legal->isFirstOrderRecurrence(OP))
4465  continue;
4466  // If all the users of the operand are uniform, then add the
4467  // operand into the uniform worklist.
4468  auto *OI = cast<Instruction>(OV);
4469  if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4470  auto *J = cast<Instruction>(U);
4471  return Worklist.count(J) ||
4472  (OI == getLoadStorePointerOperand(J) &&
4473  isUniformDecision(J, VF));
4474  })) {
4475  Worklist.insert(OI);
4476  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4477  }
4478  }
4479  }
4480 
4481  // Returns true if Ptr is the pointer operand of a memory access instruction
4482  // I, and I is known to not require scalarization.
4483  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4484  return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4485  };
4486 
4487  // For an instruction to be added into Worklist above, all its users inside
4488  // the loop should also be in Worklist. However, this condition cannot be
4489  // true for phi nodes that form a cyclic dependence. We must process phi
4490  // nodes separately. An induction variable will remain uniform if all users
4491  // of the induction variable and induction variable update remain uniform.
4492  // The code below handles both pointer and non-pointer induction variables.
4493  for (auto &Induction : *Legal->getInductionVars()) {
4494  auto *Ind = Induction.first;
4495  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4496 
4497  // Determine if all users of the induction variable are uniform after
4498  // vectorization.
4499  auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4500  auto *I = cast<Instruction>(U);
4501  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4502  isVectorizedMemAccessUse(I, Ind);
4503  });
4504  if (!UniformInd)
4505  continue;
4506 
4507  // Determine if all users of the induction variable update instruction are
4508  // uniform after vectorization.
4509  auto UniformIndUpdate =
4510  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4511  auto *I = cast<Instruction>(U);
4512  return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4513  isVectorizedMemAccessUse(I, IndUpdate);
4514  });
4515  if (!UniformIndUpdate)
4516  continue;
4517 
4518  // The induction variable and its update instruction will remain uniform.
4519  Worklist.insert(Ind);
4520  Worklist.insert(IndUpdate);
4521  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4522  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4523  << "\n");
4524  }
4525 
4526  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4527 }
4528 
4529 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4530  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4531  // TODO: It may be useful to do this, since the check is still likely to be
4532  // dynamically uniform if the target can skip it.
4533  LLVM_DEBUG(
4534  dbgs() << "LV: Not inserting runtime ptr check for divergent target");
4535 
4536  ORE->emit(
4537  createMissedAnalysis("CantVersionLoopWithDivergentTarget")
4538  << "runtime pointer checks needed. Not enabled for divergent target");
4539 
4540  return None;
4541  }
4542 
4543  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4544  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4545  return computeFeasibleMaxVF(OptForSize, TC);
4546 
4547  if (Legal->getRuntimePointerChecking()->Need) {
4548  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4549  << "runtime pointer checks needed. Enable vectorization of this "
4550  "loop with '#pragma clang loop vectorize(enable)' when "
4551  "compiling with -Os/-Oz");
4552  LLVM_DEBUG(
4553  dbgs()
4554  << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
4555  return None;
4556  }
4557 
4558  // If we optimize the program for size, avoid creating the tail loop.
4559  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4560 
4561  if (TC == 1) {
4562  ORE->emit(createMissedAnalysis("SingleIterationLoop")
4563  << "loop trip count is one, irrelevant for vectorization");
4564  LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
4565  return None;
4566  }
4567 
4568  // If we don't know the precise trip count, don't try to vectorize.
4569  if (TC == 0) {
4570  ORE->emit(
4571  createMissedAnalysis("UnknownLoopCountComplexCFG")
4572  << "unable to calculate the loop count due to complex control flow");
4573  LLVM_DEBUG(
4574  dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
4575  return None;
4576  }
4577 
4578  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4579 
4580  if (TC % MaxVF != 0) {
4581  // If the trip count that we found modulo the vectorization factor is not
4582  // zero then we require a tail.
4583  // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
4584  // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
4585  // smaller MaxVF that does not require a scalar epilog.
4586 
4587  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
4588  << "cannot optimize for size and vectorize at the "
4589  "same time. Enable vectorization of this loop "
4590  "with '#pragma clang loop vectorize(enable)' "
4591  "when compiling with -Os/-Oz");
4592  LLVM_DEBUG(
4593  dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
4594  return None;
4595  }
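// Illustrative sketch (not part of the original source): with a constant trip
// count of 17 and MaxVF = 4, 17 % 4 == 1, so a scalar tail would be required
// and vectorization is abandoned when optimizing for size.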
4596 
4597  return MaxVF;
4598 }
4599 
4600 unsigned
4601 LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4602  unsigned ConstTripCount) {
4603  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4604  unsigned SmallestType, WidestType;
4605  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4606  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4607 
4608  // Get the maximum safe dependence distance in bits computed by LAA.
4609  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
4610  // the memory access that is most restrictive (involved in the smallest
4611  // dependence distance).
4612  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4613 
4614  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4615 
4616  unsigned MaxVectorSize = WidestRegister / WidestType;
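// Illustrative sketch (not part of the original source): with 256-bit vector
// registers, a widest loop type of 32 bits, and a MaxSafeRegisterWidth of 128
// bits reported by LAA, WidestRegister becomes min(256, 128) = 128 and
// MaxVectorSize becomes 128 / 32 = 4 lanes.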
4617 
4618  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4619  << " / " << WidestType << " bits.\n");
4620  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4621  << WidestRegister << " bits.\n");
4622 
4623  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4624  " into one vector!");
4625  if (MaxVectorSize == 0) {
4626  LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4627  MaxVectorSize = 1;
4628  return MaxVectorSize;
4629  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4630  isPowerOf2_32(ConstTripCount)) {
4631  // We need to clamp the VF to be the ConstTripCount. There is no point in
4632  // choosing a higher viable VF as done in the loop below.
4633  LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4634  << ConstTripCount << "\n");
4635  MaxVectorSize = ConstTripCount;
4636  return MaxVectorSize;
4637  }
4638 
4639  unsigned MaxVF = MaxVectorSize;
4640  if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
4641  (MaximizeBandwidth && !OptForSize)) {
4642  // Collect all viable vectorization factors larger than the default MaxVF
4643  // (i.e. MaxVectorSize).
4645  unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4646  for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4647  VFs.push_back(VS);
4648 
4649  // For each VF calculate its register usage.
4650  auto RUs = calculateRegisterUsage(VFs);
4651 
4652  // Select the largest VF which doesn't require more registers than existing
4653  // ones.
4654  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4655  for (int i = RUs.size() - 1; i >= 0; --i) {
4656  if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4657  MaxVF = VFs[i];
4658  break;
4659  }
4660  }
4661  if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4662  if (MaxVF < MinVF) {
4663  LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4664  << ") with target's minimum: " << MinVF << '\n');
4665  MaxVF = MinVF;
4666  }
4667  }
4668  }
4669  return MaxVF;
4670 }
4671 
4672 VectorizationFactor
4673 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4674  float Cost = expectedCost(1).first;
4675  const float ScalarCost = Cost;
4676  unsigned Width = 1;
4677  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4678 
4679  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4680  if (ForceVectorization && MaxVF > 1) {
4681  // Ignore scalar width, because the user explicitly wants vectorization.
4682  // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4683  // evaluation.
4684  Cost = std::numeric_limits<float>::max();
4685  }
4686 
4687  for (unsigned i = 2; i <= MaxVF; i *= 2) {
4688  // Notice that the vector loop needs to be executed fewer times, so
4689  // we need to divide the cost of the vector loop by the width of
4690  // the vector elements.
4691  VectorizationCostTy C = expectedCost(i);
4692  float VectorCost = C.first / (float)i;
4693  LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4694  << " costs: " << (int)VectorCost << ".\n");
4695  if (!C.second && !ForceVectorization) {
4696  LLVM_DEBUG(
4697  dbgs() << "LV: Not considering vector loop of width " << i
4698  << " because it will not generate any vector instructions.\n");
4699  continue;
4700  }
4701  if (VectorCost < Cost) {
4702  Cost = VectorCost;
4703  Width = i;
4704  }
4705  }
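// Illustrative sketch (not part of the original source): if the scalar loop
// costs 8 per iteration and the VF = 4 loop costs 20 per vector iteration,
// the per-lane cost is 20 / 4 = 5 < 8, so Width becomes 4 unless an even
// wider factor is cheaper still.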
4706 
4707  if (!EnableCondStoresVectorization && NumPredStores) {
4708  ORE->emit(createMissedAnalysis("ConditionalStore")
4709  << "store that is conditionally executed prevents vectorization");
4710  LLVM_DEBUG(
4711  dbgs() << "LV: No vectorization. There are conditional stores.\n");
4712  Width = 1;
4713  Cost = ScalarCost;
4714  }
4715 
4716  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4717  << "LV: Vectorization seems to be not beneficial, "
4718  << "but was forced by a user.\n");
4719  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
4720  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4721  return Factor;
4722 }
4723 
4724 std::pair<unsigned, unsigned>
4725 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4726  unsigned MinWidth = -1U;
4727  unsigned MaxWidth = 8;
4728  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4729 
4730  // For each block.
4731  for (BasicBlock *BB : TheLoop->blocks()) {
4732  // For each instruction in the loop.
4733  for (Instruction &I : BB->instructionsWithoutDebug()) {
4734  Type *T = I.getType();
4735 
4736  // Skip ignored values.
4737  if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
4738  continue;
4739 
4740  // Only examine Loads, Stores and PHINodes.
4741  if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4742  continue;
4743 
4744  // Examine PHI nodes that are reduction variables. Update the type to
4745  // account for the recurrence type.
4746  if (auto *PN = dyn_cast<PHINode>(&I)) {
4747  if (!Legal->isReductionVariable(PN))
4748  continue;
4749  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
4750  T = RdxDesc.getRecurrenceType();
4751  }
4752 
4753  // Examine the stored values.
4754  if (auto *ST = dyn_cast<StoreInst>(&I))
4755  T = ST->getValueOperand()->getType();
4756 
4757  // Ignore loaded pointer types and stored pointer types that are not
4758  // vectorizable.
4759  //
4760  // FIXME: The check here attempts to predict whether a load or store will
4761  // be vectorized. We only know this for certain after a VF has
4762  // been selected. Here, we assume that if an access can be
4763  // vectorized, it will be. We should also look at extending this
4764  // optimization to non-pointer types.
4765  //
4766  if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
4767  !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
4768  continue;
4769 
4770  MinWidth = std::min(MinWidth,
4771  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4772  MaxWidth = std::max(MaxWidth,
4773  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4774  }
4775  }
4776 
4777  return {MinWidth, MaxWidth};
4778 }
4779 
4780 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4781  unsigned VF,
4782  unsigned LoopCost) {
4783  // -- The interleave heuristics --
4784  // We interleave the loop in order to expose ILP and reduce the loop overhead.
4785  // There are many micro-architectural considerations that we can't predict
4786  // at this level. For example, frontend pressure (on decode or fetch) due to
4787  // code size, or the number and capabilities of the execution ports.
4788  //
4789  // We use the following heuristics to select the interleave count:
4790  // 1. If the code has reductions, then we interleave to break the cross
4791  // iteration dependency.
4792  // 2. If the loop is really small, then we interleave to reduce the loop
4793  // overhead.
4794  // 3. We don't interleave if we think that we will spill registers to memory
4795  // due to the increased register pressure.
4796 
4797  // When we optimize for size, we don't interleave.
4798  if (OptForSize)
4799  return 1;
4800 
4801  // We used the distance for the interleave count.
4802  if (Legal->getMaxSafeDepDistBytes() != -1U)
4803  return 1;
4804 
4805  // Do not interleave loops with a relatively small trip count.
4806  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4807  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
4808  return 1;
4809 
4810  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
4811  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4812  << " registers\n");
4813 
4814  if (VF == 1) {
4815  if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4816  TargetNumRegisters = ForceTargetNumScalarRegs;
4817  } else {
4818  if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4819  TargetNumRegisters = ForceTargetNumVectorRegs;
4820  }
4821 
4822  RegisterUsage R = calculateRegisterUsage({VF})[0];
4823  // We divide by these constants so assume that we have at least one
4824  // instruction that uses at least one register.
4825  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
4826 
4827  // We calculate the interleave count using the following formula.
4828  // Subtract the number of loop invariants from the number of available
4829  // registers. These registers are used by all of the interleaved instances.
4830  // Next, divide the remaining registers by the number of registers that is
4831  // required by the loop, in order to estimate how many parallel instances
4832  // fit without causing spills. All of this is rounded down if necessary to be
4833  // a power of two. We want power of two interleave count to simplify any
4834  // addressing operations or alignment considerations.
4835  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
4836  R.MaxLocalUsers);
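// Illustrative sketch (not part of the original source): with 16 available
// registers, 2 loop-invariant values, and at most 5 values live at once,
// IC = PowerOf2Floor((16 - 2) / 5) = PowerOf2Floor(2) = 2 interleaved
// instances.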
4837 
4838  // Don't count the induction variable as interleaved.
4839  if (EnableIndVarRegisterHeur)
4840  IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
4841  std::max(1U, (R.MaxLocalUsers - 1)));
4842 
4843  // Clamp the interleave ranges to reasonable counts.
4844  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4845 
4846  // Check if the user has overridden the max.
4847  if (VF == 1) {
4848  if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4849  MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4850  } else {
4851  if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4852  MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4853  }
4854 
4855  // If we did not calculate the cost for VF (because the user selected the VF)
4856  // then we calculate the cost of VF here.
4857  if (LoopCost == 0)
4858  LoopCost = expectedCost(VF).first;
4859 
4860  // Clamp the calculated IC to be between 1 and the max interleave count
4861  // that the target allows.
4862  if (IC > MaxInterleaveCount)
4863  IC = MaxInterleaveCount;
4864  else if (IC < 1)
4865  IC = 1;
4866 
4867  // Interleave if we vectorized this loop and there is a reduction that could
4868  // benefit from interleaving.
4869  if (VF > 1 && !Legal->getReductionVars()->empty()) {
4870  LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
4871  return IC;
4872  }
4873 
4874  // Note that if we've already vectorized the loop we will have done the
4875  // runtime check and so interleaving won't require further checks.
4876  bool InterleavingRequiresRuntimePointerCheck =
4877  (VF == 1 && Legal->getRuntimePointerChecking()->Need);
4878 
4879  // We want to interleave small loops in order to reduce the loop overhead and
4880  // potentially expose ILP opportunities.
4881  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
4882  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
4883  // We assume that the cost overhead is 1 and we use the cost model
4884  // to estimate the cost of the loop and interleave until the cost of the
4885  // loop overhead is about 5% of the cost of the loop.
4886  unsigned SmallIC =
4887  std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
4888 
4889  // Interleave until store/load ports (estimated by max interleave count) are
4890  // saturated.
4891  unsigned NumStores = Legal->getNumStores();
4892  unsigned NumLoads = Legal->getNumLoads();
4893  unsigned StoresIC = IC / (NumStores ? NumStores : 1);
4894  unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
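// Illustrative sketch (not part of the original source), assuming the default
// SmallLoopCost of 20: a loop of cost 5 gives
// SmallIC = min(IC, PowerOf2Floor(20 / 5)) = min(IC, 4); with IC = 8, two
// stores and one load, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8, so the
// port-saturation check below would return max(4, 8) = 8.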
4895 
4896  // If we have a scalar reduction (vector reductions are already dealt with
4897  // by this point), we can increase the critical path length if the loop
4898  // we're interleaving is inside another loop. Limit it, by default, to 2, so the
4899  // critical path only gets increased by one reduction operation.
4900  if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
4901  unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
4902  SmallIC = std::min(SmallIC, F);
4903  StoresIC = std::min(StoresIC, F);
4904  LoadsIC = std::min(LoadsIC, F);
4905  }
4906 
4907  if (EnableLoadStoreRuntimeInterleave &&
4908  std::max(StoresIC, LoadsIC) > SmallIC) {
4909  LLVM_DEBUG(
4910  dbgs() << "LV: Interleaving to saturate store or load ports.\n");
4911  return std::max(StoresIC, LoadsIC);
4912  }
4913 
4914  LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
4915  return SmallIC;
4916  }
4917 
4918  // Interleave if this is a large loop (small loops are already dealt with by
4919  // this point) that could benefit from interleaving.
4920  bool HasReductions = !Legal->getReductionVars()->empty();
4921  if (TTI.enableAggressiveInterleaving(HasReductions)) {
4922  LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4923  return IC;
4924  }
4925 
4926  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
4927  return 1;
4928 }
4929 
4930 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
4931 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
4932  // This function calculates the register usage by measuring the highest number
4933  // of values that are alive at a single location. Obviously, this is a very
4934  // rough estimation. We scan the loop in topological order and
4935  // assign a number to each instruction. We use RPO to ensure that defs are
4936  // met before their users. We assume that each instruction that has in-loop
4937  // users starts an interval. We record every time that an in-loop value is
4938  // used, so we have a list of the first and last occurrences of each
4939  // instruction. Next, we transpose this data structure into a multi map that
4940  // holds the list of intervals that *end* at a specific location. This multi
4941  // map allows us to perform a linear search. We scan the instructions linearly
4942  // and record each time that a new interval starts, by placing it in a set.
4943  // If we find this value in the multi-map then we remove it from the set.
4944  // The max register usage is the maximum size of the set.
4945  // We also search for instructions that are defined outside the loop, but are
4946  // used inside the loop. We need this number separately from the max-interval
4947  // usage number because when we unroll, loop-invariant values do not take
4948  // more registers.
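// Illustrative sketch (not part of the original source): for a body such as
//   t0 = load a[i]; t1 = t0 + x; t2 = t1 * t1; store t2 to b[i]
// with x defined outside the loop, at most two of t0/t1/t2 are live at any
// point, so MaxLocalUsers would be 2, while x is counted once in
// LoopInvariantRegs.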
4949  LoopBlocksDFS DFS(TheLoop);
4950  DFS.perform(LI);
4951 
4952  RegisterUsage RU;
4953 
4954  // Each 'key' in the map opens a new interval. The values
4955  // of the map are the index of the 'last seen' usage of the
4956  // instruction that is the key.
4957  using IntervalMap = DenseMap<Instruction *, unsigned>;
4958 
4959  // Maps instruction to its index.
4960  SmallVector<Instruction *, 64> IdxToInstr;