SLPVectorizer.cpp
1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
11 // construct vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
13 //
14 // The pass is inspired by the work described in the paper:
15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
17 //===----------------------------------------------------------------------===//
18 
20 #include "llvm/ADT/DenseMap.h"
21 #include "llvm/ADT/DenseSet.h"
22 #include "llvm/ADT/Optional.h"
24 #include "llvm/ADT/STLExtras.h"
25 #include "llvm/ADT/SetVector.h"
27 #include "llvm/ADT/SmallPtrSet.h"
28 #include "llvm/ADT/SmallSet.h"
29 #include "llvm/ADT/SmallString.h"
30 #include "llvm/ADT/Statistic.h"
31 #include "llvm/ADT/iterator.h"
40 #include "llvm/Analysis/LoopInfo.h"
49 #include "llvm/IR/Attributes.h"
50 #include "llvm/IR/BasicBlock.h"
51 #include "llvm/IR/Constant.h"
52 #include "llvm/IR/Constants.h"
53 #include "llvm/IR/DataLayout.h"
54 #include "llvm/IR/DebugLoc.h"
55 #include "llvm/IR/DerivedTypes.h"
56 #include "llvm/IR/Dominators.h"
57 #include "llvm/IR/Function.h"
58 #include "llvm/IR/IRBuilder.h"
59 #include "llvm/IR/InstrTypes.h"
60 #include "llvm/IR/Instruction.h"
61 #include "llvm/IR/Instructions.h"
62 #include "llvm/IR/IntrinsicInst.h"
63 #include "llvm/IR/Intrinsics.h"
64 #include "llvm/IR/Module.h"
65 #include "llvm/IR/NoFolder.h"
66 #include "llvm/IR/Operator.h"
67 #include "llvm/IR/PatternMatch.h"
68 #include "llvm/IR/Type.h"
69 #include "llvm/IR/Use.h"
70 #include "llvm/IR/User.h"
71 #include "llvm/IR/Value.h"
72 #include "llvm/IR/ValueHandle.h"
73 #include "llvm/IR/Verifier.h"
74 #include "llvm/InitializePasses.h"
75 #include "llvm/Pass.h"
76 #include "llvm/Support/Casting.h"
77 #include "llvm/Support/CommandLine.h"
78 #include "llvm/Support/Compiler.h"
80 #include "llvm/Support/Debug.h"
84 #include "llvm/Support/KnownBits.h"
90 #include <algorithm>
91 #include <cassert>
92 #include <cstdint>
93 #include <iterator>
94 #include <memory>
95 #include <set>
96 #include <string>
97 #include <tuple>
98 #include <utility>
99 #include <vector>
100 
101 using namespace llvm;
102 using namespace llvm::PatternMatch;
103 using namespace slpvectorizer;
104 
105 #define SV_NAME "slp-vectorizer"
106 #define DEBUG_TYPE "SLP"
107 
108 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
109 
110 cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
111  cl::desc("Run the SLP vectorization passes"));
112 
113 static cl::opt<int>
114  SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
115  cl::desc("Only vectorize if you gain more than this "
116  "number "));
117 
118 static cl::opt<bool>
119 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
120  cl::desc("Attempt to vectorize horizontal reductions"));
121 
123  "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
124  cl::desc(
125  "Attempt to vectorize horizontal reductions feeding into a store"));
126 
127 static cl::opt<int>
128 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
129  cl::desc("Attempt to vectorize for this register size in bits"));
130 
131 static cl::opt<unsigned>
132 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
133  cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
134 
135 static cl::opt<int>
136 MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
137  cl::desc("Maximum depth of the lookup for consecutive stores."));
138 
139 /// Limits the size of scheduling regions in a block.
140 /// It avoids long compile times for _very_ large blocks where vector
141 /// instructions are spread over a wide range.
142 /// This limit is way higher than needed by real-world functions.
143 static cl::opt<int>
144 ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
145  cl::desc("Limit the size of the SLP scheduling region per block"));
146 
148  "slp-min-reg-size", cl::init(128), cl::Hidden,
149  cl::desc("Attempt to vectorize for this register size in bits"));
150 
152  "slp-recursion-max-depth", cl::init(12), cl::Hidden,
153  cl::desc("Limit the recursion depth when building a vectorizable tree"));
154 
156  "slp-min-tree-size", cl::init(3), cl::Hidden,
157  cl::desc("Only vectorize small trees if they are fully vectorizable"));
158 
159 // The maximum depth that the look-ahead score heuristic will explore.
160 // The higher this value, the higher the compilation time overhead.
162  "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
163  cl::desc("The maximum look-ahead depth for operand reordering scores"));
164 
165 // The Look-ahead heuristic goes through the users of the bundle to calculate
166 // the users cost in getExternalUsesCost(). To avoid compilation time increase
167 // we limit the number of users visited to this value.
169  "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
170  cl::desc("The maximum number of users to visit while visiting the "
171  "predecessors. This prevents compilation time increase."));
172 
173 static cl::opt<bool>
174  ViewSLPTree("view-slp-tree", cl::Hidden,
175  cl::desc("Display the SLP trees with Graphviz"));
176 
177 // Limit the number of alias checks. The limit is chosen so that
178 // it has no negative effect on the llvm benchmarks.
179 static const unsigned AliasedCheckLimit = 10;
180 
181 // Another limit for the alias checks: The maximum distance between load/store
182 // instructions where alias checks are done.
183 // This limit is useful for very large basic blocks.
184 static const unsigned MaxMemDepDistance = 160;
185 
186 /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
187 /// regions to be handled.
188 static const int MinScheduleRegionSize = 16;
189 
190 /// Predicate for the element types that the SLP vectorizer supports.
191 ///
192 /// The most important thing to filter here are types which are invalid in LLVM
193 /// vectors. We also filter target specific types which have absolutely no
194 /// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
195 /// avoids spending time checking the cost model and realizing that they will
196 /// be inevitably scalarized.
197 static bool isValidElementType(Type *Ty) {
198  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
199  !Ty->isPPC_FP128Ty();
200 }
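// For illustration, a couple of calls (assuming an LLVMContext &Ctx is in
// scope):
//   isValidElementType(Type::getInt32Ty(Ctx));    // true - vectorizable
//   isValidElementType(Type::getFloatTy(Ctx));    // true
//   isValidElementType(Type::getX86_FP80Ty(Ctx)); // false - filtered out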
201 
202 /// \returns true if all of the instructions in \p VL are in the same block or
203 /// false otherwise.
204 static bool allSameBlock(ArrayRef<Value *> VL) {
205  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
206  if (!I0)
207  return false;
208  BasicBlock *BB = I0->getParent();
209  for (int I = 1, E = VL.size(); I < E; I++) {
210  auto *II = dyn_cast<Instruction>(VL[I]);
211  if (!II)
212  return false;
213 
214  if (BB != II->getParent())
215  return false;
216  }
217  return true;
218 }
219 
220 /// \returns True if all of the values in \p VL are constants (but not
221 /// globals/constant expressions).
222 static bool allConstant(ArrayRef<Value *> VL) {
223  // Constant expressions and globals can't be vectorized like normal integer/FP
224  // constants.
225  for (Value *i : VL)
226  if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
227  return false;
228  return true;
229 }
230 
231 /// \returns True if all of the values in \p VL are identical.
232 static bool isSplat(ArrayRef<Value *> VL) {
233  for (unsigned i = 1, e = VL.size(); i < e; ++i)
234  if (VL[i] != VL[0])
235  return false;
236  return true;
237 }
238 
239 /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
240 static bool isCommutative(Instruction *I) {
241  if (auto *Cmp = dyn_cast<CmpInst>(I))
242  return Cmp->isCommutative();
243  if (auto *BO = dyn_cast<BinaryOperator>(I))
244  return BO->isCommutative();
245  // TODO: This should check for generic Instruction::isCommutative(), but
246  // we need to confirm that the caller code correctly handles Intrinsics
247  // for example (does not have 2 operands).
248  return false;
249 }
250 
251 /// Checks if the vector of instructions can be represented as a shuffle, like:
252 /// %x0 = extractelement <4 x i8> %x, i32 0
253 /// %x3 = extractelement <4 x i8> %x, i32 3
254 /// %y1 = extractelement <4 x i8> %y, i32 1
255 /// %y2 = extractelement <4 x i8> %y, i32 2
256 /// %x0x0 = mul i8 %x0, %x0
257 /// %x3x3 = mul i8 %x3, %x3
258 /// %y1y1 = mul i8 %y1, %y1
259 /// %y2y2 = mul i8 %y2, %y2
260 /// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
261 /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
262 /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
263 /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
264 /// ret <4 x i8> %ins4
265 /// can be transformed into:
266 /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
267 /// i32 6>
268 /// %2 = mul <4 x i8> %1, %1
269 /// ret <4 x i8> %2
270 /// We convert this initially to something like:
271 /// %x0 = extractelement <4 x i8> %x, i32 0
272 /// %x3 = extractelement <4 x i8> %x, i32 3
273 /// %y1 = extractelement <4 x i8> %y, i32 1
274 /// %y2 = extractelement <4 x i8> %y, i32 2
275 /// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
276 /// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
277 /// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
278 /// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
279 /// %5 = mul <4 x i8> %4, %4
280 /// %6 = extractelement <4 x i8> %5, i32 0
281 /// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
282 /// %7 = extractelement <4 x i8> %5, i32 1
283 /// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
284 /// %8 = extractelement <4 x i8> %5, i32 2
285 /// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
286 /// %9 = extractelement <4 x i8> %5, i32 3
287 /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
288 /// ret <4 x i8> %ins4
289 /// InstCombiner transforms this into a shuffle and vector mul
290 /// Mask will return the Shuffle Mask equivalent to the extracted elements.
291 /// TODO: Can we split off and reuse the shuffle mask detection from
292 /// TargetTransformInfo::getInstructionThroughput?
293 static Optional<TargetTransformInfo::ShuffleKind>
294 isShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
295  auto *EI0 = cast<ExtractElementInst>(VL[0]);
296  unsigned Size =
297  cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
298  Value *Vec1 = nullptr;
299  Value *Vec2 = nullptr;
300  enum ShuffleMode { Unknown, Select, Permute };
301  ShuffleMode CommonShuffleMode = Unknown;
302  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
303  auto *EI = cast<ExtractElementInst>(VL[I]);
304  auto *Vec = EI->getVectorOperand();
305  // All vector operands must have the same number of vector elements.
306  if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
307  return None;
308  auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
309  if (!Idx)
310  return None;
311  // Undefined behavior if Idx is negative or >= Size.
312  if (Idx->getValue().uge(Size)) {
313  Mask.push_back(UndefMaskElem);
314  continue;
315  }
316  unsigned IntIdx = Idx->getValue().getZExtValue();
317  Mask.push_back(IntIdx);
318  // We can extractelement from undef or poison vector.
319  if (isa<UndefValue>(Vec))
320  continue;
321  // For correct shuffling we have to have at most 2 different vector operands
322  // in all extractelement instructions.
323  if (!Vec1 || Vec1 == Vec)
324  Vec1 = Vec;
325  else if (!Vec2 || Vec2 == Vec)
326  Vec2 = Vec;
327  else
328  return None;
329  if (CommonShuffleMode == Permute)
330  continue;
331  // If the extract index is not the same as the operation number, it is a
332  // permutation.
333  if (IntIdx != I) {
334  CommonShuffleMode = Permute;
335  continue;
336  }
337  CommonShuffleMode = Select;
338  }
339  // If we're not crossing lanes in different vectors, consider it as blending.
340  if (CommonShuffleMode == Select && Vec2)
341  return TargetTransformInfo::SK_Select;
342  // If Vec2 was never used, we have a permutation of a single vector, otherwise
343  // we have a permutation of 2 vectors.
344  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
345  : TargetTransformInfo::SK_PermuteSingleSrc;
346 }
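// A worked example of the mask computation above: for a bundle extracting
// %x[0], %x[3], %y[1], %y[2] (as in the doc comment), Mask becomes
// <0, 3, 1, 2>; lane 1 extracts index 3 rather than 1, so CommonShuffleMode
// ends up as Permute, and because two source vectors are involved the result
// is TargetTransformInfo::SK_PermuteTwoSrc.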
347 
348 namespace {
349 
350 /// Main data required for vectorization of instructions.
351 struct InstructionsState {
352  /// The very first instruction in the list with the main opcode.
353  Value *OpValue = nullptr;
354 
355  /// The main/alternate instruction.
356  Instruction *MainOp = nullptr;
357  Instruction *AltOp = nullptr;
358 
359  /// The main/alternate opcodes for the list of instructions.
360  unsigned getOpcode() const {
361  return MainOp ? MainOp->getOpcode() : 0;
362  }
363 
364  unsigned getAltOpcode() const {
365  return AltOp ? AltOp->getOpcode() : 0;
366  }
367 
368  /// Some of the instructions in the list have alternate opcodes.
369  bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
370 
371  bool isOpcodeOrAlt(Instruction *I) const {
372  unsigned CheckedOpcode = I->getOpcode();
373  return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
374  }
375 
376  InstructionsState() = delete;
377  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
378  : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
379 };
380 
381 } // end anonymous namespace
382 
383 /// Chooses the correct key for scheduling data. If \p Op has the same (or
384 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
385 /// OpValue.
386 static Value *isOneOf(const InstructionsState &S, Value *Op) {
387  auto *I = dyn_cast<Instruction>(Op);
388  if (I && S.isOpcodeOrAlt(I))
389  return Op;
390  return S.OpValue;
391 }
392 
393 /// \returns true if \p Opcode is allowed as part of the main/alternate
394 /// instruction for SLP vectorization.
395 ///
396 /// Example of unsupported opcode is SDIV that can potentially cause UB if the
397 /// "shuffled out" lane would result in division by zero.
398 static bool isValidForAlternation(unsigned Opcode) {
399  if (Instruction::isIntDivRem(Opcode))
400  return false;
401 
402  return true;
403 }
404 
405 /// \returns analysis of the Instructions in \p VL described in
406 /// InstructionsState, the Opcode that we suppose the whole list
407 /// could be vectorized even if its structure is diverse.
408 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
409  unsigned BaseIndex = 0) {
410  // Make sure these are all Instructions.
411  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
412  return InstructionsState(VL[BaseIndex], nullptr, nullptr);
413 
414  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
415  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
416  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
417  unsigned AltOpcode = Opcode;
418  unsigned AltIndex = BaseIndex;
419 
420  // Check for one alternate opcode from another BinaryOperator.
421  // TODO - generalize to support all operators (types, calls etc.).
422  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
423  unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
424  if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
425  if (InstOpcode == Opcode || InstOpcode == AltOpcode)
426  continue;
427  if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
428  isValidForAlternation(Opcode)) {
429  AltOpcode = InstOpcode;
430  AltIndex = Cnt;
431  continue;
432  }
433  } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
434  Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
435  Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
436  if (Ty0 == Ty1) {
437  if (InstOpcode == Opcode || InstOpcode == AltOpcode)
438  continue;
439  if (Opcode == AltOpcode) {
440  assert(isValidForAlternation(Opcode) &&
441  isValidForAlternation(InstOpcode) &&
442  "Cast isn't safe for alternation, logic needs to be updated!");
443  AltOpcode = InstOpcode;
444  AltIndex = Cnt;
445  continue;
446  }
447  }
448  } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
449  continue;
450  return InstructionsState(VL[BaseIndex], nullptr, nullptr);
451  }
452 
453  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
454  cast<Instruction>(VL[AltIndex]));
455 }
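// For example, for VL = { add, sub, add, sub } (all i32 BinaryOperators) the
// returned state has MainOp set to the first add and AltOp to the first sub,
// so getOpcode() == Instruction::Add, getAltOpcode() == Instruction::Sub and
// isAltShuffle() is true. A bundle mixing add with sdiv fails instead
// (MainOp == nullptr), because isValidForAlternation() rejects integer
// divisions that could divide by zero in a shuffled-out lane.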
456 
457 /// \returns true if all of the values in \p VL have the same type or false
458 /// otherwise.
459 static bool allSameType(ArrayRef<Value *> VL) {
460  Type *Ty = VL[0]->getType();
461  for (int i = 1, e = VL.size(); i < e; i++)
462  if (VL[i]->getType() != Ty)
463  return false;
464 
465  return true;
466 }
467 
468 /// \returns True if Extract{Value,Element} instruction extracts element Idx.
469 static Optional<unsigned> getExtractIndex(Instruction *E) {
470  unsigned Opcode = E->getOpcode();
471  assert((Opcode == Instruction::ExtractElement ||
472  Opcode == Instruction::ExtractValue) &&
473  "Expected extractelement or extractvalue instruction.");
474  if (Opcode == Instruction::ExtractElement) {
475  auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
476  if (!CI)
477  return None;
478  return CI->getZExtValue();
479  }
480  ExtractValueInst *EI = cast<ExtractValueInst>(E);
481  if (EI->getNumIndices() != 1)
482  return None;
483  return *EI->idx_begin();
484 }
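// For example:
//   extractelement <4 x i32> %v, i32 2   --> returns 2
//   extractvalue {i32, i32} %agg, 1      --> returns 1
// A non-constant extractelement index or a multi-index extractvalue yields
// None.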
485 
486 /// \returns True if an in-tree use also needs an extract. This refers to a
487 /// possible scalar operand in a vectorized instruction.
488 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
489  TargetLibraryInfo *TLI) {
490  unsigned Opcode = UserInst->getOpcode();
491  switch (Opcode) {
492  case Instruction::Load: {
493  LoadInst *LI = cast<LoadInst>(UserInst);
494  return (LI->getPointerOperand() == Scalar);
495  }
496  case Instruction::Store: {
497  StoreInst *SI = cast<StoreInst>(UserInst);
498  return (SI->getPointerOperand() == Scalar);
499  }
500  case Instruction::Call: {
501  CallInst *CI = cast<CallInst>(UserInst);
502  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
503  for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
504  if (hasVectorInstrinsicScalarOpd(ID, i))
505  return (CI->getArgOperand(i) == Scalar);
506  }
507  return false;
508  }
509  default:
510  return false;
511  }
512 }
513 
514 /// \returns the AA location that is being accessed by the instruction.
515 static MemoryLocation getLocation(Instruction *I, AAResults *AA) {
516  if (StoreInst *SI = dyn_cast<StoreInst>(I))
517  return MemoryLocation::get(SI);
518  if (LoadInst *LI = dyn_cast<LoadInst>(I))
519  return MemoryLocation::get(LI);
520  return MemoryLocation();
521 }
522 
523 /// \returns True if the instruction is not a volatile or atomic load/store.
524 static bool isSimple(Instruction *I) {
525  if (LoadInst *LI = dyn_cast<LoadInst>(I))
526  return LI->isSimple();
527  if (StoreInst *SI = dyn_cast<StoreInst>(I))
528  return SI->isSimple();
529  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
530  return !MI->isVolatile();
531  return true;
532 }
533 
534 namespace llvm {
535 
536 static void inversePermutation(ArrayRef<unsigned> Indices,
537  SmallVectorImpl<int> &Mask) {
538  Mask.clear();
539  const unsigned E = Indices.size();
540  Mask.resize(E, E + 1);
541  for (unsigned I = 0; I < E; ++I)
542  Mask[Indices[I]] = I;
543 }
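// A worked example: for Indices = {2, 0, 1} the loop writes Mask[2] = 0,
// Mask[0] = 1 and Mask[1] = 2, giving Mask = {1, 2, 0}, i.e. the inverse
// permutation, which can then be used as a shufflevector mask to undo the
// reordering described by Indices.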
544 
545 namespace slpvectorizer {
546 
547 /// Bottom Up SLP Vectorizer.
548 class BoUpSLP {
549  struct TreeEntry;
550  struct ScheduleData;
551 
552 public:
553  using ValueList = SmallVector<Value *, 8>;
554  using InstrList = SmallVector<Instruction *, 16>;
555  using ValueSet = SmallPtrSet<Value *, 16>;
556  using StoreList = SmallVector<StoreInst *, 8>;
557  using ExtraValueToDebugLocsMap =
558  MapVector<Value *, SmallVector<Instruction *, 2>>;
559  using OrdersType = SmallVector<unsigned, 4>;
560 
561  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
562  TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
563  DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
564  const DataLayout *DL, OptimizationRemarkEmitter *ORE)
565  : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
566  DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
567  CodeMetrics::collectEphemeralValues(F, AC, EphValues);
568  // Use the vector register size specified by the target unless overridden
569  // by a command-line option.
570  // TODO: It would be better to limit the vectorization factor based on
571  // data type rather than just register size. For example, x86 AVX has
572  // 256-bit registers, but it does not support integer operations
573  // at that width (that requires AVX2).
574  if (MaxVectorRegSizeOption.getNumOccurrences())
575  MaxVecRegSize = MaxVectorRegSizeOption;
576  else
577  MaxVecRegSize =
578  TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
579  .getFixedSize();
580 
581  if (MinVectorRegSizeOption.getNumOccurrences())
582  MinVecRegSize = MinVectorRegSizeOption;
583  else
584  MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
585  }
586 
587  /// Vectorize the tree that starts with the elements in \p VL.
588  /// Returns the vectorized root.
589  Value *vectorizeTree();
590 
591  /// Vectorize the tree but with the list of externally used values \p
592  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
593  /// generated extractvalue instructions.
594  Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
595 
596  /// \returns the cost incurred by unwanted spills and fills, caused by
597  /// holding live values over call sites.
598  InstructionCost getSpillCost() const;
599 
600  /// \returns the vectorization cost of the subtree that starts at \p VL.
601  /// A negative number means that this is profitable.
602  InstructionCost getTreeCost();
603 
604  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
605  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
606  void buildTree(ArrayRef<Value *> Roots,
607  ArrayRef<Value *> UserIgnoreLst = None);
608 
609  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
610  /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
611  /// into account (and updating it, if required) list of externally used
612  /// values stored in \p ExternallyUsedValues.
613  void buildTree(ArrayRef<Value *> Roots,
614  ExtraValueToDebugLocsMap &ExternallyUsedValues,
615  ArrayRef<Value *> UserIgnoreLst = None);
616 
617  /// Clear the internal data structures that are created by 'buildTree'.
618  void deleteTree() {
619  VectorizableTree.clear();
620  ScalarToTreeEntry.clear();
621  MustGather.clear();
622  ExternalUses.clear();
623  NumOpsWantToKeepOrder.clear();
624  NumOpsWantToKeepOriginalOrder = 0;
625  for (auto &Iter : BlocksSchedules) {
626  BlockScheduling *BS = Iter.second.get();
627  BS->clear();
628  }
629  MinBWs.clear();
630  InstrElementSize.clear();
631  }
632 
633  unsigned getTreeSize() const { return VectorizableTree.size(); }
634 
635  /// Perform LICM and CSE on the newly generated gather sequences.
636  void optimizeGatherSequence();
637 
638  /// \returns The best order of instructions for vectorization.
639  Optional<ArrayRef<unsigned>> bestOrder() const {
640  assert(llvm::all_of(
641  NumOpsWantToKeepOrder,
642  [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) {
643  return D.getFirst().size() ==
644  VectorizableTree[0]->Scalars.size();
645  }) &&
646  "All orders must have the same size as number of instructions in "
647  "tree node.");
648  auto I = std::max_element(
649  NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
650  [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
651  const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
652  return D1.second < D2.second;
653  });
654  if (I == NumOpsWantToKeepOrder.end() ||
655  I->getSecond() <= NumOpsWantToKeepOriginalOrder)
656  return None;
657 
658  return makeArrayRef(I->getFirst());
659  }
660 
661  /// Builds the correct order for root instructions.
662  /// If some leaves have the same instructions to be vectorized, we may
663  /// incorrectly evaluate the best order for the root node (it is built for the
664  /// vector of instructions without repeated instructions and, thus, has fewer
665  /// elements than the root node). This function builds the correct order for
666  /// the root node.
667  /// For example, if the root node is <a+b, a+c, a+d, f+e>, then the leaves
668  /// are <a, a, a, f> and <b, c, d, e>. When we try to vectorize the first
669  /// leaf, it will be shrunk to <a, f>. If instructions in this leaf should
670  /// be reordered, the best order will be <1, 0>. We need to extend this
671  /// order for the root node. For the root node this order should look like
672  /// <3, 0, 1, 2>. This function extends the order for the reused
673  /// instructions.
674  void findRootOrder(OrdersType &Order) {
675  // If the leaf has the same number of instructions to vectorize as the root
676  // - order must be set already.
677  unsigned RootSize = VectorizableTree[0]->Scalars.size();
678  if (Order.size() == RootSize)
679  return;
680  SmallVector<unsigned, 4> RealOrder(Order.size());
681  std::swap(Order, RealOrder);
682  SmallVector<int, 4> Mask;
683  inversePermutation(RealOrder, Mask);
684  Order.assign(Mask.begin(), Mask.end());
685  // The leaf has fewer instructions than the root - we need to find the true
686  // order of the root.
687  // Scan the nodes starting from the leaf back to the root.
688  const TreeEntry *PNode = VectorizableTree.back().get();
689  SmallVector<const TreeEntry *, 4> Nodes(1, PNode);
690  SmallPtrSet<const TreeEntry *, 4> Visited;
691  while (!Nodes.empty() && Order.size() != RootSize) {
692  const TreeEntry *PNode = Nodes.pop_back_val();
693  if (!Visited.insert(PNode).second)
694  continue;
695  const TreeEntry &Node = *PNode;
696  for (const EdgeInfo &EI : Node.UserTreeIndices)
697  if (EI.UserTE)
698  Nodes.push_back(EI.UserTE);
699  if (Node.ReuseShuffleIndices.empty())
700  continue;
701  // Build the order for the parent node.
702  OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize);
703  SmallVector<unsigned, 4> OrderCounter(Order.size(), 0);
704  // The algorithm of the order extension is:
705  // 1. Calculate the number of the same instructions for the order.
706  // 2. Calculate the index of the new order: total number of instructions
707  // with order less than the order of the current instruction + reuse
708  // number of the current instruction.
709  // 3. The new order is just the index of the instruction in the original
710  // vector of the instructions.
711  for (unsigned I : Node.ReuseShuffleIndices)
712  ++OrderCounter[Order[I]];
713  SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0);
714  for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) {
715  unsigned ReusedIdx = Node.ReuseShuffleIndices[I];
716  unsigned OrderIdx = Order[ReusedIdx];
717  unsigned NewIdx = 0;
718  for (unsigned J = 0; J < OrderIdx; ++J)
719  NewIdx += OrderCounter[J];
720  NewIdx += CurrentCounter[OrderIdx];
721  ++CurrentCounter[OrderIdx];
722  assert(NewOrder[NewIdx] == RootSize &&
723  "The order index should not be written already.");
724  NewOrder[NewIdx] = I;
725  }
726  std::swap(Order, NewOrder);
727  }
728  assert(Order.size() == RootSize &&
729  "Root node is expected or the size of the order must be the same as "
730  "the number of elements in the root node.");
731  assert(llvm::all_of(Order,
732  [RootSize](unsigned Val) { return Val != RootSize; }) &&
733  "All indices must be initialized");
734  }
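// Tracing the doc-comment example: the shrunk leaf <a, f> would carry
// ReuseShuffleIndices = {0, 0, 0, 1} and leaf order <1, 0>. The counting step
// then yields OrderCounter = {1, 3}, the three lanes reusing 'a' are placed
// at positions 1, 2 and 3, the single 'f' lane at position 0, and the
// extended root order becomes <3, 0, 1, 2>, as stated above.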
735 
736  /// \return The vector element size in bits to use when vectorizing the
737  /// expression tree ending at \p V. If V is a store, the size is the width of
738  /// the stored value. Otherwise, the size is the width of the largest loaded
739  /// value reaching V. This method is used by the vectorizer to calculate
740  /// vectorization factors.
741  unsigned getVectorElementSize(Value *V);
742 
743  /// Compute the minimum type sizes required to represent the entries in a
744  /// vectorizable tree.
745  void computeMinimumValueSizes();
746 
747  // \returns maximum vector register size as set by TTI or overridden by cl::opt.
748  unsigned getMaxVecRegSize() const {
749  return MaxVecRegSize;
750  }
751 
752  // \returns minimum vector register size as set by cl::opt.
753  unsigned getMinVecRegSize() const {
754  return MinVecRegSize;
755  }
756 
757  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
758  unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
759  MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
760  return MaxVF ? MaxVF : UINT_MAX;
761  }
762 
763  /// Check if homogeneous aggregate is isomorphic to some VectorType.
764  /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
765  /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
766  /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
767  ///
768  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
769  unsigned canMapToVector(Type *T, const DataLayout &DL) const;
770 
771  /// \returns True if the VectorizableTree is both tiny and not fully
772  /// vectorizable. We do not vectorize such trees.
773  bool isTreeTinyAndNotFullyVectorizable() const;
774 
775  /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
776  /// can be load combined in the backend. Load combining may not be allowed in
777  /// the IR optimizer, so we do not want to alter the pattern. For example,
778  /// partially transforming a scalar bswap() pattern into vector code is
779  /// effectively impossible for the backend to undo.
780  /// TODO: If load combining is allowed in the IR optimizer, this analysis
781  /// may not be necessary.
782  bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
783 
784  /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
785  /// can be load combined in the backend. Load combining may not be allowed in
786  /// the IR optimizer, so we do not want to alter the pattern. For example,
787  /// partially transforming a scalar bswap() pattern into vector code is
788  /// effectively impossible for the backend to undo.
789  /// TODO: If load combining is allowed in the IR optimizer, this analysis
790  /// may not be necessary.
791  bool isLoadCombineCandidate() const;
792 
794 
795  /// This structure holds any data we need about the edges being traversed
796  /// during buildTree_rec(). We keep track of:
797  /// (i) the user TreeEntry index, and
798  /// (ii) the index of the edge.
799  struct EdgeInfo {
800  EdgeInfo() = default;
801  EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
802  : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
803  /// The user TreeEntry.
804  TreeEntry *UserTE = nullptr;
805  /// The operand index of the use.
806  unsigned EdgeIdx = UINT_MAX;
807 #ifndef NDEBUG
808  friend inline raw_ostream &operator<<(raw_ostream &OS,
809  const BoUpSLP::EdgeInfo &EI) {
810  EI.dump(OS);
811  return OS;
812  }
813  /// Debug print.
814  void dump(raw_ostream &OS) const {
815  OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
816  << " EdgeIdx:" << EdgeIdx << "}";
817  }
818  LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
819 #endif
820  };
821 
822  /// A helper data structure to hold the operands of a vector of instructions.
823  /// This supports a fixed vector length for all operand vectors.
824  class VLOperands {
825  /// For each operand we need (i) the value, and (ii) the opcode that it
826  /// would be attached to if the expression was in a left-linearized form.
827  /// This is required to avoid illegal operand reordering.
828  /// For example:
829  /// \verbatim
830  /// 0 Op1
831  /// |/
832  /// Op1 Op2 Linearized + Op2
833  /// \ / ----------> |/
834  /// - -
835  ///
836  /// Op1 - Op2 (0 + Op1) - Op2
837  /// \endverbatim
838  ///
839  /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
840  ///
841  /// Another way to think of this is to track all the operations across the
842  /// path from the operand all the way to the root of the tree and to
843  /// calculate the operation that corresponds to this path. For example, the
844  /// path from Op2 to the root crosses the RHS of the '-', therefore the
845  /// corresponding operation is a '-' (which matches the one in the
846  /// linearized tree, as shown above).
847  ///
848  /// For lack of a better term, we refer to this operation as Accumulated
849  /// Path Operation (APO).
850  struct OperandData {
851  OperandData() = default;
852  OperandData(Value *V, bool APO, bool IsUsed)
853  : V(V), APO(APO), IsUsed(IsUsed) {}
854  /// The operand value.
855  Value *V = nullptr;
856  /// TreeEntries only allow a single opcode, or an alternate sequence of
857  /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
858  /// APO. It is set to 'true' if 'V' is attached to an inverse operation
859  /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
860  /// (e.g., Add/Mul)
861  bool APO = false;
862  /// Helper data for the reordering function.
863  bool IsUsed = false;
864  };
865 
866  /// During operand reordering, we are trying to select the operand at lane
867  /// that matches best with the operand at the neighboring lane. Our
868  /// selection is based on the type of value we are looking for. For example,
869  /// if the neighboring lane has a load, we need to look for a load that is
870  /// accessing a consecutive address. These strategies are summarized in the
871  /// 'ReorderingMode' enumerator.
872  enum class ReorderingMode {
873  Load, ///< Matching loads to consecutive memory addresses
874  Opcode, ///< Matching instructions based on opcode (same or alternate)
875  Constant, ///< Matching constants
876  Splat, ///< Matching the same instruction multiple times (broadcast)
877  Failed, ///< We failed to create a vectorizable group
878  };
879 
880  using OperandDataVec = SmallVector<OperandData, 2>;
881 
882  /// A vector of operand vectors.
883  SmallVector<OperandDataVec, 4> OpsVec;
884 
885  const DataLayout &DL;
886  ScalarEvolution &SE;
887  const BoUpSLP &R;
888 
889  /// \returns the operand data at \p OpIdx and \p Lane.
890  OperandData &getData(unsigned OpIdx, unsigned Lane) {
891  return OpsVec[OpIdx][Lane];
892  }
893 
894  /// \returns the operand data at \p OpIdx and \p Lane. Const version.
895  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
896  return OpsVec[OpIdx][Lane];
897  }
898 
899  /// Clears the used flag for all entries.
900  void clearUsed() {
901  for (unsigned OpIdx = 0, NumOperands = getNumOperands();
902  OpIdx != NumOperands; ++OpIdx)
903  for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
904  ++Lane)
905  OpsVec[OpIdx][Lane].IsUsed = false;
906  }
907 
908  /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
909  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
910  std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
911  }
912 
913  // The hard-coded scores listed here are not very important. When computing
914  // the scores of matching one sub-tree with another, we are basically
915  // counting the number of values that are matching. So even if all scores
916  // are set to 1, we would still get a decent matching result.
917  // However, sometimes we have to break ties. For example we may have to
918  // choose between matching loads vs matching opcodes. This is what these
919  // scores are helping us with: they provide the order of preference.
920 
921  /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
922  static const int ScoreConsecutiveLoads = 3;
923  /// ExtractElementInst from same vector and consecutive indexes.
924  static const int ScoreConsecutiveExtracts = 3;
925  /// Constants.
926  static const int ScoreConstants = 2;
927  /// Instructions with the same opcode.
928  static const int ScoreSameOpcode = 2;
929  /// Instructions with alt opcodes (e.g, add + sub).
930  static const int ScoreAltOpcodes = 1;
931  /// Identical instructions (a.k.a. splat or broadcast).
932  static const int ScoreSplat = 1;
933  /// Matching with an undef is preferable to failing.
934  static const int ScoreUndef = 1;
935  /// Score for failing to find a decent match.
936  static const int ScoreFail = 0;
937  /// User external to the vectorized code.
938  static const int ExternalUseCost = 1;
939  /// The user is internal but in a different lane.
940  static const int UserInDiffLaneCost = ExternalUseCost;
941 
942  /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
943  static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
944  ScalarEvolution &SE) {
945  auto *LI1 = dyn_cast<LoadInst>(V1);
946  auto *LI2 = dyn_cast<LoadInst>(V2);
947  if (LI1 && LI2) {
948  if (LI1->getParent() != LI2->getParent())
949  return VLOperands::ScoreFail;
950 
951  Optional<int> Dist =
952  getPointersDiff(LI1->getPointerOperand(), LI2->getPointerOperand(),
953  DL, SE, /*StrictCheck=*/true);
954  return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads
955  : VLOperands::ScoreFail;
956  }
957 
958  auto *C1 = dyn_cast<Constant>(V1);
959  auto *C2 = dyn_cast<Constant>(V2);
960  if (C1 && C2)
961  return VLOperands::ScoreConstants;
962 
963  // Extracts from consecutive indexes of the same vector get a better score,
964  // as the extracts could be optimized away.
965  Value *EV;
966  ConstantInt *Ex1Idx, *Ex2Idx;
967  if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
968  match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
969  Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
970  return VLOperands::ScoreConsecutiveExtracts;
971 
972  auto *I1 = dyn_cast<Instruction>(V1);
973  auto *I2 = dyn_cast<Instruction>(V2);
974  if (I1 && I2) {
975  if (I1 == I2)
976  return VLOperands::ScoreSplat;
977  InstructionsState S = getSameOpcode({I1, I2});
978  // Note: Only consider instructions with <= 2 operands to avoid
979  // complexity explosion.
980  if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
981  return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
982  : VLOperands::ScoreSameOpcode;
983  }
984 
985  if (isa<UndefValue>(V2))
986  return VLOperands::ScoreUndef;
987 
988  return VLOperands::ScoreFail;
989  }
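// Score examples under the constants defined above: two loads of A[i] and
// A[i+1] in the same block score ScoreConsecutiveLoads (3); two distinct adds
// score ScoreSameOpcode (2); an add paired with a sub scores ScoreAltOpcodes
// (1); two loads from unrelated addresses score ScoreFail (0).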
990 
991  /// Holds the values and their lane that are taking part in the look-ahead
992  /// score calculation. This is used in the external uses cost calculation.
993  SmallDenseMap<Value *, int> InLookAheadValues;
994 
995  /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
996  /// either external to the vectorized code, or require shuffling.
997  int getExternalUsesCost(const std::pair<Value *, int> &LHS,
998  const std::pair<Value *, int> &RHS) {
999  int Cost = 0;
1000  std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
1001  for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
1002  Value *V = Values[Idx].first;
1003  if (isa<Constant>(V)) {
1004  // Since this is a function pass, it doesn't make semantic sense to
1005  // walk the users of a subclass of Constant. The users could be in
1006  // another function, or even another module that happens to be in
1007  // the same LLVMContext.
1008  continue;
1009  }
1010 
1011  // Calculate the absolute lane, using the minimum relative lane of LHS
1012  // and RHS as base and Idx as the offset.
1013  int Ln = std::min(LHS.second, RHS.second) + Idx;
1014  assert(Ln >= 0 && "Bad lane calculation");
1015  unsigned UsersBudget = LookAheadUsersBudget;
1016  for (User *U : V->users()) {
1017  if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
1018  // The user is in the VectorizableTree. Check if we need to insert.
1019  auto It = llvm::find(UserTE->Scalars, U);
1020  assert(It != UserTE->Scalars.end() && "U is in UserTE");
1021  int UserLn = std::distance(UserTE->Scalars.begin(), It);
1022  assert(UserLn >= 0 && "Bad lane");
1023  if (UserLn != Ln)
1024  Cost += UserInDiffLaneCost;
1025  } else {
1026  // Check if the user is in the look-ahead code.
1027  auto It2 = InLookAheadValues.find(U);
1028  if (It2 != InLookAheadValues.end()) {
1029  // The user is in the look-ahead code. Check the lane.
1030  if (It2->second != Ln)
1031  Cost += UserInDiffLaneCost;
1032  } else {
1033  // The user is neither in SLP tree nor in the look-ahead code.
1034  Cost += ExternalUseCost;
1035  }
1036  }
1037  // Limit the number of visited uses to cap compilation time.
1038  if (--UsersBudget == 0)
1039  break;
1040  }
1041  }
1042  return Cost;
1043  }
1044 
1045  /// Go through the operands of \p LHS and \p RHS recursively until \p
1046  /// MaxLevel, and return the cumulative score. For example:
1047  /// \verbatim
1048  /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1049  /// \ / \ / \ / \ /
1050  /// + + + +
1051  /// G1 G2 G3 G4
1052  /// \endverbatim
1053  /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1054  /// each level recursively, accumulating the score. It starts from matching
1055  /// the additions at level 0, then moves on to the loads (level 1). The
1056  /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1057  /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
1058  /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
1059  /// Please note that the order of the operands does not matter, as we
1060  /// evaluate the score of all profitable combinations of operands. In
1061  /// other words the score of G1 and G4 is the same as G1 and G2. This
1062  /// heuristic is based on ideas described in:
1063  /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1064  /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1065  /// Luís F. W. Góes
1066  int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
1067  const std::pair<Value *, int> &RHS, int CurrLevel,
1068  int MaxLevel) {
1069 
1070  Value *V1 = LHS.first;
1071  Value *V2 = RHS.first;
1072  // Get the shallow score of V1 and V2.
1073  int ShallowScoreAtThisLevel =
1074  std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
1075  getExternalUsesCost(LHS, RHS));
1076  int Lane1 = LHS.second;
1077  int Lane2 = RHS.second;
1078 
1079  // If reached MaxLevel,
1080  // or if V1 and V2 are not instructions,
1081  // or if they are SPLAT,
1082  // or if they are not consecutive, early return the current cost.
1083  auto *I1 = dyn_cast<Instruction>(V1);
1084  auto *I2 = dyn_cast<Instruction>(V2);
1085  if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1086  ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
1087  (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
1088  return ShallowScoreAtThisLevel;
1089  assert(I1 && I2 && "Should have early exited.");
1090 
1091  // Keep track of in-tree values for determining the external-use cost.
1092  InLookAheadValues[V1] = Lane1;
1093  InLookAheadValues[V2] = Lane2;
1094 
1095  // Contains the I2 operand indexes that got matched with I1 operands.
1096  SmallSet<unsigned, 4> Op2Used;
1097 
1098  // Recursion towards the operands of I1 and I2. We are trying all possible
1099  // operand pairs, and keeping track of the best score.
1100  for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1101  OpIdx1 != NumOperands1; ++OpIdx1) {
1102  // Try to pair the operand of I1 at OpIdx1 with the best operand of I2.
1103  int MaxTmpScore = 0;
1104  unsigned MaxOpIdx2 = 0;
1105  bool FoundBest = false;
1106  // If I2 is commutative try all combinations.
1107  unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1108  unsigned ToIdx = isCommutative(I2)
1109  ? I2->getNumOperands()
1110  : std::min(I2->getNumOperands(), OpIdx1 + 1);
1111  assert(FromIdx <= ToIdx && "Bad index");
1112  for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1113  // Skip operands already paired with OpIdx1.
1114  if (Op2Used.count(OpIdx2))
1115  continue;
1116  // Recursively calculate the cost at each level
1117  int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
1118  {I2->getOperand(OpIdx2), Lane2},
1119  CurrLevel + 1, MaxLevel);
1120  // Look for the best score.
1121  if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
1122  MaxTmpScore = TmpScore;
1123  MaxOpIdx2 = OpIdx2;
1124  FoundBest = true;
1125  }
1126  }
1127  if (FoundBest) {
1128  // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1129  Op2Used.insert(MaxOpIdx2);
1130  ShallowScoreAtThisLevel += MaxTmpScore;
1131  }
1132  }
1133  return ShallowScoreAtThisLevel;
1134  }
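// Applying this to the diagram above with LookAheadMaxDepth = 2 and ignoring
// the external-use cost: matching G1 = A[0]+B[0] against G2 = A[1]+B[1]
// scores ScoreSameOpcode for the adds at level 1 plus ScoreConsecutiveLoads
// for each of {A[0],A[1]} and {B[0],B[1]} at level 2, i.e. 2 + 3 + 3 = 8,
// while G1 against G3 = C[0]+D[0] only scores the adds (2), so G2 is the
// preferred neighbour for G1.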
1135 
1136  /// \Returns the look-ahead score, which tells us how much the sub-trees
1137  /// rooted at \p LHS and \p RHS match, the more they match the higher the
1138  /// score. This helps break ties in an informed way when we cannot decide on
1139  /// the order of the operands by just considering the immediate
1140  /// predecessors.
1141  int getLookAheadScore(const std::pair<Value *, int> &LHS,
1142  const std::pair<Value *, int> &RHS) {
1143  InLookAheadValues.clear();
1144  return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
1145  }
1146 
1147  // Search all operands in Ops[*][Lane] for the one that best matches
1148  // Ops[OpIdx][LastLane] and return its operand index.
1149  // If no good match can be found, return None.
1150  Optional<unsigned>
1151  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1152  ArrayRef<ReorderingMode> ReorderingModes) {
1153  unsigned NumOperands = getNumOperands();
1154 
1155  // The operand of the previous lane at OpIdx.
1156  Value *OpLastLane = getData(OpIdx, LastLane).V;
1157 
1158  // Our strategy mode for OpIdx.
1159  ReorderingMode RMode = ReorderingModes[OpIdx];
1160 
1161  // The linearized opcode of the operand at OpIdx, Lane.
1162  bool OpIdxAPO = getData(OpIdx, Lane).APO;
1163 
1164  // The best operand index and its score.
1165  // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1166  // are using the score to differentiate between the two.
1167  struct BestOpData {
1168  Optional<unsigned> Idx = None;
1169  unsigned Score = 0;
1170  } BestOp;
1171 
1172  // Iterate through all unused operands and look for the best.
1173  for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1174  // Get the operand at Idx and Lane.
1175  OperandData &OpData = getData(Idx, Lane);
1176  Value *Op = OpData.V;
1177  bool OpAPO = OpData.APO;
1178 
1179  // Skip already selected operands.
1180  if (OpData.IsUsed)
1181  continue;
1182 
1183  // Skip if we are trying to move the operand to a position with a
1184  // different opcode in the linearized tree form. This would break the
1185  // semantics.
1186  if (OpAPO != OpIdxAPO)
1187  continue;
1188 
1189  // Look for an operand that matches the current mode.
1190  switch (RMode) {
1191  case ReorderingMode::Load:
1192  case ReorderingMode::Constant:
1193  case ReorderingMode::Opcode: {
1194  bool LeftToRight = Lane > LastLane;
1195  Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1196  Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1197  unsigned Score =
1198  getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
1199  if (Score > BestOp.Score) {
1200  BestOp.Idx = Idx;
1201  BestOp.Score = Score;
1202  }
1203  break;
1204  }
1205  case ReorderingMode::Splat:
1206  if (Op == OpLastLane)
1207  BestOp.Idx = Idx;
1208  break;
1209  case ReorderingMode::Failed:
1210  return None;
1211  }
1212  }
1213 
1214  if (BestOp.Idx) {
1215  getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
1216  return BestOp.Idx;
1217  }
1218  // If we could not find a good match return None.
1219  return None;
1220  }
1221 
1222  /// Helper for reorderOperandVecs. \Returns the lane that we should start
1223  /// reordering from. This is the one which has the least number of operands
1224  /// that can freely move about.
1225  unsigned getBestLaneToStartReordering() const {
1226  unsigned BestLane = 0;
1227  unsigned Min = UINT_MAX;
1228  for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1229  ++Lane) {
1230  unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
1231  if (NumFreeOps < Min) {
1232  Min = NumFreeOps;
1233  BestLane = Lane;
1234  }
1235  }
1236  return BestLane;
1237  }
1238 
1239  /// \Returns the maximum number of operands that are allowed to be reordered
1240  /// for \p Lane. This is used as a heuristic for selecting the first lane to
1241  /// start operand reordering.
1242  unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1243  unsigned CntTrue = 0;
1244  unsigned NumOperands = getNumOperands();
1245  // Operands with the same APO can be reordered. We therefore need to count
1246  // how many of them we have for each APO, like this: Cnt[APO] = x.
1247  // Since we only have two APOs, namely true and false, we can avoid using
1248  // a map. Instead we can simply count the number of operands that
1249  // correspond to one of them (in this case the 'true' APO), and calculate
1250  // the other by subtracting it from the total number of operands.
1251  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
1252  if (getData(OpIdx, Lane).APO)
1253  ++CntTrue;
1254  unsigned CntFalse = NumOperands - CntTrue;
1255  return std::max(CntTrue, CntFalse);
1256  }
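// For instance, in a lane a = b + c both operands have APO == false, so the
// result is max(0, 2) = 2; in a = b - c only the RHS has APO == true, giving
// max(1, 1) = 1. Lanes with the smaller value (the subtractions) are
// therefore picked by getBestLaneToStartReordering() above.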
1257 
1258  /// Go through the instructions in VL and append their operands.
1259  void appendOperandsOfVL(ArrayRef<Value *> VL) {
1260  assert(!VL.empty() && "Bad VL");
1261  assert((empty() || VL.size() == getNumLanes()) &&
1262  "Expected same number of lanes");
1263  assert(isa<Instruction>(VL[0]) && "Expected instruction");
1264  unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1265  OpsVec.resize(NumOperands);
1266  unsigned NumLanes = VL.size();
1267  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1268  OpsVec[OpIdx].resize(NumLanes);
1269  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1270  assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
1271  // Our tree has just 3 nodes: the root and two operands.
1272  // It is therefore trivial to get the APO. We only need to check the
1273  // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
1274  // RHS operand. The LHS operand of both add and sub is never attached
1275  // to an inverse operation in the linearized form, therefore its APO
1276  // is false. The RHS is true only if VL[Lane] is an inverse operation.
1277 
1278  // Since operand reordering is performed on groups of commutative
1279  // operations or alternating sequences (e.g., +, -), we can safely
1280  // tell the inverse operations by checking commutativity.
1281  bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
1282  bool APO = (OpIdx == 0) ? false : IsInverseOperation;
1283  OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
1284  APO, false};
1285  }
1286  }
1287  }
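// For example, with VL = { sub i32 %x0, %y0, sub i32 %x1, %y1 } the
// subtraction is not commutative, so OpsVec[0] holds {%x0, %x1} with
// APO == false and OpsVec[1] holds {%y0, %y1} with APO == true, recording
// that the RHS operands sit behind an inverse operation in the linearized
// form.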
1288 
1289  /// \returns the number of operands.
1290  unsigned getNumOperands() const { return OpsVec.size(); }
1291 
1292  /// \returns the number of lanes.
1293  unsigned getNumLanes() const { return OpsVec[0].size(); }
1294 
1295  /// \returns the operand value at \p OpIdx and \p Lane.
1296  Value *getValue(unsigned OpIdx, unsigned Lane) const {
1297  return getData(OpIdx, Lane).V;
1298  }
1299 
1300  /// \returns true if the data structure is empty.
1301  bool empty() const { return OpsVec.empty(); }
1302 
1303  /// Clears the data.
1304  void clear() { OpsVec.clear(); }
1305 
1306  /// \Returns true if there are enough operands identical to \p Op to fill
1307  /// the whole vector.
1308  /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
1309  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
1310  bool OpAPO = getData(OpIdx, Lane).APO;
1311  for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
1312  if (Ln == Lane)
1313  continue;
1314  // This is set to true if we found a candidate for broadcast at Lane.
1315  bool FoundCandidate = false;
1316  for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
1317  OperandData &Data = getData(OpI, Ln);
1318  if (Data.APO != OpAPO || Data.IsUsed)
1319  continue;
1320  if (Data.V == Op) {
1321  FoundCandidate = true;
1322  Data.IsUsed = true;
1323  break;
1324  }
1325  }
1326  if (!FoundCandidate)
1327  return false;
1328  }
1329  return true;
1330  }
1331 
1332  public:
1333  /// Initialize with all the operands of the instruction vector \p RootVL.
1334  VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
1335  ScalarEvolution &SE, const BoUpSLP &R)
1336  : DL(DL), SE(SE), R(R) {
1337  // Append all the operands of RootVL.
1338  appendOperandsOfVL(RootVL);
1339  }
1340 
1341  /// \Returns a value vector with the operands across all lanes for the
1342  /// operand at \p OpIdx.
1343  ValueList getVL(unsigned OpIdx) const {
1344  ValueList OpVL(OpsVec[OpIdx].size());
1345  assert(OpsVec[OpIdx].size() == getNumLanes() &&
1346  "Expected same num of lanes across all operands");
1347  for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
1348  OpVL[Lane] = OpsVec[OpIdx][Lane].V;
1349  return OpVL;
1350  }
1351 
1352  // Performs operand reordering for 2 or more operands.
1353  // The original operands are in OrigOps[OpIdx][Lane].
1354  // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
1355  void reorder() {
1356  unsigned NumOperands = getNumOperands();
1357  unsigned NumLanes = getNumLanes();
1358  // Each operand has its own mode. We are using this mode to help us select
1359  // the instructions for each lane, so that they match best with the ones
1360  // we have selected so far.
1361  SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
1362 
1363  // This is a greedy single-pass algorithm. We are going over each lane
1364  // once and deciding on the best order right away with no back-tracking.
1365  // However, in order to increase its effectiveness, we start with the lane
1366  // that has operands that can move the least. For example, given the
1367  // following lanes:
1368  // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
1369  // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
1370  // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
1371  // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
1372  // we will start at Lane 1, since the operands of the subtraction cannot
1373  // be reordered. Then we will visit the rest of the lanes in a circular
1374  // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
1375 
1376  // Find the first lane that we will start our search from.
1377  unsigned FirstLane = getBestLaneToStartReordering();
1378 
1379  // Initialize the modes.
1380  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1381  Value *OpLane0 = getValue(OpIdx, FirstLane);
1382  // Keep track if we have instructions with all the same opcode on one
1383  // side.
1384  if (isa<LoadInst>(OpLane0))
1385  ReorderingModes[OpIdx] = ReorderingMode::Load;
1386  else if (isa<Instruction>(OpLane0)) {
1387  // Check if OpLane0 should be broadcast.
1388  if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
1389  ReorderingModes[OpIdx] = ReorderingMode::Splat;
1390  else
1391  ReorderingModes[OpIdx] = ReorderingMode::Opcode;
1392  }
1393  else if (isa<Constant>(OpLane0))
1394  ReorderingModes[OpIdx] = ReorderingMode::Constant;
1395  else if (isa<Argument>(OpLane0))
1396  // Our best hope is a Splat. It may save some cost in some cases.
1397  ReorderingModes[OpIdx] = ReorderingMode::Splat;
1398  else
1399  // NOTE: This should be unreachable.
1400  ReorderingModes[OpIdx] = ReorderingMode::Failed;
1401  }
1402 
1403  // If the initial strategy fails for any of the operand indexes, then we
1404  // perform reordering again in a second pass. This helps avoid assigning
1405  // high priority to the failed strategy, and should improve reordering for
1406  // the non-failed operand indexes.
1407  for (int Pass = 0; Pass != 2; ++Pass) {
1408  // Skip the second pass if the first pass did not fail.
1409  bool StrategyFailed = false;
1410  // Mark all operand data as free to use.
1411  clearUsed();
1412  // We keep the original operand order for the FirstLane, so reorder the
1413  // rest of the lanes. We are visiting the nodes in a circular fashion,
1414  // using FirstLane as the center point and increasing the radius
1415  // distance.
1416  for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
1417  // Visit the lane on the right and then the lane on the left.
1418  for (int Direction : {+1, -1}) {
1419  int Lane = FirstLane + Direction * Distance;
1420  if (Lane < 0 || Lane >= (int)NumLanes)
1421  continue;
1422  int LastLane = Lane - Direction;
1423  assert(LastLane >= 0 && LastLane < (int)NumLanes &&
1424  "Out of bounds");
1425  // Look for a good match for each operand.
1426  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1427  // Search for the operand that matches SortedOps[OpIdx][Lane-1].
1428  Optional<unsigned> BestIdx =
1429  getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
1430  // By not selecting a value, we allow the operands that follow to
1431  // select a better matching value. We will get a non-null value in
1432  // the next run of getBestOperand().
1433  if (BestIdx) {
1434  // Swap the current operand with the one returned by
1435  // getBestOperand().
1436  swap(OpIdx, BestIdx.getValue(), Lane);
1437  } else {
1438  // We failed to find a best operand, set mode to 'Failed'.
1439  ReorderingModes[OpIdx] = ReorderingMode::Failed;
1440  // Enable the second pass.
1441  StrategyFailed = true;
1442  }
1443  }
1444  }
1445  }
1446  // Skip second pass if the strategy did not fail.
1447  if (!StrategyFailed)
1448  break;
1449  }
1450  }
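// Tracing the lane example in the comment above: reordering starts at Lane 1
// (the subtraction cannot reorder its operands), then visits Lane 2, Lane 0
// and Lane 3 at increasing distance. In every visited lane getBestOperand()
// picks, per operand index, the candidate with the highest look-ahead score
// against the previously chosen lane, so the B's and C's end up grouped into
// the same operand vectors despite the mixed +/- pattern.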
1451 
1452 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1453  LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
1454  switch (RMode) {
1455  case ReorderingMode::Load:
1456  return "Load";
1457  case ReorderingMode::Opcode:
1458  return "Opcode";
1459  case ReorderingMode::Constant:
1460  return "Constant";
1461  case ReorderingMode::Splat:
1462  return "Splat";
1463  case ReorderingMode::Failed:
1464  return "Failed";
1465  }
1466  llvm_unreachable("Unimplemented Reordering Type");
1467  }
1468 
1469  LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
1470  raw_ostream &OS) {
1471  return OS << getModeStr(RMode);
1472  }
1473 
1474  /// Debug print.
1475  LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
1476  printMode(RMode, dbgs());
1477  }
1478 
1479  friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
1480  return printMode(RMode, OS);
1481  }
1482 
1483  LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
1484  const unsigned Indent = 2;
1485  unsigned Cnt = 0;
1486  for (const OperandDataVec &OpDataVec : OpsVec) {
1487  OS << "Operand " << Cnt++ << "\n";
1488  for (const OperandData &OpData : OpDataVec) {
1489  OS.indent(Indent) << "{";
1490  if (Value *V = OpData.V)
1491  OS << *V;
1492  else
1493  OS << "null";
1494  OS << ", APO:" << OpData.APO << "}\n";
1495  }
1496  OS << "\n";
1497  }
1498  return OS;
1499  }
1500 
1501  /// Debug print.
1502  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
1503 #endif
1504  };
1505 
1506  /// Checks if the instruction is marked for deletion.
1507  bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
1508 
1509  /// Marks the values' operands for later deletion by replacing them with Undefs.
1510  void eraseInstructions(ArrayRef<Value *> AV);
1511 
1512  ~BoUpSLP();
1513 
1514 private:
1515  /// Checks if all users of \p I are the part of the vectorization tree.
1516  bool areAllUsersVectorized(Instruction *I) const;
1517 
1518  /// \returns the cost of the vectorizable entry.
1519  InstructionCost getEntryCost(TreeEntry *E);
1520 
1521  /// This is the recursive part of buildTree.
1522  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
1523  const EdgeInfo &EI);
1524 
1525  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
1526  /// be vectorized to use the original vector (or aggregate "bitcast" to a
1527  /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
1528  /// returns false, setting \p CurrentOrder to either an empty vector or a
1529  /// non-identity permutation that allows reusing extract instructions.
1530  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
1531  SmallVectorImpl<unsigned> &CurrentOrder) const;
1532 
1533  /// Vectorize a single entry in the tree.
1534  Value *vectorizeTree(TreeEntry *E);
1535 
1536  /// Vectorize a single entry in the tree, starting in \p VL.
1537  Value *vectorizeTree(ArrayRef<Value *> VL);
1538 
1539  /// \returns the scalarization cost for this type. Scalarization in this
1540  /// context means the creation of vectors from a group of scalars.
1541  InstructionCost
1542  getGatherCost(FixedVectorType *Ty,
1543  const DenseSet<unsigned> &ShuffledIndices) const;
1544 
1545  /// \returns the scalarization cost for this list of values. Assuming that
1546  /// this subtree gets vectorized, we may need to extract the values from the
1547  /// roots. This method calculates the cost of extracting the values.
1548  InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
1549 
1550  /// Set the Builder insert point to one after the last instruction in
1551  /// the bundle
1552  void setInsertPointAfterBundle(TreeEntry *E);
1553 
1554  /// \returns a vector from a collection of scalars in \p VL.
1555  Value *gather(ArrayRef<Value *> VL);
1556 
1557  /// \returns whether the VectorizableTree is fully vectorizable and will
1558  /// be beneficial even if the tree height is tiny.
1559  bool isFullyVectorizableTinyTree() const;
1560 
1561  /// Reorder commutative or alt operands to get better probability of
1562  /// generating vectorized code.
1563  static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1564  SmallVectorImpl<Value *> &Left,
1565  SmallVectorImpl<Value *> &Right,
1566  const DataLayout &DL,
1567  ScalarEvolution &SE,
1568  const BoUpSLP &R);
1569  struct TreeEntry {
1570  using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
1571  TreeEntry(VecTreeTy &Container) : Container(Container) {}
1572 
1573  /// \returns true if the scalars in VL are equal to this entry.
1574  bool isSame(ArrayRef<Value *> VL) const {
1575  if (VL.size() == Scalars.size())
1576  return std::equal(VL.begin(), VL.end(), Scalars.begin());
1577  return VL.size() == ReuseShuffleIndices.size() &&
1578  std::equal(
1579  VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
1580  [this](Value *V, int Idx) { return V == Scalars[Idx]; });
1581  }
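 // Illustrative example (not part of the original file; the names are
 // hypothetical): with Scalars = {%a, %b} and ReuseShuffleIndices = {0, 1, 0, 1},
 // a bundle VL = {%a, %b, %a, %b} compares equal through the second branch
 // above, since VL[i] == Scalars[ReuseShuffleIndices[i]] for every position i.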
1582 
1583  /// A vector of scalars.
1584  ValueList Scalars;
1585 
1586  /// The Scalars are vectorized into this value. It is initialized to Null.
1587  Value *VectorizedValue = nullptr;
1588 
1589  /// Do we need to gather this sequence or vectorize it
1590  /// (either with vector instruction or with scatter/gather
1591  /// intrinsics for store/load)?
1592  enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
1593  EntryState State;
1594 
1595  /// Does this sequence require some shuffling?
1596  SmallVector<int, 4> ReuseShuffleIndices;
1597 
1598  /// Does this entry require reordering?
1599  SmallVector<unsigned, 4> ReorderIndices;
1600 
1601  /// Points back to the VectorizableTree.
1602  ///
1603  /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
1604  /// to be a pointer and needs to be able to initialize the child iterator.
1605  /// Thus we need a reference back to the container to translate the indices
1606  /// to entries.
1607  VecTreeTy &Container;
1608 
1609  /// The TreeEntry index containing the user of this entry. We can actually
1610  /// have multiple users so the data structure is not truly a tree.
1611  SmallVector<EdgeInfo, 1> UserTreeIndices;
1612 
1613  /// The index of this treeEntry in VectorizableTree.
1614  int Idx = -1;
1615 
1616  private:
1617  /// The operands of each instruction in each lane Operands[op_index][lane].
1618  /// Note: This helps avoid the replication of the code that performs the
1619  /// reordering of operands during buildTree_rec() and vectorizeTree().
1620  SmallVector<ValueList, 2> Operands;
1621 
1622  /// The main/alternate instruction.
1623  Instruction *MainOp = nullptr;
1624  Instruction *AltOp = nullptr;
1625 
1626  public:
1627  /// Set this bundle's \p OpIdx'th operand to \p OpVL.
1628  void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
1629  if (Operands.size() < OpIdx + 1)
1630  Operands.resize(OpIdx + 1);
1631  assert(Operands[OpIdx].size() == 0 && "Already resized?");
1632  Operands[OpIdx].resize(Scalars.size());
1633  for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
1634  Operands[OpIdx][Lane] = OpVL[Lane];
1635  }
1636 
1637  /// Set the operands of this bundle in their original order.
1638  void setOperandsInOrder() {
1639  assert(Operands.empty() && "Already initialized?");
1640  auto *I0 = cast<Instruction>(Scalars[0]);
1641  Operands.resize(I0->getNumOperands());
1642  unsigned NumLanes = Scalars.size();
1643  for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
1644  OpIdx != NumOperands; ++OpIdx) {
1645  Operands[OpIdx].resize(NumLanes);
1646  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1647  auto *I = cast<Instruction>(Scalars[Lane]);
1648  assert(I->getNumOperands() == NumOperands &&
1649  "Expected same number of operands");
1650  Operands[OpIdx][Lane] = I->getOperand(OpIdx);
1651  }
1652  }
1653  }
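 // Illustrative example (not part of the original file; the names are
 // hypothetical): for Scalars = {%a = add i32 %x0, %y0, %b = add i32 %x1, %y1}
 // this fills
 //   Operands[0] = {%x0, %x1}   // first operand of each lane
 //   Operands[1] = {%y0, %y1}   // second operand of each lane
 // i.e. Operands is indexed as Operands[operand index][lane].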
1654 
1655  /// \returns the \p OpIdx operand of this TreeEntry.
1656  ValueList &getOperand(unsigned OpIdx) {
1657  assert(OpIdx < Operands.size() && "Off bounds");
1658  return Operands[OpIdx];
1659  }
1660 
1661  /// \returns the number of operands.
1662  unsigned getNumOperands() const { return Operands.size(); }
1663 
1664  /// \return the single \p OpIdx operand.
1665  Value *getSingleOperand(unsigned OpIdx) const {
1666  assert(OpIdx < Operands.size() && "Off bounds");
1667  assert(!Operands[OpIdx].empty() && "No operand available");
1668  return Operands[OpIdx][0];
1669  }
1670 
1671  /// Some of the instructions in the list have alternate opcodes.
1672  bool isAltShuffle() const {
1673  return getOpcode() != getAltOpcode();
1674  }
1675 
1676  bool isOpcodeOrAlt(Instruction *I) const {
1677  unsigned CheckedOpcode = I->getOpcode();
1678  return (getOpcode() == CheckedOpcode ||
1679  getAltOpcode() == CheckedOpcode);
1680  }
1681 
1682  /// Chooses the correct key for scheduling data. If \p Op has the same (or
1683  /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
1684  /// \p OpValue.
1685  Value *isOneOf(Value *Op) const {
1686  auto *I = dyn_cast<Instruction>(Op);
1687  if (I && isOpcodeOrAlt(I))
1688  return Op;
1689  return MainOp;
1690  }
1691 
1692  void setOperations(const InstructionsState &S) {
1693  MainOp = S.MainOp;
1694  AltOp = S.AltOp;
1695  }
1696 
1697  Instruction *getMainOp() const {
1698  return MainOp;
1699  }
1700 
1701  Instruction *getAltOp() const {
1702  return AltOp;
1703  }
1704 
1705  /// The main/alternate opcodes for the list of instructions.
1706  unsigned getOpcode() const {
1707  return MainOp ? MainOp->getOpcode() : 0;
1708  }
1709 
1710  unsigned getAltOpcode() const {
1711  return AltOp ? AltOp->getOpcode() : 0;
1712  }
1713 
1714  /// Update the operations state of this entry if reordering occurred.
1715  bool updateStateIfReorder() {
1716  if (ReorderIndices.empty())
1717  return false;
1718  InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
1719  setOperations(S);
1720  return true;
1721  }
1722 
1723 #ifndef NDEBUG
1724  /// Debug printer.
1725  LLVM_DUMP_METHOD void dump() const {
1726  dbgs() << Idx << ".\n";
1727  for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
1728  dbgs() << "Operand " << OpI << ":\n";
1729  for (const Value *V : Operands[OpI])
1730  dbgs().indent(2) << *V << "\n";
1731  }
1732  dbgs() << "Scalars: \n";
1733  for (Value *V : Scalars)
1734  dbgs().indent(2) << *V << "\n";
1735  dbgs() << "State: ";
1736  switch (State) {
1737  case Vectorize:
1738  dbgs() << "Vectorize\n";
1739  break;
1740  case ScatterVectorize:
1741  dbgs() << "ScatterVectorize\n";
1742  break;
1743  case NeedToGather:
1744  dbgs() << "NeedToGather\n";
1745  break;
1746  }
1747  dbgs() << "MainOp: ";
1748  if (MainOp)
1749  dbgs() << *MainOp << "\n";
1750  else
1751  dbgs() << "NULL\n";
1752  dbgs() << "AltOp: ";
1753  if (AltOp)
1754  dbgs() << *AltOp << "\n";
1755  else
1756  dbgs() << "NULL\n";
1757  dbgs() << "VectorizedValue: ";
1758  if (VectorizedValue)
1759  dbgs() << *VectorizedValue << "\n";
1760  else
1761  dbgs() << "NULL\n";
1762  dbgs() << "ReuseShuffleIndices: ";
1763  if (ReuseShuffleIndices.empty())
1764  dbgs() << "Empty";
1765  else
1766  for (unsigned ReuseIdx : ReuseShuffleIndices)
1767  dbgs() << ReuseIdx << ", ";
1768  dbgs() << "\n";
1769  dbgs() << "ReorderIndices: ";
1770  for (unsigned ReorderIdx : ReorderIndices)
1771  dbgs() << ReorderIdx << ", ";
1772  dbgs() << "\n";
1773  dbgs() << "UserTreeIndices: ";
1774  for (const auto &EInfo : UserTreeIndices)
1775  dbgs() << EInfo << ", ";
1776  dbgs() << "\n";
1777  }
1778 #endif
1779  };
1780 
1781 #ifndef NDEBUG
1782  void dumpTreeCosts(TreeEntry *E, InstructionCost ReuseShuffleCost,
1783  InstructionCost VecCost,
1784  InstructionCost ScalarCost) const {
1785  dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();
1786  dbgs() << "SLP: Costs:\n";
1787  dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
1788  dbgs() << "SLP: VectorCost = " << VecCost << "\n";
1789  dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
1790  dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " <<
1791  ReuseShuffleCost + VecCost - ScalarCost << "\n";
1792  }
1793 #endif
1794 
1795  /// Create a new VectorizableTree entry.
1796  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
1797  const InstructionsState &S,
1798  const EdgeInfo &UserTreeIdx,
1799  ArrayRef<unsigned> ReuseShuffleIndices = None,
1800  ArrayRef<unsigned> ReorderIndices = None) {
1801  TreeEntry::EntryState EntryState =
1802  Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
1803  return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
1804  ReuseShuffleIndices, ReorderIndices);
1805  }
1806 
1807  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
1808  TreeEntry::EntryState EntryState,
1809  Optional<ScheduleData *> Bundle,
1810  const InstructionsState &S,
1811  const EdgeInfo &UserTreeIdx,
1812  ArrayRef<unsigned> ReuseShuffleIndices = None,
1813  ArrayRef<unsigned> ReorderIndices = None) {
1814  assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
1815  (Bundle && EntryState != TreeEntry::NeedToGather)) &&
1816  "Need to vectorize gather entry?");
1817  VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
1818  TreeEntry *Last = VectorizableTree.back().get();
1819  Last->Idx = VectorizableTree.size() - 1;
1820  Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
1821  Last->State = EntryState;
1822  Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
1823  ReuseShuffleIndices.end());
1824  Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
1825  Last->setOperations(S);
1826  if (Last->State != TreeEntry::NeedToGather) {
1827  for (Value *V : VL) {
1828  assert(!getTreeEntry(V) && "Scalar already in tree!");
1829  ScalarToTreeEntry[V] = Last;
1830  }
1831  // Update the scheduler bundle to point to this TreeEntry.
1832  unsigned Lane = 0;
1833  for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
1834  BundleMember = BundleMember->NextInBundle) {
1835  BundleMember->TE = Last;
1836  BundleMember->Lane = Lane;
1837  ++Lane;
1838  }
1839  assert((!Bundle.getValue() || Lane == VL.size()) &&
1840  "Bundle and VL out of sync");
1841  } else {
1842  MustGather.insert(VL.begin(), VL.end());
1843  }
1844 
1845  if (UserTreeIdx.UserTE)
1846  Last->UserTreeIndices.push_back(UserTreeIdx);
1847 
1848  return Last;
1849  }
1850 
1851  /// -- Vectorization State --
1852  /// Holds all of the tree entries.
1853  TreeEntry::VecTreeTy VectorizableTree;
1854 
1855 #ifndef NDEBUG
1856  /// Debug printer.
1857  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
1858  for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
1859  VectorizableTree[Id]->dump();
1860  dbgs() << "\n";
1861  }
1862  }
1863 #endif
1864 
1865  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
1866 
1867  const TreeEntry *getTreeEntry(Value *V) const {
1868  return ScalarToTreeEntry.lookup(V);
1869  }
1870 
1871  /// Maps a specific scalar to its tree entry.
1872  SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
1873 
1874  /// Maps a value to the proposed vectorizable size.
1875  SmallDenseMap<Value *, unsigned> InstrElementSize;
1876 
1877  /// A list of scalars that we found that we need to keep as scalars.
1878  ValueSet MustGather;
1879 
1880  /// This POD struct describes one external user in the vectorized tree.
1881  struct ExternalUser {
1882  ExternalUser(Value *S, llvm::User *U, int L)
1883  : Scalar(S), User(U), Lane(L) {}
1884 
1885  // Which scalar in our function.
1886  Value *Scalar;
1887 
1888  // Which user that uses the scalar.
1889  llvm::User *User;
1890 
1891  // Which lane does the scalar belong to.
1892  int Lane;
1893  };
1894  using UserList = SmallVector<ExternalUser, 16>;
1895 
1896  /// Checks if two instructions may access the same memory.
1897  ///
1898  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
1899  /// is invariant in the calling loop.
1900  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
1901  Instruction *Inst2) {
1902  // First check if the result is already in the cache.
1903  AliasCacheKey key = std::make_pair(Inst1, Inst2);
1904  Optional<bool> &result = AliasCache[key];
1905  if (result.hasValue()) {
1906  return result.getValue();
1907  }
1908  MemoryLocation Loc2 = getLocation(Inst2, AA);
1909  bool aliased = true;
1910  if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
1911  // Do the alias check.
1912  aliased = !AA->isNoAlias(Loc1, Loc2);
1913  }
1914  // Store the result in the cache.
1915  result = aliased;
1916  return aliased;
1917  }
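 // Illustrative note (not part of the original file): the cache key is the
 // ordered pair (Inst1, Inst2), so a repeated query for the same pair of
 // simple memory instructions is answered from AliasCache instead of calling
 // into AA again; the same pairs tend to be re-queried many times while
 // dependencies are calculated for a scheduling region.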
1918 
1919  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
1920 
1921  /// Cache for alias results.
1922  /// TODO: consider moving this to the AliasAnalysis itself.
1923  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
1924 
1925  /// Removes an instruction from its block and eventually deletes it.
1926  /// It's like Instruction::eraseFromParent() except that the actual deletion
1927  /// is delayed until BoUpSLP is destructed.
1928  /// This is required to ensure that there are no incorrect collisions in the
1929  /// AliasCache, which can happen if a new instruction is allocated at the
1930  /// same address as a previously deleted instruction.
1931  void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
1932  auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
1933  It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
1934  }
1935 
1936  /// Temporary store for deleted instructions. Instructions will be deleted
1937  /// eventually when the BoUpSLP is destructed.
1938  DenseMap<Instruction *, bool> DeletedInstructions;
1939 
1940  /// A list of values that need to be extracted out of the tree.
1941  /// This list holds pairs of (Internal Scalar : External User). External User
1942  /// can be nullptr, which means that this Internal Scalar will be used later,
1943  /// after vectorization.
1944  UserList ExternalUses;
1945 
1946  /// Values used only by @llvm.assume calls.
1947  SmallPtrSet<const Value *, 32> EphValues;
1948 
1949  /// Holds all of the instructions that we gathered.
1950  SetVector<Instruction *> GatherSeq;
1951 
1952  /// A list of blocks that we are going to CSE.
1953  SetVector<BasicBlock *> CSEBlocks;
1954 
1955  /// Contains all scheduling relevant data for an instruction.
1956  /// A ScheduleData either represents a single instruction or a member of an
1957  /// instruction bundle (= a group of instructions which is combined into a
1958  /// vector instruction).
1959  struct ScheduleData {
1960  // The initial value for the dependency counters. It means that the
1961  // dependencies are not calculated yet.
1962  enum { InvalidDeps = -1 };
1963 
1964  ScheduleData() = default;
1965 
1966  void init(int BlockSchedulingRegionID, Value *OpVal) {
1967  FirstInBundle = this;
1968  NextInBundle = nullptr;
1969  NextLoadStore = nullptr;
1970  IsScheduled = false;
1971  SchedulingRegionID = BlockSchedulingRegionID;
1972  UnscheduledDepsInBundle = UnscheduledDeps;
1973  clearDependencies();
1974  OpValue = OpVal;
1975  TE = nullptr;
1976  Lane = -1;
1977  }
1978 
1979  /// Returns true if the dependency information has been calculated.
1980  bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
1981 
1982  /// Returns true for single instructions and for bundle representatives
1983  /// (= the head of a bundle).
1984  bool isSchedulingEntity() const { return FirstInBundle == this; }
1985 
1986  /// Returns true if it represents an instruction bundle and not only a
1987  /// single instruction.
1988  bool isPartOfBundle() const {
1989  return NextInBundle != nullptr || FirstInBundle != this;
1990  }
1991 
1992  /// Returns true if it is ready for scheduling, i.e. it has no more
1993  /// unscheduled depending instructions/bundles.
1994  bool isReady() const {
1995  assert(isSchedulingEntity() &&
1996  "can't consider non-scheduling entity for ready list");
1997  return UnscheduledDepsInBundle == 0 && !IsScheduled;
1998  }
1999 
2000  /// Modifies the number of unscheduled dependencies, also updating it for
2001  /// the whole bundle.
2002  int incrementUnscheduledDeps(int Incr) {
2003  UnscheduledDeps += Incr;
2004  return FirstInBundle->UnscheduledDepsInBundle += Incr;
2005  }
2006 
2007  /// Sets the number of unscheduled dependencies to the number of
2008  /// dependencies.
2009  void resetUnscheduledDeps() {
2010  incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
2011  }
2012 
2013  /// Clears all dependency information.
2014  void clearDependencies() {
2015  Dependencies = InvalidDeps;
2016  resetUnscheduledDeps();
2017  MemoryDependencies.clear();
2018  }
2019 
2020  void dump(raw_ostream &os) const {
2021  if (!isSchedulingEntity()) {
2022  os << "/ " << *Inst;
2023  } else if (NextInBundle) {
2024  os << '[' << *Inst;
2025  ScheduleData *SD = NextInBundle;
2026  while (SD) {
2027  os << ';' << *SD->Inst;
2028  SD = SD->NextInBundle;
2029  }
2030  os << ']';
2031  } else {
2032  os << *Inst;
2033  }
2034  }
2035 
2036  Instruction *Inst = nullptr;
2037 
2038  /// Points to the head in an instruction bundle (and always to this for
2039  /// single instructions).
2040  ScheduleData *FirstInBundle = nullptr;
2041 
2042  /// Single linked list of all instructions in a bundle. Null if it is a
2043  /// single instruction.
2044  ScheduleData *NextInBundle = nullptr;
2045 
2046  /// Single linked list of all memory instructions (e.g. load, store, call)
2047  /// in the block - until the end of the scheduling region.
2048  ScheduleData *NextLoadStore = nullptr;
2049 
2050  /// The dependent memory instructions.
2051  /// This list is derived on demand in calculateDependencies().
2052  SmallVector<ScheduleData *, 4> MemoryDependencies;
2053 
2054  /// This ScheduleData is in the current scheduling region if this matches
2055  /// the current SchedulingRegionID of BlockScheduling.
2056  int SchedulingRegionID = 0;
2057 
2058  /// Used for getting a "good" final ordering of instructions.
2059  int SchedulingPriority = 0;
2060 
2061  /// The number of dependencies: the number of users of the instruction
2062  /// plus the number of dependent memory instructions (if any).
2063  /// This value is calculated on demand.
2064  /// If InvalidDeps, the number of dependencies is not calculated yet.
2065  int Dependencies = InvalidDeps;
2066 
2067  /// The number of dependencies minus the number of dependencies of scheduled
2068  /// instructions. As soon as this is zero, the instruction/bundle gets ready
2069  /// for scheduling.
2070  /// Note that this is negative as long as Dependencies is not calculated.
2071  int UnscheduledDeps = InvalidDeps;
2072 
2073  /// The sum of UnscheduledDeps in a bundle. Equals UnscheduledDeps for
2074  /// single instructions.
2075  int UnscheduledDepsInBundle = InvalidDeps;
2076 
2077  /// True if this instruction is scheduled (or considered as scheduled in the
2078  /// dry-run).
2079  bool IsScheduled = false;
2080 
2081  /// Opcode of the current instruction in the schedule data.
2082  Value *OpValue = nullptr;
2083 
2084  /// The TreeEntry that this instruction corresponds to.
2085  TreeEntry *TE = nullptr;
2086 
2087  /// The lane of this node in the TreeEntry.
2088  int Lane = -1;
2089  };
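 // Illustrative example (not part of the original file): if an instruction in
 // the region has two in-region users and one dependent memory instruction,
 // its Dependencies is 3. Each time one of those dependent instructions is
 // scheduled, schedule() calls incrementUnscheduledDeps(-1) on this node;
 // once UnscheduledDepsInBundle reaches 0, the node (or its whole bundle)
 // satisfies isReady() and is placed on the ready-list.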
2090 
2091 #ifndef NDEBUG
2092  friend inline raw_ostream &operator<<(raw_ostream &os,
2093  const BoUpSLP::ScheduleData &SD) {
2094  SD.dump(os);
2095  return os;
2096  }
2097 #endif
2098 
2099  friend struct GraphTraits<BoUpSLP *>;
2100  friend struct DOTGraphTraits<BoUpSLP *>;
2101 
2102  /// Contains all scheduling data for a basic block.
2103  struct BlockScheduling {
2104  BlockScheduling(BasicBlock *BB)
2105  : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
2106 
2107  void clear() {
2108  ReadyInsts.clear();
2109  ScheduleStart = nullptr;
2110  ScheduleEnd = nullptr;
2111  FirstLoadStoreInRegion = nullptr;
2112  LastLoadStoreInRegion = nullptr;
2113 
2114  // Reduce the maximum schedule region size by the size of the
2115  // previous scheduling run.
2116  ScheduleRegionSizeLimit -= ScheduleRegionSize;
2117  if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
2118  ScheduleRegionSizeLimit = MinScheduleRegionSize;
2119  ScheduleRegionSize = 0;
2120 
2121  // Make a new scheduling region, i.e. all existing ScheduleData is not
2122  // in the new region yet.
2123  ++SchedulingRegionID;
2124  }
2125 
2126  ScheduleData *getScheduleData(Value *V) {
2127  ScheduleData *SD = ScheduleDataMap[V];
2128  if (SD && SD->SchedulingRegionID == SchedulingRegionID)
2129  return SD;
2130  return nullptr;
2131  }
2132 
2133  ScheduleData *getScheduleData(Value *V, Value *Key) {
2134  if (V == Key)
2135  return getScheduleData(V);
2136  auto I = ExtraScheduleDataMap.find(V);
2137  if (I != ExtraScheduleDataMap.end()) {
2138  ScheduleData *SD = I->second[Key];
2139  if (SD && SD->SchedulingRegionID == SchedulingRegionID)
2140  return SD;
2141  }
2142  return nullptr;
2143  }
2144 
2145  bool isInSchedulingRegion(ScheduleData *SD) const {
2146  return SD->SchedulingRegionID == SchedulingRegionID;
2147  }
2148 
2149  /// Marks an instruction as scheduled and puts all dependent ready
2150  /// instructions into the ready-list.
2151  template <typename ReadyListType>
2152  void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
2153  SD->IsScheduled = true;
2154  LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
2155 
2156  ScheduleData *BundleMember = SD;
2157  while (BundleMember) {
2158  if (BundleMember->Inst != BundleMember->OpValue) {
2159  BundleMember = BundleMember->NextInBundle;
2160  continue;
2161  }
2162  // Handle the def-use chain dependencies.
2163 
2164  // Decrement the unscheduled counter and insert to ready list if ready.
2165  auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
2166  doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
2167  if (OpDef && OpDef->hasValidDependencies() &&
2168  OpDef->incrementUnscheduledDeps(-1) == 0) {
2169  // There are no more unscheduled dependencies after
2170  // decrementing, so we can put the dependent instruction
2171  // into the ready list.
2172  ScheduleData *DepBundle = OpDef->FirstInBundle;
2173  assert(!DepBundle->IsScheduled &&
2174  "already scheduled bundle gets ready");
2175  ReadyList.insert(DepBundle);
2176  LLVM_DEBUG(dbgs()
2177  << "SLP: gets ready (def): " << *DepBundle << "\n");
2178  }
2179  });
2180  };
2181 
2182  // If BundleMember is a vector bundle, its operands may have been
2183  // reordered during buildTree(). We therefore need to get its operands
2184  // through the TreeEntry.
2185  if (TreeEntry *TE = BundleMember->TE) {
2186  int Lane = BundleMember->Lane;
2187  assert(Lane >= 0 && "Lane not set");
2188 
2189  // Since the vectorization tree is being built recursively, this assertion
2190  // ensures that the tree entry has all operands set before reaching
2191  // this code. A couple of known exceptions are extracts, whose
2192  // second (immediate) operand is not added. Since
2193  // immediates do not affect scheduler behavior, this is considered
2194  // okay.
2195  auto *In = TE->getMainOp();
2196  assert(In &&
2197  (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
2198  In->getNumOperands() == TE->getNumOperands()) &&
2199  "Missed TreeEntry operands?");
2200  (void)In; // fake use to avoid build failure when assertions disabled
2201 
2202  for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
2203  OpIdx != NumOperands; ++OpIdx)
2204  if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
2205  DecrUnsched(I);
2206  } else {
2207  // If BundleMember is a stand-alone instruction, no operand reordering
2208  // has taken place, so we directly access its operands.
2209  for (Use &U : BundleMember->Inst->operands())
2210  if (auto *I = dyn_cast<Instruction>(U.get()))
2211  DecrUnsched(I);
2212  }
2213  // Handle the memory dependencies.
2214  for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
2215  if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
2216  // There are no more unscheduled dependencies after decrementing,
2217  // so we can put the dependent instruction into the ready list.
2218  ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
2219  assert(!DepBundle->IsScheduled &&
2220  "already scheduled bundle gets ready");
2221  ReadyList.insert(DepBundle);
2222  LLVM_DEBUG(dbgs()
2223  << "SLP: gets ready (mem): " << *DepBundle << "\n");
2224  }
2225  }
2226  BundleMember = BundleMember->NextInBundle;
2227  }
2228  }
2229 
2230  void doForAllOpcodes(Value *V,
2231  function_ref<void(ScheduleData *SD)> Action) {
2232  if (ScheduleData *SD = getScheduleData(V))
2233  Action(SD);
2234  auto I = ExtraScheduleDataMap.find(V);
2235  if (I != ExtraScheduleDataMap.end())
2236  for (auto &P : I->second)
2237  if (P.second->SchedulingRegionID == SchedulingRegionID)
2238  Action(P.second);
2239  }
2240 
2241  /// Put all instructions that are ready for scheduling into the ReadyList.
2242  template <typename ReadyListType>
2243  void initialFillReadyList(ReadyListType &ReadyList) {
2244  for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2245  doForAllOpcodes(I, [&](ScheduleData *SD) {
2246  if (SD->isSchedulingEntity() && SD->isReady()) {
2247  ReadyList.insert(SD);
2248  LLVM_DEBUG(dbgs()
2249  << "SLP: initially in ready list: " << *I << "\n");
2250  }
2251  });
2252  }
2253  }
2254 
2255  /// Checks if a bundle of instructions can be scheduled, i.e. has no
2256  /// cyclic dependencies. This is only a dry-run, no instructions are
2257  /// actually moved at this stage.
2258  /// \returns the scheduling bundle. The returned Optional value is non-None
2259  /// if \p VL is allowed to be scheduled.
2261  tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
2262  const InstructionsState &S);
2263 
2264  /// Un-bundles a group of instructions.
2265  void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
2266 
2267  /// Allocates schedule data chunk.
2268  ScheduleData *allocateScheduleDataChunks();
2269 
2270  /// Extends the scheduling region so that V is inside the region.
2271  /// \returns true if the region size is within the limit.
2272  bool extendSchedulingRegion(Value *V, const InstructionsState &S);
2273 
2274  /// Initialize the ScheduleData structures for new instructions in the
2275  /// scheduling region.
2276  void initScheduleData(Instruction *FromI, Instruction *ToI,
2277  ScheduleData *PrevLoadStore,
2278  ScheduleData *NextLoadStore);
2279 
2280  /// Updates the dependency information of a bundle and of all instructions/
2281  /// bundles which depend on the original bundle.
2282  void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
2283  BoUpSLP *SLP);
2284 
2285  /// Sets all instructions in the scheduling region to un-scheduled.
2286  void resetSchedule();
2287 
2288  BasicBlock *BB;
2289 
2290  /// Simple memory allocation for ScheduleData.
2291  std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
2292 
2293  /// The size of a ScheduleData array in ScheduleDataChunks.
2294  int ChunkSize;
2295 
2296  /// The allocator position in the current chunk, which is the last entry
2297  /// of ScheduleDataChunks.
2298  int ChunkPos;
2299 
2300  /// Attaches ScheduleData to Instruction.
2301  /// Note that the mapping survives during all vectorization iterations, i.e.
2302  /// ScheduleData structures are recycled.
2303  DenseMap<Value *, ScheduleData *> ScheduleDataMap;
2304 
2305  /// Attaches ScheduleData to Instruction with the leading key.
2306  DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
2307  ExtraScheduleDataMap;
2308 
2309  struct ReadyList : SmallVector<ScheduleData *, 8> {
2310  void insert(ScheduleData *SD) { push_back(SD); }
2311  };
2312 
2313  /// The ready-list for scheduling (only used for the dry-run).
2314  ReadyList ReadyInsts;
2315 
2316  /// The first instruction of the scheduling region.
2317  Instruction *ScheduleStart = nullptr;
2318 
2319  /// The first instruction _after_ the scheduling region.
2320  Instruction *ScheduleEnd = nullptr;
2321 
2322  /// The first memory accessing instruction in the scheduling region
2323  /// (can be null).
2324  ScheduleData *FirstLoadStoreInRegion = nullptr;
2325 
2326  /// The last memory accessing instruction in the scheduling region
2327  /// (can be null).
2328  ScheduleData *LastLoadStoreInRegion = nullptr;
2329 
2330  /// The current size of the scheduling region.
2331  int ScheduleRegionSize = 0;
2332 
2333  /// The maximum size allowed for the scheduling region.
2334  int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
2335 
2336  /// The ID of the scheduling region. For a new vectorization iteration this
2337  /// is incremented, which "removes" all ScheduleData from the region.
2338  // Make sure that the initial SchedulingRegionID is greater than the
2339  // initial SchedulingRegionID in ScheduleData (which is 0).
2340  int SchedulingRegionID = 1;
2341  };
2342 
2343  /// Attaches the BlockScheduling structures to basic blocks.
2344  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
2345 
2346  /// Performs the "real" scheduling. Done before vectorization is actually
2347  /// performed in a basic block.
2348  void scheduleBlock(BlockScheduling *BS);
2349 
2350  /// List of users to ignore during scheduling and that don't need extracting.
2351  ArrayRef<Value *> UserIgnoreList;
2352 
2353  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
2354  /// sorted SmallVectors of unsigned.
2355  struct OrdersTypeDenseMapInfo {
2356  static OrdersType getEmptyKey() {
2357  OrdersType V;
2358  V.push_back(~1U);
2359  return V;
2360  }
2361 
2362  static OrdersType getTombstoneKey() {
2363  OrdersType V;
2364  V.push_back(~2U);
2365  return V;
2366  }
2367 
2368  static unsigned getHashValue(const OrdersType &V) {
2369  return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
2370  }
2371 
2372  static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
2373  return LHS == RHS;
2374  }
2375  };
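 // Illustrative note (not part of the original file): the reserved keys {~1U}
 // and {~2U} can never collide with a real order, since real orders only
 // contain lane positions, which are far smaller than those sentinel values.
 // This is what allows OrdersType to be used directly as a DenseMap key, as in
 // the NumOpsWantToKeepOrder counter below.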
2376 
2377  /// Contains orders of operations along with the number of bundles that have
2378  /// operations in this order. It stores only those orders that require
2379  /// reordering; if reordering is not required it is counted using \a
2380  /// NumOpsWantToKeepOriginalOrder.
2381  DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
2382  /// Number of bundles that do not require reordering.
2383  unsigned NumOpsWantToKeepOriginalOrder = 0;
2384 
2385  // Analysis and block reference.
2386  Function *F;
2387  ScalarEvolution *SE;
2388  TargetTransformInfo *TTI;
2389  TargetLibraryInfo *TLI;
2390  AAResults *AA;
2391  LoopInfo *LI;
2392  DominatorTree *DT;
2393  AssumptionCache *AC;
2394  DemandedBits *DB;
2395  const DataLayout *DL;
2396  OptimizationRemarkEmitter *ORE;
2397 
2398  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
2399  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
2400 
2401  /// Instruction builder to construct the vectorized tree.
2402  IRBuilder<> Builder;
2403 
2404  /// A map of scalar integer values to the smallest bit width with which they
2405  /// can legally be represented. The values map to (width, signed) pairs,
2406  /// where "width" indicates the minimum bit width and "signed" is True if the
2407  /// value must be signed-extended, rather than zero-extended, back to its
2408  /// original width.
2409  MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
2410 };
2411 
2412 } // end namespace slpvectorizer
2413 
2414 template <> struct GraphTraits<BoUpSLP *> {
2415  using TreeEntry = BoUpSLP::TreeEntry;
2416 
2417  /// NodeRef has to be a pointer per the GraphWriter.
2418  using NodeRef = TreeEntry *;
2419 
2420  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
2421 
2422  /// Add the VectorizableTree to the index iterator to be able to return
2423  /// TreeEntry pointers.
2424  struct ChildIteratorType
2425  : public iterator_adaptor_base<
2426  ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
2427  ContainerTy &VectorizableTree;
2428 
2429  ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
2430  ContainerTy &VT)
2431  : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
2432 
2433  NodeRef operator*() { return I->UserTE; }
2434  };
2435 
2436  static NodeRef getEntryNode(BoUpSLP &R) {
2437  return R.VectorizableTree[0].get();
2438  }
2439 
2440  static ChildIteratorType child_begin(NodeRef N) {
2441  return {N->UserTreeIndices.begin(), N->Container};
2442  }
2443 
2444  static ChildIteratorType child_end(NodeRef N) {
2445  return {N->UserTreeIndices.end(), N->Container};
2446  }
2447 
2448  /// For the node iterator we just need to turn the TreeEntry iterator into a
2449  /// TreeEntry* iterator so that it dereferences to NodeRef.
2450  class nodes_iterator {
2451  using ItTy = ContainerTy::iterator;
2452  ItTy It;
2453 
2454  public:
2455  nodes_iterator(const ItTy &It2) : It(It2) {}
2456  NodeRef operator*() { return It->get(); }
2457  nodes_iterator operator++() {
2458  ++It;
2459  return *this;
2460  }
2461  bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
2462  };
2463 
2464  static nodes_iterator nodes_begin(BoUpSLP *R) {
2465  return nodes_iterator(R->VectorizableTree.begin());
2466  }
2467 
2468  static nodes_iterator nodes_end(BoUpSLP *R) {
2469  return nodes_iterator(R->VectorizableTree.end());
2470  }
2471 
2472  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
2473 };
2474 
2475 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
2476  using TreeEntry = BoUpSLP::TreeEntry;
2477 
2478  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
2479 
2480  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
2481  std::string Str;
2482  raw_string_ostream OS(Str);
2483  if (isSplat(Entry->Scalars)) {
2484  OS << "<splat> " << *Entry->Scalars[0];
2485  return Str;
2486  }
2487  for (auto V : Entry->Scalars) {
2488  OS << *V;
2489  if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
2490  return EU.Scalar == V;
2491  }))
2492  OS << " <extract>";
2493  OS << "\n";
2494  }
2495  return Str;
2496  }
2497 
2498  static std::string getNodeAttributes(const TreeEntry *Entry,
2499  const BoUpSLP *) {
2500  if (Entry->State == TreeEntry::NeedToGather)
2501  return "color=red";
2502  return "";
2503  }
2504 };
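 // Illustrative note (not part of the original file): with these GraphTraits
 // and DOTGraphTraits specializations, the SLP graph can be rendered through
 // LLVM's generic graph machinery, e.g. (hypothetical debugging snippet,
 // TheSLP being a BoUpSLP instance):
 //   #include "llvm/Support/GraphWriter.h"
 //   llvm::ViewGraph(&TheSLP, "slp-tree");
 // NeedToGather entries are drawn in red per getNodeAttributes() above.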
2505 
2506 } // end namespace llvm
2507 
2508 BoUpSLP::~BoUpSLP() {
2509  for (const auto &Pair : DeletedInstructions) {
2510  // Replace operands of ignored instructions with Undefs if they were
2511  // marked for deletion.
2512  if (Pair.getSecond()) {
2513  Value *Undef = UndefValue::get(Pair.getFirst()->getType());
2514  Pair.getFirst()->replaceAllUsesWith(Undef);
2515  }
2516  Pair.getFirst()->dropAllReferences();
2517  }
2518  for (const auto &Pair : DeletedInstructions) {
2519  assert(Pair.getFirst()->use_empty() &&
2520  "trying to erase instruction with users.");
2521  Pair.getFirst()->eraseFromParent();
2522  }
2523 #ifdef EXPENSIVE_CHECKS
2524  // If we could guarantee that this call is not extremely slow, we could
2525  // remove the ifdef limitation (see PR47712).
2526  assert(!verifyFunction(*F, &dbgs()));
2527 #endif
2528 }
2529 
2530 void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
2531  for (auto *V : AV) {
2532  if (auto *I = dyn_cast<Instruction>(V))
2533  eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
2534  }
2535 }
2536 
2537 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
2538  ArrayRef<Value *> UserIgnoreLst) {
2539  ExtraValueToDebugLocsMap ExternallyUsedValues;
2540  buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
2541 }
2542 
2543 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
2544  ExtraValueToDebugLocsMap &ExternallyUsedValues,
2545  ArrayRef<Value *> UserIgnoreLst) {
2546  deleteTree();
2547  UserIgnoreList = UserIgnoreLst;
2548  if (!allSameType(Roots))
2549  return;
2550  buildTree_rec(Roots, 0, EdgeInfo());
2551 
2552  // Collect the values that we need to extract from the tree.
2553  for (auto &TEPtr : VectorizableTree) {
2554  TreeEntry *Entry = TEPtr.get();
2555 
2556  // No need to handle users of gathered values.
2557  if (Entry->State == TreeEntry::NeedToGather)
2558  continue;
2559 
2560  // For each lane:
2561  for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2562  Value *Scalar = Entry->Scalars[Lane];
2563  int FoundLane = Lane;
2564  if (!Entry->ReuseShuffleIndices.empty()) {
2565  FoundLane =
2566  std::distance(Entry->ReuseShuffleIndices.begin(),
2567  llvm::find(Entry->ReuseShuffleIndices, FoundLane));
2568  }
2569 
2570  // Check if the scalar is externally used as an extra arg.
2571  auto ExtI = ExternallyUsedValues.find(Scalar);
2572  if (ExtI != ExternallyUsedValues.end()) {
2573  LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
2574  << Lane << " from " << *Scalar << ".\n");
2575  ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
2576  }
2577  for (User *U : Scalar->users()) {
2578  LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
2579 
2580  Instruction *UserInst = dyn_cast<Instruction>(U);
2581  if (!UserInst)
2582  continue;
2583 
2584  // Skip in-tree scalars that become vectors
2585  if (TreeEntry *UseEntry = getTreeEntry(U)) {
2586  Value *UseScalar = UseEntry->Scalars[0];
2587  // Some in-tree scalars will remain as scalar in vectorized
2588  // instructions. If that is the case, the one in Lane 0 will
2589  // be used.
2590  if (UseScalar != U ||
2591  UseEntry->State == TreeEntry::ScatterVectorize ||
2592  !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
2593  LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
2594  << ".\n");
2595  assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
2596  continue;
2597  }
2598  }
2599 
2600  // Ignore users in the user ignore list.
2601  if (is_contained(UserIgnoreList, UserInst))
2602  continue;
2603 
2604  LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
2605  << Lane << " from " << *Scalar << ".\n");
2606  ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
2607  }
2608  }
2609  }
2610 }
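 // Illustrative example (not part of the original file; the names are
 // hypothetical): if the scalar %a in lane 1 of a vectorized entry is also
 // used by an instruction outside the tree (or appears in
 // ExternallyUsedValues), the loop above records
 // ExternalUser(%a, <user or nullptr>, 1); the vectorizer can later extract
 // lane 1 from the vectorized value to satisfy that user instead of keeping
 // the scalar computation alive.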
2611 
2612 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
2613  const EdgeInfo &UserTreeIdx) {
2614  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
2615 
2616  InstructionsState S = getSameOpcode(VL);
2617  if (Depth == RecursionMaxDepth) {
2618  LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
2619  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2620  return;
2621  }
2622 
2623  // Don't handle vectors.
2624  if (S.OpValue->getType()->isVectorTy()) {
2625  LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
2626  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2627  return;
2628  }
2629 
2630  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
2631  if (SI->getValueOperand()->getType()->isVectorTy()) {
2632  LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
2633  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2634  return;
2635  }
2636 
2637  // If all of the operands are identical or constant we have a simple solution.
2638  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
2639  LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
2640  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2641  return;
2642  }
2643 
2644  // We now know that this is a vector of instructions of the same type from
2645  // the same block.
2646 
2647  // Don't vectorize ephemeral values.
2648  for (Value *V : VL) {
2649  if (EphValues.count(V)) {
2650  LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
2651  << ") is ephemeral.\n");
2652  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2653  return;
2654  }
2655  }
2656 
2657  // Check if this is a duplicate of another entry.
2658  if (TreeEntry *E = getTreeEntry(S.OpValue)) {
2659  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
2660  if (!E->isSame(VL)) {
2661  LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
2662  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2663  return;
2664  }
2665  // Record the reuse of the tree node. FIXME: currently this is only used to
2666  // properly draw the graph rather than for the actual vectorization.
2667  E->UserTreeIndices.push_back(UserTreeIdx);
2668  LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
2669  << ".\n");
2670  return;
2671  }
2672 
2673  // Check that none of the instructions in the bundle are already in the tree.
2674  for (Value *V : VL) {
2675  auto *I = dyn_cast<Instruction>(V);
2676  if (!I)
2677  continue;
2678  if (getTreeEntry(I)) {
2679  LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
2680  << ") is already in tree.\n");
2681  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2682  return;
2683  }
2684  }
2685 
2686  // If any of the scalars is marked as a value that needs to stay scalar, then
2687  // we need to gather the scalars.
2688  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
2689  for (Value *V : VL) {
2690  if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
2691  LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
2692  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2693  return;
2694  }
2695  }
2696 
2697  // Check that all of the users of the scalars that we want to vectorize are
2698  // schedulable.
2699  auto *VL0 = cast<Instruction>(S.OpValue);
2700  BasicBlock *BB = VL0->getParent();
2701 
2702  if (!DT->isReachableFromEntry(BB)) {
2703  // Don't go into unreachable blocks. They may contain instructions with
2704  // dependency cycles which confuse the final scheduling.
2705  LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
2706  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2707  return;
2708  }
2709 
2710  // Check that every instruction appears once in this bundle.
2711  SmallVector<unsigned, 4> ReuseShuffleIndicies;
2712  SmallVector<Value *, 4> UniqueValues;
2713  DenseMap<Value *, unsigned> UniquePositions;
2714  for (Value *V : VL) {
2715  auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
2716  ReuseShuffleIndicies.emplace_back(Res.first->second);
2717  if (Res.second)
2718  UniqueValues.emplace_back(V);
2719  }
2720  size_t NumUniqueScalarValues = UniqueValues.size();
2721  if (NumUniqueScalarValues == VL.size()) {
2722  ReuseShuffleIndicies.clear();
2723  } else {
2724  LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
2725  if (NumUniqueScalarValues <= 1 ||
2726  !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
2727  LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
2728  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
2729  return;
2730  }
2731  VL = UniqueValues;
2732  }
2733 
2734  auto &BSRef = BlocksSchedules[BB];
2735  if (!BSRef)
2736  BSRef = std::make_unique<BlockScheduling>(BB);
2737 
2738  BlockScheduling &BS = *BSRef.get();
2739 
2740  Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
2741  if (!Bundle) {
2742  LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
2743  assert((!BS.getScheduleData(VL0) ||
2744  !BS.getScheduleData(VL0)->isPartOfBundle()) &&
2745  "tryScheduleBundle should cancelScheduling on failure");
2746  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2747  ReuseShuffleIndicies);
2748  return;
2749  }
2750  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
2751 
2752  unsigned ShuffleOrOp = S.isAltShuffle() ?
2753  (unsigned) Instruction::ShuffleVector : S.getOpcode();
2754  switch (ShuffleOrOp) {
2755  case Instruction::PHI: {
2756  auto *PH = cast<PHINode>(VL0);
2757 
2758  // Check for terminator values (e.g. invoke).
2759  for (Value *V : VL)
2760  for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
2761  Instruction *Term = dyn_cast<Instruction>(
2762  cast<PHINode>(V)->getIncomingValueForBlock(
2763  PH->getIncomingBlock(I)));
2764  if (Term && Term->isTerminator()) {
2765  LLVM_DEBUG(dbgs()
2766  << "SLP: Need to swizzle PHINodes (terminator use).\n");
2767  BS.cancelScheduling(VL, VL0);
2768  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2769  ReuseShuffleIndicies);
2770  return;
2771  }
2772  }
2773 
2774  TreeEntry *TE =
2775  newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
2776  LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
2777 
2778  // Keeps the reordered operands to avoid code duplication.
2779  SmallVector<ValueList, 2> OperandsVec;
2780  for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
2781  ValueList Operands;
2782  // Prepare the operand vector.
2783  for (Value *V : VL)
2784  Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
2785  PH->getIncomingBlock(I)));
2786  TE->setOperand(I, Operands);
2787  OperandsVec.push_back(Operands);
2788  }
2789  for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
2790  buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
2791  return;
2792  }
2793  case Instruction::ExtractValue:
2794  case Instruction::ExtractElement: {
2795  OrdersType CurrentOrder;
2796  bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
2797  if (Reuse) {
2798  LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
2799  ++NumOpsWantToKeepOriginalOrder;
2800  newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2801  ReuseShuffleIndicies);
2802  // This is a special case, as it does not gather, but at the same time
2803  // we are not extending buildTree_rec() towards the operands.
2804  ValueList Op0;
2805  Op0.assign(VL.size(), VL0->getOperand(0));
2806  VectorizableTree.back()->setOperand(0, Op0);
2807  return;
2808  }
2809  if (!CurrentOrder.empty()) {
2810  LLVM_DEBUG({
2811  dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
2812  "with order";
2813  for (unsigned Idx : CurrentOrder)
2814  dbgs() << " " << Idx;
2815  dbgs() << "\n";
2816  });
2817  // Insert new order with initial value 0, if it does not exist,
2818  // otherwise return the iterator to the existing one.
2819  newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2820  ReuseShuffleIndicies, CurrentOrder);
2821  findRootOrder(CurrentOrder);
2822  ++NumOpsWantToKeepOrder[CurrentOrder];
2823  // This is a special case, as it does not gather, but at the same time
2824  // we are not extending buildTree_rec() towards the operands.
2825  ValueList Op0;
2826  Op0.assign(VL.size(), VL0->getOperand(0));
2827  VectorizableTree.back()->setOperand(0, Op0);
2828  return;
2829  }
2830  LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
2831  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2832  ReuseShuffleIndicies);
2833  BS.cancelScheduling(VL, VL0);
2834  return;
2835  }
2836  case Instruction::Load: {
2837  // Check that a vectorized load would load the same memory as a scalar
2838  // load. For example, we don't want to vectorize loads that are smaller
2839  // than 8-bit. Even though we may have a packed struct {<i2, i2, i2, i2>},
2840  // LLVM treats loading/storing it as an i8 struct. If we vectorize loads/stores
2841  // from such a struct, we read/write packed bits disagreeing with the
2842  // unvectorized version.
2843  Type *ScalarTy = VL0->getType();
2844 
2845  if (DL->getTypeSizeInBits(ScalarTy) !=
2846  DL->getTypeAllocSizeInBits(ScalarTy)) {
2847  BS.cancelScheduling(VL, VL0);
2848  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2849  ReuseShuffleIndicies);
2850  LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
2851  return;
2852  }
2853 
2854  // Make sure all loads in the bundle are simple - we can't vectorize
2855  // atomic or volatile loads.
2856  SmallVector<Value *, 4> PointerOps(VL.size());
2857  auto POIter = PointerOps.begin();
2858  for (Value *V : VL) {
2859  auto *L = cast<LoadInst>(V);
2860  if (!L->isSimple()) {
2861  BS.cancelScheduling(VL, VL0);
2862  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2863  ReuseShuffleIndicies);
2864  LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
2865  return;
2866  }
2867  *POIter = L->getPointerOperand();
2868  ++POIter;
2869  }
2870 
2871  OrdersType CurrentOrder;
2872  // Check the order of pointer operands.
2873  if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
2874  Value *Ptr0;
2875  Value *PtrN;
2876  if (CurrentOrder.empty()) {
2877  Ptr0 = PointerOps.front();
2878  PtrN = PointerOps.back();
2879  } else {
2880  Ptr0 = PointerOps[CurrentOrder.front()];
2881  PtrN = PointerOps[CurrentOrder.back()];
2882  }
2883  Optional<int> Diff = getPointersDiff(Ptr0, PtrN, *DL, *SE);
2884  // Check that the sorted loads are consecutive.
2885  if (static_cast<unsigned>(*Diff) == VL.size() - 1) {
2886  if (CurrentOrder.empty()) {
2887  // Original loads are consecutive and does not require reordering.
2888  ++NumOpsWantToKeepOriginalOrder;
2889  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
2890  UserTreeIdx, ReuseShuffleIndicies);
2891  TE->setOperandsInOrder();
2892  LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
2893  } else {
2894  // Need to reorder.
2895  TreeEntry *TE =
2896  newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2897  ReuseShuffleIndicies, CurrentOrder);
2898  TE->setOperandsInOrder();
2899  LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
2900  findRootOrder(CurrentOrder);
2901  ++NumOpsWantToKeepOrder[CurrentOrder];
2902  }
2903  return;
2904  }
2905  // Vectorizing non-consecutive loads with `llvm.masked.gather`.
2906  TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
2907  UserTreeIdx, ReuseShuffleIndicies);
2908  TE->setOperandsInOrder();
2909  buildTree_rec(PointerOps, Depth + 1, {TE, 0});
2910  LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
2911  return;
2912  }
2913 
2914  LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
2915  BS.cancelScheduling(VL, VL0);
2916  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2917  ReuseShuffleIndicies);
2918  return;
2919  }
2920  case Instruction::ZExt:
2921  case Instruction::SExt:
2922  case Instruction::FPToUI:
2923  case Instruction::FPToSI:
2924  case Instruction::FPExt:
2925  case Instruction::PtrToInt:
2926  case Instruction::IntToPtr:
2927  case Instruction::SIToFP:
2928  case Instruction::UIToFP:
2929  case Instruction::Trunc:
2930  case Instruction::FPTrunc:
2931  case Instruction::BitCast: {
2932  Type *SrcTy = VL0->getOperand(0)->getType();
2933  for (Value *V : VL) {
2934  Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
2935  if (Ty != SrcTy || !isValidElementType(Ty)) {
2936  BS.cancelScheduling(VL, VL0);
2937  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2938  ReuseShuffleIndicies);
2939  LLVM_DEBUG(dbgs()
2940  << "SLP: Gathering casts with different src types.\n");
2941  return;
2942  }
2943  }
2944  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2945  ReuseShuffleIndicies);
2946  LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
2947 
2948  TE->setOperandsInOrder();
2949  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
2950  ValueList Operands;
2951  // Prepare the operand vector.
2952  for (Value *V : VL)
2953  Operands.push_back(cast<Instruction>(V)->getOperand(i));
2954 
2955  buildTree_rec(Operands, Depth + 1, {TE, i});
2956  }
2957  return;
2958  }
2959  case Instruction::ICmp:
2960  case Instruction::FCmp: {
2961  // Check that all of the compares have the same predicate.
2962  CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
2963  CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
2964  Type *ComparedTy = VL0->getOperand(0)->getType();
2965  for (Value *V : VL) {
2966  CmpInst *Cmp = cast<CmpInst>(V);
2967  if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
2968  Cmp->getOperand(0)->getType() != ComparedTy) {
2969  BS.cancelScheduling(VL, VL0);
2970  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
2971  ReuseShuffleIndicies);
2972  LLVM_DEBUG(dbgs()
2973  << "SLP: Gathering cmp with different predicate.\n");
2974  return;
2975  }
2976  }
2977 
2978  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
2979  ReuseShuffleIndicies);
2980  LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
2981 
2982  ValueList Left, Right;
2983  if (cast<CmpInst>(VL0)->isCommutative()) {
2984  // Commutative predicate - collect + sort operands of the instructions
2985  // so that each side is more likely to have the same opcode.
2986  assert(P0 == SwapP0 && "Commutative Predicate mismatch");
2987  reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
2988  } else {
2989  // Collect operands - commute if it uses the swapped predicate.
2990  for (Value *V : VL) {
2991  auto *Cmp = cast<CmpInst>(V);
2992  Value *LHS = Cmp->getOperand(0);
2993  Value *RHS = Cmp->getOperand(1);
2994  if (Cmp->getPredicate() != P0)
2995  std::swap(LHS, RHS);
2996  Left.push_back(LHS);
2997  Right.push_back(RHS);
2998  }
2999  }
3000  TE->setOperand(0, Left);
3001  TE->setOperand(1, Right);
3002  buildTree_rec(Left, Depth + 1, {TE, 0});
3003  buildTree_rec(Right, Depth + 1, {TE, 1});
3004  return;
3005  }
3006  case Instruction::Select:
3007  case Instruction::FNeg:
3008  case Instruction::Add:
3009  case Instruction::FAdd:
3010  case Instruction::Sub:
3011  case Instruction::FSub:
3012  case Instruction::Mul:
3013  case Instruction::FMul:
3014  case Instruction::UDiv:
3015  case Instruction::SDiv:
3016  case Instruction::FDiv:
3017  case Instruction::URem:
3018  case Instruction::SRem:
3019  case Instruction::FRem:
3020  case Instruction::Shl:
3021  case Instruction::LShr:
3022  case Instruction::AShr:
3023  case Instruction::And:
3024  case Instruction::Or:
3025  case Instruction::Xor: {
3026  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3027  ReuseShuffleIndicies);
3028  LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
3029 
3030  // Sort operands of the instructions so that each side is more likely to
3031  // have the same opcode.
3032  if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
3033  ValueList Left, Right;
3034  reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
3035  TE->setOperand(0, Left);
3036  TE->setOperand(1, Right);
3037  buildTree_rec(Left, Depth + 1, {TE, 0});
3038  buildTree_rec(Right, Depth + 1, {TE, 1});
3039  return;
3040  }
3041 
3042  TE->setOperandsInOrder();
3043  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
3044  ValueList Operands;
3045  // Prepare the operand vector.
3046  for (Value *V : VL)
3047  Operands.push_back(cast<Instruction>(V)->getOperand(i));
3048 
3049  buildTree_rec(Operands, Depth + 1, {TE, i});
3050  }
3051  return;
3052  }
3053  case Instruction::GetElementPtr: {
3054  // We don't combine GEPs with complicated (nested) indexing.
3055  for (Value *V : VL) {
3056  if (cast<Instruction>(V)->getNumOperands() != 2) {
3057  LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
3058  BS.cancelScheduling(VL, VL0);
3059  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3060  ReuseShuffleIndicies);
3061  return;
3062  }
3063  }
3064 
3065  // We can't combine several GEPs into one vector if they operate on
3066  // different types.
3067  Type *Ty0 = VL0->getOperand(0)->getType();
3068  for (Value *V : VL) {
3069  Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
3070  if (Ty0 != CurTy) {
3071  LLVM_DEBUG(dbgs()
3072  << "SLP: not-vectorizable GEP (different types).\n");
3073  BS.cancelScheduling(VL, VL0);
3074  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3075  ReuseShuffleIndicies);
3076  return;
3077  }
3078  }
3079 
3080  // We don't combine GEPs with non-constant indexes.
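 // For illustration (hypothetical IR): "getelementptr i32, i32* %p, i64 4"
 // has a constant index and can be bundled, while
 // "getelementptr i32, i32* %p, i64 %i" forces a gather.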
3081  Type *Ty1 = VL0->getOperand(1)->getType();
3082  for (Value *V : VL) {
3083  auto Op = cast<Instruction>(V)->getOperand(1);
3084  if (!isa<ConstantInt>(Op) ||
3085  (Op->getType() != Ty1 &&
3086  Op->getType()->getScalarSizeInBits() >
3087  DL->getIndexSizeInBits(
3088  V->getType()->getPointerAddressSpace()))) {
3089  LLVM_DEBUG(dbgs()
3090  << "SLP: not-vectorizable GEP (non-constant indexes).\n");
3091  BS.cancelScheduling(VL, VL0);
3092  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3093  ReuseShuffleIndicies);
3094  return;
3095  }
3096  }
3097 
3098  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3099  ReuseShuffleIndicies);
3100  LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
3101  TE->setOperandsInOrder();
3102  for (unsigned i = 0, e = 2; i < e; ++i) {
3103  ValueList Operands;
3104  // Prepare the operand vector.
3105  for (Value *V : VL)
3106  Operands.push_back(cast<Instruction>(V)->getOperand(i));
3107 
3108  buildTree_rec(Operands, Depth + 1, {TE, i});
3109  }
3110  return;
3111  }
3112  case Instruction::Store: {
3113  // Check if the stores are consecutive or if we need to swizzle them.
3114  llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
3115  // Avoid types that are padded when being allocated as scalars, while
3116  // being packed together in a vector (such as i1).
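 // For illustration: i1 has a type size of 1 bit but an alloc size of 8 bits,
 // so the scalar stores cover more memory than the packed vector store would.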
3117  if (DL->getTypeSizeInBits(ScalarTy) !=
3118  DL->getTypeAllocSizeInBits(ScalarTy)) {
3119  BS.cancelScheduling(VL, VL0);
3120  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3121  ReuseShuffleIndicies);
3122  LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
3123  return;
3124  }
3125  // Make sure all stores in the bundle are simple - we can't vectorize
3126  // atomic or volatile stores.
3127  SmallVector<Value *, 4> PointerOps(VL.size());
3128  ValueList Operands(VL.size());
3129  auto POIter = PointerOps.begin();
3130  auto OIter = Operands.begin();
3131  for (Value *V : VL) {
3132  auto *SI = cast<StoreInst>(V);
3133  if (!SI->isSimple()) {
3134  BS.cancelScheduling(VL, VL0);
3135  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3136  ReuseShuffleIndicies);
3137  LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
3138  return;
3139  }
3140  *POIter = SI->getPointerOperand();
3141  *OIter = SI->getValueOperand();
3142  ++POIter;
3143  ++OIter;
3144  }
3145 
3146  OrdersType CurrentOrder;
3147  // Check the order of pointer operands.
3148  if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
3149  Value *Ptr0;
3150  Value *PtrN;
3151  if (CurrentOrder.empty()) {
3152  Ptr0 = PointerOps.front();
3153  PtrN = PointerOps.back();
3154  } else {
3155  Ptr0 = PointerOps[CurrentOrder.front()];
3156  PtrN = PointerOps[CurrentOrder.back()];
3157  }
3158  Optional<int> Dist = getPointersDiff(Ptr0, PtrN, *DL, *SE);
3159  // Check that the sorted pointer operands are consecutive.
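 // For illustration (hypothetical bundle): with VL.size() == 4, a distance of
 // 3 elements between the first and last sorted pointers means the four
 // stores form one contiguous block.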
3160  if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
3161  if (CurrentOrder.empty()) {
3162  // Original stores are consecutive and do not require reordering.
3163  ++NumOpsWantToKeepOriginalOrder;
3164  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
3165  UserTreeIdx, ReuseShuffleIndicies);
3166  TE->setOperandsInOrder();
3167  buildTree_rec(Operands, Depth + 1, {TE, 0});
3168  LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
3169  } else {
3170  TreeEntry *TE =
3171  newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3172  ReuseShuffleIndicies, CurrentOrder);
3173  TE->setOperandsInOrder();
3174  buildTree_rec(Operands, Depth + 1, {TE, 0});
3175  LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
3176  findRootOrder(CurrentOrder);
3177  ++NumOpsWantToKeepOrder[CurrentOrder];
3178  }
3179  return;
3180  }
3181  }
3182 
3183  BS.cancelScheduling(VL, VL0);
3184  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3185  ReuseShuffleIndicies);
3186  LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
3187  return;
3188  }
3189  case Instruction::Call: {
3190  // Check if the calls are all to the same vectorizable intrinsic or
3191  // library function.
3192  CallInst *CI = cast<CallInst>(VL0);
3193  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3194 
3195  VFShape Shape = VFShape::get(
3196  *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
3197  false /*HasGlobalPred*/);
3198  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3199 
3200  if (!VecFunc && !isTriviallyVectorizable(ID)) {
3201  BS.cancelScheduling(VL, VL0);
3202  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3203  ReuseShuffleIndicies);
3204  LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
3205  return;
3206  }
3207  Function *F = CI->getCalledFunction();
3208  unsigned NumArgs = CI->getNumArgOperands();
3209  SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
3210  for (unsigned j = 0; j != NumArgs; ++j)
3211  if (hasVectorInstrinsicScalarOpd(ID, j))
3212  ScalarArgs[j] = CI->getArgOperand(j);
3213  for (Value *V : VL) {
3214  CallInst *CI2 = dyn_cast<CallInst>(V);
3215  if (!CI2 || CI2->getCalledFunction() != F ||
3216  getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
3217  (VecFunc &&
3218  VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
3219  !CI->hasIdenticalOperandBundleSchema(*CI2)) {
3220  BS.cancelScheduling(VL, VL0);
3221  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3222  ReuseShuffleIndicies);
3223  LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
3224  << "\n");
3225  return;
3226  }
3227  // Some intrinsics have scalar arguments, and those must be the same
3228  // across the bundle for the calls to be vectorized.
3229  for (unsigned j = 0; j != NumArgs; ++j) {
3230  if (hasVectorInstrinsicScalarOpd(ID, j)) {
3231  Value *A1J = CI2->getArgOperand(j);
3232  if (ScalarArgs[j] != A1J) {
3233  BS.cancelScheduling(VL, VL0);
3234  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3235  ReuseShuffleIndicies);
3236  LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
3237  << " argument " << ScalarArgs[j] << "!=" << A1J
3238  << "\n");
3239  return;
3240  }
3241  }
3242  }
3243  // Verify that the bundle operands are identical between the two calls.
3244  if (CI->hasOperandBundles() &&
3245  !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
3246  CI->op_begin() + CI->getBundleOperandsEndIndex(),
3247  CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
3248  BS.cancelScheduling(VL, VL0);
3249  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3250  ReuseShuffleIndicies);
3251  LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
3252  << *CI << "!=" << *V << '\n');
3253  return;
3254  }
3255  }
3256 
3257  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3258  ReuseShuffleIndicies);
3259  TE->setOperandsInOrder();
3260  for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
3261  ValueList Operands;
3262  // Prepare the operand vector.
3263  for (Value *V : VL) {
3264  auto *CI2 = cast<CallInst>(V);
3265  Operands.push_back(CI2->getArgOperand(i));
3266  }
3267  buildTree_rec(Operands, Depth + 1, {TE, i});
3268  }
3269  return;
3270  }
3271  case Instruction::ShuffleVector: {
3272  // If this is not an alternate sequence of opcodes like add-sub
3273  // then do not vectorize this instruction.
3274  if (!S.isAltShuffle()) {
3275  BS.cancelScheduling(VL, VL0);
3276  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3277  ReuseShuffleIndicies);
3278  LLVM_DEBUG(dbgs() << "SLP: ShuffleVector is not vectorized.\n");
3279  return;
3280  }
3281  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3282  ReuseShuffleIndicies);
3283  LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
3284 
3285  // Reorder operands if reordering would enable vectorization.
3286  if (isa<BinaryOperator>(VL0)) {
3287  ValueList Left, Right;
3288  reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
3289  TE->setOperand(0, Left);
3290  TE->setOperand(1, Right);
3291  buildTree_rec(Left, Depth + 1, {TE, 0});
3292  buildTree_rec(Right, Depth + 1, {TE, 1});
3293  return;
3294  }
3295 
3296  TE->setOperandsInOrder();
3297  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
3298  ValueList Operands;
3299  // Prepare the operand vector.
3300  for (Value *V : VL)
3301  Operands.push_back(cast<Instruction>(V)->getOperand(i));
3302 
3303  buildTree_rec(Operands, Depth + 1, {TE, i});
3304  }
3305  return;
3306  }
3307  default:
3308  BS.cancelScheduling(VL, VL0);
3309  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3310  ReuseShuffleIndicies);
3311  LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
3312  return;
3313  }
3314 }
3315 
3316 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
3317  unsigned N = 1;
3318  Type *EltTy = T;
3319 
3320  while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
3321  isa<VectorType>(EltTy)) {
3322  if (auto *ST = dyn_cast<StructType>(EltTy)) {
3323  // Check that struct is homogeneous.
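 // For illustration: { float, float } is treated as 2 x float, while
 // { float, i32 } is rejected as non-homogeneous.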
3324  for (const auto *Ty : ST->elements())
3325  if (Ty != *ST->element_begin())
3326  return 0;
3327  N *= ST->getNumElements();
3328  EltTy = *ST->element_begin();
3329  } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
3330  N *= AT->getNumElements();
3331  EltTy = AT->getElementType();
3332  } else {
3333  auto *VT = cast<FixedVectorType>(EltTy);
3334  N *= VT->getNumElements();
3335  EltTy = VT->getElementType();
3336  }
3337  }
3338 
3339  if (!isValidElementType(EltTy))
3340  return 0;
3341  uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
3342  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
3343  return 0;
3344  return N;
3345 }
3346 
3347 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
3348  SmallVectorImpl<unsigned> &CurrentOrder) const {
3349  Instruction *E0 = cast<Instruction>(OpValue);
3350  assert(E0->getOpcode() == Instruction::ExtractElement ||
3351  E0->getOpcode() == Instruction::ExtractValue);
3352  assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
3353  // Check if all of the extracts come from the same vector and from the
3354  // correct offset.
3355  Value *Vec = E0->getOperand(0);
3356 
3357  CurrentOrder.clear();
3358 
3359  // We have to extract from a vector/aggregate with the same number of elements.
3360  unsigned NElts;
3361  if (E0->getOpcode() == Instruction::ExtractValue) {
3362  const DataLayout &DL = E0->getModule()->getDataLayout();
3363  NElts = canMapToVector(Vec->getType(), DL);
3364  if (!NElts)
3365  return false;
3366  // Check if load can be rewritten as load of vector.
3367  LoadInst *LI = dyn_cast<LoadInst>(Vec);
3368  if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
3369  return false;
3370  } else {
3371  NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
3372  }
3373 
3374  if (NElts != VL.size())
3375  return false;
3376 
3377  // Check that all of the indices extract from the correct offset.
3378  bool ShouldKeepOrder = true;
3379  unsigned E = VL.size();
3380  // Assign to all items the initial value E + 1 so we can check if the extract
3381  // instruction index was used already.
3382  // Also, later we can check that all the indices are used and we have a
3383  // consecutive access in the extract instructions, by checking that no
3384  // element of CurrentOrder still has value E + 1.
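 // For illustration (hypothetical indices): with E == 4 and extract indices
 // {1, 0, 2, 3}, CurrentOrder becomes {1, 0, 2, 3} and ShouldKeepOrder is
 // false; a repeated or out-of-range index leaves a slot at E + 1 and the
 // bundle is rejected below.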
3385  CurrentOrder.assign(E, E + 1);
3386  unsigned I = 0;
3387  for (; I < E; ++I) {
3388  auto *Inst = cast<Instruction>(VL[I]);
3389  if (Inst->getOperand(0) != Vec)
3390  break;
3391  Optional<unsigned> Idx = getExtractIndex(Inst);
3392  if (!Idx)
3393  break;
3394  const unsigned ExtIdx = *Idx;
3395  if (ExtIdx != I) {
3396  if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
3397  break;
3398  ShouldKeepOrder = false;
3399  CurrentOrder[ExtIdx] = I;
3400  } else {
3401  if (CurrentOrder[I] != E + 1)
3402  break;
3403  CurrentOrder[I] = I;
3404  }
3405  }
3406  if (I < E) {
3407  CurrentOrder.clear();
3408  return false;
3409  }
3410 
3411  return ShouldKeepOrder;
3412 }
3413 
3414 bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
3415  return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) {
3416  return ScalarToTreeEntry.count(U) > 0;
3417  });
3418 }
3419 
3420 static std::pair<InstructionCost, InstructionCost>
3421 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
3422  TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
3423  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3424 
3425  // Calculate the cost of the scalar and vector calls.
3426  SmallVector<Type *, 4> VecTys;
3427  for (Use &Arg : CI->args())
3428  VecTys.push_back(
3429  FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
3430  FastMathFlags FMF;
3431  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
3432  FMF = FPCI->getFastMathFlags();
3433  SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3434  IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
3435  dyn_cast<IntrinsicInst>(CI));
3436  auto IntrinsicCost =
3437  TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
3438 
3439  auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
3440  VecTy->getNumElements())),
3441  false /*HasGlobalPred*/);
3442  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3443  auto LibCost = IntrinsicCost;
3444  if (!CI->isNoBuiltin() && VecFunc) {
3445  // Calculate the cost of the vector library call.
3446  // If the corresponding vector call is cheaper, return its cost.
3447  LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
3448  TTI::TCK_RecipThroughput);
3449  }
3450  return {IntrinsicCost, LibCost};
3451 }
3452 
3453 /// Compute the cost of creating a vector of type \p VecTy containing the
3454 /// extracted values from \p VL.
3455 static InstructionCost
3456 computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
3457  TargetTransformInfo::ShuffleKind ShuffleKind,
3458  ArrayRef<int> Mask, TargetTransformInfo &TTI) {
3459  unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
3460 
3461  if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts ||
3462  VecTy->getNumElements() < NumOfParts)
3463  return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
3464 
3465  bool AllConsecutive = true;
3466  unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
3467  unsigned Idx = -1;
3468  InstructionCost Cost = 0;
3469 
3470  // Process extracts in blocks of EltsPerVector to check if the source vector
3471  // operand can be re-used directly. If not, add the cost of creating a shuffle
3472  // to extract the values into a vector register.
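 // For illustration (hypothetical target): 8 extracts with NumOfParts == 2
 // give EltsPerVector == 4, so each group of 4 extracts must read consecutive
 // lanes starting at a multiple of 4 to reuse the source register directly.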
3473  for (auto *V : VL) {
3474  ++Idx;
3475 
3476  // Reached the start of a new vector register.
3477  if (Idx % EltsPerVector == 0) {
3478  AllConsecutive = true;
3479  continue;
3480  }
3481 
3482  // Check that all extracts for a vector register on the target directly
3483  // extract values in order.
3484  unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
3485  unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
3486  AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
3487  CurrentIdx % EltsPerVector == Idx % EltsPerVector;
3488 
3489  if (AllConsecutive)
3490  continue;
3491 
3492  // Skip all indices, except for the last index per vector block.
3493  if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
3494  continue;
3495 
3496  // If we have a series of extracts which are not consecutive and hence
3497  // cannot re-use the source vector register directly, compute the shuffle
3498  // cost to extract a vector with EltsPerVector elements.
3499  Cost += TTI.getShuffleCost(
3500  TargetTransformInfo::SK_PermuteSingleSrc,
3501  FixedVectorType::get(VecTy->getElementType(), EltsPerVector));
3502  }
3503  return Cost;
3504 }
3505 
3506 InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
3507  ArrayRef<Value*> VL = E->Scalars;
3508 
3509  Type *ScalarTy = VL[0]->getType();
3510  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
3511  ScalarTy = SI->getValueOperand()->getType();
3512  else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
3513  ScalarTy = CI->getOperand(0)->getType();
3514  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
3515  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3516 
3517  // If we have computed a smaller type for the expression, update VecTy so
3518  // that the costs will be accurate.
3519  if (MinBWs.count(VL[0]))
3520  VecTy = FixedVectorType::get(
3521  IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
3522 
3523  unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
3524  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
3525  InstructionCost ReuseShuffleCost = 0;
3526  if (NeedToShuffleReuses) {
3527  ReuseShuffleCost =
3528  TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
3529  E->ReuseShuffleIndices);
3530  }
3531  if (E->State == TreeEntry::NeedToGather) {
3532  if (allConstant(VL))
3533  return 0;
3534  if (isSplat(VL)) {
3535  return ReuseShuffleCost +
3536  TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
3537  0);
3538  }
3539  if (E->getOpcode() == Instruction::ExtractElement &&
3540  allSameType(VL) && allSameBlock(VL)) {
3541  SmallVector<int> Mask;
3542  Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
3543  isShuffle(VL, Mask);
3544  if (ShuffleKind.hasValue()) {
3545  InstructionCost Cost =
3546  computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
3547  for (auto *V : VL) {
3548  // If all users of instruction are going to be vectorized and this
3549  // instruction itself is not going to be vectorized, consider this
3550  // instruction as dead and remove its cost from the final cost of the
3551  // vectorized tree.
3552  if (areAllUsersVectorized(cast<Instruction>(V)) &&
3553  !ScalarToTreeEntry.count(V)) {
3554  auto *IO = cast<ConstantInt>(
3555  cast<ExtractElementInst>(V)->getIndexOperand());
3556  Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
3557  IO->getZExtValue());
3558  }
3559  }
3560  return ReuseShuffleCost + Cost;
3561  }
3562  }
3563  return ReuseShuffleCost + getGatherCost(VL);
3564  }
3565  assert((E->State == TreeEntry::Vectorize ||
3566  E->State == TreeEntry::ScatterVectorize) &&
3567  "Unhandled state");
3568  assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
3569  Instruction *VL0 = E->getMainOp();
3570  unsigned ShuffleOrOp =
3571  E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
3572  switch (ShuffleOrOp) {
3573  case Instruction::PHI:
3574  return 0;
3575 
3576  case Instruction::ExtractValue:
3577  case Instruction::ExtractElement: {
3578  // The common cost of removing ExtractElement/ExtractValue instructions +
3579  // the cost of shuffles, if required to reshuffle the original vector.
3580  InstructionCost CommonCost = 0;
3581  if (NeedToShuffleReuses) {
3582  unsigned Idx = 0;
3583  for (unsigned I : E->ReuseShuffleIndices) {
3584  if (ShuffleOrOp == Instruction::ExtractElement) {
3585  auto *IO = cast<ConstantInt>(
3586  cast<ExtractElementInst>(VL[I])->getIndexOperand());
3587  Idx = IO->getZExtValue();
3588  ReuseShuffleCost -= TTI->getVectorInstrCost(
3589  Instruction::ExtractElement, VecTy, Idx);
3590  } else {
3591  ReuseShuffleCost -= TTI->getVectorInstrCost(
3592  Instruction::ExtractElement, VecTy, Idx);
3593  ++Idx;
3594  }
3595  }
3596  Idx = ReuseShuffleNumbers;
3597  for (Value *V : VL) {
3598  if (ShuffleOrOp == Instruction::ExtractElement) {
3599  auto *IO = cast<ConstantInt>(
3600  cast<ExtractElementInst>(V)->getIndexOperand());
3601  Idx = IO->getZExtValue();
3602  } else {
3603  --Idx;
3604  }
3605  ReuseShuffleCost +=
3606  TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
3607  }
3608  CommonCost = ReuseShuffleCost;
3609  } else if (!E->ReorderIndices.empty()) {
3610  SmallVector<int> NewMask;
3611  inversePermutation(E->ReorderIndices, NewMask);
3612  CommonCost = TTI->getShuffleCost(
3613  TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
3614  }
3615  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
3616  Instruction *EI = cast<Instruction>(VL[I]);
3617  // If all users are going to be vectorized, the instruction can be
3618  // considered as dead.
3619  // Likewise, if it has only one user, it will be vectorized for sure.
3620  if (areAllUsersVectorized(EI)) {
3621  // Take credit for instruction that will become dead.
3622  if (EI->hasOneUse()) {
3623  Instruction *Ext = EI->user_back();
3624  if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
3625  all_of(Ext->users(),
3626  [](User *U) { return isa<GetElementPtrInst>(U); })) {
3627  // Use getExtractWithExtendCost() to calculate the cost of
3628  // extractelement/ext pair.
3629  CommonCost -= TTI->getExtractWithExtendCost(
3630  Ext->getOpcode(), Ext->getType(), VecTy, I);
3631  // Add back the cost of s|zext which is subtracted separately.
3632  CommonCost += TTI->getCastInstrCost(
3633  Ext->getOpcode(), Ext->getType(), EI->getType(),
3634  TTI::CastContextHint::None, CostKind);
3635  continue;
3636  }
3637  }
3638  CommonCost -=
3639  TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
3640  }
3641  }
3642  return CommonCost;
3643  }
3644  case Instruction::ZExt:
3645  case Instruction::SExt:
3646  case Instruction::FPToUI:
3647  case Instruction::FPToSI:
3648  case Instruction::FPExt:
3649  case Instruction::PtrToInt:
3650  case Instruction::IntToPtr:
3651  case Instruction::SIToFP:
3652  case Instruction::UIToFP:
3653  case Instruction::Trunc:
3654  case Instruction::FPTrunc:
3655  case Instruction::BitCast: {
3656  Type *SrcTy = VL0->getOperand(0)->getType();
3657  InstructionCost ScalarEltCost =
3658  TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
3659  TTI::getCastContextHint(VL0), CostKind, VL0);
3660  if (NeedToShuffleReuses) {
3661  ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3662  }
3663 
3664  // Calculate the cost of this instruction.
3665  InstructionCost ScalarCost = VL.size() * ScalarEltCost;
3666 
3667  auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
3668  InstructionCost VecCost = 0;
3669  // Check if the values are candidates to demote.
3670  if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
3671  VecCost =
3672  ReuseShuffleCost +
3673  TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
3674  TTI::getCastContextHint(VL0), CostKind, VL0);
3675  }
3676  LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
3677  return VecCost - ScalarCost;
3678  }
3679  case Instruction::FCmp:
3680  case Instruction::ICmp:
3681  case Instruction::Select: {
3682  // Calculate the cost of this instruction.
3683  InstructionCost ScalarEltCost =
3684  TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
3685  CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
3686  if (NeedToShuffleReuses) {
3687  ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3688  }
3689  auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
3690  InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
3691 
3692  // Check if all entries in VL are either compares or selects with compares
3693  // as condition that have the same predicates.
3694  CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;
3695  bool First = true;
3696  for (auto *V : VL) {
3697  CmpInst::Predicate CurrentPred;
3698  auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
3699  if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&
3700  !match(V, MatchCmp)) ||
3701  (!First && VecPred != CurrentPred)) {
3702  VecPred = CmpInst::BAD_ICMP_PREDICATE;
3703  break;
3704  }
3705  First = false;
3706  VecPred = CurrentPred;
3707  }
3708 
3709  InstructionCost VecCost = TTI->getCmpSelInstrCost(
3710  E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
3711  // Check if it is possible and profitable to use min/max for selects in
3712  // VL.
3713  //
3714  auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
3715  if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
3716  IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
3717  {VecTy, VecTy});
3718  InstructionCost IntrinsicCost =
3719  TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
3720  // If the selects are the only uses of the compares, they will be dead
3721  // and we can adjust the cost by removing their cost.
3722  if (IntrinsicAndUse.second)
3723  IntrinsicCost -=
3724  TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy,
3725  CmpInst::BAD_ICMP_PREDICATE, CostKind);
3726  VecCost = std::min(VecCost, IntrinsicCost);
3727  }
3728  LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
3729  return ReuseShuffleCost + VecCost - ScalarCost;
3730  }
3731  case Instruction::FNeg:
3732  case Instruction::Add:
3733  case Instruction::FAdd:
3734  case Instruction::Sub:
3735  case Instruction::FSub:
3736  case Instruction::Mul:
3737  case Instruction::FMul:
3738  case Instruction::UDiv:
3739  case Instruction::SDiv:
3740  case Instruction::FDiv:
3741  case Instruction::URem:
3742  case Instruction::SRem:
3743  case Instruction::FRem:
3744  case Instruction::Shl:
3745  case Instruction::LShr:
3746  case Instruction::AShr:
3747  case Instruction::And:
3748  case Instruction::Or:
3749  case Instruction::Xor: {
3750  // Certain instructions can be cheaper to vectorize if they have a
3751  // constant second vector operand.
3752  TargetTransformInfo::OperandValueKind Op1VK =
3753  TargetTransformInfo::OK_AnyValue;
3754  TargetTransformInfo::OperandValueKind Op2VK =
3755  TargetTransformInfo::OK_UniformConstantValue;
3756  TargetTransformInfo::OperandValueProperties Op1VP =
3757  TargetTransformInfo::OP_None;
3758  TargetTransformInfo::OperandValueProperties Op2VP =
3759  TargetTransformInfo::OP_PowerOf2;
3760 
3761  // If all operands are exactly the same ConstantInt then set the
3762  // operand kind to OK_UniformConstantValue.
3763  // If instead not all operands are constants, then set the operand kind
3764  // to OK_AnyValue. If all operands are constants but not the same,
3765  // then set the operand kind to OK_NonUniformConstantValue.
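 // For illustration (hypothetical bundle): if every scalar shifts by the
 // constant 8, Op2VK stays OK_UniformConstantValue with OP_PowerOf2;
 // constants {1, 2, 3, 5} give OK_NonUniformConstantValue with OP_None.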
3766  ConstantInt *CInt0 = nullptr;
3767  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3768  const Instruction *I = cast<Instruction>(VL[i]);
3769  unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
3770  ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
3771  if (!CInt) {
3772  Op2VK = TargetTransformInfo::OK_AnyValue;
3773  Op2VP = TargetTransformInfo::OP_None;
3774  break;
3775  }
3776  if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
3777  !CInt->getValue().isPowerOf2())
3778  Op2VP = TargetTransformInfo::OP_None;
3779  if (i == 0) {
3780  CInt0 = CInt;
3781  continue;
3782  }
3783  if (CInt0 != CInt)
3784  Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
3785  }
3786 
3787  SmallVector<const Value *, 4> Operands(VL0->operand_values());
3788  InstructionCost ScalarEltCost =
3789  TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
3790  Op2VK, Op1VP, Op2VP, Operands, VL0);
3791  if (NeedToShuffleReuses) {
3792  ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3793  }
3794  InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
3795  InstructionCost VecCost =
3796  TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
3797  Op2VK, Op1VP, Op2VP, Operands, VL0);
3798  LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
3799  return ReuseShuffleCost + VecCost - ScalarCost;
3800  }
3801  case Instruction::GetElementPtr: {
3802  TargetTransformInfo::OperandValueKind Op1VK =
3803  TargetTransformInfo::OK_AnyValue;
3804  TargetTransformInfo::OperandValueKind Op2VK =
3805  TargetTransformInfo::OK_UniformConstantValue;
3806 
3807  InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
3808  Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
3809  if (NeedToShuffleReuses) {
3810  ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3811  }
3812  InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
3813  InstructionCost VecCost = TTI->getArithmeticInstrCost(
3814  Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
3815  LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
3816  return ReuseShuffleCost + VecCost - ScalarCost;
3817  }
3818  case Instruction::Load: {
3819  // Cost of wide load - cost of scalar loads.
3820  Align alignment = cast<LoadInst>(VL0)->getAlign();
3821  InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
3822  Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0);
3823  if (NeedToShuffleReuses) {
3824  ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3825  }
3826  InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
3827  InstructionCost VecLdCost;
3828  if (E->State == TreeEntry::Vectorize) {
3829  VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
3830  CostKind, VL0);
3831  } else {
3832  assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
3833  VecLdCost = TTI->getGatherScatterOpCost(
3834  Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
3835  /*VariableMask=*/false, alignment, CostKind, VL0);
3836  }
3837  if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) {
3838  SmallVector<int> NewMask;
3839  inversePermutation(E->ReorderIndices, NewMask);
3840  VecLdCost += TTI->getShuffleCost(
3841  TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
3842  }
3843  LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
3844  return ReuseShuffleCost + VecLdCost - ScalarLdCost;
3845  }
3846  case Instruction::Store: {
3847  // We know that we can merge the stores. Calculate the cost.
3848  bool IsReorder = !E->ReorderIndices.empty();
3849  auto *SI =
3850  cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
3851  Align Alignment = SI->getAlign();
3852  InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
3853  Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
3854  InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
3855  InstructionCost VecStCost = TTI->getMemoryOpCost(
3856  Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
3857  if (IsReorder) {
3858  SmallVector<int> NewMask;
3859  inversePermutation(E->ReorderIndices, NewMask);
3860  VecStCost += TTI->getShuffleCost(
3861  TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
3862  }
3863  LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
3864  return VecStCost - ScalarStCost;
3865  }
3866  case Instruction::Call: {
3867  CallInst *CI = cast<CallInst>(VL0);
3868  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3869 
3870  // Calculate the cost of the scalar and vector calls.
3871  IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
3872  InstructionCost ScalarEltCost =
3873  TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
3874  if (NeedToShuffleReuses) {
3875  ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
3876  }
3877  InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
3878 
3879  auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
3880  InstructionCost VecCallCost =
3881  std::min(VecCallCosts.first, VecCallCosts.second);
3882 
3883  LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
3884  << " (" << VecCallCost << "-" << ScalarCallCost << ")"
3885  << " for " << *CI << "\n");
3886 
3887  return ReuseShuffleCost + VecCallCost - ScalarCallCost;
3888  }
3889  case Instruction::ShuffleVector: {
3890  assert(E->isAltShuffle() &&
3891  ((Instruction::isBinaryOp(E->getOpcode()) &&
3892  Instruction::isBinaryOp(E->getAltOpcode())) ||
3893  (Instruction::isCast(E->getOpcode()) &&
3894  Instruction::isCast(E->getAltOpcode()))) &&
3895  "Invalid Shuffle Vector Operand");
3896  InstructionCost ScalarCost = 0;
3897  if (NeedToShuffleReuses) {
3898  for (unsigned Idx : E->ReuseShuffleIndices) {
3899  Instruction *I = cast<Instruction>(VL[Idx]);
3900  ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
3901  }
3902  for (Value *V : VL) {
3903  Instruction *I = cast<Instruction>(V);
3904  ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
3905  }
3906  }
3907  for (Value *V : VL) {
3908  Instruction *I = cast<Instruction>(V);
3909  assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
3910  ScalarCost += TTI->getInstructionCost(I, CostKind);
3911  }
3912  // VecCost is equal to the sum of the cost of creating 2 vectors
3913  // and the cost of creating the shuffle.
3914  InstructionCost VecCost = 0;
3915  if (Instruction::isBinaryOp(E->getOpcode())) {
3916  VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
3917  VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
3918  CostKind);
3919  } else {
3920  Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
3921  Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
3922  auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
3923  auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
3924  VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
3925  TTI::CastContextHint::None, CostKind);
3926  VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
3927  TTI::CastContextHint::None, CostKind);
3928  }
3929 
3930  SmallVector<int> Mask(E->Scalars.size());
3931  for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) {
3932  auto *OpInst = cast<Instruction>(E->Scalars[I]);
3933  assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
3934  Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? End : 0);
3935  }
3936  VecCost +=
3937  TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
3938  LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
3939  return ReuseShuffleCost + VecCost - ScalarCost;
3940  }
3941  default:
3942  llvm_unreachable("Unknown instruction");
3943  }
3944 }
3945 
3946 bool BoUpSLP::isFullyVectorizableTinyTree() const {
3947  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
3948  << VectorizableTree.size() << " is fully vectorizable.\n");
3949 
3950  // We only handle trees of heights 1 and 2.
3951  if (VectorizableTree.size() == 1 &&
3952  VectorizableTree[0]->State == TreeEntry::Vectorize)
3953  return true;
3954 
3955  if (VectorizableTree.size() != 2)
3956  return false;
3957 
3958  // Handle splat and all-constants stores.
3959  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
3960  (allConstant(VectorizableTree[1]->Scalars) ||
3961  isSplat(VectorizableTree[1]->Scalars)))
3962  return true;
3963 
3964  // Gathering cost would be too much for tiny trees.
3965  if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
3966  VectorizableTree[1]->State == TreeEntry::NeedToGather)
3967  return false;
3968 
3969  return true;
3970 }
3971 
3972 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
3973  TargetTransformInfo *TTI) {
3974  // Look past the root to find a source value. Arbitrarily follow the
3975  // path through operand 0 of any 'or'. Also, peek through optional
3976  // shift-left-by-multiple-of-8-bits.
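 // For illustration (hypothetical IR): a chain like
 //   (zext (load i8) to i32) | ((zext (load i8) to i32) << 8) | ... | (... << 24)
 // can be folded by the backend into a single i32 load via load combining.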
3977  Value *ZextLoad = Root;
3978  const APInt *ShAmtC;
3979  while (!isa<ConstantExpr>(ZextLoad) &&
3980  (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
3981  (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
3982  ShAmtC->urem(8) == 0)))
3983  ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
3984 
3985  // Check if the input is an extended load of the required or/shift expression.
3986  Value *LoadPtr;
3987  if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
3988  return false;
3989 
3990  // Require that the total load bit width is a legal integer type.
3991  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
3992  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
3993  Type *SrcTy = LoadPtr->getType()->getPointerElementType();
3994  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
3995  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
3996  return false;
3997 
3998  // Everything matched - assume that we can fold the whole sequence using
3999  // load combining.
4000  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
4001  << *(cast<Instruction>(Root)) << "\n");
4002 
4003  return true;
4004 }
4005 
4006 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
4007  if (RdxKind != RecurKind::Or)
4008  return false;
4009 
4010  unsigned NumElts = VectorizableTree[0]->Scalars.size();
4011  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
4012  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
4013 }
4014 
4015 bool BoUpSLP::isLoadCombineCandidate() const {
4016  // Peek through a final sequence of stores and check if all operations are
4017  // likely to be load-combined.
4018  unsigned NumElts = VectorizableTree[0]->Scalars.size();
4019  for (Value *Scalar : VectorizableTree[0]->Scalars) {
4020  Value *X;
4021  if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
4022  !isLoadCombineCandidateImpl(X, NumElts, TTI))
4023  return false;
4024  }
4025  return true;
4026 }
4027 
4028 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
4029  // We can vectorize the tree if its size is greater than or equal to the
4030  // minimum size specified by the MinTreeSize command line option.
4031  if (VectorizableTree.size() >= MinTreeSize)
4032  return false;
4033 
4034  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
4035  // can vectorize it if we can prove it fully vectorizable.
4036  if (isFullyVectorizableTinyTree())
4037  return false;
4038 
4039  assert(VectorizableTree.empty()
4040  ? ExternalUses.empty()
4041  : true && "We shouldn't have any external users");
4042 
4043  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
4044  // vectorizable.
4045  return true;
4046 }
4047 
4048 InstructionCost BoUpSLP::getSpillCost() const {
4049  // Walk from the bottom of the tree to the top, tracking which values are
4050  // live. When we see a call instruction that is not part of our tree,
4051  // query TTI to see if there is a cost to keeping values live over it
4052  // (for example, if spills and fills are required).
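 // For illustration (hypothetical schedule): if a call to an external
 // function sits between two bundled instructions while two tree values are
 // live, the cost of keeping the corresponding <BundleWidth x Ty> vectors
 // live across that call is added below.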
4053  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
4054  InstructionCost Cost = 0;
4055 
4056  SmallPtrSet<Instruction*, 4> LiveValues;
4057  Instruction *PrevInst = nullptr;
4058 
4059  // The entries in VectorizableTree are not necessarily ordered by their
4060  // position in basic blocks. Collect them and order them by dominance so later
4061  // instructions are guaranteed to be visited first. For instructions in
4062  // different basic blocks, we only scan to the beginning of the block, so
4063  // their order does not matter, as long as all instructions in a basic block
4064  // are grouped together. Using dominance ensures a deterministic order.
4065  SmallVector<Instruction *, 16> OrderedScalars;
4066  for (const auto &TEPtr : VectorizableTree) {
4067  Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
4068  if (!Inst)
4069  continue;
4070  OrderedScalars.push_back(Inst);
4071  }
4072  llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) {
4073  return DT->dominates(B, A);
4074  });
4075 
4076  for (Instruction *Inst : OrderedScalars) {
4077  if (!PrevInst) {
4078  PrevInst = Inst;
4079  continue;
4080  }
4081 
4082  // Update LiveValues.
4083  LiveValues.erase(PrevInst);
4084  for (auto &J : PrevInst->operands()) {
4085  if (isa<Instruction>(&*J) && getTreeEntry(&*J))
4086  LiveValues.insert(cast<Instruction>(&*J));
4087  }
4088 
4089  LLVM_DEBUG({
4090  dbgs() << "SLP: #LV: " << LiveValues.size();
4091  for (auto *X : LiveValues)
4092  dbgs() << " " << X->getName();
4093  dbgs() << ", Looking at ";
4094  Inst->dump();
4095  });
4096 
4097  // Now find the sequence of instructions between PrevInst and Inst.
4098  unsigned NumCalls = 0;
4099  BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
4100  PrevInstIt =
4101  PrevInst->getIterator().getReverse();
4102  while (InstIt != PrevInstIt) {
4103  if (PrevInstIt == PrevInst->getParent()->rend()) {
4104  PrevInstIt = Inst->getParent()->rbegin();
4105  continue;
4106  }
4107 
4108  // Debug information does not impact spill cost.
4109  if ((isa<CallInst>(&*PrevInstIt) &&
4110  !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
4111  &*PrevInstIt != PrevInst)
4112  NumCalls++;
4113 
4114  ++PrevInstIt;
4115  }
4116 
4117  if (NumCalls) {
4118  SmallVector<Type*, 4> V;
4119  for (auto *II : LiveValues)
4120  V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
4121  Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
4122  }
4123 
4124  PrevInst = Inst;
4125  }
4126 
4127  return Cost;
4128 }
4129 
4130 InstructionCost BoUpSLP::getTreeCost() {
4131  InstructionCost Cost = 0;
4132  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
4133  << VectorizableTree.size() << ".\n");
4134 
4135  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
4136 
4137  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
4138  TreeEntry &TE = *VectorizableTree[I].get();
4139 
4140  // We create duplicate tree entries for gather sequences that have multiple
4141  // uses. However, we should not compute the cost of duplicate sequences.
4142  // For example, if we have a build vector (i.e., insertelement sequence)
4143  // that is used by more than one vector instruction, we only need to
4144  // compute the cost of the insertelement instructions once. The redundant
4145  // instructions will be eliminated by CSE.
4146  //
4147  // We should consider not creating duplicate tree entries for gather
4148  // sequences, and instead add additional edges to the tree representing
4149  // their uses. Since such an approach results in fewer total entries,
4150  // existing heuristics based on tree size may yield different results.
4151  //
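 // For illustration (hypothetical tree): two gather entries for the same
 // build vector {a, b, c, d} feeding different vector users are costed only
 // once; CSE later removes the redundant insertelement chain.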
4152  if (TE.State == TreeEntry::NeedToGather &&
4153  std::any_of(std::next(VectorizableTree.begin(), I + 1),
4154  VectorizableTree.end(),
4155  [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
4156  return EntryPtr->State == TreeEntry::NeedToGather &&
4157  EntryPtr->isSame(TE.Scalars);
4158  }))
4159  continue;
4160 
4161  InstructionCost C = getEntryCost(&TE);
4162  Cost += C;
4163  LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
4164  << " for bundle that starts with " << *TE.Scalars[0]
4165  << ".\n"
4166  << "SLP: Current total cost = " << Cost << "\n");
4167  }
4168 
4169  SmallPtrSet<Value *, 16> ExtractCostCalculated;
4170  InstructionCost ExtractCost = 0;
4171  for (ExternalUser &EU : ExternalUses) {
4172  // We only add extract cost once for the same scalar.
4173  if (!ExtractCostCalculated.insert(EU.Scalar).second)
4174  continue;
4175 
4176  // Uses by ephemeral values are free (because the ephemeral value will be
4177  // removed prior to code generation, and so the extraction will be
4178  // removed as well).
4179  if (EphValues.count(EU.User))
4180  continue;
4181 
4182  // If we plan to rewrite the tree in a smaller type, we will need to sign
4183  // extend the extracted value back to the original type. Here, we account
4184  // for the extract and the added cost of the sign extend if needed.
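 // For illustration (hypothetical widths): if the tree was narrowed to i8 but
 // an external user expects i32, the cost charged is an extractelement plus
 // the sign/zero extend back to i32, via getExtractWithExtendCost.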
4185  auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
4186  auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
4187  if (MinBWs.count(ScalarRoot)) {
4188  auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
4189  auto Extend =
4190  MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
4191  VecTy = FixedVectorType::get(MinTy, BundleWidth);
4192  ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
4193  VecTy, EU.Lane);
4194  } else {
4195  ExtractCost +=
4196  TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
4197  }
4198  }
4199 
4200  InstructionCost SpillCost = getSpillCost();
4201  Cost += SpillCost + ExtractCost;
4202 
4203 #ifndef NDEBUG
4204  SmallString<256> Str;
4205  {
4206  raw_svector_ostream OS(Str);
4207  OS << "SLP: Spill Cost = " << SpillCost << ".\n"
4208  << "SLP: Extract Cost = " << ExtractCost << ".\n"
4209  << "SLP: Total Cost = " << Cost << ".\n";
4210  }
4211  LLVM_DEBUG(dbgs() << Str);
4212  if (ViewSLPTree)
4213  ViewGraph(this, "SLP" + F->getName(), false, Str);
4214 #endif
4215 
4216  return Cost;
4217 }
4218 
4219 InstructionCost
4220 BoUpSLP::getGatherCost(FixedVectorType *Ty,
4221  const DenseSet<unsigned> &ShuffledIndices) const {
4222  unsigned NumElts = Ty->getNumElements();
4223  APInt DemandedElts = APInt::getNullValue(NumElts);
4224  for (unsigned I = 0; I < NumElts; ++I)
4225  if (!ShuffledIndices.count(I))
4226  DemandedElts.setBit(I);
4227  InstructionCost Cost =
4228  TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
4229  /*Extract*/ false);
4230  if (!ShuffledIndices.empty())
4231  Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
4232  return Cost;
4233 }
4234 
4235 InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
4236  // Find the type of the operands in VL.
4237  Type *ScalarTy = VL[0]->getType();
4238  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
4239  ScalarTy = SI->getValueOperand()->getType();
4240  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
4241  // Find the cost of inserting/extracting values from the vector.
4242  // Check if the same elements are inserted several times and count them as
4243  // shuffle candidates.
4244  DenseSet<unsigned> ShuffledElements;
4245  DenseSet<Value *> UniqueElements;
4246  // Iterate in reverse order to consider insert elements with the highest cost.
4247  for (unsigned I = VL.size(); I > 0; --I) {
4248  unsigned Idx = I - 1;
4249  if (!UniqueElements.insert(VL[Idx]).second)
4250  ShuffledElements.insert(Idx);
4251  }
4252  return getGatherCost(VecTy, ShuffledElements);
4253 }
4254 
4255 // Perform operand reordering on the instructions in VL and return the reordered
4256 // operands in Left and Right.
4257 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
4258  SmallVectorImpl<Value *> &Left,
4259  SmallVectorImpl<Value *> &Right,
4260  const DataLayout &DL,
4261  ScalarEvolution &SE,
4262  const BoUpSLP &R) {
4263  if (VL.empty())
4264  return;
4265  VLOperands Ops(VL, DL, SE, R);
4266  // Reorder the operands in place.
4267  Ops.reorder();
4268  Left = Ops.getVL(0);
4269  Right = Ops.getVL(1);
4270 }
4271 
4272 void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
4273  // Get the basic block this bundle is in. All instructions in the bundle
4274  // should be in this block.
4275  auto *Front = E->getMainOp();
4276  auto *BB = Front->getParent();
4277  assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
4278  auto *I = cast<Instruction>(V);
4279  return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
4280  }));
4281 
4282  // The last instruction in the bundle in program order.
4283  Instruction *LastInst = nullptr;
4284 
4285  // Find the last instruction. The common case should be that BB has been
4286  // scheduled, and the last instruction is VL.back(). So we start with
4287  // VL.back() and iterate over schedule data until we reach the end of the
4288  // bundle. The end of the bundle is marked by null ScheduleData.
4289  if (BlocksSchedules.count(BB)) {
4290  auto *Bundle =
4291  BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
4292  if (Bundle && Bundle->isPartOfBundle())
4293  for (; Bundle; Bundle = Bundle->NextInBundle)
4294  if (Bundle->OpValue == Bundle->Inst)
4295  LastInst = Bundle->Inst;
4296  }
4297 
4298  // LastInst can still be null at this point if there's either not an entry
4299  // for BB in BlocksSchedules or there's no ScheduleData available for
4300  // VL.back(). This can be the case if buildTree_rec aborts for various
4301  // reasons (e.g., the maximum recursion depth is reached, the maximum region
4302  // size is reached, etc.). ScheduleData is initialized in the scheduling
4303  // "dry-run".
4304  //
4305  // If this happens, we can still find the last instruction by brute force. We
4306  // iterate forwards from Front (inclusive) until we either see all
4307  // instructions in the bundle or reach the end of the block. If Front is the
4308  // last instruction in program order, LastInst will be set to Front, and we
4309  // will visit all the remaining instructions in the block.
4310  //
4311  // One of the reasons we exit early from buildTree_rec is to place an upper
4312  // bound on compile-time. Thus, taking an additional compile-time hit here is
4313  // not ideal. However, this should be exceedingly rare since it requires that
4314  // we both exit early from buildTree_rec and that the bundle be out-of-order
4315  // (causing us to iterate all the way to the end of the block).
4316  if (!LastInst) {
4317  SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
4318  for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
4319  if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
4320  LastInst = &I;
4321  if (Bundle.empty())
4322  break;
4323  }
4324  }
4325  assert(LastInst && "Failed to find last instruction in bundle");
4326 
4327  // Set the insertion point after the last instruction in the bundle. Set the
4328  // debug location to Front.
4329  Builder.SetInsertPoint(BB, ++LastInst->getIterator());
4330  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
4331 }
4332 
4333 Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
4334  Value *Val0 =
4335  isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
4336  FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
4337  Value *Vec = PoisonValue::get(VecTy);
4338  unsigned InsIndex = 0;
4339  for (Value *Val : VL) {
4340  Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++));
4341  auto *InsElt = dyn_cast<InsertElementInst>(Vec);
4342  if (!InsElt)
4343  continue;
4344  GatherSeq.insert(InsElt);
4345  CSEBlocks.insert(InsElt->getParent());
4346  // Add to our 'need-to-extract' list.
4347  if (TreeEntry *Entry = getTreeEntry(Val)) {
4348  // Find which lane we need to extract.
4349  unsigned FoundLane = std::distance(Entry->Scalars.begin(),
4350  find(Entry->Scalars, Val));
4351  assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane");
4352  if (!Entry->ReuseShuffleIndices.empty()) {
4353  FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(),
4354  find(Entry->ReuseShuffleIndices, FoundLane));
4355  }
4356  ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane));
4357  }
4358  }
4359 
4360  return Vec;
4361 }
4362 
4363 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
4364  InstructionsState S = getSameOpcode(VL);
4365  if (S.getOpcode()) {
4366  if (TreeEntry *E = getTreeEntry(S.OpValue)) {
4367  if (E->isSame(VL)) {
4368  Value *V = vectorizeTree(E);
4369  if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
4370  // Reshuffle to get only unique values.
4371  // If some of the scalars are duplicated in the vectorization tree
4372  // entry, we do not vectorize them but instead generate a mask for the
4373  // reuses. But if there are several users of the same entry, they may
4374  // have different vectorization factors. This is especially important
4375  // for PHI nodes. In this case, we need to adapt the resulting
4376  // instruction for the user vectorization factor and have to reshuffle
4377  // it again to take only unique elements of the vector. Without this
4378  // code the function would incorrectly return a reduced vector instruction
4379  // with repeated elements rather than only the unique ones.
4380  // block:
4381  // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
4382  // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1>
4383  // ... (use %2)
4384  // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2}
4385  // br %block
4386  SmallVector<int, 4> UniqueIdxs;
4387  SmallSet<int, 4> UsedIdxs;
4388  int Pos = 0;
4389  for (int Idx : E->ReuseShuffleIndices) {
4390  if (UsedIdxs.insert(Idx).second)
4391  UniqueIdxs.emplace_back(Pos);
4392  ++Pos;
4393  }
4394  V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle");
4395  }
4396  return V;
4397  }
4398  }
4399  }
4400 
4401  // Check that every instruction appears once in this bundle.
4402  SmallVector<int, 4> ReuseShuffleIndicies;
4403  SmallVector<Value *, 4> UniqueValues;
4404  if (VL.size() > 2) {
4405  DenseMap<Value *, unsigned> UniquePositions;
4406  for (Value *V : VL) {
4407  auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
4408  ReuseShuffleIndicies.emplace_back(Res.first->second);
4409  if (Res.second || isa<Constant>(V))
4410  UniqueValues.emplace_back(V);
4411  }
4412  // Do not shuffle a single element or if the number of unique values is
4413  // not a power of 2.
4414  if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
4415  !llvm::isPowerOf2_32(UniqueValues.size()))
4416  ReuseShuffleIndicies.clear();
4417  else
4418  VL = UniqueValues;
4419  }
4420 
4421  Value *Vec = gather(VL);
4422  if (!ReuseShuffleIndicies.empty()) {
4423  Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle");
4424  if (auto *I = dyn_cast<Instruction>(Vec)) {
4425  GatherSeq.insert(I);
4426  CSEBlocks.insert(I->getParent());
4427  }
4428  }
4429  return Vec;
4430 }
4431 
4432 namespace {
4433 /// Merges shuffle masks and emits final shuffle instruction, if required.
4434 class ShuffleInstructionBuilder {
4435  IRBuilderBase &Builder;
4436  bool IsFinalized = false;
4437  SmallVector<int, 4> Mask;
4438 
4439 public:
4440  ShuffleInstructionBuilder(IRBuilderBase &Builder) : Builder(Builder) {}
4441 
4442  /// Adds a mask, inverting it before applying.
4443  void addInversedMask(ArrayRef<unsigned> SubMask) {
4444  if (SubMask.empty())
4445  return;
4446  SmallVector<int, 4> NewMask;
4447  inversePermutation(SubMask, NewMask);
4448  addMask(NewMask);
4449  }
4450 
4451  /// Adds masks, merging them into a single one.
4452  void addMask(ArrayRef<unsigned> SubMask) {
4453  SmallVector<int, 4> NewMask(SubMask.begin(), SubMask.end());
4454  addMask(NewMask);
4455  }
4456 
4457  void addMask(ArrayRef<int> SubMask) {
4458  if (SubMask.empty())
4459  return;
4460  if (Mask.empty()) {
4461  Mask.append(SubMask.begin(), SubMask.end());
4462  return;
4463  }
4464  SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size());
4465  int TermValue = std::min(Mask.size(), SubMask.size());
4466  for (int I = 0, E = SubMask.size(); I < E; ++I) {
4467  if (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue) {
4468  NewMask[I] = E;
4469  continue;
4470  }
4471  NewMask[I] = Mask[SubMask[I]];
4472  }
4473  Mask.swap(NewMask);
4474  }
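 // For illustration (hypothetical masks): with Mask == {1, 0} and
 // SubMask == {1, 1}, the merged mask is {Mask[1], Mask[1]} == {0, 0}, i.e.
 // SubMask selects from the already-shuffled vector.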
4475 
4476  Value *finalize(Value *V) {
4477  IsFinalized = true;
4478  if (Mask.empty())
4479  return V;
4480  return Builder.CreateShuffleVector(V, Mask, "shuffle");
4481  }
4482 
4483  ~ShuffleInstructionBuilder() {
4484  assert((IsFinalized || Mask.empty()) &&
4485  "Shuffle construction must be finalized.");
4486  }
4487 };
4488 } // namespace
4489 
4490 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
4491  IRBuilder<>::InsertPointGuard Guard(Builder);
4492 
4493  if (E->VectorizedValue) {
4494  LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
4495  return E->VectorizedValue;
4496  }
4497 
4498  ShuffleInstructionBuilder ShuffleBuilder(Builder);
4499  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
4500  if (E->State == TreeEntry::NeedToGather) {
4501  setInsertPointAfterBundle(E);
4502  Value *Vec = gather(E->Scalars);
4503  if (NeedToShuffleReuses) {
4504  ShuffleBuilder.addMask(E->ReuseShuffleIndices);
4505  Vec = ShuffleBuilder.finalize(Vec);
4506  if (auto *I = dyn_cast<Instruction>(Vec)) {
4507  GatherSeq.insert(I);
4508  CSEBlocks.insert(I->getParent());
4509  }
4510  }
4511  E->VectorizedValue = Vec;
4512  return Vec;
4513  }
4514 
4515  assert((E->State == TreeEntry::Vectorize ||
4516  E->State == TreeEntry::ScatterVectorize) &&
4517  "Unhandled state");
4518  unsigned ShuffleOrOp =
4519  E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
4520  Instruction *VL0 = E->getMainOp();
4521  Type *ScalarTy = VL0->getType();
4522  if (auto *Store = dyn_cast<StoreInst>(VL0))
4523  ScalarTy = Store->getValueOperand()->getType();
4524  auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
4525  switch (ShuffleOrOp) {
4526  case Instruction::PHI: {
4527  auto *PH = cast<PHINode>(VL0);
4528  Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
4529  Builder.SetCurrentDebugLocation(PH->getDebugLoc());
4530  PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
4531  Value *V = NewPhi;
4532  if (NeedToShuffleReuses)
4533  V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
4534 
4535  E->VectorizedValue = V;
4536 
4537  // PHINodes may have multiple entries from the same block. We want to
4538  // visit every block once.
4539  SmallPtrSet<BasicBlock*, 4> VisitedBBs;
4540 
4541  for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
4542  ValueList Operands;
4543  BasicBlock *IBB = PH->getIncomingBlock(i);
4544 
4545  if (!VisitedBBs.insert(IBB).second) {
4546  NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
4547  continue;
4548  }
4549 
4550  Builder.SetInsertPoint(IBB->getTerminator());
4551  Builder.SetCurrentDebugLocation(PH->getDebugLoc());
4552  Value *Vec = vectorizeTree(E->getOperand(i));
4553  NewPhi->addIncoming(Vec, IBB);
4554  }
4555 
4556  assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
4557  "Invalid number of incoming values");
4558  return V;
4559  }
4560 
4561  case Instruction::ExtractElement: {
4562  Value *V = E->getSingleOperand(0);
4563  Builder.SetInsertPoint(VL0);
4564  ShuffleBuilder.addInversedMask(E->ReorderIndices);
4565  ShuffleBuilder.addMask(E->ReuseShuffleIndices);
4566  V = ShuffleBuilder.finalize(V);
4567  E->VectorizedValue = V;
4568  return V;
4569  }
4570  case Instruction::ExtractValue: {
4571  auto *LI = cast<LoadInst>(E->getSingleOperand(0));
4572  Builder.SetInsertPoint(LI);
4573  auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
4574  Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
4575  LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
4576  Value *NewV = propagateMetadata(V, E->Scalars);
4577  ShuffleBuilder.addInversedMask(E->ReorderIndices);
4578  ShuffleBuilder.addMask(E->ReuseShuffleIndices);
4579  NewV = ShuffleBuilder.finalize(NewV);
4580  E->VectorizedValue = NewV;
4581  return NewV;
4582  }
4583  case Instruction::ZExt:
4584  case Instruction::SExt:
4585  case Instruction::FPToUI:
4586  case Instruction::FPToSI:
4587  case Instruction::FPExt:
4588  case Instruction::PtrToInt:
4589  case Instruction::IntToPtr:
4590  case Instruction::SIToFP:
4591  case Instruction::UIToFP:
4592  case Instruction::Trunc:
4593  case Instruction::FPTrunc:
4594  case Instruction::BitCast: {
4595  setInsertPointAfterBundle(E);
4596 
4597  Value *InVec = vectorizeTree(E->getOperand(0));
4598 
4599  if (E->VectorizedValue) {
4600  LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
4601  return E->VectorizedValue;
4602  }
4603 
4604  auto *CI = cast<CastInst>(VL0);
4605  Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
4606  ShuffleBuilder.addMask(E->ReuseShuffleIndices);
4607  V = ShuffleBuilder.finalize(V);
4608 
4609  E->VectorizedValue = V;
4610  ++NumVectorInstructions;
4611  return V;
4612  }
4613  case Instruction::FCmp:
4614  case Instruction::ICmp: {
4615  setInsertPointAfterBundle(E);
4616 
4617  Value *L = vectorizeTree(E->getOperand(0));
4618  Value *R = vectorizeTree(E->getOperand(1));
4619 
4620  if (E->VectorizedValue) {
4621  LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
4622  return E->VectorizedValue;
4623  }
4624 
4625  CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
4626  Value *V = Builder.CreateCmp(P0, L, R);
4627  propagateIRFlags(V, E->Scalars, VL0);
4628  ShuffleBuilder.addMask(E->ReuseShuffleIndices);
4629  V = ShuffleBuilder.finalize(V);
4630 
4631  E->VectorizedValue = V;
4632  ++NumVectorInstructions;
4633  return V;
4634  }
4635  case Instruction::Select: {
4636  setInsertPointAfterBundle(E);
4637 
4638  Value *Cond = vectorizeTree(E->getOperand(0));
4639  Value *True = vectorizeTree(E->getOperand(1));
4640  Value *False = vectorizeTree(E->getOperand(2));
4641 
4642  if (E->VectorizedValue) {
4643  LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
4644  return E->VectorizedValue;
4645  }
4646 
4647  Value *V = Builder.CreateSelect(Cond, True, False);
4648  ShuffleBuilder.addMask(E->ReuseShuffleIndices);
4649  V = ShuffleBuilder.finalize(V);
4650 
4651  E->VectorizedValue = V;
4652  ++NumVectorInstructions;
4653  return V;
4654  }
4655  case Instruction::FNeg: {
4656  setInsertPointAfterBundle(E);
4657 
4658  Value *Op = vectorizeTree(E->getOperand(0));
4659 
4660  if (E->VectorizedValue) {
4661  LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
4662  return E->VectorizedValue;
4663  }
4664 
4665  Value *V = Builder.CreateUnOp(
4666  static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
4667  propagateIRFlags(V, E->Scalars, VL0);
4668  if (auto *I = dyn_cast<Instruction>(V))
4669  V = propagateMetadata(I, E->Scalars);
4670 
4671  ShuffleBuilder.addMask(E->ReuseShuffleIndices);
4672  V = ShuffleBuilder.finalize(V);
4673 
4674  E->VectorizedValue = V;
4675  ++NumVectorInstructions;
4676 
4677  return V;
4678  }
4679  case Instruction::Add:
4680  case Instruction::FAdd:
4681  case Instruction::Sub:
4682  case Instruction::FSub:
4683  case Instruction::Mul:
4684  case Instruction::FMul:
4685  case Instruction::UDiv:
4686  case Instruction::SDiv:
4687  case Instruction::FDiv:
4688  case Instruction::URem:
4689  case Instruction::SRem:
4690  case Instruction::FRem:
4691  case Instruction::Shl:
4692  case Instruction::LShr:
4693  case Instruction::AShr:
4694  case Instruction::And:
4695  case Instruction::Or:
4696  case Instruction::Xor: {
4697  setInsertPointAfterBundle(E);
4698 
4699  Value *LHS = vectorizeTree(E->getOperand(0));
4700  Value *RHS = vectorizeTree(E->getOperand(1));
4701 
4702  if (E->VectorizedValue) {
4703  LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
4704  return E->VectorizedValue;
4705  }
4706 
4707  Value *V = Builder.CreateBinOp(
4708  static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
4709  RHS);
4710  propagateIRFlags(V, E->Scalars, VL0);
4711  if (auto *I = dyn_cast<Instruction>(V))
4712  V = propagateMetadata(I, E->Scalars);
4713 
4714  ShuffleBuilder.addMask(E->ReuseShuffleIndices);
4715  V = ShuffleBuilder.finalize(V);
4716 
4717  E->VectorizedValue = V;
4718  ++NumVectorInstructions;
4719 
4720  return V;
4721  }
4722  case Instruction::Load: {
4723  // Loads are inserted at the head of the tree because we don't want to
4724  // sink them all the way down past store instructions.
4725  bool IsReorder = E->updateStateIfReorder();
4726  if (IsReorder)
4727  VL0 = E->getMainOp();
4728  setInsertPointAfterBundle(E);
4729 
4730  LoadInst *LI = cast<LoadInst>(VL0);
4731  Instruction *NewLI;
4732  unsigned AS = LI->getPointerAddressSpace();
4733  Value *PO = LI->getPointerOperand();
4734  if (E->State == TreeEntry::Vectorize) {
4735 
4736  Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
4737 
4738  // The pointer operand uses an in-tree scalar so we add the new BitCast
4739  // to the ExternalUses list to make sure that an extract will be generated
4740  // in the future.
4741  if (getTreeEntry(PO))
4742  ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
4743 
4744  NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
4745  } else {
4746  assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
4747  Value *VecPtr = vectorizeTree(E->getOperand(0));
4748  // Use the minimum alignment of the gathered loads.
4749  Align CommonAlignment = LI->getAlign();
4750  for (Value *V : E->Scalars)
4751  CommonAlignment =
4752  commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
4753  NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment);
4754  }
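  // For example, on the Vectorize path four consecutive i32 loads from %p are
  // expected to become roughly:
  //   %vp = bitcast i32* %p to <4 x i32>*
  //   %wide.load = load <4 x i32>, <4 x i32>* %vp, align 4
  // while the ScatterVectorize path emits an llvm.masked.gather over a vector
  // of pointers instead.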
4755  Value *V = propagateMetadata(NewLI, E->Scalars);
4756 
4757  ShuffleBuilder.addInversedMask(E->ReorderIndices);
4758  ShuffleBuilder.addMask(E->ReuseShuffleIndices);
4759  V = ShuffleBuilder.finalize(V);
4760  E->VectorizedValue = V;
4761  ++NumVectorInstructions;
4762  return V;
4763  }
4764  case Instruction::Store: {
4765  bool IsReorder = !E->ReorderIndices.empty();
4766  auto *SI = cast<StoreInst>(
4767  IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
4768  unsigned AS = SI->getPointerAddressSpace();
4769 
4770  setInsertPointAfterBundle(E);
4771 
4772  Value *VecValue = vectorizeTree(E->getOperand(0));
4773  ShuffleBuilder.addMask(E->ReorderIndices);
4774  VecValue = ShuffleBuilder.finalize(VecValue);
4775 
4776  Value *ScalarPtr = SI->getPointerOperand();
4777  Value *VecPtr = Builder.CreateBitCast(
4778  ScalarPtr, VecValue->getType()->getPointerTo(AS));
4779  StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
4780  SI->getAlign());
4781 
4782  // The pointer operand uses an in-tree scalar, so add the new BitCast to
4783  // ExternalUses to make sure that an extract will be generated in the
4784  // future.
4785  if (getTreeEntry(ScalarPtr))
4786  ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
4787 
4788  Value *V = propagateMetadata(ST, E->Scalars);
4789 
4790  E->VectorizedValue = V;
4791  ++NumVectorInstructions;
4792  return V;
4793  }
4794  case Instruction::GetElementPtr: {
4795  setInsertPointAfterBundle(E);
4796 
4797  Value *Op0 = vectorizeTree(E->getOperand(0));
4798 
4799  std::vector<Value *> OpVecs;
4800  for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
4801  ++j) {
4802  ValueList &VL = E->getOperand(j);
4803  // Need to cast all elements to the same type before vectorization to
4804  // avoid a crash.
4805  Type *VL0Ty = VL0->getOperand(j)->getType();
4806  Type *Ty = llvm::all_of(
4807  VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); })
4808  ? VL0Ty
4809  : DL->getIndexType(cast<GetElementPtrInst>(VL0)
4810  ->getPointerOperandType()
4811  ->getScalarType());
4812  for (Value *&V : VL) {
4813  auto *CI = cast<ConstantInt>(V);
4814  V = ConstantExpr::getIntegerCast(CI, Ty,
4815  CI->getValue().isSignBitSet());
4816  }
4817  Value *OpVec = vectorizeTree(VL);
4818  OpVecs.push_back(OpVec);
4819  }
4820 
4821  Value *V = Builder.CreateGEP(
4822  cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
4823  if (Instruction *I = dyn_cast<Instruction>(V))
4824  V = propagateMetadata(I, E->Scalars);
4825 
4826  ShuffleBuilder.addMask(E->ReuseShuffleIndices);
4827  V = ShuffleBuilder.finalize(V);
4828 
4829  E->VectorizedValue = V;
4830  ++NumVectorInstructions;
4831 
4832  return V;
4833  }
4834  case Instruction::Call: {
4835  CallInst *CI = cast<CallInst>(VL0);
4836  setInsertPointAfterBundle(E);
4837 
4838  Intrinsic::ID IID = Intrinsic::not_intrinsic;
4839  if (Function *FI = CI->getCalledFunction())
4840  IID = FI->getIntrinsicID();
4841 
4842  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4843 
4844  auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
4845  bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
4846  VecCallCosts.first <= VecCallCosts.second;
4847 
4848  Value *ScalarArg = nullptr;
4849  std::vector<Value *> OpVecs;
4850  for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
4851  ValueList OpVL;
4852  // Some intrinsics have scalar arguments. Such arguments should not be
4853  // vectorized.
4854  if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
4855  CallInst *CEI = cast<CallInst>(VL0);
4856  ScalarArg = CEI->getArgOperand(j);
4857  OpVecs.push_back(CEI->getArgOperand(j));
4858  continue;
4859  }
4860 
4861  Value *OpVec = vectorizeTree(E->getOperand(j));
4862  LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
4863  OpVecs.push_back(OpVec);
4864  }
4865 
4866  Function *CF;
4867  if (!UseIntrinsic) {
4868  VFShape Shape =
4869  VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
4870  VecTy->getNumElements())),
4871  false /*HasGlobalPred*/);
4872  CF = VFDatabase(*CI).getVectorizedFunction(Shape);
4873  } else {
4874  Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
4875  CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
4876  }
4877 
4878  SmallVector<OperandBundleDef, 1> OpBundles;
4879  CI->getOperandBundlesAsDefs(OpBundles);
4880  Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
4881 
4882  // The scalar argument uses an in-tree scalar so we add the new vectorized
4883  // call to the ExternalUses list to make sure that an extract will be
4884  // generated in the future.
4885  if (ScalarArg && getTreeEntry(ScalarArg))
4886  ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
4887 
4888  propagateIRFlags(V, E->Scalars, VL0);
4889  ShuffleBuilder.addMask(E->ReuseShuffleIndices);
4890  V = ShuffleBuilder.finalize(V);
4891 
4892  E->VectorizedValue = V;
4893  ++NumVectorInstructions;
4894  return V;
4895  }
4896  case Instruction::ShuffleVector: {
4897  assert(E->isAltShuffle() &&
4898  ((Instruction::isBinaryOp(E->getOpcode()) &&
4899  Instruction::isBinaryOp(E->getAltOpcode())) ||
4900  (Instruction::isCast(E->getOpcode()) &&
4901  Instruction::isCast(E->getAltOpcode()))) &&
4902  "Invalid Shuffle Vector Operand");
4903 
4904  Value *LHS = nullptr, *RHS = nullptr;
4905  if (Instruction::isBinaryOp(E->getOpcode())) {
4906  setInsertPointAfterBundle(E);
4907  LHS = vectorizeTree(E->getOperand(0));
4908  RHS = vectorizeTree(E->getOperand(1));
4909  } else {
4910  setInsertPointAfterBundle(E);
4911  LHS = vectorizeTree(E->getOperand(0));
4912  }
4913 
4914  if (E->VectorizedValue) {
4915  LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
4916  return E->VectorizedValue;
4917  }
4918 
4919  Value *V0, *V1;
4920  if (Instruction::isBinaryOp(E->getOpcode())) {
4921  V0 = Builder.CreateBinOp(
4922  static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
4923  V1 = Builder.CreateBinOp(
4924  static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
4925  } else {
4926  V0 = Builder.CreateCast(
4927  static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
4928  V1 = Builder.CreateCast(
4929  static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
4930  }
4931 
4932  // Create shuffle to take alternate operations from the vector.
4933  // Also, gather up main and alt scalar ops to propagate IR flags to
4934  // each vector operation.
4935  ValueList OpScalars, AltScalars;
4936  unsigned e = E->Scalars.size();
4937  SmallVector<int, 8> Mask(e);
4938  for (unsigned i = 0; i < e; ++i) {
4939  auto *OpInst = cast<Instruction>(E->Scalars[i]);
4940  assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
4941  if (OpInst->getOpcode() == E->getAltOpcode()) {
4942  Mask[i] = e + i;
4943  AltScalars.push_back(E->Scalars[i]);
4944  } else {
4945  Mask[i] = i;
4946  OpScalars.push_back(E->Scalars[i]);
4947  }
4948  }
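  // For instance, with four scalars following an add/sub/add/sub pattern
  // (main opcode add, alternate opcode sub), the loop above produces
  // Mask = {0, 5, 2, 7}: lanes 0 and 2 are taken from V0 (the "add" vector)
  // and lanes 1 and 3 from V1 (the "sub" vector).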
4949 
4950  propagateIRFlags(V0, OpScalars);
4951  propagateIRFlags(V1, AltScalars);
4952 
4953  Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
4954  if (Instruction *I = dyn_cast<Instruction>(V))
4955  V = propagateMetadata(I, E->Scalars);
4956  ShuffleBuilder.addMask(E->ReuseShuffleIndices);
4957  V = ShuffleBuilder.finalize(V);
4958 
4959  E->VectorizedValue = V;
4960  ++NumVectorInstructions;
4961 
4962  return V;
4963  }
4964  default:
4965  llvm_unreachable("unknown inst");
4966  }
4967  return nullptr;
4968 }
4969 
4970 Value *BoUpSLP::vectorizeTree() {
4971  ExtraValueToDebugLocsMap ExternallyUsedValues;
4972  return vectorizeTree(ExternallyUsedValues);
4973 }
4974 
4975 Value *
4976 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
4977  // All blocks must be scheduled before any instructions are inserted.
4978  for (auto &BSIter : BlocksSchedules) {
4979  scheduleBlock(BSIter.second.get());
4980  }
4981 
4982  Builder.SetInsertPoint(&F->getEntryBlock().front());
4983  auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
4984 
4985  // If the vectorized tree can be rewritten in a smaller type, we truncate the
4986  // vectorized root. InstCombine will then rewrite the entire expression. We
4987  // sign- or zero-extend the extracted values below.
4988  auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
4989  if (MinBWs.count(ScalarRoot)) {
4990  if (auto *I = dyn_cast<Instruction>(VectorRoot)) {
4991  // If the current instruction is a phi node, insert the truncation after
4992  // the last phi node in the block.
4993  if (isa<PHINode>(I))
4994  Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt());
4995  else
4996  Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
4997  }
4998  auto BundleWidth = VectorizableTree[0]->Scalars.size();
4999  auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
5000  auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
5001  auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
5002  VectorizableTree[0]->VectorizedValue = Trunc;
5003  }
5004 
5005  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
5006  << " values.\n");
5007 
5008  // If necessary, sign-extend or zero-extend the extracted value to the
5009  // larger type specified by ScalarType.
5010  auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
5011  if (!MinBWs.count(ScalarRoot))
5012  return Ex;
5013  if (MinBWs[ScalarRoot].second)
5014  return Builder.CreateSExt(Ex, ScalarType);
5015  return Builder.CreateZExt(Ex, ScalarType);
5016  };
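  // For example, if the tree was demoted to i8 but the external user still
  // expects the original i32, the freshly created extractelement yields an i8
  // that is widened back with sext or zext, depending on the signedness flag
  // stored in MinBWs.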
5017 
5018  // Extract all of the elements with the external uses.
5019  for (const auto &ExternalUse : ExternalUses) {
5020  Value *Scalar = ExternalUse.Scalar;
5021  llvm::User *User = ExternalUse.User;
5022 
5023  // Skip users that we have already RAUWed. This happens when one instruction
5024  // has multiple uses of the same value.
5025  if (User && !is_contained(Scalar->users(), User))
5026  continue;
5027  TreeEntry *E = getTreeEntry(Scalar);
5028  assert(E && "Invalid scalar");
5029  assert(E->State != TreeEntry::NeedToGather &&
5030  "Extracting from a gather list");
5031 
5032  Value *Vec = E->VectorizedValue;
5033  assert(Vec && "Can't find vectorizable value");
5034 
5035  Value *Lane = Builder.getInt32(ExternalUse.Lane);
5036  // If User == nullptr, the Scalar is used as an extra argument. Generate
5037  // ExtractElement instruction and update the record for this scalar in
5038  // ExternallyUsedValues.
5039  if (!User) {
5040  assert(ExternallyUsedValues.count(Scalar) &&
5041  "Scalar with nullptr as an external user must be registered in "
5042  "ExternallyUsedValues map");
5043  if (auto *VecI = dyn_cast<Instruction>(Vec)) {
5044  Builder.SetInsertPoint(VecI->getParent(),
5045  std::next(VecI->getIterator()));
5046  } else {
5047  Builder.SetInsertPoint(&F->getEntryBlock().front());
5048  }
5049  Value *Ex = Builder.CreateExtractElement(Vec, Lane);
5050  Ex = extend(ScalarRoot, Ex, Scalar->getType());
5051  CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
5052  auto &Locs = ExternallyUsedValues[Scalar];
5053  ExternallyUsedValues.insert({Ex, Locs});
5054  ExternallyUsedValues.erase(Scalar);
5055  // Required to update internally referenced instructions.
5056  Scalar->replaceAllUsesWith(Ex);
5057  continue;
5058  }
5059 
5060  // Generate extracts for out-of-tree users.
5061  // Find the insertion point for the extractelement lane.
5062  if (auto *VecI = dyn_cast<Instruction>(Vec)) {
5063  if (PHINode *PH = dyn_cast<PHINode>(User)) {
5064  for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
5065  if (PH->getIncomingValue(i) == Scalar) {
5066  Instruction *IncomingTerminator =
5067  PH->getIncomingBlock(i)->getTerminator();
5068  if (isa<CatchSwitchInst>(IncomingTerminator)) {
5069  Builder.SetInsertPoint(VecI->getParent(),
5070  std::next(VecI->getIterator()));
5071  } else {
5072  Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
5073  }
5074  Value *Ex = Builder.CreateExtractElement(Vec, Lane);
5075  Ex = extend(ScalarRoot, Ex, Scalar->getType());
5076  CSEBlocks.insert(PH->getIncomingBlock(i));
5077  PH->setOperand(i, Ex);
5078  }
5079  }
5080  } else {
5081  Builder.SetInsertPoint(cast<Instruction>(User));
5082  Value *Ex = Builder.CreateExtractElement(Vec, Lane);
5083  Ex = extend(ScalarRoot, Ex, Scalar->getType());
5084  CSEBlocks.insert(cast<Instruction>(User)->getParent());
5085  User->replaceUsesOfWith(Scalar, Ex);
5086  }
5087  } else {
5088  Builder.SetInsertPoint(&F->getEntryBlock().front());
5089  Value *Ex = Builder.CreateExtractElement(Vec, Lane);
5090  Ex = extend(ScalarRoot, Ex, Scalar->getType());
5091  CSEBlocks.insert(&F->getEntryBlock());
5092  User->replaceUsesOfWith(Scalar, Ex);
5093  }
5094 
5095  LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
5096  }
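  // After this loop, an out-of-tree user of, say, lane 2 of a vectorized
  // bundle no longer uses the original scalar; it uses something like
  //   %ex = extractelement <4 x i32> %vec, i32 2
  // emitted at a point that dominates the user (and widened first if the
  // tree was demoted to a narrower type).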
5097 
5098  // For each vectorized value:
5099  for (auto &TEPtr : VectorizableTree) {
5100  TreeEntry *Entry = TEPtr.get();
5101 
5102  // No need to handle users of gathered values.
5103  if (Entry->State == TreeEntry::NeedToGather)
5104  continue;
5105 
5106  assert(Entry->VectorizedValue && "Can't find vectorizable value");
5107 
5108  // For each lane:
5109  for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5110  Value *Scalar = Entry->Scalars[Lane];
5111 
5112 #ifndef NDEBUG
5113  Type *Ty = Scalar->getType();
5114  if (!Ty->isVoidTy()) {
5115  for (User *U : Scalar->users()) {
5116  LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
5117 
5118  // It is legal to delete users in the ignorelist.
5119  assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
5120  "Deleting out-of-tree value");
5121  }
5122  }
5123 #endif
5124  LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
5125  eraseInstruction(cast<Instruction>(Scalar));
5126  }
5127  }
5128 
5129  Builder.ClearInsertionPoint();
5130  InstrElementSize.clear();
5131 
5132  return VectorizableTree[0]->VectorizedValue;
5133 }
5134 
5135 void BoUpSLP::optimizeGatherSequence() {
5136  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
5137  << " gather sequence instructions.\n");
5138  // LICM InsertElementInst sequences.
5139  for (Instruction *I : GatherSeq) {
5140  if (isDeleted(I))
5141  continue;
5142 
5143  // Check if this block is inside a loop.
5144  Loop *L = LI->getLoopFor(I->getParent());
5145  if (!L)
5146  continue;
5147 
5148  // Check if it has a preheader.
5149  BasicBlock *PreHeader = L->getLoopPreheader();
5150  if (!PreHeader)
5151  continue;
5152 
5153  // If the vector or the element that we insert into it are
5154  // instructions that are defined inside the loop, then we can't
5155  // hoist this instruction out of it.
5156  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
5157  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
5158  if (Op0 && L->contains(Op0))
5159  continue;
5160  if (Op1 && L->contains(Op1))
5161  continue;
5162 
5163  // We can hoist this instruction. Move it to the pre-header.
5164  I->moveBefore(PreHeader->getTerminator());
5165  }
5166 
5167  // Make a list of all reachable blocks in our CSE queue.
5168  SmallVector<const DomTreeNode *, 8> CSEWorkList;
5169  CSEWorkList.reserve(CSEBlocks.size());
5170  for (BasicBlock *BB : CSEBlocks)
5171  if (DomTreeNode *N = DT->getNode(BB)) {
5172  assert(DT->isReachableFromEntry(N));
5173  CSEWorkList.push_back(N);
5174  }
5175 
5176  // Sort blocks by domination. This ensures we visit a block after all blocks
5177  // dominating it are visited.
5178  llvm::stable_sort(CSEWorkList,
5179  [this](const DomTreeNode *A, const DomTreeNode *B) {
5180  return DT->properlyDominates(A, B);
5181  });
5182 
5183  // Perform O(N^2) search over the gather sequences and merge identical
5184  // instructions. TODO: We can further optimize this scan if we split the
5185  // instructions into different buckets based on the insert lane.
5186  SmallVector<Instruction *, 16> Visited;
5187  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
5188  assert(*I &&
5189  (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
5190  "Worklist not sorted properly!");
5191  BasicBlock *BB = (*I)->getBlock();
5192  // For all instructions in blocks containing gather sequences:
5193  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
5194  Instruction *In = &*it++;
5195  if (isDeleted(In))
5196  continue;
5197  if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
5198  continue;
5199 
5200  // Check if we can replace this instruction with any of the
5201  // visited instructions.
5202  for (Instruction *v : Visited) {
5203  if (In->isIdenticalTo(v) &&
5204  DT->dominates(v->getParent(), In->getParent())) {
5205  In->replaceAllUsesWith(v);
5206  eraseInstruction(In);
5207  In = nullptr;
5208  break;
5209  }
5210  }
5211  if (In) {
5212  assert(!is_contained(Visited, In));
5213  Visited.push_back(In);
5214  }
5215  }
5216  }
5217  CSEBlocks.clear();
5218  GatherSeq.clear();
5219 }
5220 
5221 // Groups the instructions into a bundle (which is then a single scheduling entity)
5222 // and schedules instructions until the bundle gets ready.
5223 Optional<BoUpSLP::ScheduleData *>
5224 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5225  const InstructionsState &S) {
5226  if (isa<PHINode>(S.OpValue))
5227  return nullptr;
5228 
5229  // Initialize the instruction bundle.
5230  Instruction *OldScheduleEnd = ScheduleEnd;
5231  ScheduleData *PrevInBundle = nullptr;
5232  ScheduleData *Bundle = nullptr;
5233  bool ReSchedule = false;
5234  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
5235 
5236  auto &&TryScheduleBundle = [this, OldScheduleEnd, SLP](bool ReSchedule,
5237  ScheduleData *Bundle) {
5238  // The scheduling region got new instructions at the lower end (or it is a
5239  // new region for the first bundle). This makes it necessary to
5240  // recalculate all dependencies.
5241  // It is seldom that this needs to be done a second time after adding the
5242  // initial bundle to the region.
5243  if (ScheduleEnd != OldScheduleEnd) {
5244  for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
5245  doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
5246  ReSchedule = true;
5247  }
5248  if (ReSchedule) {
5249  resetSchedule();
5250  initialFillReadyList(ReadyInsts);
5251  }
5252  if (Bundle) {
5253  LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
5254  << " in block " << BB->getName() << "\n");
5255  calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
5256  }
5257 
5258  // Now try to schedule the new bundle or (if no bundle) just calculate
5259  // dependencies. As soon as the bundle is "ready" it means that there are no
5260  // cyclic dependencies and we can schedule it. Note that it's important that we
5261  // don't "schedule" the bundle yet (see cancelScheduling).
5262  while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
5263  !ReadyInsts.empty()) {
5264  ScheduleData *Picked = ReadyInsts.pop_back_val();
5265  if (Picked->isSchedulingEntity() && Picked->isReady())
5266  schedule(Picked, ReadyInsts);
5267  }
5268  };
5269 
5270  // Make sure that the scheduling region contains all
5271  // instructions of the bundle.
5272  for (Value *V : VL) {
5273  if (!extendSchedulingRegion(V, S)) {
5274  // If the scheduling region got new instructions at the lower end (or it
5275  // is a new region for the first bundle), all dependencies must be
5276  // recalculated before bailing out.
5277  // Otherwise the compiler may crash because it calculates the
5278  // dependencies incorrectly and emits instructions in the wrong order
5279  // during the actual scheduling.
5280  TryScheduleBundle(/*ReSchedule=*/false, nullptr);
5281  return None;
5282  }
5283  }
5284 
5285  for (Value *V : VL) {
5286  ScheduleData *BundleMember = getScheduleData(V);
5287  assert(BundleMember &&
5288  "no ScheduleData for bundle member (maybe not in same basic block)");
5289  if (BundleMember->IsScheduled) {
5290  // A bundle member was scheduled as a single instruction before and now
5291  // needs to be scheduled as part of the bundle. We just get rid of the
5292  // existing schedule.
5293  LLVM_DEBUG(dbgs() <<