1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
11 // construct vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
13 //
14 // The pass is inspired by the work described in the paper:
15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
17 //===----------------------------------------------------------------------===//
18 
20 #include "llvm/ADT/DenseMap.h"
21 #include "llvm/ADT/DenseSet.h"
22 #include "llvm/ADT/Optional.h"
24 #include "llvm/ADT/PriorityQueue.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SetOperations.h"
27 #include "llvm/ADT/SetVector.h"
29 #include "llvm/ADT/SmallPtrSet.h"
30 #include "llvm/ADT/SmallSet.h"
31 #include "llvm/ADT/SmallString.h"
32 #include "llvm/ADT/Statistic.h"
33 #include "llvm/ADT/iterator.h"
42 #include "llvm/Analysis/LoopInfo.h"
51 #include "llvm/IR/Attributes.h"
52 #include "llvm/IR/BasicBlock.h"
53 #include "llvm/IR/Constant.h"
54 #include "llvm/IR/Constants.h"
55 #include "llvm/IR/DataLayout.h"
56 #include "llvm/IR/DebugLoc.h"
57 #include "llvm/IR/DerivedTypes.h"
58 #include "llvm/IR/Dominators.h"
59 #include "llvm/IR/Function.h"
60 #include "llvm/IR/IRBuilder.h"
61 #include "llvm/IR/InstrTypes.h"
62 #include "llvm/IR/Instruction.h"
63 #include "llvm/IR/Instructions.h"
64 #include "llvm/IR/IntrinsicInst.h"
65 #include "llvm/IR/Intrinsics.h"
66 #include "llvm/IR/Module.h"
67 #include "llvm/IR/NoFolder.h"
68 #include "llvm/IR/Operator.h"
69 #include "llvm/IR/PatternMatch.h"
70 #include "llvm/IR/Type.h"
71 #include "llvm/IR/Use.h"
72 #include "llvm/IR/User.h"
73 #include "llvm/IR/Value.h"
74 #include "llvm/IR/ValueHandle.h"
75 #include "llvm/IR/Verifier.h"
76 #include "llvm/InitializePasses.h"
77 #include "llvm/Pass.h"
78 #include "llvm/Support/Casting.h"
80 #include "llvm/Support/Compiler.h"
82 #include "llvm/Support/Debug.h"
86 #include "llvm/Support/KnownBits.h"
92 #include <algorithm>
93 #include <cassert>
94 #include <cstdint>
95 #include <iterator>
96 #include <memory>
97 #include <set>
98 #include <string>
99 #include <tuple>
100 #include <utility>
101 #include <vector>
102 
103 using namespace llvm;
104 using namespace llvm::PatternMatch;
105 using namespace slpvectorizer;
106 
107 #define SV_NAME "slp-vectorizer"
108 #define DEBUG_TYPE "SLP"
109 
110 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111 
112 cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
113  cl::desc("Run the SLP vectorization passes"));
114 
115 static cl::opt<int>
116  SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
117  cl::desc("Only vectorize if you gain more than this "
118  "number "));
119 
120 static cl::opt<bool>
121 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
122  cl::desc("Attempt to vectorize horizontal reductions"));
123 
124 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
125  "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
126  cl::desc(
127  "Attempt to vectorize horizontal reductions feeding into a store"));
128 
129 static cl::opt<int>
130 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
131  cl::desc("Attempt to vectorize for this register size in bits"));
132 
133 static cl::opt<unsigned>
134 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
135  cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
136 
137 static cl::opt<int>
138 MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
139  cl::desc("Maximum depth of the lookup for consecutive stores."));
140 
141 /// Limits the size of scheduling regions in a block.
142 /// It avoids long compile times for _very_ large blocks where vector
143 /// instructions are spread over a wide range.
144 /// This limit is way higher than needed by real-world functions.
145 static cl::opt<int>
146 ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
147  cl::desc("Limit the size of the SLP scheduling region per block"));
148 
149 static cl::opt<int> MinVectorRegSizeOption(
150  "slp-min-reg-size", cl::init(128), cl::Hidden,
151  cl::desc("Attempt to vectorize for this register size in bits"));
152 
153 static cl::opt<unsigned> RecursionMaxDepth(
154  "slp-recursion-max-depth", cl::init(12), cl::Hidden,
155  cl::desc("Limit the recursion depth when building a vectorizable tree"));
156 
157 static cl::opt<unsigned> MinTreeSize(
158  "slp-min-tree-size", cl::init(3), cl::Hidden,
159  cl::desc("Only vectorize small trees if they are fully vectorizable"));
160 
161 // The maximum depth that the look-ahead score heuristic will explore.
162 // The higher this value, the higher the compilation time overhead.
163 static cl::opt<int> LookAheadMaxDepth(
164  "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
165  cl::desc("The maximum look-ahead depth for operand reordering scores"));
166 
167 // The Look-ahead heuristic goes through the users of the bundle to calculate
168 // the users cost in getExternalUsesCost(). To avoid compilation time increase
169 // we limit the number of users visited to this value.
170 static cl::opt<int> LookAheadUsersBudget(
171  "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
172  cl::desc("The maximum number of users to visit while visiting the "
173  "predecessors. This prevents compilation time increase."));
174 
175 static cl::opt<bool>
176  ViewSLPTree("view-slp-tree", cl::Hidden,
177  cl::desc("Display the SLP trees with Graphviz"));
178 
179 // Limit the number of alias checks. The limit is chosen so that
180 // it has no negative effect on the llvm benchmarks.
181 static const unsigned AliasedCheckLimit = 10;
182 
183 // Another limit for the alias checks: The maximum distance between load/store
184 // instructions where alias checks are done.
185 // This limit is useful for very large basic blocks.
186 static const unsigned MaxMemDepDistance = 160;
187 
188 /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
189 /// regions to be handled.
190 static const int MinScheduleRegionSize = 16;
191 
192 /// Predicate for the element types that the SLP vectorizer supports.
193 ///
194 /// The most important thing to filter here are types which are invalid in LLVM
195 /// vectors. We also filter target specific types which have absolutely no
196 /// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
197 /// avoids spending time checking the cost model and realizing that they will
198 /// be inevitably scalarized.
199 static bool isValidElementType(Type *Ty) {
200  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
201  !Ty->isPPC_FP128Ty();
202 }
203 
204 /// \returns True if the value is a constant (but not globals/constant
205 /// expressions).
206 static bool isConstant(Value *V) {
207  return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V);
208 }
209 
210 /// Checks if \p V is one of the vector-like instructions, i.e. undef, an
211 /// insertelement/extractelement with constant indices for a fixed vector
212 /// type, or an extractvalue instruction.
213 static bool isVectorLikeInstWithConstOps(Value *V) {
214  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
215  !isa<ExtractValueInst, UndefValue>(V))
216  return false;
217  auto *I = dyn_cast<Instruction>(V);
218  if (!I || isa<ExtractValueInst>(I))
219  return true;
220  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
221  return false;
222  if (isa<ExtractElementInst>(I))
223  return isConstant(I->getOperand(1));
224  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
225  return isConstant(I->getOperand(2));
226 }
227 
228 /// \returns true if all of the instructions in \p VL are in the same block or
229 /// false otherwise.
230 static bool allSameBlock(ArrayRef<Value *> VL) {
231  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
232  if (!I0)
233  return false;
234  if (all_of(VL, isVectorLikeInstWithConstOps))
235  return true;
236 
237  BasicBlock *BB = I0->getParent();
238  for (int I = 1, E = VL.size(); I < E; I++) {
239  auto *II = dyn_cast<Instruction>(VL[I]);
240  if (!II)
241  return false;
242 
243  if (BB != II->getParent())
244  return false;
245  }
246  return true;
247 }
248 
249 /// \returns True if all of the values in \p VL are constants (but not
250 /// globals/constant expressions).
251 static bool allConstant(ArrayRef<Value *> VL) {
252  // Constant expressions and globals can't be vectorized like normal integer/FP
253  // constants.
254  return all_of(VL, isConstant);
255 }
256 
257 /// \returns True if all of the values in \p VL are identical or some of them
258 /// are UndefValue.
259 static bool isSplat(ArrayRef<Value *> VL) {
260  Value *FirstNonUndef = nullptr;
261  for (Value *V : VL) {
262  if (isa<UndefValue>(V))
263  continue;
264  if (!FirstNonUndef) {
265  FirstNonUndef = V;
266  continue;
267  }
268  if (V != FirstNonUndef)
269  return false;
270  }
271  return FirstNonUndef != nullptr;
272 }
273 
274 /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
275 static bool isCommutative(Instruction *I) {
276  if (auto *Cmp = dyn_cast<CmpInst>(I))
277  return Cmp->isCommutative();
278  if (auto *BO = dyn_cast<BinaryOperator>(I))
279  return BO->isCommutative();
280  // TODO: This should check for generic Instruction::isCommutative(), but
281  // we need to confirm that the caller code correctly handles Intrinsics
282  // for example (does not have 2 operands).
283  return false;
284 }
285 
286 /// Checks if the given value is actually an undefined constant vector.
287 static bool isUndefVector(const Value *V) {
288  if (isa<UndefValue>(V))
289  return true;
290  auto *C = dyn_cast<Constant>(V);
291  if (!C)
292  return false;
293  if (!C->containsUndefOrPoisonElement())
294  return false;
295  auto *VecTy = dyn_cast<FixedVectorType>(C->getType());
296  if (!VecTy)
297  return false;
298  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
299  if (Constant *Elem = C->getAggregateElement(I))
300  if (!isa<UndefValue>(Elem))
301  return false;
302  }
303  return true;
304 }
305 
306 /// Checks if the vector of instructions can be represented as a shuffle, like:
307 /// %x0 = extractelement <4 x i8> %x, i32 0
308 /// %x3 = extractelement <4 x i8> %x, i32 3
309 /// %y1 = extractelement <4 x i8> %y, i32 1
310 /// %y2 = extractelement <4 x i8> %y, i32 2
311 /// %x0x0 = mul i8 %x0, %x0
312 /// %x3x3 = mul i8 %x3, %x3
313 /// %y1y1 = mul i8 %y1, %y1
314 /// %y2y2 = mul i8 %y2, %y2
315 /// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
316 /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
317 /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
318 /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
319 /// ret <4 x i8> %ins4
320 /// can be transformed into:
321 /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
322 /// i32 6>
323 /// %2 = mul <4 x i8> %1, %1
324 /// ret <4 x i8> %2
325 /// We convert this initially to something like:
326 /// %x0 = extractelement <4 x i8> %x, i32 0
327 /// %x3 = extractelement <4 x i8> %x, i32 3
328 /// %y1 = extractelement <4 x i8> %y, i32 1
329 /// %y2 = extractelement <4 x i8> %y, i32 2
330 /// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
331 /// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
332 /// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
333 /// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
334 /// %5 = mul <4 x i8> %4, %4
335 /// %6 = extractelement <4 x i8> %5, i32 0
336 /// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
337 /// %7 = extractelement <4 x i8> %5, i32 1
338 /// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
339 /// %8 = extractelement <4 x i8> %5, i32 2
340 /// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
341 /// %9 = extractelement <4 x i8> %5, i32 3
342 /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
343 /// ret <4 x i8> %ins4
344 /// InstCombiner transforms this into a shuffle and vector mul
345 /// Mask will return the Shuffle Mask equivalent to the extracted elements.
346 /// TODO: Can we split off and reuse the shuffle mask detection from
347 /// TargetTransformInfo::getInstructionThroughput?
348 static Optional<TargetTransformInfo::ShuffleKind>
349 isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
350  const auto *It =
351  find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
352  if (It == VL.end())
353  return None;
354  auto *EI0 = cast<ExtractElementInst>(*It);
355  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
356  return None;
357  unsigned Size =
358  cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
359  Value *Vec1 = nullptr;
360  Value *Vec2 = nullptr;
361  enum ShuffleMode { Unknown, Select, Permute };
362  ShuffleMode CommonShuffleMode = Unknown;
363  Mask.assign(VL.size(), UndefMaskElem);
364  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
365  // Undef can be represented as an undef element in a vector.
366  if (isa<UndefValue>(VL[I]))
367  continue;
368  auto *EI = cast<ExtractElementInst>(VL[I]);
369  if (isa<ScalableVectorType>(EI->getVectorOperandType()))
370  return None;
371  auto *Vec = EI->getVectorOperand();
372  // We can extractelement from undef or poison vector.
373  if (isUndefVector(Vec))
374  continue;
375  // All vector operands must have the same number of vector elements.
376  if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
377  return None;
378  if (isa<UndefValue>(EI->getIndexOperand()))
379  continue;
380  auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
381  if (!Idx)
382  return None;
383  // Undefined behavior if Idx is negative or >= Size.
384  if (Idx->getValue().uge(Size))
385  continue;
386  unsigned IntIdx = Idx->getValue().getZExtValue();
387  Mask[I] = IntIdx;
388  // For correct shuffling we have to have at most 2 different vector operands
389  // in all extractelement instructions.
390  if (!Vec1 || Vec1 == Vec) {
391  Vec1 = Vec;
392  } else if (!Vec2 || Vec2 == Vec) {
393  Vec2 = Vec;
394  Mask[I] += Size;
395  } else {
396  return None;
397  }
398  if (CommonShuffleMode == Permute)
399  continue;
400  // If the extract index is not the same as the operation number, it is a
401  // permutation.
402  if (IntIdx != I) {
403  CommonShuffleMode = Permute;
404  continue;
405  }
406  CommonShuffleMode = Select;
407  }
408  // If we're not crossing lanes in different vectors, consider it as blending.
409  if (CommonShuffleMode == Select && Vec2)
410  return TargetTransformInfo::SK_Select;
411  // If Vec2 was never used, we have a permutation of a single vector, otherwise
412  // we have a permutation of 2 vectors.
413  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
414  : TargetTransformInfo::SK_PermuteSingleSrc;
415 }
416 
417 namespace {
418 
419 /// Main data required for vectorization of instructions.
420 struct InstructionsState {
421  /// The very first instruction in the list with the main opcode.
422  Value *OpValue = nullptr;
423 
424  /// The main/alternate instruction.
425  Instruction *MainOp = nullptr;
426  Instruction *AltOp = nullptr;
427 
428  /// The main/alternate opcodes for the list of instructions.
429  unsigned getOpcode() const {
430  return MainOp ? MainOp->getOpcode() : 0;
431  }
432 
433  unsigned getAltOpcode() const {
434  return AltOp ? AltOp->getOpcode() : 0;
435  }
436 
437  /// Some of the instructions in the list have alternate opcodes.
438  bool isAltShuffle() const { return AltOp != MainOp; }
439 
440  bool isOpcodeOrAlt(Instruction *I) const {
441  unsigned CheckedOpcode = I->getOpcode();
442  return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
443  }
444 
445  InstructionsState() = delete;
446  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
447  : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
448 };
449 
450 } // end anonymous namespace
451 
452 /// Chooses the correct key for scheduling data. If \p Op has the same (or
453 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
454 /// OpValue.
455 static Value *isOneOf(const InstructionsState &S, Value *Op) {
456  auto *I = dyn_cast<Instruction>(Op);
457  if (I && S.isOpcodeOrAlt(I))
458  return Op;
459  return S.OpValue;
460 }
461 
462 /// \returns true if \p Opcode is allowed as part of the main/alternate
463 /// instruction for SLP vectorization.
464 ///
465 /// Example of unsupported opcode is SDIV that can potentially cause UB if the
466 /// "shuffled out" lane would result in division by zero.
467 static bool isValidForAlternation(unsigned Opcode) {
468  if (Instruction::isIntDivRem(Opcode))
469  return false;
470 
471  return true;
472 }
473 
474 /// \returns analysis of the Instructions in \p VL described in
475 /// InstructionsState, i.e. the opcode with which we suppose the whole list
476 /// could be vectorized even if its structure is diverse.
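/// For example, for VL = {add, sub, add, sub} the returned state has MainOp
/// set to the first add and AltOp set to the first sub (an alternate-opcode
/// group), whereas a list containing any non-Instruction value yields a state
/// with a null MainOp, i.e. getOpcode() == 0.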
477 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
478  unsigned BaseIndex = 0) {
479  // Make sure these are all Instructions.
480  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
481  return InstructionsState(VL[BaseIndex], nullptr, nullptr);
482 
483  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
484  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
485  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
486  unsigned AltOpcode = Opcode;
487  unsigned AltIndex = BaseIndex;
488 
489  // Check for one alternate opcode from another BinaryOperator.
490  // TODO - generalize to support all operators (types, calls etc.).
491  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
492  unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
493  if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
494  if (InstOpcode == Opcode || InstOpcode == AltOpcode)
495  continue;
496  if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
497  isValidForAlternation(Opcode)) {
498  AltOpcode = InstOpcode;
499  AltIndex = Cnt;
500  continue;
501  }
502  } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
503  Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
504  Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
505  if (Ty0 == Ty1) {
506  if (InstOpcode == Opcode || InstOpcode == AltOpcode)
507  continue;
508  if (Opcode == AltOpcode) {
509  assert(isValidForAlternation(Opcode) &&
510  isValidForAlternation(InstOpcode) &&
511  "Cast isn't safe for alternation, logic needs to be updated!");
512  AltOpcode = InstOpcode;
513  AltIndex = Cnt;
514  continue;
515  }
516  }
517  } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
518  continue;
519  return InstructionsState(VL[BaseIndex], nullptr, nullptr);
520  }
521 
522  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
523  cast<Instruction>(VL[AltIndex]));
524 }
525 
526 /// \returns true if all of the values in \p VL have the same type or false
527 /// otherwise.
528 static bool allSameType(ArrayRef<Value *> VL) {
529  Type *Ty = VL[0]->getType();
530  for (int i = 1, e = VL.size(); i < e; i++)
531  if (VL[i]->getType() != Ty)
532  return false;
533 
534  return true;
535 }
536 
537 /// \returns True if Extract{Value,Element} instruction extracts element Idx.
538 static Optional<unsigned> getExtractIndex(Instruction *E) {
539  unsigned Opcode = E->getOpcode();
540  assert((Opcode == Instruction::ExtractElement ||
541  Opcode == Instruction::ExtractValue) &&
542  "Expected extractelement or extractvalue instruction.");
543  if (Opcode == Instruction::ExtractElement) {
544  auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
545  if (!CI)
546  return None;
547  return CI->getZExtValue();
548  }
549  ExtractValueInst *EI = cast<ExtractValueInst>(E);
550  if (EI->getNumIndices() != 1)
551  return None;
552  return *EI->idx_begin();
553 }
554 
555 /// \returns True if in-tree use also needs extract. This refers to
556 /// a possible scalar operand in a vectorized instruction.
557 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
558  TargetLibraryInfo *TLI) {
559  unsigned Opcode = UserInst->getOpcode();
560  switch (Opcode) {
561  case Instruction::Load: {
562  LoadInst *LI = cast<LoadInst>(UserInst);
563  return (LI->getPointerOperand() == Scalar);
564  }
565  case Instruction::Store: {
566  StoreInst *SI = cast<StoreInst>(UserInst);
567  return (SI->getPointerOperand() == Scalar);
568  }
569  case Instruction::Call: {
570  CallInst *CI = cast<CallInst>(UserInst);
571  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
572  for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
573  if (hasVectorInstrinsicScalarOpd(ID, i))
574  return (CI->getArgOperand(i) == Scalar);
575  }
576  LLVM_FALLTHROUGH;
577  }
578  default:
579  return false;
580  }
581 }
582 
583 /// \returns the AA location that is being accessed by the instruction.
584 static MemoryLocation getLocation(Instruction *I, AAResults *AA) {
585  if (StoreInst *SI = dyn_cast<StoreInst>(I))
586  return MemoryLocation::get(SI);
587  if (LoadInst *LI = dyn_cast<LoadInst>(I))
588  return MemoryLocation::get(LI);
589  return MemoryLocation();
590 }
591 
592 /// \returns True if the instruction is not a volatile or atomic load/store.
593 static bool isSimple(Instruction *I) {
594  if (LoadInst *LI = dyn_cast<LoadInst>(I))
595  return LI->isSimple();
596  if (StoreInst *SI = dyn_cast<StoreInst>(I))
597  return SI->isSimple();
598  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
599  return !MI->isVolatile();
600  return true;
601 }
602 
603 /// Shuffles \p Mask in accordance with the given \p SubMask.
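/// For example, with Mask = {2, 0, 1} and SubMask = {1, 2, 0} the loop below
/// produces NewMask = {Mask[1], Mask[2], Mask[0]} = {0, 1, 2}; SubMask entries
/// that are UndefMaskElem or out of range are left as UndefMaskElem.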
604 static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
605  if (SubMask.empty())
606  return;
607  if (Mask.empty()) {
608  Mask.append(SubMask.begin(), SubMask.end());
609  return;
610  }
611  SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
612  int TermValue = std::min(Mask.size(), SubMask.size());
613  for (int I = 0, E = SubMask.size(); I < E; ++I) {
614  if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
615  Mask[SubMask[I]] >= TermValue)
616  continue;
617  NewMask[I] = Mask[SubMask[I]];
618  }
619  Mask.swap(NewMask);
620 }
621 
622 /// Order may have elements assigned a special value (size) which is out of
623 /// bounds. Such indices only appear in places which correspond to undef values
624 /// (see canReuseExtract for details) and are used to avoid undef values
625 /// having an effect on the operand ordering.
626 /// The first loop below simply finds all unused indices and then the next loop
627 /// nest assigns these indices to the positions of the undef values.
628 /// As an example, below Order has two undef positions, which are assigned
629 /// the values 3 and 7 respectively:
630 /// before: 6 9 5 4 9 2 1 0
631 /// after: 6 3 5 4 7 2 1 0
632 static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
633  const unsigned Sz = Order.size();
634  SmallBitVector UnusedIndices(Sz, /*t=*/true);
635  SmallBitVector MaskedIndices(Sz);
636  for (unsigned I = 0; I < Sz; ++I) {
637  if (Order[I] < Sz)
638  UnusedIndices.reset(Order[I]);
639  else
640  MaskedIndices.set(I);
641  }
642  if (MaskedIndices.none())
643  return;
644  assert(UnusedIndices.count() == MaskedIndices.count() &&
645  "Non-synced masked/available indices.");
646  int Idx = UnusedIndices.find_first();
647  int MIdx = MaskedIndices.find_first();
648  while (MIdx >= 0) {
649  assert(Idx >= 0 && "Indices must be synced.");
650  Order[MIdx] = Idx;
651  Idx = UnusedIndices.find_next(Idx);
652  MIdx = MaskedIndices.find_next(MIdx);
653  }
654 }
655 
656 namespace llvm {
657 
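// For example, for Indices = {2, 0, 1} the loop below sets Mask[2] = 0,
// Mask[0] = 1 and Mask[1] = 2, producing Mask = {1, 2, 0}, the inverse
// permutation of Indices.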
658 static void inversePermutation(ArrayRef<unsigned> Indices,
659  SmallVectorImpl<int> &Mask) {
660  Mask.clear();
661  const unsigned E = Indices.size();
662  Mask.resize(E, UndefMaskElem);
663  for (unsigned I = 0; I < E; ++I)
664  Mask[Indices[I]] = I;
665 }
666 
667 /// \returns the insert index of an InsertElement or InsertValue instruction,
668 /// using \p Offset as the base offset for the index.
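/// For example, an insertvalue into {[2 x i16], [2 x i16]} at indices {1, 0}
/// with Offset 0 yields Index = (0 * 2 + 1) * 2 + 0 = 2, the flattened
/// position of that element; an insertelement with a constant index simply
/// returns Offset * NumElements + that index.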
669 static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) {
670  int Index = Offset;
671  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
672  if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
673  auto *VT = cast<FixedVectorType>(IE->getType());
674  if (CI->getValue().uge(VT->getNumElements()))
675  return UndefMaskElem;
676  Index *= VT->getNumElements();
677  Index += CI->getZExtValue();
678  return Index;
679  }
680  if (isa<UndefValue>(IE->getOperand(2)))
681  return UndefMaskElem;
682  return None;
683  }
684 
685  auto *IV = cast<InsertValueInst>(InsertInst);
686  Type *CurrentType = IV->getType();
687  for (unsigned I : IV->indices()) {
688  if (auto *ST = dyn_cast<StructType>(CurrentType)) {
689  Index *= ST->getNumElements();
690  CurrentType = ST->getElementType(I);
691  } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
692  Index *= AT->getNumElements();
693  CurrentType = AT->getElementType();
694  } else {
695  return None;
696  }
697  Index += I;
698  }
699  return Index;
700 }
701 
702 /// Reorders the list of scalars in accordance with the given \p Order and then
703 /// the \p Mask. \p Order is the original order of the scalars; the scalars are
704 /// first reordered into an unordered state according to the given order. Then
705 /// the ordered scalars are shuffled once again in accordance with
706 /// the provided mask.
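/// For example, with Scalars = {a, b, c} and Mask = {1, 2, 0} the loop below
/// moves the old element 0 to position 1, element 1 to position 2 and
/// element 2 to position 0, giving Scalars = {c, a, b}.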
707 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
708  ArrayRef<int> Mask) {
709  assert(!Mask.empty() && "Expected non-empty mask.");
710  SmallVector<Value *> Prev(Scalars.size(),
711  UndefValue::get(Scalars.front()->getType()));
712  Prev.swap(Scalars);
713  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
714  if (Mask[I] != UndefMaskElem)
715  Scalars[Mask[I]] = Prev[I];
716 }
717 
718 namespace slpvectorizer {
719 
720 /// Bottom Up SLP Vectorizer.
721 class BoUpSLP {
722  struct TreeEntry;
723  struct ScheduleData;
724 
725 public:
733 
734  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
735  TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
736  DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
737  const DataLayout *DL, OptimizationRemarkEmitter *ORE)
738  : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
739  DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
740  CodeMetrics::collectEphemeralValues(F, AC, EphValues);
741  // Use the vector register size specified by the target unless overridden
742  // by a command-line option.
743  // TODO: It would be better to limit the vectorization factor based on
744  // data type rather than just register size. For example, x86 AVX has
745  // 256-bit registers, but it does not support integer operations
746  // at that width (that requires AVX2).
747  if (MaxVectorRegSizeOption.getNumOccurrences())
748  MaxVecRegSize = MaxVectorRegSizeOption;
749  else
750  MaxVecRegSize =
751  TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
752  .getFixedSize();
753 
754  if (MinVectorRegSizeOption.getNumOccurrences())
755  MinVecRegSize = MinVectorRegSizeOption;
756  else
757  MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
758  }
759 
760  /// Vectorize the tree that starts with the elements in \p VL.
761  /// Returns the vectorized root.
762  Value *vectorizeTree();
763 
764  /// Vectorize the tree but with the list of externally used values \p
765  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
766  /// generated extractvalue instructions.
767  Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
768 
769  /// \returns the cost incurred by unwanted spills and fills, caused by
770  /// holding live values over call sites.
771  InstructionCost getSpillCost() const;
772 
773  /// \returns the vectorization cost of the subtree that starts at \p VL.
774  /// A negative number means that this is profitable.
775  InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None);
776 
777  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
778  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
779  void buildTree(ArrayRef<Value *> Roots,
780  ArrayRef<Value *> UserIgnoreLst = None);
781 
782  /// Builds external uses of the vectorized scalars, i.e. the list of
783  /// vectorized scalars to be extracted, their lanes and their scalar users. \p
784  /// ExternallyUsedValues contains an additional list of external uses to handle
785  /// vectorization of reductions.
786  void
787  buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
788 
789  /// Clear the internal data structures that are created by 'buildTree'.
790  void deleteTree() {
791  VectorizableTree.clear();
792  ScalarToTreeEntry.clear();
793  MustGather.clear();
794  ExternalUses.clear();
795  for (auto &Iter : BlocksSchedules) {
796  BlockScheduling *BS = Iter.second.get();
797  BS->clear();
798  }
799  MinBWs.clear();
800  InstrElementSize.clear();
801  }
802 
803  unsigned getTreeSize() const { return VectorizableTree.size(); }
804 
805  /// Perform LICM and CSE on the newly generated gather sequences.
806  void optimizeGatherSequence();
807 
808  /// Checks if the specified gather tree entry \p TE can be represented as a
809  /// shuffled vector entry + (possibly) permutation with other gathers. It
810  /// implements the checks only for possibly ordered scalars (Loads,
811  /// ExtractElement, ExtractValue), which can be part of the graph.
812  Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
813 
814  /// Gets reordering data for the given tree entry. If the entry is vectorized
815  /// - just return ReorderIndices, otherwise check if the scalars can be
816  /// reordered and return the most optimal order.
817  /// \param TopToBottom If true, include the order of vectorized stores and
818  /// insertelement nodes, otherwise skip them.
819  Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
820 
821  /// Reorders the current graph to the most profitable order starting from the
822  /// root node to the leaf nodes. The best order is chosen only from the nodes
823  /// of the same size (vectorization factor). Smaller nodes are considered
824  /// parts of a subgraph with a smaller VF and they are reordered independently. We
825  /// can do this because we still need to extend smaller nodes to the wider VF
826  /// and we can merge reordering shuffles with the widening shuffles.
827  void reorderTopToBottom();
828 
829  /// Reorders the current graph to the most profitable order starting from
830  /// the leaves to the root. It allows rotating small subgraphs and reduces the
831  /// number of reshuffles if the leaf nodes use the same order. In this case we
832  /// can merge the orders and just shuffle the user node instead of shuffling its
833  /// operands. Plus, even if the leaf nodes have different orders, it allows
834  /// sinking the reordering in the graph closer to the root node and merging it later
835  /// during analysis.
836  void reorderBottomToTop(bool IgnoreReorder = false);
837 
838  /// \return The vector element size in bits to use when vectorizing the
839  /// expression tree ending at \p V. If V is a store, the size is the width of
840  /// the stored value. Otherwise, the size is the width of the largest loaded
841  /// value reaching V. This method is used by the vectorizer to calculate
842  /// vectorization factors.
843  unsigned getVectorElementSize(Value *V);
844 
845  /// Compute the minimum type sizes required to represent the entries in a
846  /// vectorizable tree.
847  void computeMinimumValueSizes();
848 
849  // \returns maximum vector register size as set by TTI or overridden by cl::opt.
850  unsigned getMaxVecRegSize() const {
851  return MaxVecRegSize;
852  }
853 
854  // \returns minimum vector register size as set by cl::opt.
855  unsigned getMinVecRegSize() const {
856  return MinVecRegSize;
857  }
858 
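  // For example, with the default MinVecRegSize of 128 bits and 32-bit
  // elements, getMinVF(32) returns std::max(2U, 128 / 32) = 4 lanes.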
859  unsigned getMinVF(unsigned Sz) const {
860  return std::max(2U, getMinVecRegSize() / Sz);
861  }
862 
863  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
864  unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
865  MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
866  return MaxVF ? MaxVF : UINT_MAX;
867  }
868 
869  /// Check if homogeneous aggregate is isomorphic to some VectorType.
870  /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
871  /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
872  /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
873  ///
874  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
875  unsigned canMapToVector(Type *T, const DataLayout &DL) const;
876 
877  /// \returns True if the VectorizableTree is both tiny and not fully
878  /// vectorizable. We do not vectorize such trees.
879  bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
880 
881  /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
882  /// can be load combined in the backend. Load combining may not be allowed in
883  /// the IR optimizer, so we do not want to alter the pattern. For example,
884  /// partially transforming a scalar bswap() pattern into vector code is
885  /// effectively impossible for the backend to undo.
886  /// TODO: If load combining is allowed in the IR optimizer, this analysis
887  /// may not be necessary.
888  bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
889 
890  /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
891  /// can be load combined in the backend. Load combining may not be allowed in
892  /// the IR optimizer, so we do not want to alter the pattern. For example,
893  /// partially transforming a scalar bswap() pattern into vector code is
894  /// effectively impossible for the backend to undo.
895  /// TODO: If load combining is allowed in the IR optimizer, this analysis
896  /// may not be necessary.
897  bool isLoadCombineCandidate() const;
898 
900 
901  /// This structure holds any data we need about the edges being traversed
902  /// during buildTree_rec(). We keep track of:
903  /// (i) the user TreeEntry index, and
904  /// (ii) the index of the edge.
905  struct EdgeInfo {
906  EdgeInfo() = default;
907  EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
908  : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
909  /// The user TreeEntry.
910  TreeEntry *UserTE = nullptr;
911  /// The operand index of the use.
912  unsigned EdgeIdx = UINT_MAX;
913 #ifndef NDEBUG
914  friend inline raw_ostream &operator<<(raw_ostream &OS,
915  const BoUpSLP::EdgeInfo &EI) {
916  EI.dump(OS);
917  return OS;
918  }
919  /// Debug print.
920  void dump(raw_ostream &OS) const {
921  OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
922  << " EdgeIdx:" << EdgeIdx << "}";
923  }
924  LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
925 #endif
926  };
927 
928  /// A helper data structure to hold the operands of a vector of instructions.
929  /// This supports a fixed vector length for all operand vectors.
930  class VLOperands {
931  /// For each operand we need (i) the value, and (ii) the opcode that it
932  /// would be attached to if the expression was in a left-linearized form.
933  /// This is required to avoid illegal operand reordering.
934  /// For example:
935  /// \verbatim
936  /// 0 Op1
937  /// |/
938  /// Op1 Op2 Linearized + Op2
939  /// \ / ----------> |/
940  /// - -
941  ///
942  /// Op1 - Op2 (0 + Op1) - Op2
943  /// \endverbatim
944  ///
945  /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
946  ///
947  /// Another way to think of this is to track all the operations across the
948  /// path from the operand all the way to the root of the tree and to
949  /// calculate the operation that corresponds to this path. For example, the
950  /// path from Op2 to the root crosses the RHS of the '-', therefore the
951  /// corresponding operation is a '-' (which matches the one in the
952  /// linearized tree, as shown above).
953  ///
954  /// For lack of a better term, we refer to this operation as Accumulated
955  /// Path Operation (APO).
956  struct OperandData {
957  OperandData() = default;
958  OperandData(Value *V, bool APO, bool IsUsed)
959  : V(V), APO(APO), IsUsed(IsUsed) {}
960  /// The operand value.
961  Value *V = nullptr;
962  /// TreeEntries only allow a single opcode, or an alternate sequence of
963  /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
964  /// APO. It is set to 'true' if 'V' is attached to an inverse operation
965  /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
966  /// (e.g., Add/Mul)
967  bool APO = false;
968  /// Helper data for the reordering function.
969  bool IsUsed = false;
970  };
971 
972  /// During operand reordering, we are trying to select the operand at lane
973  /// that matches best with the operand at the neighboring lane. Our
974  /// selection is based on the type of value we are looking for. For example,
975  /// if the neighboring lane has a load, we need to look for a load that is
976  /// accessing a consecutive address. These strategies are summarized in the
977  /// 'ReorderingMode' enumerator.
978  enum class ReorderingMode {
979  Load, ///< Matching loads to consecutive memory addresses
980  Opcode, ///< Matching instructions based on opcode (same or alternate)
981  Constant, ///< Matching constants
982  Splat, ///< Matching the same instruction multiple times (broadcast)
983  Failed, ///< We failed to create a vectorizable group
984  };
985 
986  using OperandDataVec = SmallVector<OperandData, 2>;
987 
988  /// A vector of operand vectors.
989  SmallVector<OperandDataVec, 4> OpsVec;
990 
991  const DataLayout &DL;
992  ScalarEvolution &SE;
993  const BoUpSLP &R;
994 
995  /// \returns the operand data at \p OpIdx and \p Lane.
996  OperandData &getData(unsigned OpIdx, unsigned Lane) {
997  return OpsVec[OpIdx][Lane];
998  }
999 
1000  /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1001  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1002  return OpsVec[OpIdx][Lane];
1003  }
1004 
1005  /// Clears the used flag for all entries.
1006  void clearUsed() {
1007  for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1008  OpIdx != NumOperands; ++OpIdx)
1009  for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1010  ++Lane)
1011  OpsVec[OpIdx][Lane].IsUsed = false;
1012  }
1013 
1014  /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1015  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1016  std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1017  }
1018 
1019  // The hard-coded scores listed here are not very important, though they should
1020  // be higher for better matches to improve the resulting cost. When
1021  // computing the scores of matching one sub-tree with another, we are
1022  // basically counting the number of values that are matching. So even if all
1023  // scores are set to 1, we would still get a decent matching result.
1024  // However, sometimes we have to break ties. For example we may have to
1025  // choose between matching loads vs matching opcodes. This is what these
1026  // scores are helping us with: they provide the order of preference. Also,
1027  // this is important if the scalar is externally used or used in another
1028  // tree entry node in a different lane.
1029 
1030  /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1031  static const int ScoreConsecutiveLoads = 4;
1032  /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1033  static const int ScoreReversedLoads = 3;
1034  /// ExtractElementInst from same vector and consecutive indexes.
1035  static const int ScoreConsecutiveExtracts = 4;
1036  /// ExtractElementInst from same vector and reversed indices.
1037  static const int ScoreReversedExtracts = 3;
1038  /// Constants.
1039  static const int ScoreConstants = 2;
1040  /// Instructions with the same opcode.
1041  static const int ScoreSameOpcode = 2;
1042  /// Instructions with alt opcodes (e.g, add + sub).
1043  static const int ScoreAltOpcodes = 1;
1044  /// Identical instructions (a.k.a. splat or broadcast).
1045  static const int ScoreSplat = 1;
1046  /// Matching with an undef is preferable to failing.
1047  static const int ScoreUndef = 1;
1048  /// Score for failing to find a decent match.
1049  static const int ScoreFail = 0;
1050  /// User external to the vectorized code.
1051  static const int ExternalUseCost = 1;
1052  /// The user is internal but in a different lane.
1053  static const int UserInDiffLaneCost = ExternalUseCost;
1054 
1055  /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
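  /// For example, loads of A[i] and A[i+1] from the same basic block typically
  /// score ScoreConsecutiveLoads (4), the reversed pair scores
  /// ScoreReversedLoads (3), two distinct constants score ScoreConstants (2),
  /// and values matching none of the patterns below score ScoreFail (0).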
1056  static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
1057  ScalarEvolution &SE, int NumLanes) {
1058  if (V1 == V2)
1059  return VLOperands::ScoreSplat;
1060 
1061  auto *LI1 = dyn_cast<LoadInst>(V1);
1062  auto *LI2 = dyn_cast<LoadInst>(V2);
1063  if (LI1 && LI2) {
1064  if (LI1->getParent() != LI2->getParent())
1065  return VLOperands::ScoreFail;
1066 
1067  Optional<int> Dist = getPointersDiff(
1068  LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1069  LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1070  if (!Dist)
1071  return VLOperands::ScoreFail;
1072  // The distance is too large - still may be profitable to use masked
1073  // loads/gathers.
1074  if (std::abs(*Dist) > NumLanes / 2)
1075  return VLOperands::ScoreAltOpcodes;
1076  // This still will detect consecutive loads, but we might have "holes"
1077  // in some cases. It is ok for non-power-2 vectorization and may produce
1078  // better results. It should not affect current vectorization.
1079  return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads
1080  : VLOperands::ScoreReversedLoads;
1081  }
1082 
1083  auto *C1 = dyn_cast<Constant>(V1);
1084  auto *C2 = dyn_cast<Constant>(V2);
1085  if (C1 && C2)
1086  return VLOperands::ScoreConstants;
1087 
1088  // Extracts from consecutive indexes of the same vector score better, as
1089  // the extracts could be optimized away.
1090  Value *EV1;
1091  ConstantInt *Ex1Idx;
1092  if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1093  // Undefs are always profitable for extractelements.
1094  if (isa<UndefValue>(V2))
1095  return VLOperands::ScoreConsecutiveExtracts;
1096  Value *EV2 = nullptr;
1097  ConstantInt *Ex2Idx = nullptr;
1098  if (match(V2,
1099  m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1100  m_Undef())))) {
1101  // Undefs are always profitable for extractelements.
1102  if (!Ex2Idx)
1103  return VLOperands::ScoreConsecutiveExtracts;
1104  if (isUndefVector(EV2) && EV2->getType() == EV1->getType())
1105  return VLOperands::ScoreConsecutiveExtracts;
1106  if (EV2 == EV1) {
1107  int Idx1 = Ex1Idx->getZExtValue();
1108  int Idx2 = Ex2Idx->getZExtValue();
1109  int Dist = Idx2 - Idx1;
1110  // The distance is too large - still may be profitable to use
1111  // shuffles.
1112  if (std::abs(Dist) > NumLanes / 2)
1113  return VLOperands::ScoreAltOpcodes;
1114  return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts
1115  : VLOperands::ScoreReversedExtracts;
1116  }
1117  }
1118  }
1119 
1120  auto *I1 = dyn_cast<Instruction>(V1);
1121  auto *I2 = dyn_cast<Instruction>(V2);
1122  if (I1 && I2) {
1123  if (I1->getParent() != I2->getParent())
1124  return VLOperands::ScoreFail;
1125  InstructionsState S = getSameOpcode({I1, I2});
1126  // Note: Only consider instructions with <= 2 operands to avoid
1127  // complexity explosion.
1128  if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
1129  return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
1130  : VLOperands::ScoreSameOpcode;
1131  }
1132 
1133  if (isa<UndefValue>(V2))
1134  return VLOperands::ScoreUndef;
1135 
1136  return VLOperands::ScoreFail;
1137  }
1138 
1139  /// Holds the values and their lanes that are taking part in the look-ahead
1140  /// score calculation. This is used in the external uses cost calculation.
1141  /// We need to hold all the lanes, at least in the case of a splat/broadcast,
1142  /// to correctly check for a use in a different lane.
1143  SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues;
1144 
1145  /// \returns the additional cost due to uses of \p LHS and \p RHS that are
1146  /// either external to the vectorized code, or require shuffling.
1147  int getExternalUsesCost(const std::pair<Value *, int> &LHS,
1148  const std::pair<Value *, int> &RHS) {
1149  int Cost = 0;
1150  std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
1151  for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
1152  Value *V = Values[Idx].first;
1153  if (isa<Constant>(V)) {
1154  // Since this is a function pass, it doesn't make semantic sense to
1155  // walk the users of a subclass of Constant. The users could be in
1156  // another function, or even another module that happens to be in
1157  // the same LLVMContext.
1158  continue;
1159  }
1160 
1161  // Calculate the absolute lane, using the minimum relative lane of LHS
1162  // and RHS as base and Idx as the offset.
1163  int Ln = std::min(LHS.second, RHS.second) + Idx;
1164  assert(Ln >= 0 && "Bad lane calculation");
1165  unsigned UsersBudget = LookAheadUsersBudget;
1166  for (User *U : V->users()) {
1167  if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
1168  // The user is in the VectorizableTree. Check if we need to insert.
1169  int UserLn = UserTE->findLaneForValue(U);
1170  assert(UserLn >= 0 && "Bad lane");
1171  // If the values are different, check just the lane of the current
1172  // value. If the values are the same, we need to add UserInDiffLaneCost
1173  // only if UserLn does not match both lane numbers.
1174  if ((LHS.first != RHS.first && UserLn != Ln) ||
1175  (LHS.first == RHS.first && UserLn != LHS.second &&
1176  UserLn != RHS.second)) {
1177  Cost += UserInDiffLaneCost;
1178  break;
1179  }
1180  } else {
1181  // Check if the user is in the look-ahead code.
1182  auto It2 = InLookAheadValues.find(U);
1183  if (It2 != InLookAheadValues.end()) {
1184  // The user is in the look-ahead code. Check the lane.
1185  if (!It2->getSecond().contains(Ln)) {
1186  Cost += UserInDiffLaneCost;
1187  break;
1188  }
1189  } else {
1190  // The user is neither in SLP tree nor in the look-ahead code.
1191  Cost += ExternalUseCost;
1192  break;
1193  }
1194  }
1195  // Limit the number of visited uses to cap compilation time.
1196  if (--UsersBudget == 0)
1197  break;
1198  }
1199  }
1200  return Cost;
1201  }
1202 
1203  /// Go through the operands of \p LHS and \p RHS recursively until \p
1204  /// MaxLevel, and return the cumulative score. For example:
1205  /// \verbatim
1206  /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1207  /// \ / \ / \ / \ /
1208  /// + + + +
1209  /// G1 G2 G3 G4
1210  /// \endverbatim
1211  /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1212  /// each level recursively, accumulating the score. It starts from matching
1213  /// the additions at level 0, then moves on to the loads (level 1). The
1214  /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1215  /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
1216  /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
1217  /// Please note that the order of the operands does not matter, as we
1218  /// evaluate the score of all profitable combinations of operands. In
1219  /// other words the score of G1 and G4 is the same as G1 and G2. This
1220  /// heuristic is based on ideas described in:
1221  /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1222  /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1223  /// Luís F. W. Góes
1224  int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
1225  const std::pair<Value *, int> &RHS, int CurrLevel,
1226  int MaxLevel) {
1227 
1228  Value *V1 = LHS.first;
1229  Value *V2 = RHS.first;
1230  // Get the shallow score of V1 and V2.
1231  int ShallowScoreAtThisLevel = std::max(
1232  (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) -
1233  getExternalUsesCost(LHS, RHS));
1234  int Lane1 = LHS.second;
1235  int Lane2 = RHS.second;
1236 
1237  // If reached MaxLevel,
1238  // or if V1 and V2 are not instructions,
1239  // or if they are SPLAT,
1240  // or if they are not consecutive,
1241  // or if profitable to vectorize loads or extractelements, early return
1242  // the current cost.
1243  auto *I1 = dyn_cast<Instruction>(V1);
1244  auto *I2 = dyn_cast<Instruction>(V2);
1245  if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1246  ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
1247  (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1248  (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1249  ShallowScoreAtThisLevel))
1250  return ShallowScoreAtThisLevel;
1251  assert(I1 && I2 && "Should have early exited.");
1252 
1253  // Keep track of in-tree values for determining the external-use cost.
1254  InLookAheadValues[V1].insert(Lane1);
1255  InLookAheadValues[V2].insert(Lane2);
1256 
1257  // Contains the I2 operand indexes that got matched with I1 operands.
1258  SmallSet<unsigned, 4> Op2Used;
1259 
1260  // Recursion towards the operands of I1 and I2. We are trying all possible
1261  // operand pairs, and keeping track of the best score.
1262  for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1263  OpIdx1 != NumOperands1; ++OpIdx1) {
1264  // Try to pair op1I with the best operand of I2.
1265  int MaxTmpScore = 0;
1266  unsigned MaxOpIdx2 = 0;
1267  bool FoundBest = false;
1268  // If I2 is commutative try all combinations.
1269  unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1270  unsigned ToIdx = isCommutative(I2)
1271  ? I2->getNumOperands()
1272  : std::min(I2->getNumOperands(), OpIdx1 + 1);
1273  assert(FromIdx <= ToIdx && "Bad index");
1274  for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1275  // Skip operands already paired with OpIdx1.
1276  if (Op2Used.count(OpIdx2))
1277  continue;
1278  // Recursively calculate the cost at each level
1279  int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
1280  {I2->getOperand(OpIdx2), Lane2},
1281  CurrLevel + 1, MaxLevel);
1282  // Look for the best score.
1283  if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
1284  MaxTmpScore = TmpScore;
1285  MaxOpIdx2 = OpIdx2;
1286  FoundBest = true;
1287  }
1288  }
1289  if (FoundBest) {
1290  // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1291  Op2Used.insert(MaxOpIdx2);
1292  ShallowScoreAtThisLevel += MaxTmpScore;
1293  }
1294  }
1295  return ShallowScoreAtThisLevel;
1296  }
1297 
1298  /// \Returns the look-ahead score, which tells us how much the sub-trees
1299  /// rooted at \p LHS and \p RHS match, the more they match the higher the
1300  /// score. This helps break ties in an informed way when we cannot decide on
1301  /// the order of the operands by just considering the immediate
1302  /// predecessors.
1303  int getLookAheadScore(const std::pair<Value *, int> &LHS,
1304  const std::pair<Value *, int> &RHS) {
1305  InLookAheadValues.clear();
1306  return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
1307  }
1308 
1309  // Search all operands in Ops[*][Lane] for the one that matches best
1310  // Ops[OpIdx][LastLane] and return its operand index.
1311  // If no good match can be found, return None.
1313  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1314  ArrayRef<ReorderingMode> ReorderingModes) {
1315  unsigned NumOperands = getNumOperands();
1316 
1317  // The operand of the previous lane at OpIdx.
1318  Value *OpLastLane = getData(OpIdx, LastLane).V;
1319 
1320  // Our strategy mode for OpIdx.
1321  ReorderingMode RMode = ReorderingModes[OpIdx];
1322 
1323  // The linearized opcode of the operand at OpIdx, Lane.
1324  bool OpIdxAPO = getData(OpIdx, Lane).APO;
1325 
1326  // The best operand index and its score.
1327  // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1328  // are using the score to differentiate between the two.
1329  struct BestOpData {
1330  Optional<unsigned> Idx = None;
1331  unsigned Score = 0;
1332  } BestOp;
1333 
1334  // Iterate through all unused operands and look for the best.
1335  for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1336  // Get the operand at Idx and Lane.
1337  OperandData &OpData = getData(Idx, Lane);
1338  Value *Op = OpData.V;
1339  bool OpAPO = OpData.APO;
1340 
1341  // Skip already selected operands.
1342  if (OpData.IsUsed)
1343  continue;
1344 
1345  // Skip if we are trying to move the operand to a position with a
1346  // different opcode in the linearized tree form. This would break the
1347  // semantics.
1348  if (OpAPO != OpIdxAPO)
1349  continue;
1350 
1351  // Look for an operand that matches the current mode.
1352  switch (RMode) {
1353  case ReorderingMode::Load:
1354  case ReorderingMode::Constant:
1355  case ReorderingMode::Opcode: {
1356  bool LeftToRight = Lane > LastLane;
1357  Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1358  Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1359  unsigned Score =
1360  getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
1361  if (Score > BestOp.Score) {
1362  BestOp.Idx = Idx;
1363  BestOp.Score = Score;
1364  }
1365  break;
1366  }
1367  case ReorderingMode::Splat:
1368  if (Op == OpLastLane)
1369  BestOp.Idx = Idx;
1370  break;
1371  case ReorderingMode::Failed:
1372  return None;
1373  }
1374  }
1375 
1376  if (BestOp.Idx) {
1377  getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
1378  return BestOp.Idx;
1379  }
1380  // If we could not find a good match return None.
1381  return None;
1382  }
1383 
1384  /// Helper for reorderOperandVecs.
1385  /// \returns the lane that we should start reordering from. This is the one
1386  /// which has the least number of operands that can freely move about, or is
1387  /// the least profitable because it already has the most optimal set of operands.
1388  unsigned getBestLaneToStartReordering() const {
1389  unsigned Min = UINT_MAX;
1390  unsigned SameOpNumber = 0;
1391  // std::pair<unsigned, unsigned> is used to implement a simple voting
1392  // algorithm and choose the lane with the least number of operands that
1393  // can freely move about, or is less profitable because it already has the
1394  // most optimal set of operands. The first unsigned is a counter for
1395  // voting, the second unsigned is the counter of lanes with instructions
1396  // with same/alternate opcodes and same parent basic block.
1397  SmallDenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
1398  // Try to be closer to the original results, if we have multiple lanes
1399  // with same cost. If 2 lanes have the same cost, use the one with the
1400  // lowest index.
1401  for (int I = getNumLanes(); I > 0; --I) {
1402  unsigned Lane = I - 1;
1403  OperandsOrderData NumFreeOpsHash =
1404  getMaxNumOperandsThatCanBeReordered(Lane);
1405  // Compare the number of operands that can move and choose the one with
1406  // the least number.
1407  if (NumFreeOpsHash.NumOfAPOs < Min) {
1408  Min = NumFreeOpsHash.NumOfAPOs;
1409  SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1410  HashMap.clear();
1411  HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1412  } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1413  NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1414  // Select the most optimal lane in terms of number of operands that
1415  // should be moved around.
1416  SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1417  HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1418  } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1419  NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1420  auto It = HashMap.find(NumFreeOpsHash.Hash);
1421  if (It == HashMap.end())
1422  HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1423  else
1424  ++It->second.first;
1425  }
1426  }
1427  // Select the lane with the minimum counter.
1428  unsigned BestLane = 0;
1429  unsigned CntMin = UINT_MAX;
1430  for (const auto &Data : reverse(HashMap)) {
1431  if (Data.second.first < CntMin) {
1432  CntMin = Data.second.first;
1433  BestLane = Data.second.second;
1434  }
1435  }
1436  return BestLane;
1437  }
1438 
1439  /// Data structure that helps to reorder operands.
1440  struct OperandsOrderData {
1441  /// The best number of operands with the same APOs, which can be
1442  /// reordered.
1443  unsigned NumOfAPOs = UINT_MAX;
1444  /// Number of operands with the same/alternate instruction opcode and
1445  /// parent.
1446  unsigned NumOpsWithSameOpcodeParent = 0;
1447  /// Hash for the actual operands ordering.
1448  /// Used to count operands, actually their position id and opcode
1449  /// value. It is used in the voting mechanism to find the lane with the
1450  /// least number of operands that can freely move about or is less profitable
1451  /// because it already has the most optimal set of operands. Can be
1452  /// replaced with SmallVector<unsigned> instead but hash code is faster
1453  /// and requires less memory.
1454  unsigned Hash = 0;
1455  };
1456  /// \returns the maximum number of operands that are allowed to be reordered
1457  /// for \p Lane and the number of compatible instructions (with the same
1458  /// parent/opcode). This is used as a heuristic for selecting the first lane
1459  /// to start operand reordering.
1460  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1461  unsigned CntTrue = 0;
1462  unsigned NumOperands = getNumOperands();
1463  // Operands with the same APO can be reordered. We therefore need to count
1464  // how many of them we have for each APO, like this: Cnt[APO] = x.
1465  // Since we only have two APOs, namely true and false, we can avoid using
1466  // a map. Instead we can simply count the number of operands that
1467  // correspond to one of them (in this case the 'true' APO), and calculate
1468  // the other by subtracting it from the total number of operands.
1469  // Operands with the same instruction opcode and parent are more
1470  // profitable since we don't need to move them in many cases, with a high
1471  // probability such a lane can already be vectorized effectively.
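 // For illustration: in a lane holding a 2-operand 'add', both operands have
 // APO == false, so CntTrue == 0 and NumOfAPOs == 2 (both operands can be
 // swapped freely). In a lane holding a 'sub', the RHS has APO == true, so
 // CntTrue == 1 and NumOfAPOs == 1, i.e. the lane is more constrained and is
 // a better lane to start reordering from.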
1472  bool AllUndefs = true;
1473  unsigned NumOpsWithSameOpcodeParent = 0;
1474  Instruction *OpcodeI = nullptr;
1475  BasicBlock *Parent = nullptr;
1476  unsigned Hash = 0;
1477  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1478  const OperandData &OpData = getData(OpIdx, Lane);
1479  if (OpData.APO)
1480  ++CntTrue;
1481  // Use Boyer-Moore majority voting for finding the majority opcode and
1482  // the number of times it occurs.
1483  if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1484  if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() ||
1485  I->getParent() != Parent) {
1486  if (NumOpsWithSameOpcodeParent == 0) {
1487  NumOpsWithSameOpcodeParent = 1;
1488  OpcodeI = I;
1489  Parent = I->getParent();
1490  } else {
1491  --NumOpsWithSameOpcodeParent;
1492  }
1493  } else {
1494  ++NumOpsWithSameOpcodeParent;
1495  }
1496  }
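 // For illustration: if both operands of this lane are 'add's from the same
 // block, the counter ends at 2; if one is an 'add' and the other a 'mul',
 // the two votes cancel and the counter ends at 0.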
1497  Hash = hash_combine(
1498  Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1499  AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1500  }
1501  if (AllUndefs)
1502  return {};
1503  OperandsOrderData Data;
1504  Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
1505  Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
1506  Data.Hash = Hash;
1507  return Data;
1508  }
1509 
1510  /// Go through the instructions in VL and append their operands.
1511  void appendOperandsOfVL(ArrayRef<Value *> VL) {
1512  assert(!VL.empty() && "Bad VL");
1513  assert((empty() || VL.size() == getNumLanes()) &&
1514  "Expected same number of lanes");
1515  assert(isa<Instruction>(VL[0]) && "Expected instruction");
1516  unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1517  OpsVec.resize(NumOperands);
1518  unsigned NumLanes = VL.size();
1519  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1520  OpsVec[OpIdx].resize(NumLanes);
1521  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1522  assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
1523  // Our tree has just 3 nodes: the root and two operands.
1524  // It is therefore trivial to get the APO. We only need to check the
1525  // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
1526  // RHS operand. The LHS operand of both add and sub is never attached
1527  // to an inverse operation in the linearized form, therefore its APO
1528  // is false. The RHS is true only if VL[Lane] is an inverse operation.
1529 
1530  // Since operand reordering is performed on groups of commutative
1531  // operations or alternating sequences (e.g., +, -), we can safely
1532  // tell the inverse operations by checking commutativity.
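 // For illustration: with VL = {add, sub, add, sub}, operand 0 (the LHS)
 // gets APO == false in every lane, while operand 1 (the RHS) gets
 // APO == false for the adds and APO == true for the subs.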
1533  bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
1534  bool APO = (OpIdx == 0) ? false : IsInverseOperation;
1535  OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
1536  APO, false};
1537  }
1538  }
1539  }
1540 
1541  /// \returns the number of operands.
1542  unsigned getNumOperands() const { return OpsVec.size(); }
1543 
1544  /// \returns the number of lanes.
1545  unsigned getNumLanes() const { return OpsVec[0].size(); }
1546 
1547  /// \returns the operand value at \p OpIdx and \p Lane.
1548  Value *getValue(unsigned OpIdx, unsigned Lane) const {
1549  return getData(OpIdx, Lane).V;
1550  }
1551 
1552  /// \returns true if the data structure is empty.
1553  bool empty() const { return OpsVec.empty(); }
1554 
1555  /// Clears the data.
1556  void clear() { OpsVec.clear(); }
1557 
1558  /// \returns true if there are enough operands identical to \p Op to fill
1559  /// the whole vector.
1560  /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
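 /// For illustration: if the value %x also appears, with a matching APO, in
 /// some operand slot of every other lane, a single splat of %x can feed this
 /// whole vector operand, so the function returns true.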
1561  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
1562  bool OpAPO = getData(OpIdx, Lane).APO;
1563  for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
1564  if (Ln == Lane)
1565  continue;
1566  // This is set to true if we found a candidate for broadcast at Lane.
1567  bool FoundCandidate = false;
1568  for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
1569  OperandData &Data = getData(OpI, Ln);
1570  if (Data.APO != OpAPO || Data.IsUsed)
1571  continue;
1572  if (Data.V == Op) {
1573  FoundCandidate = true;
1574  Data.IsUsed = true;
1575  break;
1576  }
1577  }
1578  if (!FoundCandidate)
1579  return false;
1580  }
1581  return true;
1582  }
1583 
1584  public:
1585  /// Initialize with all the operands of the instruction vector \p RootVL.
1586  VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
1587  ScalarEvolution &SE, const BoUpSLP &R)
1588  : DL(DL), SE(SE), R(R) {
1589  // Append all the operands of RootVL.
1590  appendOperandsOfVL(RootVL);
1591  }
1592 
1593  /// \returns a value vector with the operands across all lanes for the
1594  /// operand at \p OpIdx.
1595  ValueList getVL(unsigned OpIdx) const {
1596  ValueList OpVL(OpsVec[OpIdx].size());
1597  assert(OpsVec[OpIdx].size() == getNumLanes() &&
1598  "Expected same num of lanes across all operands");
1599  for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
1600  OpVL[Lane] = OpsVec[OpIdx][Lane].V;
1601  return OpVL;
1602  }
1603 
1604  // Performs operand reordering for 2 or more operands.
1605  // The original operands are in OrigOps[OpIdx][Lane].
1606  // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
1607  void reorder() {
1608  unsigned NumOperands = getNumOperands();
1609  unsigned NumLanes = getNumLanes();
1610  // Each operand has its own mode. We are using this mode to help us select
1611  // the instructions for each lane, so that they match best with the ones
1612  // we have selected so far.
1613  SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
1614 
1615  // This is a greedy single-pass algorithm. We are going over each lane
1616  // once and deciding on the best order right away with no back-tracking.
1617  // However, in order to increase its effectiveness, we start with the lane
1618  // that has operands that can move the least. For example, given the
1619  // following lanes:
1620  // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
1621  // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
1622  // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
1623  // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
1624  // we will start at Lane 1, since the operands of the subtraction cannot
1625  // be reordered. Then we will visit the rest of the lanes in a circular
1626  // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
1627 
1628  // Find the first lane that we will start our search from.
1629  unsigned FirstLane = getBestLaneToStartReordering();
1630 
1631  // Initialize the modes.
1632  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1633  Value *OpLane0 = getValue(OpIdx, FirstLane);
1634  // Keep track if we have instructions with all the same opcode on one
1635  // side.
1636  if (isa<LoadInst>(OpLane0))
1637  ReorderingModes[OpIdx] = ReorderingMode::Load;
1638  else if (isa<Instruction>(OpLane0)) {
1639  // Check if OpLane0 should be broadcast.
1640  if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
1641  ReorderingModes[OpIdx] = ReorderingMode::Splat;
1642  else
1643  ReorderingModes[OpIdx] = ReorderingMode::Opcode;
1644  }
1645  else if (isa<Constant>(OpLane0))
1646  ReorderingModes[OpIdx] = ReorderingMode::Constant;
1647  else if (isa<Argument>(OpLane0))
1648  // Our best hope is a Splat. It may save some cost in some cases.
1649  ReorderingModes[OpIdx] = ReorderingMode::Splat;
1650  else
1651  // NOTE: This should be unreachable.
1652  ReorderingModes[OpIdx] = ReorderingMode::Failed;
1653  }
1654 
1655  // Check that we don't have same operands. No need to reorder if operands
1656  // are just perfect diamond or shuffled diamond match. Do not do it only
1657  // for possible broadcasts or non-power of 2 number of scalars (just for
1658  // now).
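 // For illustration: a node whose operands are Op0 = {%a, %b, %c, %d} and
 // Op1 = {%c, %d, %a, %b} uses the same four values in every operand slot
 // (a shuffled diamond), so reordering is skipped; with only two unique
 // values reordering is not skipped, since such a node may still benefit
 // from a broadcast.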
1659  auto &&SkipReordering = [this]() {
1660  SmallPtrSet<Value *, 4> UniqueValues;
1661  ArrayRef<OperandData> Op0 = OpsVec.front();
1662  for (const OperandData &Data : Op0)
1663  UniqueValues.insert(Data.V);
1664  for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
1665  if (any_of(Op, [&UniqueValues](const OperandData &Data) {
1666  return !UniqueValues.contains(Data.V);
1667  }))
1668  return false;
1669  }
1670  // TODO: Check if we can remove a check for non-power-2 number of
1671  // scalars after full support of non-power-2 vectorization.
1672  return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
1673  };
1674 
1675  // If the initial strategy fails for any of the operand indexes, then we
1676  // perform reordering again in a second pass. This helps avoid assigning
1677  // high priority to the failed strategy, and should improve reordering for
1678  // the non-failed operand indexes.
1679  for (int Pass = 0; Pass != 2; ++Pass) {
1680  // Check if there is no need to reorder the operands, since they are a perfect or
1681  // shuffled diamond match.
1682  // Need to do it to avoid extra external use cost counting for
1683  // shuffled matches, which may cause regressions.
1684  if (SkipReordering())
1685  break;
1686  // Skip the second pass if the first pass did not fail.
1687  bool StrategyFailed = false;
1688  // Mark all operand data as free to use.
1689  clearUsed();
1690  // We keep the original operand order for the FirstLane, so reorder the
1691  // rest of the lanes. We are visiting the nodes in a circular fashion,
1692  // using FirstLane as the center point and increasing the radius
1693  // distance.
1694  for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
1695  // Visit the lane on the right and then the lane on the left.
1696  for (int Direction : {+1, -1}) {
1697  int Lane = FirstLane + Direction * Distance;
1698  if (Lane < 0 || Lane >= (int)NumLanes)
1699  continue;
1700  int LastLane = Lane - Direction;
1701  assert(LastLane >= 0 && LastLane < (int)NumLanes &&
1702  "Out of bounds");
1703  // Look for a good match for each operand.
1704  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1705  // Search for the operand that matches SortedOps[OpIdx][Lane-1].
1706  Optional<unsigned> BestIdx =
1707  getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
1708  // By not selecting a value, we allow the operands that follow to
1709  // select a better matching value. We will get a non-null value in
1710  // the next run of getBestOperand().
1711  if (BestIdx) {
1712  // Swap the current operand with the one returned by
1713  // getBestOperand().
1714  swap(OpIdx, BestIdx.getValue(), Lane);
1715  } else {
1716  // We failed to find a best operand, set mode to 'Failed'.
1717  ReorderingModes[OpIdx] = ReorderingMode::Failed;
1718  // Enable the second pass.
1719  StrategyFailed = true;
1720  }
1721  }
1722  }
1723  }
1724  // Skip second pass if the strategy did not fail.
1725  if (!StrategyFailed)
1726  break;
1727  }
1728  }
1729 
1730 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1731  LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
1732  switch (RMode) {
1733  case ReorderingMode::Load:
1734  return "Load";
1735  case ReorderingMode::Opcode:
1736  return "Opcode";
1737  case ReorderingMode::Constant:
1738  return "Constant";
1739  case ReorderingMode::Splat:
1740  return "Splat";
1741  case ReorderingMode::Failed:
1742  return "Failed";
1743  }
1744  llvm_unreachable("Unimplemented Reordering Type");
1745  }
1746 
1747  LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
1748  raw_ostream &OS) {
1749  return OS << getModeStr(RMode);
1750  }
1751 
1752  /// Debug print.
1753  LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
1754  printMode(RMode, dbgs());
1755  }
1756 
1757  friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
1758  return printMode(RMode, OS);
1759  }
1760 
1761  LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
1762  const unsigned Indent = 2;
1763  unsigned Cnt = 0;
1764  for (const OperandDataVec &OpDataVec : OpsVec) {
1765  OS << "Operand " << Cnt++ << "\n";
1766  for (const OperandData &OpData : OpDataVec) {
1767  OS.indent(Indent) << "{";
1768  if (Value *V = OpData.V)
1769  OS << *V;
1770  else
1771  OS << "null";
1772  OS << ", APO:" << OpData.APO << "}\n";
1773  }
1774  OS << "\n";
1775  }
1776  return OS;
1777  }
1778 
1779  /// Debug print.
1780  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
1781 #endif
1782  };
1783 
1784  /// Checks if the instruction is marked for deletion.
1785  bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
1786 
1787  /// Marks the values' operands for later deletion by replacing them with Undefs.
1788  void eraseInstructions(ArrayRef<Value *> AV);
1789 
1790  ~BoUpSLP();
1791 
1792 private:
1793  /// Checks if all users of \p I are the part of the vectorization tree.
1794  bool areAllUsersVectorized(Instruction *I,
1795  ArrayRef<Value *> VectorizedVals) const;
1796 
1797  /// \returns the cost of the vectorizable entry.
1798  InstructionCost getEntryCost(const TreeEntry *E,
1799  ArrayRef<Value *> VectorizedVals);
1800 
1801  /// This is the recursive part of buildTree.
1802  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
1803  const EdgeInfo &EI);
1804 
1805  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
1806  /// be vectorized to use the original vector (or aggregate "bitcast" to a
1807  /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
1808  /// returns false, setting \p CurrentOrder to either an empty vector or a
1809  /// non-identity permutation that allows reusing the extract instructions.
1810  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
1811  SmallVectorImpl<unsigned> &CurrentOrder) const;
1812 
1813  /// Vectorize a single entry in the tree.
1814  Value *vectorizeTree(TreeEntry *E);
1815 
1816  /// Vectorize a single entry in the tree, starting in \p VL.
1817  Value *vectorizeTree(ArrayRef<Value *> VL);
1818 
1819  /// \returns the scalarization cost for this type. Scalarization in this
1820  /// context means the creation of vectors from a group of scalars. If \p
1821  /// NeedToShuffle is true, the cost of reshuffling some of the
1822  /// vector elements is added as well.
1823  InstructionCost getGatherCost(FixedVectorType *Ty,
1824  const DenseSet<unsigned> &ShuffledIndices,
1825  bool NeedToShuffle) const;
1826 
1827  /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
1828  /// tree entries.
1829  /// \returns ShuffleKind, if gathered values can be represented as shuffles of
1830  /// previous tree entries. \p Mask is filled with the shuffle mask.
1831  Optional<TargetTransformInfo::ShuffleKind>
1832  isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
1833  SmallVectorImpl<const TreeEntry *> &Entries);
1834 
1835  /// \returns the scalarization cost for this list of values. Assuming that
1836  /// this subtree gets vectorized, we may need to extract the values from the
1837  /// roots. This method calculates the cost of extracting the values.
1838  InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
1839 
1840  /// Set the Builder insert point to one after the last instruction in
1841  /// the bundle
1842  void setInsertPointAfterBundle(const TreeEntry *E);
1843 
1844  /// \returns a vector from a collection of scalars in \p VL.
1845  Value *gather(ArrayRef<Value *> VL);
1846 
1847  /// \returns whether the VectorizableTree is fully vectorizable and will
1848  /// be beneficial even if the tree height is tiny.
1849  bool isFullyVectorizableTinyTree(bool ForReduction) const;
1850 
1851  /// Reorder commutative or alt operands to get better probability of
1852  /// generating vectorized code.
1853  static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1854  SmallVectorImpl<Value *> &Left,
1855  SmallVectorImpl<Value *> &Right,
1856  const DataLayout &DL,
1857  ScalarEvolution &SE,
1858  const BoUpSLP &R);
1859  struct TreeEntry {
1860  using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
1861  TreeEntry(VecTreeTy &Container) : Container(Container) {}
1862 
1863  /// \returns true if the scalars in VL are equal to this entry.
1864  bool isSame(ArrayRef<Value *> VL) const {
1865  auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
1866  if (Mask.size() != VL.size() && VL.size() == Scalars.size())
1867  return std::equal(VL.begin(), VL.end(), Scalars.begin());
1868  return VL.size() == Mask.size() &&
1869  std::equal(VL.begin(), VL.end(), Mask.begin(),
1870  [Scalars](Value *V, int Idx) {
1871  return (isa<UndefValue>(V) &&
1872  Idx == UndefMaskElem) ||
1873  (Idx != UndefMaskElem && V == Scalars[Idx]);
1874  });
1875  };
1876  if (!ReorderIndices.empty()) {
1877  // TODO: implement matching if the nodes are just reordered, still can
1878  // treat the vector as the same if the list of scalars matches VL
1879  // directly, without reordering.
1880  SmallVector<int> Mask;
1881  inversePermutation(ReorderIndices, Mask);
1882  if (VL.size() == Scalars.size())
1883  return IsSame(Scalars, Mask);
1884  if (VL.size() == ReuseShuffleIndices.size()) {
1885  ::addMask(Mask, ReuseShuffleIndices);
1886  return IsSame(Scalars, Mask);
1887  }
1888  return false;
1889  }
1890  return IsSame(Scalars, ReuseShuffleIndices);
1891  }
1892 
1893  /// \returns true if current entry has same operands as \p TE.
1894  bool hasEqualOperands(const TreeEntry &TE) const {
1895  if (TE.getNumOperands() != getNumOperands())
1896  return false;
1897  SmallBitVector Used(getNumOperands());
1898  for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
1899  unsigned PrevCount = Used.count();
1900  for (unsigned K = 0; K < E; ++K) {
1901  if (Used.test(K))
1902  continue;
1903  if (getOperand(K) == TE.getOperand(I)) {
1904  Used.set(K);
1905  break;
1906  }
1907  }
1908  // Check if we actually found the matching operand.
1909  if (PrevCount == Used.count())
1910  return false;
1911  }
1912  return true;
1913  }
1914 
1915  /// \return Final vectorization factor for the node. Defined by the total
1916  /// number of vectorized scalars, including those used several times in the
1917  /// entry and counted in the \a ReuseShuffleIndices, if any.
1918  unsigned getVectorFactor() const {
1919  if (!ReuseShuffleIndices.empty())
1920  return ReuseShuffleIndices.size();
1921  return Scalars.size();
1922  };
1923 
1924  /// A vector of scalars.
1925  ValueList Scalars;
1926 
1927  /// The Scalars are vectorized into this value. It is initialized to Null.
1928  Value *VectorizedValue = nullptr;
1929 
1930  /// Do we need to gather this sequence or vectorize it
1931  /// (either with vector instruction or with scatter/gather
1932  /// intrinsics for store/load)?
1933  enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
1934  EntryState State;
1935 
1936  /// Does this sequence require some shuffling?
1937  SmallVector<int, 4> ReuseShuffleIndices;
1938 
1939  /// Does this entry require reordering?
1940  SmallVector<unsigned, 4> ReorderIndices;
1941 
1942  /// Points back to the VectorizableTree.
1943  ///
1944  /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
1945  /// to be a pointer and needs to be able to initialize the child iterator.
1946  /// Thus we need a reference back to the container to translate the indices
1947  /// to entries.
1948  VecTreeTy &Container;
1949 
1950  /// The TreeEntry index containing the user of this entry. We can actually
1951  /// have multiple users so the data structure is not truly a tree.
1952  SmallVector<EdgeInfo, 1> UserTreeIndices;
1953 
1954  /// The index of this treeEntry in VectorizableTree.
1955  int Idx = -1;
1956 
1957  private:
1958  /// The operands of each instruction in each lane Operands[op_index][lane].
1959  /// Note: This helps avoid the replication of the code that performs the
1960  /// reordering of operands during buildTree_rec() and vectorizeTree().
1961  SmallVector<ValueList, 2> Operands;
1962 
1963  /// The main/alternate instruction.
1964  Instruction *MainOp = nullptr;
1965  Instruction *AltOp = nullptr;
1966 
1967  public:
1968  /// Set this bundle's \p OpIdx'th operand to \p OpVL.
1969  void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
1970  if (Operands.size() < OpIdx + 1)
1971  Operands.resize(OpIdx + 1);
1972  assert(Operands[OpIdx].empty() && "Already resized?");
1973  assert(OpVL.size() <= Scalars.size() &&
1974  "Number of operands is greater than the number of scalars.");
1975  Operands[OpIdx].resize(OpVL.size());
1976  copy(OpVL, Operands[OpIdx].begin());
1977  }
1978 
1979  /// Set the operands of this bundle in their original order.
1980  void setOperandsInOrder() {
1981  assert(Operands.empty() && "Already initialized?");
1982  auto *I0 = cast<Instruction>(Scalars[0]);
1983  Operands.resize(I0->getNumOperands());
1984  unsigned NumLanes = Scalars.size();
1985  for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
1986  OpIdx != NumOperands; ++OpIdx) {
1987  Operands[OpIdx].resize(NumLanes);
1988  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1989  auto *I = cast<Instruction>(Scalars[Lane]);
1990  assert(I->getNumOperands() == NumOperands &&
1991  "Expected same number of operands");
1992  Operands[OpIdx][Lane] = I->getOperand(OpIdx);
1993  }
1994  }
1995  }
1996 
1997  /// Reorders operands of the node to the given mask \p Mask.
1998  void reorderOperands(ArrayRef<int> Mask) {
1999  for (ValueList &Operand : Operands)
2000  reorderScalars(Operand, Mask);
2001  }
2002 
2003  /// \returns the \p OpIdx operand of this TreeEntry.
2004  ValueList &getOperand(unsigned OpIdx) {
2005  assert(OpIdx < Operands.size() && "Off bounds");
2006  return Operands[OpIdx];
2007  }
2008 
2009  /// \returns the \p OpIdx operand of this TreeEntry.
2010  ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2011  assert(OpIdx < Operands.size() && "Off bounds");
2012  return Operands[OpIdx];
2013  }
2014 
2015  /// \returns the number of operands.
2016  unsigned getNumOperands() const { return Operands.size(); }
2017 
2018  /// \return the single \p OpIdx operand.
2019  Value *getSingleOperand(unsigned OpIdx) const {
2020  assert(OpIdx < Operands.size() && "Off bounds");
2021  assert(!Operands[OpIdx].empty() && "No operand available");
2022  return Operands[OpIdx][0];
2023  }
2024 
2025  /// Some of the instructions in the list have alternate opcodes.
2026  bool isAltShuffle() const { return MainOp != AltOp; }
2027 
2028  bool isOpcodeOrAlt(Instruction *I) const {
2029  unsigned CheckedOpcode = I->getOpcode();
2030  return (getOpcode() == CheckedOpcode ||
2031  getAltOpcode() == CheckedOpcode);
2032  }
2033 
2034  /// Chooses the correct key for scheduling data. If \p Op has the same (or
2035  /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2036  /// \p OpValue.
2037  Value *isOneOf(Value *Op) const {
2038  auto *I = dyn_cast<Instruction>(Op);
2039  if (I && isOpcodeOrAlt(I))
2040  return Op;
2041  return MainOp;
2042  }
2043 
2044  void setOperations(const InstructionsState &S) {
2045  MainOp = S.MainOp;
2046  AltOp = S.AltOp;
2047  }
2048 
2049  Instruction *getMainOp() const {
2050  return MainOp;
2051  }
2052 
2053  Instruction *getAltOp() const {
2054  return AltOp;
2055  }
2056 
2057  /// The main/alternate opcodes for the list of instructions.
2058  unsigned getOpcode() const {
2059  return MainOp ? MainOp->getOpcode() : 0;
2060  }
2061 
2062  unsigned getAltOpcode() const {
2063  return AltOp ? AltOp->getOpcode() : 0;
2064  }
2065 
2066  /// When ReuseShuffleIndices is empty it just returns the position of \p
2067  /// V within the vector of Scalars. Otherwise, it tries to remap it via its reuse index.
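 /// For illustration: with Scalars = {%a, %b, %c, %d} and
 /// ReorderIndices = {1, 0, 3, 2}, findLaneForValue(%a) finds %a at position
 /// 0 and remaps it through ReorderIndices to lane 1.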
2068  int findLaneForValue(Value *V) const {
2069  unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2070  assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2071  if (!ReorderIndices.empty())
2072  FoundLane = ReorderIndices[FoundLane];
2073  assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2074  if (!ReuseShuffleIndices.empty()) {
2075  FoundLane = std::distance(ReuseShuffleIndices.begin(),
2076  find(ReuseShuffleIndices, FoundLane));
2077  }
2078  return FoundLane;
2079  }
2080 
2081 #ifndef NDEBUG
2082  /// Debug printer.
2083  LLVM_DUMP_METHOD void dump() const {
2084  dbgs() << Idx << ".\n";
2085  for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2086  dbgs() << "Operand " << OpI << ":\n";
2087  for (const Value *V : Operands[OpI])
2088  dbgs().indent(2) << *V << "\n";
2089  }
2090  dbgs() << "Scalars: \n";
2091  for (Value *V : Scalars)
2092  dbgs().indent(2) << *V << "\n";
2093  dbgs() << "State: ";
2094  switch (State) {
2095  case Vectorize:
2096  dbgs() << "Vectorize\n";
2097  break;
2098  case ScatterVectorize:
2099  dbgs() << "ScatterVectorize\n";
2100  break;
2101  case NeedToGather:
2102  dbgs() << "NeedToGather\n";
2103  break;
2104  }
2105  dbgs() << "MainOp: ";
2106  if (MainOp)
2107  dbgs() << *MainOp << "\n";
2108  else
2109  dbgs() << "NULL\n";
2110  dbgs() << "AltOp: ";
2111  if (AltOp)
2112  dbgs() << *AltOp << "\n";
2113  else
2114  dbgs() << "NULL\n";
2115  dbgs() << "VectorizedValue: ";
2116  if (VectorizedValue)
2117  dbgs() << *VectorizedValue << "\n";
2118  else
2119  dbgs() << "NULL\n";
2120  dbgs() << "ReuseShuffleIndices: ";
2121  if (ReuseShuffleIndices.empty())
2122  dbgs() << "Empty";
2123  else
2124  for (int ReuseIdx : ReuseShuffleIndices)
2125  dbgs() << ReuseIdx << ", ";
2126  dbgs() << "\n";
2127  dbgs() << "ReorderIndices: ";
2128  for (unsigned ReorderIdx : ReorderIndices)
2129  dbgs() << ReorderIdx << ", ";
2130  dbgs() << "\n";
2131  dbgs() << "UserTreeIndices: ";
2132  for (const auto &EInfo : UserTreeIndices)
2133  dbgs() << EInfo << ", ";
2134  dbgs() << "\n";
2135  }
2136 #endif
2137  };
2138 
2139 #ifndef NDEBUG
2140  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2141  InstructionCost VecCost,
2142  InstructionCost ScalarCost) const {
2143  dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();
2144  dbgs() << "SLP: Costs:\n";
2145  dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2146  dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2147  dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2148  dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " <<
2149  ReuseShuffleCost + VecCost - ScalarCost << "\n";
2150  }
2151 #endif
2152 
2153  /// Create a new VectorizableTree entry.
2154  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
2155  const InstructionsState &S,
2156  const EdgeInfo &UserTreeIdx,
2157  ArrayRef<int> ReuseShuffleIndices = None,
2158  ArrayRef<unsigned> ReorderIndices = None) {
2159  TreeEntry::EntryState EntryState =
2160  Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2161  return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2162  ReuseShuffleIndices, ReorderIndices);
2163  }
2164 
2165  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2166  TreeEntry::EntryState EntryState,
2167  Optional<ScheduleData *> Bundle,
2168  const InstructionsState &S,
2169  const EdgeInfo &UserTreeIdx,
2170  ArrayRef<int> ReuseShuffleIndices = None,
2171  ArrayRef<unsigned> ReorderIndices = None) {
2172  assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2173  (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2174  "Need to vectorize gather entry?");
2175  VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2176  TreeEntry *Last = VectorizableTree.back().get();
2177  Last->Idx = VectorizableTree.size() - 1;
2178  Last->State = EntryState;
2179  Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2180  ReuseShuffleIndices.end());
2181  if (ReorderIndices.empty()) {
2182  Last->Scalars.assign(VL.begin(), VL.end());
2183  Last->setOperations(S);
2184  } else {
2185  // Reorder scalars and build final mask.
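 // For illustration: with VL = {%a, %b, %c, %d} and
 // ReorderIndices = {1, 0, 3, 2}, the transform below produces
 // Scalars = {%b, %a, %d, %c}; out-of-range indices become undef.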
2186  Last->Scalars.assign(VL.size(), nullptr);
2187  transform(ReorderIndices, Last->Scalars.begin(),
2188  [VL](unsigned Idx) -> Value * {
2189  if (Idx >= VL.size())
2190  return UndefValue::get(VL.front()->getType());
2191  return VL[Idx];
2192  });
2193  InstructionsState S = getSameOpcode(Last->Scalars);
2194  Last->setOperations(S);
2195  Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2196  }
2197  if (Last->State != TreeEntry::NeedToGather) {
2198  for (Value *V : VL) {
2199  assert(!getTreeEntry(V) && "Scalar already in tree!");
2200  ScalarToTreeEntry[V] = Last;
2201  }
2202  // Update the scheduler bundle to point to this TreeEntry.
2203  unsigned Lane = 0;
2204  for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
2205  BundleMember = BundleMember->NextInBundle) {
2206  BundleMember->TE = Last;
2207  BundleMember->Lane = Lane;
2208  ++Lane;
2209  }
2210  assert((!Bundle.getValue() || Lane == VL.size()) &&
2211  "Bundle and VL out of sync");
2212  } else {
2213  MustGather.insert(VL.begin(), VL.end());
2214  }
2215 
2216  if (UserTreeIdx.UserTE)
2217  Last->UserTreeIndices.push_back(UserTreeIdx);
2218 
2219  return Last;
2220  }
2221 
2222  /// -- Vectorization State --
2223  /// Holds all of the tree entries.
2224  TreeEntry::VecTreeTy VectorizableTree;
2225 
2226 #ifndef NDEBUG
2227  /// Debug printer.
2228  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
2229  for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
2230  VectorizableTree[Id]->dump();
2231  dbgs() << "\n";
2232  }
2233  }
2234 #endif
2235 
2236  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
2237 
2238  const TreeEntry *getTreeEntry(Value *V) const {
2239  return ScalarToTreeEntry.lookup(V);
2240  }
2241 
2242  /// Maps a specific scalar to its tree entry.
2243  SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
2244 
2245  /// Maps a value to the proposed vectorizable size.
2246  SmallDenseMap<Value *, unsigned> InstrElementSize;
2247 
2248  /// A list of scalars that we found that we need to keep as scalars.
2249  ValueSet MustGather;
2250 
2251  /// This POD struct describes one external user in the vectorized tree.
2252  struct ExternalUser {
2253  ExternalUser(Value *S, llvm::User *U, int L)
2254  : Scalar(S), User(U), Lane(L) {}
2255 
2256  // Which scalar in our function.
2257  Value *Scalar;
2258 
2259  // The user that uses the scalar.
2260  llvm::User *User;
2261 
2262  // Which lane the scalar belongs to.
2263  int Lane;
2264  };
2265  using UserList = SmallVector<ExternalUser, 16>;
2266 
2267  /// Checks if two instructions may access the same memory.
2268  ///
2269  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
2270  /// is invariant in the calling loop.
2271  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
2272  Instruction *Inst2) {
2273  // First check if the result is already in the cache.
2274  AliasCacheKey key = std::make_pair(Inst1, Inst2);
2275  Optional<bool> &result = AliasCache[key];
2276  if (result.hasValue()) {
2277  return result.getValue();
2278  }
2279  bool aliased = true;
2280  if (Loc1.Ptr && isSimple(Inst1))
2281  aliased = isModOrRefSet(AA->getModRefInfo(Inst2, Loc1));
2282  // Store the result in the cache.
2283  result = aliased;
2284  return aliased;
2285  }
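 // Note, for clarity: the cache key is the ordered pair (Inst1, Inst2), so
 // the symmetric query with the two instructions swapped is cached under a
 // separate key.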
2286 
2287  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
2288 
2289  /// Cache for alias results.
2290  /// TODO: consider moving this to the AliasAnalysis itself.
2291  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
2292 
2293  /// Removes an instruction from its block and eventually deletes it.
2294  /// It's like Instruction::eraseFromParent() except that the actual deletion
2295  /// is delayed until BoUpSLP is destructed.
2296  /// This is required to ensure that there are no incorrect collisions in the
2297  /// AliasCache, which can happen if a new instruction is allocated at the
2298  /// same address as a previously deleted instruction.
2299  void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
2300  auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
2301  It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
2302  }
2303 
2304  /// Temporary store for deleted instructions. Instructions will be deleted
2305  /// eventually when the BoUpSLP is destructed.
2306  DenseMap<Instruction *, bool> DeletedInstructions;
2307 
2308  /// A list of values that need to extracted out of the tree.
2309  /// This list holds pairs of (Internal Scalar : External User). External User
2310  /// can be nullptr, which means that this Internal Scalar will be used later,
2311  /// after vectorization.
2312  UserList ExternalUses;
2313 
2314  /// Values used only by @llvm.assume calls.
2316 
2317  /// Holds all of the instructions that we gathered.
2318  SetVector<Instruction *> GatherShuffleSeq;
2319 
2320  /// A list of blocks that we are going to CSE.
2321  SetVector<BasicBlock *> CSEBlocks;
2322 
2323  /// Contains all scheduling relevant data for an instruction.
2324  /// A ScheduleData either represents a single instruction or a member of an
2325  /// instruction bundle (= a group of instructions which is combined into a
2326  /// vector instruction).
2327  struct ScheduleData {
2328  // The initial value for the dependency counters. It means that the
2329  // dependencies are not calculated yet.
2330  enum { InvalidDeps = -1 };
2331 
2332  ScheduleData() = default;
2333 
2334  void init(int BlockSchedulingRegionID, Value *OpVal) {
2335  FirstInBundle = this;
2336  NextInBundle = nullptr;
2337  NextLoadStore = nullptr;
2338  IsScheduled = false;
2339  SchedulingRegionID = BlockSchedulingRegionID;
2340  UnscheduledDepsInBundle = UnscheduledDeps;
2341  clearDependencies();
2342  OpValue = OpVal;
2343  TE = nullptr;
2344  Lane = -1;
2345  }
2346 
2347  /// Returns true if the dependency information has been calculated.
2348  bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
2349 
2350  /// Returns true for single instructions and for bundle representatives
2351  /// (= the head of a bundle).
2352  bool isSchedulingEntity() const { return FirstInBundle == this; }
2353 
2354  /// Returns true if it represents an instruction bundle and not only a
2355  /// single instruction.
2356  bool isPartOfBundle() const {
2357  return NextInBundle != nullptr || FirstInBundle != this;
2358  }
2359 
2360  /// Returns true if it is ready for scheduling, i.e. it has no more
2361  /// unscheduled depending instructions/bundles.
2362  bool isReady() const {
2363  assert(isSchedulingEntity() &&
2364  "can't consider non-scheduling entity for ready list");
2365  return UnscheduledDepsInBundle == 0 && !IsScheduled;
2366  }
2367 
2368  /// Modifies the number of unscheduled dependencies, also updating it for
2369  /// the whole bundle.
2370  int incrementUnscheduledDeps(int Incr) {
2371  UnscheduledDeps += Incr;
2372  return FirstInBundle->UnscheduledDepsInBundle += Incr;
2373  }
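 // For illustration: if %x has three users inside the scheduling region, its
 // Dependencies counter includes those three; each time one of the users is
 // scheduled, schedule() calls incrementUnscheduledDeps(-1) on %x's
 // ScheduleData, and once the bundle-wide counter reaches zero the bundle
 // becomes ready.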
2374 
2375  /// Sets the number of unscheduled dependencies to the number of
2376  /// dependencies.
2377  void resetUnscheduledDeps() {
2378  incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
2379  }
2380 
2381  /// Clears all dependency information.
2382  void clearDependencies() {
2383  Dependencies = InvalidDeps;
2384  resetUnscheduledDeps();
2385  MemoryDependencies.clear();
2386  }
2387 
2388  void dump(raw_ostream &os) const {
2389  if (!isSchedulingEntity()) {
2390  os << "/ " << *Inst;
2391  } else if (NextInBundle) {
2392  os << '[' << *Inst;
2393  ScheduleData *SD = NextInBundle;
2394  while (SD) {
2395  os << ';' << *SD->Inst;
2396  SD = SD->NextInBundle;
2397  }
2398  os << ']';
2399  } else {
2400  os << *Inst;
2401  }
2402  }
2403 
2404  Instruction *Inst = nullptr;
2405 
2406  /// Points to the head in an instruction bundle (and always to this for
2407  /// single instructions).
2408  ScheduleData *FirstInBundle = nullptr;
2409 
2410  /// Single linked list of all instructions in a bundle. Null if it is a
2411  /// single instruction.
2412  ScheduleData *NextInBundle = nullptr;
2413 
2414  /// Single linked list of all memory instructions (e.g. load, store, call)
2415  /// in the block - until the end of the scheduling region.
2416  ScheduleData *NextLoadStore = nullptr;
2417 
2418  /// The dependent memory instructions.
2419  /// This list is derived on demand in calculateDependencies().
2420  SmallVector<ScheduleData *, 4> MemoryDependencies;
2421 
2422  /// This ScheduleData is in the current scheduling region if this matches
2423  /// the current SchedulingRegionID of BlockScheduling.
2424  int SchedulingRegionID = 0;
2425 
2426  /// Used for getting a "good" final ordering of instructions.
2427  int SchedulingPriority = 0;
2428 
2429  /// The number of dependencies. This consists of the number of users of the
2430  /// instruction plus the number of dependent memory instructions (if any).
2431  /// This value is calculated on demand.
2432  /// If InvalidDeps, the number of dependencies is not calculated yet.
2433  int Dependencies = InvalidDeps;
2434 
2435  /// The number of dependencies minus the number of dependencies of scheduled
2436  /// instructions. As soon as this is zero, the instruction/bundle gets ready
2437  /// for scheduling.
2438  /// Note that this is negative as long as Dependencies is not calculated.
2439  int UnscheduledDeps = InvalidDeps;
2440 
2441  /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
2442  /// single instructions.
2443  int UnscheduledDepsInBundle = InvalidDeps;
2444 
2445  /// True if this instruction is scheduled (or considered as scheduled in the
2446  /// dry-run).
2447  bool IsScheduled = false;
2448 
2449  /// Opcode of the current instruction in the schedule data.
2450  Value *OpValue = nullptr;
2451 
2452  /// The TreeEntry that this instruction corresponds to.
2453  TreeEntry *TE = nullptr;
2454 
2455  /// The lane of this node in the TreeEntry.
2456  int Lane = -1;
2457  };
2458 
2459 #ifndef NDEBUG
2460  friend inline raw_ostream &operator<<(raw_ostream &os,
2461  const BoUpSLP::ScheduleData &SD) {
2462  SD.dump(os);
2463  return os;
2464  }
2465 #endif
2466 
2467  friend struct GraphTraits<BoUpSLP *>;
2468  friend struct DOTGraphTraits<BoUpSLP *>;
2469 
2470  /// Contains all scheduling data for a basic block.
2471  struct BlockScheduling {
2472  BlockScheduling(BasicBlock *BB)
2473  : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
2474 
2475  void clear() {
2476  ReadyInsts.clear();
2477  ScheduleStart = nullptr;
2478  ScheduleEnd = nullptr;
2479  FirstLoadStoreInRegion = nullptr;
2480  LastLoadStoreInRegion = nullptr;
2481 
2482  // Reduce the maximum schedule region size by the size of the
2483  // previous scheduling run.
2484  ScheduleRegionSizeLimit -= ScheduleRegionSize;
2485  if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
2486  ScheduleRegionSizeLimit = MinScheduleRegionSize;
2487  ScheduleRegionSize = 0;
2488 
2489  // Make a new scheduling region, i.e. all existing ScheduleData is not
2490  // in the new region yet.
2491  ++SchedulingRegionID;
2492  }
2493 
2494  ScheduleData *getScheduleData(Value *V) {
2495  ScheduleData *SD = ScheduleDataMap[V];
2496  if (SD && SD->SchedulingRegionID == SchedulingRegionID)
2497  return SD;
2498  return nullptr;
2499  }
2500 
2501  ScheduleData *getScheduleData(Value *V, Value *Key) {
2502  if (V == Key)
2503  return getScheduleData(V);
2504  auto I = ExtraScheduleDataMap.find(V);
2505  if (I != ExtraScheduleDataMap.end()) {
2506  ScheduleData *SD = I->second[Key];
2507  if (SD && SD->SchedulingRegionID == SchedulingRegionID)
2508  return SD;
2509  }
2510  return nullptr;
2511  }
2512 
2513  bool isInSchedulingRegion(ScheduleData *SD) const {
2514  return SD->SchedulingRegionID == SchedulingRegionID;
2515  }
2516 
2517  /// Marks an instruction as scheduled and puts all dependent ready
2518  /// instructions into the ready-list.
2519  template <typename ReadyListType>
2520  void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
2521  SD->IsScheduled = true;
2522  LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
2523 
2524  for (ScheduleData *BundleMember = SD; BundleMember;
2525  BundleMember = BundleMember->NextInBundle) {
2526  if (BundleMember->Inst != BundleMember->OpValue)
2527  continue;
2528 
2529  // Handle the def-use chain dependencies.
2530 
2531  // Decrement the unscheduled counter and insert to ready list if ready.
2532  auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
2533  doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
2534  if (OpDef && OpDef->hasValidDependencies() &&
2535  OpDef->incrementUnscheduledDeps(-1) == 0) {
2536  // There are no more unscheduled dependencies after
2537  // decrementing, so we can put the dependent instruction
2538  // into the ready list.
2539  ScheduleData *DepBundle = OpDef->FirstInBundle;
2540  assert(!DepBundle->IsScheduled &&
2541  "already scheduled bundle gets ready");
2542  ReadyList.insert(DepBundle);
2543  LLVM_DEBUG(dbgs()
2544  << "SLP: gets ready (def): " << *DepBundle << "\n");
2545  }
2546  });
2547  };
2548 
2549  // If BundleMember is a vector bundle, its operands may have been
2550  // reordered during buildTree(). We therefore need to get its operands
2551  // through the TreeEntry.
2552  if (TreeEntry *TE = BundleMember->TE) {
2553  int Lane = BundleMember->Lane;
2554  assert(Lane >= 0 && "Lane not set");
2555 
2556  // Since vectorization tree is being built recursively this assertion
2557  // ensures that the tree entry has all operands set before reaching
2558  // this code. A couple of exceptions known at the moment are extracts
2559  // where their second (immediate) operand is not added. Since
2560  // immediates do not affect scheduler behavior this is considered
2561  // okay.
2562  auto *In = TE->getMainOp();
2563  assert(In &&
2564  (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
2565  In->getNumOperands() == TE->getNumOperands()) &&
2566  "Missed TreeEntry operands?");
2567  (void)In; // fake use to avoid build failure when assertions disabled
2568 
2569  for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
2570  OpIdx != NumOperands; ++OpIdx)
2571  if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
2572  DecrUnsched(I);
2573  } else {
2574  // If BundleMember is a stand-alone instruction, no operand reordering
2575  // has taken place, so we directly access its operands.
2576  for (Use &U : BundleMember->Inst->operands())
2577  if (auto *I = dyn_cast<Instruction>(U.get()))
2578  DecrUnsched(I);
2579  }
2580  // Handle the memory dependencies.
2581  for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
2582  if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
2583  // There are no more unscheduled dependencies after decrementing,
2584  // so we can put the dependent instruction into the ready list.
2585  ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
2586  assert(!DepBundle->IsScheduled &&
2587  "already scheduled bundle gets ready");
2588  ReadyList.insert(DepBundle);
2589  LLVM_DEBUG(dbgs()
2590  << "SLP: gets ready (mem): " << *DepBundle << "\n");
2591  }
2592  }
2593  }
2594  }
2595 
2596  void doForAllOpcodes(Value *V,
2597  function_ref<void(ScheduleData *SD)> Action) {
2598  if (ScheduleData *SD = getScheduleData(V))
2599  Action(SD);
2600  auto I = ExtraScheduleDataMap.find(V);
2601  if (I != ExtraScheduleDataMap.end())
2602  for (auto &P : I->second)
2603  if (P.second->SchedulingRegionID == SchedulingRegionID)
2604  Action(P.second);
2605  }
2606 
2607  /// Put all instructions into the ReadyList which are ready for scheduling.
2608  template <typename ReadyListType>
2609  void initialFillReadyList(ReadyListType &ReadyList) {
2610  for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2611  doForAllOpcodes(I, [&](ScheduleData *SD) {
2612  if (SD->isSchedulingEntity() && SD->isReady()) {
2613  ReadyList.insert(SD);
2614  LLVM_DEBUG(dbgs()
2615  << "SLP: initially in ready list: " << *I << "\n");
2616  }
2617  });
2618  }
2619  }
2620 
2621  /// Build a bundle from the ScheduleData nodes corresponding to the
2622  /// scalar instruction for each lane.
2623  ScheduleData *buildBundle(ArrayRef<Value *> VL);
2624 
2625  /// Checks if a bundle of instructions can be scheduled, i.e. has no
2626  /// cyclic dependencies. This is only a dry-run, no instructions are
2627  /// actually moved at this stage.
2628  /// \returns the scheduling bundle. The returned Optional value is non-None
2629  /// if \p VL is allowed to be scheduled.
2630  Optional<ScheduleData *>
2631  tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
2632  const InstructionsState &S);
2633 
2634  /// Un-bundles a group of instructions.
2635  void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
2636 
2637  /// Allocates schedule data chunk.
2638  ScheduleData *allocateScheduleDataChunks();
2639 
2640  /// Extends the scheduling region so that V is inside the region.
2641  /// \returns true if the region size is within the limit.
2642  bool extendSchedulingRegion(Value *V, const InstructionsState &S);
2643 
2644  /// Initialize the ScheduleData structures for new instructions in the
2645  /// scheduling region.
2646  void initScheduleData(Instruction *FromI, Instruction *ToI,
2647  ScheduleData *PrevLoadStore,
2648  ScheduleData *NextLoadStore);
2649 
2650  /// Updates the dependency information of a bundle and of all instructions/
2651  /// bundles which depend on the original bundle.
2652  void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
2653  BoUpSLP *SLP);
2654 
2655  /// Sets all instructions in the scheduling region to un-scheduled.
2656  void resetSchedule();
2657 
2658  BasicBlock *BB;
2659 
2660  /// Simple memory allocation for ScheduleData.
2661  std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
2662 
2663  /// The size of a ScheduleData array in ScheduleDataChunks.
2664  int ChunkSize;
2665 
2666  /// The allocator position in the current chunk, which is the last entry
2667  /// of ScheduleDataChunks.
2668  int ChunkPos;
2669 
2670  /// Attaches ScheduleData to Instruction.
2671  /// Note that the mapping survives during all vectorization iterations, i.e.
2672  /// ScheduleData structures are recycled.
2673  DenseMap<Value *, ScheduleData *> ScheduleDataMap;
2674 
2675  /// Attaches ScheduleData to Instruction with the leading key.
2676  DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
2677  ExtraScheduleDataMap;
2678 
2679  struct ReadyList : SmallVector<ScheduleData *, 8> {
2680  void insert(ScheduleData *SD) { push_back(SD); }
2681  };
2682 
2683  /// The ready-list for scheduling (only used for the dry-run).
2684  ReadyList ReadyInsts;
2685 
2686  /// The first instruction of the scheduling region.
2687  Instruction *ScheduleStart = nullptr;
2688 
2689  /// The first instruction _after_ the scheduling region.
2690  Instruction *ScheduleEnd = nullptr;
2691 
2692  /// The first memory accessing instruction in the scheduling region
2693  /// (can be null).
2694  ScheduleData *FirstLoadStoreInRegion = nullptr;
2695 
2696  /// The last memory accessing instruction in the scheduling region
2697  /// (can be null).
2698  ScheduleData *LastLoadStoreInRegion = nullptr;
2699 
2700  /// The current size of the scheduling region.
2701  int ScheduleRegionSize = 0;
2702 
2703  /// The maximum size allowed for the scheduling region.
2704  int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
2705 
2706  /// The ID of the scheduling region. For a new vectorization iteration this
2707  /// is incremented which "removes" all ScheduleData from the region.
2708  // Make sure that the initial SchedulingRegionID is greater than the
2709  // initial SchedulingRegionID in ScheduleData (which is 0).
2710  int SchedulingRegionID = 1;
2711  };
2712 
2713  /// Attaches the BlockScheduling structures to basic blocks.
2714  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
2715 
2716  /// Performs the "real" scheduling. Done before vectorization is actually
2717  /// performed in a basic block.
2718  void scheduleBlock(BlockScheduling *BS);
2719 
2720  /// List of users to ignore during scheduling and that don't need extracting.
2721  ArrayRef<Value *> UserIgnoreList;
2722 
2723  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
2724  /// sorted SmallVectors of unsigned.
2725  struct OrdersTypeDenseMapInfo {
2726  static OrdersType getEmptyKey() {
2727  OrdersType V;
2728  V.push_back(~1U);
2729  return V;
2730  }
2731 
2732  static OrdersType getTombstoneKey() {
2733  OrdersType V;
2734  V.push_back(~2U);
2735  return V;
2736  }
2737 
2738  static unsigned getHashValue(const OrdersType &V) {
2739  return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
2740  }
2741 
2742  static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
2743  return LHS == RHS;
2744  }
2745  };
2746 
2747  // Analysis and block reference.
2748  Function *F;
2749  ScalarEvolution *SE;
2750  TargetTransformInfo *TTI;
2751  TargetLibraryInfo *TLI;
2752  AAResults *AA;
2753  LoopInfo *LI;
2754  DominatorTree *DT;
2755  AssumptionCache *AC;
2756  DemandedBits *DB;
2757  const DataLayout *DL;
2759 
2760  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
2761  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
2762 
2763  /// Instruction builder to construct the vectorized tree.
2764  IRBuilder<> Builder;
2765 
2766  /// A map of scalar integer values to the smallest bit width with which they
2767  /// can legally be represented. The values map to (width, signed) pairs,
2768  /// where "width" indicates the minimum bit width and "signed" is True if the
2769  /// value must be signed-extended, rather than zero-extended, back to its
2770  /// original width.
2771  MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
2772 };
2773 
2774 } // end namespace slpvectorizer
2775 
2776 template <> struct GraphTraits<BoUpSLP *> {
2777  using TreeEntry = BoUpSLP::TreeEntry;
2778 
2779  /// NodeRef has to be a pointer per the GraphWriter.
2780  using NodeRef = TreeEntry *;
2781 
2782  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
2783 
2784  /// Add the VectorizableTree to the index iterator to be able to return
2785  /// TreeEntry pointers.
2786  struct ChildIteratorType
2787  : public iterator_adaptor_base<
2788  ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
2789  ContainerTy &VectorizableTree;
2790 
2791  ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
2792  ContainerTy &VT)
2793  : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
2794 
2795  NodeRef operator*() { return I->UserTE; }
2796  };
2797 
2798  static NodeRef getEntryNode(BoUpSLP &R) {
2799  return R.VectorizableTree[0].get();
2800  }
2801 
2802  static ChildIteratorType child_begin(NodeRef N) {
2803  return {N->UserTreeIndices.begin(), N->Container};
2804  }
2805 
2806  static ChildIteratorType child_end(NodeRef N) {
2807  return {N->UserTreeIndices.end(), N->Container};
2808  }
2809 
2810  /// For the node iterator we just need to turn the TreeEntry iterator into a
2811  /// TreeEntry* iterator so that it dereferences to NodeRef.
2812  class nodes_iterator {
2813  using ItTy = ContainerTy::iterator;
2814  ItTy It;
2815 
2816  public:
2817  nodes_iterator(const ItTy &It2) : It(It2) {}
2818  NodeRef operator*() { return It->get(); }
2819  nodes_iterator operator++() {
2820  ++It;
2821  return *this;
2822  }
2823  bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
2824  };
2825 
2826  static nodes_iterator nodes_begin(BoUpSLP *R) {
2827  return nodes_iterator(R->VectorizableTree.begin());
2828  }
2829 
2830  static nodes_iterator nodes_end(BoUpSLP *R) {
2831  return nodes_iterator(R->VectorizableTree.end());
2832  }
2833 
2834  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
2835 };
2836 
2837 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
2838  using TreeEntry = BoUpSLP::TreeEntry;
2839 
2840  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2841 
2842  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
2843  std::string Str;
2844  raw_string_ostream OS(Str);
2845  if (isSplat(Entry->Scalars))
2846  OS << "<splat> ";
2847  for (auto V : Entry->Scalars) {
2848  OS << *V;
2849  if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
2850  return EU.Scalar == V;
2851  }))
2852  OS << " <extract>";
2853  OS << "\n";
2854  }
2855  return Str;
2856  }
2857 
2858  static std::string getNodeAttributes(const TreeEntry *Entry,
2859  const BoUpSLP *) {
2860  if (Entry->State == TreeEntry::NeedToGather)
2861  return "color=red";
2862  return "";
2863  }
2864 };
2865 
2866 } // end namespace llvm
2867 
2868 BoUpSLP::~BoUpSLP() {
2869  for (const auto &Pair : DeletedInstructions) {
2870  // Replace operands of ignored instructions with Undefs in case they were
2871  // marked for deletion.
2872  if (Pair.getSecond()) {
2873  Value *Undef = UndefValue::get(Pair.getFirst()->getType());
2874  Pair.getFirst()->replaceAllUsesWith(Undef);
2875  }
2876  Pair.getFirst()->dropAllReferences();
2877  }
2878  for (const auto &Pair : DeletedInstructions) {
2879  assert(Pair.getFirst()->use_empty() &&
2880  "trying to erase instruction with users.");
2881  Pair.getFirst()->eraseFromParent();
2882  }
2883 #ifdef EXPENSIVE_CHECKS
2884  // If we could guarantee that this call is not extremely slow, we could
2885  // remove the ifdef limitation (see PR47712).
2886  assert(!verifyFunction(*F, &dbgs()));
2887 #endif
2888 }
2889 
2890 void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
2891  for (auto *V : AV) {
2892  if (auto *I = dyn_cast<Instruction>(V))
2893  eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
2894  };
2895 }
2896 
2897 /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
2898 /// contains the original mask for the scalars reused in the node. The
2899 /// procedure transforms this mask in accordance with the given \p Mask.
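/// For illustration: with Reuses = {0, 0, 1, 1} and Mask = {2, 3, 0, 1}, each
/// original element Reuses[I] is moved to position Mask[I], giving
/// Reuses = {1, 1, 0, 0}.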
2900 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
2901  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
2902  "Expected non-empty mask.");
2903  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
2904  Prev.swap(Reuses);
2905  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
2906  if (Mask[I] != UndefMaskElem)
2907  Reuses[Mask[I]] = Prev[I];
2908 }
2909 
2910 /// Reorders the given \p Order according to the given \p Mask. \p Order is
2911 /// the original order of the scalars. The procedure transforms the provided order
2912 /// in accordance with the given \p Mask. If the resulting \p Order is just an
2913 /// identity order, \p Order is cleared.
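/// For illustration: if \p Order is empty (identity) and Mask = {1, 0, 3, 2},
/// the resulting \p Order is {1, 0, 3, 2}; if the combination works out to the
/// identity order, \p Order is cleared instead.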
2914 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
2915  assert(!Mask.empty() && "Expected non-empty mask.");
2916  SmallVector<int> MaskOrder;
2917  if (Order.empty()) {
2918  MaskOrder.resize(Mask.size());
2919  std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
2920  } else {
2921  inversePermutation(Order, MaskOrder);
2922  }
2923  reorderReuses(MaskOrder, Mask);
2924  if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
2925  Order.clear();
2926  return;
2927  }
2928  Order.assign(Mask.size(), Mask.size());
2929  for (unsigned I = 0, E = Mask.size(); I < E; ++I)
2930  if (MaskOrder[I] != UndefMaskElem)
2931  Order[MaskOrder[I]] = I;
2932  fixupOrderingIndices(Order);
2933 }
2934 
2936 BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
2937  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
2938  unsigned NumScalars = TE.Scalars.size();
2939  OrdersType CurrentOrder(NumScalars, NumScalars);
2940  SmallVector<int> Positions;
2941  SmallBitVector UsedPositions(NumScalars);
2942  const TreeEntry *STE = nullptr;
2943  // Try to find all gathered scalars that also get vectorized in another
2944  // vectorized node. Here we can use only one single vectorized tree node to
2945  // correctly identify the order of the gathered scalars.
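 // For illustration: if this gather node holds loads {%a, %b} and some
 // vectorized node holds them as Scalars = {%b, %a}, the lanes found below
 // yield CurrentOrder = {1, 0}, so reordering the gather lets it reuse the
 // existing vector.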
2946  for (unsigned I = 0; I < NumScalars; ++I) {
2947  Value *V = TE.Scalars[I];
2948  if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
2949  continue;
2950  if (const auto *LocalSTE = getTreeEntry(V)) {
2951  if (!STE)
2952  STE = LocalSTE;
2953  else if (STE != LocalSTE)
2954  // Take the order only from the single vector node.
2955  return None;
2956  unsigned Lane =
2957  std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
2958  if (Lane >= NumScalars)
2959  return None;
2960  if (CurrentOrder[Lane] != NumScalars) {
2961  if (Lane != I)
2962  continue;
2963  UsedPositions.reset(CurrentOrder[Lane]);
2964  }
2965  // The partial identity (where only some elements of the gather node are
2966  // in the identity order) is good.
2967  CurrentOrder[Lane] = I;
2968  UsedPositions.set(I);
2969  }
2970  }
2971  // Need to keep the order if we have a vector entry and at least 2 scalars or
2972  // the vectorized entry has just 2 scalars.
2973  if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
2974  auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
2975  for (unsigned I = 0; I < NumScalars; ++I)
2976  if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
2977  return false;
2978  return true;
2979  };
2980  if (IsIdentityOrder(CurrentOrder)) {
2981  CurrentOrder.clear();
2982  return CurrentOrder;
2983  }
2984  auto *It = CurrentOrder.begin();
2985  for (unsigned I = 0; I < NumScalars;) {
2986  if (UsedPositions.test(I)) {
2987  ++I;
2988  continue;
2989  }
2990  if (*It == NumScalars) {
2991  *It = I;
2992  ++I;
2993  }
2994  ++It;
2995  }
2996  return CurrentOrder;
2997  }
2998  return None;
2999 }
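// Illustrative trace (editor's example, not part of the original source):
// for a gather node with Scalars = {%x, %y} whose values live in a single
// vectorized node holding {%y, %x}, the loop sets CurrentOrder[1] = 0 and
// CurrentOrder[0] = 1, so the function returns {1, 0}: the gather is just
// that vectorized node with its two lanes swapped.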
3000 
3001 Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
3002  bool TopToBottom) {
3003  // No need to reorder if we need to shuffle reuses; the node still needs
3004  // to be shuffled anyway.
3005  if (!TE.ReuseShuffleIndices.empty())
3006  return None;
3007  if (TE.State == TreeEntry::Vectorize &&
3008  (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
3009  (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
3010  !TE.isAltShuffle())
3011  return TE.ReorderIndices;
3012  if (TE.State == TreeEntry::NeedToGather) {
3013  // TODO: add analysis of other gather nodes with extractelement
3014  // instructions and other values/instructions, not only undefs.
3015  if (((TE.getOpcode() == Instruction::ExtractElement &&
3016  !TE.isAltShuffle()) ||
3017  (all_of(TE.Scalars,
3018  [](Value *V) {
3019  return isa<UndefValue, ExtractElementInst>(V);
3020  }) &&
3021  any_of(TE.Scalars,
3022  [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
3023  all_of(TE.Scalars,
3024  [](Value *V) {
3025  auto *EE = dyn_cast<ExtractElementInst>(V);
3026  return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
3027  }) &&
3028  allSameType(TE.Scalars)) {
3029  // Check that gather of extractelements can be represented as
3030  // just a shuffle of a single vector.
3031  OrdersType CurrentOrder;
3032  bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
3033  if (Reuse || !CurrentOrder.empty()) {
3034  if (!CurrentOrder.empty())
3035  fixupOrderingIndices(CurrentOrder);
3036  return CurrentOrder;
3037  }
3038  }
3039  if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
3040  return CurrentOrder;
3041  }
3042  return None;
3043 }
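// Editor's note (illustration, not part of the original source): a
// vectorized node of loads with ReorderIndices = {1, 0} simply returns
// {1, 0}; a gather node made only of extractelements from one fixed vector
// returns the extraction order found by canReuseExtract() or, failing that,
// by findReusedOrderedScalars().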
3044 
3045 void BoUpSLP::reorderTopToBottom() {
3046  // Maps VF to the graph nodes.
3047  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
3048  // ExtractElement gather nodes which can be vectorized and need to handle
3049  // their ordering.
3050  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
3051  // Find all reorderable nodes with the given VF.
3052  // Currently these are vectorized stores, loads, extracts + some gathering
3053  // of extracts.
3054  for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
3055  const std::unique_ptr<TreeEntry> &TE) {
3056  if (Optional<OrdersType> CurrentOrder =
3057  getReorderingData(*TE.get(), /*TopToBottom=*/true)) {
3058  // Do not include ordering for nodes used in the alt opcode vectorization;
3059  // it is better to reorder them during the bottom-to-top stage. If we follow
3060  // the order here, it causes reordering of the whole graph, though actually
3061  // it is profitable just to reorder the subgraph that starts from the
3062  // alternate opcode vectorization node. Such nodes already end up with a
3063  // shuffle instruction and it is enough to change this shuffle rather than
3064  // rotate the scalars for the whole graph.
3065  unsigned Cnt = 0;
3066  const TreeEntry *UserTE = TE.get();
3067  while (UserTE && Cnt < RecursionMaxDepth) {
3068  if (UserTE->UserTreeIndices.size() != 1)
3069  break;
3070  if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
3071  return EI.UserTE->State == TreeEntry::Vectorize &&
3072  EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
3073  }))
3074  return;
3075  if (UserTE->UserTreeIndices.empty())
3076  UserTE = nullptr;
3077  else
3078  UserTE = UserTE->UserTreeIndices.back().UserTE;
3079  ++Cnt;
3080  }
3081  VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
3082  if (TE->State != TreeEntry::Vectorize)
3083  GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
3084  }
3085  });
3086 
3087  // Reorder the graph nodes according to their vectorization factor.
3088  for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1;
3089  VF /= 2) {
3090  auto It = VFToOrderedEntries.find(VF);
3091  if (It == VFToOrderedEntries.end())
3092  continue;
3093  // Try to find the most profitable order. We are just looking for the most
3094  // used order and reorder the scalar elements in the nodes according to this
3095  // most used order.
3096  ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
3097  // All operands are reordered and used only in this node - propagate the
3098  // most used order to the user node.
3099  MapVector<OrdersType, unsigned,
3100  DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
3101  OrdersUses;
3103  for (const TreeEntry *OpTE : OrderedEntries) {
3104  // No need to reorder these nodes; we still need to extend and to use a
3105  // shuffle, just need to merge the reordering shuffle and the reuse shuffle.
3106  if (!OpTE->ReuseShuffleIndices.empty())
3107  continue;
3108  // Count number of orders uses.
3109  const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
3110  if (OpTE->State == TreeEntry::NeedToGather)
3111  return GathersToOrders.find(OpTE)->second;
3112  return OpTE->ReorderIndices;
3113  }();
3114  // Stores actually store the mask, not the order; we need to invert it.
3115  if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
3116  OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
3117  SmallVector<int> Mask;
3118  inversePermutation(Order, Mask);
3119  unsigned E = Order.size();
3120  OrdersType CurrentOrder(E, E);
3121  transform(Mask, CurrentOrder.begin(), [E](int Idx) {
3122  return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
3123  });
3124  fixupOrderingIndices(CurrentOrder);
3125  ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
3126  } else {
3127  ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
3128  }
3129  }
3130  // Set order of the user node.
3131  if (OrdersUses.empty())
3132  continue;
3133  // Choose the most used order.
3134  ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
3135  unsigned Cnt = OrdersUses.front().second;
3136  for (const auto &Pair : drop_begin(OrdersUses)) {
3137  if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
3138  BestOrder = Pair.first;
3139  Cnt = Pair.second;
3140  }
3141  }
3142  // Set order of the user node.
3143  if (BestOrder.empty())
3144  continue;
3145  SmallVector<int> Mask;
3146  inversePermutation(BestOrder, Mask);
3147  SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
3148  unsigned E = BestOrder.size();
3149  transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
3150  return I < E ? static_cast<int>(I) : UndefMaskElem;
3151  });
3152  // Do an actual reordering, if profitable.
3153  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
3154  // Just do the reordering for the nodes with the given VF.
3155  if (TE->Scalars.size() != VF) {
3156  if (TE->ReuseShuffleIndices.size() == VF) {
3157  // Need to reorder the reuses masks of the operands with smaller VF to
3158  // be able to find the match between the graph nodes and scalar
3159  // operands of the given node during vectorization/cost estimation.
3160  assert(all_of(TE->UserTreeIndices,
3161  [VF, &TE](const EdgeInfo &EI) {
3162  return EI.UserTE->Scalars.size() == VF ||
3163  EI.UserTE->Scalars.size() ==
3164  TE->Scalars.size();
3165  }) &&
3166  "All users must be of VF size.");
3167  // Update ordering of the operands with the smaller VF than the given
3168  // one.
3169  reorderReuses(TE->ReuseShuffleIndices, Mask);
3170  }
3171  continue;
3172  }
3173  if (TE->State == TreeEntry::Vectorize &&
3174  isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
3175  InsertElementInst>(TE->getMainOp()) &&
3176  !TE->isAltShuffle()) {
3177  // Build correct orders for extract{element,value}, loads and
3178  // stores.
3179  reorderOrder(TE->ReorderIndices, Mask);
3180  if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
3181  TE->reorderOperands(Mask);
3182  } else {
3183  // Reorder the node and its operands.
3184  TE->reorderOperands(Mask);
3185  assert(TE->ReorderIndices.empty() &&
3186  "Expected empty reorder sequence.");
3187  reorderScalars(TE->Scalars, Mask);
3188  }
3189  if (!TE->ReuseShuffleIndices.empty()) {
3190  // Apply reversed order to keep the original ordering of the reused
3191  // elements to avoid extra reorder indices shuffling.
3192  OrdersType CurrentOrder;
3193  reorderOrder(CurrentOrder, MaskOrder);
3194  SmallVector<int> NewReuses;
3195  inversePermutation(CurrentOrder, NewReuses);
3196  addMask(NewReuses, TE->ReuseShuffleIndices);
3197  TE->ReuseShuffleIndices.swap(NewReuses);
3198  }
3199  }
3200  }
3201 }
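// Editor's note (illustration, not part of the original source): the
// OrdersUses counting above is a simple vote. If, for a given VF, two nodes
// report the order {1, 0} and one reports the empty (identity) order, then
// BestOrder = {1, 0} with 2 votes against 1, and every node of that VF is
// rewritten with the corresponding mask; on a tie the identity order wins.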
3202 
3203 void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
3204  SetVector<TreeEntry *> OrderedEntries;
3205  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
3206  // Find all reorderable leaf nodes with the given VF.
3207  // Currently these are vectorized loads, extracts without alternate operands
3208  // + some gathering of extracts.
3209  SmallVector<TreeEntry *> NonVectorized;
3210  for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
3211  &NonVectorized](
3212  const std::unique_ptr<TreeEntry> &TE) {
3213  if (TE->State != TreeEntry::Vectorize)
3214  NonVectorized.push_back(TE.get());
3215  if (Optional<OrdersType> CurrentOrder =
3216  getReorderingData(*TE.get(), /*TopToBottom=*/false)) {
3217  OrderedEntries.insert(TE.get());
3218  if (TE->State != TreeEntry::Vectorize)
3219  GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
3220  }
3221  });
3222 
3223  // Checks if the operands of the users are reorderable and have only a
3224  // single use.
3225  auto &&CheckOperands =
3226  [this, &NonVectorized](const auto &Data,
3227  SmallVectorImpl<TreeEntry *> &GatherOps) {
3228  for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) {
3229  if (any_of(Data.second,
3230  [I](const std::pair<unsigned, TreeEntry *> &OpData) {
3231  return OpData.first == I &&
3232  OpData.second->State == TreeEntry::Vectorize;
3233  }))
3234  continue;
3235  ArrayRef<Value *> VL = Data.first->getOperand(I);
3236  const TreeEntry *TE = nullptr;
3237  const auto *It = find_if(VL, [this, &TE](Value *V) {
3238  TE = getTreeEntry(V);
3239  return TE;
3240  });
3241  if (It != VL.end() && TE->isSame(VL))
3242  return false;
3243  TreeEntry *Gather = nullptr;
3244  if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) {
3245  assert(TE->State != TreeEntry::Vectorize &&
3246  "Only non-vectorized nodes are expected.");
3247  if (TE->isSame(VL)) {
3248  Gather = TE;
3249  return true;
3250  }
3251  return false;
3252  }) > 1)
3253  return false;
3254  if (Gather)
3255  GatherOps.push_back(Gather);
3256  }
3257  return true;
3258  };
3259  // 1. Propagate order to the graph nodes, which use only reordered nodes.
3260  // I.e., if the node has operands that are reordered, try to make at least
3261  // one operand order in the natural order and reorder others + reorder the
3262  // user node itself.
3263  SmallPtrSet<const TreeEntry *, 4> Visited;
3264  while (!OrderedEntries.empty()) {
3265  // 1. Filter out only reordered nodes.
3266  // 2. If the entry has multiple uses - skip it and jump to the next node.
3267  DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
3268  SmallVector<TreeEntry *> Filtered;
3269  for (TreeEntry *TE : OrderedEntries) {
3270  if (!(TE->State == TreeEntry::Vectorize ||
3271  (TE->State == TreeEntry::NeedToGather &&
3272  GathersToOrders.count(TE))) ||
3273  TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
3274  !all_of(drop_begin(TE->UserTreeIndices),
3275  [TE](const EdgeInfo &EI) {
3276  return EI.UserTE == TE->UserTreeIndices.front().UserTE;
3277  }) ||
3278  !Visited.insert(TE).second) {
3279  Filtered.push_back(TE);
3280  continue;
3281  }
3282  // Build a map between user nodes and their operands order to speed up the
3283  // search. The graph currently does not provide this dependency directly.
3284  for (EdgeInfo &EI : TE->UserTreeIndices) {
3285  TreeEntry *UserTE = EI.UserTE;
3286  auto It = Users.find(UserTE);
3287  if (It == Users.end())
3288  It = Users.insert({UserTE, {}}).first;
3289  It->second.emplace_back(EI.EdgeIdx, TE);
3290  }
3291  }
3292  // Erase filtered entries.
3293  for_each(Filtered,
3294  [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
3295  for (const auto &Data : Users) {
3296  // Check that operands are used only in the User node.
3297  SmallVector<TreeEntry *> GatherOps;
3298  if (!CheckOperands(Data, GatherOps)) {
3299  for_each(Data.second,
3300  [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
3301  OrderedEntries.remove(Op.second);
3302  });
3303  continue;
3304  }
3305  // All operands are reordered and used only in this node - propagate the
3306  // most used order to the user node.
3307  MapVector<OrdersType, unsigned,
3308  DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
3309  OrdersUses;
3310  SmallPtrSet<const TreeEntry *, 4> VisitedOps;
3311  for (const auto &Op : Data.second) {
3312  TreeEntry *OpTE = Op.second;
3313  if (!OpTE->ReuseShuffleIndices.empty() ||
3314  (IgnoreReorder && OpTE == VectorizableTree.front().get()))
3315  continue;
3316  const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
3317  if (OpTE->State == TreeEntry::NeedToGather)
3318  return GathersToOrders.find(OpTE)->second;
3319  return OpTE->ReorderIndices;
3320  }();
3321  // Stores actually store the mask, not the order; we need to invert it.
3322  if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
3323  OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
3324  SmallVector<int> Mask;
3325  inversePermutation(Order, Mask);
3326  unsigned E = Order.size();
3327  OrdersType CurrentOrder(E, E);
3328  transform(Mask, CurrentOrder.begin(), [E](int Idx) {
3329  return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
3330  });
3331  fixupOrderingIndices(CurrentOrder);
3332  ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
3333  } else {
3334  ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
3335  }
3336  if (VisitedOps.insert(OpTE).second)
3337  OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
3338  OpTE->UserTreeIndices.size();
3339  assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
3340  --OrdersUses[{}];
3341  }
3342  // If there are no orders, skip the current nodes and jump to the next, if any.
3343  if (OrdersUses.empty()) {
3344  for_each(Data.second,
3345  [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
3346  OrderedEntries.remove(Op.second);
3347  });
3348  continue;
3349  }
3350  // Choose the best order.
3351  ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
3352  unsigned Cnt = OrdersUses.front().second;
3353  for (const auto &Pair : drop_begin(OrdersUses)) {
3354  if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
3355  BestOrder = Pair.first;
3356  Cnt = Pair.second;
3357  }
3358  }
3359  // Set order of the user node (reordering of operands and user nodes).
3360  if (BestOrder.empty()) {
3361  for_each(Data.second,
3362  [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
3363  OrderedEntries.remove(Op.second);
3364  });
3365  continue;
3366  }
3367  // Erase operands from OrderedEntries list and adjust their orders.
3368  VisitedOps.clear();
3369  SmallVector<int> Mask;
3370  inversePermutation(BestOrder, Mask);
3371  SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
3372  unsigned E = BestOrder.size();
3373  transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
3374  return I < E ? static_cast<int>(I) : UndefMaskElem;
3375  });
3376  for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
3377  TreeEntry *TE = Op.second;
3378  OrderedEntries.remove(TE);
3379  if (!VisitedOps.insert(TE).second)
3380  continue;
3381  if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) {
3382  // Just reorder reuses indices.
3383  reorderReuses(TE->ReuseShuffleIndices, Mask);
3384  continue;
3385  }
3386  // Gathers are processed separately.
3387  if (TE->State != TreeEntry::Vectorize)
3388  continue;
3389  assert((BestOrder.size() == TE->ReorderIndices.size() ||
3390  TE->ReorderIndices.empty()) &&
3391  "Non-matching sizes of user/operand entries.");
3392  reorderOrder(TE->ReorderIndices, Mask);
3393  }
3394  // For gathers we just need to reorder their scalars.
3395  for (TreeEntry *Gather : GatherOps) {
3396  assert(Gather->ReorderIndices.empty() &&
3397  "Unexpected reordering of gathers.");
3398  if (!Gather->ReuseShuffleIndices.empty()) {
3399  // Just reorder reuses indices.
3400  reorderReuses(Gather->ReuseShuffleIndices, Mask);
3401  continue;
3402  }
3403  reorderScalars(Gather->Scalars, Mask);
3404  OrderedEntries.remove(Gather);
3405  }
3406  // Reorder operands of the user node and set the ordering for the user
3407  // node itself.
3408  if (Data.first->State != TreeEntry::Vectorize ||
3409  !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
3410  Data.first->getMainOp()) ||
3411  Data.first->isAltShuffle())
3412  Data.first->reorderOperands(Mask);
3413  if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
3414  Data.first->isAltShuffle()) {
3415  reorderScalars(Data.first->Scalars, Mask);
3416  reorderOrder(Data.first->ReorderIndices, MaskOrder);
3417  if (Data.first->ReuseShuffleIndices.empty() &&
3418  !Data.first->ReorderIndices.empty() &&
3419  !Data.first->isAltShuffle()) {
3420  // Insert user node to the list to try to sink reordering deeper in
3421  // the graph.
3422  OrderedEntries.insert(Data.first);
3423  }
3424  } else {
3425  reorderOrder(Data.first->ReorderIndices, Mask);
3426  }
3427  }
3428  }
3429  // If the reordering is unnecessary, just remove the reorder.
3430  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
3431  VectorizableTree.front()->ReuseShuffleIndices.empty())
3432  VectorizableTree.front()->ReorderIndices.clear();
3433 }
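// Editor's note (illustration, not part of the original source): in the
// voting loop above, each operand casts one vote for its own order; in
// addition, on its first visit it adds one identity vote per tree user it
// has, and one identity vote is then taken back per edge into the current
// user, so operands that are also used by other tree nodes bias the choice
// towards keeping the natural order.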
3434 
3435 void BoUpSLP::buildExternalUses(
3436  const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
3437  // Collect the values that we need to extract from the tree.
3438  for (auto &TEPtr : VectorizableTree) {
3439  TreeEntry *Entry = TEPtr.get();
3440 
3441  // No need to handle users of gathered values.
3442  if (Entry->State == TreeEntry::NeedToGather)
3443  continue;
3444 
3445  // For each lane:
3446  for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
3447  Value *Scalar = Entry->Scalars[Lane];
3448  int FoundLane = Entry->findLaneForValue(Scalar);
3449 
3450  // Check if the scalar is externally used as an extra arg.
3451  auto ExtI = ExternallyUsedValues.find(Scalar);
3452  if (ExtI != ExternallyUsedValues.end()) {
3453  LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
3454  << Lane << " from " << *Scalar << ".\n");
3455  ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
3456  }
3457  for (User *U : Scalar->users()) {
3458  LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
3459 
3460  Instruction *UserInst = dyn_cast<Instruction>(U);
3461  if (!UserInst)
3462  continue;
3463 
3464  if (isDeleted(UserInst))
3465  continue;
3466 
3467  // Skip in-tree scalars that become vectors
3468  if (TreeEntry *UseEntry = getTreeEntry(U)) {
3469  Value *UseScalar = UseEntry->Scalars[0];
3470  // Some in-tree scalars will remain as scalar in vectorized
3471  // instructions. If that is the case, the one in Lane 0 will
3472  // be used.
3473  if (UseScalar != U ||
3474  UseEntry->State == TreeEntry::ScatterVectorize ||
3475  !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
3476  LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
3477  << ".\n");
3478  assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
3479  continue;
3480  }
3481  }
3482 
3483  // Ignore users in the user ignore list.
3484  if (is_contained(UserIgnoreList, UserInst))
3485  continue;
3486 
3487  LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
3488  << Lane << " from " << *Scalar << ".\n");
3489  ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
3490  }
3491  }
3492  }
3493 }
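// Editor's note (illustration, not part of the original source): if lane 2
// of a vectorized entry holds %mul and %mul is also used by a call that is
// not part of the tree, an ExternalUser(%mul, <that call>, 2) record is
// added so that the value can later be extracted from the vector for that
// external use.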
3494 
3495 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
3496  ArrayRef<Value *> UserIgnoreLst) {
3497  deleteTree();
3498  UserIgnoreList = UserIgnoreLst;
3499  if (!allSameType(Roots))
3500  return;
3501  buildTree_rec(Roots, 0, EdgeInfo());
3502 }
3503 
3504 namespace {
3505 /// Tracks the state in which we can represent the loads in the given sequence.
3506 enum class LoadsState { Gather, Vectorize, ScatterVectorize };
3507 } // anonymous namespace
3508 
3509 /// Checks if the given array of loads can be represented as a vectorized load,
3510 /// a scatter-vectorized load, or just a simple gather.
3511 static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
3512  const TargetTransformInfo &TTI,
3513  const DataLayout &DL, ScalarEvolution &SE,
3514  SmallVectorImpl<unsigned> &Order,
3515  SmallVectorImpl<Value *> &PointerOps) {
3516  // Check that a vectorized load would load the same memory as a scalar
3517  // load. For example, we don't want to vectorize loads that are smaller
3518  // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
3519  // treats loading/storing it as an i8 struct. If we vectorize loads/stores
3520  // from such a struct, we read/write packed bits disagreeing with the
3521  // unvectorized version.
3522  Type *ScalarTy = VL0->getType();
3523 
3524  if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
3525  return LoadsState::Gather;
3526 
3527  // Make sure all loads in the bundle are simple - we can't vectorize
3528  // atomic or volatile loads.
3529  PointerOps.clear();
3530  PointerOps.resize(VL.size());
3531  auto *POIter = PointerOps.begin();
3532  for (Value *V : VL) {
3533  auto *L = cast<LoadInst>(V);
3534  if (!L->isSimple())
3535  return LoadsState::Gather;
3536  *POIter = L->getPointerOperand();
3537  ++POIter;
3538  }
3539 
3540  Order.clear();
3541  // Check the order of pointer operands.
3542  if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) {
3543  Value *Ptr0;
3544  Value *PtrN;
3545  if (Order.empty()) {
3546  Ptr0 = PointerOps.front();
3547  PtrN = PointerOps.back();
3548  } else {
3549  Ptr0 = PointerOps[Order.front()];
3550  PtrN = PointerOps[Order.back()];
3551  }
3552  Optional<int> Diff =
3553  getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
3554  // Check that the sorted loads are consecutive.
3555  if (static_cast<unsigned>(*Diff) == VL.size() - 1)
3556  return LoadsState::Vectorize;
3557  Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
3558  for (Value *V : VL)
3559  CommonAlignment =
3560  commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
3561  if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()),
3562  CommonAlignment))
3563  return LoadsState::ScatterVectorize;
3564  }
3565 
3566  return LoadsState::Gather;
3567 }
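// Editor's note (illustration, not part of the original source): four simple
// loads of a[0], a[1], a[2], a[3] give a sorted pointer distance of
// 3 == VL.size() - 1, so the result is LoadsState::Vectorize; loads of
// a[0], a[2], a[4], a[6] are not consecutive and yield ScatterVectorize only
// if the target has a legal masked gather for <4 x Ty>, otherwise Gather.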
3568 
3569 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
3570  const EdgeInfo &UserTreeIdx) {
3571  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
3572 
3573  SmallVector<int> ReuseShuffleIndicies;
3574  SmallVector<Value *> UniqueValues;
3575  auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
3576  &UserTreeIdx,
3577  this](const InstructionsState &S) {
3578  // Check that every instruction appears once in this bundle.
3579  DenseMap<Value *, unsigned> UniquePositions;
3580  for (Value *V : VL) {
3581  if (isConstant(V)) {
3582  ReuseShuffleIndicies.emplace_back(
3583  isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size());
3584  UniqueValues.emplace_back(V);
3585  continue;
3586  }
3587  auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
3588  ReuseShuffleIndicies.emplace_back(Res.first->second);
3589  if (Res.second)
3590  UniqueValues.emplace_back(V);
3591  }
3592  size_t NumUniqueScalarValues = UniqueValues.size();
3593  if (NumUniqueScalarValues == VL.size()) {
3594  ReuseShuffleIndicies.clear();
3595  } else {
3596  LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
3597  if (NumUniqueScalarValues <= 1 ||
3598  (UniquePositions.size() == 1 && all_of(UniqueValues,
3599  [](Value *V) {
3600  return isa<UndefValue>(V) ||
3601  !isConstant(V);
3602  })) ||
3603  !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
3604  LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
3605  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
3606  return false;
3607  }
3608  VL = UniqueValues;
3609  }
3610  return true;
3611  };
3612 
3613  InstructionsState S = getSameOpcode(VL);
3614  if (Depth == RecursionMaxDepth) {
3615  LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
3616  if (TryToFindDuplicates(S))
3617  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3618  ReuseShuffleIndicies);
3619  return;
3620  }
3621 
3622  // Don't handle scalable vectors
3623  if (S.getOpcode() == Instruction::ExtractElement &&
3624  isa<ScalableVectorType>(
3625  cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
3626  LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
3627  if (TryToFindDuplicates(S))
3628  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3629  ReuseShuffleIndicies);
3630  return;
3631  }
3632 
3633  // Don't handle vectors.
3634  if (S.OpValue->getType()->isVectorTy() &&
3635  !isa<InsertElementInst>(S.OpValue)) {
3636  LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
3637  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
3638  return;
3639  }
3640 
3641  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
3642  if (SI->getValueOperand()->getType()->isVectorTy()) {
3643  LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
3644  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
3645  return;
3646  }
3647 
3648  // If all of the operands are identical or constant we have a simple solution.
3649  // If we deal with insert/extract instructions, they all must have constant
3650  // indices, otherwise we should gather them, not try to vectorize.
3651  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() ||
3652  (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
3653  !all_of(VL, isVectorLikeInstWithConstOps))) {
3654  LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
3655  if (TryToFindDuplicates(S))
3656  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3657  ReuseShuffleIndicies);
3658  return;
3659  }
3660 
3661  // We now know that this is a vector of instructions of the same type from
3662  // the same block.
3663 
3664  // Don't vectorize ephemeral values.
3665  for (Value *V : VL) {
3666  if (EphValues.count(V)) {
3667  LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
3668  << ") is ephemeral.\n");
3669  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
3670  return;
3671  }
3672  }
3673 
3674  // Check if this is a duplicate of another entry.
3675  if (TreeEntry *E = getTreeEntry(S.OpValue)) {
3676  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
3677  if (!E->isSame(VL)) {
3678  LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
3679  if (TryToFindDuplicates(S))
3680  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3681  ReuseShuffleIndicies);
3682  return;
3683  }
3684  // Record the reuse of the tree node. FIXME, currently this is only used to
3685  // properly draw the graph rather than for the actual vectorization.
3686  E->UserTreeIndices.push_back(UserTreeIdx);
3687  LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
3688  << ".\n");
3689  return;
3690  }
3691 
3692  // Check that none of the instructions in the bundle are already in the tree.
3693  for (Value *V : VL) {
3694  auto *I = dyn_cast<Instruction>(V);
3695  if (!I)
3696  continue;
3697  if (getTreeEntry(I)) {
3698  LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
3699  << ") is already in tree.\n");
3700  if (TryToFindDuplicates(S))
3701  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3702  ReuseShuffleIndicies);
3703  return;
3704  }
3705  }
3706 
3707  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
3708  for (Value *V : VL) {
3709  if (is_contained(UserIgnoreList, V)) {
3710  LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
3711  if (TryToFindDuplicates(S))
3712  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3713  ReuseShuffleIndicies);
3714  return;
3715  }
3716  }
3717 
3718  // Check that all of the users of the scalars that we want to vectorize are
3719  // schedulable.
3720  auto *VL0 = cast<Instruction>(S.OpValue);
3721  BasicBlock *BB = VL0->getParent();
3722 
3723  if (!DT->isReachableFromEntry(BB)) {
3724  // Don't go into unreachable blocks. They may contain instructions with
3725  // dependency cycles which confuse the final scheduling.
3726  LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
3727  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
3728  return;
3729  }
3730 
3731  // Check that every instruction appears once in this bundle.
3732  if (!TryToFindDuplicates(S))
3733  return;
3734 
3735  auto &BSRef = BlocksSchedules[BB];
3736  if (!BSRef)
3737  BSRef = std::make_unique<BlockScheduling>(BB);
3738 
3739  BlockScheduling &BS = *BSRef.get();
3740 
3741  Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
3742  if (!Bundle) {
3743  LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
3744  assert((!BS.getScheduleData(VL0) ||
3745  !BS.getScheduleData(VL0)->isPartOfBundle()) &&
3746  "tryScheduleBundle should cancelScheduling on failure");
3747  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3748  ReuseShuffleIndicies);
3749  return;
3750  }
3751  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
3752 
3753  unsigned ShuffleOrOp = S.isAltShuffle() ?
3754  (unsigned) Instruction::ShuffleVector : S.getOpcode();
3755  switch (ShuffleOrOp) {
3756  case Instruction::PHI: {
3757  auto *PH = cast<PHINode>(VL0);
3758 
3759  // Check for terminator values (e.g. invoke).
3760  for (Value *V : VL)
3761  for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
3762  Instruction *Term = dyn_cast<Instruction>(
3763  cast<PHINode>(V)->getIncomingValueForBlock(
3764  PH->getIncomingBlock(I)));
3765  if (Term && Term->isTerminator()) {
3766  LLVM_DEBUG(dbgs()
3767  << "SLP: Need to swizzle PHINodes (terminator use).\n");
3768  BS.cancelScheduling(VL, VL0);
3769  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3770  ReuseShuffleIndicies);
3771  return;
3772  }
3773  }
3774 
3775  TreeEntry *TE =
3776  newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
3777  LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
3778 
3779  // Keeps the reordered operands to avoid code duplication.
3780  SmallVector<ValueList, 2> OperandsVec;
3781  for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
3782  if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
3783  ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));
3784  TE->setOperand(I, Operands);
3785  OperandsVec.push_back(Operands);
3786  continue;
3787  }
3788  ValueList Operands;
3789  // Prepare the operand vector.
3790  for (Value *V : VL)
3791  Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
3792  PH->getIncomingBlock(I)));
3793  TE->setOperand(I, Operands);
3794  OperandsVec.push_back(Operands);
3795  }
3796  for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
3797  buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
3798  return;
3799  }
3800  case Instruction::ExtractValue:
3801  case Instruction::ExtractElement: {
3802  OrdersType CurrentOrder;
3803  bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
3804  if (Reuse) {
3805  LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
3806  newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3807  ReuseShuffleIndicies);
3808  // This is a special case, as it does not gather, but at the same time
3809  // we are not extending buildTree_rec() towards the operands.
3810  ValueList Op0;
3811  Op0.assign(VL.size(), VL0->getOperand(0));
3812  VectorizableTree.back()->setOperand(0, Op0);
3813  return;
3814  }
3815  if (!CurrentOrder.empty()) {
3816  LLVM_DEBUG({
3817  dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
3818  "with order";
3819  for (unsigned Idx : CurrentOrder)
3820  dbgs() << " " << Idx;
3821  dbgs() << "\n";
3822  });
3823  fixupOrderingIndices(CurrentOrder);
3824  // Insert new order with initial value 0, if it does not exist,
3825  // otherwise return the iterator to the existing one.
3826  newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3827  ReuseShuffleIndicies, CurrentOrder);
3828  // This is a special case, as it does not gather, but at the same time
3829  // we are not extending buildTree_rec() towards the operands.
3830  ValueList Op0;
3831  Op0.assign(VL.size(), VL0->getOperand(0));
3832  VectorizableTree.back()->setOperand(0, Op0);
3833  return;
3834  }
3835  LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
3836  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3837  ReuseShuffleIndicies);
3838  BS.cancelScheduling(VL, VL0);
3839  return;
3840  }
3841  case Instruction::InsertElement: {
3842  assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
3843 
3844  // Check that we have a buildvector and not a shuffle of 2 or more
3845  // different vectors.
3846  ValueSet SourceVectors;
3847  int MinIdx = std::numeric_limits<int>::max();
3848  for (Value *V : VL) {
3849  SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
3850  Optional<int> Idx = *getInsertIndex(V, 0);
3851  if (!Idx || *Idx == UndefMaskElem)
3852  continue;
3853  MinIdx = std::min(MinIdx, *Idx);
3854  }
3855 
3856  if (count_if(VL, [&SourceVectors](Value *V) {
3857  return !SourceVectors.contains(V);
3858  }) >= 2) {
3859  // Found 2nd source vector - cancel.
3860  LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
3861  "different source vectors.\n");
3862  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
3863  BS.cancelScheduling(VL, VL0);
3864  return;
3865  }
3866 
3867  auto OrdCompare = [](const std::pair<int, int> &P1,
3868  const std::pair<int, int> &P2) {
3869  return P1.first > P2.first;
3870  };
3871  PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
3872  decltype(OrdCompare)>
3873  Indices(OrdCompare);
3874  for (int I = 0, E = VL.size(); I < E; ++I) {
3875  Optional<int> Idx = *getInsertIndex(VL[I], 0);
3876  if (!Idx || *Idx == UndefMaskElem)
3877  continue;
3878  Indices.emplace(*Idx, I);
3879  }
3880  OrdersType CurrentOrder(VL.size(), VL.size());
3881  bool IsIdentity = true;
3882  for (int I = 0, E = VL.size(); I < E; ++I) {
3883  CurrentOrder[Indices.top().second] = I;
3884  IsIdentity &= Indices.top().second == I;
3885  Indices.pop();
3886  }
3887  if (IsIdentity)
3888  CurrentOrder.clear();
3889  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3890  None, CurrentOrder);
3891  LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
3892 
3893  constexpr int NumOps = 2;
3894  ValueList VectorOperands[NumOps];
3895  for (int I = 0; I < NumOps; ++I) {
3896  for (Value *V : VL)
3897  VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
3898 
3899  TE->setOperand(I, VectorOperands[I]);
3900  }
3901  buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
3902  return;
3903  }
3904  case Instruction::Load: {
3905  // Check that a vectorized load would load the same memory as a scalar
3906  // load. For example, we don't want to vectorize loads that are smaller
3907  // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
3908  // treats loading/storing it as an i8 struct. If we vectorize loads/stores
3909  // from such a struct, we read/write packed bits disagreeing with the
3910  // unvectorized version.
3911  SmallVector<Value *> PointerOps;
3912  OrdersType CurrentOrder;
3913  TreeEntry *TE = nullptr;
3914  switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder,
3915  PointerOps)) {
3916  case LoadsState::Vectorize:
3917  if (CurrentOrder.empty()) {
3918  // Original loads are consecutive and do not require reordering.
3919  TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3920  ReuseShuffleIndicies);
3921  LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
3922  } else {
3923  fixupOrderingIndices(CurrentOrder);
3924  // Need to reorder.
3925  TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3926  ReuseShuffleIndicies, CurrentOrder);
3927  LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
3928  }
3929  TE->setOperandsInOrder();
3930  break;
3931  case LoadsState::ScatterVectorize:
3932  // Vectorizing non-consecutive loads with `llvm.masked.gather`.
3933  TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
3934  UserTreeIdx, ReuseShuffleIndicies);
3935  TE->setOperandsInOrder();
3936  buildTree_rec(PointerOps, Depth + 1, {TE, 0});
3937  LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
3938  break;
3939  case LoadsState::Gather:
3940  BS.cancelScheduling(VL, VL0);
3941  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3942  ReuseShuffleIndicies);
3943 #ifndef NDEBUG
3944  Type *ScalarTy = VL0->getType();
3945  if (DL->getTypeSizeInBits(ScalarTy) !=
3946  DL->getTypeAllocSizeInBits(ScalarTy))
3947  LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
3948  else if (any_of(VL, [](Value *V) {
3949  return !cast<LoadInst>(V)->isSimple();
3950  }))
3951  LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
3952  else
3953  LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
3954 #endif // NDEBUG
3955  break;
3956  }
3957  return;
3958  }
3959  case Instruction::ZExt:
3960  case Instruction::SExt:
3961  case Instruction::FPToUI:
3962  case Instruction::FPToSI:
3963  case Instruction::FPExt:
3964  case Instruction::PtrToInt:
3965  case Instruction::IntToPtr:
3966  case Instruction::SIToFP:
3967  case Instruction::UIToFP:
3968  case Instruction::Trunc:
3969  case Instruction::FPTrunc:
3970  case Instruction::BitCast: {
3971  Type *SrcTy = VL0->getOperand(0)->getType();
3972  for (Value *V : VL) {
3973  Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
3974  if (Ty != SrcTy || !isValidElementType(Ty)) {
3975  BS.cancelScheduling(VL, VL0);
3976  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
3977  ReuseShuffleIndicies);
3978  LLVM_DEBUG(dbgs()
3979  << "SLP: Gathering casts with different src types.\n");
3980  return;
3981  }
3982  }
3983  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
3984  ReuseShuffleIndicies);
3985  LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
3986 
3987  TE->setOperandsInOrder();
3988  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
3989  ValueList Operands;
3990  // Prepare the operand vector.
3991  for (Value *V : VL)
3992  Operands.push_back(cast<Instruction>(V)->getOperand(i));
3993 
3994  buildTree_rec(Operands, Depth + 1, {TE, i});
3995  }
3996  return;
3997  }
3998  case Instruction::ICmp:
3999  case Instruction::FCmp: {
4000  // Check that all of the compares have the same predicate.
4001  CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
4002  CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
4003  Type *ComparedTy = VL0->getOperand(0)->getType();
4004  for (Value *V : VL) {
4005  CmpInst *Cmp = cast<CmpInst>(V);
4006  if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
4007  Cmp->getOperand(0)->getType() != ComparedTy) {
4008  BS.cancelScheduling(VL, VL0);
4009  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4010  ReuseShuffleIndicies);
4011  LLVM_DEBUG(dbgs()
4012  << "SLP: Gathering cmp with different predicate.\n");
4013  return;
4014  }
4015  }
4016 
4017  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4018  ReuseShuffleIndicies);
4019  LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
4020 
4021  ValueList Left, Right;
4022  if (cast<CmpInst>(VL0)->isCommutative()) {
4023  // Commutative predicate - collect + sort operands of the instructions
4024  // so that each side is more likely to have the same opcode.
4025  assert(P0 == SwapP0 && "Commutative Predicate mismatch");
4026  reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
4027  } else {
4028  // Collect operands - commute if it uses the swapped predicate.
4029  for (Value *V : VL) {
4030  auto *Cmp = cast<CmpInst>(V);
4031  Value *LHS = Cmp->getOperand(0);
4032  Value *RHS = Cmp->getOperand(1);
4033  if (Cmp->getPredicate() != P0)
4034  std::swap(LHS, RHS);
4035  Left.push_back(LHS);
4036  Right.push_back(RHS);
4037  }
4038  }
4039  TE->setOperand(0, Left);
4040  TE->setOperand(1, Right);
4041  buildTree_rec(Left, Depth + 1, {TE, 0});
4042  buildTree_rec(Right, Depth + 1, {TE, 1});
4043  return;
4044  }
4045  case Instruction::Select:
4046  case Instruction::FNeg:
4047  case Instruction::Add:
4048  case Instruction::FAdd:
4049  case Instruction::Sub:
4050  case Instruction::FSub:
4051  case Instruction::Mul:
4052  case Instruction::FMul:
4053  case Instruction::UDiv:
4054  case Instruction::SDiv:
4055  case Instruction::FDiv:
4056  case Instruction::URem:
4057  case Instruction::SRem:
4058  case Instruction::FRem:
4059  case Instruction::Shl:
4060  case Instruction::LShr:
4061  case Instruction::AShr:
4062  case Instruction::And:
4063  case Instruction::Or:
4064  case Instruction::Xor: {
4065  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4066  ReuseShuffleIndicies);
4067  LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
4068 
4069  // Sort operands of the instructions so that each side is more likely to
4070  // have the same opcode.
4071  if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
4072  ValueList Left, Right;
4073  reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
4074  TE->setOperand(0, Left);
4075  TE->setOperand(1, Right);
4076  buildTree_rec(Left, Depth + 1, {TE, 0});
4077  buildTree_rec(Right, Depth + 1, {TE, 1});
4078  return;
4079  }
4080 
4081  TE->setOperandsInOrder();
4082  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
4083  ValueList Operands;
4084  // Prepare the operand vector.
4085  for (Value *V : VL)
4086  Operands.push_back(cast<Instruction>(V)->getOperand(i));
4087 
4088  buildTree_rec(Operands, Depth + 1, {TE, i});
4089  }
4090  return;
4091  }
4092  case Instruction::GetElementPtr: {
4093  // We don't combine GEPs with complicated (nested) indexing.
4094  for (Value *V : VL) {
4095  if (cast<Instruction>(V)->getNumOperands() != 2) {
4096  LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
4097  BS.cancelScheduling(VL, VL0);
4098  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4099  ReuseShuffleIndicies);
4100  return;
4101  }
4102  }
4103 
4104  // We can't combine several GEPs into one vector if they operate on
4105  // different types.
4106  Type *Ty0 = VL0->getOperand(0)->getType();
4107  for (Value *V : VL) {
4108  Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
4109  if (Ty0 != CurTy) {
4110  LLVM_DEBUG(dbgs()
4111  << "SLP: not-vectorizable GEP (different types).\n");
4112  BS.cancelScheduling(VL, VL0);
4113  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4114  ReuseShuffleIndicies);
4115  return;
4116  }
4117  }
4118 
4119  // We don't combine GEPs with non-constant indexes.
4120  Type *Ty1 = VL0->getOperand(1)->getType();
4121  for (Value *V : VL) {
4122  auto Op = cast<Instruction>(V)->getOperand(1);
4123  if (!isa<ConstantInt>(Op) ||
4124  (Op->getType() != Ty1 &&
4125  Op->getType()->getScalarSizeInBits() >
4126  DL->getIndexSizeInBits(
4127  V->getType()->getPointerAddressSpace()))) {
4128  LLVM_DEBUG(dbgs()
4129  << "SLP: not-vectorizable GEP (non-constant indexes).\n");
4130  BS.cancelScheduling(VL, VL0);
4131  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4132  ReuseShuffleIndicies);
4133  return;
4134  }
4135  }
4136 
4137  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4138  ReuseShuffleIndicies);
4139  LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
4140  SmallVector<ValueList, 2> Operands(2);
4141  // Prepare the operand vector for pointer operands.
4142  for (Value *V : VL)
4143  Operands.front().push_back(
4144  cast<GetElementPtrInst>(V)->getPointerOperand());
4145  TE->setOperand(0, Operands.front());
4146  // Need to cast all indices to the same type before vectorization to
4147  // avoid a crash.
4148  // Required to be able to find correct matches between different gather
4149  // nodes and reuse the vectorized values rather than trying to gather them
4150  // again.
4151  int IndexIdx = 1;
4152  Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
4153  Type *Ty = all_of(VL,
4154  [VL0Ty, IndexIdx](Value *V) {
4155  return VL0Ty == cast<GetElementPtrInst>(V)
4156  ->getOperand(IndexIdx)
4157  ->getType();
4158  })
4159  ? VL0Ty
4160  : DL->getIndexType(cast<GetElementPtrInst>(VL0)
4161  ->getPointerOperandType()
4162  ->getScalarType());
4163  // Prepare the operand vector.
4164  for (Value *V : VL) {
4165  auto *Op = cast<Instruction>(V)->getOperand(IndexIdx);
4166  auto *CI = cast<ConstantInt>(Op);
4167  Operands.back().push_back(ConstantExpr::getIntegerCast(
4168  CI, Ty, CI->getValue().isSignBitSet()));
4169  }
4170  TE->setOperand(IndexIdx, Operands.back());
4171 
4172  for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
4173  buildTree_rec(Operands[I], Depth + 1, {TE, I});
4174  return;
4175  }
4176  case Instruction::Store: {
4177  // Check if the stores are consecutive or if we need to swizzle them.
4178  llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
4179  // Avoid types that are padded when being allocated as scalars, while
4180  // being packed together in a vector (such as i1).
4181  if (DL->getTypeSizeInBits(ScalarTy) !=
4182  DL->getTypeAllocSizeInBits(ScalarTy)) {
4183  BS.cancelScheduling(VL, VL0);
4184  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4185  ReuseShuffleIndicies);
4186  LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
4187  return;
4188  }
4189  // Make sure all stores in the bundle are simple - we can't vectorize
4190  // atomic or volatile stores.
4191  SmallVector<Value *, 4> PointerOps(VL.size());
4192  ValueList Operands(VL.size());
4193  auto POIter = PointerOps.begin();
4194  auto OIter = Operands.begin();
4195  for (Value *V : VL) {
4196  auto *SI = cast<StoreInst>(V);
4197  if (!SI->isSimple()) {
4198  BS.cancelScheduling(VL, VL0);
4199  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4200  ReuseShuffleIndicies);
4201  LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
4202  return;
4203  }
4204  *POIter = SI->getPointerOperand();
4205  *OIter = SI->getValueOperand();
4206  ++POIter;
4207  ++OIter;
4208  }
4209 
4210  OrdersType CurrentOrder;
4211  // Check the order of pointer operands.
4212  if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
4213  Value *Ptr0;
4214  Value *PtrN;
4215  if (CurrentOrder.empty()) {
4216  Ptr0 = PointerOps.front();
4217  PtrN = PointerOps.back();
4218  } else {
4219  Ptr0 = PointerOps[CurrentOrder.front()];
4220  PtrN = PointerOps[CurrentOrder.back()];
4221  }
4222  Optional<int> Dist =
4223  getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4224  // Check that the sorted pointer operands are consecutive.
4225  if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
4226  if (CurrentOrder.empty()) {
4227  // Original stores are consecutive and do not require reordering.
4228  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
4229  UserTreeIdx, ReuseShuffleIndicies);
4230  TE->setOperandsInOrder();
4231  buildTree_rec(Operands, Depth + 1, {TE, 0});
4232  LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
4233  } else {
4234  fixupOrderingIndices(CurrentOrder);
4235  TreeEntry *TE =
4236  newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4237  ReuseShuffleIndicies, CurrentOrder);
4238  TE->setOperandsInOrder();
4239  buildTree_rec(Operands, Depth + 1, {TE, 0});
4240  LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
4241  }
4242  return;
4243  }
4244  }
4245 
4246  BS.cancelScheduling(VL, VL0);
4247  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4248  ReuseShuffleIndicies);
4249  LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
4250  return;
4251  }
4252  case Instruction::Call: {
4253  // Check if the calls are all to the same vectorizable intrinsic or
4254  // library function.
4255  CallInst *CI = cast<CallInst>(VL0);
4256  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4257 
4258  VFShape Shape = VFShape::get(
4259  *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
4260  false /*HasGlobalPred*/);
4261  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
4262 
4263  if (!VecFunc && !isTriviallyVectorizable(ID)) {
4264  BS.cancelScheduling(VL, VL0);
4265  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4266  ReuseShuffleIndicies);
4267  LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
4268  return;
4269  }
4270  Function *F = CI->getCalledFunction();
4271  unsigned NumArgs = CI->arg_size();
4272  SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
4273  for (unsigned j = 0; j != NumArgs; ++j)
4274  if (hasVectorInstrinsicScalarOpd(ID, j))
4275  ScalarArgs[j] = CI->getArgOperand(j);
4276  for (Value *V : VL) {
4277  CallInst *CI2 = dyn_cast<CallInst>(V);
4278  if (!CI2 || CI2->getCalledFunction() != F ||
4279  getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
4280  (VecFunc &&
4281  VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
4282  !CI->hasIdenticalOperandBundleSchema(*CI2)) {
4283  BS.cancelScheduling(VL, VL0);
4284  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4285  ReuseShuffleIndicies);
4286  LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
4287  << "\n");
4288  return;
4289  }
4290  // Some intrinsics have scalar arguments, and these must be the same in
4291  // order for the calls to be vectorized.
4292  for (unsigned j = 0; j != NumArgs; ++j) {
4293  if (hasVectorInstrinsicScalarOpd(ID, j)) {
4294  Value *A1J = CI2->getArgOperand(j);
4295  if (ScalarArgs[j] != A1J) {
4296  BS.cancelScheduling(VL, VL0);
4297  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4298  ReuseShuffleIndicies);
4299  LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
4300  << " argument " << ScalarArgs[j] << "!=" << A1J
4301  << "\n");
4302  return;
4303  }
4304  }
4305  }
4306  // Verify that the bundle operands are identical between the two calls.
4307  if (CI->hasOperandBundles() &&
4308  !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
4309  CI->op_begin() + CI->getBundleOperandsEndIndex(),
4310  CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
4311  BS.cancelScheduling(VL, VL0);
4312  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4313  ReuseShuffleIndicies);
4314  LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
4315  << *CI << "!=" << *V << '\n');
4316  return;
4317  }
4318  }
4319 
4320  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4321  ReuseShuffleIndicies);
4322  TE->setOperandsInOrder();
4323  for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
4324  // For scalar operands there is no need to create an entry since we do not
4325  // need to vectorize them.
4326  if (hasVectorInstrinsicScalarOpd(ID, i))
4327  continue;
4328  ValueList Operands;
4329  // Prepare the operand vector.
4330  for (Value *V : VL) {
4331  auto *CI2 = cast<CallInst>(V);
4332  Operands.push_back(CI2->getArgOperand(i));
4333  }
4334  buildTree_rec(Operands, Depth + 1, {TE, i});
4335  }
4336  return;
4337  }
4338  case Instruction::ShuffleVector: {
4339  // If this is not an alternate sequence of opcodes like add-sub,
4340  // then do not vectorize this instruction.
4341  if (!S.isAltShuffle()) {
4342  BS.cancelScheduling(VL, VL0);
4343  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4344  ReuseShuffleIndicies);
4345  LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
4346  return;
4347  }
4348  TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
4349  ReuseShuffleIndicies);
4350  LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
4351 
4352  // Reorder operands if reordering would enable vectorization.
4353  if (isa<BinaryOperator>(VL0)) {
4354  ValueList Left, Right;
4355  reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
4356  TE->setOperand(0, Left);
4357  TE->setOperand(1, Right);
4358  buildTree_rec(Left, Depth + 1, {TE, 0});
4359  buildTree_rec(Right, Depth + 1, {TE, 1});
4360  return;
4361  }
4362 
4363  TE->setOperandsInOrder();
4364  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
4365  ValueList Operands;
4366  // Prepare the operand vector.
4367  for (Value *V : VL)
4368  Operands.push_back(cast<Instruction>(V)->getOperand(i));
4369 
4370  buildTree_rec(Operands, Depth + 1, {TE, i});
4371  }
4372  return;
4373  }
4374  default:
4375  BS.cancelScheduling(VL, VL0);
4376  newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
4377  ReuseShuffleIndicies);
4378  LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
4379  return;
4380  }
4381 }
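// Editor's note (illustration, not part of the original source): for a
// bundle of four consecutive stores a[i] = x[i], the Store case above
// creates a vectorized tree entry and recurses into the stored values; if
// the pointers were not consecutive, the bundle would instead be gathered
// ("SLP: Non-consecutive store.").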
4382 
4383 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
4384  unsigned N = 1;
4385  Type *EltTy = T;
4386 
4387  while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
4388  isa<VectorType>(EltTy)) {
4389  if (auto *ST = dyn_cast<StructType>(EltTy)) {
4390  // Check that struct is homogeneous.
4391  for (const auto *Ty : ST->elements())
4392  if (Ty != *ST->element_begin())
4393  return 0;
4394  N *= ST->getNumElements();
4395  EltTy = *ST->element_begin();
4396  } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
4397  N *= AT->getNumElements();
4398  EltTy = AT->getElementType();
4399  } else {
4400  auto *VT = cast<FixedVectorType>(EltTy);
4401  N *= VT->getNumElements();
4402  EltTy = VT->getElementType();
4403  }
4404  }
4405 
4406  if (!isValidElementType(EltTy))
4407  return 0;
4408  uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
4409  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
4410  return 0;
4411  return N;
4412 }
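// Editor's note (illustration, not part of the original source): for the
// homogeneous aggregate T = { i32, i32, i32, i32 } this returns N = 4,
// provided <4 x i32> falls within [MinVecRegSize, MaxVecRegSize] and has the
// same store size as T; a struct mixing i32 and float returns 0.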
4413 
4414 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
4415  SmallVectorImpl<unsigned> &CurrentOrder) const {
4416  const auto *It = find_if(VL, [](Value *V) {
4417  return isa<ExtractElementInst, ExtractValueInst>(V);
4418  });
4419  assert(It != VL.end() && "Expected at least one extract instruction.");
4420  auto *E0 = cast<Instruction>(*It);
4421  assert(all_of(VL,
4422  [](Value *V) {
4423  return isa<UndefValue, ExtractElementInst, ExtractValueInst>(
4424  V);
4425  }) &&
4426  "Invalid opcode");
4427  // Check if all of the extracts come from the same vector and from the
4428  // correct offset.
4429  Value *Vec = E0->getOperand(0);
4430 
4431  CurrentOrder.clear();
4432 
4433  // We have to extract from a vector/aggregate with the same number of elements.
4434  unsigned NElts;
4435  if (E0->getOpcode() == Instruction::ExtractValue) {
4436  const DataLayout &DL = E0->getModule()->getDataLayout();
4437  NElts = canMapToVector(Vec->getType(), DL);
4438  if (!NElts)
4439  return false;
4440  // Check if load can be rewritten as load of vector.
4441  LoadInst *LI = dyn_cast<LoadInst>(Vec);
4442  if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
4443  return false;
4444  } else {
4445  NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
4446  }
4447 
4448  if (NElts != VL.size())
4449  return false;
4450 
4451  // Check that all of the indices extract from the correct offset.
4452  bool ShouldKeepOrder = true;
4453  unsigned E = VL.size();
4454  // Assign to all items the initial value E so we can check if the extract
4455  // instruction index was used already.
4456  // Also, later we can check that all the indices are used and we have a
4457  // consecutive access in the extract instructions, by checking that no
4458  // element of CurrentOrder still has the initial value E.
4459  CurrentOrder.assign(E, E);
4460  unsigned I = 0;
4461  for (; I < E; ++I) {
4462  auto *Inst = dyn_cast<Instruction>(VL[I]);
4463  if (!Inst)
4464  continue;
4465  if (Inst->getOperand(0) != Vec)
4466  break;
4467  if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
4468  if (isa<UndefValue>(EE->getIndexOperand()))
4469  continue;
4470  Optional<unsigned> Idx = getExtractIndex(Inst);
4471  if (!Idx)
4472  break;
4473  const unsigned ExtIdx = *Idx;
4474  if (ExtIdx != I) {
4475  if (ExtIdx >= E || CurrentOrder[ExtIdx] != E)
4476  break;
4477  ShouldKeepOrder = false;
4478  CurrentOrder[ExtIdx] = I;
4479  } else {
4480  if (CurrentOrder[I] != E)
4481  break;
4482  CurrentOrder[I] = I;
4483  }
4484  }
4485  if (I < E) {
4486  CurrentOrder.clear();
4487  return false;
4488  }
4489  if (ShouldKeepOrder)
4490  CurrentOrder.clear();
4491 
4492  return ShouldKeepOrder;
4493 }
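// Editor's trace (illustration, not part of the original source): for VL =
// extractelement %v at indices 0, 1, 2, 3 in that order, the function
// returns true and clears CurrentOrder; for indices 1, 0, 2, 3 it returns
// false with CurrentOrder = {1, 0, 2, 3}, which lets the caller emit a
// single shuffle of %v instead of a gather.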
4494 
4495 bool BoUpSLP::areAllUsersVectorized(Instruction *I,
4496  ArrayRef<Value *> VectorizedVals) const {
4497  return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
4498  all_of(I->users(), [this](User *U) {
4499  return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U);
4500  });
4501 }
4502 
4503 static std::pair<InstructionCost, InstructionCost>
4504 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
4505  TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
4506  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4507 
4508  // Calculate the cost of the scalar and vector calls.
4509  SmallVector<Type *, 4> VecTys;
4510  for (Use &Arg : CI->args())
4511  VecTys.push_back(
4512  FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
4513  FastMathFlags FMF;
4514  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
4515  FMF = FPCI->getFastMathFlags();
4516  SmallVector<const Value *> Arguments(CI->args());
4517  IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
4518  dyn_cast<IntrinsicInst>(CI));
4519  auto IntrinsicCost =
4520  TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
4521 
4522  auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
4523  VecTy->getNumElements())),
4524  false /*HasGlobalPred*/);
4525  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
4526  auto LibCost = IntrinsicCost;
4527  if (!CI->isNoBuiltin() && VecFunc) {
4528  // Calculate the cost of the vector library call.
4529  // If the corresponding vector call is cheaper, return its cost.
4530  LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
4531  TTI::TCK_RecipThroughput);
4532  }
4533  return {IntrinsicCost, LibCost};
4534 }
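// Illustrative usage (editorial addition, not part of the upstream source):
// callers of getVectorCallCosts typically keep whichever alternative is
// cheaper, e.g.
//   auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
//   InstructionCost VecCallCost =
//       std::min(VecCallCosts.first, VecCallCosts.second);
// i.e. the vector intrinsic cost versus the cost of a vector library function
// found through VFDatabase, when one is available.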
4535 
4536 /// Compute the cost of creating a vector of type \p VecTy containing the
4537 /// extracted values from \p VL.
4538 static InstructionCost
4539 computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
4540  TargetTransformInfo::ShuffleKind ShuffleKind,
4541  ArrayRef<int> Mask, TargetTransformInfo &TTI) {
4542  unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
4543 
4544  if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts ||
4545  VecTy->getNumElements() < NumOfParts)
4546  return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
4547 
4548  bool AllConsecutive = true;
4549  unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
4550  unsigned Idx = -1;
4551  InstructionCost Cost = 0;
4552 
4553  // Process extracts in blocks of EltsPerVector to check if the source vector
4554  // operand can be re-used directly. If not, add the cost of creating a shuffle
4555  // to extract the values into a vector register.
4556  for (auto *V : VL) {
4557  ++Idx;
4558 
4559  // Need to exclude undefs from analysis.
4560  if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
4561  continue;
4562 
4563  // Reached the start of a new vector register.
4564  if (Idx % EltsPerVector == 0) {
4565  AllConsecutive = true;
4566  continue;
4567  }
4568 
4569  // Check whether all extracts for a vector register on the target directly
4570  // extract the values in order.
4571  unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
4572  if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) {
4573  unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
4574  AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
4575  CurrentIdx % EltsPerVector == Idx % EltsPerVector;
4576  }
4577 
4578  if (AllConsecutive)
4579  continue;
4580 
4581  // Skip all indices, except for the last index per vector block.
4582  if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
4583  continue;
4584 
4585  // If we have a series of extracts which are not consecutive and hence
4586  // cannot re-use the source vector register directly, compute the shuffle
4587  // cost to extract a vector with EltsPerVector elements.
4588  Cost += TTI.getShuffleCost(
4589  TargetTransformInfo::SK_PermuteSingleSrc,
4590  FixedVectorType::get(VecTy->getElementType(), EltsPerVector));
4591  }
4592  return Cost;
4593 }
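// Illustrative example (editorial addition, not part of the upstream source):
// for eight i32 extracts on a target whose registers hold <4 x i32>
// (NumOfParts = 2, EltsPerVector = 4), extract indices {0,1,2,3, 6,7,4,5}
// leave the first block consecutive (no extra cost), while the second block is
// permuted, so one SK_PermuteSingleSrc shuffle of a <4 x i32> subvector is
// added to the returned cost.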
4594 
4595 /// Build a shuffle mask for a shuffle graph entry and collect the lists of
4596 /// main and alternate operation operands.
4597 static void
4598 buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
4599  ArrayRef<int> ReusesIndices,
4600  const function_ref<bool(Instruction *)> IsAltOp,
4601  SmallVectorImpl<int> &Mask,
4602  SmallVectorImpl<Value *> *OpScalars = nullptr,
4603  SmallVectorImpl<Value *> *AltScalars = nullptr) {
4604  unsigned Sz = VL.size();
4605  Mask.assign(Sz, UndefMaskElem);
4606  SmallVector<int> OrderMask;
4607  if (!ReorderIndices.empty())
4608  inversePermutation(ReorderIndices, OrderMask);
4609  for (unsigned I = 0; I < Sz; ++I) {
4610  unsigned Idx = I;
4611  if (!ReorderIndices.empty())
4612  Idx = OrderMask[I];
4613  auto *OpInst = cast<Instruction>(VL[Idx]);
4614  if (IsAltOp(OpInst)) {
4615  Mask[I] = Sz + Idx;
4616  if (AltScalars)
4617  AltScalars->push_back(OpInst);
4618  } else {
4619  Mask[I] = Idx;
4620  if (OpScalars)
4621  OpScalars->push_back(OpInst);
4622  }
4623  }
4624  if (!ReusesIndices.empty()) {
4625  SmallVector<int> NewMask(ReusesIndices.size(), UndefMaskElem);
4626  transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) {
4627  return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem;
4628  });
4629  Mask.swap(NewMask);
4630  }
4631 }
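// Illustrative example (editorial addition, not part of the upstream source):
// for an alternate add/sub entry with VL = {a0+b0, a1-b1, a2+b2, a3-b3}, no
// reordering and no reused scalars, and IsAltOp selecting the subtractions,
// the loop above produces Mask = {0, Sz+1, 2, Sz+3} = {0, 5, 2, 7}: lane I
// takes element I of the "main" (add) vector for an add scalar and element
// Sz+I of the "alternate" (sub) vector for a sub scalar.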
4632 
4633 InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
4634  ArrayRef<Value *> VectorizedVals) {
4635  ArrayRef<Value*> VL = E->Scalars;
4636 
4637  Type *ScalarTy = VL[0]->getType();
4638  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
4639  ScalarTy = SI->getValueOperand()->getType();
4640  else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
4641  ScalarTy = CI->getOperand(0)->getType();
4642  else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
4643  ScalarTy = IE->getOperand(1)->getType();
4644  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
4645  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4646 
4647  // If we have computed a smaller type for the expression, update VecTy so
4648  // that the costs will be accurate.
4649  if (MinBWs.count(VL[0]))
4650  VecTy = FixedVectorType::get(
4651  IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
4652  unsigned EntryVF = E->getVectorFactor();
4653  auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
4654 
4655  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
4656  // FIXME: it tries to fix a problem with MSVC buildbots.
4657  TargetTransformInfo &TTIRef = *TTI;
4658  auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
4659  VectorizedVals, E](InstructionCost &Cost) {
4660  DenseMap<Value *, int> ExtractVectorsTys;
4661  SmallPtrSet<Value *, 4> CheckedExtracts;
4662  for (auto *V : VL) {
4663  if (isa<UndefValue>(V))
4664  continue;
4665  // If all users of instruction are going to be vectorized and this
4666  // instruction itself is not going to be vectorized, consider this
4667  // instruction as dead and remove its cost from the final cost of the
4668  // vectorized tree.
4669  // Also, avoid adjusting the cost for extractelements with multiple uses
4670  // in different graph entries.
4671  const TreeEntry *VE = getTreeEntry(V);
4672  if (!CheckedExtracts.insert(V).second ||
4673  !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
4674  (VE && VE != E))
4675  continue;
4676  auto *EE = cast<ExtractElementInst>(V);
4677  Optional<unsigned> EEIdx = getExtractIndex(EE);
4678  if (!EEIdx)
4679  continue;
4680  unsigned Idx = *EEIdx;
4681  if (TTIRef.getNumberOfParts(VecTy) !=
4682  TTIRef.getNumberOfParts(EE->getVectorOperandType())) {
4683  auto It =
4684  ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
4685  It->getSecond() = std::min<int>(It->second, Idx);
4686  }
4687  // Take credit for instruction that will become dead.
4688  if (EE->hasOneUse()) {
4689  Instruction *Ext = EE->user_back();
4690  if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
4691  all_of(Ext->users(),
4692  [](User *U) { return isa<GetElementPtrInst>(U); })) {
4693  // Use getExtractWithExtendCost() to calculate the cost of
4694  // extractelement/ext pair.
4695  Cost -=
4696  TTIRef.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
4697  EE->getVectorOperandType(), Idx);
4698  // Add back the cost of s|zext which is subtracted separately.
4699  Cost += TTIRef.getCastInstrCost(
4700  Ext->getOpcode(), Ext->getType(), EE->getType(),
4701  TTI::getCastContextHint(Ext), CostKind, Ext);
4702  continue;
4703  }
4704  }
4705  Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement,
4706  EE->getVectorOperandType(), Idx);
4707  }
4708  // Add a cost for subvector extracts/inserts if required.
4709  for (const auto &Data : ExtractVectorsTys) {
4710  auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
4711  unsigned NumElts = VecTy->getNumElements();
4712  if (Data.second % NumElts == 0)
4713  continue;
4714  if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) {
4715  unsigned Idx = (Data.second / NumElts) * NumElts;
4716  unsigned EENumElts = EEVTy->getNumElements();
4717  if (Idx + NumElts <= EENumElts) {
4718  Cost +=
4719  TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
4720  EEVTy, None, Idx, VecTy);
4721  } else {
4722  // Need to round up the subvector type vectorization factor to avoid a
4723  // crash in cost model functions. Make SubVT so that Idx + VF of SubVT
4724  // <= EENumElts.
4725  auto *SubVT =
4726  FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
4727  Cost +=
4728  TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
4729  EEVTy, None, Idx, SubVT);
4730  }
4731  } else {
4732  Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
4733  VecTy, None, 0, EEVTy);
4734  }
4735  }
4736  };
4737  if (E->State == TreeEntry::NeedToGather) {
4738  if (allConstant(VL))
4739  return 0;
4740  if (isa<InsertElementInst>(VL[0]))
4741  return InstructionCost::getInvalid();
4742  SmallVector<int> Mask;
4743  SmallVector<const TreeEntry *> Entries;
4744  Optional<TargetTransformInfo::ShuffleKind> Shuffle =
4745  isGatherShuffledEntry(E, Mask, Entries);
4746  if (Shuffle.hasValue()) {
4747  InstructionCost GatherCost = 0;
4748  if (ShuffleVectorInst::isIdentityMask(Mask)) {
4749  // Perfect match in the graph, will reuse the previously vectorized
4750  // node. Cost is 0.
4751  LLVM_DEBUG(
4752  dbgs()
4753  << "SLP: perfect diamond match for gather bundle that starts with "
4754  << *VL.front() << ".\n");
4755  if (NeedToShuffleReuses)
4756  GatherCost =
4757  TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4758  FinalVecTy, E->ReuseShuffleIndices);
4759  } else {
4760  LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
4761  << " entries for bundle that starts with "
4762  << *VL.front() << ".\n");
4763  // Detected that instead of a gather we can emit a shuffle of one or two
4764  // previously vectorized nodes. Add the cost of the permutation rather
4765  // than the gather.
4766  ::addMask(Mask, E->ReuseShuffleIndices);
4767  GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask);
4768  }
4769  return GatherCost;
4770  }
4771  if ((E->getOpcode() == Instruction::ExtractElement ||
4772  all_of(E->Scalars,
4773  [](Value *V) {
4774  return isa<ExtractElementInst, UndefValue>(V);
4775  })) &&
4776  allSameType(VL)) {
4777  // Check that the gather of extractelements can be represented as just a
4778  // shuffle of one or two vectors from which the scalars are extracted.
4779  SmallVector<int> Mask;
4780  Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
4781  isFixedVectorShuffle(VL, Mask);
4782  if (ShuffleKind.hasValue()) {
4783  // Found a bunch of extractelement instructions that must be gathered
4784  // into a vector and can be represented as a permutation of elements from
4785  // one or two input vectors.
4786  InstructionCost Cost =
4787  computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
4788  AdjustExtractsCost(Cost);
4789  if (NeedToShuffleReuses)
4790  Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4791  FinalVecTy, E->ReuseShuffleIndices);
4792  return Cost;
4793  }
4794  }
4795  if (isSplat(VL)) {
4796  // Found a broadcast of a single scalar; calculate the cost as a
4797  // broadcast.
4798  assert(VecTy == FinalVecTy &&
4799  "No reused scalars expected for broadcast.");
4800  return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
4801  }
4802  InstructionCost ReuseShuffleCost = 0;
4803  if (NeedToShuffleReuses)
4804  ReuseShuffleCost = TTI->getShuffleCost(
4805  TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
4806  // Improve gather cost for gather of loads, if we can group some of the
4807  // loads into vector loads.
4808  if (VL.size() > 2 && E->getOpcode() == Instruction::Load &&
4809  !E->isAltShuffle()) {
4810  BoUpSLP::ValueSet VectorizedLoads;
4811  unsigned StartIdx = 0;
4812  unsigned VF = VL.size() / 2;
4813  unsigned VectorizedCnt = 0;
4814  unsigned ScatterVectorizeCnt = 0;
4815  const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType());
4816  for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
4817  for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
4818  Cnt += VF) {
4819  ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4820  if (!VectorizedLoads.count(Slice.front()) &&
4821  !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
4822  SmallVector<Value *> PointerOps;
4823  OrdersType CurrentOrder;
4824  LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL,
4825  *SE, CurrentOrder, PointerOps);
4826  switch (LS) {
4827  case LoadsState::Vectorize:
4828  case LoadsState::ScatterVectorize:
4829  // Mark the vectorized loads so that we don't vectorize them
4830  // again.
4831  if (LS == LoadsState::Vectorize)
4832  ++VectorizedCnt;
4833  else
4834  ++ScatterVectorizeCnt;
4835  VectorizedLoads.insert(Slice.begin(), Slice.end());
4836  // If we vectorized the initial block, there is no need to try to
4837  // vectorize it again.
4838  if (Cnt == StartIdx)
4839  StartIdx += VF;
4840  break;
4841  case LoadsState::Gather:
4842  break;
4843  }
4844  }
4845  }
4846  // Check if the whole array was vectorized already - exit.
4847  if (StartIdx >= VL.size())
4848  break;
4849  // Found vectorizable parts - exit.
4850  if (!VectorizedLoads.empty())
4851  break;
4852  }
4853  if (!VectorizedLoads.empty()) {
4854  InstructionCost GatherCost = 0;
4855  unsigned NumParts = TTI->getNumberOfParts(VecTy);
4856  bool NeedInsertSubvectorAnalysis =
4857  !NumParts || (VL.size() / VF) > NumParts;
4858  // Get the cost for gathered loads.
4859  for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
4860  if (VectorizedLoads.contains(VL[I]))
4861  continue;
4862  GatherCost += getGatherCost(VL.slice(I, VF));
4863  }
4864  // The cost for vectorized loads.
4865  InstructionCost ScalarsCost = 0;
4866  for (Value *V : VectorizedLoads) {
4867  auto *LI = cast<LoadInst>(V);
4868  ScalarsCost += TTI->getMemoryOpCost(
4869  Instruction::Load, LI->getType(), LI->getAlign(),
4870  LI->getPointerAddressSpace(), CostKind, LI);
4871  }
4872  auto *LI = cast<LoadInst>(E->getMainOp());
4873  auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
4874  Align Alignment = LI->getAlign();
4875  GatherCost +=
4876  VectorizedCnt *
4877  TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
4878  LI->getPointerAddressSpace(), CostKind, LI);
4879  GatherCost += ScatterVectorizeCnt *
4880  TTI->getGatherScatterOpCost(
4881  Instruction::Load, LoadTy, LI->getPointerOperand(),
4882  /*VariableMask=*/false, Alignment, CostKind, LI);
4883  if (NeedInsertSubvectorAnalysis) {
4884  // Add the cost for the subvectors insert.
4885  for (int I = VF, E = VL.size(); I < E; I += VF)
4886  GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
4887  None, I, LoadTy);
4888  }
4889  return ReuseShuffleCost + GatherCost - ScalarsCost;
4890  }
4891  }
4892  return ReuseShuffleCost + getGatherCost(VL);
4893  }
4894  InstructionCost CommonCost = 0;
4895  SmallVector<int> Mask;
4896  if (!E->ReorderIndices.empty()) {
4897  SmallVector<int> NewMask;
4898  if (E->getOpcode() == Instruction::Store) {
4899  // For stores the order is actually a mask.
4900  NewMask.resize(E->ReorderIndices.size());
4901  copy(E->ReorderIndices, NewMask.begin());
4902  } else {
4903  inversePermutation(E->ReorderIndices, NewMask);
4904  }
4905  ::addMask(Mask, NewMask);
4906  }
4907  if (NeedToShuffleReuses)
4908  ::addMask(Mask, E->ReuseShuffleIndices);
4909  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
4910  CommonCost =
4911  TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
4912  assert((E->State == TreeEntry::Vectorize ||
4913  E->State == TreeEntry::ScatterVectorize) &&
4914  "Unhandled state");
4915  assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
4916  Instruction *VL0 = E->getMainOp();
4917  unsigned ShuffleOrOp =
4918  E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
4919  switch (ShuffleOrOp) {
4920  case Instruction::PHI:
4921  return 0;
4922 
4923  case Instruction::ExtractValue:
4924  case Instruction::ExtractElement: {
4925  // The common cost of removing the ExtractElement/ExtractValue instructions
4926  // + the cost of shuffles, if required to reshuffle the original vector.
4927  if (NeedToShuffleReuses) {
4928  unsigned Idx = 0;
4929  for (unsigned I : E->ReuseShuffleIndices) {
4930  if (ShuffleOrOp == Instruction::ExtractElement) {
4931  auto *EE = cast<ExtractElementInst>(VL[I]);
4932  CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
4933  EE->getVectorOperandType(),
4934  *getExtractIndex(EE));
4935  } else {
4936  CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
4937  VecTy, Idx);
4938  ++Idx;
4939  }
4940  }
4941  Idx = EntryVF;
4942  for (Value *V : VL) {
4943  if (ShuffleOrOp == Instruction::ExtractElement) {
4944  auto *EE = cast<ExtractElementInst>(V);
4945  CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
4946  EE->getVectorOperandType(),
4947  *getExtractIndex(EE));
4948  } else {
4949  --Idx;
4950  CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
4951  VecTy, Idx);
4952  }
4953  }
4954  }
4955  if (ShuffleOrOp == Instruction::ExtractValue) {
4956  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
4957  auto *EI = cast<Instruction>(VL[I]);
4958  // Take credit for instruction that will become dead.
4959  if (EI->hasOneUse()) {
4960  Instruction *Ext = EI->user_back();
4961  if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
4962  all_of(Ext->users(),
4963  [](User *U) { return isa<GetElementPtrInst>(U); })) {
4964  // Use getExtractWithExtendCost() to calculate the cost of
4965  // extractelement/ext pair.
4966  CommonCost -= TTI->getExtractWithExtendCost(
4967  Ext->getOpcode(), Ext->getType(), VecTy, I);
4968  // Add back the cost of s|zext which is subtracted separately.
4969  CommonCost += TTI->getCastInstrCost(
4970  Ext->getOpcode(), Ext->getType(), EI->getType(),
4971  TTI::getCastContextHint(Ext), CostKind, Ext);
4972  continue;
4973  }
4974  }
4975  CommonCost -=
4976  TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
4977  }
4978  } else {
4979  AdjustExtractsCost(CommonCost);
4980  }
4981  return CommonCost;
4982  }
4983  case Instruction::InsertElement: {
4984  assert(E->ReuseShuffleIndices.empty() &&
4985  "Unique insertelements only are expected.");
4986  auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
4987 
4988  unsigned const NumElts = SrcVecTy->getNumElements();
4989  unsigned const NumScalars = VL.size();
4990  APInt DemandedElts = APInt::getZero(NumElts);
4991  // TODO: Add support for Instruction::InsertValue.
4992  SmallVector<int> Mask;
4993  if (!E->ReorderIndices.empty()) {
4994  inversePermutation(E->ReorderIndices, Mask);
4995  Mask.append(NumElts - NumScalars, UndefMaskElem);
4996  } else {
4997  Mask.assign(NumElts, UndefMaskElem);
4998  std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
4999  }
5000  unsigned Offset = *getInsertIndex(VL0, 0);
5001  bool IsIdentity = true;
5002  SmallVector<int> PrevMask(NumElts, UndefMaskElem);
5003  Mask.swap(PrevMask);
5004  for (unsigned I = 0; I < NumScalars; ++I) {
5005  Optional<int> InsertIdx = getInsertIndex(VL[PrevMask[I]], 0);
5006  if (!InsertIdx || *InsertIdx == UndefMaskElem)
5007  continue;
5008  DemandedElts.setBit(*InsertIdx);
5009  IsIdentity &= *InsertIdx - Offset == I;
5010  Mask[*InsertIdx - Offset] = I;
5011  }
5012  assert(Offset < NumElts && "Failed to find vector index offset");
5013 
5014  InstructionCost Cost = 0;
5015  Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
5016  /*Insert*/ true, /*Extract*/ false);
5017 
5018  if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
5019  // FIXME: Replace with SK_InsertSubvector once it is properly supported.
5020  unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
5021  Cost += TTI->getShuffleCost(
5022  TargetTransformInfo::SK_PermuteSingleSrc,
5023  FixedVectorType::get(SrcVecTy->getElementType(), Sz));
5024  } else if (!IsIdentity) {
5025  auto *FirstInsert =
5026  cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
5027  return !is_contained(E->Scalars,
5028  cast<Instruction>(V)->getOperand(0));
5029  }));
5030  if (isUndefVector(FirstInsert->getOperand(0))) {
5031  Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
5032  } else {
5033  SmallVector<int> InsertMask(NumElts);
5034  std::iota(InsertMask.begin(), InsertMask.end(), 0);
5035  for (unsigned I = 0; I < NumElts; I++) {
5036  if (Mask[I] != UndefMaskElem)
5037  InsertMask[Offset + I] = NumElts + I;
5038  }
5039  Cost +=
5040  TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask);
5041  }
5042  }
5043 
5044  return Cost;
5045  }
5046  case Instruction::ZExt:
5047  case Instruction::SExt:
5048  case Instruction::FPToUI:
5049  case Instruction::FPToSI:
5050  case Instruction::FPExt:
5051  case Instruction::PtrToInt:
5052  case Instruction::IntToPtr:
5053  case Instruction::SIToFP:
5054  case Instruction::UIToFP:
5055  case Instruction::Trunc:
5056  case Instruction::FPTrunc:
5057  case Instruction::BitCast: {
5058  Type *SrcTy = VL0->getOperand(0)->getType();
5059  InstructionCost ScalarEltCost =
5060  TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
5061  TTI::getCastContextHint(VL0), CostKind, VL0);
5062  if (NeedToShuffleReuses) {
5063  CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
5064  }
5065 
5066  // Calculate the cost of this instruction.
5067  InstructionCost ScalarCost = VL.size() * ScalarEltCost;
5068 
5069  auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
5070  InstructionCost VecCost = 0;
5071  // Check if the values are candidates to demote.
5072  if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
5073  VecCost = CommonCost + TTI->getCastInstrCost(
5074  E->getOpcode(), VecTy, SrcVecTy,
5075  TTI::getCastContextHint(VL0), CostKind, VL0);
5076  }
5077  LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
5078  return VecCost - ScalarCost;
5079  }
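// Illustrative arithmetic (editorial addition, not part of the upstream
// source, with assumed costs): if a scalar sitofp costs 1, the corresponding
// <4 x float> conversion costs 2, and there are no reused scalars, then for
// VL.size() == 4 the case above returns (CommonCost + 2) - 4 * 1, i.e. the
// tree is credited for the four scalar conversions that disappear.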
5080  case Instruction::FCmp:
5081  case Instruction::ICmp:
5082  case Instruction::Select: {
5083  // Calculate the cost of this instruction.
5084  InstructionCost ScalarEltCost =
5085  TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
5086  CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
5087  if (NeedToShuffleReuses) {
5088  CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
5089  }
5090  auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
5091  InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
5092 
5093  // Check if all entries in VL are either compares or selects with compares
5094  // as condition that have the same predicates.
5095  CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;
5096  bool First = true;
5097  for (auto *V : VL) {
5098  CmpInst::Predicate CurrentPred;
5099  auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
5100  if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&
5101  !match(V, MatchCmp)) ||
5102  (!First && VecPred != CurrentPred)) {
5103  VecPred = CmpInst::BAD_ICMP_PREDICATE;
5104  break;
5105  }
5106  First = false;
5107  VecPred = CurrentPred;
5108  }
5109 
5110  InstructionCost VecCost = TTI->getCmpSelInstrCost(
5111  E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
5112  // Check if it is possible and profitable to use min/max for selects in
5113  // VL.
5114  //
5115  auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
5116  if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
5117  IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
5118  {VecTy, VecTy});
5119  InstructionCost IntrinsicCost =
5120  TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
5121  // If the selects are the only uses of the compares, they will be dead
5122  // and we can adjust the cost by removing their cost.
5123  if (IntrinsicAndUse.second)
5124  IntrinsicCost -=
5125  TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy,
5126  CmpInst::BAD_ICMP_PREDICATE, CostKind);
5127  VecCost = std::min(VecCost, IntrinsicCost);
5128  }
5129  LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
5130  return CommonCost + VecCost - ScalarCost;
5131  }
5132  case Instruction::FNeg:
5133  case Instruction::Add:
5134  case Instruction::FAdd:
5135  case Instruction::Sub:
5136  case Instruction::FSub:
5137  case Instruction::Mul:
5138  case Instruction::FMul:
5139  case Instruction::UDiv:
5140  case Instruction::SDiv:
5141  case Instruction::FDiv:
5142  case Instruction::URem:
5143  case Instruction::SRem:
5144  case Instruction::FRem:
5145  case Instruction::Shl:
5146  case Instruction::LShr:
5147  case Instruction::AShr:
5148  case Instruction::And:
5149  case Instruction::Or:
5150  case Instruction::Xor: {
5151  // Certain instructions can be cheaper to vectorize if they have a
5152  // constant second vector operand.
5153  TargetTransformInfo::OperandValueKind Op1VK =
5154  TargetTransformInfo::OK_AnyValue;
5155  TargetTransformInfo::OperandValueKind Op2VK =
5156  TargetTransformInfo::OK_UniformConstantValue;
5157  TargetTransformInfo::OperandValueProperties Op1VP =
5158  TargetTransformInfo::OP_None;
5159  TargetTransformInfo::OperandValueProperties Op2VP =
5160  TargetTransformInfo::OP_PowerOf2;
5161 
5162  // If all operands are exactly the same ConstantInt then set the
5163  // operand kind to OK_UniformConstantValue.
5164  // If instead not all operands are constants, then set the operand kind
5165  // to OK_AnyValue. If all operands are constants but not the same,
5166  // then set the operand kind to OK_NonUniformConstantValue.
5167  ConstantInt *CInt0 = nullptr;
5168  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
5169  const Instruction *I = cast<Instruction>(VL[i]);
5170  unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
5171  ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
5172  if (!CInt) {
5173  Op2VK = TargetTransformInfo::OK_AnyValue;
5174  Op2VP = TargetTransformInfo::OP_None;
5175  break;
5176  }
5177  if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
5178  !CInt->getValue().isPowerOf2())
5179  Op2VP = TargetTransformInfo::OP_None;
5180  if (i == 0) {
5181  CInt0 = CInt;
5182  continue;
5183  }
5184  if (CInt0 != CInt)
5185  Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
5186  }
5187 
5188  SmallVector<const Value *, 4> Operands(VL0->operand_values());
5189  InstructionCost ScalarEltCost =
5190  TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
5191  Op2VK, Op1VP, Op2VP, Operands, VL0);
5192  if (NeedToShuffleReuses) {
5193  CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
5194  }
5195  InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
5196  InstructionCost VecCost =
5197  TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
5198  Op2VK, Op1VP, Op2VP, Operands, VL0);
5199  LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
5200  return CommonCost + VecCost - ScalarCost;
5201  }
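// Illustrative example (editorial addition, not part of the upstream source):
// for VL = {x0 << 2, x1 << 2, x2 << 2, x3 << 2} the operand scan above keeps
// Op2VK = OK_UniformConstantValue and Op2VP = OP_PowerOf2; for shift amounts
// {1, 2, 4, 8} it switches to OK_NonUniformConstantValue while keeping
// OP_PowerOf2; and if any second operand is not a ConstantInt it falls back
// to OK_AnyValue with OP_None.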
5202  case Instruction::GetElementPtr: {
5203  TargetTransformInfo::OperandValueKind Op1VK =
5204  TargetTransformInfo::OK_AnyValue;
5205  TargetTransformInfo::OperandValueKind Op2VK =
5206  TargetTransformInfo::OK_UniformConstantValue;
5207 
5208  InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
5209  Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
5210  if (NeedToShuffleReuses) {
5211  CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
5212  }
5213  InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
5214  InstructionCost VecCost = TTI->getArithmeticInstrCost(
5215  Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
5216  LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
5217  return CommonCost + VecCost - ScalarCost;
5218  }
5219  case Instruction::Load: {
5220  // Cost of wide load - cost of scalar loads.
5221  Align Alignment = cast<LoadInst>(VL0)->getAlign();
5222  InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
5223  Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
5224  if (NeedToShuffleReuses) {
5225  CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
5226  }
5227  InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
5228  InstructionCost VecLdCost;
5229  if (E->State == TreeEntry::Vectorize) {
5230  VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,
5231  CostKind, VL0);
5232  } else {
5233  assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
5234  Align CommonAlignment = Alignment;
5235  for (Value *V : VL)
5236  CommonAlignment =
5237  commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
5238  VecLdCost = TTI->getGatherScatterOpCost(
5239  Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5240  /*VariableMask=*/false, CommonAlignment, CostKind, VL0);
5241  }
5242  LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
5243  return CommonCost + VecLdCost - ScalarLdCost;
5244  }
5245  case Instruction::Store: {
5246  // We know that we can merge the stores. Calculate the cost.
5247  bool IsReorder = !E->ReorderIndices.empty();
5248  auto *SI =
5249  cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
5250  Align Alignment = SI->getAlign();
5251  InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
5252  Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
5253  InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
5254  InstructionCost VecStCost = TTI->getMemoryOpCost(
5255  Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
5256  LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
5257  return CommonCost + VecStCost - ScalarStCost;
5258  }
5259  case Instruction::Call: {
5260  CallInst *CI = cast<CallInst>(VL0);
5261  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
5262 
5263  // Calculate the cost of the scalar and vector calls.
5264  IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
5265  InstructionCost ScalarEltCost =
5266  TTI->getIntrinsicInstrCost(CostAttrs, CostKind);