SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<bool>
117 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
118 cl::desc("Enable vectorization for wider vector utilization"));
119
120static cl::opt<int>
122 cl::desc("Only vectorize if you gain more than this "
123 "number "));
124
126 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
127 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
128 "heuristics and makes vectorization decision via cost modeling."));
129
130static cl::opt<bool>
131ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
132 cl::desc("Attempt to vectorize horizontal reductions"));
133
135 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
136 cl::desc(
137 "Attempt to vectorize horizontal reductions feeding into a store"));
138
139// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
140// even if we match a reduction but do not vectorize in the end.
142 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
143 cl::desc("Allow optimization of original scalar identity operations on "
144 "matched horizontal reductions."));
145
146static cl::opt<int>
148 cl::desc("Attempt to vectorize for this register size in bits"));
149
152 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
153
154/// Limits the size of scheduling regions in a block.
155/// It avoids long compile times for _very_ large blocks where vector
156/// instructions are spread over a wide range.
157/// This limit is way higher than needed by real-world functions.
158static cl::opt<int>
159ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
160 cl::desc("Limit the size of the SLP scheduling region per block"));
161
163 "slp-min-reg-size", cl::init(128), cl::Hidden,
164 cl::desc("Attempt to vectorize for this register size in bits"));
165
167 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
168 cl::desc("Limit the recursion depth when building a vectorizable tree"));
169
171 "slp-min-tree-size", cl::init(3), cl::Hidden,
172 cl::desc("Only vectorize small trees if they are fully vectorizable"));
173
174// The maximum depth that the look-ahead score heuristic will explore.
175// The higher this value, the higher the compilation time overhead.
177 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
178 cl::desc("The maximum look-ahead depth for operand reordering scores"));
179
180// The maximum depth that the look-ahead score heuristic will explore
181// when it probing among candidates for vectorization tree roots.
182// The higher this value, the higher the compilation time overhead but unlike
183// similar limit for operands ordering this is less frequently used, hence
184// impact of higher value is less noticeable.
186 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
187 cl::desc("The maximum look-ahead depth for searching best rooting option"));
188
190 "slp-min-strided-loads", cl::init(2), cl::Hidden,
191 cl::desc("The minimum number of loads, which should be considered strided, "
192 "if the stride is > 1 or is runtime value"));
193
195 "slp-max-stride", cl::init(8), cl::Hidden,
196 cl::desc("The maximum stride, considered to be profitable."));
197
198static cl::opt<bool>
199 ViewSLPTree("view-slp-tree", cl::Hidden,
200 cl::desc("Display the SLP trees with Graphviz"));
201
203 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
204 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
205
206// Limit the number of alias checks. The limit is chosen so that
207// it has no negative effect on the llvm benchmarks.
208static const unsigned AliasedCheckLimit = 10;
209
210// Limit of the number of uses for potentially transformed instructions/values,
211// used in checks to avoid compile-time explosion.
212static constexpr int UsesLimit = 64;
213
214// Another limit for the alias checks: The maximum distance between load/store
215// instructions where alias checks are done.
216// This limit is useful for very large basic blocks.
217static const unsigned MaxMemDepDistance = 160;
218
219/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
220/// regions to be handled.
221static const int MinScheduleRegionSize = 16;
222
223/// Maximum allowed number of operands in the PHI nodes.
224static const unsigned MaxPHINumOperands = 128;
225
226/// Predicate for the element types that the SLP vectorizer supports.
227///
228/// The most important thing to filter here are types which are invalid in LLVM
229/// vectors. We also filter target specific types which have absolutely no
230/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
231/// avoids spending time checking the cost model and realizing that they will
232/// be inevitably scalarized.
233static bool isValidElementType(Type *Ty) {
234 // TODO: Support ScalableVectorType.
235 if (SLPReVec && isa<FixedVectorType>(Ty))
236 Ty = Ty->getScalarType();
237 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
238 !Ty->isPPC_FP128Ty();
239}
240
241/// \returns the number of elements for Ty.
242static unsigned getNumElements(Type *Ty) {
243 assert(!isa<ScalableVectorType>(Ty) &&
244 "ScalableVectorType is not supported.");
245 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
246 return VecTy->getNumElements();
247 return 1;
248}
249
250/// \returns the vector type of ScalarTy based on vectorization factor.
251static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
252 return FixedVectorType::get(ScalarTy->getScalarType(),
253 VF * getNumElements(ScalarTy));
254}
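// Illustrative example (editorial sketch, not upstream text): for
// ScalarTy = <2 x float> and VF = 4, the widened type is <8 x float>, i.e.
// the REVEC case multiplies the vectorization factor by the number of
// elements in the (vector) scalar type.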
255
256static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
257 SmallVectorImpl<int> &Mask) {
258 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
259 // But the element has a different meaning for SLP (scalar) and REVEC
260 // (vector). We need to expand Mask into masks which shufflevector can use
261 // directly.
262 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
263 for (unsigned I : seq<unsigned>(Mask.size()))
264 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
265 I * VecTyNumElements, VecTyNumElements)))
266 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
267 : Mask[I] * VecTyNumElements + J;
268 Mask.swap(NewMask);
269}
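// Illustrative example (editorial sketch, assumed values): with
// Mask = {1, 0} and VecTyNumElements = 2, the expanded mask becomes
// {2, 3, 0, 1} - each scalar-level mask element is widened to select a whole
// 2-element subvector that shufflevector can consume directly.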
270
271/// \returns the number of groups of shufflevector
272/// A group has the following features:
273/// 1. All of the values in a group are shufflevectors.
274/// 2. The mask of each shufflevector is an extract-subvector mask (see
275/// isExtractSubvectorMask), and together the masks use all of the elements
276/// of the source (and the elements are used in order).
277/// e.g., it is 1 group (%0)
278/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
279/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
280/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
281/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
282/// it is 2 groups (%3 and %4)
283/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
284/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
285/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
286/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
287/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
288/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
289/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
290/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
291/// it is 0 group
292/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
293/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
294/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
295/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
296static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
297 if (VL.empty())
298 return 0;
299 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
300 return 0;
301 auto *SV = cast<ShuffleVectorInst>(VL.front());
302 unsigned SVNumElements =
303 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
304 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
305 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
306 return 0;
307 unsigned NumGroup = 0;
308 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
309 auto *SV = cast<ShuffleVectorInst>(VL[I]);
310 Value *Src = SV->getOperand(0);
311 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
312 SmallVector<int> ExtractionIndex(SVNumElements);
313 if (!all_of(Group, [&](Value *V) {
314 auto *SV = cast<ShuffleVectorInst>(V);
315 // From the same source.
316 if (SV->getOperand(0) != Src)
317 return false;
318 int Index;
319 if (!SV->isExtractSubvectorMask(Index))
320 return false;
321 for (int I : seq<int>(Index, Index + SV->getShuffleMask().size()))
322 ExtractionIndex.push_back(I);
323 return true;
324 }))
325 return 0;
326 if (!is_sorted(ExtractionIndex))
327 return 0;
328 ++NumGroup;
329 }
330 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
331 return NumGroup;
332}
333
334/// \returns a shufflevector mask which is used to vectorize shufflevectors
335/// e.g.,
336/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
337/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
338/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
339/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
340/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
341/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
342/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
343/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
344/// the result is
345/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
346static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
347 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
348 auto *SV = cast<ShuffleVectorInst>(VL.front());
349 unsigned SVNumElements =
350 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
351 SmallVector<int> Mask;
352 unsigned AccumulateLength = 0;
353 for (Value *V : VL) {
354 auto *SV = cast<ShuffleVectorInst>(V);
355 for (int M : SV->getShuffleMask())
356 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
357 : AccumulateLength + M);
358 AccumulateLength += SVNumElements;
359 }
360 return Mask;
361}
362
363/// \returns True if the value is a constant (but not globals/constant
364/// expressions).
365static bool isConstant(Value *V) {
366 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
367}
368
369/// Checks if \p V is one of vector-like instructions, i.e. undef,
370/// insertelement/extractelement with constant indices for fixed vector type or
371/// extractvalue instruction.
372static bool isVectorLikeInstWithConstOps(Value *V) {
373 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
374 !isa<ExtractValueInst, UndefValue>(V))
375 return false;
376 auto *I = dyn_cast<Instruction>(V);
377 if (!I || isa<ExtractValueInst>(I))
378 return true;
379 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
380 return false;
381 if (isa<ExtractElementInst>(I))
382 return isConstant(I->getOperand(1));
383 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
384 return isConstant(I->getOperand(2));
385}
386
387/// Returns power-of-2 number of elements in a single register (part), given the
388/// total number of elements \p Size and number of registers (parts) \p
389/// NumParts.
390static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
391 return PowerOf2Ceil(divideCeil(Size, NumParts));
392}
393
394/// Returns correct remaining number of elements, considering total amount \p
395/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
396/// and current register (part) \p Part.
397static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
398 unsigned Part) {
399 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
400}
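// Illustrative example (editorial sketch, assumed values): for Size = 6
// scalars split over NumParts = 2 registers, getPartNumElems returns
// PowerOf2Ceil(divideCeil(6, 2)) = 4, so getNumElems yields 4 elements for
// part 0 and the remaining 2 for part 1.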
401
402#if !defined(NDEBUG)
403/// Print a short descriptor of the instruction bundle suitable for debug output.
404static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
405 std::string Result;
406 raw_string_ostream OS(Result);
407 if (Idx >= 0)
408 OS << "Idx: " << Idx << ", ";
409 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
410 OS.flush();
411 return Result;
412}
413#endif
414
415/// \returns true if all of the instructions in \p VL are in the same block or
416/// false otherwise.
417static bool allSameBlock(ArrayRef<Value *> VL) {
418 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
419 if (!I0)
420 return false;
421 if (all_of(VL, isVectorLikeInstWithConstOps))
422 return true;
423
424 BasicBlock *BB = I0->getParent();
425 for (int I = 1, E = VL.size(); I < E; I++) {
426 auto *II = dyn_cast<Instruction>(VL[I]);
427 if (!II)
428 return false;
429
430 if (BB != II->getParent())
431 return false;
432 }
433 return true;
434}
435
436/// \returns True if all of the values in \p VL are constants (but not
437/// globals/constant expressions).
438static bool allConstant(ArrayRef<Value *> VL) {
439 // Constant expressions and globals can't be vectorized like normal integer/FP
440 // constants.
441 return all_of(VL, isConstant);
442}
443
444/// \returns True if all of the values in \p VL are identical or some of them
445/// are UndefValue.
446static bool isSplat(ArrayRef<Value *> VL) {
447 Value *FirstNonUndef = nullptr;
448 for (Value *V : VL) {
449 if (isa<UndefValue>(V))
450 continue;
451 if (!FirstNonUndef) {
452 FirstNonUndef = V;
453 continue;
454 }
455 if (V != FirstNonUndef)
456 return false;
457 }
458 return FirstNonUndef != nullptr;
459}
460
461/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
462static bool isCommutative(Instruction *I) {
463 if (auto *Cmp = dyn_cast<CmpInst>(I))
464 return Cmp->isCommutative();
465 if (auto *BO = dyn_cast<BinaryOperator>(I))
466 return BO->isCommutative() ||
467 (BO->getOpcode() == Instruction::Sub &&
468 !BO->hasNUsesOrMore(UsesLimit) &&
469 all_of(
470 BO->uses(),
471 [](const Use &U) {
472 // Commutative, if icmp eq/ne sub, 0
473 ICmpInst::Predicate Pred;
474 if (match(U.getUser(),
475 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
476 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
477 return true;
478 // Commutative, if abs(sub nsw, true) or abs(sub, false).
479 ConstantInt *Flag;
480 return match(U.getUser(),
481 m_Intrinsic<Intrinsic::abs>(
482 m_Specific(U.get()), m_ConstantInt(Flag))) &&
483 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
484 Flag->isOne());
485 })) ||
486 (BO->getOpcode() == Instruction::FSub &&
487 !BO->hasNUsesOrMore(UsesLimit) &&
488 all_of(BO->uses(), [](const Use &U) {
489 return match(U.getUser(),
490 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
491 }));
492 return I->isCommutative();
493}
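// Illustrative example (editorial sketch): %d = sub i32 %a, %b is treated as
// commutative here when its only users are icmp eq/ne %d, 0 or @llvm.abs
// calls satisfying the flags checked above, since for those users swapping
// %a and %b does not change the observable result.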
494
495template <typename T>
496static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
497 unsigned Offset) {
498 static_assert(std::is_same_v<T, InsertElementInst> ||
499 std::is_same_v<T, ExtractElementInst>,
500 "unsupported T");
501 int Index = Offset;
502 if (const auto *IE = dyn_cast<T>(Inst)) {
503 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
504 if (!VT)
505 return std::nullopt;
506 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
507 if (!CI)
508 return std::nullopt;
509 if (CI->getValue().uge(VT->getNumElements()))
510 return std::nullopt;
511 Index *= VT->getNumElements();
512 Index += CI->getZExtValue();
513 return Index;
514 }
515 return std::nullopt;
516}
517
518/// \returns inserting or extracting index of InsertElement, ExtractElement or
519/// InsertValue instruction, using Offset as base offset for index.
520/// \returns std::nullopt if the index is not an immediate.
521static std::optional<unsigned> getElementIndex(const Value *Inst,
522 unsigned Offset = 0) {
523 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
524 return Index;
525 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
526 return Index;
527
528 int Index = Offset;
529
530 const auto *IV = dyn_cast<InsertValueInst>(Inst);
531 if (!IV)
532 return std::nullopt;
533
534 Type *CurrentType = IV->getType();
535 for (unsigned I : IV->indices()) {
536 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
537 Index *= ST->getNumElements();
538 CurrentType = ST->getElementType(I);
539 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
540 Index *= AT->getNumElements();
541 CurrentType = AT->getElementType();
542 } else {
543 return std::nullopt;
544 }
545 Index += I;
546 }
547 return Index;
548}
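// Illustrative example (editorial sketch, assumed types): for an insertvalue
// into {[2 x i32], [2 x i32]} at indices {1, 0}, the flattened element index
// is ((0 * 2) + 1) * 2 + 0 = 2, i.e. the first element of the second array.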
549
550namespace {
551/// Specifies the way the mask should be analyzed for undefs/poisonous elements
552/// in the shuffle mask.
553enum class UseMask {
554 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
555 ///< check for the mask elements for the first argument (mask
556 ///< indices are in range [0:VF)).
557 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
558 ///< for the mask elements for the second argument (mask indices
559 ///< are in range [VF:2*VF))
560 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
561 ///< future shuffle elements and mark them as ones as being used
562 ///< in future. Non-undef elements are considered as unused since
563 ///< they're already marked as used in the mask.
564};
565} // namespace
566
567/// Prepares a use bitset for the given mask either for the first argument or
568/// for the second.
569static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
570 UseMask MaskArg) {
571 SmallBitVector UseMask(VF, true);
572 for (auto [Idx, Value] : enumerate(Mask)) {
573 if (Value == PoisonMaskElem) {
574 if (MaskArg == UseMask::UndefsAsMask)
575 UseMask.reset(Idx);
576 continue;
577 }
578 if (MaskArg == UseMask::FirstArg && Value < VF)
579 UseMask.reset(Value);
580 else if (MaskArg == UseMask::SecondArg && Value >= VF)
581 UseMask.reset(Value - VF);
582 }
583 return UseMask;
584}
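// Illustrative example (editorial sketch, assumed values): for VF = 4,
// Mask = {0, 5, -1, 2} and UseMask::FirstArg, bits 0 and 2 are cleared
// (those lanes of the first operand are consumed by the mask) while bits 1
// and 3 stay set.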
585
586/// Checks if the given value is actually an undefined constant vector.
587/// Also, if the \p UseMask is not empty, tries to check if the non-masked
588/// elements actually mask the insertelement buildvector, if any.
589template <bool IsPoisonOnly = false>
590static SmallBitVector isUndefVector(const Value *V,
591 const SmallBitVector &UseMask = {}) {
592 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
593 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
594 if (isa<T>(V))
595 return Res;
596 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
597 if (!VecTy)
598 return Res.reset();
599 auto *C = dyn_cast<Constant>(V);
600 if (!C) {
601 if (!UseMask.empty()) {
602 const Value *Base = V;
603 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
604 Base = II->getOperand(0);
605 if (isa<T>(II->getOperand(1)))
606 continue;
607 std::optional<unsigned> Idx = getElementIndex(II);
608 if (!Idx) {
609 Res.reset();
610 return Res;
611 }
612 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
613 Res.reset(*Idx);
614 }
615 // TODO: Add analysis for shuffles here too.
616 if (V == Base) {
617 Res.reset();
618 } else {
619 SmallBitVector SubMask(UseMask.size(), false);
620 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
621 }
622 } else {
623 Res.reset();
624 }
625 return Res;
626 }
627 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
628 if (Constant *Elem = C->getAggregateElement(I))
629 if (!isa<T>(Elem) &&
630 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
631 Res.reset(I);
632 }
633 return Res;
634}
635
636/// Checks if the vector of instructions can be represented as a shuffle, like:
637/// %x0 = extractelement <4 x i8> %x, i32 0
638/// %x3 = extractelement <4 x i8> %x, i32 3
639/// %y1 = extractelement <4 x i8> %y, i32 1
640/// %y2 = extractelement <4 x i8> %y, i32 2
641/// %x0x0 = mul i8 %x0, %x0
642/// %x3x3 = mul i8 %x3, %x3
643/// %y1y1 = mul i8 %y1, %y1
644/// %y2y2 = mul i8 %y2, %y2
645/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
646/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
647/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
648/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
649/// ret <4 x i8> %ins4
650/// can be transformed into:
651/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
652/// i32 6>
653/// %2 = mul <4 x i8> %1, %1
654/// ret <4 x i8> %2
655/// Mask will return the Shuffle Mask equivalent to the extracted elements.
656/// TODO: Can we split off and reuse the shuffle mask detection from
657/// ShuffleVectorInst/getShuffleCost?
658static std::optional<TargetTransformInfo::ShuffleKind>
659isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
660 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
661 if (It == VL.end())
662 return std::nullopt;
663 unsigned Size =
664 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
665 auto *EI = dyn_cast<ExtractElementInst>(V);
666 if (!EI)
667 return S;
668 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
669 if (!VTy)
670 return S;
671 return std::max(S, VTy->getNumElements());
672 });
673
674 Value *Vec1 = nullptr;
675 Value *Vec2 = nullptr;
676 bool HasNonUndefVec = any_of(VL, [](Value *V) {
677 auto *EE = dyn_cast<ExtractElementInst>(V);
678 if (!EE)
679 return false;
680 Value *Vec = EE->getVectorOperand();
681 if (isa<UndefValue>(Vec))
682 return false;
683 return isGuaranteedNotToBePoison(Vec);
684 });
685 enum ShuffleMode { Unknown, Select, Permute };
686 ShuffleMode CommonShuffleMode = Unknown;
687 Mask.assign(VL.size(), PoisonMaskElem);
688 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
689 // Undef can be represented as an undef element in a vector.
690 if (isa<UndefValue>(VL[I]))
691 continue;
692 auto *EI = cast<ExtractElementInst>(VL[I]);
693 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
694 return std::nullopt;
695 auto *Vec = EI->getVectorOperand();
696 // We can extractelement from undef or poison vector.
697 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
698 continue;
699 // All vector operands must have the same number of vector elements.
700 if (isa<UndefValue>(Vec)) {
701 Mask[I] = I;
702 } else {
703 if (isa<UndefValue>(EI->getIndexOperand()))
704 continue;
705 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
706 if (!Idx)
707 return std::nullopt;
708 // Undefined behavior if Idx is negative or >= Size.
709 if (Idx->getValue().uge(Size))
710 continue;
711 unsigned IntIdx = Idx->getValue().getZExtValue();
712 Mask[I] = IntIdx;
713 }
714 if (isUndefVector(Vec).all() && HasNonUndefVec)
715 continue;
716 // For correct shuffling we have to have at most 2 different vector operands
717 // in all extractelement instructions.
718 if (!Vec1 || Vec1 == Vec) {
719 Vec1 = Vec;
720 } else if (!Vec2 || Vec2 == Vec) {
721 Vec2 = Vec;
722 Mask[I] += Size;
723 } else {
724 return std::nullopt;
725 }
726 if (CommonShuffleMode == Permute)
727 continue;
728 // If the extract index is not the same as the operation number, it is a
729 // permutation.
730 if (Mask[I] % Size != I) {
731 CommonShuffleMode = Permute;
732 continue;
733 }
734 CommonShuffleMode = Select;
735 }
736 // If we're not crossing lanes in different vectors, consider it as blending.
737 if (CommonShuffleMode == Select && Vec2)
738 return TargetTransformInfo::SK_Select;
739 // If Vec2 was never used, we have a permutation of a single vector, otherwise
740 // we have permutation of 2 vectors.
741 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
742 : TargetTransformInfo::SK_PermuteSingleSrc;
743}
744
745/// \returns True if Extract{Value,Element} instruction extracts element Idx.
746static std::optional<unsigned> getExtractIndex(Instruction *E) {
747 unsigned Opcode = E->getOpcode();
748 assert((Opcode == Instruction::ExtractElement ||
749 Opcode == Instruction::ExtractValue) &&
750 "Expected extractelement or extractvalue instruction.");
751 if (Opcode == Instruction::ExtractElement) {
752 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
753 if (!CI)
754 return std::nullopt;
755 return CI->getZExtValue();
756 }
757 auto *EI = cast<ExtractValueInst>(E);
758 if (EI->getNumIndices() != 1)
759 return std::nullopt;
760 return *EI->idx_begin();
761}
762
763namespace {
764
765/// Main data required for vectorization of instructions.
766struct InstructionsState {
767 /// The very first instruction in the list with the main opcode.
768 Value *OpValue = nullptr;
769
770 /// The main/alternate instruction.
771 Instruction *MainOp = nullptr;
772 Instruction *AltOp = nullptr;
773
774 /// The main/alternate opcodes for the list of instructions.
775 unsigned getOpcode() const {
776 return MainOp ? MainOp->getOpcode() : 0;
777 }
778
779 unsigned getAltOpcode() const {
780 return AltOp ? AltOp->getOpcode() : 0;
781 }
782
783 /// Some of the instructions in the list have alternate opcodes.
784 bool isAltShuffle() const { return AltOp != MainOp; }
785
786 bool isOpcodeOrAlt(Instruction *I) const {
787 unsigned CheckedOpcode = I->getOpcode();
788 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
789 }
790
791 InstructionsState() = delete;
792 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
793 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
794};
795
796} // end anonymous namespace
797
798/// \returns true if \p Opcode is allowed as part of the main/alternate
799/// instruction for SLP vectorization.
800///
801/// Example of unsupported opcode is SDIV that can potentially cause UB if the
802/// "shuffled out" lane would result in division by zero.
803static bool isValidForAlternation(unsigned Opcode) {
804 if (Instruction::isIntDivRem(Opcode))
805 return false;
806
807 return true;
808}
809
810static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
811 const TargetLibraryInfo &TLI,
812 unsigned BaseIndex = 0);
813
814/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
815/// compatible instructions or constants, or just some other regular values.
816static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
817 Value *Op1, const TargetLibraryInfo &TLI) {
818 return (isConstant(BaseOp0) && isConstant(Op0)) ||
819 (isConstant(BaseOp1) && isConstant(Op1)) ||
820 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
821 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
822 BaseOp0 == Op0 || BaseOp1 == Op1 ||
823 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
824 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
825}
826
827/// \returns true if a compare instruction \p CI has similar "look" and
828/// same predicate as \p BaseCI, "as is" or with its operands and predicate
829/// swapped, false otherwise.
830static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
831 const TargetLibraryInfo &TLI) {
832 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
833 "Assessing comparisons of different types?");
834 CmpInst::Predicate BasePred = BaseCI->getPredicate();
835 CmpInst::Predicate Pred = CI->getPredicate();
836 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
837
838 Value *BaseOp0 = BaseCI->getOperand(0);
839 Value *BaseOp1 = BaseCI->getOperand(1);
840 Value *Op0 = CI->getOperand(0);
841 Value *Op1 = CI->getOperand(1);
842
843 return (BasePred == Pred &&
844 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
845 (BasePred == SwappedPred &&
846 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
847}
848
849/// \returns analysis of the Instructions in \p VL described in
850/// InstructionsState, the Opcode that we suppose the whole list
851/// could be vectorized even if its structure is diverse.
852static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
853 const TargetLibraryInfo &TLI,
854 unsigned BaseIndex) {
855 // Make sure these are all Instructions.
856 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
857 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
858
859 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
860 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
861 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
862 CmpInst::Predicate BasePred =
863 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
864 : CmpInst::BAD_ICMP_PREDICATE;
865 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
866 unsigned AltOpcode = Opcode;
867 unsigned AltIndex = BaseIndex;
868
869 bool SwappedPredsCompatible = [&]() {
870 if (!IsCmpOp)
871 return false;
872 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
873 UniquePreds.insert(BasePred);
874 UniqueNonSwappedPreds.insert(BasePred);
875 for (Value *V : VL) {
876 auto *I = dyn_cast<CmpInst>(V);
877 if (!I)
878 return false;
879 CmpInst::Predicate CurrentPred = I->getPredicate();
880 CmpInst::Predicate SwappedCurrentPred =
881 CmpInst::getSwappedPredicate(CurrentPred);
882 UniqueNonSwappedPreds.insert(CurrentPred);
883 if (!UniquePreds.contains(CurrentPred) &&
884 !UniquePreds.contains(SwappedCurrentPred))
885 UniquePreds.insert(CurrentPred);
886 }
887 // Total number of predicates > 2, but if consider swapped predicates
888 // compatible only 2, consider swappable predicates as compatible opcodes,
889 // not alternate.
890 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
891 }();
892 // Check for one alternate opcode from another BinaryOperator.
893 // TODO - generalize to support all operators (types, calls etc.).
894 auto *IBase = cast<Instruction>(VL[BaseIndex]);
895 Intrinsic::ID BaseID = 0;
896 SmallVector<VFInfo> BaseMappings;
897 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
898 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
899 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
900 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
901 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
902 }
903 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
904 auto *I = cast<Instruction>(VL[Cnt]);
905 unsigned InstOpcode = I->getOpcode();
906 if (IsBinOp && isa<BinaryOperator>(I)) {
907 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
908 continue;
909 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
910 isValidForAlternation(Opcode)) {
911 AltOpcode = InstOpcode;
912 AltIndex = Cnt;
913 continue;
914 }
915 } else if (IsCastOp && isa<CastInst>(I)) {
916 Value *Op0 = IBase->getOperand(0);
917 Type *Ty0 = Op0->getType();
918 Value *Op1 = I->getOperand(0);
919 Type *Ty1 = Op1->getType();
920 if (Ty0 == Ty1) {
921 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
922 continue;
923 if (Opcode == AltOpcode) {
924 assert(isValidForAlternation(Opcode) &&
925 isValidForAlternation(InstOpcode) &&
926 "Cast isn't safe for alternation, logic needs to be updated!");
927 AltOpcode = InstOpcode;
928 AltIndex = Cnt;
929 continue;
930 }
931 }
932 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
933 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
934 Type *Ty0 = BaseInst->getOperand(0)->getType();
935 Type *Ty1 = Inst->getOperand(0)->getType();
936 if (Ty0 == Ty1) {
937 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
938 // Check for compatible operands. If the corresponding operands are not
939 // compatible - need to perform alternate vectorization.
940 CmpInst::Predicate CurrentPred = Inst->getPredicate();
941 CmpInst::Predicate SwappedCurrentPred =
942 CmpInst::getSwappedPredicate(CurrentPred);
943
944 if ((E == 2 || SwappedPredsCompatible) &&
945 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
946 continue;
947
948 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
949 continue;
950 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
951 if (AltIndex != BaseIndex) {
952 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
953 continue;
954 } else if (BasePred != CurrentPred) {
955 assert(
956 isValidForAlternation(InstOpcode) &&
957 "CmpInst isn't safe for alternation, logic needs to be updated!");
958 AltIndex = Cnt;
959 continue;
960 }
961 CmpInst::Predicate AltPred = AltInst->getPredicate();
962 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
963 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
964 continue;
965 }
966 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
967 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
968 if (Gep->getNumOperands() != 2 ||
969 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
970 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
971 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
972 if (!isVectorLikeInstWithConstOps(EI))
973 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
974 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
975 auto *BaseLI = cast<LoadInst>(IBase);
976 if (!LI->isSimple() || !BaseLI->isSimple())
977 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
978 } else if (auto *Call = dyn_cast<CallInst>(I)) {
979 auto *CallBase = cast<CallInst>(IBase);
980 if (Call->getCalledFunction() != CallBase->getCalledFunction())
981 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
982 if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() ||
983 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
984 Call->op_begin() + Call->getBundleOperandsEndIndex(),
985 CallBase->op_begin() +
986 CallBase->getBundleOperandsStartIndex())))
987 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
988 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
989 if (ID != BaseID)
990 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
991 if (!ID) {
992 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
993 if (Mappings.size() != BaseMappings.size() ||
994 Mappings.front().ISA != BaseMappings.front().ISA ||
995 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
996 Mappings.front().VectorName != BaseMappings.front().VectorName ||
997 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
998 Mappings.front().Shape.Parameters !=
999 BaseMappings.front().Shape.Parameters)
1000 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
1001 }
1002 }
1003 continue;
1004 }
1005 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
1006 }
1007
1008 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
1009 cast<Instruction>(VL[AltIndex]));
1010}
1011
1012/// \returns true if all of the values in \p VL have the same type or false
1013/// otherwise.
1014static bool allSameType(ArrayRef<Value *> VL) {
1015 Type *Ty = VL.front()->getType();
1016 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1017}
1018
1019/// \returns True if in-tree use also needs extract. This refers to
1020/// possible scalar operand in vectorized instruction.
1021static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1022 TargetLibraryInfo *TLI) {
1023 unsigned Opcode = UserInst->getOpcode();
1024 switch (Opcode) {
1025 case Instruction::Load: {
1026 LoadInst *LI = cast<LoadInst>(UserInst);
1027 return (LI->getPointerOperand() == Scalar);
1028 }
1029 case Instruction::Store: {
1030 StoreInst *SI = cast<StoreInst>(UserInst);
1031 return (SI->getPointerOperand() == Scalar);
1032 }
1033 case Instruction::Call: {
1034 CallInst *CI = cast<CallInst>(UserInst);
1035 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1036 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1037 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
1038 Arg.value().get() == Scalar;
1039 });
1040 }
1041 default:
1042 return false;
1043 }
1044}
1045
1046/// \returns the AA location that is being accessed by the instruction.
1047static MemoryLocation getLocation(Instruction *I) {
1048 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1049 return MemoryLocation::get(SI);
1050 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1051 return MemoryLocation::get(LI);
1052 return MemoryLocation();
1053}
1054
1055/// \returns True if the instruction is not a volatile or atomic load/store.
1056static bool isSimple(Instruction *I) {
1057 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1058 return LI->isSimple();
1059 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1060 return SI->isSimple();
1061 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1062 return !MI->isVolatile();
1063 return true;
1064}
1065
1066/// Shuffles \p Mask in accordance with the given \p SubMask.
1067/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1068/// one but two input vectors.
1069static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1070 bool ExtendingManyInputs = false) {
1071 if (SubMask.empty())
1072 return;
1073 assert(
1074 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1075 // Check if input scalars were extended to match the size of other node.
1076 (SubMask.size() == Mask.size() &&
1077 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
1078 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
1079 "SubMask with many inputs support must be larger than the mask.");
1080 if (Mask.empty()) {
1081 Mask.append(SubMask.begin(), SubMask.end());
1082 return;
1083 }
1084 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1085 int TermValue = std::min(Mask.size(), SubMask.size());
1086 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1087 if (SubMask[I] == PoisonMaskElem ||
1088 (!ExtendingManyInputs &&
1089 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1090 continue;
1091 NewMask[I] = Mask[SubMask[I]];
1092 }
1093 Mask.swap(NewMask);
1094}
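// Illustrative example (editorial sketch, assumed values): starting from
// Mask = {3, 2, 1, 0} and applying SubMask = {0, 0, 2, 3} yields
// {3, 3, 1, 0}: each new element I is Mask[SubMask[I]], i.e. the two masks
// are composed.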
1095
1096/// Order may have elements assigned special value (size) which is out of
1097/// bounds. Such indices only appear in places which correspond to undef values
1098/// (see canReuseExtract for details) and are used in order to avoid undef values
1099/// having an effect on operand ordering.
1100/// The first loop below simply finds all unused indices and then the next loop
1101/// nest assigns these indices for undef values positions.
1102/// As an example below Order has two undef positions and they have assigned
1103/// values 3 and 7 respectively:
1104/// before: 6 9 5 4 9 2 1 0
1105/// after: 6 3 5 4 7 2 1 0
1106static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1107 const unsigned Sz = Order.size();
1108 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1109 SmallBitVector MaskedIndices(Sz);
1110 for (unsigned I = 0; I < Sz; ++I) {
1111 if (Order[I] < Sz)
1112 UnusedIndices.reset(Order[I]);
1113 else
1114 MaskedIndices.set(I);
1115 }
1116 if (MaskedIndices.none())
1117 return;
1118 assert(UnusedIndices.count() == MaskedIndices.count() &&
1119 "Non-synced masked/available indices.");
1120 int Idx = UnusedIndices.find_first();
1121 int MIdx = MaskedIndices.find_first();
1122 while (MIdx >= 0) {
1123 assert(Idx >= 0 && "Indices must be synced.");
1124 Order[MIdx] = Idx;
1125 Idx = UnusedIndices.find_next(Idx);
1126 MIdx = MaskedIndices.find_next(MIdx);
1127 }
1128}
1129
1130/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1131/// Opcode1.
1132static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1133 unsigned Opcode1) {
1134 Type *ScalarTy = VL[0]->getType();
1135 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1136 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1137 for (unsigned Lane : seq<unsigned>(VL.size()))
1138 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1139 OpcodeMask.set(Lane * ScalarTyNumElements,
1140 Lane * ScalarTyNumElements + ScalarTyNumElements);
1141 return OpcodeMask;
1142}
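// Illustrative example (editorial sketch, assumed values): for scalar
// VL = {add, sub, add, sub} and Opcode1 = Instruction::Sub, the resulting
// bitset is {0, 1, 0, 1} - a bit is set for every lane whose instruction
// uses Opcode1.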
1143
1144namespace llvm {
1145
1146static void inversePermutation(ArrayRef<unsigned> Indices,
1147 SmallVectorImpl<int> &Mask) {
1148 Mask.clear();
1149 const unsigned E = Indices.size();
1150 Mask.resize(E, PoisonMaskElem);
1151 for (unsigned I = 0; I < E; ++I)
1152 Mask[Indices[I]] = I;
1153}
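// Illustrative example (editorial sketch, assumed values): Indices = {2, 0, 1}
// produces Mask = {1, 2, 0}; Mask[Indices[I]] = I, so the resulting shuffle
// mask is the inverse of the permutation described by Indices.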
1154
1155/// Reorders the list of scalars in accordance with the given \p Mask.
1156static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1157 ArrayRef<int> Mask) {
1158 assert(!Mask.empty() && "Expected non-empty mask.");
1159 SmallVector<Value *> Prev(Scalars.size(),
1160 PoisonValue::get(Scalars.front()->getType()));
1161 Prev.swap(Scalars);
1162 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1163 if (Mask[I] != PoisonMaskElem)
1164 Scalars[Mask[I]] = Prev[I];
1165}
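// Illustrative example (editorial sketch, assumed values): Scalars = {a, b, c}
// with Mask = {2, 0, 1} becomes {b, c, a}, because each original element
// Prev[I] is moved to position Mask[I].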
1166
1167/// Checks if the provided value does not require scheduling. It does not
1168/// require scheduling if this is not an instruction or it is an instruction
1169/// that does not read/write memory and all operands are either not instructions
1170/// or phi nodes or instructions from different blocks.
1171static bool areAllOperandsNonInsts(Value *V) {
1172 auto *I = dyn_cast<Instruction>(V);
1173 if (!I)
1174 return true;
1175 return !mayHaveNonDefUseDependency(*I) &&
1176 all_of(I->operands(), [I](Value *V) {
1177 auto *IO = dyn_cast<Instruction>(V);
1178 if (!IO)
1179 return true;
1180 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1181 });
1182}
1183
1184/// Checks if the provided value does not require scheduling. It does not
1185/// require scheduling if this is not an instruction or it is an instruction
1186/// that does not read/write memory and all users are phi nodes or instructions
1187/// from different blocks.
1188static bool isUsedOutsideBlock(Value *V) {
1189 auto *I = dyn_cast<Instruction>(V);
1190 if (!I)
1191 return true;
1192 // Limits the number of uses to save compile time.
1193 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1194 all_of(I->users(), [I](User *U) {
1195 auto *IU = dyn_cast<Instruction>(U);
1196 if (!IU)
1197 return true;
1198 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1199 });
1200}
1201
1202/// Checks if the specified value does not require scheduling. It does not
1203/// require scheduling if all operands and all users do not need to be scheduled
1204/// in the current basic block.
1205static bool doesNotNeedToBeScheduled(Value *V) {
1206 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1207}
1208
1209/// Checks if the specified array of instructions does not require scheduling.
1210/// It is so if all either instructions have operands that do not require
1211/// scheduling or their users do not require scheduling since they are phis or
1212/// in other basic blocks.
1213static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1214 return !VL.empty() &&
1215 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1216}
1217
1218namespace slpvectorizer {
1219
1220/// Bottom Up SLP Vectorizer.
1221class BoUpSLP {
1222 struct TreeEntry;
1223 struct ScheduleData;
1224 class ShuffleCostEstimator;
1225 class ShuffleInstructionBuilder;
1226
1227public:
1228 /// Tracks the state we can represent the loads in the given sequence.
1229 enum class LoadsState {
1230 Gather,
1231 Vectorize,
1232 ScatterVectorize,
1233 StridedVectorize
1234 };
1235
1236 using ValueList = SmallVector<Value *, 8>;
1237 using InstrList = SmallVector<Instruction *, 16>;
1238 using ValueSet = SmallPtrSet<Value *, 16>;
1239 using StoreList = SmallVector<StoreInst *, 8>;
1240 using ExtraValueToDebugLocsMap =
1241 MapVector<Value *, SmallVector<Instruction *, 2>>;
1242 using OrdersType = SmallVector<unsigned, 4>;
1243
1244 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1245 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1246 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1247 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1248 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1249 AC(AC), DB(DB), DL(DL), ORE(ORE),
1250 Builder(Se->getContext(), TargetFolder(*DL)) {
1251 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1252 // Use the vector register size specified by the target unless overridden
1253 // by a command-line option.
1254 // TODO: It would be better to limit the vectorization factor based on
1255 // data type rather than just register size. For example, x86 AVX has
1256 // 256-bit registers, but it does not support integer operations
1257 // at that width (that requires AVX2).
1258 if (MaxVectorRegSizeOption.getNumOccurrences())
1259 MaxVecRegSize = MaxVectorRegSizeOption;
1260 else
1261 MaxVecRegSize =
1262 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1263 .getFixedValue();
1264
1265 if (MinVectorRegSizeOption.getNumOccurrences())
1266 MinVecRegSize = MinVectorRegSizeOption;
1267 else
1268 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1269 }
1270
1271 /// Vectorize the tree that starts with the elements in \p VL.
1272 /// Returns the vectorized root.
1273 Value *vectorizeTree();
1274
1275 /// Vectorize the tree but with the list of externally used values \p
1276 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1277 /// generated extractvalue instructions.
1278 /// \param ReplacedExternals contains the list of replaced external values
1279 /// {scalar, replace} after emitting extractelement for external uses.
1280 Value *
1281 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1282 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1283 Instruction *ReductionRoot = nullptr);
1284
1285 /// \returns the cost incurred by unwanted spills and fills, caused by
1286 /// holding live values over call sites.
1287 InstructionCost getSpillCost() const;
1288
1289 /// \returns the vectorization cost of the subtree that starts at \p VL.
1290 /// A negative number means that this is profitable.
1291 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1292
1293 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1294 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1295 void buildTree(ArrayRef<Value *> Roots,
1296 const SmallDenseSet<Value *> &UserIgnoreLst);
1297
1298 /// Construct a vectorizable tree that starts at \p Roots.
1299 void buildTree(ArrayRef<Value *> Roots);
1300
1301 /// Returns whether the root node has in-tree uses.
1302 bool doesRootHaveInTreeUses() const {
1303 return !VectorizableTree.empty() &&
1304 !VectorizableTree.front()->UserTreeIndices.empty();
1305 }
1306
1307 /// Return the scalars of the root node.
1308 ArrayRef<Value *> getRootNodeScalars() const {
1309 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1310 return VectorizableTree.front()->Scalars;
1311 }
1312
1313 /// Checks if the root graph node can be emitted with narrower bitwidth at
1314 /// codegen and returns its signedness, if so.
1315 bool isSignedMinBitwidthRootNode() const {
1316 return MinBWs.at(VectorizableTree.front().get()).second;
1317 }
1318
1319 /// Builds external uses of the vectorized scalars, i.e. the list of
1320 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1321 /// ExternallyUsedValues contains additional list of external uses to handle
1322 /// vectorization of reductions.
1323 void
1324 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1325
1326 /// Transforms graph nodes to target specific representations, if profitable.
1327 void transformNodes();
1328
1329 /// Clear the internal data structures that are created by 'buildTree'.
1330 void deleteTree() {
1331 VectorizableTree.clear();
1332 ScalarToTreeEntry.clear();
1333 MultiNodeScalars.clear();
1334 MustGather.clear();
1335 NonScheduledFirst.clear();
1336 EntryToLastInstruction.clear();
1337 ExternalUses.clear();
1338 ExternalUsesAsOriginalScalar.clear();
1339 for (auto &Iter : BlocksSchedules) {
1340 BlockScheduling *BS = Iter.second.get();
1341 BS->clear();
1342 }
1343 MinBWs.clear();
1344 ReductionBitWidth = 0;
1345 CastMaxMinBWSizes.reset();
1346 ExtraBitWidthNodes.clear();
1347 InstrElementSize.clear();
1348 UserIgnoreList = nullptr;
1349 PostponedGathers.clear();
1350 ValueToGatherNodes.clear();
1351 }
1352
1353 unsigned getTreeSize() const { return VectorizableTree.size(); }
1354
1355 /// Perform LICM and CSE on the newly generated gather sequences.
1356 void optimizeGatherSequence();
1357
1358 /// Checks if the specified gather tree entry \p TE can be represented as a
1359 /// shuffled vector entry + (possibly) permutation with other gathers. It
1360 /// implements the checks only for possibly ordered scalars (Loads,
1361 /// ExtractElement, ExtractValue), which can be part of the graph.
1362 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1363
1364 /// Sort loads into increasing pointers offsets to allow greater clustering.
1365 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1366
1367 /// Gets reordering data for the given tree entry. If the entry is vectorized
1368 /// - just return ReorderIndices, otherwise check if the scalars can be
1369 /// reordered and return the most optimal order.
1370 /// \return std::nullopt if ordering is not important, empty order, if
1371 /// identity order is important, or the actual order.
1372 /// \param TopToBottom If true, include the order of vectorized stores and
1373 /// insertelement nodes, otherwise skip them.
1374 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1375 bool TopToBottom);
1376
1377 /// Reorders the current graph to the most profitable order starting from the
1378 /// root node to the leaf nodes. The best order is chosen only from the nodes
1379 /// of the same size (vectorization factor). Smaller nodes are considered
1380 /// parts of subgraph with smaller VF and they are reordered independently. We
1381 /// can make it because we still need to extend smaller nodes to the wider VF
1382 /// and we can merge reordering shuffles with the widening shuffles.
1383 void reorderTopToBottom();
1384
1385 /// Reorders the current graph to the most profitable order starting from
1386 /// leaves to the root. It allows to rotate small subgraphs and reduce the
1387 /// number of reshuffles if the leaf nodes use the same order. In this case we
1388 /// can merge the orders and just shuffle user node instead of shuffling its
1389 /// operands. Plus, even the leaf nodes have different orders, it allows to
1390 /// sink reordering in the graph closer to the root node and merge it later
1391 /// during analysis.
1392 void reorderBottomToTop(bool IgnoreReorder = false);
1393
1394 /// \return The vector element size in bits to use when vectorizing the
1395 /// expression tree ending at \p V. If V is a store, the size is the width of
1396 /// the stored value. Otherwise, the size is the width of the largest loaded
1397 /// value reaching V. This method is used by the vectorizer to calculate
1398 /// vectorization factors.
1399 unsigned getVectorElementSize(Value *V);
1400
1401 /// Compute the minimum type sizes required to represent the entries in a
1402 /// vectorizable tree.
1403 void computeMinimumValueSizes();
1404
1405 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1406 unsigned getMaxVecRegSize() const {
1407 return MaxVecRegSize;
1408 }
1409
1410 // \returns minimum vector register size as set by cl::opt.
1411 unsigned getMinVecRegSize() const {
1412 return MinVecRegSize;
1413 }
1414
1415 unsigned getMinVF(unsigned Sz) const {
1416 return std::max(2U, getMinVecRegSize() / Sz);
1417 }
1418
1419 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1420 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1421 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1422 return MaxVF ? MaxVF : UINT_MAX;
1423 }
1424
1425 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1426 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1427 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1428 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1429 ///
1430 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1431 unsigned canMapToVector(Type *T) const;
1432
1433 /// \returns True if the VectorizableTree is both tiny and not fully
1434 /// vectorizable. We do not vectorize such trees.
1435 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1436
1437 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1438 /// can be load combined in the backend. Load combining may not be allowed in
1439 /// the IR optimizer, so we do not want to alter the pattern. For example,
1440 /// partially transforming a scalar bswap() pattern into vector code is
1441 /// effectively impossible for the backend to undo.
1442 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1443 /// may not be necessary.
1444 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1445
1446 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1447 /// can be load combined in the backend. Load combining may not be allowed in
1448 /// the IR optimizer, so we do not want to alter the pattern. For example,
1449 /// partially transforming a scalar bswap() pattern into vector code is
1450 /// effectively impossible for the backend to undo.
1451 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1452 /// may not be necessary.
1453 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1454
1455 /// Checks if the given array of loads can be represented as a vectorized,
1456 /// scatter or just simple gather.
1457 /// \param VL list of loads.
1458 /// \param VL0 main load value.
1459 /// \param Order returned order of load instructions.
1460 /// \param PointerOps returned list of pointer operands.
1461 /// \param TryRecursiveCheck used to check if long masked gather can be
1462 /// represented as a series of loads/insert subvector, if profitable.
1463 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1464 SmallVectorImpl<unsigned> &Order,
1465 SmallVectorImpl<Value *> &PointerOps,
1466 bool TryRecursiveCheck = true) const;
1467
1469
1470 /// This structure holds any data we need about the edges being traversed
1471 /// during buildTree_rec(). We keep track of:
1472 /// (i) the user TreeEntry index, and
1473 /// (ii) the index of the edge.
1474 struct EdgeInfo {
1475 EdgeInfo() = default;
1476 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1477 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1478 /// The user TreeEntry.
1479 TreeEntry *UserTE = nullptr;
1480 /// The operand index of the use.
1481 unsigned EdgeIdx = UINT_MAX;
1482#ifndef NDEBUG
1483 friend inline raw_ostream &operator<<(raw_ostream &OS,
1484 const BoUpSLP::EdgeInfo &EI) {
1485 EI.dump(OS);
1486 return OS;
1487 }
1488 /// Debug print.
1489 void dump(raw_ostream &OS) const {
1490 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1491 << " EdgeIdx:" << EdgeIdx << "}";
1492 }
1493 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1494#endif
1495 bool operator == (const EdgeInfo &Other) const {
1496 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1497 }
1498 };
1499
1500 /// A helper class used for scoring candidates for two consecutive lanes.
1501 class LookAheadHeuristics {
1502 const TargetLibraryInfo &TLI;
1503 const DataLayout &DL;
1504 ScalarEvolution &SE;
1505 const BoUpSLP &R;
1506 int NumLanes; // Total number of lanes (aka vectorization factor).
1507 int MaxLevel; // The maximum recursion depth for accumulating score.
1508
1509 public:
1510 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1511 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1512 int MaxLevel)
1513 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1514 MaxLevel(MaxLevel) {}
1515
1516 // The hard-coded scores listed here are not very important, though it shall
1517 // be higher for better matches to improve the resulting cost. When
1518 // computing the scores of matching one sub-tree with another, we are
1519 // basically counting the number of values that are matching. So even if all
1520 // scores are set to 1, we would still get a decent matching result.
1521 // However, sometimes we have to break ties. For example we may have to
1522 // choose between matching loads vs matching opcodes. This is what these
1523 // scores are helping us with: they provide the order of preference. Also,
1524 // this is important if the scalar is externally used or used in another
1525 // tree entry node in the different lane.
1526
1527 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1528 static const int ScoreConsecutiveLoads = 4;
1529 /// The same load multiple times. This should have a better score than
1530 /// `ScoreSplat` because on x86 for a 2-lane vector we can represent it
1531 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1532 /// a vector load and 1.0 for a broadcast.
1533 static const int ScoreSplatLoads = 3;
1534 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1535 static const int ScoreReversedLoads = 3;
1536 /// A load candidate for masked gather.
1537 static const int ScoreMaskedGatherCandidate = 1;
1538 /// ExtractElementInst from same vector and consecutive indexes.
1539 static const int ScoreConsecutiveExtracts = 4;
1540 /// ExtractElementInst from same vector and reversed indices.
1541 static const int ScoreReversedExtracts = 3;
1542 /// Constants.
1543 static const int ScoreConstants = 2;
1544 /// Instructions with the same opcode.
1545 static const int ScoreSameOpcode = 2;
1546 /// Instructions with alt opcodes (e.g., add + sub).
1547 static const int ScoreAltOpcodes = 1;
1548 /// Identical instructions (a.k.a. splat or broadcast).
1549 static const int ScoreSplat = 1;
1550 /// Matching with an undef is preferable to failing.
1551 static const int ScoreUndef = 1;
1552 /// Score for failing to find a decent match.
1553 static const int ScoreFail = 0;
1554 /// Score if all users are vectorized.
1555 static const int ScoreAllUserVectorized = 1;
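 // A minimal standalone sketch, not taken from this pass, of how these
 // constants act purely as an order of preference when breaking ties between
 // candidate pairings (the names and values below simply mirror the ones
 // above):
 //
 //   #include <algorithm>
 //   #include <cstdio>
 //   #include <vector>
 //
 //   int main() {
 //     struct Candidate { const char *Kind; int Score; };
 //     std::vector<Candidate> Candidates = {{"same opcode", 2},
 //                                          {"consecutive loads", 4},
 //                                          {"alternate opcodes", 1}};
 //     auto Best = std::max_element(Candidates.begin(), Candidates.end(),
 //                                  [](const Candidate &A, const Candidate &B) {
 //                                    return A.Score < B.Score;
 //                                  });
 //     std::printf("preferred pairing: %s\n", Best->Kind); // consecutive loads
 //   }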
1556
1557 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1558 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1559 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1560 /// MainAltOps.
1561 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1562 ArrayRef<Value *> MainAltOps) const {
1563 if (!isValidElementType(V1->getType()) ||
1564 !isValidElementType(V2->getType()))
1565 return LookAheadHeuristics::ScoreFail;
1566
1567 if (V1 == V2) {
1568 if (isa<LoadInst>(V1)) {
1569 // Returns true if the users of V1 and V2 won't need to be extracted.
1570 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1571 // Bail out if we have too many uses to save compilation time.
1572 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1573 return false;
1574
1575 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1576 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1577 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1578 });
1579 };
1580 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1581 };
1582 // A broadcast of a load can be cheaper on some targets.
1583 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1584 ElementCount::getFixed(NumLanes)) &&
1585 ((int)V1->getNumUses() == NumLanes ||
1586 AllUsersAreInternal(V1, V2)))
1587 return LookAheadHeuristics::ScoreSplatLoads;
1588 }
1589 return LookAheadHeuristics::ScoreSplat;
1590 }
1591
1592 auto CheckSameEntryOrFail = [&]() {
1593 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1594 TE1 && TE1 == R.getTreeEntry(V2))
1595 return LookAheadHeuristics::ScoreSplatLoads;
1596 return LookAheadHeuristics::ScoreFail;
1597 };
1598
1599 auto *LI1 = dyn_cast<LoadInst>(V1);
1600 auto *LI2 = dyn_cast<LoadInst>(V2);
1601 if (LI1 && LI2) {
1602 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1603 !LI2->isSimple())
1604 return CheckSameEntryOrFail();
1605
1606 std::optional<int> Dist = getPointersDiff(
1607 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1608 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1609 if (!Dist || *Dist == 0) {
1610 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1611 getUnderlyingObject(LI2->getPointerOperand()) &&
1612 R.TTI->isLegalMaskedGather(
1613 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1614 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1615 return CheckSameEntryOrFail();
1616 }
1617 // The distance is too large - still may be profitable to use masked
1618 // loads/gathers.
1619 if (std::abs(*Dist) > NumLanes / 2)
1620 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1621 // This still will detect consecutive loads, but we might have "holes"
1622 // in some cases. It is ok for non-power-of-2 vectorization and may produce
1623 // better results. It should not affect current vectorization.
1624 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1625 : LookAheadHeuristics::ScoreReversedLoads;
1626 }
1627
1628 auto *C1 = dyn_cast<Constant>(V1);
1629 auto *C2 = dyn_cast<Constant>(V2);
1630 if (C1 && C2)
1631 return LookAheadHeuristics::ScoreConstants;
1632
1633 // Extracts from consecutive indices of the same vector get a better score
1634 // since the extracts could be optimized away.
1635 Value *EV1;
1636 ConstantInt *Ex1Idx;
1637 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1638 // Undefs are always profitable for extractelements.
1639 // Compiler can easily combine poison and extractelement <non-poison> or
1640 // undef and extractelement <poison>. But combining undef +
1641 // extractelement <non-poison-but-may-produce-poison> requires some
1642 // extra operations.
1643 if (isa<UndefValue>(V2))
1644 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1645 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1646 : LookAheadHeuristics::ScoreSameOpcode;
1647 Value *EV2 = nullptr;
1648 ConstantInt *Ex2Idx = nullptr;
1649 if (match(V2,
1650 m_CombineOr(m_ExtractElt(m_Value(EV2), m_ConstantInt(Ex2Idx)),
1651 m_Undef())))) {
1652 // Undefs are always profitable for extractelements.
1653 if (!Ex2Idx)
1654 return LookAheadHeuristics::ScoreSameOpcode;
1655 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1656 return LookAheadHeuristics::ScoreSameOpcode;
1657 if (EV2 == EV1) {
1658 int Idx1 = Ex1Idx->getZExtValue();
1659 int Idx2 = Ex2Idx->getZExtValue();
1660 int Dist = Idx2 - Idx1;
1661 // The distance is too large - still may be profitable to use
1662 // shuffles.
1663 if (std::abs(Dist) == 0)
1664 return LookAheadHeuristics::ScoreSplat;
1665 if (std::abs(Dist) > NumLanes / 2)
1666 return LookAheadHeuristics::ScoreSameOpcode;
1667 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1668 : LookAheadHeuristics::ScoreReversedExtracts;
1669 }
1670 return LookAheadHeuristics::ScoreAltOpcodes;
1671 }
1672 return CheckSameEntryOrFail();
1673 }
1674
1675 auto *I1 = dyn_cast<Instruction>(V1);
1676 auto *I2 = dyn_cast<Instruction>(V2);
1677 if (I1 && I2) {
1678 if (I1->getParent() != I2->getParent())
1679 return CheckSameEntryOrFail();
1680 SmallVector<Value *, 4> Ops(MainAltOps);
1681 Ops.push_back(I1);
1682 Ops.push_back(I2);
1683 InstructionsState S = getSameOpcode(Ops, TLI);
1684 // Note: Only consider instructions with <= 2 operands to avoid
1685 // complexity explosion.
1686 if (S.getOpcode() &&
1687 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1688 !S.isAltShuffle()) &&
1689 all_of(Ops, [&S](Value *V) {
1690 return cast<Instruction>(V)->getNumOperands() ==
1691 S.MainOp->getNumOperands();
1692 }))
1693 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1694 : LookAheadHeuristics::ScoreSameOpcode;
1695 }
1696
1697 if (isa<UndefValue>(V2))
1698 return LookAheadHeuristics::ScoreUndef;
1699
1700 return CheckSameEntryOrFail();
1701 }
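 // A standalone sketch (helper name made up, not part of this file) of the
 // distance tests above: the element distance between two candidate loads or
 // extracts picks between the consecutive, reversed and gather-candidate
 // scores.
 //
 //   #include <cstdio>
 //   #include <cstdlib>
 //
 //   enum { Fail = 0, GatherCandidate = 1, Reversed = 3, Consecutive = 4 };
 //
 //   int scorePairByDistance(int Idx1, int Idx2, int NumLanes) {
 //     int Dist = Idx2 - Idx1;
 //     if (Dist == 0)
 //       return Fail;            // same element; handled separately above
 //     if (std::abs(Dist) > NumLanes / 2)
 //       return GatherCandidate; // too far apart for a plain vector access
 //     return Dist > 0 ? Consecutive : Reversed;
 //   }
 //
 //   int main() {
 //     std::printf("%d %d %d\n", scorePairByDistance(0, 1, 4),  // 4
 //                 scorePairByDistance(1, 0, 4),                // 3
 //                 scorePairByDistance(0, 7, 4));               // 1
 //   }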
1702
1703 /// Go through the operands of \p LHS and \p RHS recursively until
1704 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1705 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1706 /// of \p U1 and \p U2), except at the beginning of the recursion where
1707 /// these are set to nullptr.
1708 ///
1709 /// For example:
1710 /// \verbatim
1711 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1712 /// \ / \ / \ / \ /
1713 /// + + + +
1714 /// G1 G2 G3 G4
1715 /// \endverbatim
1716 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1717 /// each level recursively, accumulating the score. It starts from matching
1718 /// the additions at level 0, then moves on to the loads (level 1). The
1719 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1720 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1721 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1722 /// Please note that the order of the operands does not matter, as we
1723 /// evaluate the score of all profitable combinations of operands. In
1724 /// other words the score of G1 and G4 is the same as G1 and G2. This
1725 /// heuristic is based on ideas described in:
1726 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1727 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1728 /// Luís F. W. Góes
1729 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1730 Instruction *U2, int CurrLevel,
1731 ArrayRef<Value *> MainAltOps) const {
1732
1733 // Get the shallow score of V1 and V2.
1734 int ShallowScoreAtThisLevel =
1735 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1736
1737 // If reached MaxLevel,
1738 // or if V1 and V2 are not instructions,
1739 // or if they are SPLAT,
1740 // or if they are not consecutive,
1741 // or if profitable to vectorize loads or extractelements, early return
1742 // the current cost.
1743 auto *I1 = dyn_cast<Instruction>(LHS);
1744 auto *I2 = dyn_cast<Instruction>(RHS);
1745 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1746 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1747 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1748 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1749 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1750 ShallowScoreAtThisLevel))
1751 return ShallowScoreAtThisLevel;
1752 assert(I1 && I2 && "Should have early exited.");
1753
1754 // Contains the I2 operand indexes that got matched with I1 operands.
1755 SmallSet<unsigned, 4> Op2Used;
1756
1757 // Recursion towards the operands of I1 and I2. We are trying all possible
1758 // operand pairs, and keeping track of the best score.
1759 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1760 OpIdx1 != NumOperands1; ++OpIdx1) {
1761 // Try to pair op1I with the best operand of I2.
1762 int MaxTmpScore = 0;
1763 unsigned MaxOpIdx2 = 0;
1764 bool FoundBest = false;
1765 // If I2 is commutative try all combinations.
1766 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1767 unsigned ToIdx = isCommutative(I2)
1768 ? I2->getNumOperands()
1769 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1770 assert(FromIdx <= ToIdx && "Bad index");
1771 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1772 // Skip operands already paired with OpIdx1.
1773 if (Op2Used.count(OpIdx2))
1774 continue;
1775 // Recursively calculate the cost at each level
1776 int TmpScore =
1777 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1778 I1, I2, CurrLevel + 1, std::nullopt);
1779 // Look for the best score.
1780 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1781 TmpScore > MaxTmpScore) {
1782 MaxTmpScore = TmpScore;
1783 MaxOpIdx2 = OpIdx2;
1784 FoundBest = true;
1785 }
1786 }
1787 if (FoundBest) {
1788 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1789 Op2Used.insert(MaxOpIdx2);
1790 ShallowScoreAtThisLevel += MaxTmpScore;
1791 }
1792 }
1793 return ShallowScoreAtThisLevel;
1794 }
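 // A standalone sketch, under simplified assumptions (a toy expression tree
 // and a 0/1 shallow score), of the recursion above: each operand of the
 // first node is greedily paired with its best unused operand of the second
 // node and the scores are accumulated.
 //
 //   #include <cstdio>
 //   #include <vector>
 //
 //   struct Node {
 //     char Op;
 //     std::vector<const Node *> Ops;
 //   };
 //
 //   int scoreAtLevelRec(const Node &A, const Node &B, int Level, int MaxLevel) {
 //     int Score = A.Op == B.Op ? 1 : 0; // "shallow" score of this pair
 //     if (Score == 0 || Level == MaxLevel)
 //       return Score;
 //     std::vector<bool> Used(B.Ops.size(), false);
 //     for (const Node *OpA : A.Ops) {
 //       int BestScore = 0, BestIdx = -1;
 //       for (int J = 0, E = (int)B.Ops.size(); J != E; ++J) {
 //         if (Used[J])
 //           continue;
 //         int S = scoreAtLevelRec(*OpA, *B.Ops[J], Level + 1, MaxLevel);
 //         if (S > BestScore) { BestScore = S; BestIdx = J; }
 //       }
 //       if (BestIdx >= 0) { Used[BestIdx] = true; Score += BestScore; }
 //     }
 //     return Score;
 //   }
 //
 //   int main() {
 //     Node A0{'L', {}}, A1{'L', {}}, B0{'L', {}}, B1{'L', {}};
 //     Node G1{'+', {&A0, &B0}}, G2{'+', {&A1, &B1}};
 //     std::printf("%d\n", scoreAtLevelRec(G1, G2, 1, 2)); // 3: '+' plus two load pairs
 //   }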
1795 };
1796 /// A helper data structure to hold the operands of a vector of instructions.
1797 /// This supports a fixed vector length for all operand vectors.
1798 class VLOperands {
1799 /// For each operand we need (i) the value, and (ii) the opcode that it
1800 /// would be attached to if the expression was in a left-linearized form.
1801 /// This is required to avoid illegal operand reordering.
1802 /// For example:
1803 /// \verbatim
1804 /// 0 Op1
1805 /// |/
1806 /// Op1 Op2 Linearized + Op2
1807 /// \ / ----------> |/
1808 /// - -
1809 ///
1810 /// Op1 - Op2 (0 + Op1) - Op2
1811 /// \endverbatim
1812 ///
1813 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1814 ///
1815 /// Another way to think of this is to track all the operations across the
1816 /// path from the operand all the way to the root of the tree and to
1817 /// calculate the operation that corresponds to this path. For example, the
1818 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1819 /// corresponding operation is a '-' (which matches the one in the
1820 /// linearized tree, as shown above).
1821 ///
1822 /// For lack of a better term, we refer to this operation as Accumulated
1823 /// Path Operation (APO).
1824 struct OperandData {
1825 OperandData() = default;
1826 OperandData(Value *V, bool APO, bool IsUsed)
1827 : V(V), APO(APO), IsUsed(IsUsed) {}
1828 /// The operand value.
1829 Value *V = nullptr;
1830 /// TreeEntries only allow a single opcode, or an alternate sequence of
1831 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1832 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1833 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1834 /// (e.g., Add/Mul)
1835 bool APO = false;
1836 /// Helper data for the reordering function.
1837 bool IsUsed = false;
1838 };
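 // A short worked example of the APO flags (illustration only): for the two
 // lanes VL = { a + b, c - d }, operand 0 (the LHS) is never attached to an
 // inverse operation, while operand 1 (the RHS) is inverted exactly when the
 // lane's operation is non-commutative:
 //
 //   // APO of operand OpIdx in a lane whose instruction commutativity is
 //   // IsCommutative (mirrors the assignment in appendOperandsOfVL below).
 //   bool laneAPO(unsigned OpIdx, bool IsCommutative) {
 //     return OpIdx != 0 && !IsCommutative;
 //   }
 //   // lane 0 (add): laneAPO(0, true)  == false, laneAPO(1, true)  == false
 //   // lane 1 (sub): laneAPO(0, false) == false, laneAPO(1, false) == true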
1839
1840 /// During operand reordering, we are trying to select the operand at lane
1841 /// that matches best with the operand at the neighboring lane. Our
1842 /// selection is based on the type of value we are looking for. For example,
1843 /// if the neighboring lane has a load, we need to look for a load that is
1844 /// accessing a consecutive address. These strategies are summarized in the
1845 /// 'ReorderingMode' enumerator.
1846 enum class ReorderingMode {
1847 Load, ///< Matching loads to consecutive memory addresses
1848 Opcode, ///< Matching instructions based on opcode (same or alternate)
1849 Constant, ///< Matching constants
1850 Splat, ///< Matching the same instruction multiple times (broadcast)
1851 Failed, ///< We failed to create a vectorizable group
1852 };
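 // A rough standalone sketch, not the actual selection logic, of how the mode
 // is derived from the kind of value in the first visited lane (the real
 // decision, including the broadcast check, is made in reorder() below):
 //
 //   enum class Mode { Load, Opcode, Constant, Splat, Failed };
 //   enum class Kind { Load, OtherInstruction, Constant, Argument, Other };
 //
 //   Mode pickInitialMode(Kind K) {
 //     switch (K) {
 //     case Kind::Load:             return Mode::Load;
 //     case Kind::OtherInstruction: return Mode::Opcode; // or Splat if broadcasting wins
 //     case Kind::Constant:         return Mode::Constant;
 //     case Kind::Argument:         return Mode::Splat;  // best hope is a splat
 //     case Kind::Other:            return Mode::Failed;
 //     }
 //     return Mode::Failed;
 //   }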
1853
1854 using OperandDataVec = SmallVector<OperandData, 2>;
1855
1856 /// A vector of operand vectors.
1857 SmallVector<OperandDataVec, 4> OpsVec;
1858
1859 const TargetLibraryInfo &TLI;
1860 const DataLayout &DL;
1861 ScalarEvolution &SE;
1862 const BoUpSLP &R;
1863 const Loop *L = nullptr;
1864
1865 /// \returns the operand data at \p OpIdx and \p Lane.
1866 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1867 return OpsVec[OpIdx][Lane];
1868 }
1869
1870 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1871 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1872 return OpsVec[OpIdx][Lane];
1873 }
1874
1875 /// Clears the used flag for all entries.
1876 void clearUsed() {
1877 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1878 OpIdx != NumOperands; ++OpIdx)
1879 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1880 ++Lane)
1881 OpsVec[OpIdx][Lane].IsUsed = false;
1882 }
1883
1884 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1885 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1886 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1887 }
1888
1889 /// \param Lane lane of the operands under analysis.
1890 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1891 /// the best candidate.
1892 /// \param Idx operand index of the current candidate value.
1893 /// \returns The additional score due to possible broadcasting of the
1894 /// elements in the lane. It is more profitable to have a power-of-2 number
1895 /// of unique elements in the lane, since it will be vectorized with higher
1896 /// probability after removing duplicates. Currently the SLP vectorizer
1897 /// supports only vectorization of a power-of-2 number of unique scalars.
1898 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1899 Value *IdxLaneV = getData(Idx, Lane).V;
1900 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1901 return 0;
1902 SmallPtrSet<Value *, 4> Uniques;
1903 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1904 if (Ln == Lane)
1905 continue;
1906 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1907 if (!isa<Instruction>(OpIdxLnV))
1908 return 0;
1909 Uniques.insert(OpIdxLnV);
1910 }
1911 int UniquesCount = Uniques.size();
1912 int UniquesCntWithIdxLaneV =
1913 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1914 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1915 int UniquesCntWithOpIdxLaneV =
1916 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1917 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1918 return 0;
1919 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1920 UniquesCntWithOpIdxLaneV) -
1921 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1922 }
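 // A worked example of the formula above (illustration with made-up helper
 // names): the score is the difference in "padding to the next power of two"
 // between keeping the current operand and taking the candidate.
 //
 //   unsigned nextPow2(unsigned N) {
 //     unsigned P = 1;
 //     while (P < N)
 //       P <<= 1;
 //     return P;
 //   }
 //   int padding(unsigned Uniques) { return int(nextPow2(Uniques) - Uniques); }
 //   // splat score = padding(uniques if current kept) - padding(uniques if
 //   // candidate taken), e.g. padding(3) - padding(4) = 1 - 0 = +1: the
 //   // candidate that keeps the unique-scalar count at a power of two wins.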
1923
1924 /// \param Lane lane of the operands under analysis.
1925 /// \param OpIdx operand index in \p Lane lane we're looking the best
1926 /// candidate for.
1927 /// \param Idx operand index of the current candidate value.
1928 /// \returns The additional score for the scalar which users are all
1929 /// vectorized.
1930 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1931 Value *IdxLaneV = getData(Idx, Lane).V;
1932 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1933 // Do not care about number of uses for vector-like instructions
1934 // (extractelement/extractvalue with constant indices), they are extracts
1935 // themselves and already externally used. Vectorization of such
1936 // instructions does not add extra extractelement instruction, just may
1937 // remove it.
1938 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1939 isVectorLikeInstWithConstOps(OpIdxLaneV))
1940 return LookAheadHeuristics::ScoreAllUserVectorized;
1941 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1942 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1943 return 0;
1944 return R.areAllUsersVectorized(IdxLaneI)
1945 ? LookAheadHeuristics::ScoreAllUserVectorized
1946 : 0;
1947 }
1948
1949 /// Score scaling factor for fully compatible instructions but with
1950 /// different number of external uses. Allows better selection of the
1951 /// instructions with less external uses.
1952 static const int ScoreScaleFactor = 10;
1953
1954 /// \returns the look-ahead score, which tells us how much the sub-trees
1955 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1956 /// score. This helps break ties in an informed way when we cannot decide on
1957 /// the order of the operands by just considering the immediate
1958 /// predecessors.
1959 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1960 int Lane, unsigned OpIdx, unsigned Idx,
1961 bool &IsUsed) {
1962 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1963 LookAheadMaxDepth);
1964 // Keep track of the instruction stack as we recurse into the operands
1965 // during the look-ahead score exploration.
1966 int Score =
1967 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1968 /*CurrLevel=*/1, MainAltOps);
1969 if (Score) {
1970 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1971 if (Score <= -SplatScore) {
1972 // Set the minimum score for splat-like sequence to avoid setting
1973 // failed state.
1974 Score = 1;
1975 } else {
1976 Score += SplatScore;
1977 // Scale score to see the difference between different operands
1978 // and similar operands but all vectorized/not all vectorized
1979 // uses. It does not affect actual selection of the best
1980 // compatible operand in general, just allows to select the
1981 // operand with all vectorized uses.
1982 Score *= ScoreScaleFactor;
1983 Score += getExternalUseScore(Lane, OpIdx, Idx);
1984 IsUsed = true;
1985 }
1986 }
1987 return Score;
1988 }
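 // A worked example of the scaling above (ScoreScaleFactor == 10): two
 // candidates with the same look-ahead score of 4 and no splat bonus become
 // 40 each, and the external-use bonus (0 or 1) then breaks the tie.
 //
 //   constexpr int scaled(int Score, int ExternalUseBonus) {
 //     return Score * 10 + ExternalUseBonus;
 //   }
 //   static_assert(scaled(4, 1) > scaled(4, 0),
 //                 "all-users-vectorized candidate wins the tie");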
1989
1990 /// Best defined scores per lanes between the passes. Used to choose the
1991 /// best operand (with the highest score) between the passes.
1992 /// The key - {Operand Index, Lane}.
1993 /// The value - the best score between the passes for the lane and the
1994 /// operand.
1995 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1996 BestScoresPerLanes;
1997
1998 // Search all operands in Ops[*][Lane] for the one that matches best
1999 // Ops[OpIdx][LastLane] and return its operand index.
2000 // If no good match can be found, return std::nullopt.
2001 std::optional<unsigned>
2002 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2003 ArrayRef<ReorderingMode> ReorderingModes,
2004 ArrayRef<Value *> MainAltOps) {
2005 unsigned NumOperands = getNumOperands();
2006
2007 // The operand of the previous lane at OpIdx.
2008 Value *OpLastLane = getData(OpIdx, LastLane).V;
2009
2010 // Our strategy mode for OpIdx.
2011 ReorderingMode RMode = ReorderingModes[OpIdx];
2012 if (RMode == ReorderingMode::Failed)
2013 return std::nullopt;
2014
2015 // The linearized opcode of the operand at OpIdx, Lane.
2016 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2017
2018 // The best operand index and its score.
2019 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2020 // are using the score to differentiate between the two.
2021 struct BestOpData {
2022 std::optional<unsigned> Idx;
2023 unsigned Score = 0;
2024 } BestOp;
2025 BestOp.Score =
2026 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2027 .first->second;
2028
2029 // Track if the operand must be marked as used. If the operand is set to
2030 // Score 1 explicitly (because of a non-power-of-2 count of unique scalars,
2031 // we may want to re-estimate the operands again on the following iterations).
2032 bool IsUsed = RMode == ReorderingMode::Splat ||
2033 RMode == ReorderingMode::Constant ||
2034 RMode == ReorderingMode::Load;
2035 // Iterate through all unused operands and look for the best.
2036 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2037 // Get the operand at Idx and Lane.
2038 OperandData &OpData = getData(Idx, Lane);
2039 Value *Op = OpData.V;
2040 bool OpAPO = OpData.APO;
2041
2042 // Skip already selected operands.
2043 if (OpData.IsUsed)
2044 continue;
2045
2046 // Skip if we are trying to move the operand to a position with a
2047 // different opcode in the linearized tree form. This would break the
2048 // semantics.
2049 if (OpAPO != OpIdxAPO)
2050 continue;
2051
2052 // Look for an operand that matches the current mode.
2053 switch (RMode) {
2054 case ReorderingMode::Load:
2055 case ReorderingMode::Opcode: {
2056 bool LeftToRight = Lane > LastLane;
2057 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2058 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2059 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2060 OpIdx, Idx, IsUsed);
2061 if (Score > static_cast<int>(BestOp.Score) ||
2062 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2063 Idx == OpIdx)) {
2064 BestOp.Idx = Idx;
2065 BestOp.Score = Score;
2066 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2067 }
2068 break;
2069 }
2070 case ReorderingMode::Constant:
2071 if (isa<Constant>(Op) ||
2072 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2073 BestOp.Idx = Idx;
2074 if (isa<Constant>(Op)) {
2075 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2076 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2077 LookAheadHeuristics::ScoreConstants;
2078 }
2079 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2080 IsUsed = false;
2081 }
2082 break;
2083 case ReorderingMode::Splat:
2084 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2085 IsUsed = Op == OpLastLane;
2086 if (Op == OpLastLane) {
2087 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2088 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2089 LookAheadHeuristics::ScoreSplat;
2090 }
2091 BestOp.Idx = Idx;
2092 }
2093 break;
2094 case ReorderingMode::Failed:
2095 llvm_unreachable("Not expected Failed reordering mode.");
2096 }
2097 }
2098
2099 if (BestOp.Idx) {
2100 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2101 return BestOp.Idx;
2102 }
2103 // If we could not find a good match return std::nullopt.
2104 return std::nullopt;
2105 }
2106
2107 /// Helper for reorderOperandVecs.
2108 /// \returns the lane that we should start reordering from. This is the one
2109 /// which has the least number of operands that can freely move about, or
2110 /// is less profitable because it already has the most optimal set of operands.
2111 unsigned getBestLaneToStartReordering() const {
2112 unsigned Min = UINT_MAX;
2113 unsigned SameOpNumber = 0;
2114 // std::pair<unsigned, unsigned> is used to implement a simple voting
2115 // algorithm and choose the lane with the least number of operands that
2116 // can freely move about, or is less profitable because it already has the
2117 // most optimal set of operands. The first unsigned is a counter for
2118 // voting, the second unsigned is the counter of lanes with instructions
2119 // with same/alternate opcodes and same parent basic block.
2121 // Try to be closer to the original results, if we have multiple lanes
2122 // with same cost. If 2 lanes have the same cost, use the one with the
2123 // lowest index.
2124 for (int I = getNumLanes(); I > 0; --I) {
2125 unsigned Lane = I - 1;
2126 OperandsOrderData NumFreeOpsHash =
2127 getMaxNumOperandsThatCanBeReordered(Lane);
2128 // Compare the number of operands that can move and choose the one with
2129 // the least number.
2130 if (NumFreeOpsHash.NumOfAPOs < Min) {
2131 Min = NumFreeOpsHash.NumOfAPOs;
2132 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2133 HashMap.clear();
2134 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2135 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2136 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2137 // Select the most optimal lane in terms of number of operands that
2138 // should be moved around.
2139 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2140 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2141 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2142 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2143 auto *It = HashMap.find(NumFreeOpsHash.Hash);
2144 if (It == HashMap.end())
2145 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2146 else
2147 ++It->second.first;
2148 }
2149 }
2150 // Select the lane with the minimum counter.
2151 unsigned BestLane = 0;
2152 unsigned CntMin = UINT_MAX;
2153 for (const auto &Data : reverse(HashMap)) {
2154 if (Data.second.first < CntMin) {
2155 CntMin = Data.second.first;
2156 BestLane = Data.second.second;
2157 }
2158 }
2159 return BestLane;
2160 }
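 // A simplified standalone sketch of the selection above (the real code also
 // hashes the operand pattern and iterates the lanes in reverse): prefer the
 // lane with the fewest freely movable operands, then the one whose operand
 // pattern received the most votes.
 //
 //   #include <vector>
 //
 //   struct LaneSummary { unsigned Lane, NumFreeOps, Votes; };
 //
 //   unsigned pickStartLane(const std::vector<LaneSummary> &Lanes) {
 //     const LaneSummary *Best = &Lanes.front();
 //     for (const LaneSummary &L : Lanes)
 //       if (L.NumFreeOps < Best->NumFreeOps ||
 //           (L.NumFreeOps == Best->NumFreeOps && L.Votes > Best->Votes))
 //         Best = &L;
 //     return Best->Lane;
 //   }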
2161
2162 /// Data structure that helps to reorder operands.
2163 struct OperandsOrderData {
2164 /// The best number of operands with the same APOs, which can be
2165 /// reordered.
2166 unsigned NumOfAPOs = UINT_MAX;
2167 /// Number of operands with the same/alternate instruction opcode and
2168 /// parent.
2169 unsigned NumOpsWithSameOpcodeParent = 0;
2170 /// Hash for the actual operands ordering.
2171 /// Used to count operands, actually their position id and opcode
2172 /// value. It is used in the voting mechanism to find the lane with the
2173 /// least number of operands that can freely move about, or is less profitable
2174 /// because it already has the most optimal set of operands. Can be
2175 /// replaced with SmallVector<unsigned> instead but hash code is faster
2176 /// and requires less memory.
2177 unsigned Hash = 0;
2178 };
2179 /// \returns the maximum number of operands that are allowed to be reordered
2180 /// for \p Lane and the number of compatible instructions (with the same
2181 /// parent/opcode). This is used as a heuristic for selecting the first lane
2182 /// to start operand reordering.
2183 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2184 unsigned CntTrue = 0;
2185 unsigned NumOperands = getNumOperands();
2186 // Operands with the same APO can be reordered. We therefore need to count
2187 // how many of them we have for each APO, like this: Cnt[APO] = x.
2188 // Since we only have two APOs, namely true and false, we can avoid using
2189 // a map. Instead we can simply count the number of operands that
2190 // correspond to one of them (in this case the 'true' APO), and calculate
2191 // the other by subtracting it from the total number of operands.
2192 // Operands with the same instruction opcode and parent are more
2193 // profitable since we don't need to move them in many cases, with a high
2194 // probability such lane already can be vectorized effectively.
2195 bool AllUndefs = true;
2196 unsigned NumOpsWithSameOpcodeParent = 0;
2197 Instruction *OpcodeI = nullptr;
2198 BasicBlock *Parent = nullptr;
2199 unsigned Hash = 0;
2200 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2201 const OperandData &OpData = getData(OpIdx, Lane);
2202 if (OpData.APO)
2203 ++CntTrue;
2204 // Use Boyer-Moore majority voting for finding the majority opcode and
2205 // the number of times it occurs.
2206 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2207 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2208 I->getParent() != Parent) {
2209 if (NumOpsWithSameOpcodeParent == 0) {
2210 NumOpsWithSameOpcodeParent = 1;
2211 OpcodeI = I;
2212 Parent = I->getParent();
2213 } else {
2214 --NumOpsWithSameOpcodeParent;
2215 }
2216 } else {
2217 ++NumOpsWithSameOpcodeParent;
2218 }
2219 }
2220 Hash = hash_combine(
2221 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2222 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2223 }
2224 if (AllUndefs)
2225 return {};
2226 OperandsOrderData Data;
2227 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2228 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2229 Data.Hash = Hash;
2230 return Data;
2231 }
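 // A standalone sketch of the Boyer-Moore majority vote used above, applied
 // to the opcodes of one lane's operands (strings stand in for opcodes here):
 //
 //   #include <cstdio>
 //   #include <string>
 //   #include <vector>
 //
 //   int main() {
 //     std::vector<std::string> Opcodes = {"add", "add", "mul", "add"};
 //     std::string Majority;
 //     int Count = 0;
 //     for (const std::string &Op : Opcodes) {
 //       if (Count == 0) { Majority = Op; Count = 1; }
 //       else if (Op == Majority) ++Count;
 //       else --Count;
 //     }
 //     // Majority == "add", Count == 2: "add" leads the other opcodes by two.
 //     std::printf("%s %d\n", Majority.c_str(), Count);
 //   }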
2232
2233 /// Go through the instructions in VL and append their operands.
2234 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2235 assert(!VL.empty() && "Bad VL");
2236 assert((empty() || VL.size() == getNumLanes()) &&
2237 "Expected same number of lanes");
2238 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2239 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2240 constexpr unsigned IntrinsicNumOperands = 2;
2241 if (isa<IntrinsicInst>(VL[0]))
2242 NumOperands = IntrinsicNumOperands;
2243 OpsVec.resize(NumOperands);
2244 unsigned NumLanes = VL.size();
2245 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2246 OpsVec[OpIdx].resize(NumLanes);
2247 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2248 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2249 // Our tree has just 3 nodes: the root and two operands.
2250 // It is therefore trivial to get the APO. We only need to check the
2251 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2252 // RHS operand. The LHS operand of both add and sub is never attached
2253 // to an inverse operation in the linearized form, therefore its APO
2254 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2255
2256 // Since operand reordering is performed on groups of commutative
2257 // operations or alternating sequences (e.g., +, -), we can safely
2258 // tell the inverse operations by checking commutativity.
2259 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2260 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2261 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2262 APO, false};
2263 }
2264 }
2265 }
2266
2267 /// \returns the number of operands.
2268 unsigned getNumOperands() const { return OpsVec.size(); }
2269
2270 /// \returns the number of lanes.
2271 unsigned getNumLanes() const { return OpsVec[0].size(); }
2272
2273 /// \returns the operand value at \p OpIdx and \p Lane.
2274 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2275 return getData(OpIdx, Lane).V;
2276 }
2277
2278 /// \returns true if the data structure is empty.
2279 bool empty() const { return OpsVec.empty(); }
2280
2281 /// Clears the data.
2282 void clear() { OpsVec.clear(); }
2283
2284 /// \returns true if there are enough operands identical to \p Op to fill
2285 /// the whole vector, possibly mixed with constants or loop-invariant values.
2286 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2287 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2288 bool OpAPO = getData(OpIdx, Lane).APO;
2289 bool IsInvariant = L && L->isLoopInvariant(Op);
2290 unsigned Cnt = 0;
2291 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2292 if (Ln == Lane)
2293 continue;
2294 // This is set to true if we found a candidate for broadcast at Lane.
2295 bool FoundCandidate = false;
2296 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2297 OperandData &Data = getData(OpI, Ln);
2298 if (Data.APO != OpAPO || Data.IsUsed)
2299 continue;
2300 Value *OpILane = getValue(OpI, Lane);
2301 bool IsConstantOp = isa<Constant>(OpILane);
2302 // Consider the broadcast candidate if:
2303 // 1. Same value is found in one of the operands.
2304 if (Data.V == Op ||
2305 // 2. The operand in the given lane is not constant but there is a
2306 // constant operand in another lane (which can be moved to the
2307 // given lane). In this case we can represent it as a simple
2308 // permutation of constant and broadcast.
2309 (!IsConstantOp &&
2310 ((Lns > 2 && isa<Constant>(Data.V)) ||
2311 // 2.1. If we have only 2 lanes, we need to check that the value in the
2312 // next lane does not build the same opcode sequence.
2313 (Lns == 2 &&
2314 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2315 .getOpcode() &&
2316 isa<Constant>(Data.V)))) ||
2317 // 3. The operand in the current lane is loop invariant (can be
2318 // hoisted out) and another operand is also a loop invariant
2319 // (though not a constant). In this case the whole vector can be
2320 // hoisted out.
2321 // FIXME: need to teach the cost model about this case for better
2322 // estimation.
2323 (IsInvariant && !isa<Constant>(Data.V) &&
2324 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2325 L->isLoopInvariant(Data.V))) {
2326 FoundCandidate = true;
2327 Data.IsUsed = Data.V == Op;
2328 if (Data.V == Op)
2329 ++Cnt;
2330 break;
2331 }
2332 }
2333 if (!FoundCandidate)
2334 return false;
2335 }
2336 return getNumLanes() == 2 || Cnt > 1;
2337 }
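 // A condensed standalone restatement of the final decision above (the lane
 // scan itself is elided): broadcasting wins only if every other lane offered
 // a compatible slot and the value actually repeats (or there are just two
 // lanes).
 //
 //   bool shouldBroadcastDecision(unsigned NumLanes, unsigned LanesWithSameValue,
 //                                bool EveryOtherLaneHasCandidate) {
 //     return EveryOtherLaneHasCandidate &&
 //            (NumLanes == 2 || LanesWithSameValue > 1);
 //   }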
2338
2339 /// Checks if there is at least single compatible operand in lanes other
2340 /// than \p Lane, compatible with the operand \p Op.
2341 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2342 bool OpAPO = getData(OpIdx, Lane).APO;
2343 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2344 if (Ln == Lane)
2345 continue;
2346 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2347 const OperandData &Data = getData(OpI, Ln);
2348 if (Data.APO != OpAPO || Data.IsUsed)
2349 return true;
2350 Value *OpILn = getValue(OpI, Ln);
2351 return (L && L->isLoopInvariant(OpILn)) ||
2352 (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2353 Op->getParent() == cast<Instruction>(OpILn)->getParent());
2354 }))
2355 return true;
2356 }
2357 return false;
2358 }
2359
2360 public:
2361 /// Initialize with all the operands of the instruction vector \p RootVL.
2362 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2363 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2364 L(R.LI->getLoopFor(
2365 (cast<Instruction>(RootVL.front())->getParent()))) {
2366 // Append all the operands of RootVL.
2367 appendOperandsOfVL(RootVL);
2368 }
2369
2370 /// \returns a value vector with the operands across all lanes for the
2371 /// operand at \p OpIdx.
2372 ValueList getVL(unsigned OpIdx) const {
2373 ValueList OpVL(OpsVec[OpIdx].size());
2374 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2375 "Expected same num of lanes across all operands");
2376 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2377 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2378 return OpVL;
2379 }
2380
2381 // Performs operand reordering for 2 or more operands.
2382 // The original operands are in OrigOps[OpIdx][Lane].
2383 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2384 void reorder() {
2385 unsigned NumOperands = getNumOperands();
2386 unsigned NumLanes = getNumLanes();
2387 // Each operand has its own mode. We are using this mode to help us select
2388 // the instructions for each lane, so that they match best with the ones
2389 // we have selected so far.
2390 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2391
2392 // This is a greedy single-pass algorithm. We are going over each lane
2393 // once and deciding on the best order right away with no back-tracking.
2394 // However, in order to increase its effectiveness, we start with the lane
2395 // that has operands that can move the least. For example, given the
2396 // following lanes:
2397 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2398 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2399 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2400 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2401 // we will start at Lane 1, since the operands of the subtraction cannot
2402 // be reordered. Then we will visit the rest of the lanes in a circular
2403 // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
2404
2405 // Find the first lane that we will start our search from.
2406 unsigned FirstLane = getBestLaneToStartReordering();
2407
2408 // Initialize the modes.
2409 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2410 Value *OpLane0 = getValue(OpIdx, FirstLane);
2411 // Keep track if we have instructions with all the same opcode on one
2412 // side.
2413 if (isa<LoadInst>(OpLane0))
2414 ReorderingModes[OpIdx] = ReorderingMode::Load;
2415 else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2416 // Check if OpLane0 should be broadcast.
2417 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2418 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2419 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2420 else
2421 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2422 } else if (isa<Constant>(OpLane0))
2423 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2424 else if (isa<Argument>(OpLane0))
2425 // Our best hope is a Splat. It may save some cost in some cases.
2426 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2427 else
2428 // NOTE: This should be unreachable.
2429 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2430 }
2431
2432 // Check that we don't have same operands. No need to reorder if operands
2433 // are just perfect diamond or shuffled diamond match. Do not do it only
2434 // for possible broadcasts or a non-power-of-2 number of scalars (just for
2435 // now).
2436 auto &&SkipReordering = [this]() {
2437 SmallPtrSet<Value *, 4> UniqueValues;
2438 ArrayRef<OperandData> Op0 = OpsVec.front();
2439 for (const OperandData &Data : Op0)
2440 UniqueValues.insert(Data.V);
2441 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2442 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2443 return !UniqueValues.contains(Data.V);
2444 }))
2445 return false;
2446 }
2447 // TODO: Check if we can remove a check for non-power-2 number of
2448 // scalars after full support of non-power-2 vectorization.
2449 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2450 };
2451
2452 // If the initial strategy fails for any of the operand indexes, then we
2453 // perform reordering again in a second pass. This helps avoid assigning
2454 // high priority to the failed strategy, and should improve reordering for
2455 // the non-failed operand indexes.
2456 for (int Pass = 0; Pass != 2; ++Pass) {
2457 // Check if there is no need to reorder operands since they are a perfect
2458 // or shuffled diamond match.
2459 // Need to do it to avoid extra external use cost counting for
2460 // shuffled matches, which may cause regressions.
2461 if (SkipReordering())
2462 break;
2463 // Skip the second pass if the first pass did not fail.
2464 bool StrategyFailed = false;
2465 // Mark all operand data as free to use.
2466 clearUsed();
2467 // We keep the original operand order for the FirstLane, so reorder the
2468 // rest of the lanes. We are visiting the nodes in a circular fashion,
2469 // using FirstLane as the center point and increasing the radius
2470 // distance.
2471 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2472 for (unsigned I = 0; I < NumOperands; ++I)
2473 MainAltOps[I].push_back(getData(I, FirstLane).V);
2474
2475 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2476 // Visit the lane on the right and then the lane on the left.
2477 for (int Direction : {+1, -1}) {
2478 int Lane = FirstLane + Direction * Distance;
2479 if (Lane < 0 || Lane >= (int)NumLanes)
2480 continue;
2481 int LastLane = Lane - Direction;
2482 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2483 "Out of bounds");
2484 // Look for a good match for each operand.
2485 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2486 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2487 std::optional<unsigned> BestIdx = getBestOperand(
2488 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2489 // By not selecting a value, we allow the operands that follow to
2490 // select a better matching value. We will get a non-null value in
2491 // the next run of getBestOperand().
2492 if (BestIdx) {
2493 // Swap the current operand with the one returned by
2494 // getBestOperand().
2495 swap(OpIdx, *BestIdx, Lane);
2496 } else {
2497 // Enable the second pass.
2498 StrategyFailed = true;
2499 }
2500 // Try to get the alternate opcode and follow it during analysis.
2501 if (MainAltOps[OpIdx].size() != 2) {
2502 OperandData &AltOp = getData(OpIdx, Lane);
2503 InstructionsState OpS =
2504 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2505 if (OpS.getOpcode() && OpS.isAltShuffle())
2506 MainAltOps[OpIdx].push_back(AltOp.V);
2507 }
2508 }
2509 }
2510 }
2511 // Skip second pass if the strategy did not fail.
2512 if (!StrategyFailed)
2513 break;
2514 }
2515 }
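 // A standalone sketch of the circular visiting order described above: start
 // at FirstLane and walk outward, right lane first, then left, at increasing
 // distance.
 //
 //   #include <cstdio>
 //
 //   int main() {
 //     int NumLanes = 4, FirstLane = 1;
 //     for (int Distance = 1; Distance != NumLanes; ++Distance)
 //       for (int Direction : {+1, -1}) {
 //         int Lane = FirstLane + Direction * Distance;
 //         if (Lane < 0 || Lane >= NumLanes)
 //           continue;
 //         std::printf("%d ", Lane); // prints: 2 0 3
 //       }
 //   }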
2516
2517#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2518 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2519 switch (RMode) {
2520 case ReorderingMode::Load:
2521 return "Load";
2522 case ReorderingMode::Opcode:
2523 return "Opcode";
2524 case ReorderingMode::Constant:
2525 return "Constant";
2526 case ReorderingMode::Splat:
2527 return "Splat";
2528 case ReorderingMode::Failed:
2529 return "Failed";
2530 }
2531 llvm_unreachable("Unimplemented Reordering Type");
2532 }
2533
2534 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2535 raw_ostream &OS) {
2536 return OS << getModeStr(RMode);
2537 }
2538
2539 /// Debug print.
2540 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2541 printMode(RMode, dbgs());
2542 }
2543
2544 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2545 return printMode(RMode, OS);
2546 }
2547
2548 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2549 const unsigned Indent = 2;
2550 unsigned Cnt = 0;
2551 for (const OperandDataVec &OpDataVec : OpsVec) {
2552 OS << "Operand " << Cnt++ << "\n";
2553 for (const OperandData &OpData : OpDataVec) {
2554 OS.indent(Indent) << "{";
2555 if (Value *V = OpData.V)
2556 OS << *V;
2557 else
2558 OS << "null";
2559 OS << ", APO:" << OpData.APO << "}\n";
2560 }
2561 OS << "\n";
2562 }
2563 return OS;
2564 }
2565
2566 /// Debug print.
2567 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2568#endif
2569 };
2570
2571 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2572 /// of the pair with the highest score, i.e. the one deemed to have the best
2573 /// chance to form the root of a profitable tree to vectorize. Return
2574 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2575 /// \param Limit Lower limit of the cost, considered to be a good enough score.
2576 std::optional<int>
2577 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2578 int Limit = LookAheadHeuristics::ScoreFail) const {
2579 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2580 RootLookAheadMaxDepth);
2581 int BestScore = Limit;
2582 std::optional<int> Index;
2583 for (int I : seq<int>(0, Candidates.size())) {
2584 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2585 Candidates[I].second,
2586 /*U1=*/nullptr, /*U2=*/nullptr,
2587 /*Level=*/1, std::nullopt);
2588 if (Score > BestScore) {
2589 BestScore = Score;
2590 Index = I;
2591 }
2592 }
2593 return Index;
2594 }
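 // A standalone sketch of the selection above: return the index of the
 // highest-scoring candidate, or std::nullopt if nothing beats the limit
 // (plain ints stand in for the per-pair look-ahead scores).
 //
 //   #include <optional>
 //   #include <vector>
 //
 //   std::optional<int> bestIndex(const std::vector<int> &Scores, int Limit = 0) {
 //     std::optional<int> Index;
 //     int Best = Limit;
 //     for (int I = 0, E = (int)Scores.size(); I != E; ++I)
 //       if (Scores[I] > Best) {
 //         Best = Scores[I];
 //         Index = I;
 //       }
 //     return Index;
 //   }
 //   // bestIndex({0, 3, 1}) == 1; bestIndex({0, 0}) == std::nullopt.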
2595
2596 /// Checks if the instruction is marked for deletion.
2597 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2598
2599 /// Removes an instruction from its block and eventually deletes it.
2600 /// It's like Instruction::eraseFromParent() except that the actual deletion
2601 /// is delayed until BoUpSLP is destructed.
2602 void eraseInstruction(Instruction *I) {
2603 DeletedInstructions.insert(I);
2604 }
2605
2606 /// Remove instructions from the parent function and clear the operands of \p
2607 /// DeadVals instructions, marking for deletion trivially dead operands.
2608 template <typename T>
2609 void removeInstructionsAndOperands(ArrayRef<T> DeadVals) {
2610 SmallVector<WeakTrackingVH> DeadInsts;
2611 for (T *V : DeadVals) {
2612 auto *I = cast<Instruction>(V);
2613 DeletedInstructions.insert(I);
2614 }
2615 DenseSet<Value *> Processed;
2616 for (T *V : DeadVals) {
2617 if (!V || !Processed.insert(V).second)
2618 continue;
2619 auto *I = cast<Instruction>(V);
2620 salvageDebugInfo(*I);
2621 SmallVector<const TreeEntry *> Entries;
2622 if (const TreeEntry *Entry = getTreeEntry(I)) {
2623 Entries.push_back(Entry);
2624 auto It = MultiNodeScalars.find(I);
2625 if (It != MultiNodeScalars.end())
2626 Entries.append(It->second.begin(), It->second.end());
2627 }
2628 for (Use &U : I->operands()) {
2629 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2630 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2631 wouldInstructionBeTriviallyDead(OpI, TLI) &&
2632 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2633 return Entry->VectorizedValue == OpI;
2634 })))
2635 DeadInsts.push_back(OpI);
2636 }
2637 I->dropAllReferences();
2638 }
2639 for (T *V : DeadVals) {
2640 auto *I = cast<Instruction>(V);
2641 if (!I->getParent())
2642 continue;
2643 assert((I->use_empty() || all_of(I->uses(),
2644 [&](Use &U) {
2645 return isDeleted(
2646 cast<Instruction>(U.getUser()));
2647 })) &&
2648 "trying to erase instruction with users.");
2649 I->removeFromParent();
2650 SE->forgetValue(I);
2651 }
2652 // Process the dead instruction list until empty.
2653 while (!DeadInsts.empty()) {
2654 Value *V = DeadInsts.pop_back_val();
2655 Instruction *VI = cast_or_null<Instruction>(V);
2656 if (!VI || !VI->getParent())
2657 continue;
2658 assert(isInstructionTriviallyDead(VI, TLI) &&
2659 "Live instruction found in dead worklist!");
2660 assert(VI->use_empty() && "Instructions with uses are not dead.");
2661
2662 // Don't lose the debug info while deleting the instructions.
2663 salvageDebugInfo(*VI);
2664
2665 // Null out all of the instruction's operands to see if any operand
2666 // becomes dead as we go.
2667 for (Use &OpU : VI->operands()) {
2668 Value *OpV = OpU.get();
2669 if (!OpV)
2670 continue;
2671 OpU.set(nullptr);
2672
2673 if (!OpV->use_empty())
2674 continue;
2675
2676 // If the operand is an instruction that became dead as we nulled out
2677 // the operand, and if it is 'trivially' dead, delete it in a future
2678 // loop iteration.
2679 if (auto *OpI = dyn_cast<Instruction>(OpV))
2680 if (!DeletedInstructions.contains(OpI) &&
2681 isInstructionTriviallyDead(OpI, TLI))
2682 DeadInsts.push_back(OpI);
2683 }
2684
2685 VI->removeFromParent();
2686 DeletedInstructions.insert(VI);
2687 SE->forgetValue(VI);
2688 }
2689 }
2690
2691 /// Checks if the instruction was already analyzed for being possible
2692 /// reduction root.
2693 bool isAnalyzedReductionRoot(Instruction *I) const {
2694 return AnalyzedReductionsRoots.count(I);
2695 }
2696 /// Register given instruction as already analyzed for being possible
2697 /// reduction root.
2698 void analyzedReductionRoot(Instruction *I) {
2699 AnalyzedReductionsRoots.insert(I);
2700 }
2701 /// Checks if the provided list of reduced values was checked already for
2702 /// vectorization.
2703 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2704 return AnalyzedReductionVals.contains(hash_value(VL));
2705 }
2706 /// Adds the list of reduced values to list of already checked values for the
2707 /// vectorization.
2708 void analyzedReductionVals(ArrayRef<Value *> VL) {
2709 AnalyzedReductionVals.insert(hash_value(VL));
2710 }
2711 /// Clear the list of the analyzed reduction root instructions.
2712 void clearReductionData() {
2713 AnalyzedReductionsRoots.clear();
2714 AnalyzedReductionVals.clear();
2715 AnalyzedMinBWVals.clear();
2716 }
2717 /// Checks if the given value is gathered in one of the nodes.
2718 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2719 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2720 }
2721 /// Checks if the given value is gathered in one of the nodes.
2722 bool isGathered(const Value *V) const {
2723 return MustGather.contains(V);
2724 }
2725 /// Checks if the specified value was not scheduled.
2726 bool isNotScheduled(const Value *V) const {
2727 return NonScheduledFirst.contains(V);
2728 }
2729
2730 /// Check if the value is vectorized in the tree.
2731 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2732
2733 ~BoUpSLP();
2734
2735private:
2736 /// Determine if a node \p E can be demoted to a smaller type with a
2737 /// truncation. We collect the entries that will be demoted in ToDemote.
2738 /// \param E Node for analysis
2739 /// \param ToDemote indices of the nodes to be demoted.
2740 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2741 unsigned &BitWidth,
2742 SmallVectorImpl<unsigned> &ToDemote,
2744 unsigned &MaxDepthLevel,
2745 bool &IsProfitableToDemote,
2746 bool IsTruncRoot) const;
2747
2748 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2749 /// reordering (i.e. the operands can be reordered because they have only one
2750 /// user and are reorderable).
2751 /// \param ReorderableGathers List of all gather nodes that require reordering
2752 /// (e.g., gathers of extractelements or partially vectorizable loads).
2753 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2754 /// reordering, subset of \p NonVectorized.
2755 bool
2756 canReorderOperands(TreeEntry *UserTE,
2757 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2758 ArrayRef<TreeEntry *> ReorderableGathers,
2759 SmallVectorImpl<TreeEntry *> &GatherOps);
2760
2761 /// Checks if the given \p TE is a gather node with clustered reused scalars
2762 /// and reorders it per given \p Mask.
2763 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2764
2765 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2766 /// if any. If it is not vectorized (gather node), returns nullptr.
2767 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2768 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2769 TreeEntry *TE = nullptr;
2770 const auto *It = find_if(VL, [&](Value *V) {
2771 TE = getTreeEntry(V);
2772 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2773 return true;
2774 auto It = MultiNodeScalars.find(V);
2775 if (It != MultiNodeScalars.end()) {
2776 for (TreeEntry *E : It->second) {
2777 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2778 TE = E;
2779 return true;
2780 }
2781 }
2782 }
2783 return false;
2784 });
2785 if (It != VL.end()) {
2786 assert(TE->isSame(VL) && "Expected same scalars.");
2787 return TE;
2788 }
2789 return nullptr;
2790 }
2791
2792 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2793 /// if any. If it is not vectorized (gather node), returns nullptr.
2794 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2795 unsigned OpIdx) const {
2796 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2797 const_cast<TreeEntry *>(UserTE), OpIdx);
2798 }
2799
2800 /// Checks if all users of \p I are the part of the vectorization tree.
2801 bool areAllUsersVectorized(
2802 Instruction *I,
2803 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2804
2805 /// Return information about the vector formed for the specified index
2806 /// of a vector of (the same) instruction.
2808
2809 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2810 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2811
2812 /// \returns Cast context for the given graph node.
2813 TargetTransformInfo::CastContextHint
2814 getCastContextHint(const TreeEntry &TE) const;
2815
2816 /// \returns the cost of the vectorizable entry.
2817 InstructionCost getEntryCost(const TreeEntry *E,
2818 ArrayRef<Value *> VectorizedVals,
2819 SmallPtrSetImpl<Value *> &CheckedExtracts);
2820
2821 /// This is the recursive part of buildTree.
2822 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2823 const EdgeInfo &EI);
2824
2825 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2826 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2827 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2828 /// returns false, setting \p CurrentOrder to either an empty vector or a
2829 /// non-identity permutation that allows to reuse extract instructions.
2830 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2831 /// extract order.
2832 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2833 SmallVectorImpl<unsigned> &CurrentOrder,
2834 bool ResizeAllowed = false) const;
2835
2836 /// Vectorize a single entry in the tree.
2837 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2838 /// avoid issues with def-use order.
2839 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2840
2841 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2842 /// \p E.
2843 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2844 /// avoid issues with def-use order.
2845 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2846
2847 /// Create a new vector from a list of scalar values. Produces a sequence
2848 /// which exploits values reused across lanes, and arranges the inserts
2849 /// for ease of later optimization.
2850 template <typename BVTy, typename ResTy, typename... Args>
2851 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2852
2853 /// Create a new vector from a list of scalar values. Produces a sequence
2854 /// which exploits values reused across lanes, and arranges the inserts
2855 /// for ease of later optimization.
2856 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2857
2858 /// Returns the instruction in the bundle, which can be used as a base point
2859 /// for scheduling. Usually it is the last instruction in the bundle, except
2860 /// for the case when all operands are external (in this case, it is the first
2861 /// instruction in the list).
2862 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2863
2864 /// Tries to find extractelement instructions with constant indices from fixed
2865 /// vector type and gather such instructions into a bunch, which is highly
2866 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
2867 /// was successful, the matched scalars are replaced by poison values in \p VL
2868 /// for future analysis.
2869 std::optional<TargetTransformInfo::ShuffleKind>
2870 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2871 SmallVectorImpl<int> &Mask) const;
2872
2873 /// Tries to find extractelement instructions with constant indices from fixed
2874 /// vector type and gather such instructions into a bunch, which is highly
2875 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
2876 /// was successful, the matched scalars are replaced by poison values in \p VL
2877 /// for future analysis.
2879 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2881 unsigned NumParts) const;
2882
2883 /// Checks if the gathered \p VL can be represented as a single register
2884 /// shuffle(s) of previous tree entries.
2885 /// \param TE Tree entry checked for permutation.
2886 /// \param VL List of scalars (a subset of the TE scalar), checked for
2887 /// permutations. Must form single-register vector.
2888 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2889 /// forces the mask to be built using the original vector value, without
2890 /// relying on the potential reordering.
2891 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2892 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2893 std::optional<TargetTransformInfo::ShuffleKind>
2894 isGatherShuffledSingleRegisterEntry(
2895 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2896 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2897 bool ForOrder);
2898
2899 /// Checks if the gathered \p VL can be represented as multi-register
2900 /// shuffle(s) of previous tree entries.
2901 /// \param TE Tree entry checked for permutation.
2902 /// \param VL List of scalars (a subset of the TE scalar), checked for
2903 /// permutations.
2904 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2905 /// forces the mask to be built using the original vector value, without
2906 /// relying on the potential reordering.
2907 /// \returns per-register series of ShuffleKind, if gathered values can be
2908 /// represented as shuffles of previous tree entries. \p Mask is filled with
2909 /// the shuffle mask (also on per-register base).
2911 isGatherShuffledEntry(
2912 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2914 unsigned NumParts, bool ForOrder = false);
2915
2916 /// \returns the scalarization cost for this list of values. Assuming that
2917 /// this subtree gets vectorized, we may need to extract the values from the
2918 /// roots. This method calculates the cost of extracting the values.
2919 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2920 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2921 Type *ScalarTy) const;
2922
2923 /// Set the Builder insert point to one after the last instruction in
2924 /// the bundle
2925 void setInsertPointAfterBundle(const TreeEntry *E);
2926
2927 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
2928 /// specified, the starting vector value is poison.
2929 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2930
2931 /// \returns whether the VectorizableTree is fully vectorizable and will
2932 /// be beneficial even when the tree height is tiny.
2933 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2934
2935 /// Reorder commutative or alt operands to get better probability of
2936 /// generating vectorized code.
2937 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2938 SmallVectorImpl<Value *> &Left,
2939 SmallVectorImpl<Value *> &Right,
2940 const BoUpSLP &R);
2941
2942 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2943 /// users of \p TE and collects the stores. It returns the map from the store
2944 /// pointers to the collected stores.
2945 DenseMap<Value *, SmallVector<StoreInst *>>
2946 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2947
2948 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2949 /// stores in \p StoresVec can form a vector instruction. If so it returns
2950 /// true and populates \p ReorderIndices with the shuffle indices of the
2951 /// stores when compared to the sorted vector.
2952 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2953 OrdersType &ReorderIndices) const;
2954
2955 /// Iterates through the users of \p TE, looking for scalar stores that can be
2956 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2957 /// their order and builds an order index vector for each store bundle. It
2958 /// returns all these order vectors found.
2959 /// We run this after the tree has formed, otherwise we may come across user
2960 /// instructions that are not yet in the tree.
2961 SmallVector<OrdersType, 1>
2962 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2963
2964 struct TreeEntry {
2965 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2966 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2967
2968 /// \returns Common mask for reorder indices and reused scalars.
2969 SmallVector<int> getCommonMask() const {
2970 SmallVector<int> Mask;
2971 inversePermutation(ReorderIndices, Mask);
2972 ::addMask(Mask, ReuseShuffleIndices);
2973 return Mask;
2974 }
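    // Illustrative trace (a sketch; assumes inversePermutation() builds
    // Mask[Indices[I]] = I and ::addMask() composes Result[I] = Mask[SubMask[I]],
    // as their uses elsewhere in this file suggest):
    //   ReorderIndices      = {1, 2, 0}     -> inverse Mask = {2, 0, 1}
    //   ReuseShuffleIndices = {0, 0, 1, 2}
    //   getCommonMask()     = {Mask[0], Mask[0], Mask[1], Mask[2]} = {2, 2, 0, 1}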
2975
2976 /// \returns true if the scalars in VL are equal to this entry.
2977 bool isSame(ArrayRef<Value *> VL) const {
2978 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2979 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2980 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2981 return VL.size() == Mask.size() &&
2982 std::equal(VL.begin(), VL.end(), Mask.begin(),
2983 [Scalars](Value *V, int Idx) {
2984 return (isa<UndefValue>(V) &&
2985 Idx == PoisonMaskElem) ||
2986 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2987 });
2988 };
2989 if (!ReorderIndices.empty()) {
2990 // TODO: implement matching if the nodes are just reordered, still can
2991 // treat the vector as the same if the list of scalars matches VL
2992 // directly, without reordering.
2993 SmallVector<int> Mask;
2994 inversePermutation(ReorderIndices, Mask);
2995 if (VL.size() == Scalars.size())
2996 return IsSame(Scalars, Mask);
2997 if (VL.size() == ReuseShuffleIndices.size()) {
2998 ::addMask(Mask, ReuseShuffleIndices);
2999 return IsSame(Scalars, Mask);
3000 }
3001 return false;
3002 }
3003 return IsSame(Scalars, ReuseShuffleIndices);
3004 }
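    // Illustrative trace of isSame() (a sketch; assumes inversePermutation()
    // builds Mask[Indices[I]] = I): for Scalars = {a, b, c} and
    // ReorderIndices = {1, 2, 0}, the inverse mask is {2, 0, 1}, so a gathered
    // list VL of the same size matches this entry when VL[I] == Scalars[Mask[I]],
    // i.e. VL = {c, a, b}. Undef values in VL are also accepted against
    // PoisonMaskElem slots of the mask.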
3005
3006 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3007 return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3008 UserTreeIndices.front().UserTE == UserEI.UserTE;
3009 }
3010
3011 /// \returns true if the current entry has the same operands as \p TE.
3012 bool hasEqualOperands(const TreeEntry &TE) const {
3013 if (TE.getNumOperands() != getNumOperands())
3014 return false;
3015 SmallBitVector Used(getNumOperands());
3016 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3017 unsigned PrevCount = Used.count();
3018 for (unsigned K = 0; K < E; ++K) {
3019 if (Used.test(K))
3020 continue;
3021 if (getOperand(K) == TE.getOperand(I)) {
3022 Used.set(K);
3023 break;
3024 }
3025 }
3026 // Check if we actually found the matching operand.
3027 if (PrevCount == Used.count())
3028 return false;
3029 }
3030 return true;
3031 }
3032
3033 /// \return Final vectorization factor for the node. Defined by the total
3034 /// number of vectorized scalars, including those used several times in the
3035 /// entry and counted in \a ReuseShuffleIndices, if any.
3036 unsigned getVectorFactor() const {
3037 if (!ReuseShuffleIndices.empty())
3038 return ReuseShuffleIndices.size();
3039 return Scalars.size();
3040 };
3041
3042 /// Checks if the current node is a gather node.
3043 bool isGather() const { return State == NeedToGather; }
3044
3045 /// A vector of scalars.
3046 ValueList Scalars;
3047
3048 /// The Scalars are vectorized into this value. It is initialized to Null.
3049 WeakTrackingVH VectorizedValue = nullptr;
3050
3051 /// New vector phi instructions emitted for the vectorized phi nodes.
3052 PHINode *PHI = nullptr;
3053
3054 /// Do we need to gather this sequence or vectorize it
3055 /// (either with vector instruction or with scatter/gather
3056 /// intrinsics for store/load)?
3057 enum EntryState {
3058 Vectorize, ///< The node is regularly vectorized.
3059 ScatterVectorize, ///< Masked scatter/gather node.
3060 StridedVectorize, ///< Strided loads (and stores)
3061 NeedToGather, ///< Gather/buildvector node.
3062 CombinedVectorize, ///< Vectorized node, combined with its user into more
3063 ///< complex node like select/cmp to minmax, mul/add to
3064 ///< fma, etc. Must be used for the following nodes in
3065 ///< the pattern, not the very first one.
3066 };
3067 EntryState State;
3068
3069 /// List of combined opcodes supported by the vectorizer.
3070 enum CombinedOpcode {
3071 NotCombinedOp = -1,
3072 MinMax = Instruction::OtherOpsEnd + 1,
3073 };
3074 CombinedOpcode CombinedOp = NotCombinedOp;
3075
3076 /// Does this sequence require some shuffling?
3077 SmallVector<int, 4> ReuseShuffleIndices;
3078
3079 /// Does this entry require reordering?
3080 SmallVector<unsigned, 4> ReorderIndices;
3081
3082 /// Points back to the VectorizableTree.
3083 ///
3084 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3085 /// to be a pointer and needs to be able to initialize the child iterator.
3086 /// Thus we need a reference back to the container to translate the indices
3087 /// to entries.
3088 VecTreeTy &Container;
3089
3090 /// The TreeEntry index containing the user of this entry. We can actually
3091 /// have multiple users so the data structure is not truly a tree.
3092 SmallVector<EdgeInfo, 1> UserTreeIndices;
3093
3094 /// The index of this treeEntry in VectorizableTree.
3095 int Idx = -1;
3096
3097 private:
3098 /// The operands of each instruction in each lane Operands[op_index][lane].
3099 /// Note: This helps avoid the replication of the code that performs the
3100 /// reordering of operands during buildTree_rec() and vectorizeTree().
3101 SmallVector<ValueList, 2> Operands;
3102
3103 /// The main/alternate instruction.
3104 Instruction *MainOp = nullptr;
3105 Instruction *AltOp = nullptr;
3106
3107 public:
3108 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3109 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3110 if (Operands.size() < OpIdx + 1)
3111 Operands.resize(OpIdx + 1);
3112 assert(Operands[OpIdx].empty() && "Already resized?");
3113 assert(OpVL.size() <= Scalars.size() &&
3114 "Number of operands is greater than the number of scalars.");
3115 Operands[OpIdx].resize(OpVL.size());
3116 copy(OpVL, Operands[OpIdx].begin());
3117 }
3118
3119 /// Set the operands of this bundle in their original order.
3120 void setOperandsInOrder() {
3121 assert(Operands.empty() && "Already initialized?");
3122 auto *I0 = cast<Instruction>(Scalars[0]);
3123 Operands.resize(I0->getNumOperands());
3124 unsigned NumLanes = Scalars.size();
3125 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3126 OpIdx != NumOperands; ++OpIdx) {
3127 Operands[OpIdx].resize(NumLanes);
3128 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3129 auto *I = cast<Instruction>(Scalars[Lane]);
3130 assert(I->getNumOperands() == NumOperands &&
3131 "Expected same number of operands");
3132 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
3133 }
3134 }
3135 }
3136
3137 /// Reorders operands of the node to the given mask \p Mask.
3138 void reorderOperands(ArrayRef<int> Mask) {
3139 for (ValueList &Operand : Operands)
3140 reorderScalars(Operand, Mask);
3141 }
3142
3143 /// \returns the \p OpIdx operand of this TreeEntry.
3144 ValueList &getOperand(unsigned OpIdx) {
3145 assert(OpIdx < Operands.size() && "Off bounds");
3146 return Operands[OpIdx];
3147 }
3148
3149 /// \returns the \p OpIdx operand of this TreeEntry.
3150 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3151 assert(OpIdx < Operands.size() && "Off bounds");
3152 return Operands[OpIdx];
3153 }
3154
3155 /// \returns the number of operands.
3156 unsigned getNumOperands() const { return Operands.size(); }
3157
3158 /// \return the single \p OpIdx operand.
3159 Value *getSingleOperand(unsigned OpIdx) const {
3160 assert(OpIdx < Operands.size() && "Off bounds");
3161 assert(!Operands[OpIdx].empty() && "No operand available");
3162 return Operands[OpIdx][0];
3163 }
3164
3165 /// Some of the instructions in the list have alternate opcodes.
3166 bool isAltShuffle() const { return MainOp != AltOp; }
3167
3168 bool isOpcodeOrAlt(Instruction *I) const {
3169 unsigned CheckedOpcode = I->getOpcode();
3170 return (getOpcode() == CheckedOpcode ||
3171 getAltOpcode() == CheckedOpcode);
3172 }
3173
3174 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3175 /// alternate) opcode as the main operation of this entry, the key is \p Op.
3176 /// Otherwise the key is the main operation (MainOp).
3177 Value *isOneOf(Value *Op) const {
3178 auto *I = dyn_cast<Instruction>(Op);
3179 if (I && isOpcodeOrAlt(I))
3180 return Op;
3181 return MainOp;
3182 }
3183
3184 void setOperations(const InstructionsState &S) {
3185 MainOp = S.MainOp;
3186 AltOp = S.AltOp;
3187 }
3188
3189 Instruction *getMainOp() const {
3190 return MainOp;
3191 }
3192
3193 Instruction *getAltOp() const {
3194 return AltOp;
3195 }
3196
3197 /// The main/alternate opcodes for the list of instructions.
3198 unsigned getOpcode() const {
3199 return MainOp ? MainOp->getOpcode() : 0;
3200 }
3201
3202 unsigned getAltOpcode() const {
3203 return AltOp ? AltOp->getOpcode() : 0;
3204 }
3205
3206 /// Returns the position of \p V within the vector of Scalars, remapped
3207 /// through ReorderIndices and ReuseShuffleIndices when they are not empty.
3208 int findLaneForValue(Value *V) const {
3209 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
3210 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3211 if (!ReorderIndices.empty())
3212 FoundLane = ReorderIndices[FoundLane];
3213 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3214 if (!ReuseShuffleIndices.empty()) {
3215 FoundLane = std::distance(ReuseShuffleIndices.begin(),
3216 find(ReuseShuffleIndices, FoundLane));
3217 }
3218 return FoundLane;
3219 }
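    // Mechanical trace of the remapping above (an illustrative sketch): with
    // Scalars = {a, b, c, d}, V = c gives FoundLane = 2; applying
    // ReorderIndices = {3, 2, 1, 0} gives ReorderIndices[2] = 1; with
    // ReuseShuffleIndices = {1, 1, 0, 3} the first slot holding 1 is slot 0, so
    // the value is extracted from lane 0 of the vectorized value.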
3220
3221 /// Build a shuffle mask for graph entry which represents a merge of main
3222 /// and alternate operations.
3223 void
3224 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3225 SmallVectorImpl<int> &Mask,
3226 SmallVectorImpl<Value *> *OpScalars = nullptr,
3227 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3228
3229 /// Return true if this is a non-power-of-2 node.
3230 bool isNonPowOf2Vec() const {
3231 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
3232 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3233 "Reshuffling not supported with non-power-of-2 vectors yet.");
3234 return IsNonPowerOf2;
3235 }
3236
3237#ifndef NDEBUG
3238 /// Debug printer.
3239 LLVM_DUMP_METHOD void dump() const {
3240 dbgs() << Idx << ".\n";
3241 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3242 dbgs() << "Operand " << OpI << ":\n";
3243 for (const Value *V : Operands[OpI])
3244 dbgs().indent(2) << *V << "\n";
3245 }
3246 dbgs() << "Scalars: \n";
3247 for (Value *V : Scalars)
3248 dbgs().indent(2) << *V << "\n";
3249 dbgs() << "State: ";
3250 switch (State) {
3251 case Vectorize:
3252 dbgs() << "Vectorize\n";
3253 break;
3254 case ScatterVectorize:
3255 dbgs() << "ScatterVectorize\n";
3256 break;
3257 case StridedVectorize:
3258 dbgs() << "StridedVectorize\n";
3259 break;
3260 case NeedToGather:
3261 dbgs() << "NeedToGather\n";
3262 break;
3263 case CombinedVectorize:
3264 dbgs() << "CombinedVectorize\n";
3265 break;
3266 }
3267 dbgs() << "MainOp: ";
3268 if (MainOp)
3269 dbgs() << *MainOp << "\n";
3270 else
3271 dbgs() << "NULL\n";
3272 dbgs() << "AltOp: ";
3273 if (AltOp)
3274 dbgs() << *AltOp << "\n";
3275 else
3276 dbgs() << "NULL\n";
3277 dbgs() << "VectorizedValue: ";
3278 if (VectorizedValue)
3279 dbgs() << *VectorizedValue << "\n";
3280 else
3281 dbgs() << "NULL\n";
3282 dbgs() << "ReuseShuffleIndices: ";
3283 if (ReuseShuffleIndices.empty())
3284 dbgs() << "Empty";
3285 else
3286 for (int ReuseIdx : ReuseShuffleIndices)
3287 dbgs() << ReuseIdx << ", ";
3288 dbgs() << "\n";
3289 dbgs() << "ReorderIndices: ";
3290 for (unsigned ReorderIdx : ReorderIndices)
3291 dbgs() << ReorderIdx << ", ";
3292 dbgs() << "\n";
3293 dbgs() << "UserTreeIndices: ";
3294 for (const auto &EInfo : UserTreeIndices)
3295 dbgs() << EInfo << ", ";
3296 dbgs() << "\n";
3297 }
3298#endif
3299 };
3300
3301#ifndef NDEBUG
3302 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3303 InstructionCost VecCost, InstructionCost ScalarCost,
3304 StringRef Banner) const {
3305 dbgs() << "SLP: " << Banner << ":\n";
3306 E->dump();
3307 dbgs() << "SLP: Costs:\n";
3308 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3309 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3310 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3311 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3312 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3313 }
3314#endif
3315
3316 /// Create a new VectorizableTree entry.
3317 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3318 std::optional<ScheduleData *> Bundle,
3319 const InstructionsState &S,
3320 const EdgeInfo &UserTreeIdx,
3321 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3322 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3323 TreeEntry::EntryState EntryState =
3324 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3325 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3326 ReuseShuffleIndices, ReorderIndices);
3327 }
3328
3329 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3330 TreeEntry::EntryState EntryState,
3331 std::optional<ScheduleData *> Bundle,
3332 const InstructionsState &S,
3333 const EdgeInfo &UserTreeIdx,
3334 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3335 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3336 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3337 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3338 "Need to vectorize gather entry?");
3339 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3340 TreeEntry *Last = VectorizableTree.back().get();
3341 Last->Idx = VectorizableTree.size() - 1;
3342 Last->State = EntryState;
3343 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3344 ReuseShuffleIndices.end());
3345 if (ReorderIndices.empty()) {
3346 Last->Scalars.assign(VL.begin(), VL.end());
3347 Last->setOperations(S);
3348 } else {
3349 // Reorder scalars and build final mask.
3350 Last->Scalars.assign(VL.size(), nullptr);
3351 transform(ReorderIndices, Last->Scalars.begin(),
3352 [VL](unsigned Idx) -> Value * {
3353 if (Idx >= VL.size())
3354 return UndefValue::get(VL.front()->getType());
3355 return VL[Idx];
3356 });
3357 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3358 Last->setOperations(S);
3359 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3360 }
3361 if (!Last->isGather()) {
3362 for (Value *V : VL) {
3363 const TreeEntry *TE = getTreeEntry(V);
3364 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3365 "Scalar already in tree!");
3366 if (TE) {
3367 if (TE != Last)
3368 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3369 continue;
3370 }
3371 ScalarToTreeEntry[V] = Last;
3372 }
3373 // Update the scheduler bundle to point to this TreeEntry.
3374 ScheduleData *BundleMember = *Bundle;
3375 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3376 isVectorLikeInstWithConstOps(S.MainOp) ||
3377 doesNotNeedToSchedule(VL)) &&
3378 "Bundle and VL out of sync");
3379 if (BundleMember) {
3380 for (Value *V : VL) {
3381 if (doesNotNeedToBeScheduled(V))
3382 continue;
3383 if (!BundleMember)
3384 continue;
3385 BundleMember->TE = Last;
3386 BundleMember = BundleMember->NextInBundle;
3387 }
3388 }
3389 assert(!BundleMember && "Bundle and VL out of sync");
3390 } else {
3391 // Build a map for gathered scalars to the nodes where they are used.
3392 bool AllConstsOrCasts = true;
3393 for (Value *V : VL)
3394 if (!isConstant(V)) {
3395 auto *I = dyn_cast<CastInst>(V);
3396 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3397 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3398 }
3399 if (AllConstsOrCasts)
3400 CastMaxMinBWSizes =
3401 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3402 MustGather.insert(VL.begin(), VL.end());
3403 }
3404
3405 if (UserTreeIdx.UserTE) {
3406 Last->UserTreeIndices.push_back(UserTreeIdx);
3407 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3408 "Reordering isn't implemented for non-power-of-2 nodes yet");
3409 }
3410 return Last;
3411 }
3412
3413 /// -- Vectorization State --
3414 /// Holds all of the tree entries.
3415 TreeEntry::VecTreeTy VectorizableTree;
3416
3417#ifndef NDEBUG
3418 /// Debug printer.
3419 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3420 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3421 VectorizableTree[Id]->dump();
3422 dbgs() << "\n";
3423 }
3424 }
3425#endif
3426
3427 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3428
3429 const TreeEntry *getTreeEntry(Value *V) const {
3430 return ScalarToTreeEntry.lookup(V);
3431 }
3432
3433 /// Checks that the operand node of an alternate node does not generate a
3434 /// buildvector sequence. If it does, it is probably not worth building the
3435 /// alternate shuffle: the number of buildvector operands plus the alternate
3436 /// instruction would exceed the number of buildvector instructions.
3437 /// \param S the instructions state of the analyzed values.
3438 /// \param VL list of the instructions with alternate opcodes.
3439 bool areAltOperandsProfitable(const InstructionsState &S,
3440 ArrayRef<Value *> VL) const;
3441
3442 /// Checks if the specified list of the instructions/values can be vectorized
3443 /// and fills required data before actual scheduling of the instructions.
3444 TreeEntry::EntryState getScalarsVectorizationState(
3445 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3446 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3447
3448 /// Maps a specific scalar to its tree entry.
3449 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3450
3451 /// List of scalars used in several vectorized nodes, mapped to the list of
3452 /// those nodes.
3453 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3454
3455 /// Maps a value to the proposed vectorizable size.
3456 SmallDenseMap<Value *, unsigned> InstrElementSize;
3457
3458 /// A list of scalars that we found that we need to keep as scalars.
3459 ValueSet MustGather;
3460
3461 /// A set of first non-schedulable values.
3462 ValueSet NonScheduledFirst;
3463
3464 /// A map between the vectorized entries and the last instructions in the
3465 /// bundles. The bundles are built in use order, not in the def order of the
3466 /// instructions. So, we cannot rely directly on the last instruction in the
3467 /// bundle being the last instruction in program order during the
3468 /// vectorization process, since the basic blocks are modified; the last
3469 /// instructions need to be pre-gathered beforehand.
3470 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3471
3472 /// List of gather nodes that depend on other gather/vector nodes and should
3473 /// be emitted after the vector instruction emission process, to correctly
3474 /// handle the order of the vector instructions and shuffles.
3475 SetVector<const TreeEntry *> PostponedGathers;
3476
3477 using ValueToGatherNodesMap =
3478 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3479 ValueToGatherNodesMap ValueToGatherNodes;
3480
3481 /// This POD struct describes one external user in the vectorized tree.
3482 struct ExternalUser {
3483 ExternalUser(Value *S, llvm::User *U, int L)
3484 : Scalar(S), User(U), Lane(L) {}
3485
3486 // Which scalar in our function.
3487 Value *Scalar;
3488
3489 // Which user uses the scalar.
3490 llvm::User *User;
3491
3492 // Which lane does the scalar belong to.
3493 int Lane;
3494 };
3495 using UserList = SmallVector<ExternalUser, 16>;
3496
3497 /// Checks if two instructions may access the same memory.
3498 ///
3499 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3500 /// is invariant in the calling loop.
3501 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3502 Instruction *Inst2) {
3503 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3504 return true;
3505 // First check if the result is already in the cache.
3506 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3507 auto It = AliasCache.find(Key);
3508 if (It != AliasCache.end())
3509 return It->second;
3510 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3511 // Store the result in the cache.
3512 AliasCache.try_emplace(Key, Aliased);
3513 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3514 return Aliased;
3515 }
3516
3517 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3518
3519 /// Cache for alias results.
3520 /// TODO: consider moving this to the AliasAnalysis itself.
3521 DenseMap<AliasCacheKey, bool> AliasCache;
3522
3523 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3524 // globally through SLP because we don't perform any action which
3525 // invalidates capture results.
3526 BatchAAResults BatchAA;
3527
3528 /// Temporary store for deleted instructions. Instructions will be deleted
3529 /// eventually when the BoUpSLP is destructed. The deferral is required to
3530 /// ensure that there are no incorrect collisions in the AliasCache, which
3531 /// can happen if a new instruction is allocated at the same address as a
3532 /// previously deleted instruction.
3533 DenseSet<Instruction *> DeletedInstructions;
3534
3535 /// Set of instructions already analyzed for reductions.
3536 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3537
3538 /// Set of hashes for the list of reduction values already being analyzed.
3539 DenseSet<size_t> AnalyzedReductionVals;
3540
3541 /// Values that have already been analyzed for minimal bitwidth and found
3542 /// to be non-profitable.
3543 DenseSet<Value *> AnalyzedMinBWVals;
3544
3545 /// A list of values that need to be extracted out of the tree.
3546 /// This list holds pairs of (Internal Scalar : External User). External User
3547 /// can be nullptr, it means that this Internal Scalar will be used later,
3548 /// after vectorization.
3549 UserList ExternalUses;
3550
3551 /// A list of GEPs which can be replaced by scalar GEPs instead of
3552 /// extractelement instructions.
3553 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3554
3555 /// Values used only by @llvm.assume calls.
3556 SmallPtrSet<const Value *, 32> EphValues;
3557
3558 /// Holds all of the instructions that we gathered, shuffle instructions and
3559 /// extractelements.
3560 SetVector<Instruction *> GatherShuffleExtractSeq;
3561
3562 /// A list of blocks that we are going to CSE.
3563 DenseSet<BasicBlock *> CSEBlocks;
3564
3565 /// Contains all scheduling relevant data for an instruction.
3566 /// A ScheduleData either represents a single instruction or a member of an
3567 /// instruction bundle (= a group of instructions which is combined into a
3568 /// vector instruction).
3569 struct ScheduleData {
3570 // The initial value for the dependency counters. It means that the
3571 // dependencies are not calculated yet.
3572 enum { InvalidDeps = -1 };
3573
3574 ScheduleData() = default;
3575
3576 void init(int BlockSchedulingRegionID, Instruction *I) {
3577 FirstInBundle = this;
3578 NextInBundle = nullptr;
3579 NextLoadStore = nullptr;
3580 IsScheduled = false;
3581 SchedulingRegionID = BlockSchedulingRegionID;
3582 clearDependencies();
3583 Inst = I;
3584 TE = nullptr;
3585 }
3586
3587 /// Verify basic self consistency properties
3588 void verify() {
3589 if (hasValidDependencies()) {
3590 assert(UnscheduledDeps <= Dependencies && "invariant");
3591 } else {
3592 assert(UnscheduledDeps == Dependencies && "invariant");
3593 }
3594
3595 if (IsScheduled) {
3596 assert(isSchedulingEntity() &&
3597 "unexpected scheduled state");
3598 for (const ScheduleData *BundleMember = this; BundleMember;
3599 BundleMember = BundleMember->NextInBundle) {
3600 assert(BundleMember->hasValidDependencies() &&
3601 BundleMember->UnscheduledDeps == 0 &&
3602 "unexpected scheduled state");
3603 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3604 "only bundle is marked scheduled");
3605 }
3606 }
3607
3608 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3609 "all bundle members must be in same basic block");
3610 }
3611
3612 /// Returns true if the dependency information has been calculated.
3613 /// Note that dependency validity can vary between instructions within
3614 /// a single bundle.
3615 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3616
3617 /// Returns true for single instructions and for bundle representatives
3618 /// (= the head of a bundle).
3619 bool isSchedulingEntity() const { return FirstInBundle == this; }
3620
3621 /// Returns true if it represents an instruction bundle and not only a
3622 /// single instruction.
3623 bool isPartOfBundle() const {
3624 return NextInBundle != nullptr || FirstInBundle != this || TE;
3625 }
3626
3627 /// Returns true if it is ready for scheduling, i.e. it has no more
3628 /// unscheduled depending instructions/bundles.
3629 bool isReady() const {
3630 assert(isSchedulingEntity() &&
3631 "can't consider non-scheduling entity for ready list");
3632 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3633 }
3634
3635 /// Modifies the number of unscheduled dependencies for this instruction,
3636 /// and returns the number of remaining dependencies for the containing
3637 /// bundle.
3638 int incrementUnscheduledDeps(int Incr) {
3639 assert(hasValidDependencies() &&
3640 "increment of unscheduled deps would be meaningless");
3641 UnscheduledDeps += Incr;
3642 return FirstInBundle->unscheduledDepsInBundle();
3643 }
3644
3645 /// Sets the number of unscheduled dependencies to the number of
3646 /// dependencies.
3647 void resetUnscheduledDeps() {
3648 UnscheduledDeps = Dependencies;
3649 }
3650
3651 /// Clears all dependency information.
3652 void clearDependencies() {
3653 Dependencies = InvalidDeps;
3654 resetUnscheduledDeps();
3655 MemoryDependencies.clear();
3656 ControlDependencies.clear();
3657 }
3658
3659 int unscheduledDepsInBundle() const {
3660 assert(isSchedulingEntity() && "only meaningful on the bundle");
3661 int Sum = 0;
3662 for (const ScheduleData *BundleMember = this; BundleMember;
3663 BundleMember = BundleMember->NextInBundle) {
3664 if (BundleMember->UnscheduledDeps == InvalidDeps)
3665 return InvalidDeps;
3666 Sum += BundleMember->UnscheduledDeps;
3667 }
3668 return Sum;
3669 }
3670
3671 void dump(raw_ostream &os) const {
3672 if (!isSchedulingEntity()) {
3673 os << "/ " << *Inst;
3674 } else if (NextInBundle) {
3675 os << '[' << *Inst;
3676 ScheduleData *SD = NextInBundle;
3677 while (SD) {
3678 os << ';' << *SD->Inst;
3679 SD = SD->NextInBundle;
3680 }
3681 os << ']';
3682 } else {
3683 os << *Inst;
3684 }
3685 }
3686
3687 Instruction *Inst = nullptr;
3688
3689 /// The TreeEntry that this instruction corresponds to.
3690 TreeEntry *TE = nullptr;
3691
3692 /// Points to the head in an instruction bundle (and always to this for
3693 /// single instructions).
3694 ScheduleData *FirstInBundle = nullptr;
3695
3696 /// Single linked list of all instructions in a bundle. Null if it is a
3697 /// single instruction.
3698 ScheduleData *NextInBundle = nullptr;
3699
3700 /// Single linked list of all memory instructions (e.g. load, store, call)
3701 /// in the block - until the end of the scheduling region.
3702 ScheduleData *NextLoadStore = nullptr;
3703
3704 /// The dependent memory instructions.
3705 /// This list is derived on demand in calculateDependencies().
3706 SmallVector<ScheduleData *, 4> MemoryDependencies;
3707
3708 /// List of instructions which this instruction could be control dependent
3709 /// on. Allowing such nodes to be scheduled below this one could introduce
3710 /// a runtime fault which didn't exist in the original program.
3711 /// ex: this is a load or udiv following a readonly call which inf loops
3712 SmallVector<ScheduleData *, 4> ControlDependencies;
3713
3714 /// This ScheduleData is in the current scheduling region if this matches
3715 /// the current SchedulingRegionID of BlockScheduling.
3716 int SchedulingRegionID = 0;
3717
3718 /// Used for getting a "good" final ordering of instructions.
3719 int SchedulingPriority = 0;
3720
3721 /// The number of dependencies. This is the number of users of the
3722 /// instruction plus the number of dependent memory instructions (if any).
3723 /// This value is calculated on demand.
3724 /// If InvalidDeps, the number of dependencies is not calculated yet.
3725 int Dependencies = InvalidDeps;
3726
3727 /// The number of dependencies minus the number of dependencies of scheduled
3728 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3729 /// for scheduling.
3730 /// Note that this is negative as long as Dependencies is not calculated.
3731 int UnscheduledDeps = InvalidDeps;
3732
3733 /// True if this instruction is scheduled (or considered as scheduled in the
3734 /// dry-run).
3735 bool IsScheduled = false;
3736 };
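  // Illustrative trace of the counters above (a sketch): a two-instruction
  // bundle whose members have Dependencies = 2 and 1 starts with
  // UnscheduledDeps equal to Dependencies, so unscheduledDepsInBundle() == 3.
  // Each time one of those dependencies is scheduled,
  // incrementUnscheduledDeps(-1) is called on the dependent member; once the
  // bundle-wide sum reaches 0 and the bundle itself has not been scheduled
  // yet, isReady() returns true and the bundle can be picked from the
  // ready list.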
3737
3738#ifndef NDEBUG
3739 friend inline raw_ostream &operator<<(raw_ostream &os,
3740 const BoUpSLP::ScheduleData &SD) {
3741 SD.dump(os);
3742 return os;
3743 }
3744#endif
3745
3746 friend struct GraphTraits<BoUpSLP *>;
3747 friend struct DOTGraphTraits<BoUpSLP *>;
3748
3749 /// Contains all scheduling data for a basic block.
3750 /// It does not schedule instructions which are not memory read/write
3751 /// instructions and whose operands are either constants, arguments, phis, or
3752 /// instructions from other blocks, or whose users are phis or live in other
3753 /// blocks. The resulting vector instructions can be placed at the
3754 /// beginning of the basic block without scheduling (if the operands do not
3755 /// need to be scheduled) or at the end of the block (if the users are outside
3756 /// of the block). This saves some compile time and memory used by the
3757 /// compiler.
3758 /// ScheduleData is assigned to each instruction between the boundaries of
3759 /// the tree entry, even to those which are not part of the graph. This is
3760 /// required to correctly follow the dependencies between the instructions
3761 /// and to schedule them correctly. ScheduleData is not allocated for
3762 /// instructions which do not require scheduling, such as phis, nodes with
3763 /// only extractelements/insertelements, or nodes whose instructions have
3764 /// uses/operands outside of the block.
3765 struct BlockScheduling {
3766 BlockScheduling(BasicBlock *BB)
3767 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3768
3769 void clear() {
3770 ReadyInsts.clear();
3771 ScheduleStart = nullptr;
3772 ScheduleEnd = nullptr;
3773 FirstLoadStoreInRegion = nullptr;
3774 LastLoadStoreInRegion = nullptr;
3775 RegionHasStackSave = false;
3776
3777 // Reduce the maximum schedule region size by the size of the
3778 // previous scheduling run.
3779 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3780 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3781 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3782 ScheduleRegionSize = 0;
3783
3784 // Make a new scheduling region, i.e. all existing ScheduleData is not
3785 // in the new region yet.
3786 ++SchedulingRegionID;
3787 }
3788
3789 ScheduleData *getScheduleData(Instruction *I) {
3790 if (BB != I->getParent())
3791 // Avoid lookup if can't possibly be in map.
3792 return nullptr;
3793 ScheduleData *SD = ScheduleDataMap.lookup(I);
3794 if (SD && isInSchedulingRegion(SD))
3795 return SD;
3796 return nullptr;
3797 }
3798
3799 ScheduleData *getScheduleData(Value *V) {
3800 if (auto *I = dyn_cast<Instruction>(V))
3801 return getScheduleData(I);
3802 return nullptr;
3803 }
3804
3805 bool isInSchedulingRegion(ScheduleData *SD) const {
3806 return SD->SchedulingRegionID == SchedulingRegionID;
3807 }
3808
3809 /// Marks an instruction as scheduled and puts all dependent ready
3810 /// instructions into the ready-list.
3811 template <typename ReadyListType>
3812 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3813 SD->IsScheduled = true;
3814 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3815
3816 for (ScheduleData *BundleMember = SD; BundleMember;
3817 BundleMember = BundleMember->NextInBundle) {
3818
3819 // Handle the def-use chain dependencies.
3820
3821 // Decrement the unscheduled counter and insert to ready list if ready.
3822 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3823 ScheduleData *OpDef = getScheduleData(I);
3824 if (OpDef && OpDef->hasValidDependencies() &&
3825 OpDef->incrementUnscheduledDeps(-1) == 0) {
3826 // There are no more unscheduled dependencies after
3827 // decrementing, so we can put the dependent instruction
3828 // into the ready list.
3829 ScheduleData *DepBundle = OpDef->FirstInBundle;
3830 assert(!DepBundle->IsScheduled &&
3831 "already scheduled bundle gets ready");
3832 ReadyList.insert(DepBundle);
3833 LLVM_DEBUG(dbgs()
3834 << "SLP: gets ready (def): " << *DepBundle << "\n");
3835 }
3836 };
3837
3838 // If BundleMember is a vector bundle, its operands may have been
3839 // reordered during buildTree(). We therefore need to get its operands
3840 // through the TreeEntry.
3841 if (TreeEntry *TE = BundleMember->TE) {
3842 // Need to search for the lane since the tree entry can be reordered.
3843 int Lane = std::distance(TE->Scalars.begin(),
3844 find(TE->Scalars, BundleMember->Inst));
3845 assert(Lane >= 0 && "Lane not set");
3846
3847 // Since vectorization tree is being built recursively this assertion
3848 // ensures that the tree entry has all operands set before reaching
3849 // this code. Couple of exceptions known at the moment are extracts
3850 // where their second (immediate) operand is not added. Since
3851 // immediates do not affect scheduler behavior this is considered
3852 // okay.
3853 auto *In = BundleMember->Inst;
3854 assert(
3855 In &&
3856 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3857 In->getNumOperands() == TE->getNumOperands()) &&
3858 "Missed TreeEntry operands?");
3859 (void)In; // fake use to avoid build failure when assertions disabled
3860
3861 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3862 OpIdx != NumOperands; ++OpIdx)
3863 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3864 DecrUnsched(I);
3865 } else {
3866 // If BundleMember is a stand-alone instruction, no operand reordering
3867 // has taken place, so we directly access its operands.
3868 for (Use &U : BundleMember->Inst->operands())
3869 if (auto *I = dyn_cast<Instruction>(U.get()))
3870 DecrUnsched(I);
3871 }
3872 // Handle the memory dependencies.
3873 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3874 if (MemoryDepSD->hasValidDependencies() &&
3875 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3876 // There are no more unscheduled dependencies after decrementing,
3877 // so we can put the dependent instruction into the ready list.
3878 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3879 assert(!DepBundle->IsScheduled &&
3880 "already scheduled bundle gets ready");
3881 ReadyList.insert(DepBundle);
3882 LLVM_DEBUG(dbgs()
3883 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3884 }
3885 }
3886 // Handle the control dependencies.
3887 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3888 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3889 // There are no more unscheduled dependencies after decrementing,
3890 // so we can put the dependent instruction into the ready list.
3891 ScheduleData *DepBundle = DepSD->FirstInBundle;
3892 assert(!DepBundle->IsScheduled &&
3893 "already scheduled bundle gets ready");
3894 ReadyList.insert(DepBundle);
3895 LLVM_DEBUG(dbgs()
3896 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3897 }
3898 }
3899 }
3900 }
3901
3902 /// Verify basic self consistency properties of the data structure.
3903 void verify() {
3904 if (!ScheduleStart)
3905 return;
3906
3907 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3908 ScheduleStart->comesBefore(ScheduleEnd) &&
3909 "Not a valid scheduling region?");
3910
3911 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3912 auto *SD = getScheduleData(I);
3913 if (!SD)
3914 continue;
3915 assert(isInSchedulingRegion(SD) &&
3916 "primary schedule data not in window?");
3917 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3918 "entire bundle in window!");
3919 SD->verify();
3920 }
3921
3922 for (auto *SD : ReadyInsts) {
3923 assert(SD->isSchedulingEntity() && SD->isReady() &&
3924 "item in ready list not ready?");
3925 (void)SD;
3926 }
3927 }
3928
3929 /// Put all instructions into the ReadyList which are ready for scheduling.
3930 template <typename ReadyListType>
3931 void initialFillReadyList(ReadyListType &ReadyList) {
3932 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3933 ScheduleData *SD = getScheduleData(I);
3934 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3935 SD->isReady()) {
3936 ReadyList.insert(SD);
3937 LLVM_DEBUG(dbgs()
3938 << "SLP: initially in ready list: " << *SD << "\n");
3939 }
3940 }
3941 }
3942
3943 /// Build a bundle from the ScheduleData nodes corresponding to the
3944 /// scalar instruction for each lane.
3945 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3946
3947 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3948 /// cyclic dependencies. This is only a dry-run, no instructions are
3949 /// actually moved at this stage.
3950 /// \returns the scheduling bundle. The returned Optional value is not
3951 /// std::nullopt if \p VL is allowed to be scheduled.
3952 std::optional<ScheduleData *>
3953 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3954 const InstructionsState &S);
3955
3956 /// Un-bundles a group of instructions.
3957 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3958
3959 /// Allocates schedule data chunk.
3960 ScheduleData *allocateScheduleDataChunks();
3961
3962 /// Extends the scheduling region so that V is inside the region.
3963 /// \returns true if the region size is within the limit.
3964 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3965
3966 /// Initialize the ScheduleData structures for new instructions in the
3967 /// scheduling region.
3968 void initScheduleData(Instruction *FromI, Instruction *ToI,
3969 ScheduleData *PrevLoadStore,
3970 ScheduleData *NextLoadStore);
3971
3972 /// Updates the dependency information of a bundle and of all instructions/
3973 /// bundles which depend on the original bundle.
3974 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3975 BoUpSLP *SLP);
3976
3977 /// Sets all instructions in the scheduling region to un-scheduled.
3978 void resetSchedule();
3979
3980 BasicBlock *BB;
3981
3982 /// Simple memory allocation for ScheduleData.
3983 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3984
3985 /// The size of a ScheduleData array in ScheduleDataChunks.
3986 int ChunkSize;
3987
3988 /// The allocator position in the current chunk, which is the last entry
3989 /// of ScheduleDataChunks.
3990 int ChunkPos;
3991
3992 /// Attaches ScheduleData to Instruction.
3993 /// Note that the mapping survives during all vectorization iterations, i.e.
3994 /// ScheduleData structures are recycled.
3995 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3996
3997 /// The ready-list for scheduling (only used for the dry-run).
3998 SetVector<ScheduleData *> ReadyInsts;
3999
4000 /// The first instruction of the scheduling region.
4001 Instruction *ScheduleStart = nullptr;
4002
4003 /// The first instruction _after_ the scheduling region.
4004 Instruction *ScheduleEnd = nullptr;
4005
4006 /// The first memory accessing instruction in the scheduling region
4007 /// (can be null).
4008 ScheduleData *FirstLoadStoreInRegion = nullptr;
4009
4010 /// The last memory accessing instruction in the scheduling region
4011 /// (can be null).
4012 ScheduleData *LastLoadStoreInRegion = nullptr;
4013
4014 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4015 /// region? Used to optimize the dependence calculation for the
4016 /// common case where there isn't.
4017 bool RegionHasStackSave = false;
4018
4019 /// The current size of the scheduling region.
4020 int ScheduleRegionSize = 0;
4021
4022 /// The maximum size allowed for the scheduling region.
4023 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4024
4025 /// The ID of the scheduling region. For a new vectorization iteration this
4026 /// is incremented which "removes" all ScheduleData from the region.
4027 /// Make sure that the initial SchedulingRegionID is greater than the
4028 /// initial SchedulingRegionID in ScheduleData (which is 0).
4029 int SchedulingRegionID = 1;
4030 };
4031
4032 /// Attaches the BlockScheduling structures to basic blocks.
4033 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
4034
4035 /// Performs the "real" scheduling. Done before vectorization is actually
4036 /// performed in a basic block.
4037 void scheduleBlock(BlockScheduling *BS);
4038
4039 /// List of users to ignore during scheduling and that don't need extracting.
4040 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4041
4042 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4043 /// sorted SmallVectors of unsigned.
4044 struct OrdersTypeDenseMapInfo {
4045 static OrdersType getEmptyKey() {
4046 OrdersType V;
4047 V.push_back(~1U);
4048 return V;
4049 }
4050
4051 static OrdersType getTombstoneKey() {
4052 OrdersType V;
4053 V.push_back(~2U);
4054 return V;
4055 }
4056
4057 static unsigned getHashValue(const OrdersType &V) {
4058 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4059 }
4060
4061 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4062 return LHS == RHS;
4063 }
4064 };
4065
4066 // Analysis and block reference.
4067 Function *F;
4068 ScalarEvolution *SE;
4069 TargetTransformInfo *TTI;
4070 TargetLibraryInfo *TLI;
4071 LoopInfo *LI;
4072 DominatorTree *DT;
4073 AssumptionCache *AC;
4074 DemandedBits *DB;
4075 const DataLayout *DL;
4076 OptimizationRemarkEmitter *ORE;
4077
4078 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4079 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4080
4081 /// Instruction builder to construct the vectorized tree.
4083
4084 /// A map of scalar integer values to the smallest bit width with which they
4085 /// can legally be represented. The values map to (width, signed) pairs,
4086 /// where "width" indicates the minimum bit width and "signed" is True if the
4087 /// value must be signed-extended, rather than zero-extended, back to its
4088 /// original width.
4090
4091 /// Final size of the reduced vector, if the current graph represents the
4092 /// input for the reduction and it was possible to narrow the size of the
4093 /// reduction.
4094 unsigned ReductionBitWidth = 0;
4095
4096 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4097 /// type sizes, used in the tree.
4098 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4099
4100 /// Indices of the vectorized nodes, which supposed to be the roots of the new
4101 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4102 DenseSet<unsigned> ExtraBitWidthNodes;
4103};
4104
4105} // end namespace slpvectorizer
4106
4107template <> struct GraphTraits<BoUpSLP *> {
4108 using TreeEntry = BoUpSLP::TreeEntry;
4109
4110 /// NodeRef has to be a pointer per the GraphWriter.
4111 using NodeRef = TreeEntry *;
4112
4113 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4114
4115 /// Add the VectorizableTree to the index iterator to be able to return
4116 /// TreeEntry pointers.
4117 struct ChildIteratorType
4118 : public iterator_adaptor_base<
4119 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4120 ContainerTy &VectorizableTree;
4121
4122 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4123 ContainerTy &VT)
4124 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4125
4126 NodeRef operator*() { return I->UserTE; }
4127 };
4128
4129 static NodeRef getEntryNode(BoUpSLP &R) {
4130 return R.VectorizableTree[0].get();
4131 }
4132
4133 static ChildIteratorType child_begin(NodeRef N) {
4134 return {N->UserTreeIndices.begin(), N->Container};
4135 }
4136
4137 static ChildIteratorType child_end(NodeRef N) {
4138 return {N->UserTreeIndices.end(), N->Container};
4139 }
4140
4141 /// For the node iterator we just need to turn the TreeEntry iterator into a
4142 /// TreeEntry* iterator so that it dereferences to NodeRef.
4143 class nodes_iterator {
4144 using ItTy = ContainerTy::iterator;
4145 ItTy It;
4146
4147 public:
4148 nodes_iterator(const ItTy &It2) : It(It2) {}
4149 NodeRef operator*() { return It->get(); }
4150 nodes_iterator operator++() {
4151 ++It;
4152 return *this;
4153 }
4154 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4155 };
4156
4157 static nodes_iterator nodes_begin(BoUpSLP *R) {
4158 return nodes_iterator(R->VectorizableTree.begin());
4159 }
4160
4161 static nodes_iterator nodes_end(BoUpSLP *R) {
4162 return nodes_iterator(R->VectorizableTree.end());
4163 }
4164
4165 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4166};
4167
4168template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4169 using TreeEntry = BoUpSLP::TreeEntry;
4170
4171 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4172
4173 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4174 std::string Str;
4175 raw_string_ostream OS(Str);
4176 OS << Entry->Idx << ".\n";
4177 if (isSplat(Entry->Scalars))
4178 OS << "<splat> ";
4179 for (auto *V : Entry->Scalars) {
4180 OS << *V;
4181 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4182 return EU.Scalar == V;
4183 }))
4184 OS << " <extract>";
4185 OS << "\n";
4186 }
4187 return Str;
4188 }
4189
4190 static std::string getNodeAttributes(const TreeEntry *Entry,
4191 const BoUpSLP *) {
4192 if (Entry->isGather())
4193 return "color=red";
4194 if (Entry->State == TreeEntry::ScatterVectorize ||
4195 Entry->State == TreeEntry::StridedVectorize)
4196 return "color=blue";
4197 return "";
4198 }
4199};
4200
4201} // end namespace llvm
4202
4203 BoUpSLP::~BoUpSLP() {
4204 SmallVector<WeakTrackingVH> DeadInsts;
4205 for (auto *I : DeletedInstructions) {
4206 if (!I->getParent()) {
4207 // Temporarily insert instruction back to erase them from parent and
4208 // memory later.
4209 if (isa<PHINode>(I))
4210 // Phi nodes must be the very first instructions in the block.
4211 I->insertBefore(F->getEntryBlock(),
4212 F->getEntryBlock().getFirstNonPHIIt());
4213 else
4214 I->insertBefore(F->getEntryBlock().getTerminator());
4215 continue;
4216 }
4217 for (Use &U : I->operands()) {
4218 auto *Op = dyn_cast<Instruction>(U.get());
4219 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4220 wouldInstructionBeTriviallyDead(Op, TLI))
4221 DeadInsts.emplace_back(Op);
4222 }
4223 I->dropAllReferences();
4224 }
4225 for (auto *I : DeletedInstructions) {
4226 assert(I->use_empty() &&
4227 "trying to erase instruction with users.");
4228 I->eraseFromParent();
4229 }
4230
4231 // Cleanup any dead scalar code feeding the vectorized instructions
4232 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, TLI);
4233
4234#ifdef EXPENSIVE_CHECKS
4235 // If we could guarantee that this call is not extremely slow, we could
4236 // remove the ifdef limitation (see PR47712).
4237 assert(!verifyFunction(*F, &dbgs()));
4238#endif
4239}
4240
4241/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4242/// contains the original mask for the scalars reused in the node. The
4243/// procedure transforms this mask in accordance with the given \p Mask.
4244 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4245 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4246 "Expected non-empty mask.");
4247 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4248 Prev.swap(Reuses);
4249 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4250 if (Mask[I] != PoisonMaskElem)
4251 Reuses[Mask[I]] = Prev[I];
4252}
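// Illustrative sketch (not part of the upstream pass; the helper name is
// hypothetical): a standalone trace of the scatter-style remapping performed
// by reorderReuses() above, where each original Reuses[I] moves to slot Mask[I].
[[maybe_unused]] static SmallVector<int> reorderReusesExample() {
  SmallVector<int> Reuses = {0, 1, 1, 2};
  SmallVector<int> Mask = {3, 2, 1, 0};
  reorderReuses(Reuses, Mask);
  // Reuses is now {2, 1, 1, 0}.
  return Reuses;
}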
4253
4254/// Reorders the given \p Order according to the given \p Mask. \p Order is
4255/// the original order of the scalars. The procedure transforms the provided
4256/// order in accordance with the given \p Mask. If the resulting \p Order is
4257/// just an identity order, \p Order is cleared.
4258 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4259 bool BottomOrder = false) {
4260 assert(!Mask.empty() && "Expected non-empty mask.");
4261 unsigned Sz = Mask.size();
4262 if (BottomOrder) {
4263 SmallVector<unsigned> PrevOrder;
4264 if (Order.empty()) {
4265 PrevOrder.resize(Sz);
4266 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4267 } else {
4268 PrevOrder.swap(Order);
4269 }
4270 Order.assign(Sz, Sz);
4271 for (unsigned I = 0; I < Sz; ++I)
4272 if (Mask[I] != PoisonMaskElem)
4273 Order[I] = PrevOrder[Mask[I]];
4274 if (all_of(enumerate(Order), [&](const auto &Data) {
4275 return Data.value() == Sz || Data.index() == Data.value();
4276 })) {
4277 Order.clear();
4278 return;
4279 }
4280 fixupOrderingIndices(Order);
4281 return;
4282 }
4283 SmallVector<int> MaskOrder;
4284 if (Order.empty()) {
4285 MaskOrder.resize(Sz);
4286 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4287 } else {
4288 inversePermutation(Order, MaskOrder);
4289 }
4290 reorderReuses(MaskOrder, Mask);
4291 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4292 Order.clear();
4293 return;
4294 }
4295 Order.assign(Sz, Sz);
4296 for (unsigned I = 0; I < Sz; ++I)
4297 if (MaskOrder[I] != PoisonMaskElem)
4298 Order[MaskOrder[I]] = I;
4299 fixupOrderingIndices(Order);
4300}
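// Illustrative trace for the default (BottomOrder == false) path above
// (a sketch): an empty \p Order of size 4 stands for the identity, so with
// Mask = {1, 0, 3, 2} the inverted mask becomes {1, 0, 3, 2} after
// reorderReuses(), which is not an identity, and the final Order is
// {1, 0, 3, 2}. Had the composition produced an identity mask, Order would
// have been cleared instead.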
4301
4302std::optional<BoUpSLP::OrdersType>
4303BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4304 assert(TE.isGather() && "Expected gather node only.");
4305 // Try to find subvector extract/insert patterns and reorder only such
4306 // patterns.
4307 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4308 Type *ScalarTy = GatheredScalars.front()->getType();
4309 int NumScalars = GatheredScalars.size();
4310 if (!isValidElementType(ScalarTy))
4311 return std::nullopt;
4312 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4313 int NumParts = TTI->getNumberOfParts(VecTy);
4314 if (NumParts == 0 || NumParts >= NumScalars)
4315 NumParts = 1;
4316 SmallVector<int> ExtractMask;
4317 SmallVector<int> Mask;
4318 SmallVector<SmallVector<const TreeEntry *>> Entries;
4319 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
4320 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4321 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
4322 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4323 /*ForOrder=*/true);
4324 // No shuffled operands - ignore.
4325 if (GatherShuffles.empty() && ExtractShuffles.empty())
4326 return std::nullopt;
4327 OrdersType CurrentOrder(NumScalars, NumScalars);
4328 if (GatherShuffles.size() == 1 &&
4329 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4330 Entries.front().front()->isSame(TE.Scalars)) {
4331 // Perfect match in the graph, will reuse the previously vectorized
4332 // node. Cost is 0.
4333 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4334 return CurrentOrder;
4335 }
4336 auto IsSplatMask = [](ArrayRef<int> Mask) {
4337 int SingleElt = PoisonMaskElem;
4338 return all_of(Mask, [&](int I) {
4339 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4340 SingleElt = I;
4341 return I == PoisonMaskElem || I == SingleElt;
4342 });
4343 };
4344 // Exclusive broadcast mask - ignore.
4345 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4346 (Entries.size() != 1 ||
4347 Entries.front().front()->ReorderIndices.empty())) ||
4348 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4349 return std::nullopt;
4350 SmallBitVector ShuffledSubMasks(NumParts);
4351 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4352 ArrayRef<int> Mask, int PartSz, int NumParts,
4353 function_ref<unsigned(unsigned)> GetVF) {
4354 for (int I : seq<int>(0, NumParts)) {
4355 if (ShuffledSubMasks.test(I))
4356 continue;
4357 const int VF = GetVF(I);
4358 if (VF == 0)
4359 continue;
4360 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4361 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4362 // Shuffle of at least 2 vectors - ignore.
4363 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4364 std::fill(Slice.begin(), Slice.end(), NumScalars);
4365 ShuffledSubMasks.set(I);
4366 continue;
4367 }
4368 // Try to include as many elements from the mask as possible.
4369 int FirstMin = INT_MAX;
4370 bool SecondVecFound = false;
4371 for (int K : seq<int>(Limit)) {
4372 int Idx = Mask[I * PartSz + K];
4373 if (Idx == PoisonMaskElem) {
4374 Value *V = GatheredScalars[I * PartSz + K];
4375 if (isConstant(V) && !isa<PoisonValue>(V)) {
4376 SecondVecFound = true;
4377 break;
4378 }
4379 continue;
4380 }
4381 if (Idx < VF) {
4382 if (FirstMin > Idx)
4383 FirstMin = Idx;
4384 } else {
4385 SecondVecFound = true;
4386 break;
4387 }
4388 }
4389 FirstMin = (FirstMin / PartSz) * PartSz;
4390 // Shuffle of at least 2 vectors - ignore.
4391 if (SecondVecFound) {
4392 std::fill(Slice.begin(), Slice.end(), NumScalars);
4393 ShuffledSubMasks.set(I);
4394 continue;
4395 }
4396 for (int K : seq<int>(Limit)) {
4397 int Idx = Mask[I * PartSz + K];
4398 if (Idx == PoisonMaskElem)
4399 continue;
4400 Idx -= FirstMin;
4401 if (Idx >= PartSz) {
4402 SecondVecFound = true;
4403 break;
4404 }
4405 if (CurrentOrder[I * PartSz + Idx] >
4406 static_cast<unsigned>(I * PartSz + K) &&
4407 CurrentOrder[I * PartSz + Idx] !=
4408 static_cast<unsigned>(I * PartSz + Idx))
4409 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4410 }
4411 // Shuffle of at least 2 vectors - ignore.
4412 if (SecondVecFound) {
4413 std::fill(Slice.begin(), Slice.end(), NumScalars);
4414 ShuffledSubMasks.set(I);
4415 continue;
4416 }
4417 }
4418 };
4419 int PartSz = getPartNumElems(NumScalars, NumParts);
4420 if (!ExtractShuffles.empty())
4421 TransformMaskToOrder(
4422 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4423 if (!ExtractShuffles[I])
4424 return 0U;
4425 unsigned VF = 0;
4426 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4427 for (unsigned Idx : seq<unsigned>(Sz)) {
4428 int K = I * PartSz + Idx;
4429 if (ExtractMask[K] == PoisonMaskElem)
4430 continue;
4431 if (!TE.ReuseShuffleIndices.empty())
4432 K = TE.ReuseShuffleIndices[K];
4433 if (!TE.ReorderIndices.empty())
4434 K = std::distance(TE.ReorderIndices.begin(),
4435 find(TE.ReorderIndices, K));
4436 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4437 if (!EI)
4438 continue;
4439 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4440 ->getElementCount()
4441 .getKnownMinValue());
4442 }
4443 return VF;
4444 });
4445 // Check special corner case - single shuffle of the same entry.
4446 if (GatherShuffles.size() == 1 && NumParts != 1) {
4447 if (ShuffledSubMasks.any())
4448 return std::nullopt;
4449 PartSz = NumScalars;
4450 NumParts = 1;
4451 }
4452 if (!Entries.empty())
4453 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4454 if (!GatherShuffles[I])
4455 return 0U;
4456 return std::max(Entries[I].front()->getVectorFactor(),
4457 Entries[I].back()->getVectorFactor());
4458 });
4459 int NumUndefs =
4460 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4461 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4462 return std::nullopt;
4463 return std::move(CurrentOrder);
4464}
4465
4466static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4467 const TargetLibraryInfo &TLI,
4468 bool CompareOpcodes = true) {
4469 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4470 return false;
4471 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4472 if (!GEP1)
4473 return false;
4474 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4475 if (!GEP2)
4476 return false;
4477 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4478 ((isConstant(GEP1->getOperand(1)) &&
4479 isConstant(GEP2->getOperand(1))) ||
4480 !CompareOpcodes ||
4481 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4482 .getOpcode());
4483}
4484
4485/// Calculates minimal alignment as a common alignment.
4486template <typename T>
4487 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4488 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4489 for (Value *V : VL.drop_front())
4490 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4491 return CommonAlignment;
4492}
4493
4494/// Check if \p Order represents reverse order.
4495 static bool isReverseOrder(ArrayRef<unsigned> Order) {
4496 unsigned Sz = Order.size();
4497 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4498 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4499 });
4500}
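// Example: Order = {3, 2, 1, 0} is a reverse order, while {0, 1, 2, 3} is not;
// slots equal to Order.size() are treated as undefined and accepted at any
// position, and an empty Order is never reported as reverse.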
4501
4502/// Checks if the provided list of pointers \p PointerOps represents strided
4503/// pointers for type \p ElemTy. If they are not, std::nullopt is returned.
4504/// Otherwise, if \p Inst is not specified, an initialized optional value is
4505/// returned to show that the pointers represent strided pointers. If \p Inst
4506/// is specified, the runtime stride is materialized before the given \p Inst.
4507/// \returns std::nullopt if the pointers do not have a runtime stride;
4508/// otherwise nullptr or the actual stride value.
4509 static std::optional<Value *>
4510 calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4511 const DataLayout &DL, ScalarEvolution &SE,
4512 SmallVectorImpl<unsigned> &SortedIndices,
4513 Instruction *Inst = nullptr) {
4514 SmallVector<const SCEV *> SCEVs;
4515 const SCEV *PtrSCEVLowest = nullptr;
4516 const SCEV *PtrSCEVHighest = nullptr;
4517 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4518 // addresses).
4519 for (Value *Ptr : PointerOps) {
4520 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4521 if (!PtrSCEV)
4522 return std::nullopt;
4523 SCEVs.push_back(PtrSCEV);
4524 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4525 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4526 continue;
4527 }
4528 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4529 if (isa<SCEVCouldNotCompute>(Diff))
4530 return std::nullopt;
4531 if (Diff->isNonConstantNegative()) {
4532 PtrSCEVLowest = PtrSCEV;
4533 continue;
4534 }
4535 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4536 if (isa<SCEVCouldNotCompute>(Diff1))
4537 return std::nullopt;
4538 if (Diff1->isNonConstantNegative()) {
4539 PtrSCEVHighest = PtrSCEV;
4540 continue;
4541 }
4542 }
4543 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4544 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4545 if (isa<SCEVCouldNotCompute>(Dist))
4546 return std::nullopt;
4547 int Size = DL.getTypeStoreSize(ElemTy);
4548 auto TryGetStride = [&](const SCEV *Dist,
4549 const SCEV *Multiplier) -> const SCEV * {
4550 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4551 if (M->getOperand(0) == Multiplier)
4552 return M->getOperand(1);
4553 if (M->getOperand(1) == Multiplier)
4554 return M->getOperand(0);
4555 return nullptr;
4556 }
4557 if (Multiplier == Dist)
4558 return SE.getConstant(Dist->getType(), 1);
4559 return SE.getUDivExactExpr(Dist, Multiplier);
4560 };
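// For example, TryGetStride((24 * %n), 24) strips the matching factor and
// yields %n; when no such factorization exists it falls back to an exact
// unsigned division of the two SCEVs.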
4561 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4562 const SCEV *Stride = nullptr;
4563 if (Size != 1 || SCEVs.size() > 2) {
4564 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4565 Stride = TryGetStride(Dist, Sz);
4566 if (!Stride)
4567 return std::nullopt;
4568 }
4569 if (!Stride || isa<SCEVConstant>(Stride))
4570 return std::nullopt;
4571 // Iterate through all pointers and check if all distances are
4572 // unique multiples of Stride.
4573 using DistOrdPair = std::pair<int64_t, int>;
4574 auto Compare = llvm::less_first();
4575 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4576 int Cnt = 0;
4577 bool IsConsecutive = true;
4578 for (const SCEV *PtrSCEV : SCEVs) {
4579 unsigned Dist = 0;
4580 if (PtrSCEV != PtrSCEVLowest) {
4581 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4582 const SCEV *Coeff = TryGetStride(Diff, Stride);
4583 if (!Coeff)
4584 return std::nullopt;
4585 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4586 if (!SC || isa<SCEVCouldNotCompute>(SC))
4587 return std::nullopt;
4588 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4589 SE.getMulExpr(Stride, SC)))
4590 ->isZero())
4591 return std::nullopt;
4592 Dist = SC->getAPInt().getZExtValue();
4593 }
4594 // If the strides are not the same or repeated, we can't vectorize.
4595 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4596 return std::nullopt;
4597 auto Res = Offsets.emplace(Dist, Cnt);
4598 if (!Res.second)
4599 return std::nullopt;
4600 // Consecutive order if the inserted element is the last one.
4601 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4602 ++Cnt;
4603 }
4604 if (Offsets.size() != SCEVs.size())
4605 return std::nullopt;
4606 SortedIndices.clear();
4607 if (!IsConsecutive) {
4608 // Fill SortedIndices array only if it is non-consecutive.
4609 SortedIndices.resize(PointerOps.size());
4610 Cnt = 0;
4611 for (const std::pair<int64_t, int> &Pair : Offsets) {
4612 SortedIndices[Cnt] = Pair.second;
4613 ++Cnt;
4614 }
4615 }
4616 if (!Inst)
4617 return nullptr;
4618 SCEVExpander Expander(SE, DL, "strided-load-vec");
4619 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4620}
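// Illustrative use: for four i32 pointers at byte offsets 0, 4 * %s, 8 * %s
// and 12 * %s from the lowest address, a runtime stride of %s elements is
// recognized; SortedIndices is filled only if the pointers are not already in
// offset order, and with \p Inst set the stride value is expanded right before
// that instruction.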
4621
4622static std::pair<InstructionCost, InstructionCost>
4623 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4624 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4625 Type *ScalarTy, VectorType *VecTy);
4626
4627/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4628 type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
4629/// subvector pattern.
4630static InstructionCost
4631 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
4632 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
4633 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
4634 int Index = 0, VectorType *SubTp = nullptr,
4635 ArrayRef<const Value *> Args = std::nullopt) {
4636 if (Kind != TTI::SK_PermuteTwoSrc)
4637 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4638 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4639 int NumSubElts;
4640 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
4641 Mask, NumSrcElts, NumSubElts, Index)) {
4642 if (Index + NumSubElts > NumSrcElts &&
4643 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4644 return TTI.getShuffleCost(
4645 TTI::SK_InsertSubvector,
4646 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4647 TTI::TCK_RecipThroughput, Index, Tp);
4648 }
4649 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4650}
4651
4652 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4653 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4654 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4655 // Check that a vectorized load would load the same memory as a scalar
4656 // load. For example, we don't want to vectorize loads that are smaller
4657 // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
4658 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4659 // from such a struct, we read/write packed bits disagreeing with the
4660 // unvectorized version.
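// E.g. an i1 scalar has a type size of 1 bit but an alloc size of 8 bits, so
// the check below rejects it and the bundle is gathered instead.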
4661 Type *ScalarTy = VL0->getType();
4662
4663 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4664 return LoadsState::Gather;
4665
4666 // Make sure all loads in the bundle are simple - we can't vectorize
4667 // atomic or volatile loads.
4668 PointerOps.clear();
4669 const unsigned Sz = VL.size();
4670 PointerOps.resize(Sz);
4671 auto *POIter = PointerOps.begin();
4672 for (Value *V : VL) {
4673 auto *L = cast<LoadInst>(V);
4674 if (!L->isSimple())
4675 return LoadsState::Gather;
4676 *POIter = L->getPointerOperand();
4677 ++POIter;
4678 }
4679
4680 Order.clear();
4681 auto *VecTy = getWidenedType(ScalarTy, Sz);
4682 // Check the order of pointer operands or that all pointers are the same.
4683 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4684 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4685 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4686 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4687 "supported with VectorizeNonPowerOf2");
4688 return LoadsState::Gather;
4689 }
4690
4691 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4692 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4693 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4694 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4695 return LoadsState::StridedVectorize;
4696 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4697 return arePointersCompatible(P, PointerOps.front(), *TLI);
4698 })) {
4699 if (IsSorted) {
4700 Value *Ptr0;
4701 Value *PtrN;
4702 if (Order.empty()) {
4703 Ptr0 = PointerOps.front();
4704 PtrN = PointerOps.back();
4705 } else {
4706 Ptr0 = PointerOps[Order.front()];
4707 PtrN = PointerOps[Order.back()];
4708 }
4709 std::optional<int> Diff =
4710 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4711 // Check that the sorted loads are consecutive.
4712 if (static_cast<unsigned>(*Diff) == Sz - 1)
4713 return LoadsState::Vectorize;
4714 // Simple check if not a strided access - clear order.
4715 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4716 // Try to generate strided load node if:
4717 // 1. Target with strided load support is detected.
4718 // 2. The number of loads is greater than MinProfitableStridedLoads,
4719 // or the potential stride <= MaxProfitableLoadStride and the
4720 // potential stride is power-of-2 (to avoid perf regressions for the very
4721 // small number of loads) and max distance > number of loads, or potential
4722 // stride is -1.
4723 // 3. The loads are ordered, or number of unordered loads <=
4724 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4725 // (this check is to avoid extra costs for very expensive shuffles).
4726 // 4. Any pointer operand is an instruction with the users outside of the
4727 // current graph (for masked gathers extra extractelement instructions
4728 // might be required).
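// E.g. for Sz == 4 sorted loads with *Diff == 9, the candidate stride is
// 9 / (4 - 1) == 3 elements; the code below additionally verifies the
// profitability thresholds and that every pointer sits at a distinct multiple
// of that stride before forming a strided load node.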
4729 auto IsAnyPointerUsedOutGraph =
4730 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
4731 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
4732 return !getTreeEntry(U) && !MustGather.contains(U);
4733 });
4734 });
4735 if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
4736 ((Sz > MinProfitableStridedLoads ||
4737 (static_cast<unsigned>(std::abs(*Diff)) <=
4738 MaxProfitableLoadStride * Sz &&
4739 isPowerOf2_32(std::abs(*Diff)))) &&
4740 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4741 *Diff == -(static_cast<int>(Sz) - 1))) {
4742 int Stride = *Diff / static_cast<int>(Sz - 1);
4743 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4744 Align Alignment =
4745 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4746 ->getAlign();
4747 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4748 // Iterate through all pointers and check if all distances are
4749 // unique multiples of Stride.
4750 SmallSet<int, 4> Dists;
4751 for (Value *Ptr : PointerOps) {
4752 int Dist = 0;
4753 if (Ptr == PtrN)
4754 Dist = *Diff;
4755 else if (Ptr != Ptr0)
4756 Dist =
4757 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4758 // If the strides are not the same or repeated, we can't
4759 // vectorize.
4760 if (((Dist / Stride) * Stride) != Dist ||
4761 !Dists.insert(Dist).second)
4762 break;
4763 }
4764 if (Dists.size() == Sz)
4765 return LoadsState::StridedVectorize;
4766 }
4767 }
4768 }
4769 }
4770 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4771 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4772 unsigned MinVF = getMinVF(Sz);
4773 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4774 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4775 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4776 unsigned VectorizedCnt = 0;
4777 SmallVector<LoadsState> States;
4778 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4779 Cnt += VF, ++VectorizedCnt) {
4780 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4781 SmallVector<unsigned> Order;
4782 SmallVector<Value *> PointerOps;
4783 LoadsState LS =
4784 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4785 /*TryRecursiveCheck=*/false);
4786 // Check that the sorted loads are consecutive.
4787 if (LS == LoadsState::Gather)
4788 break;
4789 // If need the reorder - consider as high-cost masked gather for now.
4790 if ((LS == LoadsState::Vectorize ||
4791 LS == LoadsState::StridedVectorize) &&
4792 !Order.empty() && !isReverseOrder(Order))
4793 LS = LoadsState::ScatterVectorize;
4794 States.push_back(LS);
4795 }
4796 // Can be vectorized later as a series of loads/insertelements.
4797 if (VectorizedCnt == VL.size() / VF) {
4798 // Compare masked gather cost and loads + insertsubvector costs.
4799 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4800 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4801 TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
4802 CostKind, ScalarTy, VecTy);
4803 InstructionCost MaskedGatherCost =
4804 TTI.getGatherScatterOpCost(
4805 Instruction::Load, VecTy,
4806 cast<LoadInst>(VL0)->getPointerOperand(),
4807 /*VariableMask=*/false, CommonAlignment, CostKind) +
4808 VectorGEPCost - ScalarGEPCost;
4809 InstructionCost VecLdCost = 0;
4810 auto *SubVecTy = getWidenedType(ScalarTy, VF);
4811 for (auto [I, LS] : enumerate(States)) {
4812 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4813 switch (LS) {
4814 case LoadsState::Vectorize: {
4815 auto [ScalarGEPCost, VectorGEPCost] =
4816 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4817 LI0->getPointerOperand(), Instruction::Load,
4818 CostKind, ScalarTy, SubVecTy);
4819 VecLdCost += TTI.getMemoryOpCost(
4820 Instruction::Load, SubVecTy, LI0->getAlign(),
4821 LI0->getPointerAddressSpace(), CostKind,
4822 TTI::OperandValueInfo()) +
4823 VectorGEPCost - ScalarGEPCost;
4824 break;
4825 }
4826 case LoadsState::StridedVectorize: {
4827 auto [ScalarGEPCost, VectorGEPCost] =
4828 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4829 LI0->getPointerOperand(), Instruction::Load,
4830 CostKind, ScalarTy, SubVecTy);
4831 VecLdCost +=
4832 TTI.getStridedMemoryOpCost(
4833 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4834 /*VariableMask=*/false, CommonAlignment, CostKind) +
4835 VectorGEPCost - ScalarGEPCost;
4836 break;
4837 }
4838 case LoadsState::ScatterVectorize: {
4839 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4840 TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4841 LI0->getPointerOperand(), Instruction::GetElementPtr,
4842 CostKind, ScalarTy, SubVecTy);
4843 VecLdCost +=
4844 TTI.getGatherScatterOpCost(
4845 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4846 /*VariableMask=*/false, CommonAlignment, CostKind) +
4847 VectorGEPCost - ScalarGEPCost;
4848 break;
4849 }
4850 case LoadsState::Gather:
4852 "Expected only consecutive, strided or masked gather loads.");
4853 }
4854 SmallVector<int> ShuffleMask(VL.size());
4855 for (int Idx : seq<int>(0, VL.size()))
4856 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4857 VecLdCost +=
4858 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
4859 ShuffleMask, CostKind, I * VF, SubVecTy);
4860 }
4861 // If masked gather cost is higher - better to vectorize, so
4862 // consider it as a gather node. It will be better estimated
4863 // later.
4864 if (MaskedGatherCost >= VecLdCost)
4865 return true;
4866 }
4867 }
4868 return false;
4869 };
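// CheckForShuffledLoads returns true when, for some smaller VF, the bundle can
// be covered by consecutive/strided/scattered slices whose total cost
// (including the insert-subvector shuffles to recombine them) does not exceed
// the cost of a single masked gather over the whole bundle.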
4870 // TODO: need to improve analysis of the pointers, if not all of them are
4871 // GEPs or have > 2 operands, we end up with a gather node, which just
4872 // increases the cost.
4873 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4874 bool ProfitableGatherPointers =
4875 L && Sz > 2 &&
4876 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4877 return L->isLoopInvariant(V);
4878 })) <= Sz / 2;
4879 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4880 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4881 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4882 (GEP && GEP->getNumOperands() == 2 &&
4883 isa<Constant, Instruction>(GEP->getOperand(1)));
4884 })) {
4885 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4886 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4887 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4888 // Check if potential masked gather can be represented as series
4889 // of loads + insertsubvectors.
4890 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4891 // If masked gather cost is higher - better to vectorize, so
4892 // consider it as a gather node. It will be better estimated
4893 // later.
4894 return LoadsState::Gather;
4895 }
4896 return LoadsState::ScatterVectorize;
4897 }
4898 }
4899 }
4900
4901 return LoadsState::Gather;
4902}
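// In summary: consecutive sorted loads yield LoadsState::Vectorize, a common
// constant or runtime stride with legal strided-load support yields
// LoadsState::StridedVectorize, a legal (not force-scalarized) masked gather
// yields LoadsState::ScatterVectorize, and everything else falls back to
// LoadsState::Gather.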
4903
4904 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4905 const DataLayout &DL, ScalarEvolution &SE,
4906 SmallVectorImpl<unsigned> &SortedIndices) {
4907 assert(llvm::all_of(
4908 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4909 "Expected list of pointer operands.");
4910 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4911 // Ptr into, sort and return the sorted indices with values next to one
4912 // another.
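// E.g. for VL == {A, A+1, B, B+1} (offsets in elements) the map ends up with
// two bases: A -> {(A,0,0), (A+1,1,1)} and B -> {(B,0,2), (B+1,1,3)}.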
4913 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4914 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4915
4916 unsigned Cnt = 1;
4917 for (Value *Ptr : VL.drop_front()) {
4918 bool Found = any_of(Bases, [&](auto &Base) {
4919 std::optional<int> Diff =
4920 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4921 /*StrictCheck=*/true);
4922 if (!Diff)
4923 return false;
4924
4925 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4926 return true;
4927 });
4928
4929 if (!Found) {
4930 // If we haven't found enough to usefully cluster, return early.
4931 if (Bases.size() > VL.size() / 2 - 1)
4932 return false;
4933
4934 // Not found already - add a new Base
4935 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4936 }
4937 }
4938
4939 // For each of the bases sort the pointers by Offset and check if any of the
4940 // bases become consecutively allocated.
4941 bool AnyConsecutive = false;
4942 for (auto &Base : Bases) {
4943 auto &Vec = Base.second;
4944 if (Vec.size() > 1) {
4945 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4946 const std::tuple<Value *, int, unsigned> &Y) {
4947 return std::get<1>(X) < std::get<1>(Y);
4948 });
4949 int InitialOffset = std::get<1>(Vec[0]);
4950 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4951 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4952 });
4953 }
4954 }
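// E.g. a base whose sorted offsets are {5, 6, 7, 8} (InitialOffset == 5)
// passes the check above, so AnyConsecutive becomes true and sorting the
// pointers is considered worthwhile.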
4955
4956 // Fill SortedIndices array only if it looks worthwhile to sort the ptrs.
4957 SortedIndices.clear();
4958 if (!AnyConsecutive)
4959 return false;
4960
4961 // If we have a better order, also sort the base pointers by increasing
4962 // (variable) values if possible, to try and keep the order more regular. In
4963 // order to create a valid strict-weak order we cluster by the Root of gep
4964 // chains and sort within each.
4965 SmallVector<std::tuple<Value *, Value *, Value *>> SortedBases;
4966 for (auto &Base : Bases) {