SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <cassert>
95#include <cstdint>
96#include <iterator>
97#include <memory>
98#include <optional>
99#include <set>
100#include <string>
101#include <tuple>
102#include <utility>
103
104using namespace llvm;
105using namespace llvm::PatternMatch;
106using namespace slpvectorizer;
107using namespace std::placeholders;
108
109#define SV_NAME "slp-vectorizer"
110#define DEBUG_TYPE "SLP"
111
112STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
113
114DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
115 "Controls which SLP graphs should be vectorized.");
116
117static cl::opt<bool>
118 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
119 cl::desc("Run the SLP vectorization passes"));
120
121static cl::opt<bool>
122 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
123 cl::desc("Enable vectorization for wider vector utilization"));
124
125static cl::opt<int>
126     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
127                      cl::desc("Only vectorize if you gain more than this "
128 "number "));
129
130 static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
131     "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
132 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
133 "heuristics and makes vectorization decision via cost modeling."));
134
135static cl::opt<bool>
136ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
137 cl::desc("Attempt to vectorize horizontal reductions"));
138
139 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
140     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
141 cl::desc(
142 "Attempt to vectorize horizontal reductions feeding into a store"));
143
144static cl::opt<int>
145     MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
146                            cl::desc("Attempt to vectorize for this register size in bits"));
147
148 static cl::opt<unsigned>
149     MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
150                 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
151
152/// Limits the size of scheduling regions in a block.
153 /// It avoids long compile times for _very_ large blocks where vector
154/// instructions are spread over a wide range.
155/// This limit is way higher than needed by real-world functions.
156static cl::opt<int>
157ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
158 cl::desc("Limit the size of the SLP scheduling region per block"));
159
161 "slp-min-reg-size", cl::init(128), cl::Hidden,
162 cl::desc("Attempt to vectorize for this register size in bits"));
163
164 static cl::opt<unsigned> RecursionMaxDepth(
165     "slp-recursion-max-depth", cl::init(12), cl::Hidden,
166 cl::desc("Limit the recursion depth when building a vectorizable tree"));
167
168 static cl::opt<unsigned> MinTreeSize(
169     "slp-min-tree-size", cl::init(3), cl::Hidden,
170 cl::desc("Only vectorize small trees if they are fully vectorizable"));
171
172// The maximum depth that the look-ahead score heuristic will explore.
173// The higher this value, the higher the compilation time overhead.
174 static cl::opt<int> LookAheadMaxDepth(
175     "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
176 cl::desc("The maximum look-ahead depth for operand reordering scores"));
177
178// The maximum depth that the look-ahead score heuristic will explore
179 // when it is probing among candidates for vectorization tree roots.
180 // The higher this value, the higher the compilation-time overhead, but unlike
181 // the similar limit for operand ordering this is used less frequently, so the
182 // impact of a higher value is less noticeable.
183 static cl::opt<int> RootLookAheadMaxDepth(
184     "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
185 cl::desc("The maximum look-ahead depth for searching best rooting option"));
186
187 static cl::opt<unsigned> MinProfitableStridedLoads(
188     "slp-min-strided-loads", cl::init(2), cl::Hidden,
189 cl::desc("The minimum number of loads, which should be considered strided, "
190 "if the stride is > 1 or is runtime value"));
191
192 static cl::opt<unsigned> MaxProfitableLoadStride(
193     "slp-max-stride", cl::init(8), cl::Hidden,
194 cl::desc("The maximum stride, considered to be profitable."));
195
196static cl::opt<bool>
197 ViewSLPTree("view-slp-tree", cl::Hidden,
198 cl::desc("Display the SLP trees with Graphviz"));
199
200 static cl::opt<bool> VectorizeNonPowerOf2(
201     "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
202 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
203
204// Limit the number of alias checks. The limit is chosen so that
205// it has no negative effect on the llvm benchmarks.
206static const unsigned AliasedCheckLimit = 10;
207
208// Limit of the number of uses for potentially transformed instructions/values,
209 // used in checks to avoid compile-time explosion.
210static constexpr int UsesLimit = 64;
211
212// Another limit for the alias checks: The maximum distance between load/store
213// instructions where alias checks are done.
214// This limit is useful for very large basic blocks.
215static const unsigned MaxMemDepDistance = 160;
216
217/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
218/// regions to be handled.
219static const int MinScheduleRegionSize = 16;
220
221/// Maximum allowed number of operands in the PHI nodes.
222static const unsigned MaxPHINumOperands = 128;
223
224/// Predicate for the element types that the SLP vectorizer supports.
225///
226/// The most important thing to filter here are types which are invalid in LLVM
227/// vectors. We also filter target specific types which have absolutely no
228 /// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
229/// avoids spending time checking the cost model and realizing that they will
230/// be inevitably scalarized.
231static bool isValidElementType(Type *Ty) {
232 // TODO: Support ScalableVectorType.
233 if (SLPReVec && isa<FixedVectorType>(Ty))
234 Ty = Ty->getScalarType();
235 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
236 !Ty->isPPC_FP128Ty();
237}
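// Illustrative examples (not from the original source): i32, float and, with
// -slp-revec, <4 x i32> are accepted here, while x86_fp80 and ppc_fp128 are
// rejected even though they are legal IR types.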
238
239 /// Returns the type of the given value/instruction \p V. If it is a store,
240 /// returns the type of its value operand; for a Cmp, the type of the compare
241 /// operands; and for an insertelement, the type of the inserted operand.
242 /// Otherwise, just the type of the value is returned.
243 static Type *getValueType(Value *V) {
244 if (auto *SI = dyn_cast<StoreInst>(V))
245 return SI->getValueOperand()->getType();
246 if (auto *CI = dyn_cast<CmpInst>(V))
247 return CI->getOperand(0)->getType();
248 if (auto *IE = dyn_cast<InsertElementInst>(V))
249 return IE->getOperand(1)->getType();
250 return V->getType();
251}
252
253/// \returns the number of elements for Ty.
254static unsigned getNumElements(Type *Ty) {
255 assert(!isa<ScalableVectorType>(Ty) &&
256 "ScalableVectorType is not supported.");
257 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
258 return VecTy->getNumElements();
259 return 1;
260}
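// For example, getNumElements(<8 x i16>) == 8, while for a scalar type such as
// i16 the result is 1. Scalable vectors are rejected by the assert above.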
261
262/// \returns the vector type of ScalarTy based on vectorization factor.
263static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
264 return FixedVectorType::get(ScalarTy->getScalarType(),
265 VF * getNumElements(ScalarTy));
266}
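// For example, getWidenedType(i32, 4) == <4 x i32>. With REVEC, where ScalarTy
// may itself be a vector, getWidenedType(<2 x i32>, 4) == <8 x i32>.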
267
268 /// Returns the number of elements of the given type \p Ty, not less than \p Sz,
269 /// that forms a type which \p TTI splits into whole vector types during
270 /// legalization.
271 static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
272                                               Type *Ty, unsigned Sz) {
273 if (!isValidElementType(Ty))
274 return bit_ceil(Sz);
275 // Find the number of elements, which forms full vectors.
276 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
277 if (NumParts == 0 || NumParts >= Sz)
278 return bit_ceil(Sz);
279 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
280}
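// Worked example (assuming a target where <6 x i32> legalizes into 2 parts,
// e.g. 128-bit vector registers): for Ty == i32 and Sz == 6, NumParts == 2, so
// the result is bit_ceil(divideCeil(6, 2)) * 2 == 4 * 2 == 8.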
281
282 /// Returns the number of elements of the given type \p Ty, not greater than \p
283 /// Sz, that forms a type which \p TTI splits into whole vector types during
284 /// legalization.
285 static unsigned
286 getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
287                                    unsigned Sz) {
288 if (!isValidElementType(Ty))
289 return bit_floor(Sz);
290 // Find the number of elements, which forms full vectors.
291 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
292 if (NumParts == 0 || NumParts >= Sz)
293 return bit_floor(Sz);
294 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
295 if (RegVF > Sz)
296 return bit_floor(Sz);
297 return (Sz / RegVF) * RegVF;
298}
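// Worked example (same 128-bit register assumption as above): for Ty == i32 and
// Sz == 6, NumParts == 2 and RegVF == bit_ceil(divideCeil(6, 2)) == 4, so the
// result is (6 / 4) * 4 == 4.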
299
300static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
301 SmallVectorImpl<int> &Mask) {
302 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
303 // But the element has a different meaning for SLP (scalar) and REVEC
304 // (vector). We need to expand Mask into masks which shufflevector can use
305 // directly.
306 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
307 for (unsigned I : seq<unsigned>(Mask.size()))
308 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
309 I * VecTyNumElements, VecTyNumElements)))
310 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
311 : Mask[I] * VecTyNumElements + J;
312 Mask.swap(NewMask);
313}
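// For example, with VecTyNumElements == 2 and Mask == {1, 0}, the expanded mask
// becomes {2, 3, 0, 1}: each scalar index is widened into a run of
// vector-element indices.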
314
315 /// \returns the number of groups of shufflevectors.
316 /// A group has the following properties:
317 /// 1. All of the values in a group are shufflevectors.
318 /// 2. The mask of each shufflevector is an isExtractSubvectorMask.
319 /// 3. Together, the shufflevector masks use all of the elements of the source.
320/// e.g., it is 1 group (%0)
321/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
322/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
323/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
324/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
325/// it is 2 groups (%3 and %4)
326/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
327/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
328/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
329/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
330/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
331/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
332/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
333/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
334/// it is 0 group
335/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
336/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
337/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
338/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
339 static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
340 if (VL.empty())
341 return 0;
342 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
343 return 0;
344 auto *SV = cast<ShuffleVectorInst>(VL.front());
345 unsigned SVNumElements =
346 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
347 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
348 if (SVNumElements % ShuffleMaskSize != 0)
349 return 0;
350 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
351 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
352 return 0;
353 unsigned NumGroup = 0;
354 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
355 auto *SV = cast<ShuffleVectorInst>(VL[I]);
356 Value *Src = SV->getOperand(0);
357 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
358 SmallBitVector ExpectedIndex(GroupSize);
359 if (!all_of(Group, [&](Value *V) {
360 auto *SV = cast<ShuffleVectorInst>(V);
361 // From the same source.
362 if (SV->getOperand(0) != Src)
363 return false;
364 int Index;
365 if (!SV->isExtractSubvectorMask(Index))
366 return false;
367 ExpectedIndex.set(Index / ShuffleMaskSize);
368 return true;
369 }))
370 return 0;
371 if (!ExpectedIndex.all())
372 return 0;
373 ++NumGroup;
374 }
375 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
376 return NumGroup;
377}
378
379/// \returns a shufflevector mask which is used to vectorize shufflevectors
380/// e.g.,
381/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
382/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
383/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
384/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
385/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
386/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
387/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
388/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
389/// the result is
390/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
391 static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
392 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
393 auto *SV = cast<ShuffleVectorInst>(VL.front());
394 unsigned SVNumElements =
395 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
396 SmallVector<int> Mask;
397 unsigned AccumulateLength = 0;
398 for (Value *V : VL) {
399 auto *SV = cast<ShuffleVectorInst>(V);
400 for (int M : SV->getShuffleMask())
401 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
402 : AccumulateLength + M);
403 AccumulateLength += SVNumElements;
404 }
405 return Mask;
406}
407
408/// \returns True if the value is a constant (but not globals/constant
409/// expressions).
410static bool isConstant(Value *V) {
411 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
412}
413
414/// Checks if \p V is one of vector-like instructions, i.e. undef,
415/// insertelement/extractelement with constant indices for fixed vector type or
416/// extractvalue instruction.
417 static bool isVectorLikeInstWithConstOps(Value *V) {
418 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
419 !isa<ExtractValueInst, UndefValue>(V))
420 return false;
421 auto *I = dyn_cast<Instruction>(V);
422 if (!I || isa<ExtractValueInst>(I))
423 return true;
424 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
425 return false;
426 if (isa<ExtractElementInst>(I))
427 return isConstant(I->getOperand(1));
428 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
429 return isConstant(I->getOperand(2));
430}
431
432/// Returns power-of-2 number of elements in a single register (part), given the
433/// total number of elements \p Size and number of registers (parts) \p
434/// NumParts.
435static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
436 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
437}
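// For example, getPartNumElems(6, 2) == std::min(6, bit_ceil(3)) == 4.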
438
439/// Returns correct remaining number of elements, considering total amount \p
440/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
441/// and current register (part) \p Part.
442static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
443 unsigned Part) {
444 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
445}
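// For example, with Size == 6 and PartNumElems == 4, part 0 holds 4 elements
// and part 1 holds the remaining 2.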
446
447#if !defined(NDEBUG)
448/// Print a short descriptor of the instruction bundle suitable for debug output.
449static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
450 std::string Result;
451 raw_string_ostream OS(Result);
452 if (Idx >= 0)
453 OS << "Idx: " << Idx << ", ";
454 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
455 return Result;
456}
457#endif
458
459/// \returns true if all of the instructions in \p VL are in the same block or
460/// false otherwise.
461 static bool allSameBlock(ArrayRef<Value *> VL) {
462 auto *It = find_if(VL, IsaPred<Instruction>);
463 if (It == VL.end())
464 return false;
465 Instruction *I0 = cast<Instruction>(*It);
466 if (all_of(VL, isVectorLikeInstWithConstOps))
467 return true;
468
469 BasicBlock *BB = I0->getParent();
470 for (Value *V : iterator_range(It, VL.end())) {
471 if (isa<PoisonValue>(V))
472 continue;
473 auto *II = dyn_cast<Instruction>(V);
474 if (!II)
475 return false;
476
477 if (BB != II->getParent())
478 return false;
479 }
480 return true;
481}
482
483/// \returns True if all of the values in \p VL are constants (but not
484/// globals/constant expressions).
485 static bool allConstant(ArrayRef<Value *> VL) {
486 // Constant expressions and globals can't be vectorized like normal integer/FP
487 // constants.
488 return all_of(VL, isConstant);
489}
490
491/// \returns True if all of the values in \p VL are identical or some of them
492/// are UndefValue.
493static bool isSplat(ArrayRef<Value *> VL) {
494 Value *FirstNonUndef = nullptr;
495 for (Value *V : VL) {
496 if (isa<UndefValue>(V))
497 continue;
498 if (!FirstNonUndef) {
499 FirstNonUndef = V;
500 continue;
501 }
502 if (V != FirstNonUndef)
503 return false;
504 }
505 return FirstNonUndef != nullptr;
506}
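// For example, {%a, undef, %a} is a splat, {%a, %b} is not, and an all-undef
// list is not considered a splat either.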
507
508/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
509 static bool isCommutative(Instruction *I) {
510 if (auto *Cmp = dyn_cast<CmpInst>(I))
511 return Cmp->isCommutative();
512 if (auto *BO = dyn_cast<BinaryOperator>(I))
513 return BO->isCommutative() ||
514 (BO->getOpcode() == Instruction::Sub &&
515 !BO->hasNUsesOrMore(UsesLimit) &&
516 all_of(
517 BO->uses(),
518 [](const Use &U) {
519 // Commutative, if icmp eq/ne sub, 0
520 CmpPredicate Pred;
521 if (match(U.getUser(),
522 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
523 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
524 return true;
525 // Commutative, if abs(sub nsw, true) or abs(sub, false).
526 ConstantInt *Flag;
527 return match(U.getUser(),
528 m_Intrinsic<Intrinsic::abs>(
529 m_Specific(U.get()), m_ConstantInt(Flag))) &&
530 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
531 Flag->isOne());
532 })) ||
533 (BO->getOpcode() == Instruction::FSub &&
534 !BO->hasNUsesOrMore(UsesLimit) &&
535 all_of(BO->uses(), [](const Use &U) {
536 return match(U.getUser(),
537 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
538 }));
539 return I->isCommutative();
540}
541
542template <typename T>
543static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
544 unsigned Offset) {
545 static_assert(std::is_same_v<T, InsertElementInst> ||
546 std::is_same_v<T, ExtractElementInst>,
547 "unsupported T");
548 int Index = Offset;
549 if (const auto *IE = dyn_cast<T>(Inst)) {
550 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
551 if (!VT)
552 return std::nullopt;
553 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
554 if (!CI)
555 return std::nullopt;
556 if (CI->getValue().uge(VT->getNumElements()))
557 return std::nullopt;
558 Index *= VT->getNumElements();
559 Index += CI->getZExtValue();
560 return Index;
561 }
562 return std::nullopt;
563}
564
565/// \returns inserting or extracting index of InsertElement, ExtractElement or
566/// InsertValue instruction, using Offset as base offset for index.
567/// \returns std::nullopt if the index is not an immediate.
568static std::optional<unsigned> getElementIndex(const Value *Inst,
569 unsigned Offset = 0) {
570 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
571 return Index;
572 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
573 return Index;
574
575 int Index = Offset;
576
577 const auto *IV = dyn_cast<InsertValueInst>(Inst);
578 if (!IV)
579 return std::nullopt;
580
581 Type *CurrentType = IV->getType();
582 for (unsigned I : IV->indices()) {
583 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
584 Index *= ST->getNumElements();
585 CurrentType = ST->getElementType(I);
586 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
587 Index *= AT->getNumElements();
588 CurrentType = AT->getElementType();
589 } else {
590 return std::nullopt;
591 }
592 Index += I;
593 }
594 return Index;
595}
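// Worked example: for `insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0`
// the flattened index is 1 * 2 + 0 == 2 (with Offset == 0).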
596
597namespace {
598/// Specifies the way the mask should be analyzed for undefs/poisonous elements
599/// in the shuffle mask.
600enum class UseMask {
601 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
602 ///< check for the mask elements for the first argument (mask
603 ///< indices are in range [0:VF)).
604 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
605 ///< for the mask elements for the second argument (mask indices
606 ///< are in range [VF:2*VF))
607 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
608 ///< future shuffle elements and mark them as ones as being used
609 ///< in future. Non-undef elements are considered as unused since
610 ///< they're already marked as used in the mask.
611};
612} // namespace
613
614/// Prepares a use bitset for the given mask either for the first argument or
615/// for the second.
616 static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
617                                    UseMask MaskArg) {
618 SmallBitVector UseMask(VF, true);
619 for (auto [Idx, Value] : enumerate(Mask)) {
620 if (Value == PoisonMaskElem) {
621 if (MaskArg == UseMask::UndefsAsMask)
622 UseMask.reset(Idx);
623 continue;
624 }
625 if (MaskArg == UseMask::FirstArg && Value < VF)
626 UseMask.reset(Value);
627 else if (MaskArg == UseMask::SecondArg && Value >= VF)
628 UseMask.reset(Value - VF);
629 }
630 return UseMask;
631}
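// For example, with VF == 4, Mask == {0, 5, 1, PoisonMaskElem} and
// UseMask::FirstArg, bits 0 and 1 are cleared (they are used from the first
// vector) while bits 2 and 3 remain set.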
632
633/// Checks if the given value is actually an undefined constant vector.
634/// Also, if the \p UseMask is not empty, tries to check if the non-masked
635/// elements actually mask the insertelement buildvector, if any.
636template <bool IsPoisonOnly = false>
637 static SmallBitVector isUndefVector(const Value *V,
638                                     const SmallBitVector &UseMask = {}) {
639 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
640 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
641 if (isa<T>(V))
642 return Res;
643 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
644 if (!VecTy)
645 return Res.reset();
646 auto *C = dyn_cast<Constant>(V);
647 if (!C) {
648 if (!UseMask.empty()) {
649 const Value *Base = V;
650 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
651 Base = II->getOperand(0);
652 if (isa<T>(II->getOperand(1)))
653 continue;
654 std::optional<unsigned> Idx = getElementIndex(II);
655 if (!Idx) {
656 Res.reset();
657 return Res;
658 }
659 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
660 Res.reset(*Idx);
661 }
662 // TODO: Add analysis for shuffles here too.
663 if (V == Base) {
664 Res.reset();
665 } else {
666 SmallBitVector SubMask(UseMask.size(), false);
667 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
668 }
669 } else {
670 Res.reset();
671 }
672 return Res;
673 }
674 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
675 if (Constant *Elem = C->getAggregateElement(I))
676 if (!isa<T>(Elem) &&
677 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
678 Res.reset(I);
679 }
680 return Res;
681}
682
683/// Checks if the vector of instructions can be represented as a shuffle, like:
684/// %x0 = extractelement <4 x i8> %x, i32 0
685/// %x3 = extractelement <4 x i8> %x, i32 3
686/// %y1 = extractelement <4 x i8> %y, i32 1
687/// %y2 = extractelement <4 x i8> %y, i32 2
688/// %x0x0 = mul i8 %x0, %x0
689/// %x3x3 = mul i8 %x3, %x3
690/// %y1y1 = mul i8 %y1, %y1
691/// %y2y2 = mul i8 %y2, %y2
692/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
693/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
694/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
695/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
696/// ret <4 x i8> %ins4
697/// can be transformed into:
698/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
699/// i32 6>
700/// %2 = mul <4 x i8> %1, %1
701/// ret <4 x i8> %2
702/// Mask will return the Shuffle Mask equivalent to the extracted elements.
703/// TODO: Can we split off and reuse the shuffle mask detection from
704/// ShuffleVectorInst/getShuffleCost?
705static std::optional<TargetTransformInfo::ShuffleKind>
706 isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
707                      AssumptionCache *AC) {
708 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
709 if (It == VL.end())
710 return std::nullopt;
711 unsigned Size =
712 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
713 auto *EI = dyn_cast<ExtractElementInst>(V);
714 if (!EI)
715 return S;
716 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
717 if (!VTy)
718 return S;
719 return std::max(S, VTy->getNumElements());
720 });
721
722 Value *Vec1 = nullptr;
723 Value *Vec2 = nullptr;
724 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
725 auto *EE = dyn_cast<ExtractElementInst>(V);
726 if (!EE)
727 return false;
728 Value *Vec = EE->getVectorOperand();
729 if (isa<UndefValue>(Vec))
730 return false;
731 return isGuaranteedNotToBePoison(Vec, AC);
732 });
733 enum ShuffleMode { Unknown, Select, Permute };
734 ShuffleMode CommonShuffleMode = Unknown;
735 Mask.assign(VL.size(), PoisonMaskElem);
736 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
737 // Undef can be represented as an undef element in a vector.
738 if (isa<UndefValue>(VL[I]))
739 continue;
740 auto *EI = cast<ExtractElementInst>(VL[I]);
741 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
742 return std::nullopt;
743 auto *Vec = EI->getVectorOperand();
744 // We can extractelement from undef or poison vector.
745 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
746 continue;
747 // All vector operands must have the same number of vector elements.
748 if (isa<UndefValue>(Vec)) {
749 Mask[I] = I;
750 } else {
751 if (isa<UndefValue>(EI->getIndexOperand()))
752 continue;
753 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
754 if (!Idx)
755 return std::nullopt;
756 // Undefined behavior if Idx is negative or >= Size.
757 if (Idx->getValue().uge(Size))
758 continue;
759 unsigned IntIdx = Idx->getValue().getZExtValue();
760 Mask[I] = IntIdx;
761 }
762 if (isUndefVector(Vec).all() && HasNonUndefVec)
763 continue;
764 // For correct shuffling we have to have at most 2 different vector operands
765 // in all extractelement instructions.
766 if (!Vec1 || Vec1 == Vec) {
767 Vec1 = Vec;
768 } else if (!Vec2 || Vec2 == Vec) {
769 Vec2 = Vec;
770 Mask[I] += Size;
771 } else {
772 return std::nullopt;
773 }
774 if (CommonShuffleMode == Permute)
775 continue;
776 // If the extract index is not the same as the operation number, it is a
777 // permutation.
778 if (Mask[I] % Size != I) {
779 CommonShuffleMode = Permute;
780 continue;
781 }
782 CommonShuffleMode = Select;
783 }
784 // If we're not crossing lanes in different vectors, consider it as blending.
785 if (CommonShuffleMode == Select && Vec2)
786 return TargetTransformInfo::SK_Select;
787 // If Vec2 was never used, we have a permutation of a single vector; otherwise
788 // we have a permutation of 2 vectors.
789 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
790             : TargetTransformInfo::SK_PermuteSingleSrc;
791}
792
793/// \returns True if Extract{Value,Element} instruction extracts element Idx.
794static std::optional<unsigned> getExtractIndex(Instruction *E) {
795 unsigned Opcode = E->getOpcode();
796 assert((Opcode == Instruction::ExtractElement ||
797 Opcode == Instruction::ExtractValue) &&
798 "Expected extractelement or extractvalue instruction.");
799 if (Opcode == Instruction::ExtractElement) {
800 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
801 if (!CI)
802 return std::nullopt;
803 return CI->getZExtValue();
804 }
805 auto *EI = cast<ExtractValueInst>(E);
806 if (EI->getNumIndices() != 1)
807 return std::nullopt;
808 return *EI->idx_begin();
809}
810
811namespace {
812
813/// Main data required for vectorization of instructions.
814class InstructionsState {
815 /// The main/alternate instruction. MainOp is also VL0.
816 Instruction *MainOp = nullptr;
817 Instruction *AltOp = nullptr;
818
819public:
820 Instruction *getMainOp() const {
821 assert(valid() && "InstructionsState is invalid.");
822 return MainOp;
823 }
824
825 Instruction *getAltOp() const {
826 assert(valid() && "InstructionsState is invalid.");
827 return AltOp;
828 }
829
830 /// The main/alternate opcodes for the list of instructions.
831 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
832
833 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
834
835 /// Some of the instructions in the list have alternate opcodes.
836 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
837
838 bool isOpcodeOrAlt(Instruction *I) const {
839 unsigned CheckedOpcode = I->getOpcode();
840 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
841 }
842
843 /// Checks if the current state is valid, i.e. has non-null MainOp
844 bool valid() const { return MainOp && AltOp; }
845
846 explicit operator bool() const { return valid(); }
847
848 InstructionsState() = delete;
849 InstructionsState(Instruction *MainOp, Instruction *AltOp)
850 : MainOp(MainOp), AltOp(AltOp) {}
851 static InstructionsState invalid() { return {nullptr, nullptr}; }
852};
853
854} // end anonymous namespace
855
856/// \returns true if \p Opcode is allowed as part of the main/alternate
857/// instruction for SLP vectorization.
858///
859/// Example of unsupported opcode is SDIV that can potentially cause UB if the
860/// "shuffled out" lane would result in division by zero.
861static bool isValidForAlternation(unsigned Opcode) {
862 if (Instruction::isIntDivRem(Opcode))
863 return false;
864
865 return true;
866}
867
868static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
869 const TargetLibraryInfo &TLI);
870
871/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
872/// compatible instructions or constants, or just some other regular values.
873static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
874 Value *Op1, const TargetLibraryInfo &TLI) {
875 return (isConstant(BaseOp0) && isConstant(Op0)) ||
876 (isConstant(BaseOp1) && isConstant(Op1)) ||
877 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
878 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
879 BaseOp0 == Op0 || BaseOp1 == Op1 ||
880 getSameOpcode({BaseOp0, Op0}, TLI) ||
881 getSameOpcode({BaseOp1, Op1}, TLI);
882}
883
884/// \returns true if a compare instruction \p CI has similar "look" and
885/// same predicate as \p BaseCI, "as is" or with its operands and predicate
886/// swapped, false otherwise.
887static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
888 const TargetLibraryInfo &TLI) {
889 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
890 "Assessing comparisons of different types?");
891 CmpInst::Predicate BasePred = BaseCI->getPredicate();
892 CmpInst::Predicate Pred = CI->getPredicate();
893 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
894
895 Value *BaseOp0 = BaseCI->getOperand(0);
896 Value *BaseOp1 = BaseCI->getOperand(1);
897 Value *Op0 = CI->getOperand(0);
898 Value *Op1 = CI->getOperand(1);
899
900 return (BasePred == Pred &&
901 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
902 (BasePred == SwappedPred &&
903 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
904}
905
906/// \returns analysis of the Instructions in \p VL described in
907/// InstructionsState, the Opcode that we suppose the whole list
908/// could be vectorized even if its structure is diverse.
909static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
910 const TargetLibraryInfo &TLI) {
911 // Make sure these are all Instructions.
912 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
913 return InstructionsState::invalid();
914
915 auto *It = find_if(VL, IsaPred<Instruction>);
916 if (It == VL.end())
917 return InstructionsState::invalid();
918
919 Instruction *MainOp = cast<Instruction>(*It);
920 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
921 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
922 (VL.size() == 2 && InstCnt < 2))
923 return InstructionsState::invalid();
924
925 bool IsCastOp = isa<CastInst>(MainOp);
926 bool IsBinOp = isa<BinaryOperator>(MainOp);
927 bool IsCmpOp = isa<CmpInst>(MainOp);
928 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
929                                           : CmpInst::BAD_ICMP_PREDICATE;
930 Instruction *AltOp = MainOp;
931 unsigned Opcode = MainOp->getOpcode();
932 unsigned AltOpcode = Opcode;
933
934 bool SwappedPredsCompatible = IsCmpOp && [&]() {
935 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
936 UniquePreds.insert(BasePred);
937 UniqueNonSwappedPreds.insert(BasePred);
938 for (Value *V : VL) {
939 auto *I = dyn_cast<CmpInst>(V);
940 if (!I)
941 return false;
942 CmpInst::Predicate CurrentPred = I->getPredicate();
943 CmpInst::Predicate SwappedCurrentPred =
944 CmpInst::getSwappedPredicate(CurrentPred);
945 UniqueNonSwappedPreds.insert(CurrentPred);
946 if (!UniquePreds.contains(CurrentPred) &&
947 !UniquePreds.contains(SwappedCurrentPred))
948 UniquePreds.insert(CurrentPred);
949 }
950 // The total number of predicates is > 2, but if swapped predicates are
951 // considered compatible there are only 2; treat the swappable predicates as
952 // compatible opcodes, not as alternates.
953 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
954 }();
955 // Check for one alternate opcode from another BinaryOperator.
956 // TODO - generalize to support all operators (types, calls etc.).
957 Intrinsic::ID BaseID = 0;
958 SmallVector<VFInfo> BaseMappings;
959 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
960 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
961 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
962 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
963 return InstructionsState::invalid();
964 }
965 bool AnyPoison = InstCnt != VL.size();
966 // Skip MainOp.
967 for (Value *V : iterator_range(It + 1, VL.end())) {
968 auto *I = dyn_cast<Instruction>(V);
969 if (!I)
970 continue;
971
972 // Cannot combine poison and divisions.
973 // TODO: do some smart analysis of the CallInsts to exclude divide-like
974 // intrinsics/functions only.
975 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
976 return InstructionsState::invalid();
977 unsigned InstOpcode = I->getOpcode();
978 if (IsBinOp && isa<BinaryOperator>(I)) {
979 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
980 continue;
981 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
982 isValidForAlternation(Opcode)) {
983 AltOpcode = InstOpcode;
984 AltOp = I;
985 continue;
986 }
987 } else if (IsCastOp && isa<CastInst>(I)) {
988 Value *Op0 = MainOp->getOperand(0);
989 Type *Ty0 = Op0->getType();
990 Value *Op1 = I->getOperand(0);
991 Type *Ty1 = Op1->getType();
992 if (Ty0 == Ty1) {
993 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
994 continue;
995 if (Opcode == AltOpcode) {
996 assert(isValidForAlternation(Opcode) &&
997        isValidForAlternation(InstOpcode) &&
998 "Cast isn't safe for alternation, logic needs to be updated!");
999 AltOpcode = InstOpcode;
1000 AltOp = I;
1001 continue;
1002 }
1003 }
1004 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1005 auto *BaseInst = cast<CmpInst>(MainOp);
1006 Type *Ty0 = BaseInst->getOperand(0)->getType();
1007 Type *Ty1 = Inst->getOperand(0)->getType();
1008 if (Ty0 == Ty1) {
1009 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1010 assert(InstOpcode == AltOpcode &&
1011 "Alternate instructions are only supported by BinaryOperator "
1012 "and CastInst.");
1013 // Check for compatible operands. If the corresponding operands are not
1014 // compatible - need to perform alternate vectorization.
1015 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1016 CmpInst::Predicate SwappedCurrentPred =
1017 CmpInst::getSwappedPredicate(CurrentPred);
1018
1019 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1020 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1021 continue;
1022
1023 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1024 continue;
1025 auto *AltInst = cast<CmpInst>(AltOp);
1026 if (MainOp != AltOp) {
1027 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1028 continue;
1029 } else if (BasePred != CurrentPred) {
1030 assert(
1031 isValidForAlternation(InstOpcode) &&
1032 "CmpInst isn't safe for alternation, logic needs to be updated!");
1033 AltOp = I;
1034 continue;
1035 }
1036 CmpInst::Predicate AltPred = AltInst->getPredicate();
1037 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1038 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1039 continue;
1040 }
1041 } else if (InstOpcode == Opcode) {
1042 assert(InstOpcode == AltOpcode &&
1043 "Alternate instructions are only supported by BinaryOperator and "
1044 "CastInst.");
1045 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1046 if (Gep->getNumOperands() != 2 ||
1047 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1048 return InstructionsState::invalid();
1049 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1050 if (!isVectorLikeInstWithConstOps(EI))
1051 return InstructionsState::invalid();
1052 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1053 auto *BaseLI = cast<LoadInst>(MainOp);
1054 if (!LI->isSimple() || !BaseLI->isSimple())
1055 return InstructionsState::invalid();
1056 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1057 auto *CallBase = cast<CallInst>(MainOp);
1058 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1059 return InstructionsState::invalid();
1060 if (Call->hasOperandBundles() &&
1061     (!CallBase->hasOperandBundles() ||
1062      !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1063                  Call->op_begin() + Call->getBundleOperandsEndIndex(),
1064                  CallBase->op_begin() +
1065                      CallBase->getBundleOperandsStartIndex())))
1066 return InstructionsState::invalid();
1067 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1068 if (ID != BaseID)
1069 return InstructionsState::invalid();
1070 if (!ID) {
1071 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1072 if (Mappings.size() != BaseMappings.size() ||
1073 Mappings.front().ISA != BaseMappings.front().ISA ||
1074 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1075 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1076 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1077 Mappings.front().Shape.Parameters !=
1078 BaseMappings.front().Shape.Parameters)
1079 return InstructionsState::invalid();
1080 }
1081 }
1082 continue;
1083 }
1084 return InstructionsState::invalid();
1085 }
1086
1087 return InstructionsState(MainOp, AltOp);
1088}
1089
1090/// \returns true if all of the values in \p VL have the same type or false
1091/// otherwise.
1092 static bool allSameType(ArrayRef<Value *> VL) {
1093 Type *Ty = VL.front()->getType();
1094 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1095}
1096
1097/// \returns True if in-tree use also needs extract. This refers to
1098/// possible scalar operand in vectorized instruction.
1099static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1100 TargetLibraryInfo *TLI,
1101 const TargetTransformInfo *TTI) {
1102 if (!UserInst)
1103 return false;
1104 unsigned Opcode = UserInst->getOpcode();
1105 switch (Opcode) {
1106 case Instruction::Load: {
1107 LoadInst *LI = cast<LoadInst>(UserInst);
1108 return (LI->getPointerOperand() == Scalar);
1109 }
1110 case Instruction::Store: {
1111 StoreInst *SI = cast<StoreInst>(UserInst);
1112 return (SI->getPointerOperand() == Scalar);
1113 }
1114 case Instruction::Call: {
1115 CallInst *CI = cast<CallInst>(UserInst);
1116 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1117 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1118 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1119 Arg.value().get() == Scalar;
1120 });
1121 }
1122 default:
1123 return false;
1124 }
1125}
1126
1127 /// \returns the AA location that is being accessed by the instruction.
1128 static MemoryLocation getLocation(Instruction *I) {
1129 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1130 return MemoryLocation::get(SI);
1131 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1132 return MemoryLocation::get(LI);
1133 return MemoryLocation();
1134}
1135
1136/// \returns True if the instruction is not a volatile or atomic load/store.
1137static bool isSimple(Instruction *I) {
1138 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1139 return LI->isSimple();
1140 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1141 return SI->isSimple();
1142 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1143 return !MI->isVolatile();
1144 return true;
1145}
1146
1147/// Shuffles \p Mask in accordance with the given \p SubMask.
1148/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1149/// one but two input vectors.
1150static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1151 bool ExtendingManyInputs = false) {
1152 if (SubMask.empty())
1153 return;
1154 assert(
1155 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1156 // Check if input scalars were extended to match the size of other node.
1157 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1158 "SubMask with many inputs support must be larger than the mask.");
1159 if (Mask.empty()) {
1160 Mask.append(SubMask.begin(), SubMask.end());
1161 return;
1162 }
1163 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1164 int TermValue = std::min(Mask.size(), SubMask.size());
1165 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1166 if (SubMask[I] == PoisonMaskElem ||
1167 (!ExtendingManyInputs &&
1168 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1169 continue;
1170 NewMask[I] = Mask[SubMask[I]];
1171 }
1172 Mask.swap(NewMask);
1173}
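// For example, Mask == {3, 1, 2, 0} combined with SubMask == {1, 0, 3, 2}
// yields {1, 3, 0, 2}, i.e. SubMask is applied on top of the existing Mask.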
1174
1175 /// Order may have elements assigned a special value (size) which is out of
1176 /// bounds. Such indices only appear in places which correspond to undef values
1177 /// (see canReuseExtract for details) and are used to prevent undef values from
1178 /// affecting operand ordering.
1179 /// The first loop below simply finds all unused indices and then the next loop
1180 /// nest assigns these indices to the positions of the undef values.
1181/// As an example below Order has two undef positions and they have assigned
1182/// values 3 and 7 respectively:
1183/// before: 6 9 5 4 9 2 1 0
1184/// after: 6 3 5 4 7 2 1 0
1185 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1186 const unsigned Sz = Order.size();
1187 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1188 SmallBitVector MaskedIndices(Sz);
1189 for (unsigned I = 0; I < Sz; ++I) {
1190 if (Order[I] < Sz)
1191 UnusedIndices.reset(Order[I]);
1192 else
1193 MaskedIndices.set(I);
1194 }
1195 if (MaskedIndices.none())
1196 return;
1197 assert(UnusedIndices.count() == MaskedIndices.count() &&
1198 "Non-synced masked/available indices.");
1199 int Idx = UnusedIndices.find_first();
1200 int MIdx = MaskedIndices.find_first();
1201 while (MIdx >= 0) {
1202 assert(Idx >= 0 && "Indices must be synced.");
1203 Order[MIdx] = Idx;
1204 Idx = UnusedIndices.find_next(Idx);
1205 MIdx = MaskedIndices.find_next(MIdx);
1206 }
1207}
1208
1209/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1210/// Opcode1.
1211 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1212                                       unsigned Opcode1) {
1213 Type *ScalarTy = VL[0]->getType();
1214 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1215 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1216 for (unsigned Lane : seq<unsigned>(VL.size())) {
1217 if (isa<PoisonValue>(VL[Lane]))
1218 continue;
1219 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1220 OpcodeMask.set(Lane * ScalarTyNumElements,
1221 Lane * ScalarTyNumElements + ScalarTyNumElements);
1222 }
1223 return OpcodeMask;
1224}
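// For example, for scalar i32 values {add, sub, add, sub} with Opcode0 == Add
// and Opcode1 == Sub, the resulting bitset is {0, 1, 0, 1}.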
1225
1226namespace llvm {
1227
1229 SmallVectorImpl<int> &Mask) {
1230 Mask.clear();
1231 const unsigned E = Indices.size();
1232 Mask.resize(E, PoisonMaskElem);
1233 for (unsigned I = 0; I < E; ++I)
1234 Mask[Indices[I]] = I;
1235}
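// For example, Indices == {2, 0, 1} produces Mask == {1, 2, 0}, so that
// Mask[Indices[I]] == I and applying Mask undoes the permutation.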
1236
1237/// Reorders the list of scalars in accordance with the given \p Mask.
1238 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1239                            ArrayRef<int> Mask) {
1240 assert(!Mask.empty() && "Expected non-empty mask.");
1241 SmallVector<Value *> Prev(Scalars.size(),
1242 PoisonValue::get(Scalars.front()->getType()));
1243 Prev.swap(Scalars);
1244 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1245 if (Mask[I] != PoisonMaskElem)
1246 Scalars[Mask[I]] = Prev[I];
1247}
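// For example, Scalars == {a, b, c} with Mask == {2, 0, 1} becomes {b, c, a},
// since Scalars[Mask[I]] is assigned the previous element at position I.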
1248
1249/// Checks if the provided value does not require scheduling. It does not
1250/// require scheduling if this is not an instruction or it is an instruction
1251/// that does not read/write memory and all operands are either not instructions
1252/// or phi nodes or instructions from different blocks.
1253 static bool areAllOperandsNonInsts(Value *V) {
1254 auto *I = dyn_cast<Instruction>(V);
1255 if (!I)
1256 return true;
1257 return !mayHaveNonDefUseDependency(*I) &&
1258 all_of(I->operands(), [I](Value *V) {
1259 auto *IO = dyn_cast<Instruction>(V);
1260 if (!IO)
1261 return true;
1262 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1263 });
1264}
1265
1266/// Checks if the provided value does not require scheduling. It does not
1267/// require scheduling if this is not an instruction or it is an instruction
1268/// that does not read/write memory and all users are phi nodes or instructions
1269/// from the different blocks.
1270static bool isUsedOutsideBlock(Value *V) {
1271 auto *I = dyn_cast<Instruction>(V);
1272 if (!I)
1273 return true;
1274 // Limits the number of uses to save compile time.
1275 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1276 all_of(I->users(), [I](User *U) {
1277 auto *IU = dyn_cast<Instruction>(U);
1278 if (!IU)
1279 return true;
1280 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1281 });
1282}
1283
1284/// Checks if the specified value does not require scheduling. It does not
1285/// require scheduling if all operands and all users do not need to be scheduled
1286/// in the current basic block.
1287 static bool doesNotNeedToBeScheduled(Value *V) {
1288 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1289}
1290
1291 /// Checks if the specified array of instructions does not require scheduling.
1292 /// This is the case if either all instructions have operands that do not
1293 /// require scheduling, or all of their users do not require scheduling because
1294 /// they are phis or live in other basic blocks.
1295 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1296 return !VL.empty() &&
1297        (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1298}
1299
1300/// Returns true if widened type of \p Ty elements with size \p Sz represents
1301/// full vector type, i.e. adding extra element results in extra parts upon type
1302/// legalization.
1303 static bool hasFullVectorsOrPowersOf2(const TargetTransformInfo &TTI, Type *Ty,
1304                                       unsigned Sz) {
1305 if (Sz <= 1)
1306 return false;
1307 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1308 return false;
1309 if (has_single_bit(Sz))
1310 return true;
1311 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1312 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1313 Sz % NumParts == 0;
1314}
1315
1316namespace slpvectorizer {
1317
1318/// Bottom Up SLP Vectorizer.
1319class BoUpSLP {
1320 struct TreeEntry;
1321 struct ScheduleData;
1324
1325public:
1326 /// Tracks the state we can represent the loads in the given sequence.
1327 enum class LoadsState {
1328 Gather,
1329 Vectorize,
1330 ScatterVectorize,
1331 StridedVectorize,
1332 };
1333
1340
1341 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1342         TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1343         DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1344         const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1345 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1346 AC(AC), DB(DB), DL(DL), ORE(ORE),
1347 Builder(Se->getContext(), TargetFolder(*DL)) {
1348 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1349 // Use the vector register size specified by the target unless overridden
1350 // by a command-line option.
1351 // TODO: It would be better to limit the vectorization factor based on
1352 // data type rather than just register size. For example, x86 AVX has
1353 // 256-bit registers, but it does not support integer operations
1354 // at that width (that requires AVX2).
1355 if (MaxVectorRegSizeOption.getNumOccurrences())
1356 MaxVecRegSize = MaxVectorRegSizeOption;
1357 else
1358 MaxVecRegSize =
1359 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1360     .getFixedValue();
1361
1362 if (MinVectorRegSizeOption.getNumOccurrences())
1363 MinVecRegSize = MinVectorRegSizeOption;
1364 else
1365 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1366 }
1367
1368 /// Vectorize the tree that starts with the elements in \p VL.
1369 /// Returns the vectorized root.
1370 Value *vectorizeTree();
1371
1372 /// Vectorize the tree but with the list of externally used values \p
1373 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1374 /// generated extractvalue instructions.
1375 Value *
1376 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1377 Instruction *ReductionRoot = nullptr);
1378
1379 /// \returns the cost incurred by unwanted spills and fills, caused by
1380 /// holding live values over call sites.
1382
1383 /// \returns the vectorization cost of the subtree that starts at \p VL.
1384 /// A negative number means that this is profitable.
1385 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {});
1386
1387 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1388 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1389 void buildTree(ArrayRef<Value *> Roots,
1390 const SmallDenseSet<Value *> &UserIgnoreLst);
1391
1392 /// Construct a vectorizable tree that starts at \p Roots.
1393 void buildTree(ArrayRef<Value *> Roots);
1394
1395 /// Returns whether the root node has in-tree uses.
1397 return !VectorizableTree.empty() &&
1398 !VectorizableTree.front()->UserTreeIndices.empty();
1399 }
1400
1401 /// Return the scalars of the root node.
1403 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1404 return VectorizableTree.front()->Scalars;
1405 }
1406
1407 /// Returns the type/is-signed info for the root node in the graph without
1408 /// casting.
1409 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
1410 const TreeEntry &Root = *VectorizableTree.front().get();
1411 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1412 !Root.Scalars.front()->getType()->isIntegerTy())
1413 return std::nullopt;
1414 auto It = MinBWs.find(&Root);
1415 if (It != MinBWs.end())
1416 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
1417 It->second.first),
1418 It->second.second);
1419 if (Root.getOpcode() == Instruction::ZExt ||
1420 Root.getOpcode() == Instruction::SExt)
1421 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1422 Root.getOpcode() == Instruction::SExt);
1423 return std::nullopt;
1424 }
1425
1426 /// Checks if the root graph node can be emitted with narrower bitwidth at
1427 /// codegen and returns it signedness, if so.
1429 return MinBWs.at(VectorizableTree.front().get()).second;
1430 }
1431
1432 /// Returns reduction type after minbitdth analysis.
1434 if (ReductionBitWidth == 0 ||
1435 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1436 ReductionBitWidth >=
1437 DL->getTypeSizeInBits(
1438 VectorizableTree.front()->Scalars.front()->getType()))
1439 return getWidenedType(
1440 VectorizableTree.front()->Scalars.front()->getType(),
1441 VectorizableTree.front()->getVectorFactor());
1442 return getWidenedType(
1444 VectorizableTree.front()->Scalars.front()->getContext(),
1445 ReductionBitWidth),
1446 VectorizableTree.front()->getVectorFactor());
1447 }
1448
1449 /// Builds external uses of the vectorized scalars, i.e. the list of
1450 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1451 /// ExternallyUsedValues contains additional list of external uses to handle
1452 /// vectorization of reductions.
1453 void
1454 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1455
1456 /// Transforms graph nodes to target specific representations, if profitable.
1457 void transformNodes();
1458
1459 /// Clear the internal data structures that are created by 'buildTree'.
1460 void deleteTree() {
1461 VectorizableTree.clear();
1462 ScalarToTreeEntry.clear();
1463 MultiNodeScalars.clear();
1464 MustGather.clear();
1465 NonScheduledFirst.clear();
1466 EntryToLastInstruction.clear();
1467 LoadEntriesToVectorize.clear();
1468 IsGraphTransformMode = false;
1469 GatheredLoadsEntriesFirst.reset();
1470 ExternalUses.clear();
1471 ExternalUsesAsOriginalScalar.clear();
1472 for (auto &Iter : BlocksSchedules) {
1473 BlockScheduling *BS = Iter.second.get();
1474 BS->clear();
1475 }
1476 MinBWs.clear();
1477 ReductionBitWidth = 0;
1478 BaseGraphSize = 1;
1479 CastMaxMinBWSizes.reset();
1480 ExtraBitWidthNodes.clear();
1481 InstrElementSize.clear();
1482 UserIgnoreList = nullptr;
1483 PostponedGathers.clear();
1484 ValueToGatherNodes.clear();
1485 }
1486
1487 unsigned getTreeSize() const { return VectorizableTree.size(); }
1488
1489 /// Returns the base graph size, before any transformations.
1490 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1491
1492 /// Perform LICM and CSE on the newly generated gather sequences.
1494
1495 /// Does this non-empty order represent an identity order? Identity
1496 /// should be represented as an empty order, so this is used to
1497 /// decide if we can canonicalize a computed order. Undef elements
1498 /// (represented as size) are ignored.
1500 assert(!Order.empty() && "expected non-empty order");
1501 const unsigned Sz = Order.size();
1502 return all_of(enumerate(Order), [&](const auto &P) {
1503 return P.value() == P.index() || P.value() == Sz;
1504 });
1505 }
1506
1507 /// Checks if the specified gather tree entry \p TE can be represented as a
1508 /// shuffled vector entry + (possibly) permutation with other gathers. It
1509 /// implements the checks only for possibly ordered scalars (Loads,
1510 /// ExtractElement, ExtractValue), which can be part of the graph.
1511 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1512
1513 /// Sort loads into increasing pointers offsets to allow greater clustering.
1514 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1515
1516 /// Gets reordering data for the given tree entry. If the entry is vectorized
1517 /// - just return ReorderIndices, otherwise check if the scalars can be
1518 /// reordered and return the most optimal order.
1519 /// \return std::nullopt if ordering is not important, empty order, if
1520 /// identity order is important, or the actual order.
1521 /// \param TopToBottom If true, include the order of vectorized stores and
1522 /// insertelement nodes, otherwise skip them.
1523 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1524 bool TopToBottom);
1525
1526 /// Reorders the current graph to the most profitable order starting from the
1527 /// root node to the leaf nodes. The best order is chosen only from the nodes
1528 /// of the same size (vectorization factor). Smaller nodes are considered
1529 /// parts of subgraph with smaller VF and they are reordered independently. We
1530 /// can make it because we still need to extend smaller nodes to the wider VF
1531 /// and we can merge reordering shuffles with the widening shuffles.
1532 void reorderTopToBottom();
1533
1534 /// Reorders the current graph to the most profitable order starting from
1535 /// leaves to the root. It allows rotating small subgraphs and reduces the
1536 /// number of reshuffles if the leaf nodes use the same order. In this case we
1537 /// can merge the orders and just shuffle the user node instead of shuffling its
1538 /// operands. Plus, even if the leaf nodes have different orders, it allows
1539 /// sinking reordering in the graph closer to the root node and merging it later
1540 /// during analysis.
1541 void reorderBottomToTop(bool IgnoreReorder = false);
1542
1543 /// \return The vector element size in bits to use when vectorizing the
1544 /// expression tree ending at \p V. If V is a store, the size is the width of
1545 /// the stored value. Otherwise, the size is the width of the largest loaded
1546 /// value reaching V. This method is used by the vectorizer to calculate
1547 /// vectorization factors.
1548 unsigned getVectorElementSize(Value *V);
1549
1550 /// Compute the minimum type sizes required to represent the entries in a
1551 /// vectorizable tree.
1553
1554 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1555 unsigned getMaxVecRegSize() const {
1556 return MaxVecRegSize;
1557 }
1558
1559 // \returns minimum vector register size as set by cl::opt.
1560 unsigned getMinVecRegSize() const {
1561 return MinVecRegSize;
1562 }
1563
1564 unsigned getMinVF(unsigned Sz) const {
1565 return std::max(2U, getMinVecRegSize() / Sz);
1566 }
1567
1568 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1569 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1570 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1571 return MaxVF ? MaxVF : UINT_MAX;
1572 }
1573
1574 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1575 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1576 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1577 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1578 ///
1579 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1580 unsigned canMapToVector(Type *T) const;
1581
1582 /// \returns True if the VectorizableTree is both tiny and not fully
1583 /// vectorizable. We do not vectorize such trees.
1584 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1585
1586 /// Checks if the graph and all its subgraphs cannot be better vectorized.
1587 /// It may happen if all gather nodes are loads and they cannot be
1588 /// "clusterized". In this case even subgraphs cannot be vectorized more
1589 /// effectively than the base graph.
1590 bool isTreeNotExtendable() const;
1591
1592 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1593 /// can be load combined in the backend. Load combining may not be allowed in
1594 /// the IR optimizer, so we do not want to alter the pattern. For example,
1595 /// partially transforming a scalar bswap() pattern into vector code is
1596 /// effectively impossible for the backend to undo.
1597 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1598 /// may not be necessary.
1599 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1600
1601 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1602 /// can be load combined in the backend. Load combining may not be allowed in
1603 /// the IR optimizer, so we do not want to alter the pattern. For example,
1604 /// partially transforming a scalar bswap() pattern into vector code is
1605 /// effectively impossible for the backend to undo.
1606 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1607 /// may not be necessary.
1608 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1609
1610 /// Checks if the given array of loads can be represented as a vectorized
1611 /// load, a scatter, or just a simple gather.
1612 /// \param VL list of loads.
1613 /// \param VL0 main load value.
1614 /// \param Order returned order of load instructions.
1615 /// \param PointerOps returned list of pointer operands.
1616 /// \param BestVF returns the best vector factor, if the recursive check found
1617 /// better vectorization sequences than a masked gather.
1618 /// \param TryRecursiveCheck used to check if a long masked gather can be
1619 /// represented as a series of loads/insert-subvector operations, if profitable.
1620 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1621 SmallVectorImpl<unsigned> &Order,
1622 SmallVectorImpl<Value *> &PointerOps,
1623 unsigned *BestVF = nullptr,
1624 bool TryRecursiveCheck = true) const;
1625
1626 /// Registers a non-vectorizable sequence of loads.
1627 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
1628 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1629 }
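 // Note that the whole sequence is hashed, so only an identical list of loads
 // (same values in the same order) will later match the contains() lookup
 // below.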
1630
1631 /// Checks if the given sequence of loads is known to be non-vectorizable.
1632 template <typename T>
1633 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
1634 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1635 }
1636
1638
1639 /// This structure holds any data we need about the edges being traversed
1640 /// during buildTree_rec(). We keep track of:
1641 /// (i) the user TreeEntry index, and
1642 /// (ii) the index of the edge.
1643 struct EdgeInfo {
1644 EdgeInfo() = default;
1645 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1646 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1647 /// The user TreeEntry.
1648 TreeEntry *UserTE = nullptr;
1649 /// The operand index of the use.
1650 unsigned EdgeIdx = UINT_MAX;
1651#ifndef NDEBUG
1652 friend inline raw_ostream &operator<<(raw_ostream &OS,
1653 const BoUpSLP::EdgeInfo &EI) {
1654 EI.dump(OS);
1655 return OS;
1656 }
1657 /// Debug print.
1658 void dump(raw_ostream &OS) const {
1659 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1660 << " EdgeIdx:" << EdgeIdx << "}";
1661 }
1662 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1663#endif
1664 bool operator == (const EdgeInfo &Other) const {
1665 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1666 }
1667 };
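 // For example, EdgeInfo(UserTE, 1) identifies the edge from the user tree
 // entry UserTE to the node built for operand index 1 (the second operand) of
 // its instructions.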
1668
1669 /// A helper class used for scoring candidates for two consecutive lanes.
1670 class LookAheadHeuristics {
1671 const TargetLibraryInfo &TLI;
1672 const DataLayout &DL;
1673 ScalarEvolution &SE;
1674 const BoUpSLP &R;
1675 int NumLanes; // Total number of lanes (aka vectorization factor).
1676 int MaxLevel; // The maximum recursion depth for accumulating score.
1677
1678 public:
1679 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1680 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1681 int MaxLevel)
1682 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1683 MaxLevel(MaxLevel) {}
1684
1685 // The hard-coded scores listed here are not very important, though they
1686 // should be higher for better matches to improve the resulting cost. When
1687 // computing the scores of matching one sub-tree with another, we are
1688 // basically counting the number of values that are matching. So even if all
1689 // scores are set to 1, we would still get a decent matching result.
1690 // However, sometimes we have to break ties. For example we may have to
1691 // choose between matching loads vs matching opcodes. This is what these
1692 // scores are helping us with: they provide the order of preference. Also,
1693 // this is important if the scalar is externally used or used in another
1694 // tree entry node in the different lane.
1695
1696 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1697 static const int ScoreConsecutiveLoads = 4;
1698 /// The same load multiple times. This should have a better score than
1699 /// `ScoreSplat` because in x86, for a 2-lane vector, we can represent it
1700 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1701 /// a vector load and 1.0 for a broadcast.
1702 static const int ScoreSplatLoads = 3;
1703 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1704 static const int ScoreReversedLoads = 3;
1705 /// A load candidate for masked gather.
1706 static const int ScoreMaskedGatherCandidate = 1;
1707 /// ExtractElementInst from same vector and consecutive indexes.
1708 static const int ScoreConsecutiveExtracts = 4;
1709 /// ExtractElementInst from same vector and reversed indices.
1710 static const int ScoreReversedExtracts = 3;
1711 /// Constants.
1712 static const int ScoreConstants = 2;
1713 /// Instructions with the same opcode.
1714 static const int ScoreSameOpcode = 2;
1715 /// Instructions with alt opcodes (e.g., add + sub).
1716 static const int ScoreAltOpcodes = 1;
1717 /// Identical instructions (a.k.a. splat or broadcast).
1718 static const int ScoreSplat = 1;
1719 /// Matching with an undef is preferable to failing.
1720 static const int ScoreUndef = 1;
1721 /// Score for failing to find a decent match.
1722 static const int ScoreFail = 0;
1723 /// Score if all users are vectorized.
1724 static const int ScoreAllUserVectorized = 1;
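 // Reading the table above: e.g. two loads from A[i] and A[i+1] score
 // ScoreConsecutiveLoads, two constants score ScoreConstants, and a pair that
 // cannot be matched at all scores ScoreFail.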
1725
1726 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1727 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1728 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1729 /// MainAltOps.
1730 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1731 ArrayRef<Value *> MainAltOps) const {
1732 if (!isValidElementType(V1->getType()) ||
1733 !isValidElementType(V2->getType()))
1734 return LookAheadHeuristics::ScoreFail;
1735
1736 if (V1 == V2) {
1737 if (isa<LoadInst>(V1)) {
1738 // Returns true if the users of V1 and V2 won't need to be extracted.
1739 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1740 // Bail out if we have too many uses to save compilation time.
1741 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1742 return false;
1743
1744 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1745 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1746 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1747 });
1748 };
1749 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1750 };
1751 // A broadcast of a load can be cheaper on some targets.
1752 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1753 ElementCount::getFixed(NumLanes)) &&
1754 ((int)V1->getNumUses() == NumLanes ||
1755 AllUsersAreInternal(V1, V2)))
1756 return LookAheadHeuristics::ScoreSplatLoads;
1757 }
1758 return LookAheadHeuristics::ScoreSplat;
1759 }
1760
1761 auto CheckSameEntryOrFail = [&]() {
1762 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1763 TE1 && TE1 == R.getTreeEntry(V2))
1764 return LookAheadHeuristics::ScoreSplatLoads;
1765 return LookAheadHeuristics::ScoreFail;
1766 };
1767
1768 auto *LI1 = dyn_cast<LoadInst>(V1);
1769 auto *LI2 = dyn_cast<LoadInst>(V2);
1770 if (LI1 && LI2) {
1771 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1772 !LI2->isSimple())
1773 return CheckSameEntryOrFail();
1774
1775 std::optional<int> Dist = getPointersDiff(
1776 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1777 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1778 if (!Dist || *Dist == 0) {
1779 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1780 getUnderlyingObject(LI2->getPointerOperand()) &&
1781 R.TTI->isLegalMaskedGather(
1782 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1783 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1784 return CheckSameEntryOrFail();
1785 }
1786 // The distance is too large - still may be profitable to use masked
1787 // loads/gathers.
1788 if (std::abs(*Dist) > NumLanes / 2)
1789 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1790 // This still will detect consecutive loads, but we might have "holes"
1791 // in some cases. It is ok for non-power-2 vectorization and may produce
1792 // better results. It should not affect current vectorization.
1793 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1794 : LookAheadHeuristics::ScoreReversedLoads;
1795 }
1796
1797 auto *C1 = dyn_cast<Constant>(V1);
1798 auto *C2 = dyn_cast<Constant>(V2);
1799 if (C1 && C2)
1800 return LookAheadHeuristics::ScoreConstants;
1801
1802 // Extracts from consecutive indexes of the same vector score better, as
1803 // the extracts could be optimized away.
1804 Value *EV1;
1805 ConstantInt *Ex1Idx;
1806 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1807 // Undefs are always profitable for extractelements.
1808 // Compiler can easily combine poison and extractelement <non-poison> or
1809 // undef and extractelement <poison>. But combining undef +
1810 // extractelement <non-poison-but-may-produce-poison> requires some
1811 // extra operations.
1812 if (isa<UndefValue>(V2))
1813 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1814 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1815 : LookAheadHeuristics::ScoreSameOpcode;
1816 Value *EV2 = nullptr;
1817 ConstantInt *Ex2Idx = nullptr;
1818 if (match(V2,
1819 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1820 m_Undef())))) {
1821 // Undefs are always profitable for extractelements.
1822 if (!Ex2Idx)
1823 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1824 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1825 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1826 if (EV2 == EV1) {
1827 int Idx1 = Ex1Idx->getZExtValue();
1828 int Idx2 = Ex2Idx->getZExtValue();
1829 int Dist = Idx2 - Idx1;
1830 // The distance is too large - still may be profitable to use
1831 // shuffles.
1832 if (std::abs(Dist) == 0)
1833 return LookAheadHeuristics::ScoreSplat;
1834 if (std::abs(Dist) > NumLanes / 2)
1835 return LookAheadHeuristics::ScoreSameOpcode;
1836 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1837 : LookAheadHeuristics::ScoreReversedExtracts;
1838 }
1839 return LookAheadHeuristics::ScoreAltOpcodes;
1840 }
1841 return CheckSameEntryOrFail();
1842 }
1843
1844 auto *I1 = dyn_cast<Instruction>(V1);
1845 auto *I2 = dyn_cast<Instruction>(V2);
1846 if (I1 && I2) {
1847 if (I1->getParent() != I2->getParent())
1848 return CheckSameEntryOrFail();
1849 SmallVector<Value *, 4> Ops(MainAltOps);
1850 Ops.push_back(I1);
1851 Ops.push_back(I2);
1852 InstructionsState S = getSameOpcode(Ops, TLI);
1853 // Note: Only consider instructions with <= 2 operands to avoid
1854 // complexity explosion.
1855 if (S &&
1856 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
1857 !S.isAltShuffle()) &&
1858 all_of(Ops, [&S](Value *V) {
1859 return isa<PoisonValue>(V) ||
1860 cast<Instruction>(V)->getNumOperands() ==
1861 S.getMainOp()->getNumOperands();
1862 }))
1863 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1864 : LookAheadHeuristics::ScoreSameOpcode;
1865 }
1866
1867 if (I1 && isa<PoisonValue>(V2))
1868 return LookAheadHeuristics::ScoreSameOpcode;
1869
1870 if (isa<UndefValue>(V2))
1871 return LookAheadHeuristics::ScoreUndef;
1872
1873 return CheckSameEntryOrFail();
1874 }
1875
1876 /// Go through the operands of \p LHS and \p RHS recursively until
1877 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1878 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1879 /// of \p U1 and \p U2), except at the beginning of the recursion where
1880 /// these are set to nullptr.
1881 ///
1882 /// For example:
1883 /// \verbatim
1884 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1885 /// \ / \ / \ / \ /
1886 /// + + + +
1887 /// G1 G2 G3 G4
1888 /// \endverbatim
1889 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1890 /// each level recursively, accumulating the score. It starts from matching
1891 /// the additions at level 0, then moves on to the loads (level 1). The
1892 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1893 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1894 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1895 /// Please note that the order of the operands does not matter, as we
1896 /// evaluate the score of all profitable combinations of operands. In
1897 /// other words the score of G1 and G4 is the same as G1 and G2. This
1898 /// heuristic is based on ideas described in:
1899 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1900 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1901 /// Luís F. W. Góes
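 /// As a rough worked example for the figure above (assuming A, B, C and D
 /// are distinct arrays): matching G1 with G2 scores ScoreSameOpcode for the
 /// two additions plus ScoreConsecutiveLoads for {A[0],A[1]} and again for
 /// {B[0],B[1]}, while matching G1 with G3 stops at the additions because
 /// {A[0],C[0]} and {B[0],D[0]} both score ScoreFail.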
1902 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1903 Instruction *U2, int CurrLevel,
1904 ArrayRef<Value *> MainAltOps) const {
1905
1906 // Get the shallow score of V1 and V2.
1907 int ShallowScoreAtThisLevel =
1908 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1909
1910 // If reached MaxLevel,
1911 // or if V1 and V2 are not instructions,
1912 // or if they are SPLAT,
1913 // or if they are not consecutive,
1914 // or if profitable to vectorize loads or extractelements, early return
1915 // the current cost.
1916 auto *I1 = dyn_cast<Instruction>(LHS);
1917 auto *I2 = dyn_cast<Instruction>(RHS);
1918 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1919 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1920 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1921 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1922 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1923 ShallowScoreAtThisLevel))
1924 return ShallowScoreAtThisLevel;
1925 assert(I1 && I2 && "Should have early exited.");
1926
1927 // Contains the I2 operand indexes that got matched with I1 operands.
1928 SmallSet<unsigned, 4> Op2Used;
1929
1930 // Recursion towards the operands of I1 and I2. We are trying all possible
1931 // operand pairs, and keeping track of the best score.
1932 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1933 OpIdx1 != NumOperands1; ++OpIdx1) {
1934 // Try to pair operand OpIdx1 of I1 with the best operand of I2.
1935 int MaxTmpScore = 0;
1936 unsigned MaxOpIdx2 = 0;
1937 bool FoundBest = false;
1938 // If I2 is commutative try all combinations.
1939 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1940 unsigned ToIdx = isCommutative(I2)
1941 ? I2->getNumOperands()
1942 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1943 assert(FromIdx <= ToIdx && "Bad index");
1944 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1945 // Skip operands already paired with OpIdx1.
1946 if (Op2Used.count(OpIdx2))
1947 continue;
1948 // Recursively calculate the cost at each level
1949 int TmpScore =
1950 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1951 I1, I2, CurrLevel + 1, {});
1952 // Look for the best score.
1953 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1954 TmpScore > MaxTmpScore) {
1955 MaxTmpScore = TmpScore;
1956 MaxOpIdx2 = OpIdx2;
1957 FoundBest = true;
1958 }
1959 }
1960 if (FoundBest) {
1961 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1962 Op2Used.insert(MaxOpIdx2);
1963 ShallowScoreAtThisLevel += MaxTmpScore;
1964 }
1965 }
1966 return ShallowScoreAtThisLevel;
1967 }
1968 };
1969 /// A helper data structure to hold the operands of a vector of instructions.
1970 /// This supports a fixed vector length for all operand vectors.
1971 class VLOperands {
1972 /// For each operand we need (i) the value, and (ii) the opcode that it
1973 /// would be attached to if the expression was in a left-linearized form.
1974 /// This is required to avoid illegal operand reordering.
1975 /// For example:
1976 /// \verbatim
1977 /// 0 Op1
1978 /// |/
1979 /// Op1 Op2 Linearized + Op2
1980 /// \ / ----------> |/
1981 /// - -
1982 ///
1983 /// Op1 - Op2 (0 + Op1) - Op2
1984 /// \endverbatim
1985 ///
1986 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1987 ///
1988 /// Another way to think of this is to track all the operations across the
1989 /// path from the operand all the way to the root of the tree and to
1990 /// calculate the operation that corresponds to this path. For example, the
1991 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1992 /// corresponding operation is a '-' (which matches the one in the
1993 /// linearized tree, as shown above).
1994 ///
1995 /// For lack of a better term, we refer to this operation as Accumulated
1996 /// Path Operation (APO).
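 /// Concretely, for an 'add' both operands get APO == false, while for a
 /// 'sub' the LHS gets APO == false and the RHS gets APO == true, matching
 /// the linearized form above.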
1997 struct OperandData {
1998 OperandData() = default;
1999 OperandData(Value *V, bool APO, bool IsUsed)
2000 : V(V), APO(APO), IsUsed(IsUsed) {}
2001 /// The operand value.
2002 Value *V = nullptr;
2003 /// TreeEntries only allow a single opcode, or an alternate sequence of
2004 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2005 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2006 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2007 /// (e.g., Add/Mul)
2008 bool APO = false;
2009 /// Helper data for the reordering function.
2010 bool IsUsed = false;
2011 };
2012
2013 /// During operand reordering, we are trying to select the operand at lane
2014 /// that matches best with the operand at the neighboring lane. Our
2015 /// selection is based on the type of value we are looking for. For example,
2016 /// if the neighboring lane has a load, we need to look for a load that is
2017 /// accessing a consecutive address. These strategies are summarized in the
2018 /// 'ReorderingMode' enumerator.
2019 enum class ReorderingMode {
2020 Load, ///< Matching loads to consecutive memory addresses
2021 Opcode, ///< Matching instructions based on opcode (same or alternate)
2022 Constant, ///< Matching constants
2023 Splat, ///< Matching the same instruction multiple times (broadcast)
2024 Failed, ///< We failed to create a vectorizable group
2025 };
2026
2027 using OperandDataVec = SmallVector<OperandData, 2>;
2028
2029 /// A vector of operand vectors.
2030 SmallVector<OperandDataVec, 4> OpsVec;
2031 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2032 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2033 unsigned ArgSize = 0;
2034
2035 const TargetLibraryInfo &TLI;
2036 const DataLayout &DL;
2037 ScalarEvolution &SE;
2038 const BoUpSLP &R;
2039 const Loop *L = nullptr;
2040
2041 /// \returns the operand data at \p OpIdx and \p Lane.
2042 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2043 return OpsVec[OpIdx][Lane];
2044 }
2045
2046 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2047 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2048 return OpsVec[OpIdx][Lane];
2049 }
2050
2051 /// Clears the used flag for all entries.
2052 void clearUsed() {
2053 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2054 OpIdx != NumOperands; ++OpIdx)
2055 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2056 ++Lane)
2057 OpsVec[OpIdx][Lane].IsUsed = false;
2058 }
2059
2060 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2061 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2062 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2063 }
2064
2065 /// \param Lane lane of the operands under analysis.
2066 /// \param OpIdx operand index in lane \p Lane for which we're looking for the
2067 /// best candidate.
2068 /// \param Idx operand index of the current candidate value.
2069 /// \returns The additional score due to possible broadcasting of the
2070 /// elements in the lane. It is more profitable to have power-of-2 unique
2071 /// elements in the lane, as it will be vectorized with higher probability
2072 /// after removing duplicates. Currently the SLP vectorizer supports only
2073 /// vectorization of the power-of-2 number of unique scalars.
2074 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2075 const SmallBitVector &UsedLanes) const {
2076 Value *IdxLaneV = getData(Idx, Lane).V;
2077 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2078 isa<ExtractElementInst>(IdxLaneV))
2079 return 0;
2080 SmallDenseMap<Value *, unsigned, 4> Uniques;
2081 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2082 if (Ln == Lane)
2083 continue;
2084 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2085 if (!isa<Instruction>(OpIdxLnV))
2086 return 0;
2087 Uniques.try_emplace(OpIdxLnV, Ln);
2088 }
2089 unsigned UniquesCount = Uniques.size();
2090 auto IdxIt = Uniques.find(IdxLaneV);
2091 unsigned UniquesCntWithIdxLaneV =
2092 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2093 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2094 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2095 unsigned UniquesCntWithOpIdxLaneV =
2096 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2097 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2098 return 0;
2099 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2100 UniquesCntWithOpIdxLaneV,
2101 UniquesCntWithOpIdxLaneV -
2102 bit_floor(UniquesCntWithOpIdxLaneV)) -
2103 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2104 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2105 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2106 }
2107
2108 /// \param Lane lane of the operands under analysis.
2109 /// \param OpIdx operand index in lane \p Lane for which we're looking for the
2110 /// best candidate.
2111 /// \param Idx operand index of the current candidate value.
2112 /// \returns The additional score for the scalar which users are all
2113 /// vectorized.
2114 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2115 Value *IdxLaneV = getData(Idx, Lane).V;
2116 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2117 // Do not care about number of uses for vector-like instructions
2118 // (extractelement/extractvalue with constant indices), they are extracts
2119 // themselves and already externally used. Vectorization of such
2120 // instructions does not add extra extractelement instruction, just may
2121 // remove it.
2122 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2123 isVectorLikeInstWithConstOps(OpIdxLaneV))
2124 return LookAheadHeuristics::ScoreAllUserVectorized;
2125 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2126 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2127 return 0;
2128 return R.areAllUsersVectorized(IdxLaneI)
2129 ? LookAheadHeuristics::ScoreAllUserVectorized
2130 : 0;
2131 }
2132
2133 /// Score scaling factor for fully compatible instructions but with
2134 /// different number of external uses. Allows better selection of the
2135 /// instructions with less external uses.
2136 static const int ScoreScaleFactor = 10;
2137
2138 /// \Returns the look-ahead score, which tells us how much the sub-trees
2139 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2140 /// score. This helps break ties in an informed way when we cannot decide on
2141 /// the order of the operands by just considering the immediate
2142 /// predecessors.
2143 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2144 int Lane, unsigned OpIdx, unsigned Idx,
2145 bool &IsUsed, const SmallBitVector &UsedLanes) {
2146 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2148 // Keep track of the instruction stack as we recurse into the operands
2149 // during the look-ahead score exploration.
2150 int Score =
2151 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2152 /*CurrLevel=*/1, MainAltOps);
2153 if (Score) {
2154 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2155 if (Score <= -SplatScore) {
2156 // Failed score.
2157 Score = 0;
2158 } else {
2159 Score += SplatScore;
2160 // Scale score to see the difference between different operands
2161 // and similar operands but all vectorized/not all vectorized
2162 // uses. It does not affect actual selection of the best
2163 // compatible operand in general, just allows to select the
2164 // operand with all vectorized uses.
2165 Score *= ScoreScaleFactor;
2166 Score += getExternalUseScore(Lane, OpIdx, Idx);
2167 IsUsed = true;
2168 }
2169 }
2170 return Score;
2171 }
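 // For example, a raw look-ahead score of 4 (consecutive loads) with no splat
 // adjustment becomes 4 * ScoreScaleFactor == 40, and the external-use bonus
 // is then added on top, so between otherwise equal candidates the one whose
 // users are all vectorized wins the tie.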
2172
2173 /// Best defined scores per lanes between the passes. Used to choose the
2174 /// best operand (with the highest score) between the passes.
2175 /// The key - {Operand Index, Lane}.
2176 /// The value - the best score between the passes for the lane and the
2177 /// operand.
2178 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2179 BestScoresPerLanes;
2180
2181 // Search all operands in Ops[*][Lane] for the one that matches best
2182 // Ops[OpIdx][LastLane] and return its operand index.
2183 // If no good match can be found, return std::nullopt.
2184 std::optional<unsigned>
2185 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2186 ArrayRef<ReorderingMode> ReorderingModes,
2187 ArrayRef<Value *> MainAltOps,
2188 const SmallBitVector &UsedLanes) {
2189 unsigned NumOperands = getNumOperands();
2190
2191 // The operand of the previous lane at OpIdx.
2192 Value *OpLastLane = getData(OpIdx, LastLane).V;
2193
2194 // Our strategy mode for OpIdx.
2195 ReorderingMode RMode = ReorderingModes[OpIdx];
2196 if (RMode == ReorderingMode::Failed)
2197 return std::nullopt;
2198
2199 // The linearized opcode of the operand at OpIdx, Lane.
2200 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2201
2202 // The best operand index and its score.
2203 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2204 // are using the score to differentiate between the two.
2205 struct BestOpData {
2206 std::optional<unsigned> Idx;
2207 unsigned Score = 0;
2208 } BestOp;
2209 BestOp.Score =
2210 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2211 .first->second;
2212
2213 // Track if the operand must be marked as used. If the operand is set to
2214 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2215 // want to reestimate the operands again on the following iterations).
2216 bool IsUsed = RMode == ReorderingMode::Splat ||
2217 RMode == ReorderingMode::Constant ||
2218 RMode == ReorderingMode::Load;
2219 // Iterate through all unused operands and look for the best.
2220 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2221 // Get the operand at Idx and Lane.
2222 OperandData &OpData = getData(Idx, Lane);
2223 Value *Op = OpData.V;
2224 bool OpAPO = OpData.APO;
2225
2226 // Skip already selected operands.
2227 if (OpData.IsUsed)
2228 continue;
2229
2230 // Skip if we are trying to move the operand to a position with a
2231 // different opcode in the linearized tree form. This would break the
2232 // semantics.
2233 if (OpAPO != OpIdxAPO)
2234 continue;
2235
2236 // Look for an operand that matches the current mode.
2237 switch (RMode) {
2238 case ReorderingMode::Load:
2239 case ReorderingMode::Opcode: {
2240 bool LeftToRight = Lane > LastLane;
2241 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2242 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2243 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2244 OpIdx, Idx, IsUsed, UsedLanes);
2245 if (Score > static_cast<int>(BestOp.Score) ||
2246 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2247 Idx == OpIdx)) {
2248 BestOp.Idx = Idx;
2249 BestOp.Score = Score;
2250 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2251 }
2252 break;
2253 }
2254 case ReorderingMode::Constant:
2255 if (isa<Constant>(Op) ||
2256 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2257 BestOp.Idx = Idx;
2258 if (isa<Constant>(Op)) {
2259 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2260 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2261 LookAheadHeuristics::ScoreConstants;
2262 }
2263 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2264 IsUsed = false;
2265 }
2266 break;
2267 case ReorderingMode::Splat:
2268 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2269 IsUsed = Op == OpLastLane;
2270 if (Op == OpLastLane) {
2271 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2272 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2273 LookAheadHeuristics::ScoreSplat;
2274 }
2275 BestOp.Idx = Idx;
2276 }
2277 break;
2278 case ReorderingMode::Failed:
2279 llvm_unreachable("Not expected Failed reordering mode.");
2280 }
2281 }
2282
2283 if (BestOp.Idx) {
2284 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2285 return BestOp.Idx;
2286 }
2287 // If we could not find a good match return std::nullopt.
2288 return std::nullopt;
2289 }
2290
2291 /// Helper for reorderOperandVecs.
2292 /// \returns the lane that we should start reordering from. This is the one
2293 /// which has the least number of operands that can freely move about, or is
2294 /// the least profitable because it already has the most optimal set of operands.
2295 unsigned getBestLaneToStartReordering() const {
2296 unsigned Min = UINT_MAX;
2297 unsigned SameOpNumber = 0;
2298 // std::pair<unsigned, unsigned> is used to implement a simple voting
2299 // algorithm and choose the lane with the least number of operands that
2300 // can freely move about or less profitable because it already has the
2301 // most optimal set of operands. The first unsigned is a counter for
2302 // voting, the second unsigned is the counter of lanes with instructions
2303 // with same/alternate opcodes and same parent basic block.
2304 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
2305 // Try to be closer to the original results, if we have multiple lanes
2306 // with same cost. If 2 lanes have the same cost, use the one with the
2307 // highest index.
2308 for (int I = getNumLanes(); I > 0; --I) {
2309 unsigned Lane = I - 1;
2310 OperandsOrderData NumFreeOpsHash =
2311 getMaxNumOperandsThatCanBeReordered(Lane);
2312 // Compare the number of operands that can move and choose the one with
2313 // the least number.
2314 if (NumFreeOpsHash.NumOfAPOs < Min) {
2315 Min = NumFreeOpsHash.NumOfAPOs;
2316 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2317 HashMap.clear();
2318 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2319 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2320 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2321 // Select the most optimal lane in terms of number of operands that
2322 // should be moved around.
2323 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2324 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2325 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2326 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2327 auto [It, Inserted] =
2328 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2329 if (!Inserted)
2330 ++It->second.first;
2331 }
2332 }
2333 // Select the lane with the minimum counter.
2334 unsigned BestLane = 0;
2335 unsigned CntMin = UINT_MAX;
2336 for (const auto &Data : reverse(HashMap)) {
2337 if (Data.second.first < CntMin) {
2338 CntMin = Data.second.first;
2339 BestLane = Data.second.second;
2340 }
2341 }
2342 return BestLane;
2343 }
2344
2345 /// Data structure that helps to reorder operands.
2346 struct OperandsOrderData {
2347 /// The best number of operands with the same APOs, which can be
2348 /// reordered.
2349 unsigned NumOfAPOs = UINT_MAX;
2350 /// Number of operands with the same/alternate instruction opcode and
2351 /// parent.
2352 unsigned NumOpsWithSameOpcodeParent = 0;
2353 /// Hash for the actual operands ordering.
2354 /// Used to count operands, actually their position id and opcode
2355 /// value. It is used in the voting mechanism to find the lane with the
2356 /// least number of operands that can freely move about or less profitable
2357 /// because it already has the most optimal set of operands. Can be
2358 /// replaced with SmallVector<unsigned> instead but hash code is faster
2359 /// and requires less memory.
2360 unsigned Hash = 0;
2361 };
2362 /// \returns the maximum number of operands that are allowed to be reordered
2363 /// for \p Lane and the number of compatible instructions (with the same
2364 /// parent/opcode). This is used as a heuristic for selecting the first lane
2365 /// to start operand reordering.
2366 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2367 unsigned CntTrue = 0;
2368 unsigned NumOperands = getNumOperands();
2369 // Operands with the same APO can be reordered. We therefore need to count
2370 // how many of them we have for each APO, like this: Cnt[APO] = x.
2371 // Since we only have two APOs, namely true and false, we can avoid using
2372 // a map. Instead we can simply count the number of operands that
2373 // correspond to one of them (in this case the 'true' APO), and calculate
2374 // the other by subtracting it from the total number of operands.
2375 // Operands with the same instruction opcode and parent are more
2376 // profitable since we don't need to move them in many cases, with a high
2377 // probability such lane already can be vectorized effectively.
2378 bool AllUndefs = true;
2379 unsigned NumOpsWithSameOpcodeParent = 0;
2380 Instruction *OpcodeI = nullptr;
2381 BasicBlock *Parent = nullptr;
2382 unsigned Hash = 0;
2383 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384 const OperandData &OpData = getData(OpIdx, Lane);
2385 if (OpData.APO)
2386 ++CntTrue;
2387 // Use Boyer-Moore majority voting for finding the majority opcode and
2388 // the number of times it occurs.
2389 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2390 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
2391 I->getParent() != Parent) {
2392 if (NumOpsWithSameOpcodeParent == 0) {
2393 NumOpsWithSameOpcodeParent = 1;
2394 OpcodeI = I;
2395 Parent = I->getParent();
2396 } else {
2397 --NumOpsWithSameOpcodeParent;
2398 }
2399 } else {
2400 ++NumOpsWithSameOpcodeParent;
2401 }
2402 }
2403 Hash = hash_combine(
2404 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2405 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2406 }
2407 if (AllUndefs)
2408 return {};
2409 OperandsOrderData Data;
2410 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2411 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2412 Data.Hash = Hash;
2413 return Data;
2414 }
2415
2416 /// Go through the instructions in VL and append their operands.
2417 void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
2418 assert(!VL.empty() && "Bad VL");
2419 assert((empty() || VL.size() == getNumLanes()) &&
2420 "Expected same number of lanes");
2421 // IntrinsicInst::isCommutative returns true if swapping the first "two"
2422 // arguments to the intrinsic produces the same result.
2423 constexpr unsigned IntrinsicNumOperands = 2;
2424 unsigned NumOperands = VL0->getNumOperands();
2425 ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
2426 OpsVec.resize(NumOperands);
2427 unsigned NumLanes = VL.size();
2428 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2429 OpsVec[OpIdx].resize(NumLanes);
2430 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2431 assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2432 "Expected instruction or poison value");
2433 // Our tree has just 3 nodes: the root and two operands.
2434 // It is therefore trivial to get the APO. We only need to check the
2435 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2436 // RHS operand. The LHS operand of both add and sub is never attached
2437 // to an inverse operation in the linearized form, therefore its APO
2438 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2439
2440 // Since operand reordering is performed on groups of commutative
2441 // operations or alternating sequences (e.g., +, -), we can safely
2442 // tell the inverse operations by checking commutativity.
2443 if (isa<PoisonValue>(VL[Lane])) {
2444 OpsVec[OpIdx][Lane] = {
2445 PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
2446 false};
2447 continue;
2448 }
2449 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2450 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2451 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2452 APO, false};
2453 }
2454 }
2455 }
2456
2457 /// \returns the number of operands.
2458 unsigned getNumOperands() const { return ArgSize; }
2459
2460 /// \returns the number of lanes.
2461 unsigned getNumLanes() const { return OpsVec[0].size(); }
2462
2463 /// \returns the operand value at \p OpIdx and \p Lane.
2464 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2465 return getData(OpIdx, Lane).V;
2466 }
2467
2468 /// \returns true if the data structure is empty.
2469 bool empty() const { return OpsVec.empty(); }
2470
2471 /// Clears the data.
2472 void clear() { OpsVec.clear(); }
2473
2474 /// \Returns true if there are enough operands identical to \p Op to fill
2475 /// the whole vector (it is mixed with constants or loop invariant values).
2476 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2477 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2478 assert(Op == getValue(OpIdx, Lane) &&
2479 "Op is expected to be getValue(OpIdx, Lane).");
2480 // Small number of loads - try load matching.
2481 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2482 return false;
2483 bool OpAPO = getData(OpIdx, Lane).APO;
2484 bool IsInvariant = L && L->isLoopInvariant(Op);
2485 unsigned Cnt = 0;
2486 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2487 if (Ln == Lane)
2488 continue;
2489 // This is set to true if we found a candidate for broadcast at Lane.
2490 bool FoundCandidate = false;
2491 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2492 OperandData &Data = getData(OpI, Ln);
2493 if (Data.APO != OpAPO || Data.IsUsed)
2494 continue;
2495 Value *OpILane = getValue(OpI, Lane);
2496 bool IsConstantOp = isa<Constant>(OpILane);
2497 // Consider the broadcast candidate if:
2498 // 1. Same value is found in one of the operands.
2499 if (Data.V == Op ||
2500 // 2. The operand in the given lane is not constant but there is a
2501 // constant operand in another lane (which can be moved to the
2502 // given lane). In this case we can represent it as a simple
2503 // permutation of constant and broadcast.
2504 (!IsConstantOp &&
2505 ((Lns > 2 && isa<Constant>(Data.V)) ||
2506 // 2.1. If we have only 2 lanes, need to check that value in the
2507 // next lane does not build same opcode sequence.
2508 (Lns == 2 &&
2509 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
2510 isa<Constant>(Data.V)))) ||
2511 // 3. The operand in the current lane is loop invariant (can be
2512 // hoisted out) and another operand is also a loop invariant
2513 // (though not a constant). In this case the whole vector can be
2514 // hoisted out.
2515 // FIXME: need to teach the cost model about this case for better
2516 // estimation.
2517 (IsInvariant && !isa<Constant>(Data.V) &&
2518 !getSameOpcode({Op, Data.V}, TLI) &&
2519 L->isLoopInvariant(Data.V))) {
2520 FoundCandidate = true;
2521 Data.IsUsed = Data.V == Op;
2522 if (Data.V == Op)
2523 ++Cnt;
2524 break;
2525 }
2526 }
2527 if (!FoundCandidate)
2528 return false;
2529 }
2530 return getNumLanes() == 2 || Cnt > 1;
2531 }
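 // Roughly: the value has to reappear in more than one other lane (or the
 // node has only 2 lanes), and every remaining lane must offer either the
 // value itself, a constant that can be permuted in, or a loop-invariant
 // operand.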
2532
2533 /// Checks if there is at least single compatible operand in lanes other
2534 /// than \p Lane, compatible with the operand \p Op.
2535 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2536 assert(Op == getValue(OpIdx, Lane) &&
2537 "Op is expected to be getValue(OpIdx, Lane).");
2538 bool OpAPO = getData(OpIdx, Lane).APO;
2539 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2540 if (Ln == Lane)
2541 continue;
2542 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2543 const OperandData &Data = getData(OpI, Ln);
2544 if (Data.APO != OpAPO || Data.IsUsed)
2545 return true;
2546 Value *OpILn = getValue(OpI, Ln);
2547 return (L && L->isLoopInvariant(OpILn)) ||
2548 (getSameOpcode({Op, OpILn}, TLI) &&
2549 allSameBlock({Op, OpILn}));
2550 }))
2551 return true;
2552 }
2553 return false;
2554 }
2555
2556 public:
2557 /// Initialize with all the operands of the instruction vector \p RootVL.
2558 VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
2559 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2560 L(R.LI->getLoopFor((VL0->getParent()))) {
2561 // Append all the operands of RootVL.
2562 appendOperandsOfVL(RootVL, VL0);
2563 }
2564
2565 /// \Returns a value vector with the operands across all lanes for the
2566 /// operand at \p OpIdx.
2567 ValueList getVL(unsigned OpIdx) const {
2568 ValueList OpVL(OpsVec[OpIdx].size());
2569 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2570 "Expected same num of lanes across all operands");
2571 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2572 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2573 return OpVL;
2574 }
2575
2576 // Performs operand reordering for 2 or more operands.
2577 // The original operands are in OrigOps[OpIdx][Lane].
2578 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2579 void reorder() {
2580 unsigned NumOperands = getNumOperands();
2581 unsigned NumLanes = getNumLanes();
2582 // Each operand has its own mode. We are using this mode to help us select
2583 // the instructions for each lane, so that they match best with the ones
2584 // we have selected so far.
2585 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2586
2587 // This is a greedy single-pass algorithm. We are going over each lane
2588 // once and deciding on the best order right away with no back-tracking.
2589 // However, in order to increase its effectiveness, we start with the lane
2590 // that has operands that can move the least. For example, given the
2591 // following lanes:
2592 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2593 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2594 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2595 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2596 // we will start at Lane 1, since the operands of the subtraction cannot
2597 // be reordered. Then we will visit the rest of the lanes in a circular
2598 // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
2599
2600 // Find the first lane that we will start our search from.
2601 unsigned FirstLane = getBestLaneToStartReordering();
2602
2603 // Initialize the modes.
2604 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2605 Value *OpLane0 = getValue(OpIdx, FirstLane);
2606 // Keep track if we have instructions with all the same opcode on one
2607 // side.
2608 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2609 // Check if OpLane0 should be broadcast.
2610 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2611 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2612 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2613 else if (isa<LoadInst>(OpILane0))
2614 ReorderingModes[OpIdx] = ReorderingMode::Load;
2615 else
2616 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2617 } else if (isa<Constant>(OpLane0)) {
2618 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2619 } else if (isa<Argument>(OpLane0)) {
2620 // Our best hope is a Splat. It may save some cost in some cases.
2621 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2622 } else {
2623 llvm_unreachable("Unexpected value kind.");
2624 }
2625 }
2626
2627 // Check that we don't have same operands. No need to reorder if operands
2628 // are just perfect diamond or shuffled diamond match. Do not do it only
2629 // for possible broadcasts or non-power of 2 number of scalars (just for
2630 // now).
2631 auto &&SkipReordering = [this]() {
2632 SmallPtrSet<Value *, 4> UniqueValues;
2633 ArrayRef<OperandData> Op0 = OpsVec.front();
2634 for (const OperandData &Data : Op0)
2635 UniqueValues.insert(Data.V);
2636 for (ArrayRef<OperandData> Op :
2637 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
2638 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2639 return !UniqueValues.contains(Data.V);
2640 }))
2641 return false;
2642 }
2643 // TODO: Check if we can remove a check for non-power-2 number of
2644 // scalars after full support of non-power-2 vectorization.
2645 return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
2646 };
2647
2648 // If the initial strategy fails for any of the operand indexes, then we
2649 // perform reordering again in a second pass. This helps avoid assigning
2650 // high priority to the failed strategy, and should improve reordering for
2651 // the non-failed operand indexes.
2652 for (int Pass = 0; Pass != 2; ++Pass) {
2653 // Check if there is no need to reorder operands since they are a perfect or
2654 // shuffled diamond match.
2655 // Need to do it to avoid extra external use cost counting for
2656 // shuffled matches, which may cause regressions.
2657 if (SkipReordering())
2658 break;
2659 // Skip the second pass if the first pass did not fail.
2660 bool StrategyFailed = false;
2661 // Mark all operand data as free to use.
2662 clearUsed();
2663 // We keep the original operand order for the FirstLane, so reorder the
2664 // rest of the lanes. We are visiting the nodes in a circular fashion,
2665 // using FirstLane as the center point and increasing the radius
2666 // distance.
2667 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2668 for (unsigned I = 0; I < NumOperands; ++I)
2669 MainAltOps[I].push_back(getData(I, FirstLane).V);
2670
2671 SmallBitVector UsedLanes(NumLanes);
2672 UsedLanes.set(FirstLane);
2673 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2674 // Visit the lane on the right and then the lane on the left.
2675 for (int Direction : {+1, -1}) {
2676 int Lane = FirstLane + Direction * Distance;
2677 if (Lane < 0 || Lane >= (int)NumLanes)
2678 continue;
2679 UsedLanes.set(Lane);
2680 int LastLane = Lane - Direction;
2681 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2682 "Out of bounds");
2683 // Look for a good match for each operand.
2684 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2685 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2686 std::optional<unsigned> BestIdx =
2687 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2688 MainAltOps[OpIdx], UsedLanes);
2689 // By not selecting a value, we allow the operands that follow to
2690 // select a better matching value. We will get a non-null value in
2691 // the next run of getBestOperand().
2692 if (BestIdx) {
2693 // Swap the current operand with the one returned by
2694 // getBestOperand().
2695 swap(OpIdx, *BestIdx, Lane);
2696 } else {
2697 // Enable the second pass.
2698 StrategyFailed = true;
2699 }
2700 // Try to get the alternate opcode and follow it during analysis.
2701 if (MainAltOps[OpIdx].size() != 2) {
2702 OperandData &AltOp = getData(OpIdx, Lane);
2703 InstructionsState OpS =
2704 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2705 if (OpS && OpS.isAltShuffle())
2706 MainAltOps[OpIdx].push_back(AltOp.V);
2707 }
2708 }
2709 }
2710 }
2711 // Skip second pass if the strategy did not fail.
2712 if (!StrategyFailed)
2713 break;
2714 }
2715 }
2716
2717#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2718 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2719 switch (RMode) {
2720 case ReorderingMode::Load:
2721 return "Load";
2722 case ReorderingMode::Opcode:
2723 return "Opcode";
2724 case ReorderingMode::Constant:
2725 return "Constant";
2726 case ReorderingMode::Splat:
2727 return "Splat";
2728 case ReorderingMode::Failed:
2729 return "Failed";
2730 }
2731 llvm_unreachable("Unimplemented Reordering Type");
2732 }
2733
2734 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2735 raw_ostream &OS) {
2736 return OS << getModeStr(RMode);
2737 }
2738
2739 /// Debug print.
2740 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2741 printMode(RMode, dbgs());
2742 }
2743
2744 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2745 return printMode(RMode, OS);
2746 }
2747
2748 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2749 const unsigned Indent = 2;
2750 unsigned Cnt = 0;
2751 for (const OperandDataVec &OpDataVec : OpsVec) {
2752 OS << "Operand " << Cnt++ << "\n";
2753 for (const OperandData &OpData : OpDataVec) {
2754 OS.indent(Indent) << "{";
2755 if (Value *V = OpData.V)
2756 OS << *V;
2757 else
2758 OS << "null";
2759 OS << ", APO:" << OpData.APO << "}\n";
2760 }
2761 OS << "\n";
2762 }
2763 return OS;
2764 }
2765
2766 /// Debug print.
2767 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2768#endif
2769 };
2770
2771 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2772 /// of the pair with the highest score, deemed to have the best chance to form
2773 /// the root of a profitable tree to vectorize. Return std::nullopt if no
2774 /// candidate scored above the LookAheadHeuristics::ScoreFail. \param Limit
2775 /// Lower limit of the score, considered to be a good enough score.
2776 std::optional<int>
2777 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2778 int Limit = LookAheadHeuristics::ScoreFail) const {
2779 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2781 int BestScore = Limit;
2782 std::optional<int> Index;
2783 for (int I : seq<int>(0, Candidates.size())) {
2784 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2785 Candidates[I].second,
2786 /*U1=*/nullptr, /*U2=*/nullptr,
2787 /*CurrLevel=*/1, {});
2788 if (Score > BestScore) {
2789 BestScore = Score;
2790 Index = I;
2791 }
2792 }
2793 return Index;
2794 }
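 // For example, with the default Limit (ScoreFail), a candidate pair of
 // consecutive loads {A[0], A[1]} is preferred over a pair {A[0], X} with an
 // unrelated X, because only the former scores above ScoreFail.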
2795
2796 /// Checks if the instruction is marked for deletion.
2797 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2798
2799 /// Removes an instruction from its block and eventually deletes it.
2800 /// It's like Instruction::eraseFromParent() except that the actual deletion
2801 /// is delayed until BoUpSLP is destructed.
2802 void eraseInstruction(Instruction *I) {
2803 DeletedInstructions.insert(I);
2804 }
2805
2806 /// Remove instructions from the parent function and clear the operands of \p
2807 /// DeadVals instructions, marking for deletion trivially dead operands.
2808 template <typename T>
2809 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2810 SmallVector<WeakTrackingVH> DeadInsts;
2811 for (T *V : DeadVals) {
2812 auto *I = cast<Instruction>(V);
2813 DeletedInstructions.insert(I);
2814 }
2815 DenseSet<Value *> Processed;
2816 for (T *V : DeadVals) {
2817 if (!V || !Processed.insert(V).second)
2818 continue;
2819 auto *I = cast<Instruction>(V);
2821 SmallVector<const TreeEntry *> Entries;
2822 if (const TreeEntry *Entry = getTreeEntry(I)) {
2823 Entries.push_back(Entry);
2824 auto It = MultiNodeScalars.find(I);
2825 if (It != MultiNodeScalars.end())
2826 Entries.append(It->second.begin(), It->second.end());
2827 }
2828 for (Use &U : I->operands()) {
2829 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2830 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2831 isInstructionTriviallyDead(OpI, TLI) &&
2832 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2833 return Entry->VectorizedValue == OpI;
2834 })))
2835 DeadInsts.push_back(OpI);
2836 }
2837 I->dropAllReferences();
2838 }
2839 for (T *V : DeadVals) {
2840 auto *I = cast<Instruction>(V);
2841 if (!I->getParent())
2842 continue;
2843 assert((I->use_empty() || all_of(I->uses(),
2844 [&](Use &U) {
2845 return isDeleted(
2846 cast<Instruction>(U.getUser()));
2847 })) &&
2848 "trying to erase instruction with users.");
2849 I->removeFromParent();
2850 SE->forgetValue(I);
2851 }
2852 // Process the dead instruction list until empty.
2853 while (!DeadInsts.empty()) {
2854 Value *V = DeadInsts.pop_back_val();
2855 Instruction *VI = cast_or_null<Instruction>(V);
2856 if (!VI || !VI->getParent())
2857 continue;
2859 "Live instruction found in dead worklist!");
2860 assert(VI->use_empty() && "Instructions with uses are not dead.");
2861
2862 // Don't lose the debug info while deleting the instructions.
2863 salvageDebugInfo(*VI);
2864
2865 // Null out all of the instruction's operands to see if any operand
2866 // becomes dead as we go.
2867 for (Use &OpU : VI->operands()) {
2868 Value *OpV = OpU.get();
2869 if (!OpV)
2870 continue;
2871 OpU.set(nullptr);
2872
2873 if (!OpV->use_empty())
2874 continue;
2875
2876 // If the operand is an instruction that became dead as we nulled out
2877 // the operand, and if it is 'trivially' dead, delete it in a future
2878 // loop iteration.
2879 if (auto *OpI = dyn_cast<Instruction>(OpV))
2880 if (!DeletedInstructions.contains(OpI) &&
2881 isInstructionTriviallyDead(OpI, TLI))
2882 DeadInsts.push_back(OpI);
2883 }
2884
2885 VI->removeFromParent();
2886 DeletedInstructions.insert(VI);
2887 SE->forgetValue(VI);
2888 }
2889 }
2890
2891 /// Checks if the instruction was already analyzed for being a possible
2892 /// reduction root.
2893 bool isAnalyzedReductionRoot(Instruction *I) const {
2894 return AnalyzedReductionsRoots.count(I);
2895 }
2896 /// Register the given instruction as already analyzed for being a possible
2897 /// reduction root.
2898 void analyzedReductionRoot(Instruction *I) {
2899 AnalyzedReductionsRoots.insert(I);
2900 }
2901 /// Checks if the provided list of reduced values was checked already for
2902 /// vectorization.
2903 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2904 return AnalyzedReductionVals.contains(hash_value(VL));
2905 }
2906 /// Adds the list of reduced values to list of already checked values for the
2907 /// vectorization.
2908 void analyzedReductionVals(ArrayRef<Value *> VL) {
2909 AnalyzedReductionVals.insert(hash_value(VL));
2910 }
2911 /// Clear the list of the analyzed reduction root instructions.
2912 void clearReductionData() {
2913 AnalyzedReductionsRoots.clear();
2914 AnalyzedReductionVals.clear();
2915 AnalyzedMinBWVals.clear();
2916 }
2917 /// Checks if the given value is gathered in one of the nodes.
2918 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2919 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2920 }
2921 /// Checks if the given value is gathered in one of the nodes.
2922 bool isGathered(const Value *V) const {
2923 return MustGather.contains(V);
2924 }
2925 /// Checks if the specified value was not scheduled.
2926 bool isNotScheduled(const Value *V) const {
2927 return NonScheduledFirst.contains(V);
2928 }
2929
2930 /// Check if the value is vectorized in the tree.
2931 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2932
2933 ~BoUpSLP();
2934
2935private:
2936 /// Determine if a node \p E can be demoted to a smaller type with a
2937 /// truncation. We collect the entries that will be demoted in ToDemote.
2938 /// \param E Node for analysis
2939 /// \param ToDemote indices of the nodes to be demoted.
2940 bool collectValuesToDemote(
2941 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
2943 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
2944 bool &IsProfitableToDemote, bool IsTruncRoot) const;
2945
2946 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2947 /// reordering (i.e. the operands can be reordered because they have only one
2948 /// user and are reorderable).
2949 /// \param ReorderableGathers List of all gather nodes that require reordering
2950 /// (e.g., gather of extractelements or partially vectorizable loads).
2951 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2952 /// reordering, subset of \p NonVectorized.
2953 bool
2954 canReorderOperands(TreeEntry *UserTE,
2955 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2956 ArrayRef<TreeEntry *> ReorderableGathers,
2957 SmallVectorImpl<TreeEntry *> &GatherOps);
2958
2959 /// Checks if the given \p TE is a gather node with clustered reused scalars
2960 /// and reorders it per given \p Mask.
2961 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2962
2963 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2964 /// if any. If it is not vectorized (gather node), returns nullptr.
2965 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2966 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2967 TreeEntry *TE = nullptr;
2968 const auto *It = find_if(VL, [&](Value *V) {
2969 TE = getTreeEntry(V);
2970 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2971 return true;
2972 auto It = MultiNodeScalars.find(V);
2973 if (It != MultiNodeScalars.end()) {
2974 for (TreeEntry *E : It->second) {
2975 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2976 TE = E;
2977 return true;
2978 }
2979 }
2980 }
2981 return false;
2982 });
2983 if (It != VL.end()) {
2984 assert(TE->isSame(VL) && "Expected same scalars.");
2985 return TE;
2986 }
2987 return nullptr;
2988 }
2989
2990 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2991 /// if any. If it is not vectorized (gather node), returns nullptr.
2992 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2993 unsigned OpIdx) const {
2994 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2995 const_cast<TreeEntry *>(UserTE), OpIdx);
2996 }
2997
2998 /// Checks if all users of \p I are the part of the vectorization tree.
2999 bool areAllUsersVectorized(
3000 Instruction *I,
3001 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3002
3003 /// Return information about the vector formed for the specified index
3004 /// of a vector of (the same) instruction.
3006
3007 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3008 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3009
3010 /// Gets the root instruction for the given node. If the node is a strided
3011 /// load/store node with the reverse order, the root instruction is the last
3012 /// one.
3013 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3014
3015 /// \returns Cast context for the given graph node.
3017 getCastContextHint(const TreeEntry &TE) const;
3018
3019 /// \returns the cost of the vectorizable entry.
3020 InstructionCost getEntryCost(const TreeEntry *E,
3021 ArrayRef<Value *> VectorizedVals,
3022 SmallPtrSetImpl<Value *> &CheckedExtracts);
3023
3024 /// This is the recursive part of buildTree.
3025 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
3026 const EdgeInfo &EI, unsigned InterleaveFactor = 0);
3027
3028 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3029 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3030 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3031 /// returns false, setting \p CurrentOrder to either an empty vector or a
3032 /// non-identity permutation that allows the extract instructions to be reused.
3033 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3034 /// extract order.
3035 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
3036 SmallVectorImpl<unsigned> &CurrentOrder,
3037 bool ResizeAllowed = false) const;
3038
3039 /// Vectorize a single entry in the tree.
3040 /// \param PostponedPHIs true if the emission of phi nodes must be postponed
3041 /// to avoid issues with def-use order.
3042 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
3043
3044 /// Returns the vectorized operand node that matches the order of the scalars
3045 /// in operand number \p NodeIdx of entry \p E.
3046 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
3047 const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
3048 unsigned NodeIdx) const {
3049 return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
3050 }
3051
3052 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3053 /// \p E.
3054 /// \param PostponedPHIs true if the emission of phi nodes must be postponed
3055 /// to avoid issues with def-use order.
3056 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
3057
3058 /// Create a new vector from a list of scalar values. Produces a sequence
3059 /// which exploits values reused across lanes, and arranges the inserts
3060 /// for ease of later optimization.
3061 template <typename BVTy, typename ResTy, typename... Args>
3062 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3063
3064 /// Create a new vector from a list of scalar values. Produces a sequence
3065 /// which exploits values reused across lanes, and arranges the inserts
3066 /// for ease of later optimization.
3067 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
3068 bool PostponedPHIs);
3069
3070 /// Returns the instruction in the bundle, which can be used as a base point
3071 /// for scheduling. Usually it is the last instruction in the bundle, except
3072 /// for the case when all operands are external (in this case, it is the first
3073 /// instruction in the list).
3074 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3075
3076 /// Tries to find extractelement instructions with constant indices from a
3077 /// fixed vector type and gathers such instructions into a group, which can
3078 /// most likely be recognized as a shuffle of 1 or 2 input vectors. If this
3079 /// attempt was successful, the matched scalars are replaced by poison values
3080 /// in \p VL for future analysis.
3081 std::optional<TargetTransformInfo::ShuffleKind>
3082 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3083 SmallVectorImpl<int> &Mask) const;
3084
3085 /// Tries to find extractelement instructions with constant indices from a
3086 /// fixed vector type and gathers such instructions into a group, which can
3087 /// most likely be recognized as a shuffle of 1 or 2 input vectors. If this
3088 /// attempt was successful, the matched scalars are replaced by poison values
3089 /// in \p VL for future analysis.
3091 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3092 SmallVectorImpl<int> &Mask,
3093 unsigned NumParts) const;
3094
3095 /// Checks if the gathered \p VL can be represented as a single register
3096 /// shuffle(s) of previous tree entries.
3097 /// \param TE Tree entry checked for permutation.
3098 /// \param VL List of scalars (a subset of the TE scalars), checked for
3099 /// permutations. Must form a single-register vector.
3100 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3101 /// forces the mask to be built using the original vector value, without
3102 /// relying on the potential reordering.
3103 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3104 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3105 std::optional<TargetTransformInfo::ShuffleKind>
3106 isGatherShuffledSingleRegisterEntry(
3107 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3108 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3109 bool ForOrder);
3110
3111 /// Checks if the gathered \p VL can be represented as multi-register
3112 /// shuffle(s) of previous tree entries.
3113 /// \param TE Tree entry checked for permutation.
3114 /// \param VL List of scalars (a subset of the TE scalars), checked for
3115 /// permutations.
3116 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3117 /// forces the mask to be built using the original vector value, without
3118 /// relying on the potential reordering.
3119 /// \returns a per-register series of ShuffleKind, if the gathered values can
3120 /// be represented as shuffles of previous tree entries. \p Mask is filled
3121 /// with the shuffle mask (also on a per-register basis).
3123 isGatherShuffledEntry(
3124 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3125 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3126 unsigned NumParts, bool ForOrder = false);
3127
3128 /// \returns the cost of gathering (inserting) the values in \p VL into a
3129 /// vector.
3130 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3131 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3132 Type *ScalarTy) const;
3133
3134 /// Set the Builder insert point to one after the last instruction in
3135 /// the bundle
3136 void setInsertPointAfterBundle(const TreeEntry *E);
3137
3138 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3139 /// specified, the starting vector value is poison.
3140 Value *
3141 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3142 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3143
3144 /// \returns whether the VectorizableTree is fully vectorizable and will
3145 /// be beneficial even if the tree height is tiny.
3146 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3147
3148 /// Run through the list of all gathered loads in the graph and try to find
3149 /// vector loads/masked gathers instead of regular gathers. Later these loads
3150 /// are reshuffled to build the final gathered nodes.
3151 void tryToVectorizeGatheredLoads(
3152 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
3153 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
3154 8> &GatheredLoads);
3155
3156 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3157 /// users of \p TE and collects the stores. It returns the map from the store
3158 /// pointers to the collected stores.
3159 DenseMap<Value *, SmallVector<StoreInst *>>
3160 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3161
3162 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3163 /// stores in \p StoresVec can form a vector instruction. If so it returns
3164 /// true and populates \p ReorderIndices with the shuffle indices of the
3165 /// stores when compared to the sorted vector.
3166 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3167 OrdersType &ReorderIndices) const;
3168
3169 /// Iterates through the users of \p TE, looking for scalar stores that can be
3170 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3171 /// their order and builds an order index vector for each store bundle. It
3172 /// returns all these order vectors found.
3173 /// We run this after the tree has formed, otherwise we may come across user
3174 /// instructions that are not yet in the tree.
3175 SmallVector<OrdersType, 1>
3176 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3177
3178 /// Tries to reorder the gathering node for better vectorization
3179 /// opportunities.
3180 void reorderGatherNode(TreeEntry &TE);
3181
3182 struct TreeEntry {
3183 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3184 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3185
3186 /// \returns Common mask for reorder indices and reused scalars.
3187 SmallVector<int> getCommonMask() const {
3188 SmallVector<int> Mask;
3189 inversePermutation(ReorderIndices, Mask);
3190 ::addMask(Mask, ReuseShuffleIndices);
3191 return Mask;
3192 }
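    // Worked example (illustrative values): ReorderIndices = {1, 2, 0, 3}
    // gives Mask = {2, 0, 1, 3} after inversePermutation() (Mask[Reorder[I]] =
    // I); with ReuseShuffleIndices = {0, 0, 1, 2, 3, 3}, ::addMask() composes
    // the two into {2, 2, 0, 1, 3, 3}, the single common mask returned here.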
3193
3194 /// \returns true if the scalars in VL are equal to this entry.
3195 bool isSame(ArrayRef<Value *> VL) const {
3196 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3197 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3198 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3199 return VL.size() == Mask.size() &&
3200 std::equal(VL.begin(), VL.end(), Mask.begin(),
3201 [Scalars](Value *V, int Idx) {
3202 return (isa<UndefValue>(V) &&
3203 Idx == PoisonMaskElem) ||
3204 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3205 });
3206 };
3207 if (!ReorderIndices.empty()) {
3208 // TODO: implement matching if the nodes are just reordered, still can
3209 // treat the vector as the same if the list of scalars matches VL
3210 // directly, without reordering.
3211 SmallVector<int> Mask;
3212 inversePermutation(ReorderIndices, Mask);
3213 if (VL.size() == Scalars.size())
3214 return IsSame(Scalars, Mask);
3215 if (VL.size() == ReuseShuffleIndices.size()) {
3216 ::addMask(Mask, ReuseShuffleIndices);
3217 return IsSame(Scalars, Mask);
3218 }
3219 return false;
3220 }
3221 return IsSame(Scalars, ReuseShuffleIndices);
3222 }
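    // Example (illustrative): with Scalars = {A, B, C, D}, no reordering and
    // ReuseShuffleIndices = {0, 1, 0, 1}, isSame({A, B, A, B}) is true: each
    // VL element equals Scalars[Mask[I]] (undef values are also accepted for
    // poison mask elements).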
3223
3224 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3225 return isGather() && !UserTreeIndices.empty() &&
3226 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3227 UserTreeIndices.front().UserTE == UserEI.UserTE;
3228 }
3229
3230 /// \returns true if current entry has same operands as \p TE.
3231 bool hasEqualOperands(const TreeEntry &TE) const {
3232 if (TE.getNumOperands() != getNumOperands())
3233 return false;
3234 SmallBitVector Used(getNumOperands());
3235 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3236 unsigned PrevCount = Used.count();
3237 for (unsigned K = 0; K < E; ++K) {
3238 if (Used.test(K))
3239 continue;
3240 if (getOperand(K) == TE.getOperand(I)) {
3241 Used.set(K);
3242 break;
3243 }
3244 }
3245 // Check if we actually found the matching operand.
3246 if (PrevCount == Used.count())
3247 return false;
3248 }
3249 return true;
3250 }
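    // Example (illustrative): entries whose operand lists are {X, Y} and
    // {Y, X} have equal operands (the matching is unordered), while {X, X}
    // and {X, Y} do not.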
3251
3252 /// \return Final vectorization factor for the node. Defined by the total
3253 /// number of vectorized scalars, including those used several times in the
3254 /// entry and counted in the \a ReuseShuffleIndices, if any.
3255 unsigned getVectorFactor() const {
3256 if (!ReuseShuffleIndices.empty())
3257 return ReuseShuffleIndices.size();
3258 return Scalars.size();
3259 };
3260
3261 /// Checks if the current node is a gather node.
3262 bool isGather() const { return State == NeedToGather; }
3263
3264 /// A vector of scalars.
3265 ValueList Scalars;
3266
3267 /// The Scalars are vectorized into this value. It is initialized to Null.
3268 WeakTrackingVH VectorizedValue = nullptr;
3269
3270 /// New vector phi instructions emitted for the vectorized phi nodes.
3271 PHINode *PHI = nullptr;
3272
3273 /// Do we need to gather this sequence or vectorize it
3274 /// (either with vector instruction or with scatter/gather
3275 /// intrinsics for store/load)?
3276 enum EntryState {
3277 Vectorize, ///< The node is regularly vectorized.
3278 ScatterVectorize, ///< Masked scatter/gather node.
3279 StridedVectorize, ///< Strided loads (and stores)
3280 NeedToGather, ///< Gather/buildvector node.
3281 CombinedVectorize, ///< Vectorized node, combined with its user into more
3282 ///< complex node like select/cmp to minmax, mul/add to
3283 ///< fma, etc. Must be used for the following nodes in
3284 ///< the pattern, not the very first one.
3285 };
3286 EntryState State;
3287
3288 /// List of combined opcodes supported by the vectorizer.
3289 enum CombinedOpcode {
3290 NotCombinedOp = -1,
3291 MinMax = Instruction::OtherOpsEnd + 1,
3292 };
3293 CombinedOpcode CombinedOp = NotCombinedOp;
3294
3295 /// Does this sequence require some shuffling?
3296 SmallVector<int, 4> ReuseShuffleIndices;
3297
3298 /// Does this entry require reordering?
3299 SmallVector<unsigned, 4> ReorderIndices;
3300
3301 /// Points back to the VectorizableTree.
3302 ///
3303 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3304 /// to be a pointer and needs to be able to initialize the child iterator.
3305 /// Thus we need a reference back to the container to translate the indices
3306 /// to entries.
3307 VecTreeTy &Container;
3308
3309 /// The TreeEntry index containing the user of this entry. We can actually
3310 /// have multiple users so the data structure is not truly a tree.
3311 SmallVector<EdgeInfo, 1> UserTreeIndices;
3312
3313 /// The index of this treeEntry in VectorizableTree.
3314 unsigned Idx = 0;
3315
3316 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3317 /// other nodes as a series of insertvector instructions.
3318 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3319
3320 private:
3321 /// The operands of each instruction in each lane: Operands[op_index][lane].
3322 /// Note: This helps avoid the replication of the code that performs the
3323 /// reordering of operands during buildTree_rec() and vectorizeTree().
3324 SmallVector<ValueList, 2> Operands;
3325
3326 /// The main/alternate instruction.
3327 Instruction *MainOp = nullptr;
3328 Instruction *AltOp = nullptr;
3329
3330 /// Interleaving factor for interleaved loads Vectorize nodes.
3331 unsigned InterleaveFactor = 0;
3332
3333 public:
3334 /// Returns interleave factor for interleave nodes.
3335 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3336 /// Sets interleaving factor for the interleaving nodes.
3337 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3338
3339 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3340 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3341 if (Operands.size() < OpIdx + 1)
3342 Operands.resize(OpIdx + 1);
3343 assert(Operands[OpIdx].empty() && "Already resized?");
3344 assert(OpVL.size() <= Scalars.size() &&
3345 "Number of operands is greater than the number of scalars.");
3346 Operands[OpIdx].resize(OpVL.size());
3347 copy(OpVL, Operands[OpIdx].begin());
3348 }
3349
3350 /// Set this bundle's operand from Scalars.
3351 void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
3352 VLOperands Ops(Scalars, MainOp, R);
3353 if (RequireReorder)
3354 Ops.reorder();
3355 for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
3356 setOperand(I, Ops.getVL(I));
3357 }
3358
3359 /// Reorders operands of the node to the given mask \p Mask.
3360 void reorderOperands(ArrayRef<int> Mask) {
3361 for (ValueList &Operand : Operands)
3362 reorderScalars(Operand, Mask);
3363 }
3364
3365 /// \returns the \p OpIdx operand of this TreeEntry.
3366 ValueList &getOperand(unsigned OpIdx) {
3367 assert(OpIdx < Operands.size() && "Off bounds");
3368 return Operands[OpIdx];
3369 }
3370
3371 /// \returns the \p OpIdx operand of this TreeEntry.
3372 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3373 assert(OpIdx < Operands.size() && "Off bounds");
3374 return Operands[OpIdx];
3375 }
3376
3377 /// \returns the number of operands.
3378 unsigned getNumOperands() const { return Operands.size(); }
3379
3380 /// \return the single \p OpIdx operand.
3381 Value *getSingleOperand(unsigned OpIdx) const {
3382 assert(OpIdx < Operands.size() && "Off bounds");
3383 assert(!Operands[OpIdx].empty() && "No operand available");
3384 return Operands[OpIdx][0];
3385 }
3386
3387 /// Some of the instructions in the list have alternate opcodes.
3388 bool isAltShuffle() const { return MainOp != AltOp; }
3389
3390 bool isOpcodeOrAlt(Instruction *I) const {
3391 unsigned CheckedOpcode = I->getOpcode();
3392 return (getOpcode() == CheckedOpcode ||
3393 getAltOpcode() == CheckedOpcode);
3394 }
3395
3396 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3397 /// alternate) opcode as the main instruction, the key is \p Op. Otherwise the
3398 /// key is the main instruction.
3399 Value *isOneOf(Value *Op) const {
3400 auto *I = dyn_cast<Instruction>(Op);
3401 if (I && isOpcodeOrAlt(I))
3402 return Op;
3403 return MainOp;
3404 }
3405
3406 void setOperations(const InstructionsState &S) {
3407 assert(S && "InstructionsState is invalid.");
3408 MainOp = S.getMainOp();
3409 AltOp = S.getAltOp();
3410 }
3411
3412 Instruction *getMainOp() const {
3413 return MainOp;
3414 }
3415
3416 Instruction *getAltOp() const {
3417 return AltOp;
3418 }
3419
3420 /// The main/alternate opcodes for the list of instructions.
3421 unsigned getOpcode() const {
3422 return MainOp ? MainOp->getOpcode() : 0;
3423 }
3424
3425 unsigned getAltOpcode() const {
3426 return AltOp ? AltOp->getOpcode() : 0;
3427 }
3428
3429 /// When the reorder/reuse shuffle indices are empty, just returns the position
3430 /// of \p V within the vector of Scalars. Otherwise, tries to remap it through them.
3431 int findLaneForValue(Value *V) const {
3432 unsigned FoundLane = getVectorFactor();
3433 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
3434 std::advance(It, 1)) {
3435 if (*It != V)
3436 continue;
3437 FoundLane = std::distance(Scalars.begin(), It);
3438 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3439 if (!ReorderIndices.empty())
3440 FoundLane = ReorderIndices[FoundLane];
3441 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3442 if (ReuseShuffleIndices.empty())
3443 break;
3444 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
3445 RIt != ReuseShuffleIndices.end()) {
3446 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3447 break;
3448 }
3449 }
3450 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3451 return FoundLane;
3452 }
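    // Worked example (illustrative): Scalars = {A, B, C, D} and V == C gives
    // position 2; with ReorderIndices = {3, 2, 1, 0} the lane becomes
    // ReorderIndices[2] = 1, and with ReuseShuffleIndices = {1, 0, 3, 2} the
    // returned lane is the position of 1 in that mask, i.e. lane 0.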
3453
3454 /// Build a shuffle mask for graph entry which represents a merge of main
3455 /// and alternate operations.
3456 void
3457 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3459 SmallVectorImpl<Value *> *OpScalars = nullptr,
3460 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3461
3462 /// Return true if this is a non-power-of-2 node.
3463 bool isNonPowOf2Vec() const {
3464 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3465 return IsNonPowerOf2;
3466 }
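    // Example: a node with 6 scalars is a non-power-of-2 node; nodes with 4
    // or 8 scalars are not.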
3467
3468 /// Return true if this node vectorizes a number of elements that neither
3469 /// forms whole vector registers nor is a power of two.
3470 bool
3471 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3472 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3473 TTI, getValueType(Scalars.front()), Scalars.size());
3474 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3475 "Reshuffling not supported with non-power-of-2 vectors yet.");
3476 return IsNonPowerOf2;
3477 }
3478
3479 Value *getOrdered(unsigned Idx) const {
3480 assert(isGather() && "Must be used only for buildvectors/gathers.");
3481 if (ReorderIndices.empty())
3482 return Scalars[Idx];
3483 SmallVector<int> Mask;
3484 inversePermutation(ReorderIndices, Mask);
3485 return Scalars[Mask[Idx]];
3486 }
3487
3488#ifndef NDEBUG
3489 /// Debug printer.
3490 LLVM_DUMP_METHOD void dump() const {
3491 dbgs() << Idx << ".\n";
3492 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3493 dbgs() << "Operand " << OpI << ":\n";
3494 for (const Value *V : Operands[OpI])
3495 dbgs().indent(2) << *V << "\n";
3496 }
3497 dbgs() << "Scalars: \n";
3498 for (Value *V : Scalars)
3499 dbgs().indent(2) << *V << "\n";
3500 dbgs() << "State: ";
3501 switch (State) {
3502 case Vectorize:
3503 if (InterleaveFactor > 0) {
3504 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3505 << "\n";
3506 } else {
3507 dbgs() << "Vectorize\n";
3508 }
3509 break;
3510 case ScatterVectorize:
3511 dbgs() << "ScatterVectorize\n";
3512 break;
3513 case StridedVectorize:
3514 dbgs() << "StridedVectorize\n";
3515 break;
3516 case NeedToGather:
3517 dbgs() << "NeedToGather\n";
3518 break;
3519 case CombinedVectorize:
3520 dbgs() << "CombinedVectorize\n";
3521 break;
3522 }
3523 dbgs() << "MainOp: ";
3524 if (MainOp)
3525 dbgs() << *MainOp << "\n";
3526 else
3527 dbgs() << "NULL\n";
3528 dbgs() << "AltOp: ";
3529 if (AltOp)
3530 dbgs() << *AltOp << "\n";
3531 else
3532 dbgs() << "NULL\n";
3533 dbgs() << "VectorizedValue: ";
3534 if (VectorizedValue)
3535 dbgs() << *VectorizedValue << "\n";
3536 else
3537 dbgs() << "NULL\n";
3538 dbgs() << "ReuseShuffleIndices: ";
3539 if (ReuseShuffleIndices.empty())
3540 dbgs() << "Empty";
3541 else
3542 for (int ReuseIdx : ReuseShuffleIndices)
3543 dbgs() << ReuseIdx << ", ";
3544 dbgs() << "\n";
3545 dbgs() << "ReorderIndices: ";
3546 for (unsigned ReorderIdx : ReorderIndices)
3547 dbgs() << ReorderIdx << ", ";
3548 dbgs() << "\n";
3549 dbgs() << "UserTreeIndices: ";
3550 for (const auto &EInfo : UserTreeIndices)
3551 dbgs() << EInfo << ", ";
3552 dbgs() << "\n";
3553 if (!CombinedEntriesWithIndices.empty()) {
3554 dbgs() << "Combined entries: ";
3555 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
3556 dbgs() << "Entry index " << P.first << " with offset " << P.second;
3557 });
3558 dbgs() << "\n";
3559 }
3560 }
3561#endif
3562 };
3563
3564#ifndef NDEBUG
3565 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3566 InstructionCost VecCost, InstructionCost ScalarCost,
3567 StringRef Banner) const {
3568 dbgs() << "SLP: " << Banner << ":\n";
3569 E->dump();
3570 dbgs() << "SLP: Costs:\n";
3571 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3572 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3573 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3574 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3575 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3576 }
3577#endif
3578
3579 /// Create a new VectorizableTree entry.
3580 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3581 std::optional<ScheduleData *> Bundle,
3582 const InstructionsState &S,
3583 const EdgeInfo &UserTreeIdx,
3584 ArrayRef<int> ReuseShuffleIndices = {},
3585 ArrayRef<unsigned> ReorderIndices = {},
3586 unsigned InterleaveFactor = 0) {
3587 TreeEntry::EntryState EntryState =
3588 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3589 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3590 ReuseShuffleIndices, ReorderIndices);
3591 if (E && InterleaveFactor > 0)
3592 E->setInterleave(InterleaveFactor);
3593 return E;
3594 }
3595
3596 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3597 TreeEntry::EntryState EntryState,
3598 std::optional<ScheduleData *> Bundle,
3599 const InstructionsState &S,
3600 const EdgeInfo &UserTreeIdx,
3601 ArrayRef<int> ReuseShuffleIndices = {},
3602 ArrayRef<unsigned> ReorderIndices = {}) {
3603 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3604 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3605 "Need to vectorize gather entry?");
3606 // Gathered loads still gathered? Do not create entry, use the original one.
3607 if (GatheredLoadsEntriesFirst.has_value() &&
3608 EntryState == TreeEntry::NeedToGather && S &&
3609 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3610 !UserTreeIdx.UserTE)
3611 return nullptr;
3612 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3613 TreeEntry *Last = VectorizableTree.back().get();
3614 Last->Idx = VectorizableTree.size() - 1;
3615 Last->State = EntryState;
3616 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
3617 // for non-power-of-two vectors.
3618 assert(
3619 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
3620 ReuseShuffleIndices.empty()) &&
3621 "Reshuffling scalars not yet supported for nodes with padding");
3622 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3623 ReuseShuffleIndices.end());
3624 if (ReorderIndices.empty()) {
3625 Last->Scalars.assign(VL.begin(), VL.end());
3626 if (S)
3627 Last->setOperations(S);
3628 } else {
3629 // Reorder scalars and build final mask.
3630 Last->Scalars.assign(VL.size(), nullptr);
3631 transform(ReorderIndices, Last->Scalars.begin(),
3632 [VL](unsigned Idx) -> Value * {
3633 if (Idx >= VL.size())
3634 return UndefValue::get(VL.front()->getType());
3635 return VL[Idx];
3636 });
3637 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3638 if (S)
3639 Last->setOperations(S);
3640 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3641 }
3642 if (!Last->isGather()) {
3643 for (Value *V : VL) {
3644 const TreeEntry *TE = getTreeEntry(V);
3645 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3646 "Scalar already in tree!");
3647 if (TE) {
3648 if (TE != Last)
3649 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3650 continue;
3651 }
3652 ScalarToTreeEntry[V] = Last;
3653 }
3654 // Update the scheduler bundle to point to this TreeEntry.
3655 ScheduleData *BundleMember = *Bundle;
3656 assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3657 isVectorLikeInstWithConstOps(S.getMainOp()) ||
3658 doesNotNeedToSchedule(VL)) &&
3659 "Bundle and VL out of sync");
3660 if (BundleMember) {
3661 for (Value *V : VL) {
3662 if (doesNotNeedToBeScheduled(V))
3663 continue;
3664 if (!BundleMember)
3665 continue;
3666 BundleMember->TE = Last;
3667 BundleMember = BundleMember->NextInBundle;
3668 }
3669 }
3670 assert(!BundleMember && "Bundle and VL out of sync");
3671 } else {
3672 // Build a map for gathered scalars to the nodes where they are used.
3673 bool AllConstsOrCasts = true;
3674 for (Value *V : VL)
3675 if (!isConstant(V)) {
3676 auto *I = dyn_cast<CastInst>(V);
3677 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3678 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3679 !UserTreeIdx.UserTE->isGather())
3680 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3681 }
3682 if (AllConstsOrCasts)
3683 CastMaxMinBWSizes =
3684 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3685 MustGather.insert(VL.begin(), VL.end());
3686 }
3687
3688 if (UserTreeIdx.UserTE)
3689 Last->UserTreeIndices.push_back(UserTreeIdx);
3690 return Last;
3691 }
3692
3693 /// -- Vectorization State --
3694 /// Holds all of the tree entries.
3695 TreeEntry::VecTreeTy VectorizableTree;
3696
3697#ifndef NDEBUG
3698 /// Debug printer.
3699 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3700 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3701 VectorizableTree[Id]->dump();
3702 dbgs() << "\n";
3703 }
3704 }
3705#endif
3706
3707 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3708
3709 const TreeEntry *getTreeEntry(Value *V) const {
3710 return ScalarToTreeEntry.lookup(V);
3711 }
3712
3713 /// Check that the operand nodes of an alternate-opcode node do not generate
3714 /// buildvector sequences. If they do, it is probably not worth building an
3715 /// alternate shuffle, if the number of buildvector operands plus the alternate
3716 /// instruction exceeds the number of buildvector instructions.
3717 /// \param S the instructions state of the analyzed values.
3718 /// \param VL list of the instructions with alternate opcodes.
3719 bool areAltOperandsProfitable(const InstructionsState &S,
3720 ArrayRef<Value *> VL) const;
3721
3722 /// Checks if the specified list of the instructions/values can be vectorized
3723 /// and fills required data before actual scheduling of the instructions.
3724 TreeEntry::EntryState
3725 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
3726 bool IsScatterVectorizeUserTE,
3727 OrdersType &CurrentOrder,
3728 SmallVectorImpl<Value *> &PointerOps);
3729
3730 /// Maps a specific scalar to its tree entry.
3731 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3732
3733 /// Maps scalars that are used in several vectorized nodes to the list of
3734 /// those nodes.
3735 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3736
3737 /// Maps a value to the proposed vectorizable size.
3738 SmallDenseMap<Value *, unsigned> InstrElementSize;
3739
3740 /// A list of scalars that we found that we need to keep as scalars.
3741 ValueSet MustGather;
3742
3743 /// A set of first non-schedulable values.
3744 ValueSet NonScheduledFirst;
3745
3746 /// A map between the vectorized entries and the last instructions in the
3747 /// bundles. The bundles are built in use order, not in the def order of the
3748 /// instructions, so we cannot rely directly on the last instruction in the
3749 /// bundle being the last instruction in program order during the
3750 /// vectorization process; since the basic blocks are affected, these
3751 /// instructions need to be pre-gathered beforehand.
3752 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3753
3754 /// List of gather nodes, depending on other gather/vector nodes, which should
3755 /// be emitted after the vector instruction emission process to correctly
3756 /// handle order of the vector instructions and shuffles.
3757 SetVector<const TreeEntry *> PostponedGathers;
3758
3759 using ValueToGatherNodesMap =
3760 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3761 ValueToGatherNodesMap ValueToGatherNodes;
3762
3763 /// A list of the load entries (node indices), which can be vectorized using
3764 /// strided or masked gather approach, but attempted to be represented as
3765 /// contiguous loads.
3766 SetVector<unsigned> LoadEntriesToVectorize;
3767
3768 /// true if graph nodes transforming mode is on.
3769 bool IsGraphTransformMode = false;
3770
3771 /// The index of the first gathered load entry in the VectorizeTree.
3772 std::optional<unsigned> GatheredLoadsEntriesFirst;
3773
3774 /// This POD struct describes one external user in the vectorized tree.
3775 struct ExternalUser {
3776 ExternalUser(Value *S, llvm::User *U, int L)
3777 : Scalar(S), User(U), Lane(L) {}
3778
3779 // Which scalar in our function.
3780 Value *Scalar;
3781
3782 // Which user that uses the scalar.
3783 llvm::User *User;
3784
3785 // Which lane does the scalar belong to.
3786 int Lane;
3787 };
3788 using UserList = SmallVector<ExternalUser, 16>;
3789
3790 /// Checks if two instructions may access the same memory.
3791 ///
3792 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3793 /// is invariant in the calling loop.
3794 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3795 Instruction *Inst2) {
3796 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3797 return true;
3798 // First check if the result is already in the cache.
3799 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3800 auto It = AliasCache.find(Key);
3801 if (It != AliasCache.end())
3802 return It->second;
3803 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3804 // Store the result in the cache.
3805 AliasCache.try_emplace(Key, Aliased);
3806 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3807 return Aliased;
3808 }
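  // Note: non-simple accesses conservatively report "may alias" (true), and
  // a computed result is cached symmetrically, so a later query for the
  // swapped pair (Inst2, Inst1) also hits the cache.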
3809
3810 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3811
3812 /// Cache for alias results.
3813 /// TODO: consider moving this to the AliasAnalysis itself.
3814 DenseMap<AliasCacheKey, bool> AliasCache;
3815
3816 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3817 // globally through SLP because we don't perform any action which
3818 // invalidates capture results.
3819 BatchAAResults BatchAA;
3820
3821 /// Temporary store for deleted instructions. Instructions will be deleted
3822 /// eventually when the BoUpSLP is destructed. The deferral is required to
3823 /// ensure that there are no incorrect collisions in the AliasCache, which
3824 /// can happen if a new instruction is allocated at the same address as a
3825 /// previously deleted instruction.
3826 DenseSet<Instruction *> DeletedInstructions;
3827
3828 /// Set of the instruction, being analyzed already for reductions.
3829 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3830
3831 /// Set of hashes for the list of reduction values already being analyzed.
3832 DenseSet<size_t> AnalyzedReductionVals;
3833
3834 /// Values already analyzed for minimal bitwidth and found to be
3835 /// non-profitable.
3836 DenseSet<Value *> AnalyzedMinBWVals;
3837
3838 /// A list of values that need to extracted out of the tree.
3839 /// This list holds pairs of (Internal Scalar : External User). External User
3840 /// can be nullptr, it means that this Internal Scalar will be used later,
3841 /// after vectorization.
3842 UserList ExternalUses;
3843
3844 /// A list of GEPs which can be replaced by scalar GEPs instead of
3845 /// extractelement instructions.
3846 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3847
3848 /// Values used only by @llvm.assume calls.
3849 SmallPtrSet<const Value *, 32> EphValues;
3850
3851 /// Holds all of the instructions that we gathered, shuffle instructions and
3852 /// extractelements.
3853 SetVector<Instruction *> GatherShuffleExtractSeq;
3854
3855 /// A list of blocks that we are going to CSE.
3856 DenseSet<BasicBlock *> CSEBlocks;
3857
3858 /// List of hashes of vectors of loads which are known to be non-vectorizable.
3859 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3860
3861 /// Contains all scheduling relevant data for an instruction.
3862 /// A ScheduleData either represents a single instruction or a member of an
3863 /// instruction bundle (= a group of instructions which is combined into a
3864 /// vector instruction).
3865 struct ScheduleData {
3866 // The initial value for the dependency counters. It means that the
3867 // dependencies are not calculated yet.
3868 enum { InvalidDeps = -1 };
3869
3870 ScheduleData() = default;
3871
3872 void init(int BlockSchedulingRegionID, Instruction *I) {
3873 FirstInBundle = this;
3874 NextInBundle = nullptr;
3875 NextLoadStore = nullptr;
3876 IsScheduled = false;
3877 SchedulingRegionID = BlockSchedulingRegionID;
3878 clearDependencies();
3879 Inst = I;
3880 TE = nullptr;
3881 }
3882
3883 /// Verify basic self consistency properties
3884 void verify() {
3885 if (hasValidDependencies()) {
3886 assert(UnscheduledDeps <= Dependencies && "invariant");
3887 } else {
3888 assert(UnscheduledDeps == Dependencies && "invariant");
3889 }
3890
3891 if (IsScheduled) {
3892 assert(isSchedulingEntity() &&
3893 "unexpected scheduled state");
3894 for (const ScheduleData *BundleMember = this; BundleMember;
3895 BundleMember = BundleMember->NextInBundle) {
3896 assert(BundleMember->hasValidDependencies() &&
3897 BundleMember->UnscheduledDeps == 0 &&
3898 "unexpected scheduled state");
3899 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3900 "only bundle is marked scheduled");
3901 }
3902 }
3903
3904 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3905 "all bundle members must be in same basic block");
3906 }
3907
3908 /// Returns true if the dependency information has been calculated.
3909 /// Note that dependency validity can vary between instructions within
3910 /// a single bundle.
3911 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3912
3913 /// Returns true for single instructions and for bundle representatives
3914 /// (= the head of a bundle).
3915 bool isSchedulingEntity() const { return FirstInBundle == this; }
3916
3917 /// Returns true if it represents an instruction bundle and not only a
3918 /// single instruction.
3919 bool isPartOfBundle() const {
3920 return NextInBundle != nullptr || FirstInBundle != this || TE;
3921 }
3922
3923 /// Returns true if it is ready for scheduling, i.e. it has no more
3924 /// unscheduled depending instructions/bundles.
3925 bool isReady() const {
3926 assert(isSchedulingEntity() &&
3927 "can't consider non-scheduling entity for ready list");
3928 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3929 }
3930
3931 /// Modifies the number of unscheduled dependencies for this instruction,
3932 /// and returns the number of remaining dependencies for the containing
3933 /// bundle.
3934 int incrementUnscheduledDeps(int Incr) {
3935 assert(hasValidDependencies() &&
3936 "increment of unscheduled deps would be meaningless");
3937 UnscheduledDeps += Incr;
3938 return FirstInBundle->unscheduledDepsInBundle();
3939 }
3940
3941 /// Sets the number of unscheduled dependencies to the number of
3942 /// dependencies.
3943 void resetUnscheduledDeps() {
3944 UnscheduledDeps = Dependencies;
3945 }
3946
3947 /// Clears all dependency information.
3948 void clearDependencies() {
3949 Dependencies = InvalidDeps;
3950 resetUnscheduledDeps();
3951 MemoryDependencies.clear();
3952 ControlDependencies.clear();
3953 }
3954
3955 int unscheduledDepsInBundle() const {
3956 assert(isSchedulingEntity() && "only meaningful on the bundle");
3957 int Sum = 0;
3958 for (const ScheduleData *BundleMember = this; BundleMember;
3959 BundleMember = BundleMember->NextInBundle) {
3960 if (BundleMember->UnscheduledDeps == InvalidDeps)
3961 return InvalidDeps;
3962 Sum += BundleMember->UnscheduledDeps;
3963 }
3964 return Sum;
3965 }
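    // Example (illustrative): for a bundle of two stores where each store
    // still has one unscheduled operand, unscheduledDepsInBundle() is 2; the
    // bundle only becomes ready (isReady()) once the counters of all its
    // members reach zero.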
3966
3967 void dump(raw_ostream &os) const {
3968 if (!isSchedulingEntity()) {
3969 os << "/ " << *Inst;
3970 } else if (NextInBundle) {
3971 os << '[' << *Inst;
3972 ScheduleData *SD = NextInBundle;
3973 while (SD) {
3974 os << ';' << *SD->Inst;
3975 SD = SD->NextInBundle;
3976 }
3977 os << ']';
3978 } else {
3979 os << *Inst;
3980 }
3981 }
3982
3983 Instruction *Inst = nullptr;
3984
3985 /// The TreeEntry that this instruction corresponds to.
3986 TreeEntry *TE = nullptr;
3987
3988 /// Points to the head in an instruction bundle (and always to this for
3989 /// single instructions).
3990 ScheduleData *FirstInBundle = nullptr;
3991
3992 /// Single linked list of all instructions in a bundle. Null if it is a
3993 /// single instruction.
3994 ScheduleData *NextInBundle = nullptr;
3995
3996 /// Single linked list of all memory instructions (e.g. load, store, call)
3997 /// in the block - until the end of the scheduling region.
3998 ScheduleData *NextLoadStore = nullptr;
3999
4000 /// The dependent memory instructions.
4001 /// This list is derived on demand in calculateDependencies().
4002 SmallVector<ScheduleData *, 4> MemoryDependencies;
4003
4004 /// List of instructions which this instruction could be control dependent
4005 /// on. Allowing such nodes to be scheduled below this one could introduce
4006 /// a runtime fault which didn't exist in the original program.
4007 /// ex: this is a load or udiv following a readonly call which inf loops
4008 SmallVector<ScheduleData *, 4> ControlDependencies;
4009
4010 /// This ScheduleData is in the current scheduling region if this matches
4011 /// the current SchedulingRegionID of BlockScheduling.
4012 int SchedulingRegionID = 0;
4013
4014 /// Used for getting a "good" final ordering of instructions.
4015 int SchedulingPriority = 0;
4016
4017 /// The number of dependencies. Consists of the number of users of the
4018 /// instruction plus the number of dependent memory instructions (if any).
4019 /// This value is calculated on demand.
4020 /// If InvalidDeps, the number of dependencies is not calculated yet.
4021 int Dependencies = InvalidDeps;
4022
4023 /// The number of dependencies minus the number of dependencies of scheduled
4024 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4025 /// for scheduling.
4026 /// Note that this is negative as long as Dependencies is not calculated.
4027 int UnscheduledDeps = InvalidDeps;
4028
4029 /// True if this instruction is scheduled (or considered as scheduled in the
4030 /// dry-run).
4031 bool IsScheduled = false;
4032 };
4033
4034#ifndef NDEBUG
4035 friend inline raw_ostream &operator<<(raw_ostream &os,
4036 const BoUpSLP::ScheduleData &SD) {
4037 SD.dump(os);
4038 return os;
4039 }
4040#endif
4041
4042 friend struct GraphTraits<BoUpSLP *>;
4043 friend struct DOTGraphTraits<BoUpSLP *>;
4044
4045 /// Contains all scheduling data for a basic block.
4046 /// It does not schedule instructions which are not memory read/write
4047 /// instructions and whose operands are either constants, arguments, phis, or
4048 /// instructions from other blocks, or whose users are phis or belong to other
4049 /// blocks. The resulting vector instructions can be placed at the
4050 /// beginning of the basic block without scheduling (if the operands do not need
4051 /// to be scheduled) or at the end of the block (if the users are outside of the
4052 /// block). This saves some compile time and memory used by the
4053 /// compiler.
4054 /// ScheduleData is assigned for each instruction in between the boundaries of
4055 /// the tree entry, even for those which are not part of the graph. It is
4056 /// required to correctly follow the dependencies between the instructions and
4057 /// to schedule them correctly. The ScheduleData is not allocated for the
4058 /// instructions which do not require scheduling, like phis, nodes with
4059 /// extractelements/insertelements only, or nodes with instructions whose
4060 /// uses/operands are outside of the block.
4061 struct BlockScheduling {
4062 BlockScheduling(BasicBlock *BB)
4063 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4064
4065 void clear() {
4066 ReadyInsts.clear();
4067 ScheduleStart = nullptr;
4068 ScheduleEnd = nullptr;
4069 FirstLoadStoreInRegion = nullptr;
4070 LastLoadStoreInRegion = nullptr;
4071 RegionHasStackSave = false;
4072
4073 // Reduce the maximum schedule region size by the size of the
4074 // previous scheduling run.
4075 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4076 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4077 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4078 ScheduleRegionSize = 0;
4079
4080 // Make a new scheduling region, i.e. all existing ScheduleData is not
4081 // in the new region yet.
4082 ++SchedulingRegionID;
4083 }
4084
4085 ScheduleData *getScheduleData(Instruction *I) {
4086 if (BB != I->getParent())
4087 // Avoid lookup if can't possibly be in map.
4088 return nullptr;
4089 ScheduleData *SD = ScheduleDataMap.lookup(I);
4090 if (SD && isInSchedulingRegion(SD))
4091 return SD;
4092 return nullptr;
4093 }
4094
4095 ScheduleData *getScheduleData(Value *V) {
4096 if (auto *I = dyn_cast<Instruction>(V))
4097 return getScheduleData(I);
4098 return nullptr;
4099 }
4100
4101 bool isInSchedulingRegion(ScheduleData *SD) const {
4102 return SD->SchedulingRegionID == SchedulingRegionID;
4103 }
4104
4105 /// Marks an instruction as scheduled and puts all dependent ready
4106 /// instructions into the ready-list.
4107 template <typename ReadyListType>
4108 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4109 SD->IsScheduled = true;
4110 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4111
4112 for (ScheduleData *BundleMember = SD; BundleMember;
4113 BundleMember = BundleMember->NextInBundle) {
4114
4115 // Handle the def-use chain dependencies.
4116
4117 // Decrement the unscheduled counter and insert to ready list if ready.
4118 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
4119 ScheduleData *OpDef = getScheduleData(I);
4120 if (OpDef && OpDef->hasValidDependencies() &&
4121 OpDef->incrementUnscheduledDeps(-1) == 0) {
4122 // There are no more unscheduled dependencies after
4123 // decrementing, so we can put the dependent instruction
4124 // into the ready list.
4125 ScheduleData *DepBundle = OpDef->FirstInBundle;
4126 assert(!DepBundle->IsScheduled &&
4127 "already scheduled bundle gets ready");
4128 ReadyList.insert(DepBundle);
4130 << "SLP: gets ready (def): " << *DepBundle << "\n");
4131 }
4132 };
4133
4134 // If BundleMember is a vector bundle, its operands may have been
4135 // reordered during buildTree(). We therefore need to get its operands
4136 // through the TreeEntry.
4137 if (TreeEntry *TE = BundleMember->TE) {
4138 // Need to search for the lane since the tree entry can be reordered.
4139 int Lane = std::distance(TE->Scalars.begin(),
4140 find(TE->Scalars, BundleMember->Inst));
4141 assert(Lane >= 0 && "Lane not set");
4142
4143 // Since vectorization tree is being built recursively this assertion
4144 // ensures that the tree entry has all operands set before reaching
4145 // this code. Couple of exceptions known at the moment are extracts
4146 // where their second (immediate) operand is not added. Since
4147 // immediates do not affect scheduler behavior this is considered
4148 // okay.
4149 auto *In = BundleMember->Inst;
4150 assert(
4151 In &&
4152 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4153 In->getNumOperands() == TE->getNumOperands()) &&
4154 "Missed TreeEntry operands?");
4155 (void)In; // fake use to avoid build failure when assertions disabled
4156
4157 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
4158 OpIdx != NumOperands; ++OpIdx)
4159 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4160 DecrUnsched(I);
4161 } else {
4162 // If BundleMember is a stand-alone instruction, no operand reordering
4163 // has taken place, so we directly access its operands.
4164 for (Use &U : BundleMember->Inst->operands())
4165 if (auto *I = dyn_cast<Instruction>(U.get()))
4166 DecrUnsched(I);
4167 }
4168 // Handle the memory dependencies.
4169 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4170 if (MemoryDepSD->hasValidDependencies() &&
4171 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4172 // There are no more unscheduled dependencies after decrementing,
4173 // so we can put the dependent instruction into the ready list.
4174 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4175 assert(!DepBundle->IsScheduled &&
4176 "already scheduled bundle gets ready");
4177 ReadyList.insert(DepBundle);
4179 << "SLP: gets ready (mem): " << *DepBundle << "\n");
4180 }
4181 }
4182 // Handle the control dependencies.
4183 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4184 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4185 // There are no more unscheduled dependencies after decrementing,
4186 // so we can put the dependent instruction into the ready list.
4187 ScheduleData *DepBundle = DepSD->FirstInBundle;
4188 assert(!DepBundle->IsScheduled &&
4189 "already scheduled bundle gets ready");
4190 ReadyList.insert(DepBundle);
4192 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
4193 }
4194 }
4195 }
4196 }
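    // In effect (illustrative summary): scheduling a bundle such as
    // [store a; store b] walks each member and decrements the
    // unscheduled-dependency counters of its operand definitions (resolved
    // through the TreeEntry to respect operand reordering), of its memory
    // dependencies, and of its control dependencies; every dependent bundle
    // whose counter drops to zero is pushed into \p ReadyList.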
4197
4198 /// Verify basic self consistency properties of the data structure.
4199 void verify() {
4200 if (!ScheduleStart)
4201 return;
4202
4203 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4204 ScheduleStart->comesBefore(ScheduleEnd) &&
4205 "Not a valid scheduling region?");
4206
4207 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4208 auto *SD = getScheduleData(I);
4209 if (!SD)
4210 continue;
4211 assert(isInSchedulingRegion(SD) &&
4212 "primary schedule data not in window?");
4213 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4214 "entire bundle in window!");
4215 SD->verify();
4216 }
4217
4218 for (auto *SD : ReadyInsts) {
4219 assert(SD->isSchedulingEntity() && SD->isReady() &&
4220 "item in ready list not ready?");
4221 (void)SD;
4222 }
4223 }
4224
4225 /// Put all instructions into the ReadyList which are ready for scheduling.
4226 template <typename ReadyListType>
4227 void initialFillReadyList(ReadyListType &ReadyList) {
4228 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4229 ScheduleData *SD = getScheduleData(I);
4230 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4231 SD->isReady()) {
4232 ReadyList.insert(SD);
4234 << "SLP: initially in ready list: " << *SD << "\n");
4235 }
4236 }
4237 }
4238
4239 /// Build a bundle from the ScheduleData nodes corresponding to the
4240 /// scalar instruction for each lane.
4241 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4242
4243 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4244 /// cyclic dependencies. This is only a dry-run, no instructions are
4245 /// actually moved at this stage.
4246 /// \returns the scheduling bundle. The returned Optional value is not
4247 /// std::nullopt if \p VL is allowed to be scheduled.
4248 std::optional<ScheduleData *>
4249 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
4250 const InstructionsState &S);
4251
4252 /// Un-bundles a group of instructions.
4253 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
4254
4255 /// Allocates schedule data chunk.
4256 ScheduleData *allocateScheduleDataChunks();
4257
4258 /// Extends the scheduling region so that V is inside the region.
4259 /// \returns true if the region size is within the limit.
4260 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
4261
4262 /// Initialize the ScheduleData structures for new instructions in the
4263 /// scheduling region.
4264 void initScheduleData(Instruction *FromI, Instruction *ToI,
4265 ScheduleData *PrevLoadStore,
4266 ScheduleData *NextLoadStore);
4267
4268 /// Updates the dependency information of a bundle and of all instructions/
4269 /// bundles which depend on the original bundle.
4270 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
4271 BoUpSLP *SLP);
4272
4273 /// Sets all instructions in the scheduling region to un-scheduled.
4274 void resetSchedule();
4275
4276 BasicBlock *BB;
4277
4278 /// Simple memory allocation for ScheduleData.
4280
4281 /// The size of a ScheduleData array in ScheduleDataChunks.
4282 int ChunkSize;
4283
4284 /// The allocator position in the current chunk, which is the last entry
4285 /// of ScheduleDataChunks.
4286 int ChunkPos;
4287
4288 /// Attaches ScheduleData to Instruction.
4289 /// Note that the mapping survives during all vectorization iterations, i.e.
4290 /// ScheduleData structures are recycled.
4291 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
4292
4293 /// The ready-list for scheduling (only used for the dry-run).
4294 SetVector<ScheduleData *> ReadyInsts;
4295
4296 /// The first instruction of the scheduling region.
4297 Instruction *ScheduleStart = nullptr;
4298
4299 /// The first instruction _after_ the scheduling region.
4300 Instruction *ScheduleEnd = nullptr;
4301
4302 /// The first memory accessing instruction in the scheduling region
4303 /// (can be null).
4304 ScheduleData *FirstLoadStoreInRegion = nullptr;
4305
4306 /// The last memory accessing instruction in the scheduling region
4307 /// (can be null).
4308 ScheduleData *LastLoadStoreInRegion = nullptr;
4309
4310 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4311 /// region? Used to optimize the dependence calculation for the
4312 /// common case where there isn't.
4313 bool RegionHasStackSave = false;
4314
4315 /// The current size of the scheduling region.
4316 int ScheduleRegionSize = 0;
4317
4318 /// The maximum size allowed for the scheduling region.
4319 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4320
4321 /// The ID of the scheduling region. For a new vectorization iteration this
4322 /// is incremented which "removes" all ScheduleData from the region.
4323 /// Make sure that the initial SchedulingRegionID is greater than the
4324 /// initial SchedulingRegionID in ScheduleData (which is 0).
4325 int SchedulingRegionID = 1;
4326 };
4327
4328 /// Attaches the BlockScheduling structures to basic blocks.
4329 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
4330
4331 /// Performs the "real" scheduling. Done before vectorization is actually
4332 /// performed in a basic block.
4333 void scheduleBlock(BlockScheduling *BS);
4334
4335 /// List of users to ignore during scheduling and that don't need extracting.
4336 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4337
4338 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4339 /// sorted SmallVectors of unsigned.
4340 struct OrdersTypeDenseMapInfo {
4341 static OrdersType getEmptyKey() {
4342 OrdersType V;
4343 V.push_back(~1U);
4344 return V;
4345 }
4346
4347 static OrdersType getTombstoneKey() {
4348 OrdersType V;
4349 V.push_back(~2U);
4350 return V;
4351 }
4352
4353 static unsigned getHashValue(const OrdersType &V) {
4354 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4355 }
4356
4357 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4358 return LHS == RHS;
4359 }
4360 };
4361
4362 // Analysis and block reference.
4363 Function *F;
4364 ScalarEvolution *SE;
4365 TargetTransformInfo *TTI;
4366 TargetLibraryInfo *TLI;
4367 LoopInfo *LI;
4368 DominatorTree *DT;
4369 AssumptionCache *AC;
4370 DemandedBits *DB;
4371 const DataLayout *DL;
4372 OptimizationRemarkEmitter *ORE;
4373
4374 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4375 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4376
4377 /// Instruction builder to construct the vectorized tree.
4379
4380 /// A map of scalar integer values to the smallest bit width with which they
4381 /// can legally be represented. The values map to (width, signed) pairs,
4382 /// where "width" indicates the minimum bit width and "signed" is True if the
4383 /// value must be signed-extended, rather than zero-extended, back to its
4384 /// original width.
4385 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
4386
4387 /// Final size of the reduced vector, if the current graph represents the
4388 /// input for the reduction and it was possible to narrow the size of the
4389 /// reduction.
4390 unsigned ReductionBitWidth = 0;
4391
4392 /// Canonical graph size before the transformations.
4393 unsigned BaseGraphSize = 1;
4394
4395 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4396 /// type sizes, used in the tree.
4397 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4398
4399 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
4400 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4401 DenseSet<unsigned> ExtraBitWidthNodes;
4402};
4403
4404} // end namespace slpvectorizer
4405
4406template <> struct GraphTraits<BoUpSLP *> {
4407 using TreeEntry = BoUpSLP::TreeEntry;
4408
4409 /// NodeRef has to be a pointer per the GraphWriter.
4410 using NodeRef = TreeEntry *;
4411
4412 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4413
4414 /// Add the VectorizableTree to the index iterator to be able to return
4415 /// TreeEntry pointers.
4416 struct ChildIteratorType
4417 : public iterator_adaptor_base<
4418 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4419 ContainerTy &VectorizableTree;
4420
4421 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4422 ContainerTy &VT)
4423 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4424
4425 NodeRef operator*() { return I->UserTE; }
4426 };
4427
4428 static NodeRef getEntryNode(BoUpSLP &R) {
4429 return R.VectorizableTree[0].get();
4430 }
4431
4432 static ChildIteratorType child_begin(NodeRef N) {
4433 return {N->UserTreeIndices.begin(), N->Container};
4434 }
4435
4436 static ChildIteratorType child_end(NodeRef N) {
4437 return {N->UserTreeIndices.end(), N->Container};
4438 }
4439
4440 /// For the node iterator we just need to turn the TreeEntry iterator into a
4441 /// TreeEntry* iterator so that it dereferences to NodeRef.
4442 class nodes_iterator {
4443 using ItTy = ContainerTy::iterator;
4444 ItTy It;
4445
4446 public:
4447 nodes_iterator(const ItTy &It2) : It(It2) {}
4448 NodeRef operator*() { return It->get(); }
4449 nodes_iterator operator++() {
4450 ++It;
4451 return *this;
4452 }
4453 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4454 };
4455
4456 static nodes_iterator nodes_begin(BoUpSLP *R) {
4457 return nodes_iterator(R->VectorizableTree.begin());
4458 }
4459
4460 static nodes_iterator nodes_end(BoUpSLP *R) {
4461 return nodes_iterator(R->VectorizableTree.end());
4462 }
4463
4464 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4465};
4466
4467template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4468 using TreeEntry = BoUpSLP::TreeEntry;
4469
4470 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4471
4472 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4473 std::string Str;
4474 raw_string_ostream OS(Str);
4475 OS << Entry->Idx << ".\n";
4476 if (isSplat(Entry->Scalars))
4477 OS << "<splat> ";
4478 for (auto *V : Entry->Scalars) {
4479 OS << *V;
4480 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4481 return EU.Scalar == V;
4482 }))
4483 OS << " <extract>";
4484 OS << "\n";
4485 }
4486 return Str;
4487 }
4488
4489 static std::string getNodeAttributes(const TreeEntry *Entry,
4490 const BoUpSLP *) {
4491 if (Entry->isGather())
4492 return "color=red";
4493 if (Entry->State == TreeEntry::ScatterVectorize ||
4494 Entry->State == TreeEntry::StridedVectorize)
4495 return "color=blue";
4496 return "";
4497 }
4498};
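// With these GraphTraits/DOTGraphTraits specializations in place, the SLP
// graph can be rendered through the generic graph-writing utilities. A usage
// sketch (TheSLPTree and F are illustrative names for a BoUpSLP instance and
// the function it analyzes):
//   ViewGraph(&TheSLPTree, Twine("SLP.") + F->getName());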
4499
4500} // end namespace llvm
4501
4504 for (auto *I : DeletedInstructions) {
4505 if (!I->getParent()) {
4506 // Temporarily insert instructions back so they can be erased from their
4507 // parent and from memory later.
4508 if (isa<PHINode>(I))
4509 // Phi nodes must be the very first instructions in the block.
4510 I->insertBefore(F->getEntryBlock(),
4511 F->getEntryBlock().getFirstNonPHIIt());
4512 else
4513 I->insertBefore(F->getEntryBlock().getTerminator());
4514 continue;
4515 }
4516 for (Use &U : I->operands()) {
4517 auto *Op = dyn_cast<Instruction>(U.get());
4518 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4519 wouldInstructionBeTriviallyDead(Op, TLI))
4520 DeadInsts.emplace_back(Op);
4521 }
4522 I->dropAllReferences();
4523 }
4524 for (auto *I : DeletedInstructions) {
4525 assert(I->use_empty() &&
4526 "trying to erase instruction with users.");
4527 I->eraseFromParent();
4528 }
4529
4530 // Cleanup any dead scalar code feeding the vectorized instructions
4531 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4532
4533#ifdef EXPENSIVE_CHECKS
4534 // If we could guarantee that this call is not extremely slow, we could
4535 // remove the ifdef limitation (see PR47712).
4536 assert(!verifyFunction(*F, &dbgs()));
4537#endif
4538}
4539
4540/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4541 /// contains the original mask for the scalars reused in the node. The
4542 /// procedure transforms this mask in accordance with the given \p Mask.
4543 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4544 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4545 "Expected non-empty mask.");
4546 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4547 Prev.swap(Reuses);
4548 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4549 if (Mask[I] != PoisonMaskElem)
4550 Reuses[Mask[I]] = Prev[I];
4551}
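// Example: for Reuses = {1, 0, 3, 2} and Mask = {2, 3, 0, 1}, the element at
// position I moves to position Mask[I]:
//   Reuses[2] = 1, Reuses[3] = 0, Reuses[0] = 3, Reuses[1] = 2
// giving Reuses = {3, 2, 1, 0}. Positions whose mask element is PoisonMaskElem
// keep their previous value.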
4552
4553 /// Reorders the given \p Order according to the given \p Mask. \p Order is
4554 /// the original order of the scalars. The procedure transforms the provided
4555 /// order in accordance with the given \p Mask. If the resulting \p Order is
4556 /// just an identity order, \p Order is cleared.
4557 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4558 bool BottomOrder = false) {
4559 assert(!Mask.empty() && "Expected non-empty mask.");
4560 unsigned Sz = Mask.size();
4561 if (BottomOrder) {
4562 SmallVector<unsigned> PrevOrder;
4563 if (Order.empty()) {
4564 PrevOrder.resize(Sz);
4565 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4566 } else {
4567 PrevOrder.swap(Order);
4568 }
4569 Order.assign(Sz, Sz);
4570 for (unsigned I = 0; I < Sz; ++I)
4571 if (Mask[I] != PoisonMaskElem)
4572 Order[I] = PrevOrder[Mask[I]];
4573 if (all_of(enumerate(Order), [&](const auto &Data) {
4574 return Data.value() == Sz || Data.index() == Data.value();
4575 })) {
4576 Order.clear();
4577 return;
4578 }
4579 fixupOrderingIndices(Order);
4580 return;
4581 }
4582 SmallVector<int> MaskOrder;
4583 if (Order.empty()) {
4584 MaskOrder.resize(Sz);
4585 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4586 } else {
4587 inversePermutation(Order, MaskOrder);
4588 }
4589 reorderReuses(MaskOrder, Mask);
4590 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4591 Order.clear();
4592 return;
4593 }
4594 Order.assign(Sz, Sz);
4595 for (unsigned I = 0; I < Sz; ++I)
4596 if (MaskOrder[I] != PoisonMaskElem)
4597 Order[MaskOrder[I]] = I;
4598 fixupOrderingIndices(Order);
4599}
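// Example: with an empty (identity) \p Order of size 4 and Mask = {1, 2, 3, 0},
// MaskOrder starts as {0, 1, 2, 3}, reorderReuses turns it into {3, 0, 1, 2},
// and the final Order becomes {1, 2, 3, 0}: for a full permutation mask applied
// to an identity order, the resulting order equals the mask itself.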
4600
4601std::optional<BoUpSLP::OrdersType>
4602BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4603 assert(TE.isGather() && "Expected gather node only.");
4604 // Try to find subvector extract/insert patterns and reorder only such
4605 // patterns.
4606 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4607 Type *ScalarTy = GatheredScalars.front()->getType();
4608 int NumScalars = GatheredScalars.size();
4609 if (!isValidElementType(ScalarTy))
4610 return std::nullopt;
4611 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4612 int NumParts = TTI->getNumberOfParts(VecTy);
4613 if (NumParts == 0 || NumParts >= NumScalars ||
4614 VecTy->getNumElements() % NumParts != 0 ||
4615 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4616 VecTy->getNumElements() / NumParts))
4617 NumParts = 1;
4618 SmallVector<int> ExtractMask;
4619 SmallVector<int> Mask;
4620 SmallVector<SmallVector<const TreeEntry *>> Entries;
4621 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
4622 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4623 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
4624 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4625 /*ForOrder=*/true);
4626 // No shuffled operands - ignore.
4627 if (GatherShuffles.empty() && ExtractShuffles.empty())
4628 return std::nullopt;
4629 OrdersType CurrentOrder(NumScalars, NumScalars);
4630 if (GatherShuffles.size() == 1 &&
4631 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4632 Entries.front().front()->isSame(TE.Scalars)) {
4633 // Perfect match in the graph, will reuse the previously vectorized
4634 // node. Cost is 0.
4635 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4636 return CurrentOrder;
4637 }
4638 auto IsSplatMask = [](ArrayRef<int> Mask) {
4639 int SingleElt = PoisonMaskElem;
4640 return all_of(Mask, [&](int I) {
4641 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4642 SingleElt = I;
4643 return I == PoisonMaskElem || I == SingleElt;
4644 });
4645 };
4646 // Exclusive broadcast mask - ignore.
4647 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4648 (Entries.size() != 1 ||
4649 Entries.front().front()->ReorderIndices.empty())) ||
4650 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4651 return std::nullopt;
4652 SmallBitVector ShuffledSubMasks(NumParts);
4653 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4654 ArrayRef<int> Mask, int PartSz, int NumParts,
4655 function_ref<unsigned(unsigned)> GetVF) {
4656 for (int I : seq<int>(0, NumParts)) {
4657 if (ShuffledSubMasks.test(I))
4658 continue;
4659 const int VF = GetVF(I);
4660 if (VF == 0)
4661 continue;
4662 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4663 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4664 // Shuffle of at least 2 vectors - ignore.
4665 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4666 std::fill(Slice.begin(), Slice.end(), NumScalars);
4667 ShuffledSubMasks.set(I);
4668 continue;
4669 }
4670 // Try to include as many elements from the mask as possible.
4671 int FirstMin = INT_MAX;
4672 int SecondVecFound = false;
4673 for (int K : seq<int>(Limit)) {
4674 int Idx = Mask[I * PartSz + K];
4675 if (Idx == PoisonMaskElem) {
4676 Value *V = GatheredScalars[I * PartSz + K];
4677 if (isConstant(V) && !isa<PoisonValue>(V)) {
4678 SecondVecFound = true;
4679 break;
4680 }
4681 continue;
4682 }
4683 if (Idx < VF) {
4684 if (FirstMin > Idx)
4685 FirstMin = Idx;
4686 } else {
4687 SecondVecFound = true;
4688 break;
4689 }
4690 }
4691 FirstMin = (FirstMin / PartSz) * PartSz;
4692 // Shuffle of at least 2 vectors - ignore.
4693 if (SecondVecFound) {
4694 std::fill(Slice.begin(), Slice.end(), NumScalars);
4695 ShuffledSubMasks.set(I);
4696 continue;
4697 }
4698 for (int K : seq<int>(Limit)) {
4699 int Idx = Mask[I * PartSz + K];
4700 if (Idx == PoisonMaskElem)
4701 continue;
4702 Idx -= FirstMin;
4703 if (Idx >= PartSz) {
4704 SecondVecFound = true;
4705 break;
4706 }
4707 if (CurrentOrder[I * PartSz + Idx] >
4708 static_cast<unsigned>(I * PartSz + K) &&
4709 CurrentOrder[I * PartSz + Idx] !=
4710 static_cast<unsigned>(I * PartSz + Idx))
4711 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4712 }
4713 // Shuffle of at least 2 vectors - ignore.
4714 if (SecondVecFound) {
4715 std::fill(Slice.begin(), Slice.end(), NumScalars);
4716 ShuffledSubMasks.set(I);
4717 continue;
4718 }
4719 }
4720 };
4721 int PartSz = getPartNumElems(NumScalars, NumParts);
4722 if (!ExtractShuffles.empty())
4723 TransformMaskToOrder(
4724 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4725 if (!ExtractShuffles[I])
4726 return 0U;
4727 unsigned VF = 0;
4728 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4729 for (unsigned Idx : seq<unsigned>(Sz)) {
4730 int K = I * PartSz + Idx;
4731 if (ExtractMask[K] == PoisonMaskElem)
4732 continue;
4733 if (!TE.ReuseShuffleIndices.empty())
4734 K = TE.ReuseShuffleIndices[K];
4735 if (K == PoisonMaskElem)
4736 continue;
4737 if (!TE.ReorderIndices.empty())
4738 K = std::distance(TE.ReorderIndices.begin(),
4739 find(TE.ReorderIndices, K));
4740 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4741 if (!EI)
4742 continue;
4743 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4744 ->getElementCount()
4745 .getKnownMinValue());
4746 }
4747 return VF;
4748 });
4749 // Check special corner case - single shuffle of the same entry.
4750 if (GatherShuffles.size() == 1 && NumParts != 1) {
4751 if (ShuffledSubMasks.any())
4752 return std::nullopt;
4753 PartSz = NumScalars;
4754 NumParts = 1;
4755 }
4756 if (!Entries.empty())
4757 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4758 if (!GatherShuffles[I])
4759 return 0U;
4760 return std::max(Entries[I].front()->getVectorFactor(),
4761 Entries[I].back()->getVectorFactor());
4762 });
4763 int NumUndefs =
4764 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4765 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4766 return std::nullopt;
4767 return std::move(CurrentOrder);
4768}
4769
4770static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4771 const TargetLibraryInfo &TLI,
4772 bool CompareOpcodes = true) {
4773 if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
4774 getUnderlyingObject(Ptr2, RecursionMaxDepth))
4775 return false;
4776 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4777 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4778 return (!GEP1 || GEP1->getNumOperands() == 2) &&
4779 (!GEP2 || GEP2->getNumOperands() == 2) &&
4780 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
4781 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
4782 !CompareOpcodes ||
4783 (GEP1 && GEP2 &&
4784 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
4785}
4786
4787/// Calculates minimal alignment as a common alignment.
4788template <typename T>
4789 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4790 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4791 for (Value *V : VL.drop_front())
4792 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4793 return CommonAlignment;
4794}
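// Example: for loads aligned to 16, 8 and 4 bytes the common alignment is the
// minimum, i.e. Align(4).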
4795
4796/// Check if \p Order represents reverse order.
4797 static bool isReverseOrder(ArrayRef<unsigned> Order) {
4798 assert(!Order.empty() &&
4799 "Order is empty. Please check it before using isReverseOrder.");
4800 unsigned Sz = Order.size();
4801 return all_of(enumerate(Order), [&](const auto &Pair) {
4802 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4803 });
4804}
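// Example: {3, 2, 1, 0} is a reverse order for Sz == 4; so is {4, 2, 4, 0},
// because entries equal to Sz (unset slots) are ignored by the check.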
4805
4806 /// Checks if the provided list of pointers \p Pointers represents strided
4807 /// pointers for the type ElemTy. If they are not, std::nullopt is returned.
4808 /// Otherwise, if \p Inst is not specified, a just-initialized optional value
4809 /// is returned to show that the pointers represent strided pointers. If
4810 /// \p Inst is specified, the runtime stride is materialized before \p Inst.
4811 /// \returns std::nullopt if the pointers do not form a runtime-strided
4812 /// access; nullptr or the actual stride value otherwise.
4813static std::optional<Value *>
4815 const DataLayout &DL, ScalarEvolution &SE,
4816 SmallVectorImpl<unsigned> &SortedIndices,
4817 Instruction *Inst = nullptr) {
4818 SmallVector<const SCEV *> SCEVs;
4819 const SCEV *PtrSCEVLowest = nullptr;
4820 const SCEV *PtrSCEVHighest = nullptr;
4821 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4822 // addresses).
4823 for (Value *Ptr : PointerOps) {
4824 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4825 if (!PtrSCEV)
4826 return std::nullopt;
4827 SCEVs.push_back(PtrSCEV);
4828 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4829 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4830 continue;
4831 }
4832 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4833 if (isa<SCEVCouldNotCompute>(Diff))
4834 return std::nullopt;
4835 if (Diff->isNonConstantNegative()) {
4836 PtrSCEVLowest = PtrSCEV;
4837 continue;
4838 }
4839 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4840 if (isa<SCEVCouldNotCompute>(Diff1))
4841 return std::nullopt;
4842 if (Diff1->isNonConstantNegative()) {
4843 PtrSCEVHighest = PtrSCEV;
4844 continue;
4845 }
4846 }
4847 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4848 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4849 if (isa<SCEVCouldNotCompute>(Dist))
4850 return std::nullopt;
4851 int Size = DL.getTypeStoreSize(ElemTy);
4852 auto TryGetStride = [&](const SCEV *Dist,
4853 const SCEV *Multiplier) -> const SCEV * {
4854 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4855 if (M->getOperand(0) == Multiplier)
4856 return M->getOperand(1);
4857 if (M->getOperand(1) == Multiplier)
4858 return M->getOperand(0);
4859 return nullptr;
4860 }
4861 if (Multiplier == Dist)
4862 return SE.getConstant(Dist->getType(), 1);
4863 return SE.getUDivExactExpr(Dist, Multiplier);
4864 };
4865 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4866 const SCEV *Stride = nullptr;
4867 if (Size != 1 || SCEVs.size() > 2) {
4868 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4869 Stride = TryGetStride(Dist, Sz);
4870 if (!Stride)
4871 return std::nullopt;
4872 }
4873 if (!Stride || isa<SCEVConstant>(Stride))
4874 return std::nullopt;
4875 // Iterate through all pointers and check if all distances are
4876 // unique multiples of Stride.
4877 using DistOrdPair = std::pair<int64_t, int>;
4878 auto Compare = llvm::less_first();
4879 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4880 int Cnt = 0;
4881 bool IsConsecutive = true;
4882 for (const SCEV *PtrSCEV : SCEVs) {
4883 unsigned Dist = 0;
4884 if (PtrSCEV != PtrSCEVLowest) {
4885 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4886 const SCEV *Coeff = TryGetStride(Diff, Stride);
4887 if (!Coeff)
4888 return std::nullopt;
4889 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4890 if (!SC || isa<SCEVCouldNotCompute>(SC))
4891 return std::nullopt;
4892 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4893 SE.getMulExpr(Stride, SC)))
4894 ->isZero())
4895 return std::nullopt;
4896 Dist = SC->getAPInt().getZExtValue();
4897 }
4898 // If the strides are not the same or repeated, we can't vectorize.
4899 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4900 return std::nullopt;
4901 auto Res = Offsets.emplace(Dist, Cnt);
4902 if (!Res.second)
4903 return std::nullopt;
4904 // Consecutive order if the inserted element is the last one.
4905 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4906 ++Cnt;
4907 }
4908 if (Offsets.size() != SCEVs.size())
4909 return std::nullopt;
4910 SortedIndices.clear();
4911 if (!IsConsecutive) {
4912 // Fill SortedIndices array only if it is non-consecutive.
4913 SortedIndices.resize(PointerOps.size());
4914 Cnt = 0;
4915 for (const std::pair<int64_t, int> &Pair : Offsets) {
4916 SortedIndices[Cnt] = Pair.second;
4917 ++Cnt;
4918 }
4919 }
4920 if (!Inst)
4921 return nullptr;
4922 SCEVExpander Expander(SE, DL, "strided-load-vec");
4923 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4924}
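// Example (a sketch of the analysis above): for i32 loads from p, p + 4 * s
// and p + 8 * s (store size 4), Dist = 8 * s and Size * (num_elems - 1) == 8,
// so the recovered Stride is s, the stride in elements. The per-pointer byte
// offsets 0, 4 and 8 are distinct multiples of the element size and already
// consecutive, so SortedIndices stays empty and the function returns nullptr
// (or the materialized stride if an insertion point \p Inst is given).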
4925
4926static std::pair<InstructionCost, InstructionCost>
4927 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4928 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4929 Type *ScalarTy, VectorType *VecTy);
4930
4931/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4932 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
4933 /// insert subvector pattern.
4934static InstructionCost
4935 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
4936 VectorType *Tp, ArrayRef<int> Mask = {},
4937 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
4938 int Index = 0, VectorType *SubTp = nullptr,
4939 ArrayRef<const Value *> Args = {}) {
4940 if (Kind != TTI::SK_PermuteTwoSrc)
4941 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4942 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4943 int NumSubElts;
4944 if (ShuffleVectorInst::isInsertSubvectorMask(
4945 Mask, NumSrcElts, NumSubElts, Index)) {
4946 if (Index + NumSubElts > NumSrcElts &&
4947 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4948 return TTI.getShuffleCost(
4949 TTI::SK_InsertSubvector,
4950 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4951 CostKind, Index, SubTp);
4952 }
4953 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4954}
4955
4956 /// Correctly creates insert_subvector, checking that the index is a multiple
4957 /// of the subvector's length. Otherwise, generates a shuffle using
4958 /// \p Generator or the default shuffle.
4959 static Value *createInsertVector(
4960 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
4961 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
4962 const unsigned SubVecVF = getNumElements(V->getType());
4963 if (Index % SubVecVF == 0) {
4964 Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
4965 Builder.getInt64(Index));
4966 } else {
4967 // Create a shuffle; insertvector requires that the index is a multiple of
4968 // the subvector length.
4969 const unsigned VecVF = getNumElements(Vec->getType());
4970 SmallVector<int> Mask(VecVF, PoisonMaskElem);
4971 std::iota(Mask.begin(), std::next(Mask.begin(), Index), 0);
4972 for (unsigned I : seq<unsigned>(SubVecVF))
4973 Mask[I + Index] = I + VecVF;
4974 if (Generator) {
4975 Vec = Generator(Vec, V, Mask);
4976 } else {
4977 // 1. Resize V to the size of Vec.
4978 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
4979 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
4980 V = Builder.CreateShuffleVector(V, ResizeMask);
4981 Vec = Builder.CreateShuffleVector(Vec, V, Mask);
4982 }
4983 }
4984 return Vec;
4985}
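// Example: inserting a <4 x i32> value V into an <8 x i32> vector Vec at
// Index = 2 (not a multiple of 4) builds the two-source mask
// {0, 1, 8, 9, 10, 11, poison, poison}: lanes 0-1 come from Vec, lanes 2-5
// from the widened V, and the remaining lanes are left as poison.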
4986
4987 BoUpSLP::LoadsState
4988 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
4989 SmallVectorImpl<unsigned> &Order,
4990 SmallVectorImpl<Value *> &PointerOps,
4991 unsigned *BestVF, bool TryRecursiveCheck) const {
4992 // Check that a vectorized load would load the same memory as a scalar
4993 // load. For example, we don't want to vectorize loads that are smaller
4994 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4995 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4996 // from such a struct, we read/write packed bits disagreeing with the
4997 // unvectorized version.
4998 if (BestVF)
4999 *BestVF = 0;
5000 if (areKnownNonVectorizableLoads(VL))
5001 return LoadsState::Gather;
5002 Type *ScalarTy = VL0->getType();
5003
5004 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
5005 return LoadsState::Gather;
5006
5007 // Make sure all loads in the bundle are simple - we can't vectorize
5008 // atomic or volatile loads.
5009 PointerOps.clear();
5010 const unsigned Sz = VL.size();
5011 PointerOps.resize(Sz);
5012 auto *POIter = PointerOps.begin();
5013 for (Value *V : VL) {
5014 auto *L = dyn_cast<LoadInst>(V);
5015 if (!L || !L->isSimple())
5016 return LoadsState::Gather;
5017 *POIter = L->getPointerOperand();
5018 ++POIter;
5019 }
5020
5021 Order.clear();
5022 // Check the order of pointer operands or that all pointers are the same.
5023 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
5024
5025 auto *VecTy = getWidenedType(ScalarTy, Sz);
5026 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5027 if (!IsSorted) {
5028 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
5029 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
5030 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
5031 return LoadsState::StridedVectorize;
5032 }
5033
5034 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5035 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5036 return LoadsState::Gather;
5037
5038 if (!all_of(PointerOps, [&](Value *P) {
5039 return arePointersCompatible(P, PointerOps.front(), *TLI);
5040 }))
5041 return LoadsState::Gather;
5042
5043 } else {
5044 Value *Ptr0;
5045 Value *PtrN;
5046 if (Order.empty()) {
5047 Ptr0 = PointerOps.front();
5048 PtrN = PointerOps.back();
5049 } else {
5050 Ptr0 = PointerOps[Order.front()];
5051 PtrN = PointerOps[Order.back()];
5052 }
5053 std::optional<int> Diff =
5054 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5055 // Check that the sorted loads are consecutive.
5056 if (static_cast<unsigned>(*Diff) == Sz - 1)
5057 return LoadsState::Vectorize;
5058 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5059 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5060 return LoadsState::Gather;
5061 // Simple check if not a strided access - clear order.
5062 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5063 // Try to generate strided load node if:
5064 // 1. Target with strided load support is detected.
5065 // 2. The number of loads is greater than MinProfitableStridedLoads,
5066 // or the potential stride <= MaxProfitableLoadStride and the
5067 // potential stride is power-of-2 (to avoid perf regressions for the very
5068 // small number of loads) and max distance > number of loads, or potential
5069 // stride is -1.
5070 // 3. The loads are ordered, or number of unordered loads <=
5071 // MaxProfitableUnorderedLoads, or loads are in reversed order.
5072 // (this check is to avoid extra costs for very expensive shuffles).
5073 // 4. Any pointer operand is an instruction with the users outside of the
5074 // current graph (for masked gathers extra extractelement instructions
5075 // might be required).
5076 auto IsAnyPointerUsedOutGraph =
5077 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
5078 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
5079 return !getTreeEntry(U) && !MustGather.contains(U);
5080 });
5081 });
5082 const unsigned AbsoluteDiff = std::abs(*Diff);
5083 if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
5084 ((Sz > MinProfitableStridedLoads ||
5085 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5086 has_single_bit(AbsoluteDiff))) &&
5087 AbsoluteDiff > Sz) ||
5088 *Diff == -(static_cast<int>(Sz) - 1))) {
5089 int Stride = *Diff / static_cast<int>(Sz - 1);
5090 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
5091 Align Alignment =
5092 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
5093 ->getAlign();
5094 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
5095 // Iterate through all pointers and check if all distances are
5096 // unique multiples of Dist.
5097 SmallSet<int, 4> Dists;
5098 for (Value *Ptr : PointerOps) {
5099 int Dist = 0;
5100 if (Ptr == PtrN)
5101 Dist = *Diff;
5102 else if (Ptr != Ptr0)
5103 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
5104 // If the strides are not the same or repeated, we can't
5105 // vectorize.
5106 if (((Dist / Stride) * Stride) != Dist ||
5107 !Dists.insert(Dist).second)
5108 break;
5109 }
5110 if (Dists.size() == Sz)
5111 return LoadsState::StridedVectorize;
5112 }
5113 }
5114 }
5115 }
5116 // Correctly compare the cost of loads + shuffles rather than
5117 // strided/masked gather loads. Returns true if vectorized + shuffles
5118 // representation is better than just gather.
5119 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
5120 unsigned *BestVF,
5121 bool ProfitableGatherPointers) {
5122 if (BestVF)
5123 *BestVF = 0;
5124 // Compare masked gather cost and loads + insert subvector costs.
5125 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5126 auto [ScalarGEPCost, VectorGEPCost] =
5127 getGEPCosts(TTI, PointerOps, PointerOps.front(),
5128 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
5129 // Estimate the cost of masked gather GEP. If not a splat, roughly
5130 // estimate as a buildvector, otherwise estimate as splat.
5131 APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
5132 VectorType *PtrVecTy =
5133 getWidenedType(PointerOps.front()->getType()->getScalarType(),
5134 VecTy->getNumElements());
5135 if (static_cast<unsigned>(count_if(
5136 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
5137 any_of(PointerOps, [&](Value *V) {
5138 return getUnderlyingObject(V) !=
5139 getUnderlyingObject(PointerOps.front());
5140 }))
5141 VectorGEPCost += TTI.getScalarizationOverhead(
5142 PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
5143 else
5144 VectorGEPCost +=
5145 TTI.getScalarizationOverhead(
5146 PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
5147 /*Insert=*/true, /*Extract=*/false, CostKind) +
5148 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
5149 // The cost of scalar loads.
5150 InstructionCost ScalarLoadsCost =
5151 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
5152 [&](InstructionCost C, Value *V) {
5153 return C + TTI.getInstructionCost(
5154 cast<Instruction>(V), CostKind);
5155 }) +
5156 ScalarGEPCost;
5157 // The cost of masked gather.
5158 InstructionCost MaskedGatherCost =
5159 TTI.getGatherScatterOpCost(
5160 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5161 /*VariableMask=*/false, CommonAlignment, CostKind) +
5162 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5163 InstructionCost GatherCost =
5164 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5165 /*Extract=*/false, CostKind) +
5166 ScalarLoadsCost;
5167 // The list of loads is small or perform partial check already - directly
5168 // compare masked gather cost and gather cost.
5169 constexpr unsigned ListLimit = 4;
5170 if (!TryRecursiveCheck || VL.size() < ListLimit)
5171 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5172
5173 // FIXME: The following code has not been updated for non-power-of-2
5174 // vectors. The splitting logic here does not cover the original
5175 // vector if the vector factor is not a power of two.
5176 if (!has_single_bit(VL.size()))
5177 return false;
5178
5179 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
5180 unsigned MinVF = getMinVF(2 * Sz);
5181 DemandedElts.clearAllBits();
5182 // Iterate through possible vectorization factors and check if vectorized +
5183 // shuffles is better than just gather.
5184 for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
5185 SmallVector<LoadsState> States;
5186 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
5187 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
5188 SmallVector<unsigned> Order;
5189 SmallVector<Value *> PointerOps;
5190 LoadsState LS =
5191 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
5192 /*TryRecursiveCheck=*/false);
5193 // Check that the sorted loads are consecutive.
5194 if (LS == LoadsState::Gather) {
5195 if (BestVF) {
5196 DemandedElts.setAllBits();
5197 break;
5198 }
5199 DemandedElts.setBits(Cnt, Cnt + VF);
5200 continue;
5201 }
5202 // If the reorder is needed - consider it as a high-cost masked gather for now.
5203 if ((LS == LoadsState::Vectorize ||
5204 LS == LoadsState::StridedVectorize) &&
5205 !Order.empty() && !isReverseOrder(Order))
5206 LS = LoadsState::ScatterVectorize;
5207 States.push_back(LS);
5208 }
5209 if (DemandedElts.isAllOnes())
5210 // All loads gathered - try smaller VF.
5211 continue;
5212 // Can be vectorized later as a series of loads/insertelements.
5213 InstructionCost VecLdCost = 0;
5214 if (!DemandedElts.isZero()) {
5215 VecLdCost =
5216 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5217 /*Extract=*/false, CostKind) +
5218 ScalarGEPCost;
5219 for (unsigned Idx : seq<unsigned>(VL.size()))
5220 if (DemandedElts[Idx])
5221 VecLdCost +=
5222 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
5223 }
5224 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
5225 auto *SubVecTy = getWidenedType(ScalarTy, VF);
5226 for (auto [I, LS] : enumerate(States)) {
5227 auto *LI0 = cast<LoadInst>(VL[I * VF]);
5228 InstructionCost VectorGEPCost =
5229 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
5230 ? 0
5231 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
5232 LI0->getPointerOperand(),
5233 Instruction::GetElementPtr, CostKind, ScalarTy,
5234 SubVecTy)
5235 .second;
5236 if (LS == LoadsState::ScatterVectorize) {
5237 if (static_cast<unsigned>(
5238 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5239 PointerOps.size() - 1 ||
5240 any_of(PointerOps, [&](Value *V) {
5241 return getUnderlyingObject(V) !=
5242 getUnderlyingObject(PointerOps.front());
5243 }))
5244 VectorGEPCost += TTI.getScalarizationOverhead(
5245 SubVecTy, APInt::getAllOnes(VF),
5246 /*Insert=*/true, /*Extract=*/false, CostKind);
5247 else
5248 VectorGEPCost +=
5249 TTI.getScalarizationOverhead(
5250 SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
5251 /*Insert=*/true, /*Extract=*/false, CostKind) +
5252 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
5253 CostKind);
5254 }
5255 switch (LS) {
5256 case LoadsState::Vectorize:
5257 VecLdCost +=
5258 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
5259 LI0->getPointerAddressSpace(), CostKind,
5260 TTI::OperandValueInfo()) +
5261 VectorGEPCost;
5262 break;
5263 case LoadsState::StridedVectorize:
5264 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
5265 LI0->getPointerOperand(),
5266 /*VariableMask=*/false,
5267 CommonAlignment, CostKind) +
5268 VectorGEPCost;
5269 break;
5270 case LoadsState::ScatterVectorize:
5271 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
5272 LI0->getPointerOperand(),
5273 /*VariableMask=*/false,
5274 CommonAlignment, CostKind) +
5275 VectorGEPCost;
5276 break;
5277 case LoadsState::Gather:
5278 // Gathers are already calculated - ignore.
5279 continue;
5280 }
5281 SmallVector<int> ShuffleMask(VL.size());
5282 for (int Idx : seq<int>(0, VL.size()))
5283 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
5284 if (I > 0)
5285 VecLdCost +=
5286 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
5287 CostKind, I * VF, SubVecTy);
5288 }
5289 // If masked gather cost is higher - better to vectorize, so
5290 // consider it as a gather node. It will be better estimated
5291 // later.
5292 if (MaskedGatherCost >= VecLdCost &&
5293 VecLdCost - GatherCost < -SLPCostThreshold) {
5294 if (BestVF)
5295 *BestVF = VF;
5296 return true;
5297 }
5298 }
5299 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5300 };
5301 // TODO: need to improve analysis of the pointers, if not all of them are
5302 // GEPs or have > 2 operands, we end up with a gather node, which just
5303 // increases the cost.
5304 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
5305 bool ProfitableGatherPointers =
5306 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
5307 return L->isLoopInvariant(V);
5308 })) <= Sz / 2;
5309 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
5310 auto *GEP = dyn_cast<GetElementPtrInst>(P);
5311 return (!GEP && doesNotNeedToBeScheduled(P)) ||
5312 (GEP && GEP->getNumOperands() == 2 &&
5313 isa<Constant, Instruction>(GEP->getOperand(1)));
5314 })) {
5315 // Check if potential masked gather can be represented as series
5316 // of loads + insertsubvectors.
5317 // If masked gather cost is higher - better to vectorize, so
5318 // consider it as a gather node. It will be better estimated
5319 // later.
5320 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5321 ProfitableGatherPointers))
5322 return LoadsState::ScatterVectorize;
5323 }
5324
5325 return LoadsState::Gather;
5326}
5327
5328 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
5329 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
5330 const DataLayout &DL, ScalarEvolution &SE,
5331 SmallVectorImpl<unsigned> &SortedIndices) {
5332 assert(
5333 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
5334 "Expected list of pointer operands.");
5335 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
5336 // Ptr into, sort and return the sorted indices with values next to one
5337 // another.
5338 SmallMapVector<std::pair<BasicBlock *, Value *>,
5339 SmallVector<SmallVector<std::tuple<Value *, int, unsigned>>>, 8>
5340 Bases;
5341 Bases
5342 .try_emplace(std::make_pair(
5343 BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
5344 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
5345
5346 SortedIndices.clear();
5347 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
5348 auto Key = std::make_pair(BBs[Cnt + 1],
5349 getUnderlyingObject(Ptr, RecursionMaxDepth));
5350 bool Found = any_of(Bases.try_emplace(Key).first->second,
5351 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
5352 std::optional<int> Diff = getPointersDiff(
5353 ElemTy, std::get<0>(Base.front()), ElemTy,
5354 Ptr, DL, SE,
5355 /*StrictCheck=*/true);
5356 if (!Diff)
5357 return false;
5358
5359 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5360 return true;
5361 });
5362
5363 if (!Found) {
5364 // If we haven't found enough to usefully cluster, return early.
5365 if (Bases.size() > VL.size() / 2 - 1)
5366 return false;
5367
5368 // Not found already - add a new Base
5369 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
5370 }
5371 }
5372
5373 if (Bases.size() == VL.size())
5374 return false;
5375
5376 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
5377 Bases.front().second.size() == VL.size()))
5378 return false;
5379
5380 // For each of the bases sort the pointers by Offset and check if any of the
5381 // bases become consecutively allocated.
5382 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
5383 SmallPtrSet<Value *, 13> FirstPointers;
5384 SmallPtrSet<Value *, 13> SecondPointers;
5385 Value *P1 = Ptr1;
5386 Value *P2 = Ptr2;
5387 unsigned Depth = 0;
5388 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
5389 if (P1 == P2 || Depth > RecursionMaxDepth)
5390 return false;
5391 FirstPointers.insert(P1);
5392 SecondPointers.insert(P2);
5393 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
5394 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
5395 ++Depth;
5396 }
5397 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
5398 "Unable to find matching root.");
5399 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
5400 };
5401 for (auto &Base : Bases) {
5402 for (auto &Vec : Base.second) {
5403 if (Vec.size() > 1) {
5404 stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
5405 const std::tuple<Value *, int, unsigned> &Y) {
5406 return std::get<1>(X) < std::get<1>(Y);
5407 });
5408 int InitialOffset = std::get<1>(Vec[0]);
5409 bool AnyConsecutive =
5410 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
5411 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
5412 });
5413 // Fill SortedIndices array only if it looks worthwhile to sort the
5414 // ptrs.
5415 if (!AnyConsecutive)
5416 return false;
5417 }
5418 }
5419 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
5420 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5421 });
5422 }
5423
5424 for (auto &T : Bases)
5425 for (const auto &Vec : T.second)
5426 for (const auto &P : Vec)
5427 SortedIndices.push_back(std::get<2>(P));
5428
5429 assert(SortedIndices.size() == VL.size() &&
5430 "Expected SortedIndices to be the size of VL");
5431 return true;
5432}
5433
5434std::optional<BoUpSLP::OrdersType>
5435BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5436 assert(TE.isGather() && "Expected gather node only.");
5437 Type *ScalarTy = TE.Scalars[0]->getType();
5438
5439 SmallVector<Value *> Ptrs;
5440 Ptrs.reserve(TE.Scalars.size());
5441 SmallVector<BasicBlock *> BBs;
5442 BBs.reserve(TE.Scalars.size());
5443 for (Value *V : TE.Scalars) {
5444 auto *L = dyn_cast<LoadInst>(V);
5445 if (!L || !L->isSimple())
5446 return std::nullopt;
5447 Ptrs.push_back(L->getPointerOperand());
5448 BBs.push_back(L->getParent());
5449 }
5450
5451 BoUpSLP::OrdersType Order;
5452 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
5453 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
5454 return std::move(Order);
5455 return std::nullopt;
5456}
5457
5458/// Check if two insertelement instructions are from the same buildvector.
5459 static bool areTwoInsertFromSameBuildVector(
5460 InsertElementInst *VU, InsertElementInst *V,
5461 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5462 // Instructions must be from the same basic blocks.
5463 if (VU->getParent() != V->getParent())
5464 return false;
5465 // Checks if 2 insertelements are from the same buildvector.
5466 if (VU->getType() != V->getType())
5467 return false;
5468 // Multiple used inserts are separate nodes.
5469 if (!VU->hasOneUse() && !V->hasOneUse())
5470 return false;
5471 auto *IE1 = VU;
5472 auto *IE2 = V;
5473 std::optional<unsigned> Idx1 = getElementIndex(IE1);
5474 std::optional<unsigned> Idx2 = getElementIndex(IE2);
5475 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5476 return false;
5477 // Go through the vector operand of insertelement instructions trying to find
5478 // either VU as the original vector for IE2 or V as the original vector for
5479 // IE1.
5480 SmallBitVector ReusedIdx(
5481 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5482 bool IsReusedIdx = false;
5483 do {
5484 if (IE2 == VU && !IE1)
5485 return VU->hasOneUse();
5486 if (IE1 == V && !IE2)
5487 return V->hasOneUse();
5488 if (IE1 && IE1 != V) {
5489 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
5490 IsReusedIdx |= ReusedIdx.test(Idx1);
5491 ReusedIdx.set(Idx1);
5492 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5493 IE1 = nullptr;
5494 else
5495 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5496 }
5497 if (IE2 && IE2 != VU) {
5498 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
5499 IsReusedIdx |= ReusedIdx.test(Idx2);
5500 ReusedIdx.set(Idx2);
5501 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5502 IE2 = nullptr;
5503 else
5504 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5505 }
5506 } while (!IsReusedIdx && (IE1 || IE2));
5507 return false;
5508}
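// Example: for the chain
//   %i0 = insertelement <4 x float> poison, float %a, i32 0
//   %i1 = insertelement <4 x float> %i0, float %b, i32 1
// walking the vector operand of %i1 reaches %i0 without reusing an insert
// index, so (assuming %i0 is only used by %i1) the two inserts are treated as
// parts of the same buildvector.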
5509
5510std::optional<BoUpSLP::OrdersType>
5511BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5512 // No need to reorder if we need to shuffle reuses; the node still needs to
5513 // be shuffled.
5514 if (!TE.ReuseShuffleIndices.empty()) {
5515 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5516 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
5517 "Reshuffling scalars not yet supported for nodes with padding");
5518
5519 if (isSplat(TE.Scalars))
5520 return std::nullopt;
5521 // Check if reuse shuffle indices can be improved by reordering.
5522 // For this, check that the reuse mask is "clustered", i.e. each scalar value
5523 // is used once in each submask of size <number_of_scalars>.
5524 // Example: 4 scalar values.
5525 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5526 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5527 // element 3 is used twice in the second submask.
5528 unsigned Sz = TE.Scalars.size();
5529 if (TE.isGather()) {
5530 if (std::optional<OrdersType> CurrentOrder =
5531 findReusedOrderedScalars(TE)) {
5532 SmallVector<int> Mask;
5533 fixupOrderingIndices(*CurrentOrder);
5534 inversePermutation(*CurrentOrder, Mask);
5535 ::addMask(Mask, TE.ReuseShuffleIndices);
5536 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5537 unsigned Sz = TE.Scalars.size();
5538 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5539 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5540 if (Idx != PoisonMaskElem)
5541 Res[Idx + K * Sz] = I + K * Sz;
5542 }
5543 return std::move(Res);
5544 }
5545 }
5546 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5547 TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5548 2 * TE.getVectorFactor())) == 1)
5549 return std::nullopt;
5550 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5551 Sz)) {
5552 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5553 if (TE.ReorderIndices.empty())
5554 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5555 else
5556 inversePermutation(TE.ReorderIndices, ReorderMask);
5557 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5558 unsigned VF = ReorderMask.size();
5559 OrdersType ResOrder(VF, VF);
5560 unsigned NumParts = divideCeil(VF, Sz);
5561 SmallBitVector UsedVals(NumParts);
5562 for (unsigned I = 0; I < VF; I += Sz) {
5563 int Val = PoisonMaskElem;
5564 unsigned UndefCnt = 0;
5565 unsigned Limit = std::min(Sz, VF - I);
5566 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5567 [&](int Idx) {
5568 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
5569 Val = Idx;
5570 if (Idx == PoisonMaskElem)
5571 ++UndefCnt;
5572 return Idx != PoisonMaskElem && Idx != Val;
5573 }) ||
5574 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
5575 UndefCnt > Sz / 2)
5576 return std::nullopt;
5577 UsedVals.set(Val);
5578 for (unsigned K = 0; K < NumParts; ++K) {
5579 unsigned Idx = Val + Sz * K;
5580 if (Idx < VF)
5581 ResOrder[Idx] = I + K;
5582 }
5583 }
5584 return std::move(ResOrder);
5585 }
5586 unsigned VF = TE.getVectorFactor();
5587 // Try build correct order for extractelement instructions.
5588 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5589 TE.ReuseShuffleIndices.end());
5590 if (TE.getOpcode() == Instruction::ExtractElement &&
5591 all_of(TE.Scalars, [Sz](Value *V) {
5592 if (isa<PoisonValue>(V))
5593 return true;
5594 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5595 return Idx && *Idx < Sz;
5596 })) {
5597 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
5598 "by BinaryOperator and CastInst.");
5599 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5600 if (TE.ReorderIndices.empty())
5601 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5602 else
5603 inversePermutation(TE.ReorderIndices, ReorderMask);
5604 for (unsigned I = 0; I < VF; ++I) {
5605 int &Idx = ReusedMask[I];
5606 if (Idx == PoisonMaskElem)
5607 continue;
5608 Value *V = TE.Scalars[ReorderMask[Idx]];
5609 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5610 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5611 }
5612 }
5613 // Build the order of the VF size; need to reorder the reuses shuffles, which
5614 // are always of VF size.
5615 OrdersType ResOrder(VF);
5616 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5617 auto *It = ResOrder.begin();
5618 for (unsigned K = 0; K < VF; K += Sz) {
5619 OrdersType CurrentOrder(TE.ReorderIndices);
5620 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5621 if (SubMask.front() == PoisonMaskElem)
5622 std::iota(SubMask.begin(), SubMask.end(), 0);
5623 reorderOrder(CurrentOrder, SubMask);
5624 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5625 std::advance(It, Sz);
5626 }
5627 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5628 return Data.index() == Data.value();
5629 }))
5630 return std::nullopt; // No need to reorder.
5631 return std::move(ResOrder);
5632 }
5633 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5634 any_of(TE.UserTreeIndices,
5635 [](const EdgeInfo &EI) {
5636 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5637 }) &&
5638 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5639 return std::nullopt;
5640 if ((TE.State == TreeEntry::Vectorize ||
5641 TE.State == TreeEntry::StridedVectorize) &&
5642 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5643 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5644 assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
5645 "BinaryOperator and CastInst.");
5646 return TE.ReorderIndices;
5647 }
5648 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5649 if (!TE.ReorderIndices.empty())
5650 return TE.ReorderIndices;
5651
5652 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
5653 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
5654 if (!V->hasNUsesOrMore(1))
5655 continue;
5656 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
5657 if (!II)
5658 continue;
5659 Instruction *BVHead = nullptr;
5660 BasicBlock *BB = II->getParent();
5661 while (II && II->hasOneUse() && II->getParent() == BB) {
5662 BVHead = II;
5663 II = dyn_cast<InsertElementInst>(II->getOperand(0));
5664 }
5665 I = BVHead;
5666 }
5667
5668 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
5669 assert(BB1 != BB2 && "Expected different basic blocks.");
5670 auto *NodeA = DT->getNode(BB1);
5671 auto *NodeB = DT->getNode(BB2);
5672 assert(NodeA && "Should only process reachable instructions");
5673 assert(NodeB && "Should only process reachable instructions");
5674 assert((NodeA == NodeB) ==
5675 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5676 "Different nodes should have different DFS numbers");
5677 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5678 };
5679 auto PHICompare = [&](unsigned I1, unsigned I2) {
5680 Value *V1 = TE.Scalars[I1];
5681 Value *V2 = TE.Scalars[I2];
5682 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5683 return false;
5684 if (isa<PoisonValue>(V1))
5685 return true;
5686 if (isa<PoisonValue>(V2))
5687 return false;
5688 if (V1->getNumUses() < V2->getNumUses())
5689 return true;
5690 if (V1->getNumUses() > V2->getNumUses())
5691 return false;
5692 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5693 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5694 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5695 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5696 FirstUserOfPhi2->getParent());
5697 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5698 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5699 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5700 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5701 if (IE1 && !IE2)
5702 return true;
5703 if (!IE1 && IE2)
5704 return false;
5705 if (IE1 && IE2) {
5706 if (UserBVHead[I1] && !UserBVHead[I2])
5707 return true;
5708 if (!UserBVHead[I1])
5709 return false;
5710 if (UserBVHead[I1] == UserBVHead[I2])
5711 return getElementIndex(IE1) < getElementIndex(IE2);
5712 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
5713 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
5714 UserBVHead[I2]->getParent());
5715 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5716 }
5717 if (EE1 && !EE2)
5718 return true;
5719 if (!EE1 && EE2)
5720 return false;
5721 if (EE1 && EE2) {
5722 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5723 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5724 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5725 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5726 if (!Inst2 && !P2)
5727 return Inst1 || P1;
5728 if (EE1->getOperand(0) == EE2->getOperand(0))
5729 return getElementIndex(EE1) < getElementIndex(EE2);
5730 if (!Inst1 && Inst2)
5731 return false;
5732 if (Inst1 && Inst2) {
5733 if (Inst1->getParent() != Inst2->getParent())
5734 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
5735 return Inst1->comesBefore(Inst2);
5736 }
5737 if (!P1 && P2)
5738 return false;
5739 assert(P1 && P2 &&
5740 "Expected either instructions or arguments vector operands.");
5741 return P1->getArgNo() < P2->getArgNo();
5742 }
5743 return false;
5744 };
5745 OrdersType Phis(TE.Scalars.size());
5746 std::iota(Phis.begin(), Phis.end(), 0);
5747 stable_sort(Phis, PHICompare);
5748 if (isIdentityOrder(Phis))
5749 return std::nullopt; // No need to reorder.
5750 return std::move(Phis);
5751 }
5752 if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
5753 // TODO: add analysis of other gather nodes with extractelement
5754 // instructions and other values/instructions, not only undefs.
5755 if ((TE.getOpcode() == Instruction::ExtractElement ||
5756 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5757 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5758 all_of(TE.Scalars, [](Value *V) {
5759 auto *EE = dyn_cast<ExtractElementInst>(V);
5760 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5761 })) {
5762 // Check that gather of extractelements can be represented as
5763 // just a shuffle of a single vector.
5764 OrdersType CurrentOrder;
5765 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5766 /*ResizeAllowed=*/true);
5767 if (Reuse || !CurrentOrder.empty())
5768 return std::move(CurrentOrder);
5769 }
5770 // If the gather node is <undef, v, .., poison> and
5771 // insertelement poison, v, 0 [+ permute]
5772 // is cheaper than
5773 // insertelement poison, v, n - try to reorder.
5774 // If rotating the whole graph, exclude the permute cost, the whole graph
5775 // might be transformed.
5776 int Sz = TE.Scalars.size();
5777 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5778 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5779 const auto *It =
5780 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5781 if (It == TE.Scalars.begin())
5782 return OrdersType();
5783 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5784 if (It != TE.Scalars.end()) {
5785 OrdersType Order(Sz, Sz);
5786 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5787 Order[Idx] = 0;
5788 fixupOrderingIndices(Order);
5789 SmallVector<int> Mask;
5790 inversePermutation(Order, Mask);
5791 InstructionCost PermuteCost =
5792 TopToBottom
5793 ? 0
5794 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
5795 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5796 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5797 PoisonValue::get(Ty), *It);
5798 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5799 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5800 PoisonValue::get(Ty), *It);
5801 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5802 OrdersType Order(Sz, Sz);
5803 Order[Idx] = 0;
5804 return std::move(Order);
5805 }
5806 }
5807 }
5808 if (isSplat(TE.Scalars))
5809 return std::nullopt;
5810 if (TE.Scalars.size() >= 3)
5811 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5812 return Order;
5813 // Check if we can include the order of vectorized loads. For masked gathers
5814 // extra analysis is done later, so include such nodes into a special list.
5815 if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
5816 SmallVector<Value *> PointerOps;
5817 OrdersType CurrentOrder;
5818 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
5819 CurrentOrder, PointerOps);
5820 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
5821 return std::move(CurrentOrder);
5822 }
5823 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
5824 // has been audited for correctness with non-power-of-two vectors.
5825 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
5826 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5827 return CurrentOrder;
5828 }
5829 return std::nullopt;
5830}
5831
5832/// Checks if the given mask is a "clustered" mask with the same clusters of
5833/// size \p Sz, which are not identity submasks.
5834 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5835 unsigned Sz) {
5836 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5837 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5838 return false;
5839 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5840 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5841 if (Cluster != FirstCluster)
5842 return false;
5843 }
5844 return true;
5845}
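// Example: with Sz == 4, the mask {1, 0, 3, 2, 1, 0, 3, 2} is a repeated
// non-identity cluster, while {0, 1, 2, 3, 0, 1, 2, 3} is rejected because its
// first cluster is an identity submask.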
5846
5847void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5848 // Reorder reuses mask.
5849 reorderReuses(TE.ReuseShuffleIndices, Mask);
5850 const unsigned Sz = TE.Scalars.size();
5851 // For vectorized and non-clustered reuses no need to do anything else.
5852 if (!TE.isGather() ||
5853 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5854 Sz) ||
5855 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5856 return;
5857 SmallVector<int> NewMask;
5858 inversePermutation(TE.ReorderIndices, NewMask);
5859 addMask(NewMask, TE.ReuseShuffleIndices);
5860 // Clear reorder since it is going to be applied to the new mask.
5861 TE.ReorderIndices.clear();
5862 // Try to improve gathered nodes with clustered reuses, if possible.
5863 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5864 SmallVector<unsigned> NewOrder(Slice);
5865 inversePermutation(NewOrder, NewMask);
5866 reorderScalars(TE.Scalars, NewMask);
5867 // Fill the reuses mask with the identity submasks.
5868 for (auto *It = TE.ReuseShuffleIndices.begin(),
5869 *End = TE.ReuseShuffleIndices.end();
5870 It != End; std::advance(It, Sz))
5871 std::iota(It, std::next(It, Sz), 0);
5872}
5873
5874 static void combineOrders(MutableArrayRef<unsigned> Order,
5875 ArrayRef<unsigned> SecondaryOrder) {
5876 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5877 "Expected same size of orders");
5878 unsigned Sz = Order.size();
5879 SmallBitVector UsedIndices(Sz);
5880 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5881 if (Order[Idx] != Sz)
5882 UsedIndices.set(Order[Idx]);
5883 }
5884 if (SecondaryOrder.empty()) {
5885 for (unsigned Idx : seq<unsigned>(0, Sz))
5886 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5887 Order[Idx] = Idx;
5888 } else {
5889 for (unsigned Idx : seq<unsigned>(0, Sz))
5890 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5891 !UsedIndices.test(SecondaryOrder[Idx]))
5892 Order[Idx] = SecondaryOrder[Idx];
5893 }
5894}
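// Example: Order = {2, 4, 1, 4} (an entry equal to the size marks an unset
// slot) combined with SecondaryOrder = {2, 0, 1, 3} fills the unset slots from
// the secondary order, giving {2, 0, 1, 3}. With an empty SecondaryOrder only
// slot 3 can be filled with its own index (index 1 is already used), giving
// {2, 4, 1, 3}.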
5895
5896 void BoUpSLP::reorderTopToBottom() {
5897 // Maps VF to the graph nodes.
5898 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5899 // ExtractElement gather nodes which can be vectorized and need to handle
5900 // their ordering.
5901 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5902
5903 // Phi nodes can have preferred ordering based on their result users
5904 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5905
5906 // AltShuffles can also have a preferred ordering that leads to fewer
5907 // instructions, e.g., the addsub instruction in x86.
5908 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5909
5910 // Maps a TreeEntry to the reorder indices of external users.
5911 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5912 ExternalUserReorderMap;
5913 // Find all reorderable nodes with the given VF.
5914 // Currently these are vectorized stores, loads, extracts + some gathering of
5915 // extracts.
5916 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5917 const std::unique_ptr<TreeEntry> &TE) {
5918 // Look for external users that will probably be vectorized.
5919 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5920 findExternalStoreUsersReorderIndices(TE.get());
5921 if (!ExternalUserReorderIndices.empty()) {
5922 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5923 ExternalUserReorderMap.try_emplace(TE.get(),
5924 std::move(ExternalUserReorderIndices));
5925 }
5926
5927 // Patterns like [fadd,fsub] can be combined into a single instruction in
5928 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5929 // to take into account their order when looking for the most used order.
5930 if (TE->isAltShuffle()) {
5931 VectorType *VecTy =
5932 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5933 unsigned Opcode0 = TE->getOpcode();
5934 unsigned Opcode1 = TE->getAltOpcode();
5935 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5936 // If this pattern is supported by the target then we consider the order.
5937 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5938 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5939 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5940 }
5941 // TODO: Check the reverse order too.
5942 }
5943
5944 if (std::optional<OrdersType> CurrentOrder =
5945 getReorderingData(*TE, /*TopToBottom=*/true)) {
5946 // Do not include ordering for nodes used in the alt opcode vectorization,
5948 // better to reorder them during the bottom-to-top stage. If we follow the
5949 // order here, it causes reordering of the whole graph, though actually it is
5949 // profitable just to reorder the subgraph that starts from the alternate
5950 // opcode vectorization node. Such nodes already end-up with the shuffle
5951 // instruction and it is just enough to change this shuffle rather than
5952 // rotate the scalars for the whole graph.
5953 unsigned Cnt = 0;
5954 const TreeEntry *UserTE = TE.get();
5955 while (UserTE && Cnt < RecursionMaxDepth) {
5956 if (UserTE->UserTreeIndices.size() != 1)
5957 break;
5958 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5959 return EI.UserTE->State == TreeEntry::Vectorize &&
5960 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5961 }))
5962 return;
5963 UserTE = UserTE->UserTreeIndices.back().UserTE;
5964 ++Cnt;
5965 }
5966 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5967 if (!(TE->State == TreeEntry::Vectorize ||
5968 TE->State == TreeEntry::StridedVectorize) ||
5969 !TE->ReuseShuffleIndices.empty())
5970 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5971 if (TE->State == TreeEntry::Vectorize &&
5972 TE->getOpcode() == Instruction::PHI)
5973 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5974 }
5975 });
5976
5977 // Reorder the graph nodes according to their vectorization factor.
5978 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
5979 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
5980 auto It = VFToOrderedEntries.find(VF);
5981 if (It == VFToOrderedEntries.end())
5982 continue;
5983 // Try to find the most profitable order. We are just looking for the most
5984 // used order and reorder scalar elements in the nodes according to this
5985 // most used order.
5986 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5987 // Delete VF entry upon exit.
5988 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
5989
5990 // All operands are reordered and used only in this node - propagate the
5991 // most used order to the user node.
5992 MapVector<OrdersType, unsigned,
5993 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5994 OrdersUses;
5996 for (const TreeEntry *OpTE : OrderedEntries) {
5998 // No need to reorder these nodes, still need to extend and to use shuffle,
5998 // just need to merge reordering shuffle and the reuse shuffle.
5999 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6000 continue;
6001 // Count number of orders uses.
6002 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6003 &PhisToOrders]() -> const OrdersType & {
6004 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6005 auto It = GathersToOrders.find(OpTE);
6006 if (It != GathersToOrders.end())
6007 return It->second;
6008 }
6009 if (OpTE->isAltShuffle()) {
6010 auto It = AltShufflesToOrders.find(OpTE);
6011 if (It != AltShufflesToOrders.end())
6012 return It->second;
6013 }
6014 if (OpTE->State == TreeEntry::Vectorize &&
6015 OpTE->getOpcode() == Instruction::PHI) {
6016 auto It = PhisToOrders.find(OpTE);
6017 if (It != PhisToOrders.end())
6018 return It->second;
6019 }
6020 return OpTE->ReorderIndices;
6021 }();
6022 // First consider the order of the external scalar users.
6023 auto It = ExternalUserReorderMap.find(OpTE);
6024 if (It != ExternalUserReorderMap.end()) {
6025 const auto &ExternalUserReorderIndices = It->second;
6026 // If the OpTE vector factor != number of scalars - use natural order,
6027 // it is an attempt to reorder node with reused scalars but with
6028 // external uses.
6029 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6030 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
6031 ExternalUserReorderIndices.size();
6032 } else {
6033 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
6034 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6035 }
6036 // No other useful reorder data in this entry.
6037 if (Order.empty())
6038 continue;
6039 }
6040 // Stores actually store the mask, not the order, need to invert.
6041 if (OpTE->State == TreeEntry::Vectorize &&
6042 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6043 assert(!OpTE->isAltShuffle() &&
6044 "Alternate instructions are only supported by BinaryOperator "
6045 "and CastInst.");
6046 SmallVector<int> Mask;
6047 inversePermutation(Order, Mask);
6048 unsigned E = Order.size();
6049 OrdersType CurrentOrder(E, E);
6050 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6051 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6052 });
6053 fixupOrderingIndices(CurrentOrder);
6054 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6055 } else {
6056 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6057 }
6058 }
6059 if (OrdersUses.empty())
6060 continue;
6061 // Choose the most used order.
6062 unsigned IdentityCnt = 0;
6063 unsigned FilledIdentityCnt = 0;
6064 OrdersType IdentityOrder(VF, VF);
6065 for (auto &Pair : OrdersUses) {
6066 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6067 if (!Pair.first.empty())
6068 FilledIdentityCnt += Pair.second;
6069 IdentityCnt += Pair.second;
6070 combineOrders(IdentityOrder, Pair.first);
6071 }
6072 }
6073 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6074 unsigned Cnt = IdentityCnt;
6075 for (auto &Pair : OrdersUses) {
6076 // Prefer the identity order. However, if a filled identity (a non-empty
6077 // identity order) was found with the same number of uses as the new
6078 // candidate order, we can choose the candidate order instead.
6079 if (Cnt < Pair.second ||
6080 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6081 Cnt == Pair.second && !BestOrder.empty() &&
6082 isIdentityOrder(BestOrder))) {
6083 combineOrders(Pair.first, BestOrder);
6084 BestOrder = Pair.first;
6085 Cnt = Pair.second;
6086 } else {
6087 combineOrders(BestOrder, Pair.first);
6088 }
6089 }
6090 // Set order of the user node.
6091 if (isIdentityOrder(BestOrder))
6092 continue;
6093 fixupOrderingIndices(BestOrder);
6094 SmallVector<int> Mask;
6095 inversePermutation(BestOrder, Mask);
6096 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6097 unsigned E = BestOrder.size();
6098 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6099 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6100 });
6101 // Do an actual reordering, if profitable.
6102 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6103 // Just do the reordering for the nodes with the given VF.
6104 if (TE->Scalars.size() != VF) {
6105 if (TE->ReuseShuffleIndices.size() == VF) {
6106 // Need to reorder the reuses masks of the operands with smaller VF to
6107 // be able to find the match between the graph nodes and scalar
6108 // operands of the given node during vectorization/cost estimation.
6109 assert(all_of(TE->UserTreeIndices,
6110 [VF, &TE](const EdgeInfo &EI) {
6111 return EI.UserTE->Scalars.size() == VF ||
6112 EI.UserTE->Scalars.size() ==
6113 TE->Scalars.size();
6114 }) &&
6115 "All users must be of VF size.");
6116 if (SLPReVec) {
6117 assert(SLPReVec && "Only supported by REVEC.");
6118 // ShuffleVectorInst does not do reorderOperands (and it should not
6119 // because ShuffleVectorInst supports only a limited set of
6120 // patterns). Only do reorderNodeWithReuses if all of the users are
6121 // not ShuffleVectorInst.
6122 if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
6123 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6124 }))
6125 continue;
6126 assert(none_of(TE->UserTreeIndices,
6127 [&](const EdgeInfo &EI) {
6128 return isa<ShuffleVectorInst>(
6129 EI.UserTE->getMainOp());
6130 }) &&
6131 "Does not know how to reorder.");
6132 }
6133 // Update ordering of the operands with the smaller VF than the given
6134 // one.
6135 reorderNodeWithReuses(*TE, Mask);
6136 }
6137 continue;
6138 }
6139 if ((TE->State == TreeEntry::Vectorize ||
6140 TE->State == TreeEntry::StridedVectorize) &&
6141 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
6142 InsertElementInst>(TE->getMainOp()) ||
6143 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6144 assert(!TE->isAltShuffle() &&
6145 "Alternate instructions are only supported by BinaryOperator "
6146 "and CastInst.");
6147 // Build correct orders for extract{element,value}, loads and
6148 // stores.
6149 reorderOrder(TE->ReorderIndices, Mask);
6150 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6151 TE->reorderOperands(Mask);
6152 } else {
6153 // Reorder the node and its operands.
6154 TE->reorderOperands(Mask);
6155 assert(TE->ReorderIndices.empty() &&
6156 "Expected empty reorder sequence.");
6157 reorderScalars(TE->Scalars, Mask);
6158 }
6159 if (!TE->ReuseShuffleIndices.empty()) {
6160 // Apply reversed order to keep the original ordering of the reused
6161 // elements to avoid extra reorder indices shuffling.
6162 OrdersType CurrentOrder;
6163 reorderOrder(CurrentOrder, MaskOrder);
6164 SmallVector<int> NewReuses;
6165 inversePermutation(CurrentOrder, NewReuses);
6166 addMask(NewReuses, TE->ReuseShuffleIndices);
6167 TE->ReuseShuffleIndices.swap(NewReuses);
6168 }
6169 }
6170 }
6171}
6172
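/// Checks whether the operands of \p UserTE can be reordered. Vectorized
/// operands that are used only by \p UserTE are recorded in \p Edges, and
/// gather/buildvector (and scatter-vectorize) operands whose scalars may
/// simply be permuted are appended to \p GatherOps. Returns false if some
/// operand is shared with another user node or is represented by more than
/// one non-vectorized entry (unless that operand is all-constant), since
/// reordering it would affect other parts of the graph.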
6173bool BoUpSLP::canReorderOperands(
6174 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6175 ArrayRef<TreeEntry *> ReorderableGathers,
6176 SmallVectorImpl<TreeEntry *> &GatherOps) {
6177 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
6178 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
6179 return OpData.first == I &&
6180 (OpData.second->State == TreeEntry::Vectorize ||
6181 OpData.second->State == TreeEntry::StridedVectorize);
6182 }))
6183 continue;
6184 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
6185 // Do not reorder if operand node is used by many user nodes.
6186 if (any_of(TE->UserTreeIndices,
6187 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6188 return false;
6189 // Add the node to the list of the ordered nodes with the identity
6190 // order.
6191 Edges.emplace_back(I, TE);
6192 // Add ScatterVectorize nodes to the list of operands, where just
6193 // reordering of the scalars is required. Similar to the gathers, so
6194 // simply add to the list of gathered ops.
6195 // If there are reused scalars, process this node as a regular vectorize
6196 // node, just reorder reuses mask.
6197 if (TE->State != TreeEntry::Vectorize &&
6198 TE->State != TreeEntry::StridedVectorize &&
6199 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6200 GatherOps.push_back(TE);
6201 continue;
6202 }
6203 TreeEntry *Gather = nullptr;
6204 if (count_if(ReorderableGathers,
6205 [&Gather, UserTE, I](TreeEntry *TE) {
6206 assert(TE->State != TreeEntry::Vectorize &&
6207 TE->State != TreeEntry::StridedVectorize &&
6208 "Only non-vectorized nodes are expected.");
6209 if (any_of(TE->UserTreeIndices,
6210 [UserTE, I](const EdgeInfo &EI) {
6211 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6212 })) {
6213 assert(TE->isSame(UserTE->getOperand(I)) &&
6214 "Operand entry does not match operands.");
6215 Gather = TE;
6216 return true;
6217 }
6218 return false;
6219 }) > 1 &&
6220 !allConstant(UserTE->getOperand(I)))
6221 return false;
6222 if (Gather)
6223 GatherOps.push_back(Gather);
6224 }
6225 return true;
6226}
6227
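/// Reorders the SLP graph from the leaves towards the root: orders produced
/// by leaf nodes (vectorized loads, extracts and gathers with reuses) are
/// propagated to their user nodes when all operands agree, using the same
/// "most used order" voting as the top-to-bottom pass. If \p IgnoreReorder
/// is true, the root node may be reordered freely and any reordering left on
/// it is dropped at the end.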
6228void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
6229 SetVector<TreeEntry *> OrderedEntries;
6230 DenseSet<const TreeEntry *> GathersToOrders;
6231 // Find all reorderable leaf nodes with the given VF.
6232 // Currently these are vectorized loads and extracts without alternate
6233 // operands, plus some gathers of extracts.
6234 SmallVector<TreeEntry *> NonVectorized;
6235 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6236 if (TE->State != TreeEntry::Vectorize &&
6237 TE->State != TreeEntry::StridedVectorize)
6238 NonVectorized.push_back(TE.get());
6239 if (std::optional<OrdersType> CurrentOrder =
6240 getReorderingData(*TE, /*TopToBottom=*/false)) {
6241 OrderedEntries.insert(TE.get());
6242 if (!(TE->State == TreeEntry::Vectorize ||
6243 TE->State == TreeEntry::StridedVectorize) ||
6244 !TE->ReuseShuffleIndices.empty())
6245 GathersToOrders.insert(TE.get());
6246 }
6247 }
6248
6249 // 1. Propagate the order to the graph nodes that use only reordered nodes.
6250 // I.e., if a node has operands that are reordered, try to put at least one
6251 // operand into the natural order, reorder the others, and reorder the
6252 // user node itself.
6253 SmallPtrSet<const TreeEntry *, 4> Visited;
6254 while (!OrderedEntries.empty()) {
6255 // 1. Filter out only reordered nodes.
6256 // 2. If the entry has multiple uses - skip it and jump to the next node.
6257 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
6258 SmallVector<TreeEntry *> Filtered;
6259 for (TreeEntry *TE : OrderedEntries) {
6260 if (!(TE->State == TreeEntry::Vectorize ||
6261 TE->State == TreeEntry::StridedVectorize ||
6262 (TE->isGather() && GathersToOrders.contains(TE))) ||
6263 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6264 !all_of(drop_begin(TE->UserTreeIndices),
6265 [TE](const EdgeInfo &EI) {
6266 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6267 }) ||
6268 !Visited.insert(TE).second) {
6269 Filtered.push_back(TE);
6270 continue;
6271 }
6272 // Build a map between user nodes and their operands order to speedup
6273 // search. The graph currently does not provide this dependency directly.
6274 for (EdgeInfo &EI : TE->UserTreeIndices)
6275 Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
6276 }
6277 // Erase filtered entries.
6278 for (TreeEntry *TE : Filtered)
6279 OrderedEntries.remove(TE);
6280 SmallVector<
6281 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6282 UsersVec(Users.begin(), Users.end());
6283 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
6284 return Data1.first->Idx > Data2.first->Idx;
6285 });
6286 for (auto &Data : UsersVec) {
6287 // Check that operands are used only in the User node.
6288 SmallVector<TreeEntry *> GatherOps;
6289 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
6290 GatherOps)) {
6291 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6292 OrderedEntries.remove(Op.second);
6293 continue;
6294 }
6295 // All operands are reordered and used only in this node - propagate the
6296 // most used order to the user node.
6297 MapVector<OrdersType, unsigned,
6298 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
6299 OrdersUses;
6300 // Do the analysis for each tree entry only once, otherwise the order of
6301 // the same node may be counted several times, even though that might not
6302 // be profitable.
6303 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
6304 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
6305 for (const auto &Op : Data.second) {
6306 TreeEntry *OpTE = Op.second;
6307 if (!VisitedOps.insert(OpTE).second)
6308 continue;
6309 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6310 continue;
6311 const auto Order = [&]() -> const OrdersType {
6312 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6313 return getReorderingData(*OpTE, /*TopToBottom=*/false)
6314 .value_or(OrdersType(1));
6315 return OpTE->ReorderIndices;
6316 }();
6317 // The order is partially ordered, skip it in favor of fully non-ordered
6318 // orders.
6319 if (Order.size() == 1)
6320 continue;
6321 unsigned NumOps = count_if(
6322 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
6323 return P.second == OpTE;
6324 });
6325 // Stores actually store the mask, not the order, need to invert.
6326 if (OpTE->State == TreeEntry::Vectorize &&
6327 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6328 assert(!OpTE->isAltShuffle() &&
6329 "Alternate instructions are only supported by BinaryOperator "
6330 "and CastInst.");
6331 SmallVector<int> Mask;
6332 inversePermutation(Order, Mask);
6333 unsigned E = Order.size();
6334 OrdersType CurrentOrder(E, E);
6335 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6336 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6337 });
6338 fixupOrderingIndices(CurrentOrder);
6339 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6340 NumOps;
6341 } else {
6342 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6343 }
6344 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
6345 const auto AllowsReordering = [&](const TreeEntry *TE) {
6346 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6347 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6348 (IgnoreReorder && TE->Idx == 0))
6349 return true;
6350 if (TE->isGather()) {
6351 if (GathersToOrders.contains(TE))
6352 return !getReorderingData(*TE, /*TopToBottom=*/false)
6353 .value_or(OrdersType(1))
6354 .empty();
6355 return true;
6356 }
6357 return false;
6358 };
6359 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
6360 TreeEntry *UserTE = EI.UserTE;
6361 if (!VisitedUsers.insert(UserTE).second)
6362 continue;
6363 // May reorder user node if it requires reordering, has reused
6364 // scalars, is an alternate op vectorize node or its op nodes require
6365 // reordering.
6366 if (AllowsReordering(UserTE))
6367 continue;
6368 // Check if users allow reordering.
6369 // Currently look up just 1 level of operands to avoid increase of
6370 // the compile time.
6371 // Profitable to reorder if definitely more operands allow
6372 // reordering rather than those with natural order.
6373 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
6374 if (static_cast<unsigned>(count_if(
6375 Ops, [UserTE, &AllowsReordering](
6376 const std::pair<unsigned, TreeEntry *> &Op) {
6377 return AllowsReordering(Op.second) &&
6378 all_of(Op.second->UserTreeIndices,
6379 [UserTE](const EdgeInfo &EI) {
6380 return EI.UserTE == UserTE;
6381 });
6382 })) <= Ops.size() / 2)
6383 ++Res.first->second;
6384 }
6385 }
6386 if (OrdersUses.empty()) {
6387 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6388 OrderedEntries.remove(Op.second);
6389 continue;
6390 }
6391 // Choose the most used order.
6392 unsigned IdentityCnt = 0;
6393 unsigned VF = Data.second.front().second->getVectorFactor();
6394 OrdersType IdentityOrder(VF, VF);
6395 for (auto &Pair : OrdersUses) {
6396 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6397 IdentityCnt += Pair.second;
6398 combineOrders(IdentityOrder, Pair.first);
6399 }
6400 }
6401 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6402 unsigned Cnt = IdentityCnt;
6403 for (auto &Pair : OrdersUses) {
6404 // Prefer the identity order. However, if a filled identity (a non-empty
6405 // identity order) was found with the same number of uses as the new
6406 // candidate order, we can choose the candidate order instead.
6407 if (Cnt < Pair.second) {
6408 combineOrders(Pair.first, BestOrder);
6409 BestOrder = Pair.first;
6410 Cnt = Pair.second;
6411 } else {
6412 combineOrders(BestOrder, Pair.first);
6413 }
6414 }
6415 // Set order of the user node.
6416 if (isIdentityOrder(BestOrder)) {
6417 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6418 OrderedEntries.remove(Op.second);
6419 continue;
6420 }
6421 fixupOrderingIndices(BestOrder);
6422 // Erase operands from OrderedEntries list and adjust their orders.
6423 VisitedOps.clear();
6424 SmallVector<int> Mask;
6425 inversePermutation(BestOrder, Mask);
6426 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6427 unsigned E = BestOrder.size();
6428 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6429 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6430 });
6431 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
6432 TreeEntry *TE = Op.second;
6433 OrderedEntries.remove(TE);
6434 if (!VisitedOps.insert(TE).second)
6435 continue;
6436 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
6437 reorderNodeWithReuses(*TE, Mask);
6438 continue;
6439 }
6440 // Gathers are processed separately.
6441 if (TE->State != TreeEntry::Vectorize &&
6442 TE->State != TreeEntry::StridedVectorize &&
6443 (TE->State != TreeEntry::ScatterVectorize ||
6444 TE->ReorderIndices.empty()))
6445 continue;
6446 assert((BestOrder.size() == TE->ReorderIndices.size() ||
6447 TE->ReorderIndices.empty()) &&
6448 "Non-matching sizes of user/operand entries.");
6449 reorderOrder(TE->ReorderIndices, Mask);
6450 if (IgnoreReorder && TE == VectorizableTree.front().get())
6451 IgnoreReorder = false;
6452 }
6453 // For gathers just need to reorder its scalars.
6454 for (TreeEntry *Gather : GatherOps) {
6455 assert(Gather->ReorderIndices.empty() &&
6456 "Unexpected reordering of gathers.");
6457 if (!Gather->ReuseShuffleIndices.empty()) {
6458 // Just reorder reuses indices.
6459 reorderReuses(Gather->ReuseShuffleIndices, Mask);
6460 continue;
6461 }
6462 reorderScalars(Gather->Scalars, Mask);
6463 OrderedEntries.remove(Gather);
6464 }
6465 // Reorder operands of the user node and set the ordering for the user
6466 // node itself.
6467 if (Data.first->State != TreeEntry::Vectorize ||
6468 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6469 Data.first->getMainOp()) ||
6470 Data.first->isAltShuffle())
6471 Data.first->reorderOperands(Mask);
6472 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
6473 Data.first->isAltShuffle() ||
6474 Data.first->State == TreeEntry::StridedVectorize) {
6475 reorderScalars(Data.first->Scalars, Mask);
6476 reorderOrder(Data.first->ReorderIndices, MaskOrder,
6477 /*BottomOrder=*/true);
6478 if (Data.first->ReuseShuffleIndices.empty() &&
6479 !Data.first->ReorderIndices.empty() &&
6480 !Data.first->isAltShuffle()) {
6481 // Insert user node to the list to try to sink reordering deeper in
6482 // the graph.
6483 OrderedEntries.insert(Data.first);
6484 }
6485 } else {
6486 reorderOrder(Data.first->ReorderIndices, Mask);
6487 }
6488 }
6489 }
6490 // If the reordering is unnecessary, just remove the reorder.
6491 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6492 VectorizableTree.front()->ReuseShuffleIndices.empty())
6493 VectorizableTree.front()->ReorderIndices.clear();
6494}
6495
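/// Returns the instruction that acts as the root scalar of \p Entry: for
/// reversed strided loads/stores this is the scalar referenced by the first
/// reorder index, otherwise the first scalar of the entry.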
6496Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
6497 if ((Entry.getOpcode() == Instruction::Store ||
6498 Entry.getOpcode() == Instruction::Load) &&
6499 Entry.State == TreeEntry::StridedVectorize &&
6500 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
6501 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
6502 return dyn_cast<Instruction>(Entry.Scalars.front());
6503}
6504
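/// Walks all vectorized tree entries and records in ExternalUses every
/// scalar that is also used outside of the tree (or listed in
/// \p ExternallyUsedValues), together with one representative user (nullptr
/// for extra arguments and heavily used scalars) and the lane the scalar
/// occupies, so that extracts can be emitted for them later.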
6505 void BoUpSLP::buildExternalUses(
6506 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
6507 DenseMap<Value *, unsigned> ScalarToExtUses;
6508 // Collect the values that we need to extract from the tree.
6509 for (auto &TEPtr : VectorizableTree) {
6510 TreeEntry *Entry = TEPtr.get();
6511
6512 // No need to handle users of gathered values.
6513 if (Entry->isGather())
6514 continue;
6515
6516 // For each lane:
6517 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6518 Value *Scalar = Entry->Scalars[Lane];
6519 if (!isa<Instruction>(Scalar))
6520 continue;
6521 // All uses must be replaced already? No need to do it again.
6522 auto It = ScalarToExtUses.find(Scalar);
6523 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
6524 continue;
6525
6526 // Check if the scalar is externally used as an extra arg.
6527 const auto ExtI = ExternallyUsedValues.find(Scalar);
6528 if (ExtI != ExternallyUsedValues.end()) {
6529 int FoundLane = Entry->findLaneForValue(Scalar);
6530 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
6531 << FoundLane << " from " << *Scalar << ".\n");
6532 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
6533 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
6534 continue;
6535 }
6536 for (User *U : Scalar->users()) {
6537 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
6538
6539 Instruction *UserInst = dyn_cast<Instruction>(U);
6540 if (!UserInst || isDeleted(UserInst))
6541 continue;
6542
6543 // Ignore users in the user ignore list.
6544 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6545 continue;
6546
6547 // Skip in-tree scalars that become vectors
6548 if (TreeEntry *UseEntry = getTreeEntry(U)) {
6549 // Some in-tree scalars will remain as scalar in vectorized
6550 // instructions. If that is the case, the one in FoundLane will
6551 // be used.
6552 if (UseEntry->State == TreeEntry::ScatterVectorize ||
6553 !doesInTreeUserNeedToExtract(
6554 Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
6555 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
6556 << ".\n");
6557 assert(!UseEntry->isGather() && "Bad state");
6558 continue;
6559 }
6560 U = nullptr;
6561 if (It != ScalarToExtUses.end()) {
6562 ExternalUses[It->second].User = nullptr;
6563 break;
6564 }
6565 }
6566
6567 if (U && Scalar->hasNUsesOrMore(UsesLimit))
6568 U = nullptr;
6569 int FoundLane = Entry->findLaneForValue(Scalar);
6570 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
6571 << " from lane " << FoundLane << " from " << *Scalar
6572 << ".\n");
6573 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
6574 ExternalUses.emplace_back(Scalar, U, FoundLane);
6575 if (!U)
6576 break;
6577 }
6578 }
6579 }
6580}
6581
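/// Collects stores that use scalars of \p TE, grouped by (basic block,
/// stored type, underlying pointer object). Only simple stores from the same
/// function that are not already part of the tree and whose pointer distance
/// to the group leader can be computed are kept, at most one store per lane.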
6582 SmallVector<SmallVector<StoreInst *>>
6583 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
6584 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
6585 SmallVector<StoreInst *>>
6586 PtrToStoresMap;
6587 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6588 Value *V = TE->Scalars[Lane];
6589 // Don't iterate over the users of constant data.
6590 if (!isa<Instruction>(V))
6591 continue;
6592 // To save compilation time we don't visit if we have too many users.
6593 if (V->hasNUsesOrMore(UsesLimit))
6594 break;
6595
6596 // Collect stores per pointer object.
6597 for (User *U : V->users()) {
6598 auto *SI = dyn_cast<StoreInst>(U);
6599 // Test whether we can handle the store. V might be a global, which could
6600 // be used in a different function.
6601 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
6602 !isValidElementType(SI->getValueOperand()->getType()))
6603 continue;
6604 // Skip the store if it is already part of the tree.
6605 if (getTreeEntry(U))
6606 continue;
6607
6608 Value *Ptr =
6609 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
6610 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6611 SI->getValueOperand()->getType(), Ptr}];
6612 // For now just keep one store per pointer object per lane.
6613 // TODO: Extend this to support multiple stores per pointer per lane
6614 if (StoresVec.size() > Lane)
6615 continue;
6616 if (!StoresVec.empty()) {
6617 std::optional<int> Diff = getPointersDiff(
6618 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6619 SI->getValueOperand()->getType(),
6620 StoresVec.front()->getPointerOperand(), *DL, *SE,
6621 /*StrictCheck=*/true);
6622 // We failed to compare the pointers so just abandon this store.
6623 if (!Diff)
6624 continue;
6625 }
6626 StoresVec.push_back(SI);
6627 }
6628 }
6629 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
6630 unsigned I = 0;
6631 for (auto &P : PtrToStoresMap) {
6632 Res[I].swap(P.second);
6633 ++I;
6634 }
6635 return Res;
6636}
6637
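/// Returns true if the stores in \p StoresVec write to consecutive memory
/// locations. On success \p ReorderIndices receives the permutation that
/// sorts the stores by address (empty if they are already in order).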
6638bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6639 OrdersType &ReorderIndices) const {
6640 // We check whether the stores in StoreVec can form a vector by sorting them
6641 // and checking whether they are consecutive.
6642
6643 // To avoid calling getPointersDiff() while sorting we create a vector of
6644 // pairs {store, offset from first} and sort this instead.
6645 SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
6646 StoreInst *S0 = StoresVec[0];
6647 StoreOffsetVec.emplace_back(0, 0);
6648 Type *S0Ty = S0->getValueOperand()->getType();
6649 Value *S0Ptr = S0->getPointerOperand();
6650 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6651 StoreInst *SI = StoresVec[Idx];
6652 std::optional<int> Diff =
6653 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
6654 SI->getPointerOperand(), *DL, *SE,
6655 /*StrictCheck=*/true);
6656 StoreOffsetVec.emplace_back(*Diff, Idx);
6657 }
6658
6659 // Check if the stores are consecutive by checking if their difference is 1.
6660 if (StoreOffsetVec.size() != StoresVec.size())
6661 return false;
6662 sort(StoreOffsetVec,
6663 [](const std::pair<int, unsigned> &L,
6664 const std::pair<int, unsigned> &R) { return L.first < R.first; });
6665 unsigned Idx = 0;
6666 int PrevDist = 0;
6667 for (const auto &P : StoreOffsetVec) {
6668 if (Idx > 0 && P.first != PrevDist + 1)
6669 return false;
6670 PrevDist = P.first;
6671 ++Idx;
6672 }
6673
6674 // Calculate the shuffle indices according to their offset against the sorted
6675 // StoreOffsetVec.
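// For example (illustrative): offsets of {0, 2, 1, 3} per StoresVec index
// sort to [(0,0), (1,2), (2,1), (3,3)], producing ReorderIndices =
// {0, 2, 1, 3}, i.e. the store at index 1 moves to position 2 and vice
// versa.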
6676 ReorderIndices.assign(StoresVec.size(), 0);
6677 bool IsIdentity = true;
6678 for (auto [I, P] : enumerate(StoreOffsetVec)) {
6679 ReorderIndices[P.second] = I;
6680 IsIdentity &= P.second == I;
6681 }
6682 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6683 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6684 // same convention here.
6685 if (IsIdentity)
6686 ReorderIndices.clear();
6687
6688 return true;
6689}
6690
6691#ifndef NDEBUG
6692 static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6693 for (unsigned Idx : Order)
6694 dbgs() << Idx << ", ";
6695 dbgs() << "\n";
6696}
6697#endif
6698
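/// For each group of external user stores of \p TE (see collectUserStores)
/// that covers every lane and writes to consecutive addresses, computes the
/// corresponding reorder indices. The results serve as extra "votes" when
/// picking the most profitable order for the entry.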
6699 SmallVector<BoUpSLP::OrdersType, 1>
6700 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6701 unsigned NumLanes = TE->Scalars.size();
6702
6703 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
6704
6705 // Holds the reorder indices for each candidate store vector that is a user of
6706 // the current TreeEntry.
6707 SmallVector<OrdersType, 1> ExternalReorderIndices;
6708
6709 // Now inspect the stores collected per pointer and look for vectorization
6710 // candidates. For each candidate calculate the reorder index vector and push
6711 // it into `ExternalReorderIndices`
6712 for (ArrayRef<StoreInst *> StoresVec : Stores) {
6713 // If we have fewer than NumLanes stores, then we can't form a vector.
6714 if (StoresVec.size() != NumLanes)
6715 continue;
6716
6717 // If the stores are not consecutive then abandon this StoresVec.
6718 OrdersType ReorderIndices;
6719 if (!canFormVector(StoresVec, ReorderIndices))
6720 continue;
6721
6722 // We now know that the scalars in StoresVec can form a vector instruction,
6723 // so set the reorder indices.
6724 ExternalReorderIndices.push_back(ReorderIndices);
6725 }
6726 return ExternalReorderIndices;
6727}
6728
6729 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6730 const SmallDenseSet<Value *> &UserIgnoreLst) {
6731 deleteTree();
6732 UserIgnoreList = &UserIgnoreLst;
6733 if (!allSameType(Roots))
6734 return;
6735 buildTree_rec(Roots, 0, EdgeInfo());
6736}
6737
6738 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6739 deleteTree();
6740 if (!allSameType(Roots))
6741 return;
6742 buildTree_rec(Roots, 0, EdgeInfo());
6743}
6744
6745 /// Tries to find subvectors of loads and builds a new vector of only loads if
6746 /// it can be profitable.
6747 static void gatherPossiblyVectorizableLoads(
6748 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
6749 ScalarEvolution &SE, const TargetTransformInfo &TTI,
6750 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
6751 bool AddNew = true) {
6752 if (VL.empty())
6753 return;
6754 Type *ScalarTy = getValueType(VL.front());
6755 if (!isValidElementType(ScalarTy))
6756 return;
6757 SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
6758 SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
6759 for (Value *V : VL) {
6760 auto *LI = dyn_cast<LoadInst>(V);
6761 if (!LI)
6762 continue;
6763 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6764 continue;
6765 bool IsFound = false;
6766 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
6767 assert(LI->getParent() == Data.front().first->getParent() &&
6768 LI->getType() == Data.front().first->getType() &&
6769 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
6770 getUnderlyingObject(Data.front().first->getPointerOperand(),
6772 "Expected loads with the same type, same parent and same "
6773 "underlying pointer.");
6774 std::optional<int> Dist = getPointersDiff(
6775 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
6776 Data.front().first->getPointerOperand(), DL, SE,
6777 /*StrictCheck=*/true);
6778 if (!Dist)
6779 continue;
6780 auto It = Map.find(*Dist);
6781 if (It != Map.end() && It->second != LI)
6782 continue;
6783 if (It == Map.end()) {
6784 Data.emplace_back(LI, *Dist);
6785 Map.try_emplace(*Dist, LI);
6786 }
6787 IsFound = true;
6788 break;
6789 }
6790 if (!IsFound) {
6791 ClusteredLoads.emplace_back().emplace_back(LI, 0);
6792 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
6793 }
6794 }
6795 auto FindMatchingLoads =
6796 [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
6797 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
6798 &GatheredLoads,
6799 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
6800 int &Offset, unsigned &Start) {
6801 if (Loads.empty())
6802 return GatheredLoads.end();
6804 LoadInst *LI = Loads.front().first;
6805 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
6806 if (Idx < Start)
6807 continue;
6808 ToAdd.clear();
6809 if (LI->getParent() != Data.front().first->getParent() ||
6810 LI->getType() != Data.front().first->getType())
6811 continue;
6812 std::optional<int> Dist =
6813 getPointersDiff(LI->getType(), LI->getPointerOperand(),
6814 Data.front().first->getType(),
6815 Data.front().first->getPointerOperand(), DL, SE,
6816 /*StrictCheck=*/true);
6817 if (!Dist)
6818 continue;
6819 SmallSet<int, 4> DataDists;
6820 SmallPtrSet<LoadInst *, 4> DataLoads;
6821 for (std::pair<LoadInst *, int> P : Data) {
6822 DataDists.insert(P.second);
6823 DataLoads.insert(P.first);
6824 }
6825 // Found matching gathered loads - check if all loads are unique or
6826 // can be effectively vectorized.
6827 unsigned NumUniques = 0;
6828 for (auto [Cnt, Pair] : enumerate(Loads)) {
6829 bool Used = DataLoads.contains(Pair.first);
6830 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
6831 ++NumUniques;
6832 ToAdd.insert(Cnt);
6833 } else if (Used) {
6834 Repeated.insert(Cnt);
6835 }
6836 }
6837 if (NumUniques > 0 &&
6838 (Loads.size() == NumUniques ||
6839 (Loads.size() - NumUniques >= 2 &&
6840 Loads.size() - NumUniques >= Loads.size() / 2 &&
6841 (has_single_bit(Data.size() + NumUniques) ||
6842 bit_ceil(Data.size()) <
6843 bit_ceil(Data.size() + NumUniques))))) {
6844 Offset = *Dist;
6845 Start = Idx + 1;
6846 return std::next(GatheredLoads.begin(), Idx);
6847 }
6848 }
6849 ToAdd.clear();
6850 return GatheredLoads.end();
6851 };
6852 for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
6853 unsigned Start = 0;
6854 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
6855 int Offset = 0;
6856 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
6857 Offset, Start);
6858 while (It != GatheredLoads.end()) {
6859 assert(!LocalToAdd.empty() && "Expected some elements to add.");
6860 for (unsigned Idx : LocalToAdd)
6861 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
6862 ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
6863 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
6864 Start);
6865 }
6866 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
6867 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6868 })) {
6869 auto AddNewLoads =
6870 [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
6871 for (unsigned Idx : seq<unsigned>(Data.size())) {
6872 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
6873 continue;
6874 Loads.push_back(Data[Idx]);
6875 }
6876 };
6877 if (!AddNew) {
6878 LoadInst *LI = Data.front().first;
6879 It = find_if(
6880 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6881 return PD.front().first->getParent() == LI->getParent() &&
6882 PD.front().first->getType() == LI->getType();
6883 });
6884 while (It != GatheredLoads.end()) {
6885 AddNewLoads(*It);
6886 It = std::find_if(
6887 std::next(It), GatheredLoads.end(),
6888 [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6889 return PD.front().first->getParent() == LI->getParent() &&
6890 PD.front().first->getType() == LI->getType();
6891 });
6892 }
6893 }
6894 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
6895 AddNewLoads(GatheredLoads.emplace_back());
6896 }
6897 }
6898}
6899
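/// Attempts to build vectorized tree entries for loads that previously ended
/// up in gather nodes. The loads are clustered by (basic block, underlying
/// pointer object, type), split into consecutive or masked-gather ranges,
/// and fed back into buildTree_rec with decreasing vector factors; whatever
/// still cannot be vectorized remains gathered.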
6900void BoUpSLP::tryToVectorizeGatheredLoads(
6901 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
6902 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
6903 8> &GatheredLoads) {
6904 GatheredLoadsEntriesFirst = VectorizableTree.size();
6905
6906 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
6907 LoadEntriesToVectorize.size());
6908 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6909 Set.insert(VectorizableTree[Idx]->Scalars.begin(),
6910 VectorizableTree[Idx]->Scalars.end());
6911
6912 // Sort loads by distance.
6913 auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
6914 const std::pair<LoadInst *, int> &L2) {
6915 return L1.second > L2.second;
6916 };
6917
6918 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
6919 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
6920 Loads.size());
6921 Align Alignment = computeCommonAlignment<LoadInst>(Values);
6922 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
6923 return TTI->isLegalMaskedGather(Ty, Alignment) &&
6924 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
6925 };
6926
6927 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
6928 BoUpSLP::ValueSet &VectorizedLoads,
6929 SmallVectorImpl<LoadInst *> &NonVectorized,
6930 bool Final, unsigned MaxVF) {
6931 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
6932 unsigned StartIdx = 0;
6933 SmallVector<int> CandidateVFs;
6934 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
6935 CandidateVFs.push_back(MaxVF);
6936 for (int NumElts = getFloorFullVectorNumberOfElements(
6937 *TTI, Loads.front()->getType(), MaxVF);
6938 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
6939 *TTI, Loads.front()->getType(), NumElts - 1)) {
6940 CandidateVFs.push_back(NumElts);
6941 if (VectorizeNonPowerOf2 && NumElts > 2)
6942 CandidateVFs.push_back(NumElts - 1);
6943 }
6944
6945 if (Final && CandidateVFs.empty())
6946 return Results;
6947
6948 unsigned BestVF = Final ? CandidateVFs.back() : 0;
6949 for (unsigned NumElts : CandidateVFs) {
6950 if (Final && NumElts > BestVF)
6951 continue;
6952 SmallVector<unsigned> MaskedGatherVectorized;
6953 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
6954 ++Cnt) {
6955 ArrayRef<LoadInst *> Slice =
6956 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
6957 if (VectorizedLoads.count(Slice.front()) ||
6958 VectorizedLoads.count(Slice.back()) ||
6959 areKnownNonVectorizableLoads(Slice))
6960 continue;
6961 // Check if it is profitable to try vectorizing gathered loads. It is
6962 // profitable if we have at least 3 consecutive loads or if we have
6963 // fewer but all their users are vectorized or deleted.
6964 bool AllowToVectorize = false;
6965 // Check if it is profitable to vectorize 2-elements loads.
6966 if (NumElts == 2) {
6967 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
6968 Slice.front()->getType(), ElementCount::getFixed(NumElts));
6969 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
6970 for (LoadInst *LI : Slice) {
6971 // If single use/user - allow to vectorize.
6972 if (LI->hasOneUse())
6973 continue;
6974 // 1. Check if number of uses equals number of users.
6975 // 2. All users are deleted.
6976 // 3. The load broadcasts are not allowed or the load is not
6977 // broadcasted.
6978 if (static_cast<unsigned int>(std::distance(
6979 LI->user_begin(), LI->user_end())) != LI->getNumUses())
6980 return false;
6981 if (!IsLegalBroadcastLoad)
6982 continue;
6983 if (LI->hasNUsesOrMore(UsesLimit))
6984 return false;
6985 for (User *U : LI->users()) {
6986 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
6987 continue;
6988 if (const TreeEntry *UTE = getTreeEntry(U)) {
6989 for (int I : seq<int>(UTE->getNumOperands())) {
6990 if (all_of(UTE->getOperand(I),
6991 [LI](Value *V) { return V == LI; }))
6992 // Found legal broadcast - do not vectorize.
6993 return false;
6994 }
6995 }
6996 }
6997 }
6998 return true;
6999 };
7000 AllowToVectorize = CheckIfAllowed(Slice);
7001 } else {
7002 AllowToVectorize =
7003 (NumElts >= 3 ||
7004 any_of(ValueToGatherNodes.at(Slice.front()),
7005 [=](const TreeEntry *TE) {
7006 return TE->Scalars.size() == 2 &&
7007 ((TE->Scalars.front() == Slice.front() &&
7008 TE->Scalars.back() == Slice.back()) ||
7009 (TE->Scalars.front() == Slice.back() &&
7010 TE->Scalars.back() == Slice.front()));
7011 })) &&
7012 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
7013 Slice.size());
7014 }
7015 if (AllowToVectorize) {
7016 SmallVector<Value *> PointerOps;
7017 OrdersType CurrentOrder;
7018 // Try to build vector load.
7019 ArrayRef<Value *> Values(
7020 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7021 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
7022 PointerOps, &BestVF);
7023 if (LS != LoadsState::Gather ||
7024 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7025 if (LS == LoadsState::ScatterVectorize) {
7026 if (MaskedGatherVectorized.empty() ||
7027 Cnt >= MaskedGatherVectorized.back() + NumElts)
7028 MaskedGatherVectorized.push_back(Cnt);
7029 continue;
7030 }
7031 if (LS != LoadsState::Gather) {
7032 Results.emplace_back(Values, LS);
7033 VectorizedLoads.insert(Slice.begin(), Slice.end());
7034 // If we vectorized initial block, no need to try to vectorize it
7035 // again.
7036 if (Cnt == StartIdx)
7037 StartIdx += NumElts;
7038 }
7039 // Check if the whole array was vectorized already - exit.
7040 if (StartIdx >= Loads.size())
7041 break;
7042 // Erase last masked gather candidate, if another candidate within
7043 // the range is found to be better.
7044 if (!MaskedGatherVectorized.empty() &&
7045 Cnt < MaskedGatherVectorized.back() + NumElts)
7046 MaskedGatherVectorized.pop_back();
7047 Cnt += NumElts - 1;
7048 continue;
7049 }
7050 }
7051 if (!AllowToVectorize || BestVF == 0)
7052 registerNonVectorizableLoads(Slice);
7053 }
7054 // Mark masked gathers candidates as vectorized, if any.
7055 for (unsigned Cnt : MaskedGatherVectorized) {
7056 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
7057 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
7058 ArrayRef<Value *> Values(
7059 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7060 Results.emplace_back(Values, LoadsState::ScatterVectorize);
7061 VectorizedLoads.insert(Slice.begin(), Slice.end());
7062 // If we vectorized initial block, no need to try to vectorize it again.
7063 if (Cnt == StartIdx)
7064 StartIdx += NumElts;
7065 }
7066 }
7067 for (LoadInst *LI : Loads) {
7068 if (!VectorizedLoads.contains(LI))
7069 NonVectorized.push_back(LI);
7070 }
7071 return Results;
7072 };
7073 auto ProcessGatheredLoads =
7074 [&, &TTI = *TTI](
7075 ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
7076 bool Final = false) {
7077 SmallVector<LoadInst *> NonVectorized;
7078 for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7079 if (LoadsDists.size() <= 1) {
7080 NonVectorized.push_back(LoadsDists.back().first);
7081 continue;
7082 }
7083 SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
7084 SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
7085 transform(LoadsDists, OriginalLoads.begin(),
7086 [](const std::pair<LoadInst *, int> &L) -> LoadInst * {
7087 return L.first;
7088 });
7089 stable_sort(LocalLoadsDists, LoadSorter);
7090 SmallVector<LoadInst *> Loads;
7091 unsigned MaxConsecutiveDistance = 0;
7092 unsigned CurrentConsecutiveDist = 1;
7093 int LastDist = LocalLoadsDists.front().second;
7094 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7095 for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7096 if (getTreeEntry(L.first))
7097 continue;
7098 assert(LastDist >= L.second &&
7099 "Expected first distance always not less than second");
7100 if (static_cast<unsigned>(LastDist - L.second) ==
7101 CurrentConsecutiveDist) {
7102 ++CurrentConsecutiveDist;
7103 MaxConsecutiveDistance =
7104 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7105 Loads.push_back(L.first);
7106 continue;
7107 }
7108 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7109 !Loads.empty())
7110 Loads.pop_back();
7111 CurrentConsecutiveDist = 1;
7112 LastDist = L.second;
7113 Loads.push_back(L.first);
7114 }
7115 if (Loads.size() <= 1)
7116 continue;
7117 if (AllowMaskedGather)
7118 MaxConsecutiveDistance = Loads.size();
7119 else if (MaxConsecutiveDistance < 2)
7120 continue;
7121 BoUpSLP::ValueSet VectorizedLoads;
7122 SmallVector<LoadInst *> SortedNonVectorized;
7123 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
7124 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7125 Final, MaxConsecutiveDistance);
7126 if (!Results.empty() && !SortedNonVectorized.empty() &&
7127 OriginalLoads.size() == Loads.size() &&
7128 MaxConsecutiveDistance == Loads.size() &&
7129 all_of(Results,
7130 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
7131 return P.second == LoadsState::ScatterVectorize;
7132 })) {
7133 VectorizedLoads.clear();
7134 SmallVector<LoadInst *> UnsortedNonVectorized;
7135 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
7136 UnsortedResults =
7137 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7138 UnsortedNonVectorized, Final,
7139 OriginalLoads.size());
7140 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
7141 SortedNonVectorized.swap(UnsortedNonVectorized);
7142 Results.swap(UnsortedResults);
7143 }
7144 }
7145 for (auto [Slice, _] : Results) {
7146 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
7147 << Slice.size() << ")\n");
7148 if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
7149 for (Value *L : Slice)
7150 if (!getTreeEntry(L))
7151 SortedNonVectorized.push_back(cast<LoadInst>(L));
7152 continue;
7153 }
7154
7155 // Select maximum VF as a maximum of user gathered nodes and
7156 // distance between scalar loads in these nodes.
7157 unsigned MaxVF = Slice.size();
7158 unsigned UserMaxVF = 0;
7159 unsigned InterleaveFactor = 0;
7160 if (MaxVF == 2) {
7161 UserMaxVF = MaxVF;
7162 } else {
7163 // Found distance between segments of the interleaved loads.
7164 std::optional<unsigned> InterleavedLoadsDistance = 0;
7165 unsigned Order = 0;
7166 std::optional<unsigned> CommonVF = 0;
7167 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
7168 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
7169 for (auto [Idx, V] : enumerate(Slice)) {
7170 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
7171 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
7172 unsigned Pos =
7173 EntryToPosition.try_emplace(E, Idx).first->second;
7174 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
7175 if (CommonVF) {
7176 if (*CommonVF == 0) {
7177 CommonVF = E->Scalars.size();
7178 continue;
7179 }
7180 if (*CommonVF != E->Scalars.size())
7181 CommonVF.reset();
7182 }
7183 // Check if the load is the part of the interleaved load.
7184 if (Pos != Idx && InterleavedLoadsDistance) {
7185 if (!DeinterleavedNodes.contains(E) &&
7186 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
7187 if (isa<Constant>(V))
7188 return false;
7189 if (getTreeEntry(V))
7190 return true;
7191 const auto &Nodes = ValueToGatherNodes.at(V);
7192 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7193 !is_contained(Slice, V);
7194 })) {
7195 InterleavedLoadsDistance.reset();
7196 continue;
7197 }
7198 DeinterleavedNodes.insert(E);
7199 if (*InterleavedLoadsDistance == 0) {
7200 InterleavedLoadsDistance = Idx - Pos;
7201 continue;
7202 }
7203 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7204 (Idx - Pos) / *InterleavedLoadsDistance < Order)
7205 InterleavedLoadsDistance.reset();
7206 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7207 }
7208 }
7209 }
7210 DeinterleavedNodes.clear();
7211 // Check if the large load represents interleaved load operation.
7212 if (InterleavedLoadsDistance.value_or(0) > 1 &&
7213 CommonVF.value_or(0) != 0) {
7214 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
7215 unsigned VF = *CommonVF;
7216 OrdersType Order;
7217 SmallVector<Value *> PointerOps;
7218 // Segmented load detected - vectorize at maximum vector factor.
7219 if (InterleaveFactor <= Slice.size() &&
7220 TTI.isLegalInterleavedAccessType(
7221 getWidenedType(Slice.front()->getType(), VF),
7222 InterleaveFactor,
7223 cast<LoadInst>(Slice.front())->getAlign(),
7224 cast<LoadInst>(Slice.front())
7225 ->getPointerAddressSpace()) &&
7226 canVectorizeLoads(Slice, Slice.front(), Order,
7227 PointerOps) == LoadsState::Vectorize) {
7228 UserMaxVF = InterleaveFactor * VF;
7229 } else {
7230 InterleaveFactor = 0;
7231 }
7232 }
7233 // Cannot represent the loads as consecutive vectorizable nodes -
7234 // just exit.
7235 unsigned ConsecutiveNodesSize = 0;
7236 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
7237 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7238 [&, Slice = Slice](const auto &P) {
7239 const auto *It = find_if(Slice, [&](Value *V) {
7240 return std::get<1>(P).contains(V);
7241 });
7242 if (It == Slice.end())
7243 return false;
7244 ArrayRef<Value *> VL =
7245 VectorizableTree[std::get<0>(P)]->Scalars;
7246 ConsecutiveNodesSize += VL.size();
7247 unsigned Start = std::distance(Slice.begin(), It);
7248 unsigned Sz = Slice.size() - Start;
7249 return Sz < VL.size() ||
7250 Slice.slice(std::distance(Slice.begin(), It),
7251 VL.size()) != VL;
7252 }))
7253 continue;
7254 // Try to build long masked gather loads.
7255 UserMaxVF = bit_ceil(UserMaxVF);
7256 if (InterleaveFactor == 0 &&
7257 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7258 [&, Slice = Slice](unsigned Idx) {
7259 OrdersType Order;
7260 SmallVector<Value *> PointerOps;
7261 return canVectorizeLoads(
7262 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7263 Slice[Idx * UserMaxVF], Order,
7264 PointerOps) ==
7265 LoadsState::ScatterVectorize;
7266 }))
7267 UserMaxVF = MaxVF;
7268 if (Slice.size() != ConsecutiveNodesSize)
7269 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7270 }
7271 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7272 bool IsVectorized = true;
7273 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
7274 ArrayRef<Value *> SubSlice =
7275 Slice.slice(I, std::min(VF, E - I));
7276 if (getTreeEntry(SubSlice.front()))
7277 continue;
7278 // Check if the subslice belongs to a to-be-vectorized entry that is
7279 // not equal to this entry.
7280 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7281 [&](const auto &P) {
7282 return !SubSlice.equals(
7283 VectorizableTree[std::get<0>(P)]
7284 ->Scalars) &&
7285 set_is_subset(SubSlice, std::get<1>(P));
7286 }))
7287 continue;
7288 unsigned Sz = VectorizableTree.size();
7289 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7290 if (Sz == VectorizableTree.size()) {
7291 IsVectorized = false;
7292 // Try non-interleaved vectorization with smaller vector
7293 // factor.
7294 if (InterleaveFactor > 0) {
7295 VF = 2 * (MaxVF / InterleaveFactor);
7296 InterleaveFactor = 0;
7297 }
7298 continue;
7299 }
7300 }
7301 if (IsVectorized)
7302 break;
7303 }
7304 }
7305 NonVectorized.append(SortedNonVectorized);
7306 }
7307 return NonVectorized;
7308 };
7309 for (const auto &GLs : GatheredLoads) {
7310 const auto &Ref = GLs.second;
7311 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
7312 if (!Ref.empty() && !NonVectorized.empty() &&
7313 std::accumulate(
7314 Ref.begin(), Ref.end(), 0u,
7315 [](unsigned S,
7316 ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned {
7317 return S + LoadsDists.size();
7318 }) != NonVectorized.size() &&
7319 IsMaskedGatherSupported(NonVectorized)) {
7320 SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
7321 for (LoadInst *LI : NonVectorized) {
7322 // Reinsert non-vectorized loads to other list of loads with the same
7323 // base pointers.
7324 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
7325 FinalGatheredLoads,
7326 /*AddNew=*/false);
7327 }
7328 // Final attempt to vectorize non-vectorized loads.
7329 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
7330 }
7331 }
7332 // Try to vectorize postponed load entries, previously marked as gathered.
7333 for (unsigned Idx : LoadEntriesToVectorize) {
7334 const TreeEntry &E = *VectorizableTree[Idx];
7335 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
7336 // Avoid reordering, if possible.
7337 if (!E.ReorderIndices.empty()) {
7338 // Build a mask out of the reorder indices and reorder scalars per this
7339 // mask.
7340 SmallVector<int> ReorderMask;
7341 inversePermutation(E.ReorderIndices, ReorderMask);
7342 reorderScalars(GatheredScalars, ReorderMask);
7343 }
7344 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7345 }
7346 // If no new entries were created, there are no gathered-load entries left to
7347 // handle.
7348 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7349 VectorizableTree.size())
7350 GatheredLoadsEntriesFirst.reset();
7351}
7352
7353/// \return true if the specified list of values has only one instruction that
7354/// requires scheduling, false otherwise.
7355#ifndef NDEBUG
7356 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
7357 Value *NeedsScheduling = nullptr;
7358 for (Value *V : VL) {
7359 if (doesNotNeedToBeScheduled(V))
7360 continue;
7361 if (!NeedsScheduling) {
7362 NeedsScheduling = V;
7363 continue;
7364 }
7365 return false;
7366 }
7367 return NeedsScheduling;
7368}
7369#endif
7370
7371/// Generates key/subkey pair for the given value to provide effective sorting
7372 /// of the values and better detection of vectorizable value sequences. The
7373 /// keys can be used for sorting the values themselves, and the subkeys for
7374 /// sorting within value subgroups.
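/// For example (illustrative): two simple loads of the same type hash to the
/// same key, while their subkeys come from \p LoadsSubkeyGenerator, which
/// lets callers group loads e.g. by pointer distance; an integer division
/// with a non-constant divisor gets a per-instruction subkey and is never
/// grouped with other divisions.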
7375static std::pair<size_t, size_t> generateKeySubkey(
7376 Value *V, const TargetLibraryInfo *TLI,
7377 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
7378 bool AllowAlternate) {
7379 hash_code Key = hash_value(V->getValueID() + 2);
7380 hash_code SubKey = hash_value(0);
7381 // Sort the loads by the distance between the pointers.
7382 if (auto *LI = dyn_cast<LoadInst>(V)) {
7383 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
7384 if (LI->isSimple())
7385 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
7386 else
7387 Key = SubKey = hash_value(LI);
7388 } else if (isVectorLikeInstWithConstOps(V)) {
7389 // Sort extracts by the vector operands.
7390 if (isa<ExtractElementInst, UndefValue>(V))
7391 Key = hash_value(Value::UndefValueVal + 1);
7392 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
7393 if (!isUndefVector(EI->getVectorOperand()).all() &&
7394 !isa<UndefValue>(EI->getIndexOperand()))
7395 SubKey = hash_value(EI->getVectorOperand());
7396 }
7397 } else if (auto *I = dyn_cast<Instruction>(V)) {
7398 // Sort other instructions just by the opcodes except for CMPInst.
7399 // For CMP also sort by the predicate kind.
7400 if ((isa<BinaryOperator, CastInst>(I)) &&
7401 isValidForAlternation(I->getOpcode())) {
7402 if (AllowAlternate)
7403 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
7404 else
7405 Key = hash_combine(hash_value(I->getOpcode()), Key);
7406 SubKey = hash_combine(
7407 hash_value(I->getOpcode()), hash_value(I->getType()),
7408 hash_value(isa<BinaryOperator>(I)
7409 ? I->getType()
7410 : cast<CastInst>(I)->getOperand(0)->getType()));
7411 // For casts, look through the only operand to improve compile time.
7412 if (isa<CastInst>(I)) {
7413 std::pair<size_t, size_t> OpVals =
7414 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
7415 /*AllowAlternate=*/true);
7416 Key = hash_combine(OpVals.first, Key);
7417 SubKey = hash_combine(OpVals.first, SubKey);
7418 }
7419 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
7420 CmpInst::Predicate Pred = CI->getPredicate();
7421 if (CI->isCommutative())
7422 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
7423 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
7424 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
7425 hash_value(SwapPred),
7426 hash_value(CI->getOperand(0)->getType()));
7427 } else if (auto *Call = dyn_cast<CallInst>(I)) {
7428 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
7429 if (isTriviallyVectorizable(ID)) {
7430 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
7431 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
7432 SubKey = hash_combine(hash_value(I->getOpcode()),
7433 hash_value(Call->getCalledFunction()));
7434 } else {
7435 Key = hash_combine(hash_value(Call), Key);
7436 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
7437 }
7438 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
7439 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
7440 hash_value(Op.Tag), SubKey);
7441 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
7442 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7443 SubKey = hash_value(Gep->getPointerOperand());
7444 else
7445 SubKey = hash_value(Gep);
7446 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
7447 !isa<ConstantInt>(I->getOperand(1))) {
7448 // Do not try to vectorize instructions with potentially high cost.
7449 SubKey = hash_value(I);
7450 } else {
7451 SubKey = hash_value(I->getOpcode());
7452 }
7453 Key = hash_combine(hash_value(I->getParent()), Key);
7454 }
7455 return std::make_pair(Key, SubKey);
7456}
7457
7458/// Checks if the specified instruction \p I is an alternate operation for
7459/// the given \p MainOp and \p AltOp instructions.
7460static bool isAlternateInstruction(const Instruction *I,
7461 const Instruction *MainOp,
7462 const Instruction *AltOp,
7463 const TargetLibraryInfo &TLI);
7464
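/// Estimates whether vectorizing \p VL as an alternate-opcode node (main and
/// alternate instructions followed by a blending shuffle) is likely cheaper
/// than building the values from scalars: the pattern may be legal on the
/// target directly, otherwise the operands are inspected for constants,
/// splats and reused values to compare the expected number of vector
/// instructions against the expected number of buildvector instructions.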
7465bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
7466 ArrayRef<Value *> VL) const {
7467 unsigned Opcode0 = S.getOpcode();
7468 unsigned Opcode1 = S.getAltOpcode();
7469 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
7470 // If this pattern is supported by the target then consider it profitable.
7471 if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
7472 Opcode0, Opcode1, OpcodeMask))
7473 return true;
7474 SmallVector<ValueList> Operands;
7475 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7476 Operands.emplace_back();
7477 // Prepare the operand vector.
7478 for (Value *V : VL) {
7479 if (isa<PoisonValue>(V)) {
7480 Operands.back().push_back(
7481 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
7482 continue;
7483 }
7484 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
7485 }
7486 }
7487 if (Operands.size() == 2) {
7488 // Try find best operands candidates.
7489 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
7490 SmallVector<std::pair<Value *, Value *>> Candidates(3);
7491 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
7492 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
7493 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
7494 std::optional<int> Res = findBestRootPair(Candidates);
7495 switch (Res.value_or(0)) {
7496 case 0:
7497 break;
7498 case 1:
7499 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
7500 break;
7501 case 2:
7502 std::swap(Operands[0][I], Operands[1][I]);
7503 break;
7504 default:
7505 llvm_unreachable("Unexpected index.");
7506 }
7507 }
7508 }
7509 DenseSet<unsigned> UniqueOpcodes;
7510 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
7511 unsigned NonInstCnt = 0;
7512 // Estimate number of instructions, required for the vectorized node and for
7513 // the buildvector node.
7514 unsigned UndefCnt = 0;
7515 // Count the number of extra shuffles, required for vector nodes.
7516 unsigned ExtraShuffleInsts = 0;
7517 // Check that operands do not contain same values and create either perfect
7518 // diamond match or shuffled match.
7519 if (Operands.size() == 2) {
7520 // Do not count same operands twice.
7521 if (Operands.front() == Operands.back()) {
7522 Operands.erase(Operands.begin());
7523 } else if (!allConstant(Operands.front()) &&
7524 all_of(Operands.front(), [&](Value *V) {
7525 return is_contained(Operands.back(), V);
7526 })) {
7527 Operands.erase(Operands.begin());
7528 ++ExtraShuffleInsts;
7529 }
7530 }
7531 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
7532 // Vectorize node, if:
7533 // 1. At least a single operand is constant or splat.
7534 // 2. Operands have many loop invariants (the instructions themselves are not
7535 // loop invariant).
7536 // 3. At least a single unique operand is supposed to be vectorized.
7537 return none_of(Operands,
7538 [&](ArrayRef<Value *> Op) {
7539 if (allConstant(Op) ||
7540 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
7541 getSameOpcode(Op, *TLI)))
7542 return false;
7543 SmallDenseMap<Value *, unsigned, 16> Uniques;
7544 for (Value *V : Op) {
7545 if (isa<Constant, ExtractElementInst>(V) ||
7546 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
7547 if (isa<UndefValue>(V))
7548 ++UndefCnt;
7549 continue;
7550 }
7551 auto Res = Uniques.try_emplace(V, 0);
7552 // Found first duplicate - need to add shuffle.
7553 if (!Res.second && Res.first->second == 1)
7554 ++ExtraShuffleInsts;
7555 ++Res.first->getSecond();
7556 if (auto *I = dyn_cast<Instruction>(V))
7557 UniqueOpcodes.insert(I->getOpcode());
7558 else if (Res.second)
7559 ++NonInstCnt;
7560 }
7561 return none_of(Uniques, [&](const auto &P) {
7562 return P.first->hasNUsesOrMore(P.second + 1) &&
7563 none_of(P.first->users(), [&](User *U) {
7564 return getTreeEntry(U) || Uniques.contains(U);
7565 });
7566 });
7567 }) ||
7568      // Do not vectorize the node if the estimated number of vector
7569      // instructions is greater than the estimated number of buildvector
7570      // instructions. The number of vector operands is the number of vector
7571      // instructions plus the vector instructions for the operands (buildvectors);
7572      // the number of buildvector instructions is number_of_operands * number_of_scalars.
7573 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7574 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
7575 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7576}
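// For illustration, the kind of alternate-opcode bundle the profitability
// check above is meant to accept (a sketch, assuming 2-wide vectorization is
// otherwise legal on the target):
// \code
//   %a = add i32 %x0, %y0
//   %b = sub i32 %x1, %y1
// \endcode
// Such a pair can be emitted as one vector add, one vector sub and a blending
// shufflevector (the "main + alt + shuffle" accounted for above), which only
// pays off if the operands do not themselves degenerate into buildvectors.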
7577
7578BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7579 const InstructionsState &S, ArrayRef<Value *> VL,
7580 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7581 SmallVectorImpl<Value *> &PointerOps) {
7582 assert(S.getMainOp() &&
7583 "Expected instructions with same/alternate opcodes only.");
7584
7585 unsigned ShuffleOrOp =
7586 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
7587 Instruction *VL0 = S.getMainOp();
7588 switch (ShuffleOrOp) {
7589 case Instruction::PHI: {
7590 // Too many operands - gather, most probably won't be vectorized.
7591 if (VL0->getNumOperands() > MaxPHINumOperands)
7592 return TreeEntry::NeedToGather;
7593 // Check for terminator values (e.g. invoke).
7594 for (Value *V : VL) {
7595 auto *PHI = dyn_cast<PHINode>(V);
7596 if (!PHI)
7597 continue;
7598 for (Value *Incoming : PHI->incoming_values()) {
7599 Instruction *Term = dyn_cast<Instruction>(Incoming);
7600 if (Term && Term->isTerminator()) {
7601            LLVM_DEBUG(dbgs()
7602 << "SLP: Need to swizzle PHINodes (terminator use).\n");
7603 return TreeEntry::NeedToGather;
7604 }
7605 }
7606 }
7607
7608 return TreeEntry::Vectorize;
7609 }
7610 case Instruction::ExtractValue:
7611 case Instruction::ExtractElement: {
7612 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
7613 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
7614 if (!has_single_bit(VL.size()))
7615 return TreeEntry::NeedToGather;
7616 if (Reuse || !CurrentOrder.empty())
7617 return TreeEntry::Vectorize;
7618 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
7619 return TreeEntry::NeedToGather;
7620 }
7621 case Instruction::InsertElement: {
7622 // Check that we have a buildvector and not a shuffle of 2 or more
7623 // different vectors.
7624 ValueSet SourceVectors;
7625 for (Value *V : VL) {
7626 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
7627 assert(getElementIndex(V) != std::nullopt &&
7628 "Non-constant or undef index?");
7629 }
7630
7631 if (count_if(VL, [&SourceVectors](Value *V) {
7632 return !SourceVectors.contains(V);
7633 }) >= 2) {
7634 // Found 2nd source vector - cancel.
7635 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7636 "different source vectors.\n");
7637 return TreeEntry::NeedToGather;
7638 }
7639
7640 if (any_of(VL, [&SourceVectors](Value *V) {
7641 // The last InsertElement can have multiple uses.
7642 return SourceVectors.contains(V) && !V->hasOneUse();
7643 })) {
7644 assert(SLPReVec && "Only supported by REVEC.");
7645 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7646 "multiple uses.\n");
7647 return TreeEntry::NeedToGather;
7648 }
7649
7650 return TreeEntry::Vectorize;
7651 }
7652 case Instruction::Load: {
7653 // Check that a vectorized load would load the same memory as a scalar
7654 // load. For example, we don't want to vectorize loads that are smaller
7655 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7656 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7657 // from such a struct, we read/write packed bits disagreeing with the
7658 // unvectorized version.
7659 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
7660    case LoadsState::Vectorize:
7661 return TreeEntry::Vectorize;
7662    case LoadsState::ScatterVectorize:
7663 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7664 // Delay slow vectorized nodes for better vectorization attempts.
7665 LoadEntriesToVectorize.insert(VectorizableTree.size());
7666 return TreeEntry::NeedToGather;
7667 }
7668 return TreeEntry::ScatterVectorize;
7669    case LoadsState::StridedVectorize:
7670 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7671 // Delay slow vectorized nodes for better vectorization attempts.
7672 LoadEntriesToVectorize.insert(VectorizableTree.size());
7673 return TreeEntry::NeedToGather;
7674 }
7675 return TreeEntry::StridedVectorize;
7676 case LoadsState::Gather:
7677#ifndef NDEBUG
7678 Type *ScalarTy = VL0->getType();
7679 if (DL->getTypeSizeInBits(ScalarTy) !=
7680 DL->getTypeAllocSizeInBits(ScalarTy))
7681 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
7682 else if (any_of(VL, [](Value *V) {
7683 auto *LI = dyn_cast<LoadInst>(V);
7684 return !LI || !LI->isSimple();
7685 }))
7686 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
7687 else
7688 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
7689#endif // NDEBUG
7690      registerNonVectorizableLoads(VL);
7691 return TreeEntry::NeedToGather;
7692 }
7693 llvm_unreachable("Unexpected state of loads");
7694 }
7695 case Instruction::ZExt:
7696 case Instruction::SExt:
7697 case Instruction::FPToUI:
7698 case Instruction::FPToSI:
7699 case Instruction::FPExt:
7700 case Instruction::PtrToInt:
7701 case Instruction::IntToPtr:
7702 case Instruction::SIToFP:
7703 case Instruction::UIToFP:
7704 case Instruction::Trunc:
7705 case Instruction::FPTrunc:
7706 case Instruction::BitCast: {
7707 Type *SrcTy = VL0->getOperand(0)->getType();
7708 for (Value *V : VL) {
7709 if (isa<PoisonValue>(V))
7710 continue;
7711 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7712 if (Ty != SrcTy || !isValidElementType(Ty)) {
7713 LLVM_DEBUG(
7714 dbgs() << "SLP: Gathering casts with different src types.\n");
7715 return TreeEntry::NeedToGather;
7716 }
7717 }
7718 return TreeEntry::Vectorize;
7719 }
7720 case Instruction::ICmp:
7721 case Instruction::FCmp: {
7722 // Check that all of the compares have the same predicate.
7723 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7724    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
7725 Type *ComparedTy = VL0->getOperand(0)->getType();
7726 for (Value *V : VL) {
7727 if (isa<PoisonValue>(V))
7728 continue;
7729 auto *Cmp = cast<CmpInst>(V);
7730 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
7731 Cmp->getOperand(0)->getType() != ComparedTy) {
7732 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
7733 return TreeEntry::NeedToGather;
7734 }
7735 }
7736 return TreeEntry::Vectorize;
7737 }
7738 case Instruction::Select:
7739 case Instruction::FNeg:
7740 case Instruction::Add:
7741 case Instruction::FAdd:
7742 case Instruction::Sub:
7743 case Instruction::FSub:
7744 case Instruction::Mul:
7745 case Instruction::FMul:
7746 case Instruction::UDiv:
7747 case Instruction::SDiv:
7748 case Instruction::FDiv:
7749 case Instruction::URem:
7750 case Instruction::SRem:
7751 case Instruction::FRem:
7752 case Instruction::Shl:
7753 case Instruction::LShr:
7754 case Instruction::AShr:
7755 case Instruction::And:
7756 case Instruction::Or:
7757 case Instruction::Xor:
7758 case Instruction::Freeze:
7759 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7760        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7761 auto *I = dyn_cast<Instruction>(V);
7762 return I && I->isBinaryOp() && !I->isFast();
7763 }))
7764 return TreeEntry::NeedToGather;
7765 return TreeEntry::Vectorize;
7766 case Instruction::GetElementPtr: {
7767 // We don't combine GEPs with complicated (nested) indexing.
7768 for (Value *V : VL) {
7769 auto *I = dyn_cast<GetElementPtrInst>(V);
7770 if (!I)
7771 continue;
7772 if (I->getNumOperands() != 2) {
7773 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
7774 return TreeEntry::NeedToGather;
7775 }
7776 }
7777
7778 // We can't combine several GEPs into one vector if they operate on
7779 // different types.
7780 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7781 for (Value *V : VL) {
7782 auto *GEP = dyn_cast<GEPOperator>(V);
7783 if (!GEP)
7784 continue;
7785 Type *CurTy = GEP->getSourceElementType();
7786 if (Ty0 != CurTy) {
7787 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
7788 return TreeEntry::NeedToGather;
7789 }
7790 }
7791
7792 // We don't combine GEPs with non-constant indexes.
7793 Type *Ty1 = VL0->getOperand(1)->getType();
7794 for (Value *V : VL) {
7795 auto *I = dyn_cast<GetElementPtrInst>(V);
7796 if (!I)
7797 continue;
7798 auto *Op = I->getOperand(1);
7799 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7800 (Op->getType() != Ty1 &&
7801 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7802 Op->getType()->getScalarSizeInBits() >
7803 DL->getIndexSizeInBits(
7804 V->getType()->getPointerAddressSpace())))) {
7805 LLVM_DEBUG(
7806 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
7807 return TreeEntry::NeedToGather;
7808 }
7809 }
7810
7811 return TreeEntry::Vectorize;
7812 }
7813 case Instruction::Store: {
7814 // Check if the stores are consecutive or if we need to swizzle them.
7815 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7816 // Avoid types that are padded when being allocated as scalars, while
7817 // being packed together in a vector (such as i1).
7818 if (DL->getTypeSizeInBits(ScalarTy) !=
7819 DL->getTypeAllocSizeInBits(ScalarTy)) {
7820 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
7821 return TreeEntry::NeedToGather;
7822 }
7823 // Make sure all stores in the bundle are simple - we can't vectorize
7824 // atomic or volatile stores.
7825 for (Value *V : VL) {
7826 auto *SI = cast<StoreInst>(V);
7827 if (!SI->isSimple()) {
7828 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
7829 return TreeEntry::NeedToGather;
7830 }
7831 PointerOps.push_back(SI->getPointerOperand());
7832 }
7833
7834 // Check the order of pointer operands.
7835 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
7836 Value *Ptr0;
7837 Value *PtrN;
7838 if (CurrentOrder.empty()) {
7839 Ptr0 = PointerOps.front();
7840 PtrN = PointerOps.back();
7841 } else {
7842 Ptr0 = PointerOps[CurrentOrder.front()];
7843 PtrN = PointerOps[CurrentOrder.back()];
7844 }
7845 std::optional<int> Dist =
7846 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7847 // Check that the sorted pointer operands are consecutive.
7848 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
7849 return TreeEntry::Vectorize;
7850 }
7851
7852 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
7853 return TreeEntry::NeedToGather;
7854 }
7855 case Instruction::Call: {
7856 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7857        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7858 auto *I = dyn_cast<Instruction>(V);
7859 return I && !I->isFast();
7860 }))
7861 return TreeEntry::NeedToGather;
7862 // Check if the calls are all to the same vectorizable intrinsic or
7863 // library function.
7864 CallInst *CI = cast<CallInst>(VL0);
7865    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7866
7867 VFShape Shape = VFShape::get(
7868 CI->getFunctionType(),
7869 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
7870 false /*HasGlobalPred*/);
7871 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7872
7873 if (!VecFunc && !isTriviallyVectorizable(ID)) {
7874 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
7875 return TreeEntry::NeedToGather;
7876 }
7877 Function *F = CI->getCalledFunction();
7878 unsigned NumArgs = CI->arg_size();
7879 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
7880 for (unsigned J = 0; J != NumArgs; ++J)
7881      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
7882 ScalarArgs[J] = CI->getArgOperand(J);
7883 for (Value *V : VL) {
7884 CallInst *CI2 = dyn_cast<CallInst>(V);
7885 if (!CI2 || CI2->getCalledFunction() != F ||
7886 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
7887 (VecFunc &&
7888 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
7889          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
7890 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
7891 << "\n");
7892 return TreeEntry::NeedToGather;
7893 }
7894      // Some intrinsics have scalar arguments, and these must be the same
7895      // across the bundle for the calls to be vectorized.
7896 for (unsigned J = 0; J != NumArgs; ++J) {
7897        if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
7898 Value *A1J = CI2->getArgOperand(J);
7899 if (ScalarArgs[J] != A1J) {
7900            LLVM_DEBUG(dbgs()
7901 << "SLP: mismatched arguments in call:" << *CI
7902 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
7903 return TreeEntry::NeedToGather;
7904 }
7905 }
7906 }
7907 // Verify that the bundle operands are identical between the two calls.
7908 if (CI->hasOperandBundles() &&
7909 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
7910 CI->op_begin() + CI->getBundleOperandsEndIndex(),
7911 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
7912 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
7913 << "!=" << *V << '\n');
7914 return TreeEntry::NeedToGather;
7915 }
7916 }
7917
7918 return TreeEntry::Vectorize;
7919 }
7920 case Instruction::ShuffleVector: {
7921 if (!S.isAltShuffle()) {
7922 // REVEC can support non alternate shuffle.
7923        if (SLPReVec && getShufflevectorNumGroups(VL))
7924 return TreeEntry::Vectorize;
7925 // If this is not an alternate sequence of opcode like add-sub
7926 // then do not vectorize this instruction.
7927 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
7928 return TreeEntry::NeedToGather;
7929 }
7930 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
7931 LLVM_DEBUG(
7932 dbgs()
7933 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
7934 "the whole alt sequence is not profitable.\n");
7935 return TreeEntry::NeedToGather;
7936 }
7937
7938 return TreeEntry::Vectorize;
7939 }
7940 default:
7941 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
7942 return TreeEntry::NeedToGather;
7943 }
7944}
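// As an illustrative example of the classification above: a bundle of simple
// stores whose sorted pointers are consecutive, e.g.
// \code
//   store float %a, ptr %p
//   store float %b, ptr %q   ; where %q is %p + 4 bytes
// \endcode
// is reported as TreeEntry::Vectorize, while stores to unrelated pointers
// fall back to TreeEntry::NeedToGather.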
7945
7946namespace {
7947/// Correctly handles the operands of the phi nodes based on the \p Main
7948/// PHINode's order of incoming basic blocks/values.
7949class PHIHandler {
7950 DominatorTree &DT;
7951 PHINode *Main = nullptr;
7952  SmallVector<Value *> Phis;
7953  SmallVector<SmallVector<Value *>> Operands;
7954
7955public:
7956 PHIHandler() = delete;
7957 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
7958 : DT(DT), Main(Main), Phis(Phis),
7959 Operands(Main->getNumIncomingValues(),
7960 SmallVector<Value *>(Phis.size(), nullptr)) {}
7961 void buildOperands() {
7962 constexpr unsigned FastLimit = 4;
7963 if (Main->getNumIncomingValues() <= FastLimit) {
7964 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7965 BasicBlock *InBB = Main->getIncomingBlock(I);
7966 if (!DT.isReachableFromEntry(InBB)) {
7967 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7968 continue;
7969 }
7970 // Prepare the operand vector.
7971 for (auto [Idx, V] : enumerate(Phis)) {
7972 auto *P = dyn_cast<PHINode>(V);
7973 if (!P) {
7974 assert(isa<PoisonValue>(V) &&
7975 "Expected isa instruction or poison value.");
7976 Operands[I][Idx] = V;
7977 continue;
7978 }
7979 if (P->getIncomingBlock(I) == InBB)
7980 Operands[I][Idx] = P->getIncomingValue(I);
7981 else
7982 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
7983 }
7984 }
7985 return;
7986 }
7987    SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
7988 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7989 BasicBlock *InBB = Main->getIncomingBlock(I);
7990 if (!DT.isReachableFromEntry(InBB)) {
7991 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7992 continue;
7993 }
7994 Blocks.try_emplace(InBB).first->second.push_back(I);
7995 }
7996 for (auto [Idx, V] : enumerate(Phis)) {
7997 if (isa<PoisonValue>(V)) {
7998 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
7999 Operands[I][Idx] = V;
8000 continue;
8001 }
8002 auto *P = cast<PHINode>(V);
8003 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
8004 BasicBlock *InBB = P->getIncomingBlock(I);
8005 if (InBB == Main->getIncomingBlock(I)) {
8006 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
8007 continue;
8008 Operands[I][Idx] = P->getIncomingValue(I);
8009 continue;
8010 }
8011 auto It = Blocks.find(InBB);
8012 if (It == Blocks.end())
8013 continue;
8014 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
8015 }
8016 }
8017 for (const auto &P : Blocks) {
8018 if (P.getSecond().size() <= 1)
8019 continue;
8020 unsigned BasicI = P.getSecond().front();
8021 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
8022        assert(all_of(enumerate(Operands[I]),
8023 [&](const auto &Data) {
8024 return !Data.value() ||
8025 Data.value() == Operands[BasicI][Data.index()];
8026 }) &&
8027 "Expected empty operands list.");
8028 Operands[I] = Operands[BasicI];
8029 }
8030 }
8031 }
8032 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
8033};
8034} // namespace
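// A rough sketch of what PHIHandler::buildOperands() produces: for a bundle
// \code
//   %p0 = phi i32 [ %a, %bb0 ], [ %b, %bb1 ]
//   %p1 = phi i32 [ %c, %bb1 ], [ %d, %bb0 ]
// \endcode
// the incoming values are grouped by the blocks of the main PHI, so
// getOperands(0) yields {%a, %d} (the values coming from %bb0) and
// getOperands(1) yields {%b, %c}, regardless of the textual order of the
// incoming pairs in each PHI.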
8035
8036void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
8037 const EdgeInfo &UserTreeIdx,
8038 unsigned InterleaveFactor) {
8039 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
8040
8041 SmallVector<int> ReuseShuffleIndices;
8042 SmallVector<Value *> UniqueValues;
8043 SmallVector<Value *> NonUniqueValueVL;
8044 auto TryToFindDuplicates = [&](const InstructionsState &S,
8045 bool DoNotFail = false) {
8046 // Check that every instruction appears once in this bundle.
8047 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
8048 for (Value *V : VL) {
8049 if (isConstant(V)) {
8050 ReuseShuffleIndices.emplace_back(
8051 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
8052 UniqueValues.emplace_back(V);
8053 continue;
8054 }
8055 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
8056 ReuseShuffleIndices.emplace_back(Res.first->second);
8057 if (Res.second)
8058 UniqueValues.emplace_back(V);
8059 }
8060 size_t NumUniqueScalarValues = UniqueValues.size();
8061 bool IsFullVectors = hasFullVectorsOrPowerOf2(
8062 *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
8063 if (NumUniqueScalarValues == VL.size() &&
8064 (VectorizeNonPowerOf2 || IsFullVectors)) {
8065 ReuseShuffleIndices.clear();
8066 } else {
8067      // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
8068 if ((UserTreeIdx.UserTE &&
8069 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8070 !has_single_bit(VL.size())) {
8071 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
8072 "for nodes with padding.\n");
8073 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8074 return false;
8075 }
8076 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
8077 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8078 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
8079 return isa<UndefValue>(V) || !isConstant(V);
8080 }))) {
8081 if (DoNotFail && UniquePositions.size() > 1 &&
8082 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8083 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8084 // Find the number of elements, which forms full vectors.
8085 unsigned PWSz = getFullVectorNumberOfElements(
8086 *TTI, UniqueValues.front()->getType(), UniqueValues.size());
8087 if (PWSz == VL.size()) {
8088 ReuseShuffleIndices.clear();
8089 } else {
8090 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
8091 NonUniqueValueVL.append(
8092 PWSz - UniqueValues.size(),
8093 PoisonValue::get(UniqueValues.front()->getType()));
8094 VL = NonUniqueValueVL;
8095 }
8096 return true;
8097 }
8098 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8099 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8100 return false;
8101 }
8102 VL = UniqueValues;
8103 }
8104 return true;
8105 };
8106
8107 InstructionsState S = getSameOpcode(VL, *TLI);
8108
8109 // Don't go into catchswitch blocks, which can happen with PHIs.
8110 // Such blocks can only have PHIs and the catchswitch. There is no
8111 // place to insert a shuffle if we need to, so just avoid that issue.
8112 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8113 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
8114 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8115 return;
8116 }
8117
8118 // Check if this is a duplicate of another entry.
8119 if (S) {
8120 if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8121 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
8122 << ".\n");
8123 if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8124 auto It = MultiNodeScalars.find(S.getMainOp());
8125 if (It != MultiNodeScalars.end()) {
8126 auto *TEIt = find_if(It->getSecond(),
8127 [&](TreeEntry *ME) { return ME->isSame(VL); });
8128 if (TEIt != It->getSecond().end())
8129 E = *TEIt;
8130 else
8131 E = nullptr;
8132 } else {
8133 E = nullptr;
8134 }
8135 }
8136 if (!E) {
8137 if (!doesNotNeedToBeScheduled(S.getMainOp())) {
8138 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
8139 if (TryToFindDuplicates(S))
8140 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8141 ReuseShuffleIndices);
8142 return;
8143 }
8144      SmallPtrSet<const TreeEntry *, 4> Nodes;
8145 Nodes.insert(getTreeEntry(S.getMainOp()));
8146 for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
8147 Nodes.insert(E);
8148 SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
8149 if (any_of(Nodes, [&](const TreeEntry *E) {
8150 if (all_of(E->Scalars,
8151 [&](Value *V) { return Values.contains(V); }))
8152 return true;
8153 SmallPtrSet<Value *, 8> EValues(E->Scalars.begin(),
8154 E->Scalars.end());
8155 return (
8156 all_of(VL, [&](Value *V) { return EValues.contains(V); }));
8157 })) {
8158 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
8159 if (TryToFindDuplicates(S))
8160 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8161 ReuseShuffleIndices);
8162 return;
8163 }
8164 } else {
8165 // Record the reuse of the tree node. FIXME, currently this is only
8166 // used to properly draw the graph rather than for the actual
8167 // vectorization.
8168 E->UserTreeIndices.push_back(UserTreeIdx);
8169 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
8170 << ".\n");
8171 return;
8172 }
8173 }
8174 }
8175
8176 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
8177 // a load), in which case peek through to include it in the tree, without
8178 // ballooning over-budget.
8179 if (Depth >= RecursionMaxDepth &&
8180 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8181 (match(S.getMainOp(), m_Load(m_Value())) ||
8182 all_of(VL, [&S](const Value *I) {
8183 return match(I,
8184                       m_OneUse(m_ZExtOrSExtOrSelf(m_Load(m_Value())))) &&
8185 cast<Instruction>(I)->getOpcode() == S.getOpcode();
8186 })))) {
8187 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
8188 if (TryToFindDuplicates(S))
8189 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8190 ReuseShuffleIndices);
8191 return;
8192 }
8193
8194 // Don't handle scalable vectors
8195 if (S && S.getOpcode() == Instruction::ExtractElement &&
8196 isa<ScalableVectorType>(
8197 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8198 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
8199 if (TryToFindDuplicates(S))
8200 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8201 ReuseShuffleIndices);
8202 return;
8203 }
8204
8205 // Don't handle vectors.
8206 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
8207 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
8208 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8209 return;
8210 }
8211
8212 // If all of the operands are identical or constant we have a simple solution.
8213 // If we deal with insert/extract instructions, they all must have constant
8214 // indices, otherwise we should gather them, not try to vectorize.
8215 // If alternate op node with 2 elements with gathered operands - do not
8216 // vectorize.
8217 auto &&NotProfitableForVectorization = [&S, this,
8218                                          Depth](ArrayRef<Value *> VL) {
8219 if (!S || !S.isAltShuffle() || VL.size() > 2)
8220 return false;
8221 if (VectorizableTree.size() < MinTreeSize)
8222 return false;
8223 if (Depth >= RecursionMaxDepth - 1)
8224 return true;
8225 // Check if all operands are extracts, part of vector node or can build a
8226 // regular vectorize node.
8227 SmallVector<unsigned, 8> InstsCount;
8228 for (Value *V : VL) {
8229 auto *I = cast<Instruction>(V);
8230 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
8231 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8232 }));
8233 }
8234 bool IsCommutative =
8235 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
8236 if ((IsCommutative &&
8237 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
8238 (!IsCommutative &&
8239 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
8240 return true;
8241 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
8242    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
8243 auto *I1 = cast<Instruction>(VL.front());
8244 auto *I2 = cast<Instruction>(VL.back());
8245 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
8246 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8247 I2->getOperand(Op));
8248 if (static_cast<unsigned>(count_if(
8249 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8250              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8251 })) >= S.getMainOp()->getNumOperands() / 2)
8252 return false;
8253 if (S.getMainOp()->getNumOperands() > 2)
8254 return true;
8255 if (IsCommutative) {
8256 // Check permuted operands.
8257 Candidates.clear();
8258 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
8259 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8260 I2->getOperand((Op + 1) % E));
8261 if (any_of(
8262 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8263              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8264 }))
8265 return false;
8266 }
8267 return true;
8268 };
8269 SmallVector<unsigned> SortedIndices;
8270 BasicBlock *BB = nullptr;
8271 bool IsScatterVectorizeUserTE =
8272 UserTreeIdx.UserTE &&
8273 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8274 bool AreAllSameBlock = S && allSameBlock(VL);
8275 bool AreScatterAllGEPSameBlock =
8276 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8277 VL.size() > 2 &&
8278 all_of(VL,
8279 [&BB](Value *V) {
8280 auto *I = dyn_cast<GetElementPtrInst>(V);
8281 if (!I)
8282 return doesNotNeedToBeScheduled(V);
8283 if (!BB)
8284 BB = I->getParent();
8285 return BB == I->getParent() && I->getNumOperands() == 2;
8286 }) &&
8287 BB &&
8288 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8289 SortedIndices));
8290 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8291 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
8292 (S &&
8293 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8294 S.getMainOp()) &&
8295       !all_of(VL, isVectorLikeInstWithConstOps)) ||
8296 NotProfitableForVectorization(VL)) {
8297 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
8298 if (TryToFindDuplicates(S))
8299 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8300 ReuseShuffleIndices);
8301 return;
8302 }
8303
8304 // Don't vectorize ephemeral values.
8305 if (S && !EphValues.empty()) {
8306 for (Value *V : VL) {
8307 if (EphValues.count(V)) {
8308 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8309 << ") is ephemeral.\n");
8310 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8311 return;
8312 }
8313 }
8314 }
8315
8316 // We now know that this is a vector of instructions of the same type from
8317 // the same block.
8318
8319 // Check that none of the instructions in the bundle are already in the tree.
8320 for (Value *V : VL) {
8321 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8322        doesNotNeedToBeScheduled(V))
8323 continue;
8324 if (getTreeEntry(V)) {
8325 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8326 << ") is already in tree.\n");
8327 if (TryToFindDuplicates(S))
8328 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8329 ReuseShuffleIndices);
8330 return;
8331 }
8332 }
8333
8334 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
8335 if (UserIgnoreList && !UserIgnoreList->empty()) {
8336 for (Value *V : VL) {
8337 if (UserIgnoreList->contains(V)) {
8338 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
8339 if (TryToFindDuplicates(S))
8340 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8341 ReuseShuffleIndices);
8342 return;
8343 }
8344 }
8345 }
8346
8347  // Special processing for sorted pointers for ScatterVectorize node with
8348  // constant indices only.
8349 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8350 assert(VL.front()->getType()->isPointerTy() &&
8351 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8352 "Expected pointers only.");
8353 // Reset S to make it GetElementPtr kind of node.
8354 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
8355 assert(It != VL.end() && "Expected at least one GEP.");
8356 S = getSameOpcode(*It, *TLI);
8357 }
8358
8359 // Check that all of the users of the scalars that we want to vectorize are
8360 // schedulable.
8361 Instruction *VL0 = S.getMainOp();
8362 BB = VL0->getParent();
8363
8364 if (S &&
8365 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
8366 !DT->isReachableFromEntry(BB))) {
8367 // Don't go into unreachable blocks. They may contain instructions with
8368 // dependency cycles which confuse the final scheduling.
8369 // Do not vectorize EH and non-returning blocks, not profitable in most
8370 // cases.
8371 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
8372 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8373 return;
8374 }
8375
8376 // Check that every instruction appears once in this bundle.
8377 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
8378 return;
8379
8380 // Perform specific checks for each particular instruction kind.
8381 OrdersType CurrentOrder;
8382 SmallVector<Value *> PointerOps;
8383 TreeEntry::EntryState State = getScalarsVectorizationState(
8384 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8385 if (State == TreeEntry::NeedToGather) {
8386 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8387 ReuseShuffleIndices);
8388 return;
8389 }
8390
8391 auto &BSRef = BlocksSchedules[BB];
8392 if (!BSRef)
8393 BSRef = std::make_unique<BlockScheduling>(BB);
8394
8395 BlockScheduling &BS = *BSRef;
8396
8397 std::optional<ScheduleData *> Bundle =
8398 BS.tryScheduleBundle(UniqueValues, this, S);
8399#ifdef EXPENSIVE_CHECKS
8400 // Make sure we didn't break any internal invariants
8401 BS.verify();
8402#endif
8403 if (!Bundle) {
8404 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
8405 assert((!BS.getScheduleData(VL0) ||
8406 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8407 "tryScheduleBundle should cancelScheduling on failure");
8408 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8409 ReuseShuffleIndices);
8410 NonScheduledFirst.insert(VL.front());
8411 if (S.getOpcode() == Instruction::Load &&
8412 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8413      registerNonVectorizableLoads(VL);
8414 return;
8415 }
8416 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
8417
8418 unsigned ShuffleOrOp =
8419 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
8420 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
8421 // Postpone PHI nodes creation
8422 SmallVector<unsigned> PHIOps;
8423 for (unsigned I : seq<unsigned>(Operands.size())) {
8424      ArrayRef<Value *> Op = Operands[I];
8425 if (Op.empty())
8426 continue;
8427 InstructionsState S = getSameOpcode(Op, *TLI);
8428 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8429 buildTree_rec(Op, Depth + 1, {TE, I});
8430 else
8431 PHIOps.push_back(I);
8432 }
8433 for (unsigned I : PHIOps)
8434 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8435 };
8436 switch (ShuffleOrOp) {
8437 case Instruction::PHI: {
8438 auto *PH = cast<PHINode>(VL0);
8439
8440 TreeEntry *TE =
8441 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8442 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
8443 TE->dump());
8444
8445 // Keeps the reordered operands to avoid code duplication.
8446 PHIHandler Handler(*DT, PH, VL);
8447 Handler.buildOperands();
8448 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8449 TE->setOperand(I, Handler.getOperands(I));
8450 SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
8451 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8452 Operands[I] = Handler.getOperands(I);
8453 CreateOperandNodes(TE, Operands);
8454 return;
8455 }
8456 case Instruction::ExtractValue:
8457 case Instruction::ExtractElement: {
8458 if (CurrentOrder.empty()) {
8459 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
8460 } else {
8461 LLVM_DEBUG({
8462 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
8463 "with order";
8464 for (unsigned Idx : CurrentOrder)
8465 dbgs() << " " << Idx;
8466 dbgs() << "\n";
8467 });
8468 fixupOrderingIndices(CurrentOrder);
8469 }
8470 // Insert new order with initial value 0, if it does not exist,
8471 // otherwise return the iterator to the existing one.
8472 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8473 ReuseShuffleIndices, CurrentOrder);
8474 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
8475 "(ExtractValueInst/ExtractElementInst).\n";
8476 TE->dump());
8477 // This is a special case, as it does not gather, but at the same time
8478 // we are not extending buildTree_rec() towards the operands.
8479 TE->setOperand(*this);
8480 return;
8481 }
8482 case Instruction::InsertElement: {
8483 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
8484
8485 auto OrdCompare = [](const std::pair<int, int> &P1,
8486 const std::pair<int, int> &P2) {
8487 return P1.first > P2.first;
8488 };
8489      PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
8490 decltype(OrdCompare)>
8491 Indices(OrdCompare);
8492 for (int I = 0, E = VL.size(); I < E; ++I) {
8493 unsigned Idx = *getElementIndex(VL[I]);
8494 Indices.emplace(Idx, I);
8495 }
8496 OrdersType CurrentOrder(VL.size(), VL.size());
8497 bool IsIdentity = true;
8498 for (int I = 0, E = VL.size(); I < E; ++I) {
8499 CurrentOrder[Indices.top().second] = I;
8500 IsIdentity &= Indices.top().second == I;
8501 Indices.pop();
8502 }
8503 if (IsIdentity)
8504 CurrentOrder.clear();
8505 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8506 {}, CurrentOrder);
8507 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
8508 TE->dump());
8509
8510 TE->setOperand(*this);
8511 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
8512 return;
8513 }
8514 case Instruction::Load: {
8515 // Check that a vectorized load would load the same memory as a scalar
8516 // load. For example, we don't want to vectorize loads that are smaller
8517 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
8518 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
8519 // from such a struct, we read/write packed bits disagreeing with the
8520 // unvectorized version.
8521 TreeEntry *TE = nullptr;
8522 fixupOrderingIndices(CurrentOrder);
8523 switch (State) {
8524 case TreeEntry::Vectorize:
8525 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8526 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8527 if (CurrentOrder.empty())
8528 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
8529 TE->dump());
8530 else
8531        LLVM_DEBUG(dbgs()
8532 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
8533 TE->dump());
8534 break;
8535 case TreeEntry::StridedVectorize:
8536      // Vectorizing non-consecutive loads with a constant stride.
8537 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8538 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8539 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
8540 TE->dump());
8541 break;
8542 case TreeEntry::ScatterVectorize:
8543 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
8544 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8545 UserTreeIdx, ReuseShuffleIndices);
8546 LLVM_DEBUG(
8547 dbgs()
8548 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8549 TE->dump());
8550 break;
8551 case TreeEntry::CombinedVectorize:
8552 case TreeEntry::NeedToGather:
8553 llvm_unreachable("Unexpected loads state.");
8554 }
8555 TE->setOperand(*this);
8556 if (State == TreeEntry::ScatterVectorize)
8557 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
8558 return;
8559 }
8560 case Instruction::ZExt:
8561 case Instruction::SExt:
8562 case Instruction::FPToUI:
8563 case Instruction::FPToSI:
8564 case Instruction::FPExt:
8565 case Instruction::PtrToInt:
8566 case Instruction::IntToPtr:
8567 case Instruction::SIToFP:
8568 case Instruction::UIToFP:
8569 case Instruction::Trunc:
8570 case Instruction::FPTrunc:
8571 case Instruction::BitCast: {
8572 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8573 std::make_pair(std::numeric_limits<unsigned>::min(),
8574 std::numeric_limits<unsigned>::max()));
8575 if (ShuffleOrOp == Instruction::ZExt ||
8576 ShuffleOrOp == Instruction::SExt) {
8577 CastMaxMinBWSizes = std::make_pair(
8578 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8579 PrevMaxBW),
8580 std::min<unsigned>(
8581 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8582 PrevMinBW));
8583 } else if (ShuffleOrOp == Instruction::Trunc) {
8584 CastMaxMinBWSizes = std::make_pair(
8585 std::max<unsigned>(
8586 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8587 PrevMaxBW),
8588 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8589 PrevMinBW));
8590 }
8591 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8592 ReuseShuffleIndices);
8593 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
8594 TE->dump());
8595
8596 TE->setOperand(*this);
8597 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8598 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8599 if (ShuffleOrOp == Instruction::Trunc) {
8600 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8601 } else if (ShuffleOrOp == Instruction::SIToFP ||
8602 ShuffleOrOp == Instruction::UIToFP) {
8603 unsigned NumSignBits =
8604 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8605 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
8606 APInt Mask = DB->getDemandedBits(OpI);
8607 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
8608 }
8609 if (NumSignBits * 2 >=
8610 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8611 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8612 }
8613 return;
8614 }
8615 case Instruction::ICmp:
8616 case Instruction::FCmp: {
8617 // Check that all of the compares have the same predicate.
8618 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
8619 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8620 ReuseShuffleIndices);
8621 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
8622 TE->dump());
8623
8624      ValueList Left, Right;
8625 VLOperands Ops(VL, VL0, *this);
8626 if (cast<CmpInst>(VL0)->isCommutative()) {
8627 // Commutative predicate - collect + sort operands of the instructions
8628 // so that each side is more likely to have the same opcode.
8629        assert(P0 == CmpInst::getSwappedPredicate(P0) &&
8630 "Commutative Predicate mismatch");
8631 Ops.reorder();
8632 Left = Ops.getVL(0);
8633 Right = Ops.getVL(1);
8634 } else {
8635 // Collect operands - commute if it uses the swapped predicate.
8636 for (Value *V : VL) {
8637 if (isa<PoisonValue>(V)) {
8638 Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
8639 Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
8640 continue;
8641 }
8642 auto *Cmp = cast<CmpInst>(V);
8643 Value *LHS = Cmp->getOperand(0);
8644 Value *RHS = Cmp->getOperand(1);
8645 if (Cmp->getPredicate() != P0)
8646 std::swap(LHS, RHS);
8647 Left.push_back(LHS);
8648 Right.push_back(RHS);
8649 }
8650 }
8651 TE->setOperand(0, Left);
8652 TE->setOperand(1, Right);
8653 buildTree_rec(Left, Depth + 1, {TE, 0});
8654 buildTree_rec(Right, Depth + 1, {TE, 1});
8655 if (ShuffleOrOp == Instruction::ICmp) {
8656 unsigned NumSignBits0 =
8657 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8658 if (NumSignBits0 * 2 >=
8659 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8660 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8661 unsigned NumSignBits1 =
8662 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
8663 if (NumSignBits1 * 2 >=
8664 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
8665 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
8666 }
8667 return;
8668 }
8669 case Instruction::Select:
8670 case Instruction::FNeg:
8671 case Instruction::Add:
8672 case Instruction::FAdd:
8673 case Instruction::Sub:
8674 case Instruction::FSub:
8675 case Instruction::Mul:
8676 case Instruction::FMul:
8677 case Instruction::UDiv:
8678 case Instruction::SDiv:
8679 case Instruction::FDiv:
8680 case Instruction::URem:
8681 case Instruction::SRem:
8682 case Instruction::FRem:
8683 case Instruction::Shl:
8684 case Instruction::LShr:
8685 case Instruction::AShr:
8686 case Instruction::And:
8687 case Instruction::Or:
8688 case Instruction::Xor:
8689 case Instruction::Freeze: {
8690 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8691 ReuseShuffleIndices);
8692 LLVM_DEBUG(
8693 dbgs() << "SLP: added a new TreeEntry "
8694 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8695 TE->dump());
8696
8697 TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
8698 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8699 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8700 return;
8701 }
8702 case Instruction::GetElementPtr: {
8703 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8704 ReuseShuffleIndices);
8705 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
8706 TE->dump());
8707      SmallVector<ValueList, 2> Operands(2);
8708 // Prepare the operand vector for pointer operands.
8709 for (Value *V : VL) {
8710 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8711 if (!GEP) {
8712 Operands.front().push_back(V);
8713 continue;
8714 }
8715 Operands.front().push_back(GEP->getPointerOperand());
8716 }
8717 TE->setOperand(0, Operands.front());
8718 // Need to cast all indices to the same type before vectorization to
8719 // avoid crash.
8720 // Required to be able to find correct matches between different gather
8721 // nodes and reuse the vectorized values rather than trying to gather them
8722 // again.
8723 int IndexIdx = 1;
8724 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8725 Type *Ty = all_of(VL,
8726 [VL0Ty, IndexIdx](Value *V) {
8727 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8728 if (!GEP)
8729 return true;
8730 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
8731 })
8732 ? VL0Ty
8733 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
8734 ->getPointerOperandType()
8735 ->getScalarType());
8736 // Prepare the operand vector.
8737 for (Value *V : VL) {
8738 auto *I = dyn_cast<GetElementPtrInst>(V);
8739 if (!I) {
8740 Operands.back().push_back(
8741 ConstantInt::get(Ty, 0, /*isSigned=*/false));
8742 continue;
8743 }
8744 auto *Op = I->getOperand(IndexIdx);
8745 auto *CI = dyn_cast<ConstantInt>(Op);
8746 if (!CI)
8747 Operands.back().push_back(Op);
8748 else
8749 Operands.back().push_back(ConstantFoldIntegerCast(
8750 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8751 }
8752 TE->setOperand(IndexIdx, Operands.back());
8753
8754 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
8755 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8756 return;
8757 }
8758 case Instruction::Store: {
8759 bool Consecutive = CurrentOrder.empty();
8760 if (!Consecutive)
8761 fixupOrderingIndices(CurrentOrder);
8762 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8763 ReuseShuffleIndices, CurrentOrder);
8764 if (Consecutive)
8765 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
8766 TE->dump());
8767 else
8768 LLVM_DEBUG(
8769 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
8770 TE->dump());
8771 TE->setOperand(*this);
8772 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
8773 return;
8774 }
8775 case Instruction::Call: {
8776 // Check if the calls are all to the same vectorizable intrinsic or
8777 // library function.
8778 CallInst *CI = cast<CallInst>(VL0);
8779      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8780
8781 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8782 ReuseShuffleIndices);
8783 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
8784 TE->dump());
8785 TE->setOperand(*this, isCommutative(VL0));
8786 for (unsigned I : seq<unsigned>(CI->arg_size())) {
8787        // For scalar operands there is no need to create an entry since they
8788        // do not need to be vectorized.
8789        if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
8790 continue;
8791 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8792 }
8793 return;
8794 }
8795 case Instruction::ShuffleVector: {
8796 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8797 ReuseShuffleIndices);
8798 if (S.isAltShuffle()) {
8799 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
8800 TE->dump());
8801 } else {
8802 assert(SLPReVec && "Only supported by REVEC.");
8803 LLVM_DEBUG(
8804 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8805 TE->dump());
8806 }
8807
8808 // Reorder operands if reordering would enable vectorization.
8809 auto *CI = dyn_cast<CmpInst>(VL0);
8810 if (CI && any_of(VL, [](Value *V) {
8811 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8812 })) {
8813 auto *MainCI = cast<CmpInst>(S.getMainOp());
8814 auto *AltCI = cast<CmpInst>(S.getAltOp());
8815 CmpInst::Predicate MainP = MainCI->getPredicate();
8816 CmpInst::Predicate AltP = AltCI->getPredicate();
8817 assert(MainP != AltP &&
8818 "Expected different main/alternate predicates.");
8819      ValueList Left, Right;
8820 // Collect operands - commute if it uses the swapped predicate or
8821 // alternate operation.
8822 for (Value *V : VL) {
8823 if (isa<PoisonValue>(V)) {
8824 Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
8825 Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
8826 continue;
8827 }
8828 auto *Cmp = cast<CmpInst>(V);
8829 Value *LHS = Cmp->getOperand(0);
8830 Value *RHS = Cmp->getOperand(1);
8831
8832 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
8833 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8834 std::swap(LHS, RHS);
8835 } else {
8836 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8837 std::swap(LHS, RHS);
8838 }
8839 Left.push_back(LHS);
8840 Right.push_back(RHS);
8841 }
8842 TE->setOperand(0, Left);
8843 TE->setOperand(1, Right);
8844 buildTree_rec(Left, Depth + 1, {TE, 0});
8845 buildTree_rec(Right, Depth + 1, {TE, 1});
8846 return;
8847 }
8848
8849 TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
8850 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8851 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8852 return;
8853 }
8854 default:
8855 break;
8856 }
8857 llvm_unreachable("Unexpected vectorization of the instructions.");
8858}
8859
8860unsigned BoUpSLP::canMapToVector(Type *T) const {
8861 unsigned N = 1;
8862 Type *EltTy = T;
8863
8864 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8865 if (EltTy->isEmptyTy())
8866 return 0;
8867 if (auto *ST = dyn_cast<StructType>(EltTy)) {
8868 // Check that struct is homogeneous.
8869 for (const auto *Ty : ST->elements())
8870 if (Ty != *ST->element_begin())
8871 return 0;
8872 N *= ST->getNumElements();
8873 EltTy = *ST->element_begin();
8874 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
8875 N *= AT->getNumElements();
8876 EltTy = AT->getElementType();
8877 } else {
8878 auto *VT = cast<FixedVectorType>(EltTy);
8879 N *= VT->getNumElements();
8880 EltTy = VT->getElementType();
8881 }
8882 }
8883
8884 if (!isValidElementType(EltTy))
8885 return 0;
8886 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
8887 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8888 VTSize != DL->getTypeStoreSizeInBits(T))
8889 return 0;
8890 return N;
8891}
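// Illustrative example for canMapToVector above: a homogeneous aggregate such
// as { float, float, float, float } maps to N = 4 with element type float
// (assuming a 128-bit vector register fits the min/max register size limits),
// whereas a non-homogeneous aggregate like { float, i32 } returns 0.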
8892
8893bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
8894 SmallVectorImpl<unsigned> &CurrentOrder,
8895 bool ResizeAllowed) const {
8896 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8897 assert(It != VL.end() && "Expected at least one extract instruction.");
8898 auto *E0 = cast<Instruction>(*It);
8899 assert(
8900 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8901 "Invalid opcode");
8902 // Check if all of the extracts come from the same vector and from the
8903 // correct offset.
8904 Value *Vec = E0->getOperand(0);
8905
8906 CurrentOrder.clear();
8907
8908 // We have to extract from a vector/aggregate with the same number of elements.
8909 unsigned NElts;
8910 if (E0->getOpcode() == Instruction::ExtractValue) {
8911 NElts = canMapToVector(Vec->getType());
8912 if (!NElts)
8913 return false;
8914 // Check if load can be rewritten as load of vector.
8915 LoadInst *LI = dyn_cast<LoadInst>(Vec);
8916 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
8917 return false;
8918 } else {
8919 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
8920 }
8921
8922 unsigned E = VL.size();
8923 if (!ResizeAllowed && NElts != E)
8924 return false;
8925 SmallVector<int> Indices(E, PoisonMaskElem);
8926 unsigned MinIdx = NElts, MaxIdx = 0;
8927 for (auto [I, V] : enumerate(VL)) {
8928 auto *Inst = dyn_cast<Instruction>(V);
8929 if (!Inst)
8930 continue;
8931 if (Inst->getOperand(0) != Vec)
8932 return false;
8933 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
8934 if (isa<UndefValue>(EE->getIndexOperand()))
8935 continue;
8936 std::optional<unsigned> Idx = getExtractIndex(Inst);
8937 if (!Idx)
8938 return false;
8939 const unsigned ExtIdx = *Idx;
8940 if (ExtIdx >= NElts)
8941 continue;
8942 Indices[I] = ExtIdx;
8943 if (MinIdx > ExtIdx)
8944 MinIdx = ExtIdx;
8945 if (MaxIdx < ExtIdx)
8946 MaxIdx = ExtIdx;
8947 }
8948 if (MaxIdx - MinIdx + 1 > E)
8949 return false;
8950 if (MaxIdx + 1 <= E)
8951 MinIdx = 0;
8952
8953 // Check that all of the indices extract from the correct offset.
8954 bool ShouldKeepOrder = true;
8955  // Assign to all items the initial value E so we can check if the extract
8956  // instruction index was used already.
8957  // Also, later we can check that all the indices are used and that we have
8958  // a consecutive access in the extract instructions, by checking that no
8959  // element of CurrentOrder still has the value E.
8960 CurrentOrder.assign(E, E);
8961 for (unsigned I = 0; I < E; ++I) {
8962 if (Indices[I] == PoisonMaskElem)
8963 continue;
8964 const unsigned ExtIdx = Indices[I] - MinIdx;
8965 if (CurrentOrder[ExtIdx] != E) {
8966 CurrentOrder.clear();
8967 return false;
8968 }
8969 ShouldKeepOrder &= ExtIdx == I;
8970 CurrentOrder[ExtIdx] = I;
8971 }
8972 if (ShouldKeepOrder)
8973 CurrentOrder.clear();
8974
8975 return ShouldKeepOrder;
8976}
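// Illustrative example for canReuseExtract above: for a bundle of extracts
// from a single source vector
// \code
//   %e0 = extractelement <4 x float> %v, i32 2
//   %e1 = extractelement <4 x float> %v, i32 3
//   %e2 = extractelement <4 x float> %v, i32 0
//   %e3 = extractelement <4 x float> %v, i32 1
// \endcode
// the function returns false but fills CurrentOrder = {2, 3, 0, 1}, which the
// caller treats as a reorderable extract sequence; for the identity order
// {0, 1, 2, 3} it returns true and CurrentOrder stays empty.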
8977
8978bool BoUpSLP::areAllUsersVectorized(
8979 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
8980 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
8981 all_of(I->users(), [this](User *U) {
8982 return ScalarToTreeEntry.contains(U) ||
8983 isVectorLikeInstWithConstOps(U) ||
8984 (isa<ExtractElementInst>(U) && MustGather.contains(U));
8985 });
8986}
8987
8988static std::pair<InstructionCost, InstructionCost>
8989getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
8990                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8991 ArrayRef<Type *> ArgTys) {
8992  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8993
8994 // Calculate the cost of the scalar and vector calls.
8995 FastMathFlags FMF;
8996 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
8997 FMF = FPCI->getFastMathFlags();
8998  SmallVector<const Value *> Arguments(CI->args());
8999 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
9000 dyn_cast<IntrinsicInst>(CI));
9001 auto IntrinsicCost =
9002      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9003
9004 auto Shape = VFShape::get(CI->getFunctionType(),
9005                            ElementCount::getFixed(VecTy->getNumElements()),
9006 false /*HasGlobalPred*/);
9007 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9008 auto LibCost = IntrinsicCost;
9009 if (!CI->isNoBuiltin() && VecFunc) {
9010 // Calculate the cost of the vector library call.
9011 // If the corresponding vector call is cheaper, return its cost.
9012 LibCost =
9013 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9014 }
9015 return {IntrinsicCost, LibCost};
9016}
9017
9018void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
9019 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
9020 SmallVectorImpl<Value *> *OpScalars,
9021 SmallVectorImpl<Value *> *AltScalars) const {
9022 unsigned Sz = Scalars.size();
9023 Mask.assign(Sz, PoisonMaskElem);
9024 SmallVector<int> OrderMask;
9025 if (!ReorderIndices.empty())
9026 inversePermutation(ReorderIndices, OrderMask);
9027 for (unsigned I = 0; I < Sz; ++I) {
9028 unsigned Idx = I;
9029 if (!ReorderIndices.empty())
9030 Idx = OrderMask[I];
9031 if (isa<PoisonValue>(Scalars[Idx]))
9032 continue;
9033 auto *OpInst = cast<Instruction>(Scalars[Idx]);
9034 if (IsAltOp(OpInst)) {
9035 Mask[I] = Sz + Idx;
9036 if (AltScalars)
9037 AltScalars->push_back(OpInst);
9038 } else {
9039 Mask[I] = Idx;
9040 if (OpScalars)
9041 OpScalars->push_back(OpInst);
9042 }
9043 }
9044 if (!ReuseShuffleIndices.empty()) {
9045 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
9046 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
9047 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
9048 });
9049 Mask.swap(NewMask);
9050 }
9051}
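// Sketch of the mask built above: for Scalars = {add, sub, add, sub} with
// IsAltOp selecting the subs, the mask becomes {0, Sz + 1, 2, Sz + 3}, i.e.
// {0, 5, 2, 7} for Sz = 4, selecting even lanes from the "main" vector and
// odd lanes from the "alternate" vector (assuming no reordering and no reuse
// indices).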
9052
9053static bool isAlternateInstruction(const Instruction *I,
9054 const Instruction *MainOp,
9055 const Instruction *AltOp,
9056 const TargetLibraryInfo &TLI) {
9057 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
9058 auto *AltCI = cast<CmpInst>(AltOp);
9059 CmpInst::Predicate MainP = MainCI->getPredicate();
9060 CmpInst::Predicate AltP = AltCI->getPredicate();
9061 assert(MainP != AltP && "Expected different main/alternate predicates.");
9062 auto *CI = cast<CmpInst>(I);
9063 if (isCmpSameOrSwapped(MainCI, CI, TLI))
9064 return false;
9065 if (isCmpSameOrSwapped(AltCI, CI, TLI))
9066 return true;
9067 CmpInst::Predicate P = CI->getPredicate();
9068    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
9069
9070 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
9071 "CmpInst expected to match either main or alternate predicate or "
9072 "their swap.");
9073 (void)AltP;
9074 return MainP != P && MainP != SwappedP;
9075 }
9076 return I->getOpcode() == AltOp->getOpcode();
9077}
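// For compares the decision above is predicate-based rather than opcode-based.
// Roughly, with a main "icmp slt" and an alternate "icmp sge", an "icmp slt"
// (or its operand-swapped "icmp sgt" form) is classified as the main operation
// and returns false, while a compare matching the alternate predicate (or its
// swap) returns true.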
9078
9079TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
9080 assert(!Ops.empty());
9081 const auto *Op0 = Ops.front();
9082
9083 const bool IsConstant = all_of(Ops, [](Value *V) {
9084 // TODO: We should allow undef elements here
9085 return isConstant(V) && !isa<UndefValue>(V);
9086 });
9087 const bool IsUniform = all_of(Ops, [=](Value *V) {
9088 // TODO: We should allow undef elements here
9089 return V == Op0;
9090 });
9091 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
9092 // TODO: We should allow undef elements here
9093 if (auto *CI = dyn_cast<ConstantInt>(V))
9094 return CI->getValue().isPowerOf2();
9095 return false;
9096 });
9097 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
9098 // TODO: We should allow undef elements here
9099 if (auto *CI = dyn_cast<ConstantInt>(V))
9100 return CI->getValue().isNegatedPowerOf2();
9101 return false;
9102 });
9103
9104  TTI::OperandValueKind VK = TTI::OK_AnyValue;
9105 if (IsConstant && IsUniform)
9106    VK = TTI::OK_UniformConstantValue;
9107 else if (IsConstant)
9108    VK = TTI::OK_NonUniformConstantValue;
9109 else if (IsUniform)
9110    VK = TTI::OK_UniformValue;
9111
9112  TTI::OperandValueProperties VP = TTI::OP_None;
9113 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
9114 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
9115
9116 return {VK, VP};
9117}
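// Illustrative mapping for getOperandInfo above: an all-equal constant operand
// list such as {i32 8, i32 8, i32 8, i32 8} yields
// {OK_UniformConstantValue, OP_PowerOf2}, mixed constants yield
// OK_NonUniformConstantValue, and a splat of a non-constant value yields
// OK_UniformValue with OP_None.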
9118
9119namespace {
9120/// The base class for shuffle instruction emission and shuffle cost estimation.
9121class BaseShuffleAnalysis {
9122protected:
9123 Type *ScalarTy = nullptr;
9124
9125 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
9126
9127 /// V is expected to be a vectorized value.
9128 /// When REVEC is disabled, there is no difference between VF and
9129 /// VNumElements.
9130 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
9131 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
9132 /// of 8.
9133 unsigned getVF(Value *V) const {
9134 assert(V && "V cannot be nullptr");
9135 assert(isa<FixedVectorType>(V->getType()) &&
9136 "V does not have FixedVectorType");
9137 assert(ScalarTy && "ScalarTy cannot be nullptr");
9138 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9139 unsigned VNumElements =
9140 cast<FixedVectorType>(V->getType())->getNumElements();
9141 assert(VNumElements > ScalarTyNumElements &&
9142 "the number of elements of V is not large enough");
9143 assert(VNumElements % ScalarTyNumElements == 0 &&
9144 "the number of elements of V is not a vectorized value");
9145 return VNumElements / ScalarTyNumElements;
9146 }
9147
9148 /// Checks if the mask is an identity mask.
9149  /// \param IsStrict if true, the function returns false if the mask size does
9150  /// not match the vector size.
9151 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
9152 bool IsStrict) {
9153 int Limit = Mask.size();
9154 int VF = VecTy->getNumElements();
9155 int Index = -1;
9156 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
9157 return true;
9158 if (!IsStrict) {
9159 // Consider extract subvector starting from index 0.
9160 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
9161 Index == 0)
9162 return true;
9163 // All VF-size submasks are identity (e.g.
9164 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
9165 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
9166 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
9167 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
9168 ShuffleVectorInst::isIdentityMask(Slice, VF);
9169 }))
9170 return true;
9171 }
9172 return false;
9173 }
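// Example: for a <4 x ty> vector the mask <0, 1, 2, 3> is an identity mask in
// both modes, while the shorter mask <0, 1> is only accepted when IsStrict is
// false, because it merely extracts the leading subvector at index 0.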
9174
9175 /// Tries to combine 2 different masks into single one.
9176 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
9177 /// change the size of the vector, \p LocalVF is the original size of the
9178 /// shuffled vector.
9179 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
9180 ArrayRef<int> ExtMask) {
9181 unsigned VF = Mask.size();
9182 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
9183 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
9184 if (ExtMask[I] == PoisonMaskElem)
9185 continue;
9186 int MaskedIdx = Mask[ExtMask[I] % VF];
9187 NewMask[I] =
9188 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
9189 }
9190 Mask.swap(NewMask);
9191 }
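// Worked example: with LocalVF = 4, Mask = <1, 0, 3, 2> and
// ExtMask = <2, 3, poison, poison>, the combined mask is
// <3, 2, poison, poison>: lane 0 of the outer shuffle reads lane 2 of the
// inner shuffle, which reads lane 3 of the original vector, so applying the
// combined mask once is equivalent to applying Mask followed by ExtMask.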
9192
9193 /// Looks through shuffles trying to reduce final number of shuffles in the
9194 /// code. The function looks through the previously emitted shuffle
9195 /// instructions and properly marks indices in the mask as undef.
9196 /// For example, given the code
9197 /// \code
9198 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
9199 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
9200 /// \endcode
9201 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
9202 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9203 /// <0, 1, 2, 3> for the shuffle.
9204 /// If 2 operands are of different size, the smallest one will be resized and
9205 /// the mask recalculated properly.
9206 /// For example, given the code
9207 /// \code
9208 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
9209 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
9210 /// \endcode
9211 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
9212 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9213 /// <0, 1, 2, 3> for the shuffle.
9214 /// So, it tries to transform permutations to simple vector merge, if
9215 /// possible.
9216 /// \param V The input vector which must be shuffled using the given \p Mask.
9217 /// If the better candidate is found, \p V is set to this best candidate
9218 /// vector.
9219 /// \param Mask The input mask for the shuffle. If the best candidate is found
9220 /// during looking-through-shuffles attempt, it is updated accordingly.
9221 /// \param SinglePermute true if the shuffle operation is originally a
9222 /// single-value-permutation. In this case the look-through-shuffles procedure
9223 /// may look for resizing shuffles as the best candidates.
9224 /// \return true if the shuffle results in the non-resizing identity shuffle
9225 /// (and thus can be ignored), false - otherwise.
9226 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
9227 bool SinglePermute) {
9228 Value *Op = V;
9229 ShuffleVectorInst *IdentityOp = nullptr;
9230 SmallVector<int> IdentityMask;
9231 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
9232 // Exit if not a fixed vector type or changing size shuffle.
9233 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9234 if (!SVTy)
9235 break;
9236 // Remember the identity or broadcast mask, if it is not a resizing
9237 // shuffle. If no better candidates are found, this Op and Mask will be
9238 // used in the final shuffle.
9239 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
9240 if (!IdentityOp || !SinglePermute ||
9241 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
9242 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
9243 IdentityMask.size()))) {
9244 IdentityOp = SV;
9245 // Store the current mask in IdentityMask so that we do not lose this
9246 // info later if IdentityOp is selected as the best candidate for the
9247 // permutation.
9248 IdentityMask.assign(Mask);
9249 }
9250 }
9251 // Remember the broadcast mask. If no better candidates are found, this Op
9252 // and Mask will be used in the final shuffle.
9253 // Zero splat can be used as identity too, since it might be used with
9254 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
9255 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
9256 // expensive, and the analysis finds out that the source vector is just a
9257 // broadcast, this original mask can be transformed to the identity mask <0,
9258 // 1, 2, 3>.
9259 // \code
9260 // %0 = shuffle %v, poison, zeroinitalizer
9261 // %res = shuffle %0, poison, <3, 1, 2, 0>
9262 // \endcode
9263 // may be transformed to
9264 // \code
9265 // %0 = shuffle %v, poison, zeroinitalizer
9266 // %res = shuffle %0, poison, <0, 1, 2, 3>
9267 // \endcode
9268 if (SV->isZeroEltSplat()) {
9269 IdentityOp = SV;
9270 IdentityMask.assign(Mask);
9271 }
9272 int LocalVF = Mask.size();
9273 if (auto *SVOpTy =
9274 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9275 LocalVF = SVOpTy->getNumElements();
9276 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
9277 for (auto [Idx, I] : enumerate(Mask)) {
9278 if (I == PoisonMaskElem ||
9279 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
9280 continue;
9281 ExtMask[Idx] = SV->getMaskValue(I);
9282 }
9283 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
9284 SV->getOperand(0),
9285 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
9286 .all();
9287 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
9288 SV->getOperand(1),
9289 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
9290 .all();
9291 if (!IsOp1Undef && !IsOp2Undef) {
9292 // Update mask and mark undef elems.
9293 for (int &I : Mask) {
9294 if (I == PoisonMaskElem)
9295 continue;
9296 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
9297 PoisonMaskElem)
9298 I = PoisonMaskElem;
9299 }
9300 break;
9301 }
9302 SmallVector<int> ShuffleMask(SV->getShuffleMask());
9303 combineMasks(LocalVF, ShuffleMask, Mask);
9304 Mask.swap(ShuffleMask);
9305 if (IsOp2Undef)
9306 Op = SV->getOperand(0);
9307 else
9308 Op = SV->getOperand(1);
9309 }
9310 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
9311 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9312 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
9313 if (IdentityOp) {
9314 V = IdentityOp;
9315 assert(Mask.size() == IdentityMask.size() &&
9316 "Expected masks of same sizes.");
9317 // Clear known poison elements.
9318 for (auto [I, Idx] : enumerate(Mask))
9319 if (Idx == PoisonMaskElem)
9320 IdentityMask[I] = PoisonMaskElem;
9321 Mask.swap(IdentityMask);
9322 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9323 return SinglePermute &&
9324 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
9325 /*IsStrict=*/true) ||
9326 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
9327 Shuffle->isZeroEltSplat() &&
9328 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
9329 }
9330 V = Op;
9331 return false;
9332 }
9333 V = Op;
9334 return true;
9335 }
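// For instance, if V is '%s = shufflevector <4 x ty> %v, poison, <1, 0, 3, 2>'
// and the requested Mask is <1, 0, 3, 2>, looking through %s cancels the two
// permutations: Mask becomes the identity <0, 1, 2, 3>, V is replaced by %v,
// and the function reports that no extra shuffle needs to be emitted.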
9336
9337 /// Smart shuffle instruction emission, walks through shuffles trees and
9338 /// tries to find the best matching vector for the actual shuffle
9339 /// instruction.
9340 template <typename T, typename ShuffleBuilderTy>
9341 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
9342 ShuffleBuilderTy &Builder) {
9343 assert(V1 && "Expected at least one vector value.");
9344 if (V2)
9345 Builder.resizeToMatch(V1, V2);
9346 int VF = Mask.size();
9347 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
9348 VF = FTy->getNumElements();
9349 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9350 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
9351 .all()) {
9352 // Peek through shuffles.
9353 Value *Op1 = V1;
9354 Value *Op2 = V2;
9355 int VF =
9356 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9357 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
9358 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
9359 for (int I = 0, E = Mask.size(); I < E; ++I) {
9360 if (Mask[I] < VF)
9361 CombinedMask1[I] = Mask[I];
9362 else
9363 CombinedMask2[I] = Mask[I] - VF;
9364 }
9365 Value *PrevOp1;
9366 Value *PrevOp2;
9367 do {
9368 PrevOp1 = Op1;
9369 PrevOp2 = Op2;
9370 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
9371 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
9372 // Check if we have 2 resizing shuffles - need to peek through operands
9373 // again.
9374 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9375 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9376 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
9377 for (auto [Idx, I] : enumerate(CombinedMask1)) {
9378 if (I == PoisonMaskElem)
9379 continue;
9380 ExtMask1[Idx] = SV1->getMaskValue(I);
9381 }
9382 SmallBitVector UseMask1 = buildUseMask(
9383 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9384 ->getNumElements(),
9385 ExtMask1, UseMask::SecondArg);
9386 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
9387 for (auto [Idx, I] : enumerate(CombinedMask2)) {
9388 if (I == PoisonMaskElem)
9389 continue;
9390 ExtMask2[Idx] = SV2->getMaskValue(I);
9391 }
9392 SmallBitVector UseMask2 = buildUseMask(
9393 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9394 ->getNumElements(),
9395 ExtMask2, UseMask::SecondArg);
9396 if (SV1->getOperand(0)->getType() ==
9397 SV2->getOperand(0)->getType() &&
9398 SV1->getOperand(0)->getType() != SV1->getType() &&
9399 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
9400 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
9401 Op1 = SV1->getOperand(0);
9402 Op2 = SV2->getOperand(0);
9403 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
9404 int LocalVF = ShuffleMask1.size();
9405 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
9406 LocalVF = FTy->getNumElements();
9407 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9408 CombinedMask1.swap(ShuffleMask1);
9409 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
9410 LocalVF = ShuffleMask2.size();
9411 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
9412 LocalVF = FTy->getNumElements();
9413 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9414 CombinedMask2.swap(ShuffleMask2);
9415 }
9416 }
9417 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
9418 Builder.resizeToMatch(Op1, Op2);
9419 VF = std::max(cast<VectorType>(Op1->getType())
9420 ->getElementCount()
9421 .getKnownMinValue(),
9422 cast<VectorType>(Op2->getType())
9423 ->getElementCount()
9424 .getKnownMinValue());
9425 for (int I = 0, E = Mask.size(); I < E; ++I) {
9426 if (CombinedMask2[I] != PoisonMaskElem) {
9427 assert(CombinedMask1[I] == PoisonMaskElem &&
9428 "Expected undefined mask element");
9429 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
9430 }
9431 }
9432 if (Op1 == Op2 &&
9433 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
9434 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
9435 isa<ShuffleVectorInst>(Op1) &&
9436 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9437 ArrayRef(CombinedMask1))))
9438 return Builder.createIdentity(Op1);
9439 return Builder.createShuffleVector(
9440 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
9441 CombinedMask1);
9442 }
9443 if (isa<PoisonValue>(V1))
9444 return Builder.createPoison(
9445 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
9446 SmallVector<int> NewMask(Mask);
9447 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
9448 assert(V1 && "Expected non-null value after looking through shuffles.");
9449
9450 if (!IsIdentity)
9451 return Builder.createShuffleVector(V1, NewMask);
9452 return Builder.createIdentity(V1);
9453 }
9454};
9455} // namespace
9456
9457/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
9458static std::pair<InstructionCost, InstructionCost>
9459 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
9460 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
9461 Type *ScalarTy, VectorType *VecTy) {
9462 InstructionCost ScalarCost = 0;
9463 InstructionCost VecCost = 0;
9464 // Here we differentiate two cases: (1) when Ptrs represent a regular
9465 // vectorization tree node (as they are pointer arguments of scattered
9466 // loads) or (2) when Ptrs are the arguments of loads or stores being
9467 // vectorized as a plain wide unit-stride load/store since all the
9468 // loads/stores are known to be from/to adjacent locations.
9469 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9470 // Case 2: estimate costs for pointer related costs when vectorizing to
9471 // a wide load/store.
9472 // Scalar cost is estimated as a set of pointers with known relationship
9473 // between them.
9474 // For vector code we will use BasePtr as argument for the wide load/store
9475 // but we also need to account all the instructions which are going to
9476 // stay in vectorized code due to uses outside of these scalar
9477 // loads/stores.
9478 ScalarCost = TTI.getPointersChainCost(
9479 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9480 CostKind);
9481
9482 SmallVector<const Value *> PtrsRetainedInVecCode;
9483 for (Value *V : Ptrs) {
9484 if (V == BasePtr) {
9485 PtrsRetainedInVecCode.push_back(V);
9486 continue;
9487 }
9488 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9489 // For simplicity, assume Ptr stays in vectorized code if it's not a
9490 // GEP instruction. We don't care since its cost is considered free.
9491 // TODO: We should check for any uses outside of vectorizable tree
9492 // rather than just single use.
9493 if (!Ptr || !Ptr->hasOneUse())
9494 PtrsRetainedInVecCode.push_back(V);
9495 }
9496
9497 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
9498 // If all pointers stay in vectorized code then we don't have
9499 // any savings on that.
9500 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
9501 }
9502 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
9503 TTI::PointersChainInfo::getKnownStride(),
9504 VecTy, CostKind);
9505 } else {
9506 // Case 1: Ptrs are the arguments of loads that we are going to transform
9507 // into masked gather load intrinsic.
9508 // All the scalar GEPs will be removed as a result of vectorization.
9509 // For any external uses of some lanes extract element instructions will
9510 // be generated (which cost is estimated separately).
9511 TTI::PointersChainInfo PtrsInfo =
9512 all_of(Ptrs,
9513 [](const Value *V) {
9514 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9515 return Ptr && !Ptr->hasAllConstantIndices();
9516 })
9517 ? TTI::PointersChainInfo::getUnknownStride()
9518 : TTI::PointersChainInfo::getKnownStride();
9519
9520 ScalarCost =
9521 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
9522 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9523 if (!BaseGEP) {
9524 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
9525 if (It != Ptrs.end())
9526 BaseGEP = cast<GEPOperator>(*It);
9527 }
9528 if (BaseGEP) {
9529 SmallVector<const Value *> Indices(BaseGEP->indices());
9530 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
9531 BaseGEP->getPointerOperand(), Indices, VecTy,
9532 CostKind);
9533 }
9534 }
9535
9536 return std::make_pair(ScalarCost, VecCost);
9537}
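// As a rough illustration of case (2) above: for four consecutive scalar loads
// whose addresses are GEPs off a common base, the scalar side is costed as a
// unit-stride pointer chain, while the vector side keeps only the base pointer
// (plus any GEP still used outside the scalar loads) for the single wide load;
// if every pointer would survive vectorization anyway, both costs are reported
// as free because nothing is saved.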
9538
9539void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9540 assert(TE.isGather() && TE.ReorderIndices.empty() &&
9541 "Expected gather node without reordering.");
9542 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
9543 SmallSet<size_t, 2> LoadKeyUsed;
9544
9545 // Do not reorder the node if it is small (just 2 elements), all-constant,
9546 // or all its instructions have the same opcode already.
9547 if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) ||
9548 all_of(TE.Scalars, isConstant))
9549 return;
9550
9551 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
9552 return VectorizableTree[Idx]->isSame(TE.Scalars);
9553 }))
9554 return;
9555
9556 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
9557 Key = hash_combine(hash_value(LI->getParent()), Key);
9558 Value *Ptr =
9559 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
9560 if (LoadKeyUsed.contains(Key)) {
9561 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
9562 if (LIt != LoadsMap.end()) {
9563 for (LoadInst *RLI : LIt->second) {
9564 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
9565 LI->getType(), LI->getPointerOperand(), *DL, *SE,
9566 /*StrictCheck=*/true))
9567 return hash_value(RLI->getPointerOperand());
9568 }
9569 for (LoadInst *RLI : LIt->second) {
9570 if (arePointersCompatible(RLI->getPointerOperand(),
9571 LI->getPointerOperand(), *TLI)) {
9572 hash_code SubKey = hash_value(RLI->getPointerOperand());
9573 return SubKey;
9574 }
9575 }
9576 if (LIt->second.size() > 2) {
9577 hash_code SubKey =
9578 hash_value(LIt->second.back()->getPointerOperand());
9579 return SubKey;
9580 }
9581 }
9582 }
9583 LoadKeyUsed.insert(Key);
9584 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
9585 return hash_value(LI->getPointerOperand());
9586 };
9587 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
9588 DenseMap<Value *, SmallVector<unsigned>> KeyToIndex;
9589 bool IsOrdered = true;
9590 unsigned NumInstructions = 0;
9591 // Try to "cluster" scalar instructions, to be able to build extra vectorized
9592 // nodes.
9593 for (auto [I, V] : enumerate(TE.Scalars)) {
9594 size_t Key = 1, Idx = 1;
9595 if (auto *Inst = dyn_cast<Instruction>(V);
9596 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9597 !isDeleted(Inst) && !isVectorized(V)) {
9598 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
9599 /*AllowAlternate=*/false);
9600 ++NumInstructions;
9601 }
9602 auto &Container = SortedValues[Key];
9603 if (IsOrdered && !KeyToIndex.contains(V) &&
9604 !(isa<Constant, ExtractElementInst>(V) ||
9605 isVectorLikeInstWithConstOps(V)) &&
9606 ((Container.contains(Idx) &&
9607 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
9608 (!Container.empty() && !Container.contains(Idx) &&
9609 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
9610 IsOrdered = false;
9611 auto &KTI = KeyToIndex[V];
9612 if (KTI.empty())
9613 Container[Idx].push_back(V);
9614 KTI.push_back(I);
9615 }
9616 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
9617 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
9618 if (!IsOrdered && NumInstructions > 1) {
9619 unsigned Cnt = 0;
9620 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
9621 for (const auto &D : SortedValues) {
9622 for (const auto &P : D.second) {
9623 unsigned Sz = 0;
9624 for (Value *V : P.second) {
9625 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
9626 for (auto [K, Idx] : enumerate(Indices)) {
9627 TE.ReorderIndices[Cnt + K] = Idx;
9628 TE.Scalars[Cnt + K] = V;
9629 }
9630 Sz += Indices.size();
9631 Cnt += Indices.size();
9632 }
9633 if (Sz > 1 && isa<Instruction>(P.second.front())) {
9634 const unsigned SubVF = getFloorFullVectorNumberOfElements(
9635 *TTI, TE.Scalars.front()->getType(), Sz);
9636 SubVectors.emplace_back(Cnt - Sz, SubVF);
9637 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9638 DemandedElts.clearBit(I);
9639 } else if (!P.second.empty() && isConstant(P.second.front())) {
9640 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
9641 DemandedElts.clearBit(I);
9642 }
9643 }
9644 }
9645 }
9646 // Reuses always require shuffles, so consider it as profitable.
9647 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
9648 return;
9649 // Do simple cost estimation.
9650 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9651 InstructionCost Cost = 0;
9652 auto *ScalarTy = TE.Scalars.front()->getType();
9653 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
9654 for (auto [Idx, Sz] : SubVectors) {
9655 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
9656 Idx, getWidenedType(ScalarTy, Sz));
9657 }
9658 if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9659 assert(SLPReVec && "Only supported by REVEC.");
9660 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
9661 // of CreateInsertElement.
9662 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9663 for (unsigned I : seq<unsigned>(TE.Scalars.size()))
9664 if (DemandedElts[I])
9665 Cost +=
9666 TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
9667 CostKind, I * ScalarTyNumElements, FTy);
9668 } else {
9669 Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
9670 /*Extract=*/false, CostKind);
9671 }
9672 int Sz = TE.Scalars.size();
9673 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
9674 TE.ReorderIndices.end());
9675 for (unsigned I : seq<unsigned>(Sz)) {
9676 Value *V = TE.getOrdered(I);
9677 if (isa<PoisonValue>(V)) {
9678 ReorderMask[I] = PoisonMaskElem;
9679 } else if (isConstant(V) || DemandedElts[I]) {
9680 ReorderMask[I] = I + TE.ReorderIndices.size();
9681 }
9682 }
9683 Cost += ::getShuffleCost(*TTI,
9684 any_of(ReorderMask, [&](int I) { return I >= Sz; })
9685 ? TTI::SK_PermuteTwoSrc
9686 : TTI::SK_PermuteSingleSrc,
9687 VecTy, ReorderMask);
9688 DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
9689 ReorderMask.assign(Sz, PoisonMaskElem);
9690 for (unsigned I : seq<unsigned>(Sz)) {
9691 Value *V = TE.getOrdered(I);
9692 if (isConstant(V)) {
9693 DemandedElts.clearBit(I);
9694 if (!isa<PoisonValue>(V))
9695 ReorderMask[I] = I;
9696 } else {
9697 ReorderMask[I] = I + Sz;
9698 }
9699 }
9700 InstructionCost BVCost = TTI->getScalarizationOverhead(
9701 VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
9702 if (!DemandedElts.isAllOnes())
9703 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
9704 if (Cost >= BVCost) {
9705 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
9706 reorderScalars(TE.Scalars, Mask);
9707 TE.ReorderIndices.clear();
9708 }
9709}
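// Illustrative effect of the reordering above (hypothetical scalars): a gather
// node such as {A[0], B[0], A[1], B[1]} is re-clustered by instruction key into
// {A[0], A[1], B[0], B[1]}, so each pair can later be built as its own small
// vectorized subvector; the new order is kept only if the estimated
// insert-subvector cost beats the plain build-vector cost.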
9710
9711 void BoUpSLP::transformNodes() {
9712 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9713 BaseGraphSize = VectorizableTree.size();
9714 // Turn graph transforming mode on and off, when done.
9715 class GraphTransformModeRAAI {
9716 bool &SavedIsGraphTransformMode;
9717
9718 public:
9719 GraphTransformModeRAAI(bool &IsGraphTransformMode)
9720 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9721 IsGraphTransformMode = true;
9722 }
9723 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
9724 } TransformContext(IsGraphTransformMode);
9725 // Operands are profitable if they are:
9726 // 1. At least one constant
9727 // or
9728 // 2. Splats
9729 // or
9730 // 3. Results in good vectorization opportunity, i.e. may generate vector
9731 // nodes and reduce cost of the graph.
9732 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
9733 const InstructionsState &S) {
9734 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
9735 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
9736 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9737 I2->getOperand(Op));
9738 return all_of(
9739 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9740 return all_of(Cand,
9741 [](const std::pair<Value *, Value *> &P) {
9742 return isa<Constant>(P.first) ||
9743 isa<Constant>(P.second) || P.first == P.second;
9744 }) ||
9745 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
9746 });
9747 };
9748
9749 // Try to reorder gather nodes for better vectorization opportunities.
9750 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9751 TreeEntry &E = *VectorizableTree[Idx];
9752 if (E.isGather())
9753 reorderGatherNode(E);
9754 }
9755
9756 // The tree may grow here, so iterate over nodes, built before.
9757 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9758 TreeEntry &E = *VectorizableTree[Idx];
9759 if (E.isGather()) {
9760 ArrayRef<Value *> VL = E.Scalars;
9761 const unsigned Sz = getVectorElementSize(VL.front());
9762 unsigned MinVF = getMinVF(2 * Sz);
9763 // Do not try partial vectorization for small nodes (<= 2), nodes with the
9764 // same opcode and same parent block or all constants.
9765 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
9766 !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
9767 E.isAltShuffle() || !allSameBlock(VL)) ||
9768 allConstant(VL) || isSplat(VL))
9769 continue;
9770 // Try to find vectorizable sequences and transform them into a series of
9771 // insertvector instructions.
9772 unsigned StartIdx = 0;
9773 unsigned End = VL.size();
9774 for (unsigned VF = getFloorFullVectorNumberOfElements(
9775 *TTI, VL.front()->getType(), VL.size() - 1);
9776 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
9777 *TTI, VL.front()->getType(), VF - 1)) {
9778 if (StartIdx + VF > End)
9779 continue;
9780 SmallVector<std::pair<unsigned, unsigned>> Slices;
9781 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
9782 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9783 // If any instruction is vectorized already - do not try again.
9784 // Reuse the existing node, if it fully matches the slice.
9785 if (const TreeEntry *SE = getTreeEntry(Slice.front());
9786 SE || getTreeEntry(Slice.back())) {
9787 if (!SE)
9788 continue;
9789 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9790 continue;
9791 }
9792 // Constant already handled effectively - skip.
9793 if (allConstant(Slice))
9794 continue;
9795 // Do not try to vectorize small splats (less than vector register and
9796 // only with the single non-undef element).
9797 bool IsSplat = isSplat(Slice);
9798 if (Slices.empty() || !IsSplat ||
9799 (VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9800 Slice.front()->getType(), VF)),
9801 1U, VF - 1) !=
9802 std::clamp(TTI->getNumberOfParts(getWidenedType(
9803 Slice.front()->getType(), 2 * VF)),
9804 1U, 2 * VF)) ||
9805 count(Slice, Slice.front()) ==
9806 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
9807 : 1)) {
9808 if (IsSplat)
9809 continue;
9810 InstructionsState S = getSameOpcode(Slice, *TLI);
9811 if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
9812 (S.getOpcode() == Instruction::Load &&
9813 areKnownNonVectorizableLoads(Slice)) ||
9814 (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
9815 continue;
9816 if (VF == 2) {
9817 // Try to vectorize reduced values or if all users are vectorized.
9818 // For expensive instructions extra extracts might be profitable.
9819 if ((!UserIgnoreList || E.Idx != 0) &&
9820 TTI->getInstructionCost(S.getMainOp(), CostKind) <
9821 TTI::TCC_Expensive &&
9822 !all_of(Slice, [&](Value *V) {
9823 if (isa<PoisonValue>(V))
9824 return true;
9825 return areAllUsersVectorized(cast<Instruction>(V),
9826 UserIgnoreList);
9827 }))
9828 continue;
9829 if (S.getOpcode() == Instruction::Load) {
9830 OrdersType Order;
9831 SmallVector<Value *> PointerOps;
9832 LoadsState Res =
9833 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9834 // Do not vectorize gathers.
9835 if (Res == LoadsState::ScatterVectorize ||
9836 Res == LoadsState::Gather) {
9837 if (Res == LoadsState::Gather) {
9838 registerNonVectorizableLoads(Slice);
9839 // If reductions and the scalars from the root node are
9840 // analyzed - mark as non-vectorizable reduction.
9841 if (UserIgnoreList && E.Idx == 0)
9842 analyzedReductionVals(Slice);
9843 }
9844 continue;
9845 }
9846 } else if (S.getOpcode() == Instruction::ExtractElement ||
9847 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
9848 TTI::TCC_Expensive &&
9849 !CheckOperandsProfitability(
9850 S.getMainOp(),
9851 cast<Instruction>(*find_if(reverse(Slice),
9852 IsaPred<Instruction>)),
9853 S))) {
9854 // Do not vectorize extractelements (handled effectively
9855 // already). Do not vectorize non-profitable instructions (with
9856 // low cost and non-vectorizable operands).
9857 continue;
9858 }
9859 }
9860 }
9861 Slices.emplace_back(Cnt, Slice.size());
9862 }
9863 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
9864 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9865 if (StartIdx == Cnt)
9866 StartIdx = Cnt + Sz;
9867 if (End == Cnt + Sz)
9868 End = Cnt;
9869 };
9870 for (auto [Cnt, Sz] : Slices) {
9871 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
9872 // If any instruction is vectorized already - do not try again.
9873 if (TreeEntry *SE = getTreeEntry(Slice.front());
9874 SE || getTreeEntry(Slice.back())) {
9875 if (!SE)
9876 continue;
9877 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9878 continue;
9879 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9880 AddCombinedNode(SE->Idx, Cnt, Sz);
9881 continue;
9882 }
9883 unsigned PrevSize = VectorizableTree.size();
9884 [[maybe_unused]] unsigned PrevEntriesSize =
9885 LoadEntriesToVectorize.size();
9886 buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
9887 if (PrevSize + 1 == VectorizableTree.size() &&
9888 VectorizableTree[PrevSize]->isGather() &&
9889 VectorizableTree[PrevSize]->getOpcode() !=
9890 Instruction::ExtractElement &&
9891 !isSplat(Slice)) {
9892 if (UserIgnoreList && E.Idx == 0 && VF == 2)
9893 analyzedReductionVals(Slice);
9894 VectorizableTree.pop_back();
9895 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9896 "LoadEntriesToVectorize expected to remain the same");
9897 continue;
9898 }
9899 AddCombinedNode(PrevSize, Cnt, Sz);
9900 }
9901 }
9902 // Restore ordering, if no extra vectorization happened.
9903 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9904 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9905 reorderScalars(E.Scalars, Mask);
9906 E.ReorderIndices.clear();
9907 }
9908 }
9909 switch (E.getOpcode()) {
9910 case Instruction::Load: {
9911 // No need to reorder masked gather loads, just reorder the scalar
9912 // operands.
9913 if (E.State != TreeEntry::Vectorize)
9914 break;
9915 Type *ScalarTy = E.getMainOp()->getType();
9916 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9917 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9918 // Check if profitable to represent consecutive load + reverse as strided
9919 // load with stride -1.
9920 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9921 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9922 SmallVector<int> Mask;
9923 inversePermutation(E.ReorderIndices, Mask);
9924 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9925 InstructionCost OriginalVecCost =
9926 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
9927 BaseLI->getPointerAddressSpace(), CostKind,
9928 TTI::OperandValueInfo()) +
9929 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
9930 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
9931 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9932 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
9933 if (StridedCost < OriginalVecCost)
9934 // Strided load is more profitable than consecutive load + reverse -
9935 // transform the node to strided load.
9936 E.State = TreeEntry::StridedVectorize;
9937 }
9938 break;
9939 }
9940 case Instruction::Store: {
9941 Type *ScalarTy =
9942 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9943 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9944 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
9945 // Check if profitable to represent consecutive stores + reverse as a
9946 // strided store with stride -1.
9947 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9948 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9949 SmallVector<int> Mask;
9950 inversePermutation(E.ReorderIndices, Mask);
9951 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
9952 InstructionCost OriginalVecCost =
9953 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
9954 BaseSI->getPointerAddressSpace(), CostKind,
9955 TTI::OperandValueInfo()) +
9956 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
9957 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
9958 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9959 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
9960 if (StridedCost < OriginalVecCost)
9961 // Strided store is more profitable than reverse + consecutive store -
9962 // transform the node to strided store.
9963 E.State = TreeEntry::StridedVectorize;
9964 } else if (!E.ReorderIndices.empty()) {
9965 // Check for interleaved stores.
9966 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
9967 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
9968 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
9969 if (Mask.size() < 4)
9970 return 0u;
9971 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
9972 if (ShuffleVectorInst::isInterleaveMask(
9973 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
9974 TTI.isLegalInterleavedAccessType(
9975 VecTy, Factor, BaseSI->getAlign(),
9976 BaseSI->getPointerAddressSpace()))
9977 return Factor;
9978 }
9979
9980 return 0u;
9981 };
9982 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9983 unsigned InterleaveFactor = IsInterleaveMask(Mask);
9984 if (InterleaveFactor != 0)
9985 E.setInterleave(InterleaveFactor);
9986 }
9987 break;
9988 }
9989 case Instruction::Select: {
9990 if (E.State != TreeEntry::Vectorize)
9991 break;
9992 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
9993 if (MinMaxID == Intrinsic::not_intrinsic)
9994 break;
9995 // This node is a minmax node.
9996 E.CombinedOp = TreeEntry::MinMax;
9997 TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
9998 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
9999 CondEntry->State == TreeEntry::Vectorize) {
10000 // The condition node is part of the combined minmax node.
10001 CondEntry->State = TreeEntry::CombinedVectorize;
10002 }
10003 break;
10004 }
10005 default:
10006 break;
10007 }
10008 }
10009
10010 if (LoadEntriesToVectorize.empty()) {
10011 // Single load node - exit.
10012 if (VectorizableTree.size() <= 1 &&
10013 VectorizableTree.front()->getOpcode() == Instruction::Load)
10014 return;
10015 // Small graph with small VF - exit.
10016 constexpr unsigned SmallTree = 3;
10017 constexpr unsigned SmallVF = 2;
10018 if ((VectorizableTree.size() <= SmallTree &&
10019 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10020 (VectorizableTree.size() <= 2 && UserIgnoreList))
10021 return;
10022
10023 if (VectorizableTree.front()->isNonPowOf2Vec() &&
10024 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
10025 getCanonicalGraphSize() <= SmallTree &&
10026 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
10027 [](const std::unique_ptr<TreeEntry> &TE) {
10028 return TE->isGather() &&
10029 TE->getOpcode() == Instruction::Load &&
10030 !allSameBlock(TE->Scalars);
10031 }) == 1)
10032 return;
10033 }
10034
10035 // A list of loads to be gathered during the vectorization process. We can
10036 // try to vectorize them at the end, if profitable.
10037 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
10038 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
10039 GatheredLoads;
10040
10041 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10042 TreeEntry &E = *TE;
10043 if (E.isGather() &&
10044 (E.getOpcode() == Instruction::Load ||
10045 (!E.getOpcode() && any_of(E.Scalars,
10046 [&](Value *V) {
10047 return isa<LoadInst>(V) &&
10048 !isVectorized(V) &&
10049 !isDeleted(cast<Instruction>(V));
10050 }))) &&
10051 !isSplat(E.Scalars)) {
10052 for (Value *V : E.Scalars) {
10053 auto *LI = dyn_cast<LoadInst>(V);
10054 if (!LI)
10055 continue;
10056 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
10057 continue;
10058 gatherPossiblyVectorizableLoads(
10059 *this, V, *DL, *SE, *TTI,
10060 GatheredLoads[std::make_tuple(
10061 LI->getParent(),
10062 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
10063 LI->getType())]);
10064 }
10065 }
10066 }
10067 // Try to vectorize gathered loads if this is not just a gather of loads.
10068 if (!GatheredLoads.empty())
10069 tryToVectorizeGatheredLoads(GatheredLoads);
10070}
10071
10072/// Merges shuffle masks and emits final shuffle instruction, if required. It
10073 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
10074 /// the actual shuffle instruction is generated only if it is actually
10075 /// required. Otherwise, the shuffle instruction emission is delayed till the
10076/// end of the process, to reduce the number of emitted instructions and further
10077/// analysis/transformations.
10078class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
10079 bool IsFinalized = false;
10080 SmallVector<int> CommonMask;
10081 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
10082 const TargetTransformInfo &TTI;
10083 InstructionCost Cost = 0;
10084 SmallDenseSet<Value *> VectorizedVals;
10085 BoUpSLP &R;
10086 SmallPtrSetImpl<Value *> &CheckedExtracts;
10087 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10088 /// While set, we are still trying to estimate the cost for the same nodes and
10089 /// can delay the actual cost estimation (virtual shuffle instruction emission).
10090 /// May help better estimate the cost if same nodes must be permuted + allows
10091 /// to move most of the long shuffles cost estimation to TTI.
10092 bool SameNodesEstimated = true;
10093
10094 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
10095 if (Ty->getScalarType()->isPointerTy()) {
10096 Constant *Res = ConstantExpr::getIntToPtr(
10097 Constant::getAllOnesValue(
10098 IntegerType::get(Ty->getContext(),
10099 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
10100 Ty->getScalarType());
10101 if (auto *VTy = dyn_cast<VectorType>(Ty))
10102 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
10103 return Res;
10104 }
10105 return Constant::getAllOnesValue(Ty);
10106 }
10107
10108 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
10109 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
10110 return TTI::TCC_Free;
10111 auto *VecTy = getWidenedType(ScalarTy, VL.size());
10112 InstructionCost GatherCost = 0;
10113 SmallVector<Value *> Gathers(VL);
10114 if (!Root && isSplat(VL)) {
10115 // Found the broadcasting of the single scalar, calculate the cost as
10116 // the broadcast.
10117 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
10118 assert(It != VL.end() && "Expected at least one non-undef value.");
10119 // Add broadcast for non-identity shuffle only.
10120 bool NeedShuffle =
10121 count(VL, *It) > 1 &&
10122 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
10123 if (!NeedShuffle) {
10124 if (isa<FixedVectorType>(ScalarTy)) {
10125 assert(SLPReVec && "FixedVectorType is not expected.");
10126 return TTI.getShuffleCost(
10127 TTI::SK_InsertSubvector, VecTy, {}, CostKind,
10128 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
10129 cast<FixedVectorType>(ScalarTy));
10130 }
10131 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
10132 CostKind, std::distance(VL.begin(), It),
10133 PoisonValue::get(VecTy), *It);
10134 }
10135
10136 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10137 transform(VL, ShuffleMask.begin(), [](Value *V) {
10138 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10139 });
10140 InstructionCost InsertCost =
10141 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
10142 PoisonValue::get(VecTy), *It);
10143 return InsertCost + ::getShuffleCost(TTI,
10144 TTI::SK_Broadcast,
10145 VecTy, ShuffleMask, CostKind,
10146 /*Index=*/0, /*SubTp=*/nullptr,
10147 /*Args=*/*It);
10148 }
10149 return GatherCost +
10150 (all_of(Gathers, IsaPred<UndefValue>)
10151 ? TTI::TCC_Free
10152 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
10153 ScalarTy));
10154 };
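// For example, gathering the splat {%x, %x, %x, %x} of a non-constant %x is
// costed as one insertelement plus a broadcast shuffle, while
// {%x, undef, undef, undef} needs no shuffle and is charged only the single
// insertelement.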
10155
10156 /// Compute the cost of creating a vector containing the extracted values from
10157 /// \p VL.
10158 InstructionCost
10159 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
10160 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10161 unsigned NumParts) {
10162 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
10163 unsigned NumElts =
10164 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
10165 auto *EE = dyn_cast<ExtractElementInst>(V);
10166 if (!EE)
10167 return Sz;
10168 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10169 if (!VecTy)
10170 return Sz;
10171 return std::max(Sz, VecTy->getNumElements());
10172 });
10173 // FIXME: this must be moved to TTI for better estimation.
10174 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
10175 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
10176 SmallVectorImpl<unsigned> &Indices)
10177 -> std::optional<TTI::ShuffleKind> {
10178 if (NumElts <= EltsPerVector)
10179 return std::nullopt;
10180 int OffsetReg0 =
10181 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10182 [](int S, int I) {
10183 if (I == PoisonMaskElem)
10184 return S;
10185 return std::min(S, I);
10186 }),
10187 EltsPerVector);
10188 int OffsetReg1 = OffsetReg0;
10189 DenseSet<int> RegIndices;
10190 // Check if we are trying to permute the same single/2 input vectors.
10191 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
10192 int FirstRegId = -1;
10193 Indices.assign(1, OffsetReg0);
10194 for (auto [Pos, I] : enumerate(Mask)) {
10195 if (I == PoisonMaskElem)
10196 continue;
10197 int Idx = I - OffsetReg0;
10198 int RegId =
10199 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
10200 if (FirstRegId < 0)
10201 FirstRegId = RegId;
10202 RegIndices.insert(RegId);
10203 if (RegIndices.size() > 2)
10204 return std::nullopt;
10205 if (RegIndices.size() == 2) {
10206 ShuffleKind = TTI::SK_PermuteTwoSrc;
10207 if (Indices.size() == 1) {
10208 OffsetReg1 = alignDown(
10209 std::accumulate(
10210 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10211 [&](int S, int I) {
10212 if (I == PoisonMaskElem)
10213 return S;
10214 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10215 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10216 if (RegId == FirstRegId)
10217 return S;
10218 return std::min(S, I);
10219 }),
10220 EltsPerVector);
10221 Indices.push_back(OffsetReg1 % NumElts);
10222 }
10223 Idx = I - OffsetReg1;
10224 }
10225 I = (Idx % NumElts) % EltsPerVector +
10226 (RegId == FirstRegId ? 0 : EltsPerVector);
10227 }
10228 return ShuffleKind;
10229 };
10230 InstructionCost Cost = 0;
10231
10232 // Process extracts in blocks of EltsPerVector to check if the source vector
10233 // operand can be re-used directly. If not, add the cost of creating a
10234 // shuffle to extract the values into a vector register.
10235 for (unsigned Part : seq<unsigned>(NumParts)) {
10236 if (!ShuffleKinds[Part])
10237 continue;
10238 ArrayRef<int> MaskSlice = Mask.slice(
10239 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
10240 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
10241 copy(MaskSlice, SubMask.begin());
10242 SmallVector<unsigned, 2> Indices;
10243 std::optional<TTI::ShuffleKind> RegShuffleKind =
10244 CheckPerRegistersShuffle(SubMask, Indices);
10245 if (!RegShuffleKind) {
10246 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
10247 !ShuffleVectorInst::isIdentityMask(
10248 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
10249 Cost +=
10250 ::getShuffleCost(TTI, *ShuffleKinds[Part],
10251 getWidenedType(ScalarTy, NumElts), MaskSlice);
10252 continue;
10253 }
10254 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
10255 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
10256 Cost +=
10257 ::getShuffleCost(TTI, *RegShuffleKind,
10258 getWidenedType(ScalarTy, EltsPerVector), SubMask);
10259 }
10260 const unsigned BaseVF = getFullVectorNumberOfElements(
10261 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
10262 for (unsigned Idx : Indices) {
10263 assert((Idx + EltsPerVector) <= BaseVF &&
10264 "SK_ExtractSubvector index out of range");
10265 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
10266 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
10267 Idx, getWidenedType(ScalarTy, EltsPerVector));
10268 }
10269 // Second attempt to check, if just a permute is better estimated than
10270 // subvector extract.
10271 SubMask.assign(NumElts, PoisonMaskElem);
10272 copy(MaskSlice, SubMask.begin());
10273 InstructionCost OriginalCost = ::getShuffleCost(
10274 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
10275 if (OriginalCost < Cost)
10276 Cost = OriginalCost;
10277 }
10278 return Cost;
10279 }
10280 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
10281 /// shuffle emission.
10282 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
10283 ArrayRef<int> Mask) {
10284 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10285 if (Mask[Idx] != PoisonMaskElem)
10286 CommonMask[Idx] = Idx;
10287 }
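// E.g. after a shuffle has been emitted for CommonMask = <3, poison, 1, poison>,
// the mask is rewritten to <0, poison, 2, poison>: the emitted vector already
// holds the requested elements in place, so later shuffles just pass the lanes
// through.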
10288 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
10289 /// mask \p Mask, register number \p Part, that includes \p SliceSize
10290 /// elements.
10291 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
10292 ArrayRef<int> Mask, unsigned Part,
10293 unsigned SliceSize) {
10294 if (SameNodesEstimated) {
10295 // Delay the cost estimation if the same nodes are reshuffling.
10296 // If we already requested the cost of reshuffling of E1 and E2 before, no
10297 // need to estimate another cost with the sub-Mask, instead include this
10298 // sub-Mask into the CommonMask to estimate it later and avoid double cost
10299 // estimation.
10300 if ((InVectors.size() == 2 &&
10301 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
10302 cast<const TreeEntry *>(InVectors.back()) == E2) ||
10303 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
10304 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
10305 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
10306 [](int Idx) { return Idx == PoisonMaskElem; }) &&
10307 "Expected all poisoned elements.");
10308 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
10309 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
10310 return;
10311 }
10312 // Found non-matching nodes - need to estimate the cost for the matched
10313 // and transform mask.
10314 Cost += createShuffle(InVectors.front(),
10315 InVectors.size() == 1 ? nullptr : InVectors.back(),
10316 CommonMask);
10317 transformMaskAfterShuffle(CommonMask, CommonMask);
10318 } else if (InVectors.size() == 2) {
10319 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10320 transformMaskAfterShuffle(CommonMask, CommonMask);
10321 }
10322 SameNodesEstimated = false;
10323 if (!E2 && InVectors.size() == 1) {
10324 unsigned VF = E1.getVectorFactor();
10325 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
10326 VF = std::max(VF,
10327 cast<FixedVectorType>(V1->getType())->getNumElements());
10328 } else {
10329 const auto *E = cast<const TreeEntry *>(InVectors.front());
10330 VF = std::max(VF, E->getVectorFactor());
10331 }
10332 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10333 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10334 CommonMask[Idx] = Mask[Idx] + VF;
10335 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
10336 transformMaskAfterShuffle(CommonMask, CommonMask);
10337 } else {
10338 auto P = InVectors.front();
10339 Cost += createShuffle(&E1, E2, Mask);
10340 unsigned VF = Mask.size();
10341 if (Value *V1 = P.dyn_cast<Value *>()) {
10342 VF = std::max(VF,
10343 getNumElements(V1->getType()));
10344 } else {
10345 const auto *E = cast<const TreeEntry *>(P);
10346 VF = std::max(VF, E->getVectorFactor());
10347 }
10348 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10349 if (Mask[Idx] != PoisonMaskElem)
10350 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
10351 Cost += createShuffle(P, InVectors.front(), CommonMask);
10352 transformMaskAfterShuffle(CommonMask, CommonMask);
10353 }
10354 }
10355
10356 class ShuffleCostBuilder {
10357 const TargetTransformInfo &TTI;
10358
10359 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
10360 int Index = -1;
10361 return Mask.empty() ||
10362 (VF == Mask.size() &&
10363 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
10364 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
10365 Index == 0);
10366 }
10367
10368 public:
10369 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
10370 ~ShuffleCostBuilder() = default;
10371 InstructionCost createShuffleVector(Value *V1, Value *,
10372 ArrayRef<int> Mask) const {
10373 // Empty mask or identity mask are free.
10374 unsigned VF =
10375 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10376 if (isEmptyOrIdentity(Mask, VF))
10377 return TTI::TCC_Free;
10378 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10379 cast<VectorType>(V1->getType()), Mask);
10380 }
10381 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
10382 // Empty mask or identity mask are free.
10383 unsigned VF =
10384 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10385 if (isEmptyOrIdentity(Mask, VF))
10386 return TTI::TCC_Free;
10387 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
10388 cast<VectorType>(V1->getType()), Mask);
10389 }
10390 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
10391 InstructionCost createPoison(Type *Ty, unsigned VF) const {
10392 return TTI::TCC_Free;
10393 }
10394 void resizeToMatch(Value *&, Value *&) const {}
10395 };
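// Note on the builder above: empty and identity masks (e.g. <0, 1, 2, 3> over a
// 4-element vector) are deliberately modelled as free, so the lazy shuffle
// emission only pays for permutations that would really be materialized.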
10396
10397 /// Smart shuffle instruction emission, walks through shuffles trees and
10398 /// tries to find the best matching vector for the actual shuffle
10399 /// instruction.
10400 InstructionCost
10401 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
10402 const PointerUnion<Value *, const TreeEntry *> &P2,
10403 ArrayRef<int> Mask) {
10404 ShuffleCostBuilder Builder(TTI);
10405 SmallVector<int> CommonMask(Mask);
10406 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
10407 unsigned CommonVF = Mask.size();
10408 InstructionCost ExtraCost = 0;
10409 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
10410 unsigned VF) -> InstructionCost {
10411 if (E.isGather() && allConstant(E.Scalars))
10412 return TTI::TCC_Free;
10413 Type *EScalarTy = E.Scalars.front()->getType();
10414 bool IsSigned = true;
10415 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10416 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
10417 IsSigned = It->second.second;
10418 }
10419 if (EScalarTy != ScalarTy) {
10420 unsigned CastOpcode = Instruction::Trunc;
10421 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10422 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10423 if (DstSz > SrcSz)
10424 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10425 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
10426 getWidenedType(EScalarTy, VF),
10427 TTI::CastContextHint::None, CostKind);
10428 }
10429 return TTI::TCC_Free;
10430 };
10431 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
10432 if (isa<Constant>(V))
10433 return TTI::TCC_Free;
10434 auto *VecTy = cast<VectorType>(V->getType());
10435 Type *EScalarTy = VecTy->getElementType();
10436 if (EScalarTy != ScalarTy) {
10437 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
10438 unsigned CastOpcode = Instruction::Trunc;
10439 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10440 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10441 if (DstSz > SrcSz)
10442 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10443 return TTI.getCastInstrCost(
10444 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
10445 VecTy, TTI::CastContextHint::None, CostKind);
10446 }
10447 return TTI::TCC_Free;
10448 };
10449 if (!V1 && !V2 && !P2.isNull()) {
10450 // Shuffle 2 entry nodes.
10451 const TreeEntry *E = cast<const TreeEntry *>(P1);
10452 unsigned VF = E->getVectorFactor();
10453 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10454 CommonVF = std::max(VF, E2->getVectorFactor());
10455 assert(all_of(Mask,
10456 [=](int Idx) {
10457 return Idx < 2 * static_cast<int>(CommonVF);
10458 }) &&
10459 "All elements in mask must be less than 2 * CommonVF.");
10460 if (E->Scalars.size() == E2->Scalars.size()) {
10461 SmallVector<int> EMask = E->getCommonMask();
10462 SmallVector<int> E2Mask = E2->getCommonMask();
10463 if (!EMask.empty() || !E2Mask.empty()) {
10464 for (int &Idx : CommonMask) {
10465 if (Idx == PoisonMaskElem)
10466 continue;
10467 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
10468 Idx = EMask[Idx];
10469 else if (Idx >= static_cast<int>(CommonVF))
10470 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
10471 E->Scalars.size();
10472 }
10473 }
10474 CommonVF = E->Scalars.size();
10475 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10476 GetNodeMinBWAffectedCost(*E2, CommonVF);
10477 } else {
10478 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10479 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10480 }
10481 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10482 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10483 } else if (!V1 && P2.isNull()) {
10484 // Shuffle single entry node.
10485 const TreeEntry *E = cast<const TreeEntry *>(P1);
10486 unsigned VF = E->getVectorFactor();
10487 CommonVF = VF;
10488 assert(
10489 all_of(Mask,
10490 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10491 "All elements in mask must be less than CommonVF.");
10492 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10493 SmallVector<int> EMask = E->getCommonMask();
10494 assert(!EMask.empty() && "Expected non-empty common mask.");
10495 for (int &Idx : CommonMask) {
10496 if (Idx != PoisonMaskElem)
10497 Idx = EMask[Idx];
10498 }
10499 CommonVF = E->Scalars.size();
10500 } else if (unsigned Factor = E->getInterleaveFactor();
10501 Factor > 0 && E->Scalars.size() != Mask.size() &&
10502 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
10503 Factor)) {
10504 // Deinterleaved nodes are free.
10505 std::iota(CommonMask.begin(), CommonMask.end(), 0);
10506 }
10507 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10508 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10509 // Not identity/broadcast? Try to see if the original vector is better.
10510 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10511 CommonVF == CommonMask.size() &&
10512 any_of(enumerate(CommonMask),
10513 [](const auto &&P) {
10514 return P.value() != PoisonMaskElem &&
10515 static_cast<unsigned>(P.value()) != P.index();
10516 }) &&
10517 any_of(CommonMask,
10518 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
10519 SmallVector<int> ReorderMask;
10520 inversePermutation(E->ReorderIndices, ReorderMask);
10521 ::addMask(CommonMask, ReorderMask);
10522 }
10523 } else if (V1 && P2.isNull()) {
10524 // Shuffle single vector.
10525 ExtraCost += GetValueMinBWAffectedCost(V1);
10526 CommonVF = getVF(V1);
10527 assert(
10528 all_of(Mask,
10529 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10530 "All elements in mask must be less than CommonVF.");
10531 } else if (V1 && !V2) {
10532 // Shuffle vector and tree node.
10533 unsigned VF = getVF(V1);
10534 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10535 CommonVF = std::max(VF, E2->getVectorFactor());
10536 assert(all_of(Mask,
10537 [=](int Idx) {
10538 return Idx < 2 * static_cast<int>(CommonVF);
10539 }) &&
10540 "All elements in mask must be less than 2 * CommonVF.");
10541 if (E2->Scalars.size() == VF && VF != CommonVF) {
10542 SmallVector<int> E2Mask = E2->getCommonMask();
10543 assert(!E2Mask.empty() && "Expected non-empty common mask.");
10544 for (int &Idx : CommonMask) {
10545 if (Idx == PoisonMaskElem)
10546 continue;
10547 if (Idx >= static_cast<int>(CommonVF))
10548 Idx = E2Mask[Idx - CommonVF] + VF;
10549 }
10550 CommonVF = VF;
10551 }
10552 ExtraCost += GetValueMinBWAffectedCost(V1);
10553 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10554 ExtraCost += GetNodeMinBWAffectedCost(
10555 *E2, std::min(CommonVF, E2->getVectorFactor()));
10556 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10557 } else if (!V1 && V2) {
10558 // Shuffle vector and tree node.
10559 unsigned VF = getVF(V2);
10560 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10561 CommonVF = std::max(VF, E1->getVectorFactor());
10562 assert(all_of(Mask,
10563 [=](int Idx) {
10564 return Idx < 2 * static_cast<int>(CommonVF);
10565 }) &&
10566 "All elements in mask must be less than 2 * CommonVF.");
10567 if (E1->Scalars.size() == VF && VF != CommonVF) {
10568 SmallVector<int> E1Mask = E1->getCommonMask();
10569 assert(!E1Mask.empty() && "Expected non-empty common mask.");
10570 for (int &Idx : CommonMask) {
10571 if (Idx == PoisonMaskElem)
10572 continue;
10573 if (Idx >= static_cast<int>(CommonVF))
10574 Idx = E1Mask[Idx - CommonVF] + VF;
10575 else
10576 Idx = E1Mask[Idx];
10577 }
10578 CommonVF = VF;
10579 }
10580 ExtraCost += GetNodeMinBWAffectedCost(
10581 *E1, std::min(CommonVF, E1->getVectorFactor()));
10582 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10583 ExtraCost += GetValueMinBWAffectedCost(V2);
10584 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10585 } else {
10586 assert(V1 && V2 && "Expected both vectors.");
10587 unsigned VF = getVF(V1);
10588 CommonVF = std::max(VF, getVF(V2));
10589 assert(all_of(Mask,
10590 [=](int Idx) {
10591 return Idx < 2 * static_cast<int>(CommonVF);
10592 }) &&
10593 "All elements in mask must be less than 2 * CommonVF.");
10594 ExtraCost +=
10595 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10596 if (V1->getType() != V2->getType()) {
10597 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10598 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10599 } else {
10600 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
10601 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10602 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10603 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10604 }
10605 }
10606 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
10607 assert(SLPReVec && "FixedVectorType is not expected.");
10608 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
10609 CommonMask);
10610 }
10611 InVectors.front() =
10612 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10613 if (InVectors.size() == 2)
10614 InVectors.pop_back();
10615 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10616 V1, V2, CommonMask, Builder);
10617 }
10618
10619public:
10620 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
10621 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
10622 SmallPtrSetImpl<Value *> &CheckedExtracts)
10623 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
10624 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10625 CheckedExtracts(CheckedExtracts) {}
10626 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10627 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10628 unsigned NumParts, bool &UseVecBaseAsInput) {
10629 UseVecBaseAsInput = false;
10630 if (Mask.empty())
10631 return nullptr;
10632 Value *VecBase = nullptr;
10633 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
10634 if (!E->ReorderIndices.empty()) {
10635 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
10636 E->ReorderIndices.end());
10637 reorderScalars(VL, ReorderMask);
10638 }
10639 // Check if these extracts can be treated as reused if the same
10640 // extractelements were already vectorized.
10641 bool PrevNodeFound = any_of(
10642 ArrayRef(R.VectorizableTree).take_front(E->Idx),
10643 [&](const std::unique_ptr<TreeEntry> &TE) {
10644 return ((!TE->isAltShuffle() &&
10645 TE->getOpcode() == Instruction::ExtractElement) ||
10646 TE->isGather()) &&
10647 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10648 return VL.size() > Data.index() &&
10649 (Mask[Data.index()] == PoisonMaskElem ||
10650 isa<UndefValue>(VL[Data.index()]) ||
10651 Data.value() == VL[Data.index()]);
10652 });
10653 });
10654 SmallPtrSet<Value *, 4> UniqueBases;
10655 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10656 for (unsigned Part : seq<unsigned>(NumParts)) {
10657 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
10658 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10659 for (auto [I, V] :
10660 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
10661 // Ignore non-extractelement scalars.
10662 if (isa<UndefValue>(V) ||
10663 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
10664 continue;
10665 // If all users of the instruction are going to be vectorized and the
10666 // instruction itself is not going to be vectorized, consider the
10667 // instruction dead and remove its cost from the final cost of the
10668 // vectorized tree.
10669 // Also, avoid adjusting the cost for extractelements with multiple uses
10670 // in different graph entries.
10671 auto *EE = cast<ExtractElementInst>(V);
10672 VecBase = EE->getVectorOperand();
10673 UniqueBases.insert(VecBase);
10674 const TreeEntry *VE = R.getTreeEntry(V);
10675 if (!CheckedExtracts.insert(V).second ||
10676 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10677 any_of(EE->users(),
10678 [&](User *U) {
10679 return isa<GetElementPtrInst>(U) &&
10680 !R.areAllUsersVectorized(cast<Instruction>(U),
10681 &VectorizedVals);
10682 }) ||
10683 (VE && VE != E))
10684 continue;
10685 std::optional<unsigned> EEIdx = getExtractIndex(EE);
10686 if (!EEIdx)
10687 continue;
10688 unsigned Idx = *EEIdx;
10689 // Take credit for instruction that will become dead.
10690 if (EE->hasOneUse() || !PrevNodeFound) {
10691 Instruction *Ext = EE->user_back();
10692 if (isa<SExtInst, ZExtInst>(Ext) &&
10693 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10694 // Use getExtractWithExtendCost() to calculate the cost of
10695 // extractelement/ext pair.
10696 Cost -=
10697 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
10698 EE->getVectorOperandType(), Idx);
10699 // Add back the cost of s|zext which is subtracted separately.
10701 Ext->getOpcode(), Ext->getType(), EE->getType(),
10702 TTI::getCastContextHint(Ext), CostKind, Ext);
10703 continue;
10704 }
10705 }
10706 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10707 CostKind, Idx);
10708 }
10709 }
10710 // Check that the gather of extractelements can be represented as just a
10711 // shuffle of one or two vectors from which the scalars are extracted.
10712 // We found a bunch of extractelement instructions that must be gathered
10713 // into a vector and can be represented as a permutation of the elements
10714 // of one or two input vectors.
10715 // The cost is skipped if the same extractelements were already vectorized.
10716 if (!PrevNodeFound)
10717 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10718 InVectors.assign(1, E);
10719 CommonMask.assign(Mask.begin(), Mask.end());
10720 transformMaskAfterShuffle(CommonMask, CommonMask);
10721 SameNodesEstimated = false;
10722 if (NumParts != 1 && UniqueBases.size() != 1) {
10723 UseVecBaseAsInput = true;
10724 VecBase =
10725 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10726 }
10727 return VecBase;
10728 }
10729 /// Checks if the specified entry \p E needs to be delayed because of its
10730 /// dependency nodes.
10731 std::optional<InstructionCost>
10732 needToDelay(const TreeEntry *,
10734 // No need to delay the cost estimation during analysis.
10735 return std::nullopt;
10736 }
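  /// Adds 2 tree entries and the mask for shuffling them into a single vector.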
10737 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
10738 if (&E1 == &E2) {
10739 assert(all_of(Mask,
10740 [&](int Idx) {
10741 return Idx < static_cast<int>(E1.getVectorFactor());
10742 }) &&
10743 "Expected single vector shuffle mask.");
10744 add(E1, Mask);
10745 return;
10746 }
10747 if (InVectors.empty()) {
10748 CommonMask.assign(Mask.begin(), Mask.end());
10749 InVectors.assign({&E1, &E2});
10750 return;
10751 }
10752 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10753 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10754 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10755 if (NumParts == 0 || NumParts >= Mask.size() ||
10756 MaskVecTy->getNumElements() % NumParts != 0 ||
10757 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10758 MaskVecTy->getNumElements() / NumParts))
10759 NumParts = 1;
10760 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10761 const auto *It =
10762 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10763 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10764 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10765 }
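  /// Adds a single tree entry and the mask for its shuffle.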
10766 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
10767 if (InVectors.empty()) {
10768 CommonMask.assign(Mask.begin(), Mask.end());
10769 InVectors.assign(1, &E1);
10770 return;
10771 }
10772 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10773 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10774 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10775 if (NumParts == 0 || NumParts >= Mask.size() ||
10776 MaskVecTy->getNumElements() % NumParts != 0 ||
10777 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10778 MaskVecTy->getNumElements() / NumParts))
10779 NumParts = 1;
10780 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10781 const auto *It =
10782 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10783 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10784 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
10785 if (!SameNodesEstimated && InVectors.size() == 1)
10786 InVectors.emplace_back(&E1);
10787 }
10788 /// Adds 2 input vectors and the mask for their shuffling.
10789 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
10790 // This may only come up for shuffling of 2 vectors with extractelements,
10791 // already handled in adjustExtracts.
10792 assert(InVectors.size() == 1 &&
10793 all_of(enumerate(CommonMask),
10794 [&](auto P) {
10795 if (P.value() == PoisonMaskElem)
10796 return Mask[P.index()] == PoisonMaskElem;
10797 auto *EI = cast<ExtractElementInst>(
10798 cast<const TreeEntry *>(InVectors.front())
10799 ->getOrdered(P.index()));
10800 return EI->getVectorOperand() == V1 ||
10801 EI->getVectorOperand() == V2;
10802 }) &&
10803 "Expected extractelement vectors.");
10804 }
10805 /// Adds one more input vector and the mask for the shuffling.
10806 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
10807 if (InVectors.empty()) {
10808 assert(CommonMask.empty() && !ForExtracts &&
10809 "Expected empty input mask/vectors.");
10810 CommonMask.assign(Mask.begin(), Mask.end());
10811 InVectors.assign(1, V1);
10812 return;
10813 }
10814 if (ForExtracts) {
10815 // No need to add vectors here; they were already handled in adjustExtracts.
10816 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10817 !CommonMask.empty() &&
10818 all_of(enumerate(CommonMask),
10819 [&](auto P) {
10820 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10821 ->getOrdered(P.index());
10822 if (P.value() == PoisonMaskElem)
10823 return P.value() == Mask[P.index()] ||
10824 isa<UndefValue>(Scalar);
10825 if (isa<Constant>(V1))
10826 return true;
10827 auto *EI = cast<ExtractElementInst>(Scalar);
10828 return EI->getVectorOperand() == V1;
10829 }) &&
10830 "Expected only tree entry for extractelement vectors.");
10831 return;
10832 }
10833 assert(!InVectors.empty() && !CommonMask.empty() &&
10834 "Expected only tree entries from extracts/reused buildvectors.");
10835 unsigned VF = getVF(V1);
10836 if (InVectors.size() == 2) {
10837 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10838 transformMaskAfterShuffle(CommonMask, CommonMask);
10839 VF = std::max<unsigned>(VF, CommonMask.size());
10840 } else if (const auto *InTE =
10841 InVectors.front().dyn_cast<const TreeEntry *>()) {
10842 VF = std::max(VF, InTE->getVectorFactor());
10843 } else {
10844 VF = std::max(
10845 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
10846 ->getNumElements());
10847 }
10848 InVectors.push_back(V1);
10849 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10850 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10851 CommonMask[Idx] = Mask[Idx] + VF;
10852 }
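  /// Accounts for the cost of building a vector from the scalars in \p VL and
  /// returns a constant stand-in of the widened type; the concrete constant
  /// values are irrelevant for cost estimation, only the type matters.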
10853 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
10854 Value *Root = nullptr) {
10855 Cost += getBuildVectorCost(VL, Root);
10856 if (!Root) {
10857 // FIXME: Need to find a way to avoid use of getNullValue here.
10859 unsigned VF = VL.size();
10860 if (MaskVF != 0)
10861 VF = std::min(VF, MaskVF);
10862 for (Value *V : VL.take_front(VF)) {
10863 if (isa<UndefValue>(V)) {
10864 Vals.push_back(cast<Constant>(V));
10865 continue;
10866 }
10867 Vals.push_back(Constant::getNullValue(V->getType()));
10868 }
10869 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
10870 assert(SLPReVec && "FixedVectorType is not expected.");
10871 // When REVEC is enabled, we need to expand vector types into scalar
10872 // types.
10873 unsigned VecTyNumElements = VecTy->getNumElements();
10874 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
10875 for (auto [I, V] : enumerate(Vals)) {
10876 Type *ScalarTy = V->getType()->getScalarType();
10877 Constant *NewVal;
10878 if (isa<PoisonValue>(V))
10879 NewVal = PoisonValue::get(ScalarTy);
10880 else if (isa<UndefValue>(V))
10881 NewVal = UndefValue::get(ScalarTy);
10882 else
10883 NewVal = Constant::getNullValue(ScalarTy);
10884 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
10885 NewVal);
10886 }
10887 Vals.swap(NewVals);
10888 }
10889 return ConstantVector::get(Vals);
10890 }
10893 cast<FixedVectorType>(Root->getType())->getNumElements()),
10894 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
10895 }
10897 /// Finalize emission of the shuffles.
10900 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10901 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
10902 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
10903 IsFinalized = true;
10904 if (Action) {
10905 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10906 if (InVectors.size() == 2)
10907 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10908 else
10909 Cost += createShuffle(Vec, nullptr, CommonMask);
10910 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10911 if (CommonMask[Idx] != PoisonMaskElem)
10912 CommonMask[Idx] = Idx;
10913 assert(VF > 0 &&
10914 "Expected vector length for the final value before action.");
10915 Value *V = cast<Value *>(Vec);
10916 Action(V, CommonMask);
10917 InVectors.front() = V;
10918 }
10919 if (!SubVectors.empty()) {
10920 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10921 if (InVectors.size() == 2)
10922 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10923 else
10924 Cost += createShuffle(Vec, nullptr, CommonMask);
10925 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10926 if (CommonMask[Idx] != PoisonMaskElem)
10927 CommonMask[Idx] = Idx;
10928 // Add subvectors permutation cost.
10929 if (!SubVectorsMask.empty()) {
10930 assert(SubVectorsMask.size() <= CommonMask.size() &&
10931 "Expected same size of masks for subvectors and common mask.");
10932 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
10933 copy(SubVectorsMask, SVMask.begin());
10934 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
10935 if (I2 != PoisonMaskElem) {
10936 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
10937 I1 = I2 + CommonMask.size();
10938 }
10939 }
10941 getWidenedType(ScalarTy, CommonMask.size()),
10942 SVMask, CostKind);
10943 }
10944 for (auto [E, Idx] : SubVectors) {
10945 Type *EScalarTy = E->Scalars.front()->getType();
10946 bool IsSigned = true;
10947 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
10948 EScalarTy =
10949 IntegerType::get(EScalarTy->getContext(), It->second.first);
10950 IsSigned = It->second.second;
10951 }
10952 if (ScalarTy != EScalarTy) {
10953 unsigned CastOpcode = Instruction::Trunc;
10954 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10955 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10956 if (DstSz > SrcSz)
10957 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10959 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
10960 getWidenedType(EScalarTy, E->getVectorFactor()),
10962 }
10965 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
10966 getWidenedType(ScalarTy, E->getVectorFactor()));
10967 if (!CommonMask.empty()) {
10968 std::iota(std::next(CommonMask.begin(), Idx),
10969 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
10970 Idx);
10971 }
10972 }
10973 }
10974
10975 if (!ExtMask.empty()) {
10976 if (CommonMask.empty()) {
10977 CommonMask.assign(ExtMask.begin(), ExtMask.end());
10978 } else {
10979 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
10980 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
10981 if (ExtMask[I] == PoisonMaskElem)
10982 continue;
10983 NewMask[I] = CommonMask[ExtMask[I]];
10984 }
10985 CommonMask.swap(NewMask);
10986 }
10987 }
10988 if (CommonMask.empty()) {
10989 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
10990 return Cost;
10991 }
10992 return Cost +
10993 createShuffle(InVectors.front(),
10994 InVectors.size() == 2 ? InVectors.back() : nullptr,
10995 CommonMask);
10996 }
10997
10999 assert((IsFinalized || CommonMask.empty()) &&
11000 "Shuffle construction must be finalized.");
11001 }
11002};
11003
11004const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
11005 unsigned Idx) const {
11006 if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
11007 return VE;
11008 const auto *It =
11009 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11010 return TE->isGather() &&
11011 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
11012 return EI.EdgeIdx == Idx && EI.UserTE == E;
11013 }) != TE->UserTreeIndices.end();
11014 });
11015 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
11016 return It->get();
11017}
11018
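/// Selects the TTI cast context hint for the tree entry \p TE based on its
/// vectorization state (scatter/strided vs. consecutive) and, for vectorized
/// loads, on whether the reorder indices form a reverse mask.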
11019TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
11020 if (TE.State == TreeEntry::ScatterVectorize ||
11021 TE.State == TreeEntry::StridedVectorize)
11023 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
11024 !TE.isAltShuffle()) {
11025 if (TE.ReorderIndices.empty())
11027 SmallVector<int> Mask;
11028 inversePermutation(TE.ReorderIndices, Mask);
11029 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
11031 }
11033}
11034
11035 /// Builds the argument types vector for the given call instruction with the
11036 /// given \p ID for the specified vector factor.
11039 const unsigned VF, unsigned MinBW,
11040 const TargetTransformInfo *TTI) {
11041 SmallVector<Type *> ArgTys;
11042 for (auto [Idx, Arg] : enumerate(CI->args())) {
11045 ArgTys.push_back(Arg->getType());
11046 continue;
11047 }
11048 if (MinBW > 0) {
11049 ArgTys.push_back(
11050 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
11051 continue;
11052 }
11053 }
11054 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
11055 }
11056 return ArgTys;
11057}
11058
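/// Computes the cost contribution of the tree entry \p E as the difference
/// between its vectorized cost and the cost of the original scalar
/// instructions; a negative result means vectorizing this node is profitable.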
11060BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
11061 SmallPtrSetImpl<Value *> &CheckedExtracts) {
11062 ArrayRef<Value *> VL = E->Scalars;
11063
11064 Type *ScalarTy = getValueType(VL[0]);
11065 if (!isValidElementType(ScalarTy))
11068
11069 // If we have computed a smaller type for the expression, update VecTy so
11070 // that the costs will be accurate.
11071 auto It = MinBWs.find(E);
11072 Type *OrigScalarTy = ScalarTy;
11073 if (It != MinBWs.end()) {
11074 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11075 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
11076 if (VecTy)
11077 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
11078 }
11079 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11080 unsigned EntryVF = E->getVectorFactor();
11081 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
11082
11083 if (E->isGather()) {
11084 if (allConstant(VL))
11085 return 0;
11086 if (isa<InsertElementInst>(VL[0]))
11088 if (isa<CmpInst>(VL.front()))
11089 ScalarTy = VL.front()->getType();
11090 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11091 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
11092 }
11093 InstructionCost CommonCost = 0;
11095 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11096 !isReverseOrder(E->ReorderIndices))) {
11097 SmallVector<int> NewMask;
11098 if (E->getOpcode() == Instruction::Store) {
11099 // For stores the order is actually a mask.
11100 NewMask.resize(E->ReorderIndices.size());
11101 copy(E->ReorderIndices, NewMask.begin());
11102 } else {
11103 inversePermutation(E->ReorderIndices, NewMask);
11104 }
11105 ::addMask(Mask, NewMask);
11106 }
11107 if (!E->ReuseShuffleIndices.empty())
11108 ::addMask(Mask, E->ReuseShuffleIndices);
11109 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
11110 CommonCost =
11111 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
11112 assert((E->State == TreeEntry::Vectorize ||
11113 E->State == TreeEntry::ScatterVectorize ||
11114 E->State == TreeEntry::StridedVectorize) &&
11115 "Unhandled state");
11116 assert(E->getOpcode() &&
11117 ((allSameType(VL) && allSameBlock(VL)) ||
11118 (E->getOpcode() == Instruction::GetElementPtr &&
11119 E->getMainOp()->getType()->isPointerTy())) &&
11120 "Invalid VL");
11121 Instruction *VL0 = E->getMainOp();
11122 unsigned ShuffleOrOp =
11123 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11124 if (E->CombinedOp != TreeEntry::NotCombinedOp)
11125 ShuffleOrOp = E->CombinedOp;
11126 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
11127 const unsigned Sz = UniqueValues.size();
11128 SmallBitVector UsedScalars(Sz, false);
11129 for (unsigned I = 0; I < Sz; ++I) {
11130 if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
11131 continue;
11132 UsedScalars.set(I);
11133 }
11134 auto GetCastContextHint = [&](Value *V) {
11135 if (const TreeEntry *OpTE = getTreeEntry(V))
11136 return getCastContextHint(*OpTE);
11137 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
11138 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11139 !SrcState.isAltShuffle())
11142 };
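  // Takes a per-scalar cost callback and a vector cost callback and returns
  // vector cost minus scalar cost, additionally adding a cast cost when this
  // node's minimum bitwidth differs from its user node's and a resize is
  // required.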
11143 auto GetCostDiff =
11144 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
11146 // Calculate the cost of this instruction.
11147 InstructionCost ScalarCost = 0;
11148 if (isa<CastInst, CallInst>(VL0)) {
11149 // For some of the instructions there is no need to calculate the cost of
11150 // each particular instruction; we can use the cost of a single
11151 // instruction x the total number of scalar instructions.
11152 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11153 } else {
11154 for (unsigned I = 0; I < Sz; ++I) {
11155 if (UsedScalars.test(I))
11156 continue;
11157 ScalarCost += ScalarEltCost(I);
11158 }
11159 }
11160
11161 InstructionCost VecCost = VectorCost(CommonCost);
11162 // Check if the current node must be resized, if the parent node is not
11163 // resized.
11164 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
11165 E->Idx != 0 &&
11166 (E->getOpcode() != Instruction::Load ||
11167 !E->UserTreeIndices.empty())) {
11168 const EdgeInfo &EI =
11169 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
11170 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11171 });
11172 if (EI.UserTE->getOpcode() != Instruction::Select ||
11173 EI.EdgeIdx != 0) {
11174 auto UserBWIt = MinBWs.find(EI.UserTE);
11175 Type *UserScalarTy =
11176 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11177 if (UserBWIt != MinBWs.end())
11178 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
11179 UserBWIt->second.first);
11180 if (ScalarTy != UserScalarTy) {
11181 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11182 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
11183 unsigned VecOpcode;
11184 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
11185 if (BWSz > SrcBWSz)
11186 VecOpcode = Instruction::Trunc;
11187 else
11188 VecOpcode =
11189 It->second.second ? Instruction::SExt : Instruction::ZExt;
11190 TTI::CastContextHint CCH = GetCastContextHint(VL0);
11191 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
11192 CostKind);
11193 }
11194 }
11195 }
11196 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11197 ScalarCost, "Calculated costs for Tree"));
11198 return VecCost - ScalarCost;
11199 };
11200 // Calculate cost difference from vectorizing set of GEPs.
11201 // Negative value means vectorizing is profitable.
11202 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
11203 assert((E->State == TreeEntry::Vectorize ||
11204 E->State == TreeEntry::StridedVectorize) &&
11205 "Entry state expected to be Vectorize or StridedVectorize here.");
11206 InstructionCost ScalarCost = 0;
11207 InstructionCost VecCost = 0;
11208 std::tie(ScalarCost, VecCost) = getGEPCosts(
11209 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
11210 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11211 "Calculated GEPs cost for Tree"));
11212
11213 return VecCost - ScalarCost;
11214 };
11215
11216 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
11217 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
11218 if (MinMaxID == Intrinsic::not_intrinsic)
11220 Type *CanonicalType = Ty;
11221 if (CanonicalType->isPtrOrPtrVectorTy())
11222 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
11223 CanonicalType->getContext(),
11224 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
11225
11226 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
11227 {CanonicalType, CanonicalType});
11228 InstructionCost IntrinsicCost =
11229 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11230 // If the selects are the only users of the compares, the compares will be
11231 // dead and we can adjust the cost by subtracting their cost.
11232 if (VI && SelectOnly) {
11233 assert((!Ty->isVectorTy() || SLPReVec) &&
11234 "Expected only for scalar type.");
11235 auto *CI = cast<CmpInst>(VI->getOperand(0));
11236 IntrinsicCost -= TTI->getCmpSelInstrCost(
11237 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11238 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11239 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11240 }
11241 return IntrinsicCost;
11242 };
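  // Per-opcode cost estimation: each case supplies its own scalar and vector
  // cost callbacks to GetCostDiff, or computes the cost directly.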
11243 switch (ShuffleOrOp) {
11244 case Instruction::PHI: {
11245 // Count reused scalars.
11246 InstructionCost ScalarCost = 0;
11248 for (Value *V : UniqueValues) {
11249 auto *PHI = dyn_cast<PHINode>(V);
11250 if (!PHI)
11251 continue;
11252
11253 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
11254 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
11255 Value *Op = PHI->getIncomingValue(I);
11256 Operands[I] = Op;
11257 }
11258 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
11259 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
11260 if (!OpTE->ReuseShuffleIndices.empty())
11261 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11262 OpTE->Scalars.size());
11263 }
11264
11265 return CommonCost - ScalarCost;
11266 }
11267 case Instruction::ExtractValue:
11268 case Instruction::ExtractElement: {
11269 auto GetScalarCost = [&](unsigned Idx) {
11270 if (isa<PoisonValue>(UniqueValues[Idx]))
11272
11273 auto *I = cast<Instruction>(UniqueValues[Idx]);
11274 VectorType *SrcVecTy;
11275 if (ShuffleOrOp == Instruction::ExtractElement) {
11276 auto *EE = cast<ExtractElementInst>(I);
11277 SrcVecTy = EE->getVectorOperandType();
11278 } else {
11279 auto *EV = cast<ExtractValueInst>(I);
11280 Type *AggregateTy = EV->getAggregateOperand()->getType();
11281 unsigned NumElts;
11282 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11283 NumElts = ATy->getNumElements();
11284 else
11285 NumElts = AggregateTy->getStructNumElements();
11286 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
11287 }
11288 if (I->hasOneUse()) {
11289 Instruction *Ext = I->user_back();
11290 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11291 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
11292 // Use getExtractWithExtendCost() to calculate the cost of
11293 // extractelement/ext pair.
11295 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
11296 // Subtract the cost of s|zext which is subtracted separately.
11298 Ext->getOpcode(), Ext->getType(), I->getType(),
11300 return Cost;
11301 }
11302 }
11303 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
11305 };
11306 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
11307 return GetCostDiff(GetScalarCost, GetVectorCost);
11308 }
11309 case Instruction::InsertElement: {
11310 assert(E->ReuseShuffleIndices.empty() &&
11311 "Unique insertelements only are expected.");
11312 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
11313 unsigned const NumElts = SrcVecTy->getNumElements();
11314 unsigned const NumScalars = VL.size();
11315
11316 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
11317
11318 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
11319 unsigned OffsetBeg = *getElementIndex(VL.front());
11320 unsigned OffsetEnd = OffsetBeg;
11321 InsertMask[OffsetBeg] = 0;
11322 for (auto [I, V] : enumerate(VL.drop_front())) {
11323 unsigned Idx = *getElementIndex(V);
11324 if (OffsetBeg > Idx)
11325 OffsetBeg = Idx;
11326 else if (OffsetEnd < Idx)
11327 OffsetEnd = Idx;
11328 InsertMask[Idx] = I + 1;
11329 }
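      // Model the insert on the smallest run of register-sized parts covering
      // all insert positions: VecScalarsSz is (roughly) the number of elements
      // per part, Offset is the start of the first touched part and VecSz the
      // total width of the touched parts.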
11330 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
11331 if (NumOfParts > 0 && NumOfParts < NumElts)
11332 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11333 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11334 VecScalarsSz;
11335 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11336 unsigned InsertVecSz = std::min<unsigned>(
11337 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
11338 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11339 bool IsWholeSubvector =
11340 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11341 // Check if we can safely insert a subvector. If it is not possible, just
11342 // generate a whole-sized vector and shuffle the source vector and the new
11343 // subvector.
11344 if (OffsetBeg + InsertVecSz > VecSz) {
11345 // Align OffsetBeg to generate correct mask.
11346 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
11347 InsertVecSz = VecSz;
11348 }
11349
11350 APInt DemandedElts = APInt::getZero(NumElts);
11351 // TODO: Add support for Instruction::InsertValue.
11353 if (!E->ReorderIndices.empty()) {
11354 inversePermutation(E->ReorderIndices, Mask);
11355 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
11356 } else {
11357 Mask.assign(VecSz, PoisonMaskElem);
11358 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
11359 }
11360 bool IsIdentity = true;
11361 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
11362 Mask.swap(PrevMask);
11363 for (unsigned I = 0; I < NumScalars; ++I) {
11364 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
11365 DemandedElts.setBit(InsertIdx);
11366 IsIdentity &= InsertIdx - OffsetBeg == I;
11367 Mask[InsertIdx - OffsetBeg] = I;
11368 }
11369 assert(Offset < NumElts && "Failed to find vector index offset");
11370
11372 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
11373 /*Insert*/ true, /*Extract*/ false,
11374 CostKind);
11375
11376 // First cost - resize to actual vector size if not identity shuffle or
11377 // need to shift the vector.
11378 // Do not calculate the cost if the actual size is the register size and
11379 // we can merge this shuffle with the following SK_Select.
11380 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
11381 if (!IsIdentity)
11383 InsertVecTy, Mask);
11384 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11385 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11386 }));
11387 // Second cost - permutation with subvector, if some elements are from the
11388 // initial vector or inserting a subvector.
11389 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
11390 // subvector of ActualVecTy.
11391 SmallBitVector InMask =
11392 isUndefVector(FirstInsert->getOperand(0),
11393 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11394 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
11395 if (InsertVecSz != VecSz) {
11396 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
11397 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
11398 CostKind, OffsetBeg - Offset, InsertVecTy);
11399 } else {
11400 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
11401 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
11402 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
11403 I <= End; ++I)
11404 if (Mask[I] != PoisonMaskElem)
11405 Mask[I] = I + VecSz;
11406 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
11407 Mask[I] =
11408 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
11409 Cost +=
11410 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
11411 }
11412 }
11413 return Cost;
11414 }
11415 case Instruction::ZExt:
11416 case Instruction::SExt:
11417 case Instruction::FPToUI:
11418 case Instruction::FPToSI:
11419 case Instruction::FPExt:
11420 case Instruction::PtrToInt:
11421 case Instruction::IntToPtr:
11422 case Instruction::SIToFP:
11423 case Instruction::UIToFP:
11424 case Instruction::Trunc:
11425 case Instruction::FPTrunc:
11426 case Instruction::BitCast: {
11427 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11428 Type *SrcScalarTy = VL0->getOperand(0)->getType();
11429 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
11430 unsigned Opcode = ShuffleOrOp;
11431 unsigned VecOpcode = Opcode;
11432 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
11433 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
11434 // Check if the values are candidates to demote.
11435 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
11436 if (SrcIt != MinBWs.end()) {
11437 SrcBWSz = SrcIt->second.first;
11438 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
11439 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
11440 SrcVecTy =
11441 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
11442 }
11443 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
11444 if (BWSz == SrcBWSz) {
11445 VecOpcode = Instruction::BitCast;
11446 } else if (BWSz < SrcBWSz) {
11447 VecOpcode = Instruction::Trunc;
11448 } else if (It != MinBWs.end()) {
11449 assert(BWSz > SrcBWSz && "Invalid cast!");
11450 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11451 } else if (SrcIt != MinBWs.end()) {
11452 assert(BWSz > SrcBWSz && "Invalid cast!");
11453 VecOpcode =
11454 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11455 }
11456 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
11457 !SrcIt->second.second) {
11458 VecOpcode = Instruction::UIToFP;
11459 }
11460 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
11461 assert(Idx == 0 && "Expected 0 index only");
11462 return TTI->getCastInstrCost(Opcode, VL0->getType(),
11463 VL0->getOperand(0)->getType(),
11465 };
11466 auto GetVectorCost = [=](InstructionCost CommonCost) {
11467 // Do not count cost here if minimum bitwidth is in effect and it is just
11468 // a bitcast (here it is just a noop).
11469 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11470 return CommonCost;
11471 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
11472 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
11473
11474 bool IsArithmeticExtendedReduction =
11475 E->Idx == 0 && UserIgnoreList &&
11476 all_of(*UserIgnoreList, [](Value *V) {
11477 auto *I = cast<Instruction>(V);
11478 return is_contained({Instruction::Add, Instruction::FAdd,
11479 Instruction::Mul, Instruction::FMul,
11480 Instruction::And, Instruction::Or,
11481 Instruction::Xor},
11482 I->getOpcode());
11483 });
11484 if (IsArithmeticExtendedReduction &&
11485 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11486 return CommonCost;
11487 return CommonCost +
11488 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
11489 VecOpcode == Opcode ? VI : nullptr);
11490 };
11491 return GetCostDiff(GetScalarCost, GetVectorCost);
11492 }
11493 case Instruction::FCmp:
11494 case Instruction::ICmp:
11495 case Instruction::Select: {
11496 CmpPredicate VecPred, SwappedVecPred;
11497 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
11498 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
11499 match(VL0, MatchCmp))
11500 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
11501 else
11502 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
11505 auto GetScalarCost = [&](unsigned Idx) {
11506 if (isa<PoisonValue>(UniqueValues[Idx]))
11508
11509 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11510 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
11513 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
11514 // FIXME: Use CmpPredicate::getMatching here.
11515 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
11516 !match(VI, MatchCmp)) ||
11517 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
11518 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
11519 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
11522
11524 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11525 CostKind, getOperandInfo(VI->getOperand(0)),
11526 getOperandInfo(VI->getOperand(1)), VI);
11527 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
11528 if (IntrinsicCost.isValid())
11529 ScalarCost = IntrinsicCost;
11530
11531 return ScalarCost;
11532 };
11533 auto GetVectorCost = [&](InstructionCost CommonCost) {
11534 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11535
11536 InstructionCost VecCost =
11537 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
11538 CostKind, getOperandInfo(E->getOperand(0)),
11539 getOperandInfo(E->getOperand(1)), VL0);
11540 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
11541 auto *CondType =
11542 getWidenedType(SI->getCondition()->getType(), VL.size());
11543 unsigned CondNumElements = CondType->getNumElements();
11544 unsigned VecTyNumElements = getNumElements(VecTy);
11545 assert(VecTyNumElements >= CondNumElements &&
11546 VecTyNumElements % CondNumElements == 0 &&
11547 "Cannot vectorize Instruction::Select");
11548 if (CondNumElements != VecTyNumElements) {
11549 // When the return type is i1 but the source is a fixed vector type, we
11550 // need to duplicate the condition value.
11551 VecCost += ::getShuffleCost(
11552 *TTI, TTI::SK_PermuteSingleSrc, CondType,
11553 createReplicatedMask(VecTyNumElements / CondNumElements,
11554 CondNumElements));
11555 }
11556 }
11557 return VecCost + CommonCost;
11558 };
11559 return GetCostDiff(GetScalarCost, GetVectorCost);
11560 }
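  // Combined min/max node: both the scalar and the vector forms are costed as
  // the corresponding min/max intrinsic via GetMinMaxCost.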
11561 case TreeEntry::MinMax: {
11562 auto GetScalarCost = [&](unsigned Idx) {
11563 return GetMinMaxCost(OrigScalarTy);
11564 };
11565 auto GetVectorCost = [&](InstructionCost CommonCost) {
11566 InstructionCost VecCost = GetMinMaxCost(VecTy);
11567 return VecCost + CommonCost;
11568 };
11569 return GetCostDiff(GetScalarCost, GetVectorCost);
11570 }
11571 case Instruction::FNeg:
11572 case Instruction::Add:
11573 case Instruction::FAdd:
11574 case Instruction::Sub:
11575 case Instruction::FSub:
11576 case Instruction::Mul:
11577 case Instruction::FMul:
11578 case Instruction::UDiv:
11579 case Instruction::SDiv:
11580 case Instruction::FDiv:
11581 case Instruction::URem:
11582 case Instruction::SRem:
11583 case Instruction::FRem:
11584 case Instruction::Shl:
11585 case Instruction::LShr:
11586 case Instruction::AShr:
11587 case Instruction::And:
11588 case Instruction::Or:
11589 case Instruction::Xor: {
11590 auto GetScalarCost = [&](unsigned Idx) {
11591 if (isa<PoisonValue>(UniqueValues[Idx]))
11593
11594 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11595 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11596 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
11597 TTI::OperandValueInfo Op2Info =
11598 TTI::getOperandInfo(VI->getOperand(OpIdx));
11599 SmallVector<const Value *> Operands(VI->operand_values());
11600 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
11601 Op1Info, Op2Info, Operands, VI);
11602 };
11603 auto GetVectorCost = [=](InstructionCost CommonCost) {
11604 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
11605 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
11606 ArrayRef<Value *> Ops = E->getOperand(I);
11607 if (all_of(Ops, [&](Value *Op) {
11608 auto *CI = dyn_cast<ConstantInt>(Op);
11609 return CI && CI->getValue().countr_one() >= It->second.first;
11610 }))
11611 return CommonCost;
11612 }
11613 }
11614 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11615 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
11616 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
11617 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
11618 Op2Info, {}, nullptr, TLI) +
11619 CommonCost;
11620 };
11621 return GetCostDiff(GetScalarCost, GetVectorCost);
11622 }
11623 case Instruction::GetElementPtr: {
11624 return CommonCost + GetGEPCostDiff(VL, VL0);
11625 }
11626 case Instruction::Load: {
11627 auto GetScalarCost = [&](unsigned Idx) {
11628 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
11629 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
11630 VI->getAlign(), VI->getPointerAddressSpace(),
11632 };
11633 auto *LI0 = cast<LoadInst>(VL0);
11634 auto GetVectorCost = [&](InstructionCost CommonCost) {
11635 InstructionCost VecLdCost;
11636 switch (E->State) {
11637 case TreeEntry::Vectorize:
11638 if (unsigned Factor = E->getInterleaveFactor()) {
11639 VecLdCost = TTI->getInterleavedMemoryOpCost(
11640 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11641 LI0->getPointerAddressSpace(), CostKind);
11642
11643 } else {
11644 VecLdCost = TTI->getMemoryOpCost(
11645 Instruction::Load, VecTy, LI0->getAlign(),
11646 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
11647 }
11648 break;
11649 case TreeEntry::StridedVectorize: {
11650 Align CommonAlignment =
11651 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11652 VecLdCost = TTI->getStridedMemoryOpCost(
11653 Instruction::Load, VecTy, LI0->getPointerOperand(),
11654 /*VariableMask=*/false, CommonAlignment, CostKind);
11655 break;
11656 }
11657 case TreeEntry::ScatterVectorize: {
11658 Align CommonAlignment =
11659 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11660 VecLdCost = TTI->getGatherScatterOpCost(
11661 Instruction::Load, VecTy, LI0->getPointerOperand(),
11662 /*VariableMask=*/false, CommonAlignment, CostKind);
11663 break;
11664 }
11665 case TreeEntry::CombinedVectorize:
11666 case TreeEntry::NeedToGather:
11667 llvm_unreachable("Unexpected vectorization state.");
11668 }
11669 return VecLdCost + CommonCost;
11670 };
11671
11672 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
11673 // If this node generates a masked gather load then it is not a terminal
11674 // node, hence the address operand cost is estimated separately.
11675 if (E->State == TreeEntry::ScatterVectorize)
11676 return Cost;
11677
11678 // Estimate cost of GEPs since this tree node is a terminator.
11679 SmallVector<Value *> PointerOps(VL.size());
11680 for (auto [I, V] : enumerate(VL))
11681 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
11682 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11683 }
11684 case Instruction::Store: {
11685 bool IsReorder = !E->ReorderIndices.empty();
11686 auto GetScalarCost = [=](unsigned Idx) {
11687 auto *VI = cast<StoreInst>(VL[Idx]);
11688 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
11689 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
11690 VI->getAlign(), VI->getPointerAddressSpace(),
11691 CostKind, OpInfo, VI);
11692 };
11693 auto *BaseSI =
11694 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11695 auto GetVectorCost = [=](InstructionCost CommonCost) {
11696 // We know that we can merge the stores. Calculate the cost.
11697 InstructionCost VecStCost;
11698 if (E->State == TreeEntry::StridedVectorize) {
11699 Align CommonAlignment =
11700 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11701 VecStCost = TTI->getStridedMemoryOpCost(
11702 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11703 /*VariableMask=*/false, CommonAlignment, CostKind);
11704 } else {
11705 assert(E->State == TreeEntry::Vectorize &&
11706 "Expected either strided or consecutive stores.");
11707 if (unsigned Factor = E->getInterleaveFactor()) {
11708 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11709 "No reused shuffles expected");
11710 CommonCost = 0;
11711 VecStCost = TTI->getInterleavedMemoryOpCost(
11712 Instruction::Store, VecTy, Factor, std::nullopt,
11713 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
11714 } else {
11715 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
11716 VecStCost = TTI->getMemoryOpCost(
11717 Instruction::Store, VecTy, BaseSI->getAlign(),
11718 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
11719 }
11720 }
11721 return VecStCost + CommonCost;
11722 };
11723 SmallVector<Value *> PointerOps(VL.size());
11724 for (auto [I, V] : enumerate(VL)) {
11725 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
11726 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
11727 }
11728
11729 return GetCostDiff(GetScalarCost, GetVectorCost) +
11730 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11731 }
11732 case Instruction::Call: {
11733 auto GetScalarCost = [&](unsigned Idx) {
11734 auto *CI = cast<CallInst>(UniqueValues[Idx]);
11737 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
11738 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11739 }
11742 CI->getFunctionType()->params(), CostKind);
11743 };
11744 auto GetVectorCost = [=](InstructionCost CommonCost) {
11745 auto *CI = cast<CallInst>(VL0);
11748 CI, ID, VecTy->getNumElements(),
11749 It != MinBWs.end() ? It->second.first : 0, TTI);
11750 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
11751 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11752 };
11753 return GetCostDiff(GetScalarCost, GetVectorCost);
11754 }
11755 case Instruction::ShuffleVector: {
11756 if (!SLPReVec || E->isAltShuffle())
11757 assert(E->isAltShuffle() &&
11758 ((Instruction::isBinaryOp(E->getOpcode()) &&
11759 Instruction::isBinaryOp(E->getAltOpcode())) ||
11760 (Instruction::isCast(E->getOpcode()) &&
11761 Instruction::isCast(E->getAltOpcode())) ||
11762 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11763 "Invalid Shuffle Vector Operand");
11764 // Try to find the previous shuffle node with the same operands and same
11765 // main/alternate ops.
11766 auto TryFindNodeWithEqualOperands = [=]() {
11767 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11768 if (TE.get() == E)
11769 break;
11770 if (TE->isAltShuffle() &&
11771 ((TE->getOpcode() == E->getOpcode() &&
11772 TE->getAltOpcode() == E->getAltOpcode()) ||
11773 (TE->getOpcode() == E->getAltOpcode() &&
11774 TE->getAltOpcode() == E->getOpcode())) &&
11775 TE->hasEqualOperands(*E))
11776 return true;
11777 }
11778 return false;
11779 };
11780 auto GetScalarCost = [&](unsigned Idx) {
11781 if (isa<PoisonValue>(UniqueValues[Idx]))
11783
11784 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11785 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
11786 (void)E;
11787 return TTI->getInstructionCost(VI, CostKind);
11788 };
11789 // Need to clear CommonCost since the final shuffle cost is included in the
11790 // vector cost.
11791 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
11792 // VecCost is equal to sum of the cost of creating 2 vectors
11793 // and the cost of creating shuffle.
11794 InstructionCost VecCost = 0;
11795 if (TryFindNodeWithEqualOperands()) {
11796 LLVM_DEBUG({
11797 dbgs() << "SLP: diamond match for alternate node found.\n";
11798 E->dump();
11799 });
11800 // No need to add new vector costs here since we're going to reuse the
11801 // same main/alternate vector ops and just do different shuffling.
11802 } else if (Instruction::isBinaryOp(E->getOpcode())) {
11803 VecCost =
11804 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
11805 VecCost +=
11806 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
11807 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11808 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11809 VecCost = TTIRef.getCmpSelInstrCost(
11810 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
11811 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11812 VL0);
11813 VecCost += TTIRef.getCmpSelInstrCost(
11814 E->getOpcode(), VecTy, MaskTy,
11815 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
11816 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11817 E->getAltOp());
11818 } else {
11819 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11820 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
11821 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
11822 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11823 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11824 unsigned SrcBWSz =
11825 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11826 if (SrcIt != MinBWs.end()) {
11827 SrcBWSz = SrcIt->second.first;
11828 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
11829 SrcTy = getWidenedType(SrcSclTy, VL.size());
11830 }
11831 if (BWSz <= SrcBWSz) {
11832 if (BWSz < SrcBWSz)
11833 VecCost =
11834 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11836 LLVM_DEBUG({
11837 dbgs()
11838 << "SLP: alternate extension, which should be truncated.\n";
11839 E->dump();
11840 });
11841 return VecCost;
11842 }
11843 }
11844 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11846 VecCost +=
11847 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11849 }
11851 E->buildAltOpShuffleMask(
11852 [&](Instruction *I) {
11853 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
11854 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
11855 *TLI);
11856 },
11857 Mask);
11859 FinalVecTy, Mask, CostKind);
11860 // Patterns like [fadd,fsub] can be combined into a single instruction
11861 // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
11862 // need to take their order into account when looking for the most used
11863 // order.
11864 unsigned Opcode0 = E->getOpcode();
11865 unsigned Opcode1 = E->getAltOpcode();
11866 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
11867 // If this pattern is supported by the target then we consider the
11868 // order.
11869 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11870 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
11871 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
11872 return AltVecCost < VecCost ? AltVecCost : VecCost;
11873 }
11874 // TODO: Check the reverse order too.
11875 return VecCost;
11876 };
11877 if (SLPReVec && !E->isAltShuffle())
11878 return GetCostDiff(
11879 GetScalarCost, [&](InstructionCost) -> InstructionCost {
11880 // If a group uses its mask in order, the shufflevector can be
11881 // eliminated by instcombine, and then the cost is 0.
11882 assert(isa<ShuffleVectorInst>(VL.front()) &&
11883 "Not supported shufflevector usage.");
11884 auto *SV = cast<ShuffleVectorInst>(VL.front());
11885 unsigned SVNumElements =
11886 cast<FixedVectorType>(SV->getOperand(0)->getType())
11887 ->getNumElements();
11888 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11889 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
11890 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
11891 int NextIndex = 0;
11892 if (!all_of(Group, [&](Value *V) {
11893 assert(isa<ShuffleVectorInst>(V) &&
11894 "Not supported shufflevector usage.");
11895 auto *SV = cast<ShuffleVectorInst>(V);
11896 int Index;
11897 [[maybe_unused]] bool IsExtractSubvectorMask =
11898 SV->isExtractSubvectorMask(Index);
11899 assert(IsExtractSubvectorMask &&
11900 "Not supported shufflevector usage.");
11901 if (NextIndex != Index)
11902 return false;
11903 NextIndex += SV->getShuffleMask().size();
11904 return true;
11905 }))
11906 return ::getShuffleCost(
11908 calculateShufflevectorMask(E->Scalars));
11909 }
11910 return TTI::TCC_Free;
11911 });
11912 return GetCostDiff(GetScalarCost, GetVectorCost);
11913 }
11914 case Instruction::Freeze:
11915 return CommonCost;
11916 default:
11917 llvm_unreachable("Unknown instruction");
11918 }
11919}
11920
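// A tiny tree (one or two entries) counts as fully vectorizable when its
// gather nodes are cheap to materialize: all-constant or splat scalars, small
// gathers, extractelements that form a fixed vector shuffle, or loads.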
11921bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
11922 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
11923 << VectorizableTree.size() << " is fully vectorizable.\n");
11924
11925 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
11927 return TE->isGather() &&
11928 !any_of(TE->Scalars,
11929 [this](Value *V) { return EphValues.contains(V); }) &&
11930 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
11931 TE->Scalars.size() < Limit ||
11932 ((TE->getOpcode() == Instruction::ExtractElement ||
11933 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11934 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
11935 (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
11936 any_of(TE->Scalars, IsaPred<LoadInst>));
11937 };
11938
11939 // We only handle trees of heights 1 and 2.
11940 if (VectorizableTree.size() == 1 &&
11941 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11942 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11943 (ForReduction &&
11944 AreVectorizableGathers(VectorizableTree[0].get(),
11945 VectorizableTree[0]->Scalars.size()) &&
11946 VectorizableTree[0]->getVectorFactor() > 2)))
11947 return true;
11948
11949 if (VectorizableTree.size() != 2)
11950 return false;
11951
11952 // Handle splat and all-constants stores. Also try to vectorize tiny trees
11953 // with a second gather node if it has fewer scalar operands than the
11954 // initial tree element (it may be profitable to shuffle the second gather)
11955 // or its scalars are extractelements that form a shuffle.
11957 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11958 AreVectorizableGathers(VectorizableTree[1].get(),
11959 VectorizableTree[0]->Scalars.size()))
11960 return true;
11961
11962 // Gathering cost would be too much for tiny trees.
11963 if (VectorizableTree[0]->isGather() ||
11964 (VectorizableTree[1]->isGather() &&
11965 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
11966 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
11967 return false;
11968
11969 return true;
11970}
11971
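/// Checks whether \p Root is the head of an or/shift chain over a
/// zero-extended load whose combined width is a legal integer type, i.e. a
/// pattern the backend is expected to fold into a single wide load.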
11972static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
11974 bool MustMatchOrInst) {
11975 // Look past the root to find a source value. Arbitrarily follow the
11976 // path through operand 0 of any 'or'. Also, peek through optional
11977 // shift-left-by-multiple-of-8-bits.
11978 Value *ZextLoad = Root;
11979 const APInt *ShAmtC;
11980 bool FoundOr = false;
11981 while (!isa<ConstantExpr>(ZextLoad) &&
11982 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
11983 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
11984 ShAmtC->urem(8) == 0))) {
11985 auto *BinOp = cast<BinaryOperator>(ZextLoad);
11986 ZextLoad = BinOp->getOperand(0);
11987 if (BinOp->getOpcode() == Instruction::Or)
11988 FoundOr = true;
11989 }
11990 // Check if the input is an extended load of the required or/shift expression.
11991 Value *Load;
11992 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
11993 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
11994 return false;
11995
11996 // Require that the total load bit width is a legal integer type.
11997 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
11998 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
11999 Type *SrcTy = Load->getType();
12000 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
12001 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
12002 return false;
12003
12004 // Everything matched - assume that we can fold the whole sequence using
12005 // load combining.
12006 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
12007 << *(cast<Instruction>(Root)) << "\n");
12008
12009 return true;
12010}
12011
12013 if (RdxKind != RecurKind::Or)
12014 return false;
12015
12016 unsigned NumElts = VectorizableTree[0]->Scalars.size();
12017 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
12018 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
12019 /* MatchOr */ false);
12020}
12021
12023 // Peek through a final sequence of stores and check if all operations are
12024 // likely to be load-combined.
12025 unsigned NumElts = Stores.size();
12026 for (Value *Scalar : Stores) {
12027 Value *X;
12028 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
12029 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
12030 return false;
12031 }
12032 return true;
12033}
12034
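// Returns true (vectorization should be skipped) for graphs unlikely to be
// profitable: empty graphs, gathered insertelement-only graphs, PHI-plus-gather
// graphs at the default cost threshold, and tiny trees that cannot be proven
// fully vectorizable.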
12035bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
12036 if (!DebugCounter::shouldExecute(VectorizedGraphs))
12037 return true;
12038
12039 // Graph is empty - do nothing.
12040 if (VectorizableTree.empty()) {
12041 assert(ExternalUses.empty() && "We shouldn't have any external users");
12042
12043 return true;
12044 }
12045
12046 // No need to vectorize inserts of gathered values.
12047 if (VectorizableTree.size() == 2 &&
12048 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
12049 VectorizableTree[1]->isGather() &&
12050 (VectorizableTree[1]->getVectorFactor() <= 2 ||
12051 !(isSplat(VectorizableTree[1]->Scalars) ||
12052 allConstant(VectorizableTree[1]->Scalars))))
12053 return true;
12054
12055 // If the graph includes only PHI nodes and gathers, it is definitely not
12056 // profitable for vectorization and we can skip it, if the cost threshold is
12057 // the default. The cost of vectorized PHI nodes is almost always 0 + the cost
12058 // of gathers/buildvectors.
12059 constexpr int Limit = 4;
12060 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
12061 !VectorizableTree.empty() &&
12062 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12063 return (TE->isGather() &&
12064 TE->getOpcode() != Instruction::ExtractElement &&
12065 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
12066 TE->getOpcode() == Instruction::PHI;
12067 }))
12068 return true;
12069
12070 // We can vectorize the tree if its size is greater than or equal to the
12071 // minimum size specified by the MinTreeSize command line option.
12072 if (VectorizableTree.size() >= MinTreeSize)
12073 return false;
12074
12075 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
12076 // can vectorize it if we can prove it fully vectorizable.
12077 if (isFullyVectorizableTinyTree(ForReduction))
12078 return false;
12079
12080 // Check if any of the gather nodes forms an insertelement buildvector
12081 // somewhere.
12082 bool IsAllowedSingleBVNode =
12083 VectorizableTree.size() > 1 ||
12084 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
12085 !VectorizableTree.front()->isAltShuffle() &&
12086 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12087 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12088 allSameBlock(VectorizableTree.front()->Scalars));
12089 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12090 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
12091 return isa<ExtractElementInst, UndefValue>(V) ||
12092 (IsAllowedSingleBVNode &&
12093 !V->hasNUsesOrMore(UsesLimit) &&
12094 any_of(V->users(), IsaPred<InsertElementInst>));
12095 });
12096 }))
12097 return false;
12098
12099 if (VectorizableTree.back()->isGather() &&
12100 VectorizableTree.back()->isAltShuffle() &&
12101 VectorizableTree.back()->getVectorFactor() > 2 &&
12102 allSameBlock(VectorizableTree.back()->Scalars) &&
12103 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12105 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12106 VectorizableTree.back()->getVectorFactor()),
12107 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
12108 /*Insert=*/true, /*Extract=*/false,
12110 return false;
12111
12112 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
12113 // vectorizable.
12114 return true;
12115}
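// Illustrative note (hypothetical IR, not part of this file): the "tiny but
// fully vectorizable" escape hatch above covers trees smaller than MinTreeSize
// that are still clearly profitable, for example two consecutive stores fed by
// two consecutive loads:
//
//   %a = load i32, ptr %p                  ; gather-free load node
//   %b = load i32, ptr %p.1                ; assumed consecutive with %p
//   store i32 %a, ptr %q                   ; root node of consecutive stores
//   store i32 %b, ptr %q.1
//
// If isFullyVectorizableTinyTree() proves such a tree fully vectorizable,
// this function returns false, i.e. the tree is not rejected for being tiny.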
12116
12117 bool BoUpSLP::isTreeNotExtendable() const {
12118 if (getCanonicalGraphSize() != getTreeSize()) {
12119 constexpr unsigned SmallTree = 3;
12120 if (VectorizableTree.front()->isNonPowOf2Vec() &&
12121 getCanonicalGraphSize() <= SmallTree &&
12122 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
12123 [](const std::unique_ptr<TreeEntry> &TE) {
12124 return TE->isGather() &&
12125 TE->getOpcode() == Instruction::Load &&
12126 !allSameBlock(TE->Scalars);
12127 }) == 1)
12128 return true;
12129 return false;
12130 }
12131 bool Res = false;
12132 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
12133 TreeEntry &E = *VectorizableTree[Idx];
12134 if (!E.isGather())
12135 continue;
12136 if (E.getOpcode() && E.getOpcode() != Instruction::Load)
12137 return false;
12138 if (isSplat(E.Scalars) || allConstant(E.Scalars))
12139 continue;
12140 Res = true;
12141 }
12142 return Res;
12143}
12144
12145 InstructionCost BoUpSLP::getSpillCost() const {
12146 // Walk from the bottom of the tree to the top, tracking which values are
12147 // live. When we see a call instruction that is not part of our tree,
12148 // query TTI to see if there is a cost to keeping values live over it
12149 // (for example, if spills and fills are required).
12150 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12151 InstructionCost Cost = 0;
12152
12153 SmallPtrSet<Instruction *, 4> LiveValues;
12154 Instruction *PrevInst = nullptr;
12155
12156 // The entries in VectorizableTree are not necessarily ordered by their
12157 // position in basic blocks. Collect them and order them by dominance so later
12158 // instructions are guaranteed to be visited first. For instructions in
12159 // different basic blocks, we only scan to the beginning of the block, so
12160 // their order does not matter, as long as all instructions in a basic block
12161 // are grouped together. Using dominance ensures a deterministic order.
12162 SmallVector<Instruction *, 16> OrderedScalars;
12163 for (const auto &TEPtr : VectorizableTree) {
12164 if (TEPtr->State != TreeEntry::Vectorize)
12165 continue;
12166 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12167 if (!Inst)
12168 continue;
12169 OrderedScalars.push_back(Inst);
12170 }
12171 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
12172 auto *NodeA = DT->getNode(A->getParent());
12173 auto *NodeB = DT->getNode(B->getParent());
12174 assert(NodeA && "Should only process reachable instructions");
12175 assert(NodeB && "Should only process reachable instructions");
12176 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12177 "Different nodes should have different DFS numbers");
12178 if (NodeA != NodeB)
12179 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12180 return B->comesBefore(A);
12181 });
12182
12183 for (Instruction *Inst : OrderedScalars) {
12184 if (!PrevInst) {
12185 PrevInst = Inst;
12186 continue;
12187 }
12188
12189 // Update LiveValues.
12190 LiveValues.erase(PrevInst);
12191 for (auto &J : PrevInst->operands()) {
12192 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12193 LiveValues.insert(cast<Instruction>(&*J));
12194 }
12195
12196 LLVM_DEBUG({
12197 dbgs() << "SLP: #LV: " << LiveValues.size();
12198 for (auto *X : LiveValues)
12199 dbgs() << " " << X->getName();
12200 dbgs() << ", Looking at ";
12201 Inst->dump();
12202 });
12203
12204 // Now find the sequence of instructions between PrevInst and Inst.
12205 unsigned NumCalls = 0;
12206 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12207 PrevInstIt =
12208 PrevInst->getIterator().getReverse();
12209 while (InstIt != PrevInstIt) {
12210 if (PrevInstIt == PrevInst->getParent()->rend()) {
12211 PrevInstIt = Inst->getParent()->rbegin();
12212 continue;
12213 }
12214
12215 auto NoCallIntrinsic = [this](Instruction *I) {
12216 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
12217 if (II->isAssumeLikeIntrinsic())
12218 return true;
12219 FastMathFlags FMF;
12221 for (auto &ArgOp : II->args())
12222 Tys.push_back(ArgOp->getType());
12223 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
12224 FMF = FPMO->getFastMathFlags();
12225 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
12226 FMF);
12227 InstructionCost IntrCost =
12228 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12229 InstructionCost CallCost = TTI->getCallInstrCost(
12230 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
12231 if (IntrCost < CallCost)
12232 return true;
12233 }
12234 return false;
12235 };
12236
12237 // Debug information does not impact spill cost.
12238 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12239 &*PrevInstIt != PrevInst)
12240 NumCalls++;
12241
12242 ++PrevInstIt;
12243 }
12244
12245 if (NumCalls) {
12247 for (auto *II : LiveValues) {
12248 auto *ScalarTy = II->getType();
12249 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12250 ScalarTy = VectorTy->getElementType();
12251 V.push_back(getWidenedType(ScalarTy, BundleWidth));
12252 }
12253 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
12254 }
12255
12256 PrevInst = Inst;
12257 }
12258
12259 return Cost;
12260}
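// Illustrative note (hypothetical IR, not part of this file): the situation
// priced above is a call that is not part of the tree sitting between two
// vectorized bundles, forcing the first bundle's value to stay live (and
// potentially be spilled and reloaded) across the call:
//
//   %v = fadd <4 x float> %x, %y        ; vectorized bundle A
//   call void @foo()                    ; not in the tree; %v is live across
//   %w = fmul <4 x float> %v, %z        ; vectorized bundle B uses %v
//
// Each such stretch contributes TTI->getCostOfKeepingLiveOverCall() for the
// live vector types, scaled by the number of calls seen between the bundles.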
12261
12262 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
12263 /// the buildvector sequence.
12264 static bool isFirstInsertElement(const InsertElementInst *IE1,
12265 const InsertElementInst *IE2) {
12266 if (IE1 == IE2)
12267 return false;
12268 const auto *I1 = IE1;
12269 const auto *I2 = IE2;
12270 const InsertElementInst *PrevI1;
12271 const InsertElementInst *PrevI2;
12272 unsigned Idx1 = *getElementIndex(IE1);
12273 unsigned Idx2 = *getElementIndex(IE2);
12274 do {
12275 if (I2 == IE1)
12276 return true;
12277 if (I1 == IE2)
12278 return false;
12279 PrevI1 = I1;
12280 PrevI2 = I2;
12281 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12282 getElementIndex(I1).value_or(Idx2) != Idx2)
12283 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12284 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
12285 getElementIndex(I2).value_or(Idx1) != Idx1)
12286 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12287 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12288 llvm_unreachable("Two different buildvectors not expected.");
12289}
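// Illustrative note (hypothetical IR, not part of this file): for a
// buildvector chain such as
//
//   %i0 = insertelement <4 x float> poison, float %a, i32 0
//   %i1 = insertelement <4 x float> %i0,    float %b, i32 1
//   %i2 = insertelement <4 x float> %i1,    float %c, i32 2
//
// isFirstInsertElement(%i0, %i2) returns true: walking the operand-0 chain of
// %i2 reaches %i0, i.e. %i0 is followed by %i2 in the buildvector sequence,
// while isFirstInsertElement(%i2, %i0) returns false.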
12290
12291namespace {
12292 /// Returns the incoming Value * if the requested type is Value * too, or a
12293 /// default-constructed value otherwise.
12294struct ValueSelect {
12295 template <typename U>
12296 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
12297 return V;
12298 }
12299 template <typename U>
12300 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
12301 return U();
12302 }
12303};
12304} // namespace
12305
12306/// Does the analysis of the provided shuffle masks and performs the requested
12307/// actions on the vectors with the given shuffle masks. It tries to do it in
12308/// several steps.
12309 /// 1. If the Base vector is not an undef vector, resize the very first mask to
12310 /// have a common VF and perform the action for 2 input vectors (including the
12311 /// non-undef Base). Other shuffle masks are combined with the result of the
12312 /// first stage and processed as a shuffle of 2 elements.
12313 /// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
12314 /// the action only for 1 vector with the given mask, if it is not the identity
12315 /// mask.
12316 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
12317 /// vectors, combining the masks properly between the steps.
12318template <typename T>
12319 static T *performExtractsShuffleAction(
12320 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
12321 function_ref<unsigned(T *)> GetVF,
12323 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
12322 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
12324 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
12325 SmallVector<int> Mask(ShuffleMask.begin()->second);
12326 auto VMIt = std::next(ShuffleMask.begin());
12327 T *Prev = nullptr;
12328 SmallBitVector UseMask =
12329 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12330 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
12331 if (!IsBaseUndef.all()) {
12332 // Base is not undef, need to combine it with the next subvectors.
12333 std::pair<T *, bool> Res =
12334 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
12335 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
12336 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
12337 if (Mask[Idx] == PoisonMaskElem)
12338 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
12339 else
12340 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
12341 }
12342 auto *V = ValueSelect::get<T *>(Base);
12343 (void)V;
12344 assert((!V || GetVF(V) == Mask.size()) &&
12345 "Expected base vector of VF number of elements.");
12346 Prev = Action(Mask, {nullptr, Res.first});
12347 } else if (ShuffleMask.size() == 1) {
12348 // Base is undef and only 1 vector is shuffled - perform the action only for
12349 // single vector, if the mask is not the identity mask.
12350 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12351 /*ForSingleMask=*/true);
12352 if (Res.second)
12353 // Identity mask is found.
12354 Prev = Res.first;
12355 else
12356 Prev = Action(Mask, {ShuffleMask.begin()->first});
12357 } else {
12358 // Base is undef and at least 2 input vectors are shuffled - perform 2-vector
12359 // shuffles step by step, combining the shuffles between the steps.
12360 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12361 unsigned Vec2VF = GetVF(VMIt->first);
12362 if (Vec1VF == Vec2VF) {
12363 // No need to resize the input vectors since they are of the same size, we
12364 // can shuffle them directly.
12365 ArrayRef<int> SecMask = VMIt->second;
12366 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12367 if (SecMask[I] != PoisonMaskElem) {
12368 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12369 Mask[I] = SecMask[I] + Vec1VF;
12370 }
12371 }
12372 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12373 } else {
12374 // Vectors of different sizes - resize and reshuffle.
12375 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12376 /*ForSingleMask=*/false);
12377 std::pair<T *, bool> Res2 =
12378 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12379 ArrayRef<int> SecMask = VMIt->second;
12380 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12381 if (Mask[I] != PoisonMaskElem) {
12382 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12383 if (Res1.second)
12384 Mask[I] = I;
12385 } else if (SecMask[I] != PoisonMaskElem) {
12386 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12387 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
12388 }
12389 }
12390 Prev = Action(Mask, {Res1.first, Res2.first});
12391 }
12392 VMIt = std::next(VMIt);
12393 }
12394 bool IsBaseNotUndef = !IsBaseUndef.all();
12395 (void)IsBaseNotUndef;
12396 // Perform requested actions for the remaining masks/vectors.
12397 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12398 // Shuffle other input vectors, if any.
12399 std::pair<T *, bool> Res =
12400 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12401 ArrayRef<int> SecMask = VMIt->second;
12402 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12403 if (SecMask[I] != PoisonMaskElem) {
12404 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
12405 "Multiple uses of scalars.");
12406 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
12407 } else if (Mask[I] != PoisonMaskElem) {
12408 Mask[I] = I;
12409 }
12410 }
12411 Prev = Action(Mask, {Prev, Res.first});
12412 }
12413 return Prev;
12414}
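// Worked example (not part of this file) for the equal-VF branch above: with
// an undef Base and two input vectors of VF 4, the per-vector masks are merged
// lane by lane, offsetting the second vector's elements by Vec1VF:
//
//   first mask  : {0, P, 1, P}       (P = PoisonMaskElem)
//   second mask : {P, 2, P, 3}
//   combined    : {0, 6, 1, 7}       (2 + 4 and 3 + 4 select from vector 2)
//
// The combined mask is then handed to Action() as a single two-source shuffle
// of the two input vectors.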
12415
12416namespace {
12417/// Data type for handling buildvector sequences with the reused scalars from
12418/// other tree entries.
12419template <typename T> struct ShuffledInsertData {
12420 /// List of insertelements to be replaced by shuffles.
12421 SmallVector<InsertElementInst *> InsertElements;
12422 /// The parent vectors and shuffle mask for the given list of inserts.
12424};
12425} // namespace
12426
12427 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
12428 InstructionCost Cost = 0;
12429 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
12430 << VectorizableTree.size() << ".\n");
12431
12432 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12433
12434 SmallPtrSet<Value *, 4> CheckedExtracts;
12435 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
12436 TreeEntry &TE = *VectorizableTree[I];
12437 // No need to count the cost for combined entries; they are combined into
12438 // other nodes, so just skip their cost.
12439 if (TE.State == TreeEntry::CombinedVectorize) {
12440 LLVM_DEBUG(
12441 dbgs() << "SLP: Skipping cost for combined node that starts with "
12442 << *TE.Scalars[0] << ".\n";
12443 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12444 continue;
12445 }
12446 if (TE.isGather()) {
12447 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
12448 E && E->getVectorFactor() == TE.getVectorFactor() &&
12449 E->isSame(TE.Scalars)) {
12450 // Some gather nodes might be exactly the same as some vectorizable
12451 // nodes after reordering; handle that case here.
12452 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
12453 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12454 << "SLP: Current total cost = " << Cost << "\n");
12455 continue;
12456 }
12457 }
12458
12459 // Exclude cost of gather loads nodes which are not used. These nodes were
12460 // built as part of the final attempt to vectorize gathered loads.
12461 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12462 "Expected gather nodes with users only.");
12463
12464 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
12465 Cost += C;
12466 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
12467 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12468 << "SLP: Current total cost = " << Cost << "\n");
12469 }
12470
12471 SmallPtrSet<Value *, 16> ExtractCostCalculated;
12472 InstructionCost ExtractCost = 0;
12474 SmallVector<APInt> DemandedElts;
12475 SmallDenseSet<Value *, 4> UsedInserts;
12477 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12479 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
12480 // Keep track of each {Scalar, Index, User} tuple.
12481 // On AArch64, this helps in fusing a mov instruction, associated with an
12482 // extractelement, with fmul in the backend so that the extractelement is free.
12484 for (ExternalUser &EU : ExternalUses) {
12485 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
12486 }
12487 for (ExternalUser &EU : ExternalUses) {
12488 // Uses by ephemeral values are free (because the ephemeral value will be
12489 // removed prior to code generation, and so the extraction will be
12490 // removed as well).
12491 if (EphValues.count(EU.User))
12492 continue;
12493
12494 // Skip scalars used in unreachable blocks, in EH pads (rarely executed), or
12495 // in blocks terminated with an unreachable instruction.
12496 if (BasicBlock *UserParent =
12497 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
12498 UserParent &&
12499 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
12500 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12501 continue;
12502
12503 // We only add extract cost once for the same scalar.
12504 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12505 !ExtractCostCalculated.insert(EU.Scalar).second)
12506 continue;
12507
12508 // No extract cost for vector "scalar"
12509 if (isa<FixedVectorType>(EU.Scalar->getType()))
12510 continue;
12511
12512 // If the found user is an insertelement, do not calculate the extract cost
12513 // but try to detect it as a final shuffled/identity match.
12514 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12515 VU && VU->getOperand(1) == EU.Scalar) {
12516 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
12517 if (!UsedInserts.insert(VU).second)
12518 continue;
12519 std::optional<unsigned> InsertIdx = getElementIndex(VU);
12520 if (InsertIdx) {
12521 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12522 auto *It = find_if(
12523 ShuffledInserts,
12524 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
12525 // Checks if 2 insertelements are from the same buildvector.
12526 InsertElementInst *VecInsert = Data.InsertElements.front();
12528 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
12529 Value *Op0 = II->getOperand(0);
12530 if (getTreeEntry(II) && !getTreeEntry(Op0))
12531 return nullptr;
12532 return Op0;
12533 });
12534 });
12535 int VecId = -1;
12536 if (It == ShuffledInserts.end()) {
12537 auto &Data = ShuffledInserts.emplace_back();
12538 Data.InsertElements.emplace_back(VU);
12539 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
12540 VecId = ShuffledInserts.size() - 1;
12541 auto It = MinBWs.find(ScalarTE);
12542 if (It != MinBWs.end() &&
12543 VectorCasts
12544 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
12545 .second) {
12546 unsigned BWSz = It->second.first;
12547 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
12548 unsigned VecOpcode;
12549 if (DstBWSz < BWSz)
12550 VecOpcode = Instruction::Trunc;
12551 else
12552 VecOpcode =
12553 It->second.second ? Instruction::SExt : Instruction::ZExt;
12556 VecOpcode, FTy,
12557 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
12558 FTy->getNumElements()),
12560 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12561 << " for extending externally used vector with "
12562 "non-equal minimum bitwidth.\n");
12563 Cost += C;
12564 }
12565 } else {
12566 if (isFirstInsertElement(VU, It->InsertElements.front()))
12567 It->InsertElements.front() = VU;
12568 VecId = std::distance(ShuffledInserts.begin(), It);
12569 }
12570 int InIdx = *InsertIdx;
12571 SmallVectorImpl<int> &Mask =
12572 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12573 if (Mask.empty())
12574 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
12575 Mask[InIdx] = EU.Lane;
12576 DemandedElts[VecId].setBit(InIdx);
12577 continue;
12578 }
12579 }
12580 }
12581
12583 // If we plan to rewrite the tree in a smaller type, we will need to
12584 // extend (sign- or zero-extend) the extracted value back to the original
12585 // type. Here, we account for the extract and the added extend cost if needed.
12586 InstructionCost ExtraCost = TTI::TCC_Free;
12587 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
12588 const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12589 auto It = MinBWs.find(Entry);
12590 if (It != MinBWs.end()) {
12591 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
12592 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
12593 ? Instruction::ZExt
12594 : Instruction::SExt;
12595 VecTy = getWidenedType(MinTy, BundleWidth);
12596 ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
12597 VecTy, EU.Lane);
12598 } else {
12599 ExtraCost =
12600 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
12601 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12602 }
12603 // Leave the scalar instructions as is if they are cheaper than extracts.
12604 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12605 Entry->getOpcode() == Instruction::Load) {
12606 // Checks if the user of the external scalar is a phi in a loop body.
12607 auto IsPhiInLoop = [&](const ExternalUser &U) {
12608 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12609 auto *I = cast<Instruction>(U.Scalar);
12610 const Loop *L = LI->getLoopFor(Phi->getParent());
12611 return L && (Phi->getParent() == I->getParent() ||
12612 L == LI->getLoopFor(I->getParent()));
12613 }
12614 return false;
12615 };
12616 if (!ValueToExtUses) {
12617 ValueToExtUses.emplace();
12618 for_each(enumerate(ExternalUses), [&](const auto &P) {
12619 // Ignore phis in loops.
12620 if (IsPhiInLoop(P.value()))
12621 return;
12622
12623 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
12624 });
12625 }
12626 // We can use the original instruction if none of its operands are
12627 // vectorized, or if they are already marked as externally used.
12628 auto *Inst = cast<Instruction>(EU.Scalar);
12629 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
12630 auto OperandIsScalar = [&](Value *V) {
12631 if (!getTreeEntry(V)) {
12632 // Some extractelements might not be vectorized, but
12633 // transformed into a shuffle and removed from the function;
12634 // consider that here.
12635 if (auto *EE = dyn_cast<ExtractElementInst>(V))
12636 return !EE->hasOneUse() || !MustGather.contains(EE);
12637 return true;
12638 }
12639 return ValueToExtUses->contains(V);
12640 };
12641 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
12642 bool CanBeUsedAsScalarCast = false;
12643 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12644 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
12645 Op && all_of(Op->operands(), OperandIsScalar)) {
12646 InstructionCost OpCost =
12647 (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
12649 : 0;
12650 if (ScalarCost + OpCost <= ExtraCost) {
12651 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
12652 ScalarCost += OpCost;
12653 }
12654 }
12655 }
12656 if (CanBeUsedAsScalar) {
12657 bool KeepScalar = ScalarCost <= ExtraCost;
12658 // Try to keep the original scalar if the user is a phi node from the same
12659 // block as the root phis currently being vectorized. This preserves better
12660 // ordering info for the PHIs being vectorized.
12661 bool IsProfitablePHIUser =
12662 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
12663 VectorizableTree.front()->Scalars.size() > 2)) &&
12664 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12665 !Inst->hasNUsesOrMore(UsesLimit) &&
12666 none_of(Inst->users(),
12667 [&](User *U) {
12668 auto *PHIUser = dyn_cast<PHINode>(U);
12669 return (!PHIUser ||
12670 PHIUser->getParent() !=
12671 cast<Instruction>(
12672 VectorizableTree.front()->getMainOp())
12673 ->getParent()) &&
12674 !getTreeEntry(U);
12675 }) &&
12676 count_if(Entry->Scalars, [&](Value *V) {
12677 return ValueToExtUses->contains(V);
12678 }) <= 2;
12679 if (IsProfitablePHIUser) {
12680 KeepScalar = true;
12681 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
12682 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
12683 (!GatheredLoadsEntriesFirst.has_value() ||
12684 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12685 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
12686 return ValueToExtUses->contains(V);
12687 });
12688 auto It = ExtractsCount.find(Entry);
12689 if (It != ExtractsCount.end()) {
12690 assert(ScalarUsesCount >= It->getSecond().size() &&
12691 "Expected total number of external uses not less than "
12692 "number of scalar uses.");
12693 ScalarUsesCount -= It->getSecond().size();
12694 }
12695 // Keep the original scalar if the number of externally used instructions
12696 // in the same entry is not a power of 2. It may enable some extra
12697 // vectorization for now.
12698 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
12699 }
12700 if (KeepScalar) {
12701 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
12702 for_each(Inst->operands(), [&](Value *V) {
12703 auto It = ValueToExtUses->find(V);
12704 if (It != ValueToExtUses->end()) {
12705 // Replace all uses to avoid compiler crash.
12706 ExternalUses[It->second].User = nullptr;
12707 }
12708 });
12709 ExtraCost = ScalarCost;
12710 if (!IsPhiInLoop(EU))
12711 ExtractsCount[Entry].insert(Inst);
12712 if (CanBeUsedAsScalarCast) {
12713 ScalarOpsFromCasts.insert(Inst->getOperand(0));
12714 // Update the users of the operands of the cast operand to avoid
12715 // compiler crash.
12716 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12717 for_each(IOp->operands(), [&](Value *V) {
12718 auto It = ValueToExtUses->find(V);
12719 if (It != ValueToExtUses->end()) {
12720 // Replace all uses to avoid compiler crash.
12721 ExternalUses[It->second].User = nullptr;
12722 }
12723 });
12724 }
12725 }
12726 }
12727 }
12728 }
12729
12730 ExtractCost += ExtraCost;
12731 }
12732 // Add external uses for the operands of casts that will be emitted as
12733 // scalars instead of extractelements.
12734 for (Value *V : ScalarOpsFromCasts) {
12735 ExternalUsesAsOriginalScalar.insert(V);
12736 if (const TreeEntry *E = getTreeEntry(V)) {
12737 ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
12738 }
12739 }
12740 // Add reduced value cost, if resized.
12741 if (!VectorizedVals.empty()) {
12742 const TreeEntry &Root = *VectorizableTree.front();
12743 auto BWIt = MinBWs.find(&Root);
12744 if (BWIt != MinBWs.end()) {
12745 Type *DstTy = Root.Scalars.front()->getType();
12746 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
12747 unsigned SrcSz =
12748 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12749 if (OriginalSz != SrcSz) {
12750 unsigned Opcode = Instruction::Trunc;
12751 if (OriginalSz > SrcSz)
12752 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12753 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
12754 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12755 assert(SLPReVec && "Only supported by REVEC.");
12756 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
12757 }
12758 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
12761 }
12762 }
12763 }
12764
12765 InstructionCost SpillCost = getSpillCost();
12766 Cost += SpillCost + ExtractCost;
12767 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
12768 bool) {
12769 InstructionCost C = 0;
12770 unsigned VF = Mask.size();
12771 unsigned VecVF = TE->getVectorFactor();
12772 if (VF != VecVF &&
12773 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
12775 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12776 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12777 OrigMask.begin());
12779 getWidenedType(TE->getMainOp()->getType(), VecVF),
12780 OrigMask);
12781 LLVM_DEBUG(
12782 dbgs() << "SLP: Adding cost " << C
12783 << " for final shuffle of insertelement external users.\n";
12784 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12785 Cost += C;
12786 return std::make_pair(TE, true);
12787 }
12788 return std::make_pair(TE, false);
12789 };
12790 // Calculate the cost of the reshuffled vectors, if any.
12791 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
12792 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
12793 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
12794 unsigned VF = 0;
12795 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
12797 assert((TEs.size() == 1 || TEs.size() == 2) &&
12798 "Expected exactly 1 or 2 tree entries.");
12799 if (TEs.size() == 1) {
12800 if (VF == 0)
12801 VF = TEs.front()->getVectorFactor();
12802 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12803 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
12804 !all_of(enumerate(Mask), [=](const auto &Data) {
12805 return Data.value() == PoisonMaskElem ||
12806 (Data.index() < VF &&
12807 static_cast<int>(Data.index()) == Data.value());
12808 })) {
12811 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12812 << " for final shuffle of insertelement "
12813 "external users.\n";
12814 TEs.front()->dump();
12815 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12816 Cost += C;
12817 }
12818 } else {
12819 if (VF == 0) {
12820 if (TEs.front() &&
12821 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12822 VF = TEs.front()->getVectorFactor();
12823 else
12824 VF = Mask.size();
12825 }
12826 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12829 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12830 << " for final shuffle of vector node and external "
12831 "insertelement users.\n";
12832 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12833 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12834 Cost += C;
12835 }
12836 VF = Mask.size();
12837 return TEs.back();
12838 };
12839 (void)performExtractsShuffleAction<const TreeEntry>(
12840 MutableArrayRef(Vector.data(), Vector.size()), Base,
12841 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
12842 EstimateShufflesCost);
12844 cast<FixedVectorType>(
12845 ShuffledInserts[I].InsertElements.front()->getType()),
12846 DemandedElts[I],
12847 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
12848 Cost -= InsertCost;
12849 }
12850
12851 // Add the cost for reduced value resize (if required).
12852 if (ReductionBitWidth != 0) {
12853 assert(UserIgnoreList && "Expected reduction tree.");
12854 const TreeEntry &E = *VectorizableTree.front();
12855 auto It = MinBWs.find(&E);
12856 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12857 unsigned SrcSize = It->second.first;
12858 unsigned DstSize = ReductionBitWidth;
12859 unsigned Opcode = Instruction::Trunc;
12860 if (SrcSize < DstSize) {
12861 bool IsArithmeticExtendedReduction =
12862 all_of(*UserIgnoreList, [](Value *V) {
12863 auto *I = cast<Instruction>(V);
12864 return is_contained({Instruction::Add, Instruction::FAdd,
12865 Instruction::Mul, Instruction::FMul,
12866 Instruction::And, Instruction::Or,
12867 Instruction::Xor},
12868 I->getOpcode());
12869 });
12870 if (IsArithmeticExtendedReduction)
12871 Opcode =
12872 Instruction::BitCast; // Handle it by getExtendedReductionCost
12873 else
12874 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12875 }
12876 if (Opcode != Instruction::BitCast) {
12877 auto *SrcVecTy =
12878 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12879 auto *DstVecTy =
12880 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12881 TTI::CastContextHint CCH = getCastContextHint(E);
12882 InstructionCost CastCost;
12883 switch (E.getOpcode()) {
12884 case Instruction::SExt:
12885 case Instruction::ZExt:
12886 case Instruction::Trunc: {
12887 const TreeEntry *OpTE = getOperandEntry(&E, 0);
12888 CCH = getCastContextHint(*OpTE);
12889 break;
12890 }
12891 default:
12892 break;
12893 }
12894 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
12896 Cost += CastCost;
12897 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
12898 << " for final resize for reduction from " << SrcVecTy
12899 << " to " << DstVecTy << "\n";
12900 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12901 }
12902 }
12903 }
12904
12905#ifndef NDEBUG
12906 SmallString<256> Str;
12907 {
12909 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
12910 << "SLP: Extract Cost = " << ExtractCost << ".\n"
12911 << "SLP: Total Cost = " << Cost << ".\n";
12912 }
12913 LLVM_DEBUG(dbgs() << Str);
12914 if (ViewSLPTree)
12915 ViewGraph(this, "SLP" + F->getName(), false, Str);
12916#endif
12917
12918 return Cost;
12919}
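// Summary note (not part of this file): as computed above, the returned value
// is roughly
//
//   Cost = sum over tree entries of getEntryCost(entry)
//        + ExtractCost   (external scalar users kept as extracts)
//        + SpillCost     (vector values live across calls)
//        - InsertCost    (insertelement buildvectors folded into the final
//                         shuffles of external users)
//        + min-bitwidth / reduction resize cast costs
//
// The caller compares this total against the SLP cost threshold to decide
// whether emitting the vectorized tree is profitable.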
12920
12921 /// Tries to find extractelement instructions with constant indices from a
12922 /// fixed vector type and gathers such instructions into a group, which can
12923 /// very likely be matched as a shuffle of 1 or 2 input vectors. If this
12924 /// attempt is successful, the matched scalars are replaced by poison values
12925 /// in \p VL for future analysis.
12926std::optional<TTI::ShuffleKind>
12927BoUpSLP::tryToGatherSingleRegisterExtractElements(
12929 // Scan list of gathered scalars for extractelements that can be represented
12930 // as shuffles.
12932 SmallVector<int> UndefVectorExtracts;
12933 for (int I = 0, E = VL.size(); I < E; ++I) {
12934 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12935 if (!EI) {
12936 if (isa<UndefValue>(VL[I]))
12937 UndefVectorExtracts.push_back(I);
12938 continue;
12939 }
12940 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12941 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12942 continue;
12943 std::optional<unsigned> Idx = getExtractIndex(EI);
12944 // Undefined index.
12945 if (!Idx) {
12946 UndefVectorExtracts.push_back(I);
12947 continue;
12948 }
12949 if (Idx >= VecTy->getNumElements()) {
12950 UndefVectorExtracts.push_back(I);
12951 continue;
12952 }
12953 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
12954 ExtractMask.reset(*Idx);
12955 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
12956 UndefVectorExtracts.push_back(I);
12957 continue;
12958 }
12959 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
12960 }
12961 // Sort the vector operands by the maximum number of uses in extractelements.
12963 VectorOpToIdx.takeVector();
12964 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
12965 return P1.second.size() > P2.second.size();
12966 });
12967 // Find the best pair of the vectors or a single vector.
12968 const int UndefSz = UndefVectorExtracts.size();
12969 unsigned SingleMax = 0;
12970 unsigned PairMax = 0;
12971 if (!Vectors.empty()) {
12972 SingleMax = Vectors.front().second.size() + UndefSz;
12973 if (Vectors.size() > 1) {
12974 auto *ItNext = std::next(Vectors.begin());
12975 PairMax = SingleMax + ItNext->second.size();
12976 }
12977 }
12978 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
12979 return std::nullopt;
12980 // Check whether it is better to perform a shuffle of 2 vectors or just of a
12981 // single vector.
12982 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
12983 SmallVector<Value *> GatheredExtracts(
12984 VL.size(), PoisonValue::get(VL.front()->getType()));
12985 if (SingleMax >= PairMax && SingleMax) {
12986 for (int Idx : Vectors.front().second)
12987 std::swap(GatheredExtracts[Idx], VL[Idx]);
12988 } else if (!Vectors.empty()) {
12989 for (unsigned Idx : {0, 1})
12990 for (int Idx : Vectors[Idx].second)
12991 std::swap(GatheredExtracts[Idx], VL[Idx]);
12992 }
12993 // Add extracts from undefs too.
12994 for (int Idx : UndefVectorExtracts)
12995 std::swap(GatheredExtracts[Idx], VL[Idx]);
12996 // Check that the gather of extractelements can be represented as just a
12997 // shuffle of one/two vectors that the scalars are extracted from.
12998 std::optional<TTI::ShuffleKind> Res =
12999 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
13000 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
13001 // TODO: try to check other subsets if possible.
13002 // Restore the original VL if the attempt was not successful.
13003 copy(SavedVL, VL.begin());
13004 return std::nullopt;
13005 }
13006 // Restore unused scalars from mask, if some of the extractelements were not
13007 // selected for shuffle.
13008 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
13009 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
13010 isa<UndefValue>(GatheredExtracts[I])) {
13011 std::swap(VL[I], GatheredExtracts[I]);
13012 continue;
13013 }
13014 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
13015 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
13016 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
13017 is_contained(UndefVectorExtracts, I))
13018 continue;
13019 }
13020 return Res;
13021}
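// Illustrative note (hypothetical IR, not part of this file): a gather list
// this matcher can turn into a shuffle description, roughly:
//
//   VL = { extractelement <4 x i32> %v, i32 0,
//          extractelement <4 x i32> %v, i32 1,
//          %x,                                   ; not an extractelement
//          extractelement <4 x i32> %v, i32 3 }
//
// yields Mask = {0, 1, PoisonMaskElem, 3} and a single-source shuffle kind;
// the three matched scalars are replaced by poison in VL, so only %x still
// needs to be gathered afterwards.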
13022
13023 /// Tries to find extractelement instructions with constant indices from a
13024 /// fixed vector type and gathers such instructions into a group, which can
13025 /// very likely be matched as a shuffle of 1 or 2 input vectors. If this
13026 /// attempt is successful, the matched scalars are replaced by poison values
13027 /// in \p VL for future analysis.
13029BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
13031 unsigned NumParts) const {
13032 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
13033 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
13034 Mask.assign(VL.size(), PoisonMaskElem);
13035 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13036 for (unsigned Part : seq<unsigned>(NumParts)) {
13037 // Scan list of gathered scalars for extractelements that can be represented
13038 // as shuffles.
13040 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13041 SmallVector<int> SubMask;
13042 std::optional<TTI::ShuffleKind> Res =
13043 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
13044 ShufflesRes[Part] = Res;
13045 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
13046 }
13047 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
13048 return Res.has_value();
13049 }))
13050 ShufflesRes.clear();
13051 return ShufflesRes;
13052}
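// Worked example (not part of this file): the per-part loop above simply runs
// the single-register matcher on each slice. With VL.size() == 8 and
// NumParts == 2, the two 4-element halves are matched independently, each
// sub-mask is copied into the matching half of Mask, and a half with no match
// keeps std::nullopt in ShufflesRes.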
13053
13054std::optional<TargetTransformInfo::ShuffleKind>
13055BoUpSLP::isGatherShuffledSingleRegisterEntry(
13056 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
13057 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
13058 Entries.clear();
13059 // TODO: currently checking only for Scalars in the tree entry, need to count
13060 // reused elements too for better cost estimation.
13061 const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
13062 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
13063 : TE->UserTreeIndices.front();
13064 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13065 const BasicBlock *TEInsertBlock = nullptr;
13066 // Main node of PHI entries keeps the correct order of operands/incoming
13067 // blocks.
13068 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13069 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13070 TEInsertPt = TEInsertBlock->getTerminator();
13071 } else {
13072 TEInsertBlock = TEInsertPt->getParent();
13073 }
13074 if (!DT->isReachableFromEntry(TEInsertBlock))
13075 return std::nullopt;
13076 auto *NodeUI = DT->getNode(TEInsertBlock);
13077 assert(NodeUI && "Should only process reachable instructions");
13078 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
13079 auto CheckOrdering = [&](const Instruction *InsertPt) {
13080 // Argument InsertPt is an instruction where vector code for some other
13081 // tree entry (one that shares one or more scalars with TE) is going to be
13082 // generated. This lambda returns true if insertion point of vector code
13083 // for the TE dominates that point (otherwise dependency is the other way
13084 // around). The other node is not limited to be of a gather kind. Gather
13085 // nodes are not scheduled and their vector code is inserted before their
13086 // first user. If user is PHI, that is supposed to be at the end of a
13087 // predecessor block. Otherwise it is the last instruction among scalars of
13088 // the user node. So, instead of checking dependency between instructions
13089 // themselves, we check dependency between their insertion points for vector
13090 // code (since each scalar instruction ends up as a lane of a vector
13091 // instruction).
13092 const BasicBlock *InsertBlock = InsertPt->getParent();
13093 auto *NodeEUI = DT->getNode(InsertBlock);
13094 if (!NodeEUI)
13095 return false;
13096 assert((NodeUI == NodeEUI) ==
13097 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13098 "Different nodes should have different DFS numbers");
13099 // Check the order of the gather nodes users.
13100 if (TEInsertPt->getParent() != InsertBlock &&
13101 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
13102 return false;
13103 if (TEInsertPt->getParent() == InsertBlock &&
13104 TEInsertPt->comesBefore(InsertPt))
13105 return false;
13106 return true;
13107 };
13108 // Find all tree entries used by the gathered values. If no common entries
13109 // are found - not a shuffle.
13110 // Here we build a set of tree nodes for each gathered value and try to
13111 // find the intersection between these sets. If we have at least one common
13112 // tree node for each gathered value - we have just a permutation of a
13113 // single vector. If we have 2 different sets, we're in a situation where we
13114 // have a permutation of 2 input vectors.
13116 DenseMap<Value *, int> UsedValuesEntry;
13117 for (Value *V : VL) {
13118 if (isConstant(V))
13119 continue;
13120 // Build a list of tree entries where V is used.
13122 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13123 if (TEPtr == TE || TEPtr->Idx == 0)
13124 continue;
13125 assert(any_of(TEPtr->Scalars,
13126 [&](Value *V) { return GatheredScalars.contains(V); }) &&
13127 "Must contain at least single gathered value.");
13128 assert(TEPtr->UserTreeIndices.size() == 1 &&
13129 "Expected only single user of a gather node.");
13130 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13131
13132 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13133 const Instruction *InsertPt =
13134 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
13135 : &getLastInstructionInBundle(UseEI.UserTE);
13136 if (TEInsertPt == InsertPt) {
13137 // If 2 gathers are operands of the same entry (regardless of whether
13138 // the user is a PHI or not), compare operand indices and use the earlier
13139 // one as the base.
13140 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13141 continue;
13142 // If the user instruction is used for some reason in different
13143 // vectorized nodes - make it depend on index.
13144 if (TEUseEI.UserTE != UseEI.UserTE &&
13145 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13146 continue;
13147 }
13148
13149 // Check if the user node of the TE comes after user node of TEPtr,
13150 // otherwise TEPtr depends on TE.
13151 if ((TEInsertBlock != InsertPt->getParent() ||
13152 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13153 !CheckOrdering(InsertPt))
13154 continue;
13155 VToTEs.insert(TEPtr);
13156 }
13157 if (const TreeEntry *VTE = getTreeEntry(V)) {
13158 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13159 if (VTE->State != TreeEntry::Vectorize) {
13160 auto It = MultiNodeScalars.find(V);
13161 if (It == MultiNodeScalars.end())
13162 continue;
13163 VTE = *It->getSecond().begin();
13164 // Iterate through all vectorized nodes.
13165 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
13166 return MTE->State == TreeEntry::Vectorize;
13167 });
13168 if (MIt == It->getSecond().end())
13169 continue;
13170 VTE = *MIt;
13171 }
13172 }
13173 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13174 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13175 continue;
13176 VToTEs.insert(VTE);
13177 }
13178 if (VToTEs.empty())
13179 continue;
13180 if (UsedTEs.empty()) {
13181 // The first iteration, just insert the list of nodes to vector.
13182 UsedTEs.push_back(VToTEs);
13183 UsedValuesEntry.try_emplace(V, 0);
13184 } else {
13185 // Need to check if there are any previously used tree nodes which use V.
13186 // If there are no such nodes, consider that we have one more input
13187 // vector.
13188 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
13189 unsigned Idx = 0;
13190 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
13191 // Do we have a non-empty intersection of previously listed tree entries
13192 // and tree entries using current V?
13193 set_intersect(VToTEs, Set);
13194 if (!VToTEs.empty()) {
13195 // Yes, write the new subset and continue analysis for the next
13196 // scalar.
13197 Set.swap(VToTEs);
13198 break;
13199 }
13200 VToTEs = SavedVToTEs;
13201 ++Idx;
13202 }
13203 // No non-empty intersection found - need to add a second set of possible
13204 // source vectors.
13205 if (Idx == UsedTEs.size()) {
13206 // If the number of input vectors is greater than 2 - not a permutation,
13207 // fall back to the regular gather.
13208 // TODO: support multiple reshuffled nodes.
13209 if (UsedTEs.size() == 2)
13210 continue;
13211 UsedTEs.push_back(SavedVToTEs);
13212 Idx = UsedTEs.size() - 1;
13213 }
13214 UsedValuesEntry.try_emplace(V, Idx);
13215 }
13216 }
13217
13218 if (UsedTEs.empty()) {
13219 Entries.clear();
13220 return std::nullopt;
13221 }
13222
13223 unsigned VF = 0;
13224 if (UsedTEs.size() == 1) {
13225 // Keep the order to avoid non-determinism.
13226 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
13227 UsedTEs.front().end());
13228 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13229 return TE1->Idx < TE2->Idx;
13230 });
13231 // Try to find the perfect match in another gather node at first.
13232 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
13233 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
13234 });
13235 if (It != FirstEntries.end() &&
13236 ((*It)->getVectorFactor() == VL.size() ||
13237 ((*It)->getVectorFactor() == TE->Scalars.size() &&
13238 TE->ReuseShuffleIndices.size() == VL.size() &&
13239 (*It)->isSame(TE->Scalars)))) {
13240 Entries.push_back(*It);
13241 if ((*It)->getVectorFactor() == VL.size()) {
13242 std::iota(std::next(Mask.begin(), Part * VL.size()),
13243 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
13244 } else {
13245 SmallVector<int> CommonMask = TE->getCommonMask();
13246 copy(CommonMask, Mask.begin());
13247 }
13248 // Clear undef scalars.
13249 for (unsigned I : seq<unsigned>(VL.size()))
13250 if (isa<PoisonValue>(VL[I]))
13251 Mask[Part * VL.size() + I] = PoisonMaskElem;
13253 }
13254 // No perfect match, just shuffle, so choose the first tree node from the
13255 // tree.
13256 Entries.push_back(FirstEntries.front());
13257 VF = FirstEntries.front()->getVectorFactor();
13258 } else {
13259 // Try to find nodes with the same vector factor.
13260 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
13261 // Keep the order of tree nodes to avoid non-determinism.
13263 for (const TreeEntry *TE : UsedTEs.front()) {
13264 unsigned VF = TE->getVectorFactor();
13265 auto It = VFToTE.find(VF);
13266 if (It != VFToTE.end()) {
13267 if (It->second->Idx > TE->Idx)
13268 It->getSecond() = TE;
13269 continue;
13270 }
13271 VFToTE.try_emplace(VF, TE);
13272 }
13273 // Same, keep the order to avoid non-determinism.
13274 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
13275 UsedTEs.back().end());
13276 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13277 return TE1->Idx < TE2->Idx;
13278 });
13279 for (const TreeEntry *TE : SecondEntries) {
13280 auto It = VFToTE.find(TE->getVectorFactor());
13281 if (It != VFToTE.end()) {
13282 VF = It->first;
13283 Entries.push_back(It->second);
13284 Entries.push_back(TE);
13285 break;
13286 }
13287 }
13288 // No 2 source vectors with the same vector factor - just choose 2 with max
13289 // index.
13290 if (Entries.empty()) {
13291 Entries.push_back(*llvm::max_element(
13292 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
13293 return TE1->Idx < TE2->Idx;
13294 }));
13295 Entries.push_back(SecondEntries.front());
13296 VF = std::max(Entries.front()->getVectorFactor(),
13297 Entries.back()->getVectorFactor());
13298 } else {
13299 VF = Entries.front()->getVectorFactor();
13300 }
13301 }
13302
13303 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
13304 // Checks if the 2 PHIs are compatible, i.e. highly likely to be vectorized
13305 // together.
13306 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
13307 auto *PHI = cast<PHINode>(V);
13308 auto *PHI1 = cast<PHINode>(V1);
13309 // Check that all incoming values are compatible/from the same parent (if
13310 // they are instructions).
13311 // The incoming values are compatible if they are all constants, or
13312 // instructions with the same/alternate opcodes from the same basic block.
13313 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
13314 Value *In = PHI->getIncomingValue(I);
13315 Value *In1 = PHI1->getIncomingValue(I);
13316 if (isConstant(In) && isConstant(In1))
13317 continue;
13318 if (!getSameOpcode({In, In1}, *TLI))
13319 return false;
13320 if (cast<Instruction>(In)->getParent() !=
13321 cast<Instruction>(In1)->getParent())
13322 return false;
13323 }
13324 return true;
13325 };
13326 // Check if the value can be ignored during analysis for shuffled gathers.
13327 // We suppose it is better to ignore instructions which do not form splats,
13328 // are not vectorized/not extractelements (these instructions will be handled
13329 // by extractelements processing) or may form a vector node in the future.
13330 auto MightBeIgnored = [=](Value *V) {
13331 auto *I = dyn_cast<Instruction>(V);
13332 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
13334 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
13335 };
13336 // Check that the neighbor instruction may form a full vector node with the
13337 // current instruction V. It is possible if they have the same/alternate
13338 // opcode and the same parent basic block.
13339 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
13340 Value *V1 = VL[Idx];
13341 bool UsedInSameVTE = false;
13342 auto It = UsedValuesEntry.find(V1);
13343 if (It != UsedValuesEntry.end())
13344 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
13345 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13346 getSameOpcode({V, V1}, *TLI) &&
13347 cast<Instruction>(V)->getParent() ==
13348 cast<Instruction>(V1)->getParent() &&
13349 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13350 };
13351 // Build a shuffle mask for better cost estimation and vector emission.
13352 SmallBitVector UsedIdxs(Entries.size());
13354 for (int I = 0, E = VL.size(); I < E; ++I) {
13355 Value *V = VL[I];
13356 auto It = UsedValuesEntry.find(V);
13357 if (It == UsedValuesEntry.end())
13358 continue;
13359 // Do not try to shuffle scalars if they are constants, or instructions
13360 // that may still be vectorized as part of a subsequent buildvector
13361 // vectorization.
13362 if (isConstant(V) || (MightBeIgnored(V) &&
13363 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
13364 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
13365 continue;
13366 unsigned Idx = It->second;
13367 EntryLanes.emplace_back(Idx, I);
13368 UsedIdxs.set(Idx);
13369 }
13370 // Iterate through all shuffled scalars and select entries, which can be used
13371 // for final shuffle.
13373 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
13374 if (!UsedIdxs.test(I))
13375 continue;
13376 // Fix the entry number for the given scalar. If it is the first entry, set
13377 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
13378 // These indices are used when calculating final shuffle mask as the vector
13379 // offset.
13380 for (std::pair<unsigned, int> &Pair : EntryLanes)
13381 if (Pair.first == I)
13382 Pair.first = TempEntries.size();
13383 TempEntries.push_back(Entries[I]);
13384 }
13385 Entries.swap(TempEntries);
13386 if (EntryLanes.size() == Entries.size() &&
13387 !VL.equals(ArrayRef(TE->Scalars)
13388 .slice(Part * VL.size(),
13389 std::min<int>(VL.size(), TE->Scalars.size())))) {
13390 // We may have here 1 or 2 entries only. If the number of scalars is equal
13391 // to the number of entries, there is no need to do the analysis, it is not
13392 // very profitable. Since VL is not the same as TE->Scalars, it means we
13393 // already have some shuffles before. Cut off the non-profitable case.
13394 Entries.clear();
13395 return std::nullopt;
13396 }
13397 // Build the final mask, check for the identity shuffle, if possible.
13398 bool IsIdentity = Entries.size() == 1;
13399 // Pair.first is the offset to the vector, while Pair.second is the index of
13400 // scalar in the list.
13401 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13402 unsigned Idx = Part * VL.size() + Pair.second;
13403 Mask[Idx] =
13404 Pair.first * VF +
13405 (ForOrder ? std::distance(
13406 Entries[Pair.first]->Scalars.begin(),
13407 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13408 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13409 IsIdentity &= Mask[Idx] == Pair.second;
13410 }
13411 if (ForOrder || IsIdentity || Entries.empty()) {
13412 switch (Entries.size()) {
13413 case 1:
13414 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13416 break;
13417 case 2:
13418 if (EntryLanes.size() > 2 || VL.size() <= 2)
13420 break;
13421 default:
13422 break;
13423 }
13424 } else if (!isa<VectorType>(VL.front()->getType()) &&
13425 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13426 // Do the cost estimation if a shuffle is more beneficial than a buildvector.
13427 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13428 std::next(Mask.begin(), (Part + 1) * VL.size()));
13429 int MinElement = SubMask.front(), MaxElement = SubMask.front();
13430 for (int Idx : SubMask) {
13431 if (Idx == PoisonMaskElem)
13432 continue;
13433 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
13434 MinElement = Idx;
13435 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
13436 MaxElement = Idx;
13437 }
13438 assert(MaxElement >= 0 && MinElement >= 0 &&
13439 MaxElement % VF >= MinElement % VF &&
13440 "Expected at least single element.");
13441 unsigned NewVF = std::max<unsigned>(
13442 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13443 (MaxElement % VF) -
13444 (MinElement % VF) + 1));
13445 if (NewVF < VF) {
13446 for_each(SubMask, [&](int &Idx) {
13447 if (Idx == PoisonMaskElem)
13448 return;
13449 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
13450 (Idx >= static_cast<int>(VF) ? NewVF : 0);
13451 });
13452 } else {
13453 NewVF = VF;
13454 }
13455
13457 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
13458 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
13459 auto GetShuffleCost = [&,
13462 VectorType *VecTy) -> InstructionCost {
13463 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13465 Mask, Entries.front()->getInterleaveFactor()))
13466 return TTI::TCC_Free;
13467 return ::getShuffleCost(TTI,
13468 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
13470 VecTy, Mask, CostKind);
13471 };
13472 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13473 InstructionCost FirstShuffleCost = 0;
13474 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13475 if (Entries.size() == 1 || !Entries[0]->isGather()) {
13476 FirstShuffleCost = ShuffleCost;
13477 } else {
13478 // Transform the mask to include only the first entry.
13479 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13480 bool IsIdentity = true;
13481 for (auto [I, Idx] : enumerate(FirstMask)) {
13482 if (Idx >= static_cast<int>(NewVF)) {
13484 } else {
13485 DemandedElts.clearBit(I);
13486 if (Idx != PoisonMaskElem)
13487 IsIdentity &= static_cast<int>(I) == Idx;
13488 }
13489 }
13490 if (!IsIdentity)
13491 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13492 FirstShuffleCost += TTI->getScalarizationOverhead(
13493 MaskVecTy, DemandedElts, /*Insert=*/true,
13494 /*Extract=*/false, CostKind);
13495 }
13496 InstructionCost SecondShuffleCost = 0;
13497 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13498 if (Entries.size() == 1 || !Entries[1]->isGather()) {
13499 SecondShuffleCost = ShuffleCost;
13500 } else {
13501 // Transform the mask to include only the second entry.
13502 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13503 bool IsIdentity = true;
13504 for (auto [I, Idx] : enumerate(SecondMask)) {
13505 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
13507 } else {
13508 DemandedElts.clearBit(I);
13509 if (Idx != PoisonMaskElem) {
13510 Idx -= NewVF;
13511 IsIdentity &= static_cast<int>(I) == Idx;
13512 }
13513 }
13514 }
13515 if (!IsIdentity)
13516 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13517 SecondShuffleCost += TTI->getScalarizationOverhead(
13518 MaskVecTy, DemandedElts, /*Insert=*/true,
13519 /*Extract=*/false, CostKind);
13520 }
13521 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13522 for (auto [I, Idx] : enumerate(SubMask))
13523 if (Idx == PoisonMaskElem)
13524 DemandedElts.clearBit(I);
13525 InstructionCost BuildVectorCost =
13526 TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
13527 /*Extract=*/false, CostKind);
13528 const TreeEntry *BestEntry = nullptr;
13529 if (FirstShuffleCost < ShuffleCost) {
13530 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13531 std::next(Mask.begin(), (Part + 1) * VL.size()),
13532 [&](int &Idx) {
13533 if (Idx >= static_cast<int>(VF))
13534 Idx = PoisonMaskElem;
13535 });
13536 BestEntry = Entries.front();
13537 ShuffleCost = FirstShuffleCost;
13538 }
13539 if (SecondShuffleCost < ShuffleCost) {
13540 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13541 std::next(Mask.begin(), (Part + 1) * VL.size()),
13542 [&](int &Idx) {
13543 if (Idx < static_cast<int>(VF))
13544 Idx = PoisonMaskElem;
13545 else
13546 Idx -= VF;
13547 });
13548 BestEntry = Entries[1];
13549 ShuffleCost = SecondShuffleCost;
13550 }
13551 if (BuildVectorCost >= ShuffleCost) {
13552 if (BestEntry) {
13553 Entries.clear();
13554 Entries.push_back(BestEntry);
13555 }
13556 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
13557 : TargetTransformInfo::SK_PermuteSingleSrc;
13558 }
13559 }
13560 Entries.clear();
13561 // Clear the corresponding mask elements.
13562 std::fill(std::next(Mask.begin(), Part * VL.size()),
13563 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
13564 return std::nullopt;
13565}
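// Illustrative sketch (standalone, not taken from this pass): the SubMask
// renormalization above, applied to concrete values. Assume two 8-wide source
// registers (VF == 8) and SubMask = {4, 5, 12, 13}, i.e. only lanes 4..5 of
// each source are used. Then MinElement % VF == 4, MaxElement % VF == 5 and
// NewVF becomes 4 (VL.size()), so both sources are narrowed to 4-wide slices:
//   // Requires <vector>; PoisonMaskElem is assumed to be negative here.
//   int VF = 8, NewVF = 4, MinElement = 4;
//   std::vector<int> SubMask = {4, 5, 12, 13};
//   for (int &Idx : SubMask) {
//     if (Idx < 0)
//       continue;
//     Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
//           (Idx >= VF ? NewVF : 0);
//   }
//   // SubMask == {0, 1, 4, 5}: a two-source shuffle over 4-wide vectors.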
13566
13567 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
13568BoUpSLP::isGatherShuffledEntry(
13569 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
13570 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
13571 bool ForOrder) {
13572 assert(NumParts > 0 && NumParts < VL.size() &&
13573 "Expected positive number of registers.");
13574 Entries.clear();
13575 // No need to check for the topmost gather node.
13576 if (TE == VectorizableTree.front().get() &&
13577 (!GatheredLoadsEntriesFirst.has_value() ||
13578 none_of(ArrayRef(VectorizableTree).drop_front(),
13579 [](const std::unique_ptr<TreeEntry> &TE) {
13580 return !TE->isGather();
13581 })))
13582 return {};
13583 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
13584 if (TE->isNonPowOf2Vec())
13585 return {};
13586 Mask.assign(VL.size(), PoisonMaskElem);
13587 assert((TE->UserTreeIndices.size() == 1 ||
13588 TE == VectorizableTree.front().get()) &&
13589 "Expected only single user of the gather node.");
13590 assert(VL.size() % NumParts == 0 &&
13591 "Number of scalars must be divisible by NumParts.");
13592 if (!TE->UserTreeIndices.empty() &&
13593 TE->UserTreeIndices.front().UserTE->isGather() &&
13594 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13595 assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
13596 isSplat(TE->Scalars)) &&
13597 "Expected splat or extractelements only node.");
13598 return {};
13599 }
13600 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13601 SmallVector<std::optional<TTI::ShuffleKind>> Res;
13602 for (unsigned Part : seq<unsigned>(NumParts)) {
13603 ArrayRef<Value *> SubVL =
13604 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13605 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
13606 std::optional<TTI::ShuffleKind> SubRes =
13607 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13608 ForOrder);
13609 if (!SubRes)
13610 SubEntries.clear();
13611 Res.push_back(SubRes);
13612 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
13613 SubEntries.front()->getVectorFactor() == VL.size() &&
13614 (SubEntries.front()->isSame(TE->Scalars) ||
13615 SubEntries.front()->isSame(VL))) {
13616 SmallVector<const TreeEntry *> LocalSubEntries;
13617 LocalSubEntries.swap(SubEntries);
13618 Entries.clear();
13619 Res.clear();
13620 std::iota(Mask.begin(), Mask.end(), 0);
13621 // Clear undef scalars.
13622 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
13623 if (isa<PoisonValue>(VL[I]))
13624 Mask[I] = PoisonMaskElem;
13625 Entries.emplace_back(1, LocalSubEntries.front());
13626 Res.push_back(TTI::SK_PermuteSingleSrc);
13627 return Res;
13628 }
13629 }
13630 if (all_of(Res,
13631 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
13632 Entries.clear();
13633 return {};
13634 }
13635 return Res;
13636}
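// Illustrative sketch (assumed values, not from a real run): the per-part
// decomposition performed above. With VL.size() == 8 and NumParts == 2,
// SliceSize is 4 and VL[0..3] and VL[4..7] are analyzed independently. One
// possible outcome is
//   Res     = { SK_PermuteSingleSrc, std::nullopt }
//   Entries = { { TE },              { }          }
//   Mask    = { 2, 3, 0, 1, -1, -1, -1, -1 }
// where TE is a hypothetical already-vectorized tree entry: the first half can
// be produced by permuting TE's vector, while the second half still has to be
// gathered from scalars (its mask slice stays poison).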
13637
13638InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
13639 Type *ScalarTy) const {
13640 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13641 bool DuplicateNonConst = false;
13642 // Find the cost of inserting/extracting values from the vector.
13643 // Check if the same elements are inserted several times and count them as
13644 // shuffle candidates.
13645 APInt ShuffledElements = APInt::getZero(VL.size());
13646 DenseMap<Value *, unsigned> UniqueElements;
13647 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13648 InstructionCost Cost;
13649 auto EstimateInsertCost = [&](unsigned I, Value *V) {
13650 if (V->getType() != ScalarTy) {
13651 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
13652 TTI::CastContextHint::None, CostKind);
13653 V = nullptr;
13654 }
13655 if (!ForPoisonSrc)
13656 Cost +=
13657 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
13658 I, Constant::getNullValue(VecTy), V);
13659 };
13660 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13661 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
13662 Value *V = VL[I];
13663 // No need to shuffle duplicates for constants.
13664 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
13665 ShuffledElements.setBit(I);
13666 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
13667 continue;
13668 }
13669
13670 auto Res = UniqueElements.try_emplace(V, I);
13671 if (Res.second) {
13672 EstimateInsertCost(I, V);
13673 ShuffleMask[I] = I;
13674 continue;
13675 }
13676
13677 DuplicateNonConst = true;
13678 ShuffledElements.setBit(I);
13679 ShuffleMask[I] = Res.first->second;
13680 }
13681 if (ForPoisonSrc) {
13682 if (isa<FixedVectorType>(ScalarTy)) {
13683 assert(SLPReVec && "Only supported by REVEC.");
13684 // We don't need to insert elements one by one. Instead, we can insert the
13685 // entire vector into the destination.
13686 Cost = 0;
13687 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13688 for (unsigned I : seq<unsigned>(VL.size()))
13689 if (!ShuffledElements[I])
13690 Cost += TTI->getShuffleCost(
13691 TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
13692 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13693 } else {
13694 Cost = TTI->getScalarizationOverhead(VecTy,
13695 /*DemandedElts*/ ~ShuffledElements,
13696 /*Insert*/ true,
13697 /*Extract*/ false, CostKind, VL);
13698 }
13699 }
13700 if (DuplicateNonConst)
13701 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
13702 VecTy, ShuffleMask);
13703 return Cost;
13704}
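// Illustrative sketch (assumed values): the duplicate handling in the loop
// above. For VL = {%a, %b, %a, %b}:
//   lane 0: %a is new  -> ShuffleMask[0] = 0
//   lane 1: %b is new  -> ShuffleMask[1] = 1
//   lane 2: %a repeats -> DuplicateNonConst = true, ShuffledElements[2] set,
//                         ShuffleMask[2] = 0
//   lane 3: %b repeats -> ShuffledElements[3] set, ShuffleMask[3] = 1
// Because DuplicateNonConst is set, one SK_PermuteSingleSrc shuffle with mask
// {0, 1, 0, 1} is costed on top of the insertion/scalarization cost.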
13705
13706Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
13707 auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
13708 if (Res)
13709 return *Res;
13710 // Get the basic block this bundle is in. All instructions in the bundle
13711 // should be in this block (except for extractelement-like instructions with
13712 // constant indices or gathered loads).
13713 auto *Front = E->getMainOp();
13714 auto *BB = Front->getParent();
13715 assert(((GatheredLoadsEntriesFirst.has_value() &&
13716 E->getOpcode() == Instruction::Load && E->isGather() &&
13717 E->Idx < *GatheredLoadsEntriesFirst) ||
13718 all_of(E->Scalars,
13719 [=](Value *V) -> bool {
13720 if (E->getOpcode() == Instruction::GetElementPtr &&
13721 !isa<GetElementPtrInst>(V))
13722 return true;
13723 auto *I = dyn_cast<Instruction>(V);
13724 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13725 isVectorLikeInstWithConstOps(I);
13726 })) &&
13727 "Expected gathered loads or GEPs or instructions from same basic "
13728 "block.");
13729
13730 auto FindLastInst = [&]() {
13731 Instruction *LastInst = Front;
13732 for (Value *V : E->Scalars) {
13733 auto *I = dyn_cast<Instruction>(V);
13734 if (!I)
13735 continue;
13736 if (LastInst->getParent() == I->getParent()) {
13737 if (LastInst->comesBefore(I))
13738 LastInst = I;
13739 continue;
13740 }
13741 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13742 !isa<GetElementPtrInst>(I)) ||
13743 (isVectorLikeInstWithConstOps(LastInst) &&
13744 isVectorLikeInstWithConstOps(I)) ||
13745 (GatheredLoadsEntriesFirst.has_value() &&
13746 E->getOpcode() == Instruction::Load && E->isGather() &&
13747 E->Idx < *GatheredLoadsEntriesFirst)) &&
13748 "Expected vector-like or non-GEP in GEP node insts only.");
13749 if (!DT->isReachableFromEntry(LastInst->getParent())) {
13750 LastInst = I;
13751 continue;
13752 }
13753 if (!DT->isReachableFromEntry(I->getParent()))
13754 continue;
13755 auto *NodeA = DT->getNode(LastInst->getParent());
13756 auto *NodeB = DT->getNode(I->getParent());
13757 assert(NodeA && "Should only process reachable instructions");
13758 assert(NodeB && "Should only process reachable instructions");
13759 assert((NodeA == NodeB) ==
13760 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13761 "Different nodes should have different DFS numbers");
13762 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13763 LastInst = I;
13764 }
13765 BB = LastInst->getParent();
13766 return LastInst;
13767 };
13768
13769 auto FindFirstInst = [&]() {
13770 Instruction *FirstInst = Front;
13771 for (Value *V : E->Scalars) {
13772 auto *I = dyn_cast<Instruction>(V);
13773 if (!I)
13774 continue;
13775 if (FirstInst->getParent() == I->getParent()) {
13776 if (I->comesBefore(FirstInst))
13777 FirstInst = I;
13778 continue;
13779 }
13780 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13781 !isa<GetElementPtrInst>(I)) ||
13782 (isVectorLikeInstWithConstOps(FirstInst) &&
13783 isVectorLikeInstWithConstOps(I))) &&
13784 "Expected vector-like or non-GEP in GEP node insts only.");
13785 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
13786 FirstInst = I;
13787 continue;
13788 }
13789 if (!DT->isReachableFromEntry(I->getParent()))
13790 continue;
13791 auto *NodeA = DT->getNode(FirstInst->getParent());
13792 auto *NodeB = DT->getNode(I->getParent());
13793 assert(NodeA && "Should only process reachable instructions");
13794 assert(NodeB && "Should only process reachable instructions");
13795 assert((NodeA == NodeB) ==
13796 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13797 "Different nodes should have different DFS numbers");
13798 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13799 FirstInst = I;
13800 }
13801 return FirstInst;
13802 };
13803
13804 // Set insertpoint for gathered loads to the very first load.
13805 if (GatheredLoadsEntriesFirst.has_value() &&
13806 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13807 E->getOpcode() == Instruction::Load) {
13808 Res = FindFirstInst();
13809 return *Res;
13810 }
13811
13812 // Set the insert point to the beginning of the basic block if the entry
13813 // should not be scheduled.
13814 if (doesNotNeedToSchedule(E->Scalars) ||
13815 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
13816 if ((E->getOpcode() == Instruction::GetElementPtr &&
13817 any_of(E->Scalars,
13818 [](Value *V) {
13819 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13820 })) ||
13821 all_of(E->Scalars,
13822 [](Value *V) {
13823 return isa<PoisonValue>(V) ||
13824 (!isVectorLikeInstWithConstOps(V) &&
13825 isUsedOutsideBlock(V));
13826 }) ||
13827 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
13828 return isa<ExtractElementInst, UndefValue>(V) ||
13829 areAllOperandsNonInsts(V);
13830 })))
13831 Res = FindLastInst();
13832 else
13833 Res = FindFirstInst();
13834 return *Res;
13835 }
13836
13837 // Find the last instruction. The common case should be that BB has been
13838 // scheduled, and the last instruction is VL.back(). So we start with
13839 // VL.back() and iterate over schedule data until we reach the end of the
13840 // bundle. The end of the bundle is marked by null ScheduleData.
13841 if (BlocksSchedules.count(BB) && !E->isGather()) {
13842 Value *V = E->isOneOf(E->Scalars.back());
13843 if (doesNotNeedToBeScheduled(V))
13844 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
13845 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13846 if (Bundle && Bundle->isPartOfBundle())
13847 for (; Bundle; Bundle = Bundle->NextInBundle)
13848 Res = Bundle->Inst;
13849 }
13850
13851 // LastInst can still be null at this point if there's either not an entry
13852 // for BB in BlocksSchedules or there's no ScheduleData available for
13853 // VL.back(). This can be the case if buildTree_rec aborts for various
13854 // reasons (e.g., the maximum recursion depth is reached, the maximum region
13855 // size is reached, etc.). ScheduleData is initialized in the scheduling
13856 // "dry-run".
13857 //
13858 // If this happens, we can still find the last instruction by brute force. We
13859 // iterate forwards from Front (inclusive) until we either see all
13860 // instructions in the bundle or reach the end of the block. If Front is the
13861 // last instruction in program order, LastInst will be set to Front, and we
13862 // will visit all the remaining instructions in the block.
13863 //
13864 // One of the reasons we exit early from buildTree_rec is to place an upper
13865 // bound on compile-time. Thus, taking an additional compile-time hit here is
13866 // not ideal. However, this should be exceedingly rare since it requires that
13867 // we both exit early from buildTree_rec and that the bundle be out-of-order
13868 // (causing us to iterate all the way to the end of the block).
13869 if (!Res)
13870 Res = FindLastInst();
13871 assert(Res && "Failed to find last instruction in bundle");
13872 return *Res;
13873}
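// Illustrative sketch (assumed CFG): how FindLastInst above orders scalars
// that live in different blocks. Suppose BB1 dominates BB2 and their
// dominator-tree DFS-in numbers are 1 and 2. For a bundle {%x in BB1, %y in
// BB2}, NodeA (the block of the current LastInst, BB1) has the smaller DFS
// number, so LastInst advances to %y; within a single block the comesBefore()
// check keeps the instruction that is later in program order.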
13874
13875void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
13876 auto *Front = E->getMainOp();
13877 Instruction *LastInst = &getLastInstructionInBundle(E);
13878 assert(LastInst && "Failed to find last instruction in bundle");
13879 BasicBlock::iterator LastInstIt = LastInst->getIterator();
13880 // If the instruction is PHI, set the insert point after all the PHIs.
13881 bool IsPHI = isa<PHINode>(LastInst);
13882 if (IsPHI)
13883 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
13884 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
13885 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
13886 } else {
13887 // Set the insertion point after the last instruction in the bundle. Set the
13888 // debug location to Front.
13889 Builder.SetInsertPoint(
13890 LastInst->getParent(),
13891 LastInst->getNextNonDebugInstruction()->getIterator());
13892 }
13893 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13894}
13895
13896Value *BoUpSLP::gather(
13897 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
13898 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
13899 // List of instructions/lanes from current block and/or the blocks which are
13900 // part of the current loop. These instructions will be inserted at the end to
13901 // make it possible to optimize loops and hoist invariant instructions out of
13902 // the loop's body with better chances for success.
13903 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
13904 SmallSet<int, 4> PostponedIndices;
13905 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
13906 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
13907 SmallPtrSet<BasicBlock *, 4> Visited;
13908 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
13909 InsertBB = InsertBB->getSinglePredecessor();
13910 return InsertBB && InsertBB == InstBB;
13911 };
13912 for (int I = 0, E = VL.size(); I < E; ++I) {
13913 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
13914 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13915 getTreeEntry(Inst) ||
13916 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
13917 PostponedIndices.insert(I).second)
13918 PostponedInsts.emplace_back(Inst, I);
13919 }
13920
13921 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
13922 Type *Ty) {
13923 Value *Scalar = V;
13924 if (Scalar->getType() != Ty) {
13925 assert(Scalar->getType()->isIntOrIntVectorTy() &&
13926 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
13927 Value *V = Scalar;
13928 if (auto *CI = dyn_cast<CastInst>(Scalar);
13929 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13930 Value *Op = CI->getOperand(0);
13931 if (auto *IOp = dyn_cast<Instruction>(Op);
13932 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
13933 V = Op;
13934 }
13935 Scalar = Builder.CreateIntCast(
13936 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
13937 }
13938
13939 Instruction *InsElt;
13940 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
13941 assert(SLPReVec && "FixedVectorType is not expected.");
13942 Vec = InsElt = cast<Instruction>(createInsertVector(
13943 Builder, Vec, Scalar, Pos * getNumElements(VecTy)));
13944 auto *II = dyn_cast<IntrinsicInst>(InsElt);
13945 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
13946 return Vec;
13947 } else {
13948 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13949 InsElt = dyn_cast<InsertElementInst>(Vec);
13950 if (!InsElt)
13951 return Vec;
13952 }
13953 GatherShuffleExtractSeq.insert(InsElt);
13954 CSEBlocks.insert(InsElt->getParent());
13955 // Add to our 'need-to-extract' list.
13956 if (isa<Instruction>(V)) {
13957 if (TreeEntry *Entry = getTreeEntry(V)) {
13958 // Find which lane we need to extract.
13959 User *UserOp = nullptr;
13960 if (Scalar != V) {
13961 if (auto *SI = dyn_cast<Instruction>(Scalar))
13962 UserOp = SI;
13963 } else {
13964 UserOp = InsElt;
13965 }
13966 if (UserOp) {
13967 unsigned FoundLane = Entry->findLaneForValue(V);
13968 ExternalUses.emplace_back(V, UserOp, FoundLane);
13969 }
13970 }
13971 }
13972 return Vec;
13973 };
13974 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13975 Value *Vec = PoisonValue::get(VecTy);
13976 SmallVector<int> NonConsts;
13977 SmallVector<int> Mask(VL.size());
13978 std::iota(Mask.begin(), Mask.end(), 0);
13979 Value *OriginalRoot = Root;
13980 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
13981 SV && isa<PoisonValue>(SV->getOperand(1)) &&
13982 SV->getOperand(0)->getType() == VecTy) {
13983 Root = SV->getOperand(0);
13984 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
13985 }
13986 // Insert constant values at first.
13987 for (int I = 0, E = VL.size(); I < E; ++I) {
13988 if (PostponedIndices.contains(I))
13989 continue;
13990 if (!isConstant(VL[I])) {
13991 NonConsts.push_back(I);
13992 continue;
13993 }
13994 if (isa<PoisonValue>(VL[I]))
13995 continue;
13996 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
13997 Mask[I] = I + E;
13998 }
13999 if (Root) {
14000 if (isa<PoisonValue>(Vec)) {
14001 Vec = OriginalRoot;
14002 } else {
14003 Vec = CreateShuffle(Root, Vec, Mask);
14004 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
14005 OI && OI->hasNUses(0) &&
14006 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14007 return TE->VectorizedValue == OI;
14008 }))
14009 eraseInstruction(OI);
14010 }
14011 }
14012 // Insert non-constant values.
14013 for (int I : NonConsts)
14014 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
14015 // Append instructions, which are/may be part of the loop, in the end to make
14016 // it possible to hoist non-loop-based instructions.
14017 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
14018 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
14019
14020 return Vec;
14021}
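// Illustrative sketch (assumed values): the constant-first path of gather()
// above. For VL = {7, %x, 9, %y} with a non-poison 4-wide Root:
//   - the constants 7 and 9 are inserted into a poison vector Vec at lanes 0
//     and 2, and the blend mask becomes {4, 1, 6, 3}: lanes taken from Vec are
//     offset by E == 4, the remaining lanes still select from Root;
//   - CreateShuffle(Root, Vec, {4, 1, 6, 3}) merges Root with the constants;
//   - the non-constant scalars %x and %y are inserted afterwards (or postponed
//     to the end if they belong to the current loop).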
14022
14023/// Merges shuffle masks and emits final shuffle instruction, if required. It
14024/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
14025/// when the actual shuffle instruction is generated only if this is actually
14026/// required. Otherwise, the shuffle instruction emission is delayed till the
14027/// end of the process, to reduce the number of emitted instructions and further
14028/// analysis/transformations.
14029/// The class also will look through the previously emitted shuffle instructions
14030/// and properly mark indices in mask as undef.
14031/// For example, given the code
14032/// \code
14033/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
14034/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
14035/// \endcode
14036 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
14037/// look through %s1 and %s2 and emit
14038/// \code
14039/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14040/// \endcode
14041/// instead.
14042/// If 2 operands are of different size, the smallest one will be resized and
14043/// the mask recalculated properly.
14044/// For example, given the code
14045/// \code
14046/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
14047/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
14048/// \endcode
14049 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
14050/// look through %s1 and %s2 and emit
14051/// \code
14052/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14053/// \endcode
14054/// instead.
14055class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
14056 bool IsFinalized = false;
14057 /// Combined mask for all applied operands and masks. It is built during
14058 /// analysis and actual emission of shuffle vector instructions.
14059 SmallVector<int> CommonMask;
14060 /// List of operands for the shuffle vector instruction. It holds at most 2
14061 /// operands; if the 3rd one is going to be added, the first 2 are combined into
14062 /// a shuffle with the \p CommonMask mask, the first operand is set to the
14063 /// resulting shuffle and the second operand is set to the newly added
14064 /// operand. The \p CommonMask is transformed in the proper way after that.
14065 SmallVector<Value *, 2> InVectors;
14066 IRBuilderBase &Builder;
14067 BoUpSLP &R;
14068
14069 class ShuffleIRBuilder {
14070 IRBuilderBase &Builder;
14071 /// Holds all of the instructions that we gathered.
14072 SetVector<Instruction *> &GatherShuffleExtractSeq;
14073 /// A list of blocks that we are going to CSE.
14074 DenseSet<BasicBlock *> &CSEBlocks;
14075 /// Data layout.
14076 const DataLayout &DL;
14077
14078 public:
14079 ShuffleIRBuilder(IRBuilderBase &Builder,
14080 SetVector<Instruction *> &GatherShuffleExtractSeq,
14081 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
14082 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14083 CSEBlocks(CSEBlocks), DL(DL) {}
14084 ~ShuffleIRBuilder() = default;
14085 /// Creates shufflevector for the 2 operands with the given mask.
14086 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
14087 if (V1->getType() != V2->getType()) {
14088 assert(V1->getType()->isIntOrIntVectorTy() &&
14089 V2->getType()->isIntOrIntVectorTy() &&
14090 "Expected integer vector types only.");
14091 if (V1->getType() != V2->getType()) {
14092 if (cast<VectorType>(V2->getType())
14093 ->getElementType()
14094 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
14095 ->getElementType()
14096 ->getIntegerBitWidth())
14097 V2 = Builder.CreateIntCast(
14098 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
14099 else
14100 V1 = Builder.CreateIntCast(
14101 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
14102 }
14103 }
14104 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
14105 if (auto *I = dyn_cast<Instruction>(Vec)) {
14106 GatherShuffleExtractSeq.insert(I);
14107 CSEBlocks.insert(I->getParent());
14108 }
14109 return Vec;
14110 }
14111 /// Creates permutation of the single vector operand with the given mask, if
14112 /// it is not identity mask.
14113 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
14114 if (Mask.empty())
14115 return V1;
14116 unsigned VF = Mask.size();
14117 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
14118 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
14119 return V1;
14120 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
14121 if (auto *I = dyn_cast<Instruction>(Vec)) {
14122 GatherShuffleExtractSeq.insert(I);
14123 CSEBlocks.insert(I->getParent());
14124 }
14125 return Vec;
14126 }
14127 Value *createIdentity(Value *V) { return V; }
14128 Value *createPoison(Type *Ty, unsigned VF) {
14129 return PoisonValue::get(getWidenedType(Ty, VF));
14130 }
14131 /// Resizes 2 input vectors to match their sizes, if they are not equal
14132 /// yet. The smallest vector is resized to the size of the larger vector.
14133 void resizeToMatch(Value *&V1, Value *&V2) {
14134 if (V1->getType() == V2->getType())
14135 return;
14136 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14137 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14138 int VF = std::max(V1VF, V2VF);
14139 int MinVF = std::min(V1VF, V2VF);
14140 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
14141 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
14142 0);
14143 Value *&Op = MinVF == V1VF ? V1 : V2;
14144 Op = Builder.CreateShuffleVector(Op, IdentityMask);
14145 if (auto *I = dyn_cast<Instruction>(Op)) {
14146 GatherShuffleExtractSeq.insert(I);
14147 CSEBlocks.insert(I->getParent());
14148 }
14149 if (MinVF == V1VF)
14150 V1 = Op;
14151 else
14152 V2 = Op;
14153 }
14154 };
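// Illustrative sketch (assumed values): resizeToMatch() above pads the
// narrower operand with an identity-prefix mask. For a 4-wide V1 and an
// 8-wide V2, V1 is rewritten roughly as
//   %v1.wide = shufflevector <4 x ty> %v1, <4 x ty> poison,
//       <8 x i32> <i32 0, i32 1, i32 2, i32 3,
//                  i32 poison, i32 poison, i32 poison, i32 poison>
// so that both operands have 8 elements before the two-source shuffle is
// emitted.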
14155
14156 /// Smart shuffle instruction emission, walks through shuffles trees and
14157 /// tries to find the best matching vector for the actual shuffle
14158 /// instruction.
14159 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
14160 assert(V1 && "Expected at least one vector value.");
14161 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14162 R.CSEBlocks, *R.DL);
14163 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
14164 ShuffleBuilder);
14165 }
14166
14167 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
14168 /// shuffle emission.
14169 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
14170 ArrayRef<int> Mask) {
14171 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14172 if (Mask[Idx] != PoisonMaskElem)
14173 CommonMask[Idx] = Idx;
14174 }
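// Illustrative sketch (assumed values): transformMaskAfterShuffle() above.
// Once a shuffle has been emitted for CommonMask = {2, -1, 0, 3} (and the same
// mask is passed as \p Mask), every non-poison lane already holds its final
// value, so the mask becomes the identity on those lanes:
//   CommonMask == {0, -1, 2, 3} afterwards.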
14175
14176 /// Cast value \p V to the vector type with the same number of elements, but
14177 /// the base type \p ScalarTy.
14178 Value *castToScalarTyElem(Value *V,
14179 std::optional<bool> IsSigned = std::nullopt) {
14180 auto *VecTy = cast<VectorType>(V->getType());
14181 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
14182 if (VecTy->getElementType() == ScalarTy->getScalarType())
14183 return V;
14184 return Builder.CreateIntCast(
14185 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
14186 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
14187 }
14188
14189 public:
14190 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
14191 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14192
14193 /// Adjusts extractelements after reusing them.
14194 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14195 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14196 unsigned NumParts, bool &UseVecBaseAsInput) {
14197 UseVecBaseAsInput = false;
14198 SmallPtrSet<Value *, 4> UniqueBases;
14199 Value *VecBase = nullptr;
14200 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14201 if (!E->ReorderIndices.empty()) {
14202 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14203 E->ReorderIndices.end());
14204 reorderScalars(VL, ReorderMask);
14205 }
14206 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14207 int Idx = Mask[I];
14208 if (Idx == PoisonMaskElem)
14209 continue;
14210 auto *EI = cast<ExtractElementInst>(VL[I]);
14211 VecBase = EI->getVectorOperand();
14212 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
14213 VecBase = TE->VectorizedValue;
14214 assert(VecBase && "Expected vectorized value.");
14215 UniqueBases.insert(VecBase);
14216 // If its only use is vectorized, we can delete the extractelement
14217 // itself.
14218 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14219 (NumParts != 1 && count(VL, EI) > 1) ||
14220 any_of(EI->users(), [&](User *U) {
14221 const TreeEntry *UTE = R.getTreeEntry(U);
14222 return !UTE || R.MultiNodeScalars.contains(U) ||
14223 (isa<GetElementPtrInst>(U) &&
14224 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14225 count_if(R.VectorizableTree,
14226 [&](const std::unique_ptr<TreeEntry> &TE) {
14227 return any_of(TE->UserTreeIndices,
14228 [&](const EdgeInfo &Edge) {
14229 return Edge.UserTE == UTE;
14230 }) &&
14231 is_contained(VL, EI);
14232 }) != 1;
14233 }))
14234 continue;
14235 R.eraseInstruction(EI);
14236 }
14237 if (NumParts == 1 || UniqueBases.size() == 1) {
14238 assert(VecBase && "Expected vectorized value.");
14239 return castToScalarTyElem(VecBase);
14240 }
14241 UseVecBaseAsInput = true;
14242 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
14243 for (auto [I, Idx] : enumerate(Mask))
14244 if (Idx != PoisonMaskElem)
14245 Idx = I;
14246 };
14247 // Perform a multi-register vector shuffle, joining the parts into a single
14248 // virtual long vector.
14249 // Need to shuffle each part independently and then insert all these parts
14250 // into a long virtual vector register, forming the original vector.
14251 Value *Vec = nullptr;
14252 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
14253 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14254 for (unsigned Part : seq<unsigned>(NumParts)) {
14255 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14256 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
14257 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14258 constexpr int MaxBases = 2;
14259 SmallVector<Value *, MaxBases> Bases(MaxBases);
14260 auto VLMask = zip(SubVL, SubMask);
14261 const unsigned VF = std::accumulate(
14262 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
14263 if (std::get<1>(D) == PoisonMaskElem)
14264 return S;
14265 Value *VecOp =
14266 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14267 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14268 VecOp = TE->VectorizedValue;
14269 assert(VecOp && "Expected vectorized value.");
14270 const unsigned Size =
14271 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14272 return std::max(S, Size);
14273 });
14274 for (const auto [V, I] : VLMask) {
14275 if (I == PoisonMaskElem)
14276 continue;
14277 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14278 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14279 VecOp = TE->VectorizedValue;
14280 assert(VecOp && "Expected vectorized value.");
14281 VecOp = castToScalarTyElem(VecOp);
14282 Bases[I / VF] = VecOp;
14283 }
14284 if (!Bases.front())
14285 continue;
14286 Value *SubVec;
14287 if (Bases.back()) {
14288 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14289 TransformToIdentity(SubMask);
14290 } else {
14291 SubVec = Bases.front();
14292 }
14293 if (!Vec) {
14294 Vec = SubVec;
14295 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
14296 [&](unsigned P) {
14297 ArrayRef<int> SubMask =
14298 Mask.slice(P * SliceSize,
14299 getNumElems(Mask.size(),
14300 SliceSize, P));
14301 return all_of(SubMask, [](int Idx) {
14302 return Idx == PoisonMaskElem;
14303 });
14304 })) &&
14305 "Expected first part or all previous parts masked.");
14306 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14307 } else {
14308 unsigned NewVF =
14309 cast<FixedVectorType>(Vec->getType())->getNumElements();
14310 if (Vec->getType() != SubVec->getType()) {
14311 unsigned SubVecVF =
14312 cast<FixedVectorType>(SubVec->getType())->getNumElements();
14313 NewVF = std::max(NewVF, SubVecVF);
14314 }
14315 // Adjust SubMask.
14316 for (int &Idx : SubMask)
14317 if (Idx != PoisonMaskElem)
14318 Idx += NewVF;
14319 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14320 Vec = createShuffle(Vec, SubVec, VecMask);
14321 TransformToIdentity(VecMask);
14322 }
14323 }
14324 copy(VecMask, Mask.begin());
14325 return Vec;
14326 }
14327 /// Checks if the specified entry \p E needs to be delayed because of its
14328 /// dependency nodes.
14329 std::optional<Value *>
14330 needToDelay(const TreeEntry *E,
14331 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
14332 // No need to delay emission if all deps are ready.
14333 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
14334 return all_of(
14335 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
14336 }))
14337 return std::nullopt;
14338 // Postpone gather emission, will be emitted after the end of the
14339 // process to keep correct order.
14340 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
14341 return Builder.CreateAlignedLoad(
14342 ResVecTy,
14343 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
14344 MaybeAlign());
14345 }
14346 /// Adds 2 input vectors (in form of tree entries) and the mask for their
14347 /// shuffling.
14348 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14349 Value *V1 = E1.VectorizedValue;
14350 if (V1->getType()->isIntOrIntVectorTy())
14351 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14352 if (isa<PoisonValue>(V))
14353 return false;
14354 return !isKnownNonNegative(
14355 V, SimplifyQuery(*R.DL));
14356 }));
14357 Value *V2 = E2.VectorizedValue;
14358 if (V2->getType()->isIntOrIntVectorTy())
14359 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
14360 if (isa<PoisonValue>(V))
14361 return false;
14362 return !isKnownNonNegative(
14363 V, SimplifyQuery(*R.DL));
14364 }));
14365 add(V1, V2, Mask);
14366 }
14367 /// Adds single input vector (in form of tree entry) and the mask for its
14368 /// shuffling.
14369 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14370 Value *V1 = E1.VectorizedValue;
14371 if (V1->getType()->isIntOrIntVectorTy())
14372 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14373 if (isa<PoisonValue>(V))
14374 return false;
14375 return !isKnownNonNegative(
14376 V, SimplifyQuery(*R.DL));
14377 }));
14378 add(V1, Mask);
14379 }
14380 /// Adds 2 input vectors and the mask for their shuffling.
14381 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14382 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
14383 assert(isa<FixedVectorType>(V1->getType()) &&
14384 isa<FixedVectorType>(V2->getType()) &&
14385 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14386 V1 = castToScalarTyElem(V1);
14387 V2 = castToScalarTyElem(V2);
14388 if (InVectors.empty()) {
14389 InVectors.push_back(V1);
14390 InVectors.push_back(V2);
14391 CommonMask.assign(Mask.begin(), Mask.end());
14392 return;
14393 }
14394 Value *Vec = InVectors.front();
14395 if (InVectors.size() == 2) {
14396 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14397 transformMaskAfterShuffle(CommonMask, CommonMask);
14398 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
14399 Mask.size()) {
14400 Vec = createShuffle(Vec, nullptr, CommonMask);
14401 transformMaskAfterShuffle(CommonMask, CommonMask);
14402 }
14403 V1 = createShuffle(V1, V2, Mask);
14404 unsigned VF = std::max(getVF(V1), getVF(Vec));
14405 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14406 if (Mask[Idx] != PoisonMaskElem)
14407 CommonMask[Idx] = Idx + VF;
14408 InVectors.front() = Vec;
14409 if (InVectors.size() == 2)
14410 InVectors.back() = V1;
14411 else
14412 InVectors.push_back(V1);
14413 }
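// Illustrative sketch (assumed values): how add(V1, V2, Mask) above folds a
// new operand pair into the running state. If CommonMask is currently
// {0, 1, -1, -1} over a 4-wide vector Vec, and the new pair is combined by
// createShuffle() with Mask = {-1, -1, 5, 2}, then VF == 4 and CommonMask is
// updated to {0, 1, 6, 7}: lanes 2 and 3 now select from the freshly created
// shuffle (their indices are offset by VF), while lanes 0 and 1 keep selecting
// from Vec.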
14414 /// Adds one more input vector and the mask for the shuffling.
14415 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
14416 assert(isa<FixedVectorType>(V1->getType()) &&
14417 "castToScalarTyElem expects V1 to be FixedVectorType");
14418 V1 = castToScalarTyElem(V1);
14419 if (InVectors.empty()) {
14420 InVectors.push_back(V1);
14421 CommonMask.assign(Mask.begin(), Mask.end());
14422 return;
14423 }
14424 const auto *It = find(InVectors, V1);
14425 if (It == InVectors.end()) {
14426 if (InVectors.size() == 2 ||
14427 InVectors.front()->getType() != V1->getType()) {
14428 Value *V = InVectors.front();
14429 if (InVectors.size() == 2) {
14430 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14431 transformMaskAfterShuffle(CommonMask, CommonMask);
14432 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14433 CommonMask.size()) {
14434 V = createShuffle(InVectors.front(), nullptr, CommonMask);
14435 transformMaskAfterShuffle(CommonMask, CommonMask);
14436 }
14437 unsigned VF = std::max(CommonMask.size(), Mask.size());
14438 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14439 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
14440 CommonMask[Idx] =
14441 V->getType() != V1->getType()
14442 ? Idx + VF
14443 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
14444 ->getNumElements();
14445 if (V->getType() != V1->getType())
14446 V1 = createShuffle(V1, nullptr, Mask);
14447 InVectors.front() = V;
14448 if (InVectors.size() == 2)
14449 InVectors.back() = V1;
14450 else
14451 InVectors.push_back(V1);
14452 return;
14453 }
14454 // Check if second vector is required if the used elements are already
14455 // used from the first one.
14456 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14457 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
14458 InVectors.push_back(V1);
14459 break;
14460 }
14461 }
14462 int VF = getVF(V1);
14463 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14464 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14465 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
14466 }
14467 /// Adds one more input vector and the mask for the shuffling.
14468 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
14469 SmallVector<int> NewMask;
14470 inversePermutation(Order, NewMask);
14471 add(V1, NewMask);
14472 }
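// Illustrative sketch (assuming inversePermutation() sets NewMask[Order[I]] to
// I): addOrdered() above turns an ordering into a shuffle mask before
// delegating to add(). For Order = {2, 0, 1} this produces
// NewMask = {1, 2, 0}.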
14473 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14474 Value *Root = nullptr) {
14475 return R.gather(VL, Root, ScalarTy,
14476 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14477 return createShuffle(V1, V2, Mask);
14478 });
14479 }
14480 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
14481 /// Finalize emission of the shuffles.
14482 /// \param Action the action (if any) to be performed before final applying of
14483 /// the \p ExtMask mask.
14484 Value *
14485 finalize(ArrayRef<int> ExtMask,
14486 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14487 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14488 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
14489 IsFinalized = true;
14490 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
14491 SmallVector<int> NewExtMask(ExtMask);
14492 if (ScalarTyNumElements != 1) {
14493 assert(SLPReVec && "FixedVectorType is not expected.");
14494 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, CommonMask);
14495 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewExtMask);
14496 ExtMask = NewExtMask;
14497 }
14498 if (Action) {
14499 Value *Vec = InVectors.front();
14500 if (InVectors.size() == 2) {
14501 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14502 InVectors.pop_back();
14503 } else {
14504 Vec = createShuffle(Vec, nullptr, CommonMask);
14505 }
14506 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14507 if (CommonMask[Idx] != PoisonMaskElem)
14508 CommonMask[Idx] = Idx;
14509 assert(VF > 0 &&
14510 "Expected vector length for the final value before action.");
14511 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14512 if (VecVF < VF) {
14513 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14514 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14515 Vec = createShuffle(Vec, nullptr, ResizeMask);
14516 }
14517 Action(Vec, CommonMask);
14518 InVectors.front() = Vec;
14519 }
14520 if (!SubVectors.empty()) {
14521 Value *Vec = InVectors.front();
14522 if (InVectors.size() == 2) {
14523 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14524 InVectors.pop_back();
14525 } else {
14526 Vec = createShuffle(Vec, nullptr, CommonMask);
14527 }
14528 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14529 if (CommonMask[Idx] != PoisonMaskElem)
14530 CommonMask[Idx] = Idx;
14531 auto CreateSubVectors = [&](Value *Vec,
14532 SmallVectorImpl<int> &CommonMask) {
14533 for (auto [E, Idx] : SubVectors) {
14534 Value *V = E->VectorizedValue;
14535 if (V->getType()->isIntOrIntVectorTy())
14536 V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
14537 if (isa<PoisonValue>(V))
14538 return false;
14539 return !isKnownNonNegative(
14540 V, SimplifyQuery(*R.DL));
14541 }));
14542 unsigned InsertionIndex = Idx * ScalarTyNumElements;
14543 Vec = createInsertVector(
14544 Builder, Vec, V, InsertionIndex,
14545 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
14546 _3));
14547 if (!CommonMask.empty()) {
14548 std::iota(
14549 std::next(CommonMask.begin(), InsertionIndex),
14550 std::next(CommonMask.begin(),
14551 (Idx + E->getVectorFactor()) * ScalarTyNumElements),
14552 InsertionIndex);
14553 }
14554 }
14555 return Vec;
14556 };
14557 if (SubVectorsMask.empty()) {
14558 Vec = CreateSubVectors(Vec, CommonMask);
14559 } else {
14560 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14561 copy(SubVectorsMask, SVMask.begin());
14562 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14563 if (I2 != PoisonMaskElem) {
14564 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14565 I1 = I2 + CommonMask.size();
14566 }
14567 }
14568 Value *InsertVec =
14569 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
14570 Vec = createShuffle(InsertVec, Vec, SVMask);
14571 for (unsigned I : seq<unsigned>(CommonMask.size())) {
14572 if (SVMask[I] != PoisonMaskElem)
14573 CommonMask[I] = I;
14574 }
14575 }
14576 InVectors.front() = Vec;
14577 }
14578
14579 if (!ExtMask.empty()) {
14580 if (CommonMask.empty()) {
14581 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14582 } else {
14583 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14584 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14585 if (ExtMask[I] == PoisonMaskElem)
14586 continue;
14587 NewMask[I] = CommonMask[ExtMask[I]];
14588 }
14589 CommonMask.swap(NewMask);
14590 }
14591 }
14592 if (CommonMask.empty()) {
14593 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14594 return InVectors.front();
14595 }
14596 if (InVectors.size() == 2)
14597 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14598 return createShuffle(InVectors.front(), nullptr, CommonMask);
14599 }
14600
14601 ~ShuffleInstructionBuilder() {
14602 assert((IsFinalized || CommonMask.empty()) &&
14603 "Shuffle construction must be finalized.");
14604 }
14605};
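// A typical use of the builder above, shown as a minimal sketch that mirrors
// the call sites in this file (V1, V2, Mask1, Mask2, SubVectors and
// SubVectorsMask are placeholders):
//   ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
//   ShuffleBuilder.add(V1, Mask1); // first operand, no IR emitted yet
//   ShuffleBuilder.add(V2, Mask2); // second operand, masks are merged
//   Value *Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
//                                        SubVectorsMask);
// The shufflevector instructions are normally materialized only in finalize(),
// once the combined CommonMask is known.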
14606
14607BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
14608 unsigned NodeIdx) {
14609 ArrayRef<Value *> VL = E->getOperand(NodeIdx);
14610 InstructionsState S = getSameOpcode(VL, *TLI);
14611 // Special processing for GEPs bundle, which may include non-gep values.
14612 if (!S && VL.front()->getType()->isPointerTy()) {
14613 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
14614 if (It != VL.end())
14615 S = getSameOpcode(*It, *TLI);
14616 }
14617 if (!S)
14618 return nullptr;
14619 auto CheckSameVE = [&](const TreeEntry *VE) {
14620 return VE->isSame(VL) &&
14621 (any_of(VE->UserTreeIndices,
14622 [E, NodeIdx](const EdgeInfo &EI) {
14623 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14624 }) ||
14625 any_of(VectorizableTree,
14626 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
14627 return TE->isOperandGatherNode(
14628 {const_cast<TreeEntry *>(E), NodeIdx}) &&
14629 VE->isSame(TE->Scalars);
14630 }));
14631 };
14632 TreeEntry *VE = getTreeEntry(S.getMainOp());
14633 if (VE && CheckSameVE(VE))
14634 return VE;
14635 auto It = MultiNodeScalars.find(S.getMainOp());
14636 if (It != MultiNodeScalars.end()) {
14637 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
14638 return TE != VE && CheckSameVE(TE);
14639 });
14640 if (I != It->getSecond().end())
14641 return *I;
14642 }
14643 return nullptr;
14644}
14645
14646Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
14647 bool PostponedPHIs) {
14648 ValueList &VL = E->getOperand(NodeIdx);
14649 const unsigned VF = VL.size();
14650 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14651 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
14652 // V may be affected by MinBWs.
14653 // We want ShuffleInstructionBuilder to correctly support REVEC. The key
14654 // factor is the number of elements, not their type.
14655 Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
14656 unsigned NumElements = getNumElements(VL.front()->getType());
14657 ShuffleInstructionBuilder ShuffleBuilder(
14658 NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
14659 : ScalarTy,
14660 Builder, *this);
14661 ShuffleBuilder.add(V, Mask);
14662 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14663 E->CombinedEntriesWithIndices.size());
14664 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14665 [&](const auto &P) {
14666 return std::make_pair(VectorizableTree[P.first].get(),
14667 P.second);
14668 });
14669 assert((E->CombinedEntriesWithIndices.empty() ||
14670 E->ReorderIndices.empty()) &&
14671 "Expected either combined subnodes or reordering");
14672 return ShuffleBuilder.finalize({}, SubVectors, {});
14673 };
14674 Value *V = vectorizeTree(VE, PostponedPHIs);
14675 if (VF * getNumElements(VL[0]->getType()) !=
14676 cast<FixedVectorType>(V->getType())->getNumElements()) {
14677 if (!VE->ReuseShuffleIndices.empty()) {
14678 // Reshuffle to get only unique values.
14679 // If some of the scalars are duplicated in the vectorization
14680 // tree entry, we do not vectorize them but instead generate a
14681 // mask for the reuses. But if there are several users of the
14682 // same entry, they may have different vectorization factors.
14683 // This is especially important for PHI nodes. In this case, we
14684 // need to adapt the resulting instruction for the user
14685 // vectorization factor and have to reshuffle it again to take
14686 // only unique elements of the vector. Without this code the
14687 // function incorrectly returns reduced vector instruction with
14688 // the same elements, not with the unique ones.
14689
14690 // block:
14691 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
14692 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
14693 // ... (use %2)
14694 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
14695 // br %block
14696 SmallVector<int> Mask(VF, PoisonMaskElem);
14697 for (auto [I, V] : enumerate(VL)) {
14698 if (isa<PoisonValue>(V))
14699 continue;
14700 Mask[I] = VE->findLaneForValue(V);
14701 }
14702 V = FinalShuffle(V, Mask);
14703 } else {
14704 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
14705 "Expected vectorization factor less "
14706 "than original vector size.");
14707 SmallVector<int> UniformMask(VF, 0);
14708 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14709 V = FinalShuffle(V, UniformMask);
14710 }
14711 }
14712 // Need to update the operand gather node, if the operand is actually not a
14713 // vectorized node but a buildvector/gather node that matches one of the
14714 // vectorized nodes.
14715 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
14716 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14717 }) == VE->UserTreeIndices.end()) {
14718 auto *It =
14719 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14720 return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
14721 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14722 });
14723 assert(It != VectorizableTree.end() && "Expected gather node operand.");
14724 (*It)->VectorizedValue = V;
14725 }
14726 return V;
14727 }
14728
14729 // Find the corresponding gather entry and vectorize it.
14730 // This allows us to be more accurate with tree/graph transformations; it
14731 // checks the correctness of the transformations in many cases.
14732 auto *I = find_if(VectorizableTree,
14733 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
14734 return TE->isOperandGatherNode({E, NodeIdx});
14735 });
14736 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
14737 assert(I->get()->UserTreeIndices.size() == 1 &&
14738 "Expected only single user for the gather node.");
14739 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
14740 return vectorizeTree(I->get(), PostponedPHIs);
14741}
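// Illustrative sketch (assumed values): the ReuseShuffleIndices fix-up inside
// vectorizeOperand() above. If the operand scalars are VL = {%a, %b, %a, %b}
// but the matched entry VE was vectorized from the two unique scalars
// {%a, %b}, the remapping loop builds Mask = {0, 1, 0, 1} via
// VE->findLaneForValue() and FinalShuffle() widens the narrow vector to the 4
// lanes this user expects.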
14742
14743template <typename BVTy, typename ResTy, typename... Args>
14744ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
14745 Args &...Params) {
14746 assert(E->isGather() && "Expected gather node.");
14747 unsigned VF = E->getVectorFactor();
14748
14749 bool NeedFreeze = false;
14750 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
14751 E->ReuseShuffleIndices.end());
14752 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
14753 // Clear values, to be replaced by insertvector instructions.
14754 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
14755 for_each(MutableArrayRef(GatheredScalars)
14756 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
14757 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
14758 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14759 E->CombinedEntriesWithIndices.size());
14760 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14761 [&](const auto &P) {
14762 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14763 });
14764 // Build a mask out of the reorder indices and reorder scalars per this
14765 // mask.
14766 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14767 E->ReorderIndices.end());
14768 if (!ReorderMask.empty())
14769 reorderScalars(GatheredScalars, ReorderMask);
14770 SmallVector<int> SubVectorsMask;
14771 inversePermutation(E->ReorderIndices, SubVectorsMask);
14772 // Transform non-clustered elements in the mask to poison (-1).
14773 // "Clustered" operations will be reordered using this mask later.
14774 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
14775 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
14776 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
14777 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
14778 } else {
14779 SubVectorsMask.clear();
14780 }
14781 SmallVector<Value *> StoredGS(GatheredScalars);
14782 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
14783 unsigned I, unsigned SliceSize,
14784 bool IsNotPoisonous) {
14785 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
14786 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14787 }))
14788 return false;
14789 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14790 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14791 if (UserTE->getNumOperands() != 2)
14792 return false;
14793 if (!IsNotPoisonous) {
14794 auto *It =
14795 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
14796 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
14797 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14798 }) != TE->UserTreeIndices.end();
14799 });
14800 if (It == VectorizableTree.end())
14801 return false;
14802 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
14803 if (!(*It)->ReorderIndices.empty()) {
14804 inversePermutation((*It)->ReorderIndices, ReorderMask);
14805 reorderScalars(GS, ReorderMask);
14806 }
14807 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
14808 Value *V0 = std::get<0>(P);
14809 Value *V1 = std::get<1>(P);
14810 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14811 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14812 is_contained(E->Scalars, V1));
14813 }))
14814 return false;
14815 }
14816 int Idx;
14817 if ((Mask.size() < InputVF &&
14818 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
14819 Idx == 0) ||
14820 (Mask.size() == InputVF &&
14821 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
14822 std::iota(
14823 std::next(Mask.begin(), I * SliceSize),
14824 std::next(Mask.begin(),
14825 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14826 0);
14827 } else {
14828 unsigned IVal =
14829 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
14830 std::fill(
14831 std::next(Mask.begin(), I * SliceSize),
14832 std::next(Mask.begin(),
14833 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14834 IVal);
14835 }
14836 return true;
14837 };
14838 BVTy ShuffleBuilder(ScalarTy, Params...);
14839 ResTy Res = ResTy();
14840 SmallVector<int> Mask;
14841 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
14842 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
14843 Value *ExtractVecBase = nullptr;
14844 bool UseVecBaseAsInput = false;
14845 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
14846 SmallVector<SmallVector<const TreeEntry *>> Entries;
14847 Type *OrigScalarTy = GatheredScalars.front()->getType();
14848 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14849 unsigned NumParts = TTI->getNumberOfParts(VecTy);
14850 if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14851 VecTy->getNumElements() % NumParts != 0 ||
14852 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14853 VecTy->getNumElements() / NumParts))
14854 NumParts = 1;
14855 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
14856 // Check for gathered extracts.
14857 bool Resized = false;
14858 ExtractShuffles =
14859 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14860 if (!ExtractShuffles.empty()) {
14861 SmallVector<const TreeEntry *> ExtractEntries;
14862 for (auto [Idx, I] : enumerate(ExtractMask)) {
14863 if (I == PoisonMaskElem)
14864 continue;
14865 if (const auto *TE = getTreeEntry(
14866 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
14867 ExtractEntries.push_back(TE);
14868 }
14869 if (std::optional<ResTy> Delayed =
14870 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14871 // Delay emission of gathers which are not ready yet.
14872 PostponedGathers.insert(E);
14873 // Postpone gather emission, will be emitted after the end of the
14874 // process to keep correct order.
14875 return *Delayed;
14876 }
14877 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
14878 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14879 ExtractVecBase = VecBase;
14880 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14881 if (VF == VecBaseTy->getNumElements() &&
14882 GatheredScalars.size() != VF) {
14883 Resized = true;
14884 GatheredScalars.append(VF - GatheredScalars.size(),
14885 PoisonValue::get(OrigScalarTy));
14886 }
14887 }
14888 }
14889 // Gather extracts after we check for full matched gathers only.
14890 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
14891 ((E->getOpcode() == Instruction::Load ||
14892 any_of(E->Scalars, IsaPred<LoadInst>)) &&
14893 any_of(E->Scalars,
14894 [this](Value *V) {
14895 return isa<LoadInst>(V) && getTreeEntry(V);
14896 })) ||
14897 E->isAltShuffle() ||
14898 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
14899 isSplat(E->Scalars) ||
14900 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14901 GatherShuffles =
14902 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14903 }
14904 if (!GatherShuffles.empty()) {
14905 if (std::optional<ResTy> Delayed =
14906 ShuffleBuilder.needToDelay(E, Entries)) {
14907 // Delay emission of gathers which are not ready yet.
14908 PostponedGathers.insert(E);
14909 // Postpone gather emission, will be emitted after the end of the
14910 // process to keep correct order.
14911 return *Delayed;
14912 }
14913 if (GatherShuffles.size() == 1 &&
14914 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
14915 Entries.front().front()->isSame(E->Scalars)) {
14916 // Perfect match in the graph, will reuse the previously vectorized
14917 // node. Cost is 0.
14918 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
14919 << shortBundleName(E->Scalars, E->Idx) << ".\n");
14920 // Restore the mask for previous partially matched values.
14921 Mask.resize(E->Scalars.size());
14922 const TreeEntry *FrontTE = Entries.front().front();
14923 if (FrontTE->ReorderIndices.empty() &&
14924 ((FrontTE->ReuseShuffleIndices.empty() &&
14925 E->Scalars.size() == FrontTE->Scalars.size()) ||
14926 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14927 std::iota(Mask.begin(), Mask.end(), 0);
14928 } else {
14929 for (auto [I, V] : enumerate(E->Scalars)) {
14930 if (isa<PoisonValue>(V)) {
14931 Mask[I] = PoisonMaskElem;
14932 continue;
14933 }
14934 Mask[I] = FrontTE->findLaneForValue(V);
14935 }
14936 }
14937 ShuffleBuilder.add(*FrontTE, Mask);
14938 Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors,
14939 SubVectorsMask);
14940 return Res;
14941 }
14942 if (!Resized) {
14943 if (GatheredScalars.size() != VF &&
14944 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
14945 return any_of(TEs, [&](const TreeEntry *TE) {
14946 return TE->getVectorFactor() == VF;
14947 });
14948 }))
14949 GatheredScalars.append(VF - GatheredScalars.size(),
14950 PoisonValue::get(OrigScalarTy));
14951 }
14952 // Remove shuffled elements from list of gathers.
14953 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14954 if (Mask[I] != PoisonMaskElem)
14955 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
14956 }
14957 }
14958 }
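// TryPackScalars compacts the remaining gathered scalars: duplicates are
// mapped onto a single lane through ReuseMask, splats are turned into a
// broadcast of lane 0, and undef lanes are either redirected to a known
// non-poisonous scalar or replaced by poison (requesting a trailing freeze).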
14959 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
14960 SmallVectorImpl<int> &ReuseMask,
14961 bool IsRootPoison) {
14962 // For splats we can emit broadcasts instead of gathers, so try to find
14963 // such sequences.
14964 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
14965 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
14966 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
14967 SmallVector<int> UndefPos;
14968 DenseMap<Value *, unsigned> UniquePositions;
14969 // Gather unique non-const values and all constant values.
14970 // For repeated values, just shuffle them.
14971 int NumNonConsts = 0;
14972 int SinglePos = 0;
14973 for (auto [I, V] : enumerate(Scalars)) {
14974 if (isa<UndefValue>(V)) {
14975 if (!isa<PoisonValue>(V)) {
14976 ReuseMask[I] = I;
14977 UndefPos.push_back(I);
14978 }
14979 continue;
14980 }
14981 if (isConstant(V)) {
14982 ReuseMask[I] = I;
14983 continue;
14984 }
14985 ++NumNonConsts;
14986 SinglePos = I;
14987 Value *OrigV = V;
14988 Scalars[I] = PoisonValue::get(OrigScalarTy);
14989 if (IsSplat) {
14990 Scalars.front() = OrigV;
14991 ReuseMask[I] = 0;
14992 } else {
14993 const auto Res = UniquePositions.try_emplace(OrigV, I);
14994 Scalars[Res.first->second] = OrigV;
14995 ReuseMask[I] = Res.first->second;
14996 }
14997 }
14998 if (NumNonConsts == 1) {
14999 // Restore single insert element.
15000 if (IsSplat) {
15001 ReuseMask.assign(VF, PoisonMaskElem);
15002 std::swap(Scalars.front(), Scalars[SinglePos]);
15003 if (!UndefPos.empty() && UndefPos.front() == 0)
15004 Scalars.front() = UndefValue::get(OrigScalarTy);
15005 }
15006 ReuseMask[SinglePos] = SinglePos;
15007 } else if (!UndefPos.empty() && IsSplat) {
15008 // For undef values, try to replace them with the simple broadcast.
15009 // We can do it if the broadcasted value is guaranteed to be
15010 // non-poisonous, or by freezing the incoming scalar value first.
15011 auto *It = find_if(Scalars, [this, E](Value *V) {
15012 return !isa<UndefValue>(V) &&
15013 (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) ||
15014 (E->UserTreeIndices.size() == 1 &&
15015 any_of(V->uses(), [E](const Use &U) {
15016 // Check if the value is already used in the same operation in
15017 // one of the nodes.
15018 return E->UserTreeIndices.front().EdgeIdx !=
15019 U.getOperandNo() &&
15020 is_contained(
15021 E->UserTreeIndices.front().UserTE->Scalars,
15022 U.getUser());
15023 })));
15024 });
15025 if (It != Scalars.end()) {
15026 // Replace undefs by the non-poisoned scalars and emit broadcast.
15027 int Pos = std::distance(Scalars.begin(), It);
15028 for (int I : UndefPos) {
15029 // Set the undef position to the non-poisoned scalar.
15030 ReuseMask[I] = Pos;
15031 // Replace the undef by the poison, in the mask it is replaced by
15032 // non-poisoned scalar already.
15033 if (I != Pos)
15034 Scalars[I] = PoisonValue::get(OrigScalarTy);
15035 }
15036 } else {
15037 // Replace undefs by the poisons, emit broadcast and then emit
15038 // freeze.
15039 for (int I : UndefPos) {
15040 ReuseMask[I] = PoisonMaskElem;
15041 if (isa<UndefValue>(Scalars[I]))
15042 Scalars[I] = PoisonValue::get(OrigScalarTy);
15043 }
15044 NeedFreeze = true;
15045 }
15046 }
15047 };
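// From here on, combine the possible sources of the gathered vector: shuffles
// of the vectors the scalars were extracted from (ExtractShuffles), shuffles
// of previously vectorized tree entries (GatherShuffles), and a build vector
// of the scalars that still have to be gathered directly.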
15048 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
15049 bool IsNonPoisoned = true;
15050 bool IsUsedInExpr = true;
15051 Value *Vec1 = nullptr;
15052 if (!ExtractShuffles.empty()) {
15053 // Gather of extractelements can be represented as just a shuffle of
15054 // a single/two vectors the scalars are extracted from.
15055 // Find input vectors.
15056 Value *Vec2 = nullptr;
15057 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15058 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
15059 ExtractMask[I] = PoisonMaskElem;
15060 }
15061 if (UseVecBaseAsInput) {
15062 Vec1 = ExtractVecBase;
15063 } else {
15064 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15065 if (ExtractMask[I] == PoisonMaskElem)
15066 continue;
15067 if (isa<UndefValue>(E->Scalars[I]))
15068 continue;
15069 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
15070 Value *VecOp = EI->getVectorOperand();
15071 if (const auto *TE = getTreeEntry(VecOp))
15072 if (TE->VectorizedValue)
15073 VecOp = TE->VectorizedValue;
15074 if (!Vec1) {
15075 Vec1 = VecOp;
15076 } else if (Vec1 != VecOp) {
15077 assert((!Vec2 || Vec2 == VecOp) &&
15078 "Expected only 1 or 2 vectors shuffle.");
15079 Vec2 = VecOp;
15080 }
15081 }
15082 }
15083 if (Vec2) {
15084 IsUsedInExpr = false;
15085 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
15086 isGuaranteedNotToBePoison(Vec2, AC);
15087 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15088 } else if (Vec1) {
15089 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
15090 IsUsedInExpr &= FindReusedSplat(
15091 ExtractMask,
15092 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
15093 ExtractMask.size(), IsNotPoisonedVec);
15094 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
15095 IsNonPoisoned &= IsNotPoisonedVec;
15096 } else {
15097 IsUsedInExpr = false;
15098 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
15099 /*ForExtracts=*/true);
15100 }
15101 }
15102 if (!GatherShuffles.empty()) {
15103 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
15104 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
15105 for (const auto [I, TEs] : enumerate(Entries)) {
15106 if (TEs.empty()) {
15107 assert(!GatherShuffles[I] &&
15108 "No shuffles with empty entries list expected.");
15109 continue;
15110 }
15111 assert((TEs.size() == 1 || TEs.size() == 2) &&
15112 "Expected shuffle of 1 or 2 entries.");
15113 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
15114 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
15115 VecMask.assign(VecMask.size(), PoisonMaskElem);
15116 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
15117 if (TEs.size() == 1) {
15118 bool IsNotPoisonedVec =
15119 TEs.front()->VectorizedValue
15120 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
15121 : true;
15122 IsUsedInExpr &=
15123 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
15124 SliceSize, IsNotPoisonedVec);
15125 ShuffleBuilder.add(*TEs.front(), VecMask);
15126 IsNonPoisoned &= IsNotPoisonedVec;
15127 } else {
15128 IsUsedInExpr = false;
15129 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
15130 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
15131 IsNonPoisoned &=
15132 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
15133 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
15134 }
15135 }
15136 }
15137 // Try to figure out best way to combine values: build a shuffle and insert
15138 // elements or just build several shuffles.
15139 // Insert non-constant scalars.
15140 SmallVector<Value *> NonConstants(GatheredScalars);
15141 int EMSz = ExtractMask.size();
15142 int MSz = Mask.size();
15143 // Try to build constant vector and shuffle with it only if currently we
15144 // have a single permutation and more than 1 scalar constants.
15145 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
15146 bool IsIdentityShuffle =
15147 ((UseVecBaseAsInput ||
15148 all_of(ExtractShuffles,
15149 [](const std::optional<TTI::ShuffleKind> &SK) {
15150 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15151 TTI::SK_PermuteSingleSrc;
15152 })) &&
15153 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
15154 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
15155 (!GatherShuffles.empty() &&
15156 all_of(GatherShuffles,
15157 [](const std::optional<TTI::ShuffleKind> &SK) {
15158 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15159 TTI::SK_PermuteSingleSrc;
15160 }) &&
15161 none_of(Mask, [&](int I) { return I >= MSz; }) &&
15162 ShuffleVectorInst::isIdentityMask(Mask, MSz));
15163 bool EnoughConstsForShuffle =
15164 IsSingleShuffle &&
15165 (none_of(GatheredScalars,
15166 [](Value *V) {
15167 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15168 }) ||
15169 any_of(GatheredScalars,
15170 [](Value *V) {
15171 return isa<Constant>(V) && !isa<UndefValue>(V);
15172 })) &&
15173 (!IsIdentityShuffle ||
15174 (GatheredScalars.size() == 2 &&
15175 any_of(GatheredScalars,
15176 [](Value *V) { return !isa<UndefValue>(V); })) ||
15177 count_if(GatheredScalars, [](Value *V) {
15178 return isa<Constant>(V) && !isa<PoisonValue>(V);
15179 }) > 1);
15180 // NonConstants array contains just non-constant values, GatheredScalars
15181 // contains only constants to build the final vector and then shuffle.
15182 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
15183 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
15184 NonConstants[I] = PoisonValue::get(OrigScalarTy);
15185 else
15186 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
15187 }
15188 // Generate constants for final shuffle and build a mask for them.
15189 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15190 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
15191 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
15192 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15193 ShuffleBuilder.add(BV, BVMask);
15194 }
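// If every remaining non-constant lane is poison (or an undef that the single
// identity, non-poisoned shuffle already covers), finalize with the masks
// computed so far; otherwise pass a callback that gathers the leftover
// non-constant scalars into the partially built vector as a final step.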
15195 if (all_of(NonConstants, [=](Value *V) {
15196 return isa<PoisonValue>(V) ||
15197 (IsSingleShuffle && ((IsIdentityShuffle &&
15198 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15199 }))
15200 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15201 SubVectorsMask);
15202 else
15203 Res = ShuffleBuilder.finalize(
15204 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15205 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
15206 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
15207 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
15208 });
15209 } else if (!allConstant(GatheredScalars)) {
15210 // Gather unique scalars and all constants.
15211 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
15212 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
15213 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
15214 ShuffleBuilder.add(BV, ReuseMask);
15215 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15216 SubVectorsMask);
15217 } else {
15218 // Gather all constants.
15219 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
15220 for (auto [I, V] : enumerate(GatheredScalars)) {
15221 if (!isa<PoisonValue>(V))
15222 Mask[I] = I;
15223 }
15224 Value *BV = ShuffleBuilder.gather(GatheredScalars);
15225 ShuffleBuilder.add(BV, Mask);
15226 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15227 SubVectorsMask);
15228 }
15229
15230 if (NeedFreeze)
15231 Res = ShuffleBuilder.createFreeze(Res);
15232 return Res;
15233}
15234
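// createBuildVector first vectorizes any combined sub-entries of the gather
// node and then delegates the actual emission to processBuildVector, using
// the IR-emitting ShuffleInstructionBuilder.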
15235Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
15236 bool PostponedPHIs) {
15237 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
15238 (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
15239 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15240 Builder, *this);
15241}
15242
15243/// \returns \p I after propagating metadata from \p VL only for instructions in
15244 /// \p VL.
15245 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
15246 SmallVector<Value *> Insts;
15247 for (Value *V : VL)
15248 if (isa<Instruction>(V))
15249 Insts.push_back(V);
15250 return llvm::propagateMetadata(Inst, Insts);
15251}
15252
15253Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
15254 IRBuilderBase::InsertPointGuard Guard(Builder);
15255
15256 if (E->VectorizedValue &&
15257 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15258 E->isAltShuffle())) {
15259 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
15260 return E->VectorizedValue;
15261 }
15262
15263 Value *V = E->Scalars.front();
15264 Type *ScalarTy = V->getType();
15265 if (!isa<CmpInst>(V))
15266 ScalarTy = getValueType(V);
15267 auto It = MinBWs.find(E);
15268 if (It != MinBWs.end()) {
15269 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15270 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
15271 if (VecTy)
15272 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
15273 }
15274 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
15275 if (E->isGather()) {
15276 // Set insert point for non-reduction initial nodes.
15277 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
15278 setInsertPointAfterBundle(E);
15279 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15280 E->VectorizedValue = Vec;
15281 return Vec;
15282 }
15283
15284 bool IsReverseOrder =
15285 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
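// FinalShuffle applies this entry's reorder and reuse shuffles (or the
// precomputed mask for vectorized stores), inserts any combined sub-vector
// entries, and returns the value in the lane order its users expect.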
15286 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
15287 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
15288 if (E->getOpcode() == Instruction::Store &&
15289 E->State == TreeEntry::Vectorize) {
15290 ArrayRef<int> Mask =
15291 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
15292 E->ReorderIndices.size());
15293 ShuffleBuilder.add(V, Mask);
15294 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15295 ShuffleBuilder.addOrdered(V, {});
15296 } else {
15297 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15298 }
15299 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
15300 E->CombinedEntriesWithIndices.size());
15301 transform(
15302 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
15303 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15304 });
15305 assert(
15306 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15307 "Expected either combined subnodes or reordering");
15308 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15309 };
15310
15311 assert(!E->isGather() && "Unhandled state");
15312 unsigned ShuffleOrOp =
15313 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15314 Instruction *VL0 = E->getMainOp();
15315 auto GetOperandSignedness = [&](unsigned Idx) {
15316 const TreeEntry *OpE = getOperandEntry(E, Idx);
15317 bool IsSigned = false;
15318 auto It = MinBWs.find(OpE);
15319 if (It != MinBWs.end())
15320 IsSigned = It->second.second;
15321 else
15322 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
15323 if (isa<PoisonValue>(R))
15324 return false;
15325 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15326 });
15327 return IsSigned;
15328 };
15329 switch (ShuffleOrOp) {
15330 case Instruction::PHI: {
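// PHIs are emitted in two phases: with PostponedPHIs set, only the vector PHI
// itself is created (so cyclic dependencies between PHIs can be broken); the
// incoming values are vectorized and filled in on a later visit, once the
// rest of the tree has been emitted.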
15331 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15332 E != VectorizableTree.front().get() ||
15333 !E->UserTreeIndices.empty()) &&
15334 "PHI reordering is free.");
15335 if (PostponedPHIs && E->VectorizedValue)
15336 return E->VectorizedValue;
15337 auto *PH = cast<PHINode>(VL0);
15338 Builder.SetInsertPoint(PH->getParent(),
15339 PH->getParent()->getFirstNonPHIIt());
15340 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15341 if (PostponedPHIs || !E->VectorizedValue) {
15342 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
15343 E->PHI = NewPhi;
15344 Value *V = NewPhi;
15345
15346 // Adjust insertion point once all PHI's have been generated.
15347 Builder.SetInsertPoint(PH->getParent(),
15348 PH->getParent()->getFirstInsertionPt());
15349 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15350
15351 V = FinalShuffle(V, E);
15352
15353 E->VectorizedValue = V;
15354 if (PostponedPHIs)
15355 return V;
15356 }
15357 PHINode *NewPhi = cast<PHINode>(E->PHI);
15358 // If phi node is fully emitted - exit.
15359 if (NewPhi->getNumIncomingValues() != 0)
15360 return NewPhi;
15361
15362 // PHINodes may have multiple entries from the same block. We want to
15363 // visit every block once.
15364 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
15365
15366 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
15367 ValueList Operands;
15368 BasicBlock *IBB = PH->getIncomingBlock(I);
15369
15370 // Stop emission if all incoming values are generated.
15371 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
15372 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15373 return NewPhi;
15374 }
15375
15376 if (!VisitedBBs.insert(IBB).second) {
15377 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
15378 continue;
15379 }
15380
15381 Builder.SetInsertPoint(IBB->getTerminator());
15382 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15383 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
15384 if (VecTy != Vec->getType()) {
15385 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
15386 MinBWs.contains(getOperandEntry(E, I))) &&
15387 "Expected item in MinBWs.");
15388 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
15389 }
15390 NewPhi->addIncoming(Vec, IBB);
15391 }
15392
15393 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
15394 "Invalid number of incoming values");
15395 assert(E->VectorizedValue && "Expected vectorized value.");
15396 return E->VectorizedValue;
15397 }
15398
15399 case Instruction::ExtractElement: {
15400 Value *V = E->getSingleOperand(0);
15401 if (const TreeEntry *TE = getTreeEntry(V))
15402 V = TE->VectorizedValue;
15403 setInsertPointAfterBundle(E);
15404 V = FinalShuffle(V, E);
15405 E->VectorizedValue = V;
15406 return V;
15407 }
15408 case Instruction::ExtractValue: {
15409 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15410 Builder.SetInsertPoint(LI);
15411 Value *Ptr = LI->getPointerOperand();
15412 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
15413 Value *NewV = ::propagateMetadata(V, E->Scalars);
15414 NewV = FinalShuffle(NewV, E);
15415 E->VectorizedValue = NewV;
15416 return NewV;
15417 }
15418 case Instruction::InsertElement: {
15419 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
15420 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
15421 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
15422 ArrayRef<Value *> Op = E->getOperand(1);
15423 Type *ScalarTy = Op.front()->getType();
15424 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
15425 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
15426 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
15427 assert(Res.first > 0 && "Expected item in MinBWs.");
15428 V = Builder.CreateIntCast(
15429 V,
15430 getWidenedType(
15431 ScalarTy,
15432 cast<FixedVectorType>(V->getType())->getNumElements()),
15433 Res.second);
15434 }
15435
15436 // Create InsertVector shuffle if necessary
15437 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15438 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15439 }));
15440 const unsigned NumElts =
15441 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15442 const unsigned NumScalars = E->Scalars.size();
15443
15444 unsigned Offset = *getElementIndex(VL0);
15445 assert(Offset < NumElts && "Failed to find vector index offset");
15446
15447 // Create shuffle to resize vector
15448 SmallVector<int> Mask;
15449 if (!E->ReorderIndices.empty()) {
15450 inversePermutation(E->ReorderIndices, Mask);
15451 Mask.append(NumElts - NumScalars, PoisonMaskElem);
15452 } else {
15453 Mask.assign(NumElts, PoisonMaskElem);
15454 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
15455 }
15456 // Create InsertVector shuffle if necessary
15457 bool IsIdentity = true;
15458 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
15459 Mask.swap(PrevMask);
15460 for (unsigned I = 0; I < NumScalars; ++I) {
15461 Value *Scalar = E->Scalars[PrevMask[I]];
15462 unsigned InsertIdx = *getElementIndex(Scalar);
15463 IsIdentity &= InsertIdx - Offset == I;
15464 Mask[InsertIdx - Offset] = I;
15465 }
15466 if (!IsIdentity || NumElts != NumScalars) {
15467 Value *V2 = nullptr;
15468 bool IsVNonPoisonous =
15469 isGuaranteedNotToBePoison(V, AC) && !isConstant(V);
15470 SmallVector<int> InsertMask(Mask);
15471 if (NumElts != NumScalars && Offset == 0) {
15472 // Follow all insert element instructions from the current buildvector
15473 // sequence.
15474 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
15475 do {
15476 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
15477 if (!InsertIdx)
15478 break;
15479 if (InsertMask[*InsertIdx] == PoisonMaskElem)
15480 InsertMask[*InsertIdx] = *InsertIdx;
15481 if (!Ins->hasOneUse())
15482 break;
15483 Ins = dyn_cast_or_null<InsertElementInst>(
15484 Ins->getUniqueUndroppableUser());
15485 } while (Ins);
15486 SmallBitVector UseMask =
15487 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15488 SmallBitVector IsFirstPoison =
15489 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15490 SmallBitVector IsFirstUndef =
15491 isUndefVector(FirstInsert->getOperand(0), UseMask);
15492 if (!IsFirstPoison.all()) {
15493 unsigned Idx = 0;
15494 for (unsigned I = 0; I < NumElts; I++) {
15495 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
15496 IsFirstUndef.test(I)) {
15497 if (IsVNonPoisonous) {
15498 InsertMask[I] = I < NumScalars ? I : 0;
15499 continue;
15500 }
15501 if (!V2)
15502 V2 = UndefValue::get(V->getType());
15503 if (Idx >= NumScalars)
15504 Idx = NumScalars - 1;
15505 InsertMask[I] = NumScalars + Idx;
15506 ++Idx;
15507 } else if (InsertMask[I] != PoisonMaskElem &&
15508 Mask[I] == PoisonMaskElem) {
15509 InsertMask[I] = PoisonMaskElem;
15510 }
15511 }
15512 } else {
15513 InsertMask = Mask;
15514 }
15515 }
15516 if (!V2)
15517 V2 = PoisonValue::get(V->getType());
15518 V = Builder.CreateShuffleVector(V, V2, InsertMask);
15519 if (auto *I = dyn_cast<Instruction>(V)) {
15520 GatherShuffleExtractSeq.insert(I);
15521 CSEBlocks.insert(I->getParent());
15522 }
15523 }
15524
15525 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15526 for (unsigned I = 0; I < NumElts; I++) {
15527 if (Mask[I] != PoisonMaskElem)
15528 InsertMask[Offset + I] = I;
15529 }
15530 SmallBitVector UseMask =
15531 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15532 SmallBitVector IsFirstUndef =
15533 isUndefVector(FirstInsert->getOperand(0), UseMask);
15534 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
15535 NumElts != NumScalars) {
15536 if (IsFirstUndef.all()) {
15537 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
15538 SmallBitVector IsFirstPoison =
15539 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15540 if (!IsFirstPoison.all()) {
15541 for (unsigned I = 0; I < NumElts; I++) {
15542 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
15543 InsertMask[I] = I + NumElts;
15544 }
15545 }
15546 V = Builder.CreateShuffleVector(
15547 V,
15548 IsFirstPoison.all() ? PoisonValue::get(V->getType())
15549 : FirstInsert->getOperand(0),
15550 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
15551 if (auto *I = dyn_cast<Instruction>(V)) {
15552 GatherShuffleExtractSeq.insert(I);
15553 CSEBlocks.insert(I->getParent());
15554 }
15555 }
15556 } else {
15557 SmallBitVector IsFirstPoison =
15558 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15559 for (unsigned I = 0; I < NumElts; I++) {
15560 if (InsertMask[I] == PoisonMaskElem)
15561 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
15562 else
15563 InsertMask[I] += NumElts;
15564 }
15565 V = Builder.CreateShuffleVector(
15566 FirstInsert->getOperand(0), V, InsertMask,
15567 cast<Instruction>(E->Scalars.back())->getName());
15568 if (auto *I = dyn_cast<Instruction>(V)) {
15569 GatherShuffleExtractSeq.insert(I);
15570 CSEBlocks.insert(I->getParent());
15571 }
15572 }
15573 }
15574
15575 ++NumVectorInstructions;
15576 E->VectorizedValue = V;
15577 return V;
15578 }
15579 case Instruction::ZExt:
15580 case Instruction::SExt:
15581 case Instruction::FPToUI:
15582 case Instruction::FPToSI:
15583 case Instruction::FPExt:
15584 case Instruction::PtrToInt:
15585 case Instruction::IntToPtr:
15586 case Instruction::SIToFP:
15587 case Instruction::UIToFP:
15588 case Instruction::Trunc:
15589 case Instruction::FPTrunc:
15590 case Instruction::BitCast: {
15591 setInsertPointAfterBundle(E);
15592
15593 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15594 if (E->VectorizedValue) {
15595 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15596 return E->VectorizedValue;
15597 }
15598
15599 auto *CI = cast<CastInst>(VL0);
15600 Instruction::CastOps VecOpcode = CI->getOpcode();
15601 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
15602 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15603 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15604 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
15605 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
15606 // Check if the values are candidates to demote.
15607 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
15608 if (SrcIt != MinBWs.end())
15609 SrcBWSz = SrcIt->second.first;
15610 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15611 if (BWSz == SrcBWSz) {
15612 VecOpcode = Instruction::BitCast;
15613 } else if (BWSz < SrcBWSz) {
15614 VecOpcode = Instruction::Trunc;
15615 } else if (It != MinBWs.end()) {
15616 assert(BWSz > SrcBWSz && "Invalid cast!");
15617 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15618 } else if (SrcIt != MinBWs.end()) {
15619 assert(BWSz > SrcBWSz && "Invalid cast!");
15620 VecOpcode =
15621 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15622 }
15623 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15624 !SrcIt->second.second) {
15625 VecOpcode = Instruction::UIToFP;
15626 }
15627 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15628 ? InVec
15629 : Builder.CreateCast(VecOpcode, InVec, VecTy);
15630 V = FinalShuffle(V, E);
15631
15632 E->VectorizedValue = V;
15633 ++NumVectorInstructions;
15634 return V;
15635 }
15636 case Instruction::FCmp:
15637 case Instruction::ICmp: {
15638 setInsertPointAfterBundle(E);
15639
15640 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
15641 if (E->VectorizedValue) {
15642 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15643 return E->VectorizedValue;
15644 }
15645 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
15646 if (E->VectorizedValue) {
15647 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15648 return E->VectorizedValue;
15649 }
15650 if (L->getType() != R->getType()) {
15651 assert((getOperandEntry(E, 0)->isGather() ||
15652 getOperandEntry(E, 1)->isGather() ||
15653 MinBWs.contains(getOperandEntry(E, 0)) ||
15654 MinBWs.contains(getOperandEntry(E, 1))) &&
15655 "Expected item in MinBWs.");
15656 if (cast<VectorType>(L->getType())
15657 ->getElementType()
15658 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
15659 ->getElementType()
15660 ->getIntegerBitWidth()) {
15661 Type *CastTy = R->getType();
15662 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
15663 } else {
15664 Type *CastTy = L->getType();
15665 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
15666 }
15667 }
15668
15669 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
15670 Value *V = Builder.CreateCmp(P0, L, R);
15671 propagateIRFlags(V, E->Scalars, VL0);
15672 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
15673 ICmp->setSameSign(/*B=*/false);
15674 // Do not cast for cmps.
15675 VecTy = cast<FixedVectorType>(V->getType());
15676 V = FinalShuffle(V, E);
15677
15678 E->VectorizedValue = V;
15679 ++NumVectorInstructions;
15680 return V;
15681 }
15682 case Instruction::Select: {
15683 setInsertPointAfterBundle(E);
15684
15685 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
15686 if (E->VectorizedValue) {
15687 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15688 return E->VectorizedValue;
15689 }
15690 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15691 if (E->VectorizedValue) {
15692 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15693 return E->VectorizedValue;
15694 }
15695 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15696 if (E->VectorizedValue) {
15697 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15698 return E->VectorizedValue;
15699 }
15700 if (True->getType() != VecTy || False->getType() != VecTy) {
15701 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
15702 getOperandEntry(E, 2)->isGather() ||
15703 MinBWs.contains(getOperandEntry(E, 1)) ||
15704 MinBWs.contains(getOperandEntry(E, 2))) &&
15705 "Expected item in MinBWs.");
15706 if (True->getType() != VecTy)
15707 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
15708 if (False->getType() != VecTy)
15709 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
15710 }
15711
15712 unsigned CondNumElements = getNumElements(Cond->getType());
15713 unsigned TrueNumElements = getNumElements(True->getType());
15714 assert(TrueNumElements >= CondNumElements &&
15715 TrueNumElements % CondNumElements == 0 &&
15716 "Cannot vectorize Instruction::Select");
15717 assert(TrueNumElements == getNumElements(False->getType()) &&
15718 "Cannot vectorize Instruction::Select");
15719 if (CondNumElements != TrueNumElements) {
15720 // When the return type is i1 but the source is fixed vector type, we
15721 // need to duplicate the condition value.
15722 Cond = Builder.CreateShuffleVector(
15723 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
15724 CondNumElements));
15725 }
15726 assert(getNumElements(Cond->getType()) == TrueNumElements &&
15727 "Cannot vectorize Instruction::Select");
15728 Value *V = Builder.CreateSelect(Cond, True, False);
15729 V = FinalShuffle(V, E);
15730
15731 E->VectorizedValue = V;
15732 ++NumVectorInstructions;
15733 return V;
15734 }
15735 case Instruction::FNeg: {
15736 setInsertPointAfterBundle(E);
15737
15738 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15739
15740 if (E->VectorizedValue) {
15741 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15742 return E->VectorizedValue;
15743 }
15744
15745 Value *V = Builder.CreateUnOp(
15746 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
15747 propagateIRFlags(V, E->Scalars, VL0);
15748 if (auto *I = dyn_cast<Instruction>(V))
15749 V = ::propagateMetadata(I, E->Scalars);
15750
15751 V = FinalShuffle(V, E);
15752
15753 E->VectorizedValue = V;
15754 ++NumVectorInstructions;
15755
15756 return V;
15757 }
15758 case Instruction::Freeze: {
15759 setInsertPointAfterBundle(E);
15760
15761 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15762
15763 if (E->VectorizedValue) {
15764 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15765 return E->VectorizedValue;
15766 }
15767
15768 if (Op->getType() != VecTy) {
15769 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15770 MinBWs.contains(getOperandEntry(E, 0))) &&
15771 "Expected item in MinBWs.");
15772 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
15773 }
15774 Value *V = Builder.CreateFreeze(Op);
15775 V = FinalShuffle(V, E);
15776
15777 E->VectorizedValue = V;
15778 ++NumVectorInstructions;
15779
15780 return V;
15781 }
15782 case Instruction::Add:
15783 case Instruction::FAdd:
15784 case Instruction::Sub:
15785 case Instruction::FSub:
15786 case Instruction::Mul:
15787 case Instruction::FMul:
15788 case Instruction::UDiv:
15789 case Instruction::SDiv:
15790 case Instruction::FDiv:
15791 case Instruction::URem:
15792 case Instruction::SRem:
15793 case Instruction::FRem:
15794 case Instruction::Shl:
15795 case Instruction::LShr:
15796 case Instruction::AShr:
15797 case Instruction::And:
15798 case Instruction::Or:
15799 case Instruction::Xor: {
15800 setInsertPointAfterBundle(E);
15801
15802 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
15803 if (E->VectorizedValue) {
15804 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15805 return E->VectorizedValue;
15806 }
15807 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
15808 if (E->VectorizedValue) {
15809 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15810 return E->VectorizedValue;
15811 }
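// Special case for 'and' under bitwidth demotion: if one operand is a
// constant whose low It->second.first bits are all ones, the mask becomes a
// no-op after truncation, so the other (shuffled) operand is returned
// directly.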
15812 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15813 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15814 ArrayRef<Value *> Ops = E->getOperand(I);
15815 if (all_of(Ops, [&](Value *Op) {
15816 auto *CI = dyn_cast<ConstantInt>(Op);
15817 return CI && CI->getValue().countr_one() >= It->second.first;
15818 })) {
15819 V = FinalShuffle(I == 0 ? RHS : LHS, E);
15820 E->VectorizedValue = V;
15821 ++NumVectorInstructions;
15822 return V;
15823 }
15824 }
15825 }
15826 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
15827 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15828 getOperandEntry(E, 1)->isGather() ||
15829 MinBWs.contains(getOperandEntry(E, 0)) ||
15830 MinBWs.contains(getOperandEntry(E, 1))) &&
15831 "Expected item in MinBWs.");
15832 if (LHS->getType() != VecTy)
15833 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
15834 if (RHS->getType() != VecTy)
15835 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
15836 }
15837
15838 Value *V = Builder.CreateBinOp(
15839 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
15840 RHS);
15841 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
15842 if (auto *I = dyn_cast<Instruction>(V)) {
15843 V = ::propagateMetadata(I, E->Scalars);
15844 // Drop nuw flags for abs(sub(commutative), true).
15845 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
15846 any_of(E->Scalars, [](Value *V) {
15847 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15848 }))
15849 I->setHasNoUnsignedWrap(/*b=*/false);
15850 }
15851
15852 V = FinalShuffle(V, E);
15853
15854 E->VectorizedValue = V;
15855 ++NumVectorInstructions;
15856
15857 return V;
15858 }
15859 case Instruction::Load: {
15860 // Loads are inserted at the head of the tree because we don't want to
15861 // sink them all the way down past store instructions.
15862 setInsertPointAfterBundle(E);
15863
15864 LoadInst *LI = cast<LoadInst>(VL0);
15865 Instruction *NewLI;
15866 Value *PO = LI->getPointerOperand();
15867 if (E->State == TreeEntry::Vectorize) {
15868 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
15869 } else if (E->State == TreeEntry::StridedVectorize) {
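// Strided loads are emitted as llvm.experimental.vp.strided.load. When the
// pointer difference between the first and last scalar is known at compile
// time, the stride is a constant (negated for reversed order); otherwise a
// runtime stride is computed via calculateRtStride and scaled by the element
// allocation size.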
15870 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
15871 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
15872 PO = IsReverseOrder ? PtrN : Ptr0;
15873 std::optional<int> Diff = getPointersDiff(
15874 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
15875 Type *StrideTy = DL->getIndexType(PO->getType());
15876 Value *StrideVal;
15877 if (Diff) {
15878 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
15879 StrideVal =
15880 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
15881 DL->getTypeAllocSize(ScalarTy));
15882 } else {
15883 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
15884 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
15885 return cast<LoadInst>(V)->getPointerOperand();
15886 });
15887 OrdersType Order;
15888 std::optional<Value *> Stride =
15889 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
15890 &*Builder.GetInsertPoint());
15891 Value *NewStride =
15892 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
15893 StrideVal = Builder.CreateMul(
15894 NewStride,
15895 ConstantInt::get(
15896 StrideTy,
15897 (IsReverseOrder ? -1 : 1) *
15898 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
15899 }
15900 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15901 auto *Inst = Builder.CreateIntrinsic(
15902 Intrinsic::experimental_vp_strided_load,
15903 {VecTy, PO->getType(), StrideTy},
15904 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
15905 Builder.getInt32(E->Scalars.size())});
15906 Inst->addParamAttr(
15907 /*ArgNo=*/0,
15908 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15909 NewLI = Inst;
15910 } else {
15911 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
15912 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
15913 if (E->VectorizedValue) {
15914 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15915 return E->VectorizedValue;
15916 }
15917 if (isa<FixedVectorType>(ScalarTy)) {
15918 assert(SLPReVec && "FixedVectorType is not expected.");
15919 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
15920 // to expand VecPtr if ScalarTy is a vector type.
15921 unsigned ScalarTyNumElements =
15922 cast<FixedVectorType>(ScalarTy)->getNumElements();
15923 unsigned VecTyNumElements =
15924 cast<FixedVectorType>(VecTy)->getNumElements();
15925 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
15926 "Cannot expand getelementptr.");
15927 unsigned VF = VecTyNumElements / ScalarTyNumElements;
15928 SmallVector<Constant *> Indices(VecTyNumElements);
15929 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
15930 return Builder.getInt64(I % ScalarTyNumElements);
15931 });
15932 VecPtr = Builder.CreateGEP(
15933 VecTy->getElementType(),
15934 Builder.CreateShuffleVector(
15935 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
15936 ConstantVector::get(Indices));
15937 }
15938 // Use the minimum alignment of the gathered loads.
15939 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15940 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
15941 }
15942 Value *V = ::propagateMetadata(NewLI, E->Scalars);
15943
15944 V = FinalShuffle(V, E);
15945 E->VectorizedValue = V;
15946 ++NumVectorInstructions;
15947 return V;
15948 }
15949 case Instruction::Store: {
15950 auto *SI = cast<StoreInst>(VL0);
15951
15952 setInsertPointAfterBundle(E);
15953
15954 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
15955 if (VecValue->getType() != VecTy)
15956 VecValue =
15957 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
15958 VecValue = FinalShuffle(VecValue, E);
15959
15960 Value *Ptr = SI->getPointerOperand();
15961 Instruction *ST;
15962 if (E->State == TreeEntry::Vectorize) {
15963 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
15964 } else {
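// Strided stores are emitted as llvm.experimental.vp.strided.store with a
// stride of minus one element: the lanes are written to decreasing addresses
// starting at the pointer of the (reordered) first scalar.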
15965 assert(E->State == TreeEntry::StridedVectorize &&
15966 "Expected either strided or consecutive stores.");
15967 if (!E->ReorderIndices.empty()) {
15968 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
15969 Ptr = SI->getPointerOperand();
15970 }
15971 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
15972 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
15973 auto *Inst = Builder.CreateIntrinsic(
15974 Intrinsic::experimental_vp_strided_store,
15975 {VecTy, Ptr->getType(), StrideTy},
15976 {VecValue, Ptr,
15977 ConstantInt::get(
15978 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
15979 Builder.getAllOnesMask(VecTy->getElementCount()),
15980 Builder.getInt32(E->Scalars.size())});
15981 Inst->addParamAttr(
15982 /*ArgNo=*/1,
15983 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15984 ST = Inst;
15985 }
15986
15987 Value *V = ::propagateMetadata(ST, E->Scalars);
15988
15989 E->VectorizedValue = V;
15990 ++NumVectorInstructions;
15991 return V;
15992 }
15993 case Instruction::GetElementPtr: {
15994 auto *GEP0 = cast<GetElementPtrInst>(VL0);
15995 setInsertPointAfterBundle(E);
15996
15997 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
15998 if (E->VectorizedValue) {
15999 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16000 return E->VectorizedValue;
16001 }
16002
16003 SmallVector<Value *> OpVecs;
16004 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
16005 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
16006 if (E->VectorizedValue) {
16007 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16008 return E->VectorizedValue;
16009 }
16010 OpVecs.push_back(OpVec);
16011 }
16012
16013 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
16014 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
16015 SmallVector<Value *> GEPs;
16016 for (Value *V : E->Scalars) {
16017 if (isa<GetElementPtrInst>(V))
16018 GEPs.push_back(V);
16019 }
16020 V = ::propagateMetadata(I, GEPs);
16021 }
16022
16023 V = FinalShuffle(V, E);
16024
16025 E->VectorizedValue = V;
16026 ++NumVectorInstructions;
16027
16028 return V;
16029 }
16030 case Instruction::Call: {
16031 CallInst *CI = cast<CallInst>(VL0);
16032 setInsertPointAfterBundle(E);
16033
16034 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
16035
16036 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
16037 CI, ID, VecTy->getNumElements(),
16038 It != MinBWs.end() ? It->second.first : 0, TTI);
16039 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
16040 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
16041 VecCallCosts.first <= VecCallCosts.second;
16042
16043 Value *ScalarArg = nullptr;
16044 SmallVector<Value *> OpVecs;
16045 SmallVector<Type *, 2> TysForDecl;
16046 // Add return type if intrinsic is overloaded on it.
16047 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
16048 TysForDecl.push_back(VecTy);
16049 auto *CEI = cast<CallInst>(VL0);
16050 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
16051 ValueList OpVL;
16052 // Some intrinsics have scalar arguments. This argument should not be
16053 // vectorized.
16054 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
16055 ScalarArg = CEI->getArgOperand(I);
16056 // If we decided to reduce the bitwidth of the abs intrinsic, its second
16057 // argument must be set to false (do not return poison if the value is signed min).
16058 if (ID == Intrinsic::abs && It != MinBWs.end() &&
16059 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
16060 ScalarArg = Builder.getFalse();
16061 OpVecs.push_back(ScalarArg);
16062 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16063 TysForDecl.push_back(ScalarArg->getType());
16064 continue;
16065 }
16066
16067 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
16068 if (E->VectorizedValue) {
16069 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16070 return E->VectorizedValue;
16071 }
16072 ScalarArg = CEI->getArgOperand(I);
16073 if (cast<VectorType>(OpVec->getType())->getElementType() !=
16074 ScalarArg->getType()->getScalarType() &&
16075 It == MinBWs.end()) {
16076 auto *CastTy =
16077 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
16078 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
16079 } else if (It != MinBWs.end()) {
16080 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
16081 }
16082 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
16083 OpVecs.push_back(OpVec);
16084 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16085 TysForDecl.push_back(OpVec->getType());
16086 }
16087
16088 Function *CF;
16089 if (!UseIntrinsic) {
16090 VFShape Shape =
16091 VFShape::get(CI->getFunctionType(),
16092 ElementCount::getFixed(
16093 static_cast<unsigned>(VecTy->getNumElements())),
16094 false /*HasGlobalPred*/);
16095 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
16096 } else {
16097 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
16098 }
16099
16100 SmallVector<OperandBundleDef, 1> OpBundles;
16101 CI->getOperandBundlesAsDefs(OpBundles);
16102 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
16103
16104 propagateIRFlags(V, E->Scalars, VL0);
16105 V = FinalShuffle(V, E);
16106
16107 E->VectorizedValue = V;
16108 ++NumVectorInstructions;
16109 return V;
16110 }
16111 case Instruction::ShuffleVector: {
16112 Value *V;
16113 if (SLPReVec && !E->isAltShuffle()) {
16114 setInsertPointAfterBundle(E);
16115 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
16116 if (E->VectorizedValue) {
16117 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16118 return E->VectorizedValue;
16119 }
16120 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
16121 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
16122 assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
16123 "Not supported shufflevector usage.");
16124 SmallVector<int> NewMask(ThisMask.size());
16125 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
16126 return SVSrc->getShuffleMask()[Mask];
16127 });
16128 V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
16129 } else {
16130 V = Builder.CreateShuffleVector(Src, ThisMask);
16131 }
16132 propagateIRFlags(V, E->Scalars, VL0);
16133 if (auto *I = dyn_cast<Instruction>(V))
16134 V = ::propagateMetadata(I, E->Scalars);
16135 V = FinalShuffle(V, E);
16136 } else {
16137 assert(E->isAltShuffle() &&
16138 ((Instruction::isBinaryOp(E->getOpcode()) &&
16139 Instruction::isBinaryOp(E->getAltOpcode())) ||
16140 (Instruction::isCast(E->getOpcode()) &&
16141 Instruction::isCast(E->getAltOpcode())) ||
16142 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16143 "Invalid Shuffle Vector Operand");
16144
16145 Value *LHS = nullptr, *RHS = nullptr;
16146 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
16147 setInsertPointAfterBundle(E);
16148 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16149 if (E->VectorizedValue) {
16150 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16151 return E->VectorizedValue;
16152 }
16153 RHS = vectorizeOperand(E, 1, PostponedPHIs);
16154 } else {
16155 setInsertPointAfterBundle(E);
16156 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16157 }
16158 if (E->VectorizedValue) {
16159 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16160 return E->VectorizedValue;
16161 }
16162 if (LHS && RHS &&
16163 ((Instruction::isBinaryOp(E->getOpcode()) &&
16164 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
16165 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
16166 assert((It != MinBWs.end() ||
16167 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
16168 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
16169 MinBWs.contains(getOperandEntry(E, 0)) ||
16170 MinBWs.contains(getOperandEntry(E, 1))) &&
16171 "Expected item in MinBWs.");
16172 Type *CastTy = VecTy;
16173 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
16174 if (cast<VectorType>(LHS->getType())
16175 ->getElementType()
16176 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
16177 ->getElementType()
16178 ->getIntegerBitWidth())
16179 CastTy = RHS->getType();
16180 else
16181 CastTy = LHS->getType();
16182 }
16183 if (LHS->getType() != CastTy)
16184 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
16185 if (RHS->getType() != CastTy)
16186 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
16187 }
16188
16189 Value *V0, *V1;
16190 if (Instruction::isBinaryOp(E->getOpcode())) {
16191 V0 = Builder.CreateBinOp(
16192 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
16193 V1 = Builder.CreateBinOp(
16194 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
16195 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
16196 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
16197 auto *AltCI = cast<CmpInst>(E->getAltOp());
16198 CmpInst::Predicate AltPred = AltCI->getPredicate();
16199 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
16200 } else {
16201 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
16202 unsigned SrcBWSz = DL->getTypeSizeInBits(
16203 cast<VectorType>(LHS->getType())->getElementType());
16204 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16205 if (BWSz <= SrcBWSz) {
16206 if (BWSz < SrcBWSz)
16207 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
16208 assert(LHS->getType() == VecTy &&
16209 "Expected same type as operand.");
16210 if (auto *I = dyn_cast<Instruction>(LHS))
16211 LHS = ::propagateMetadata(I, E->Scalars);
16212 LHS = FinalShuffle(LHS, E);
16213 E->VectorizedValue = LHS;
16214 ++NumVectorInstructions;
16215 return LHS;
16216 }
16217 }
16218 V0 = Builder.CreateCast(
16219 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
16220 V1 = Builder.CreateCast(
16221 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
16222 }
16223 // Add V0 and V1 to later analysis to try to find and remove matching
16224 // instruction, if any.
16225 for (Value *V : {V0, V1}) {
16226 if (auto *I = dyn_cast<Instruction>(V)) {
16227 GatherShuffleExtractSeq.insert(I);
16228 CSEBlocks.insert(I->getParent());
16229 }
16230 }
16231
16232 // Create shuffle to take alternate operations from the vector.
16233 // Also, gather up main and alt scalar ops to propagate IR flags to
16234 // each vector operation.
16235 ValueList OpScalars, AltScalars;
16236 SmallVector<int> Mask;
16237 E->buildAltOpShuffleMask(
16238 [E, this](Instruction *I) {
16239 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
16240 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
16241 *TLI);
16242 },
16243 Mask, &OpScalars, &AltScalars);
16244
16245 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
16246 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
16247 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
16248 // Drop nuw flags for abs(sub(commutative), true).
16249 if (auto *I = dyn_cast<Instruction>(Vec);
16250 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
16251 any_of(E->Scalars, [](Value *V) {
16252 if (isa<PoisonValue>(V))
16253 return false;
16254 auto *IV = cast<Instruction>(V);
16255 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
16256 }))
16257 I->setHasNoUnsignedWrap(/*b=*/false);
16258 };
16259 DropNuwFlag(V0, E->getOpcode());
16260 DropNuwFlag(V1, E->getAltOpcode());
16261
16262 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
16263 assert(SLPReVec && "FixedVectorType is not expected.");
16264 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
16265 }
16266 V = Builder.CreateShuffleVector(V0, V1, Mask);
16267 if (auto *I = dyn_cast<Instruction>(V)) {
16268 V = ::propagateMetadata(I, E->Scalars);
16269 GatherShuffleExtractSeq.insert(I);
16270 CSEBlocks.insert(I->getParent());
16271 }
16272 }
16273
16274 E->VectorizedValue = V;
16275 ++NumVectorInstructions;
16276
16277 return V;
16278 }
16279 default:
16280 llvm_unreachable("unknown inst");
16281 }
16282 return nullptr;
16283}
16284
16285 Value *BoUpSLP::vectorizeTree() {
16286 ExtraValueToDebugLocsMap ExternallyUsedValues;
16287 return vectorizeTree(ExternallyUsedValues);
16288}
16289
16290 Value *
16291 BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
16292 Instruction *ReductionRoot) {
16293 // All blocks must be scheduled before any instructions are inserted.
16294 for (auto &BSIter : BlocksSchedules) {
16295 scheduleBlock(BSIter.second.get());
16296 }
16297 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
16298 // need to rebuild it.
16299 EntryToLastInstruction.clear();
16300
16301 if (ReductionRoot)
16302 Builder.SetInsertPoint(ReductionRoot->getParent(),
16303 ReductionRoot->getIterator());
16304 else
16305 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16306
16307 // Emit gathered loads first to emit better code for the users of those
16308 // gathered loads.
16309 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16310 if (GatheredLoadsEntriesFirst.has_value() &&
16311 TE->Idx >= *GatheredLoadsEntriesFirst &&
16312 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16313 assert((!TE->UserTreeIndices.empty() ||
16314 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16315 "Expected gathered load node.");
16316 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16317 }
16318 }
16319 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
16320 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
16321 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16322 if (TE->State == TreeEntry::Vectorize &&
16323 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16324 TE->VectorizedValue)
16325 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16326 // Run through the list of postponed gathers and emit them, replacing the temp
16327 // emitted allocas with actual vector instructions.
16328 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
16329 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
16330 for (const TreeEntry *E : PostponedNodes) {
16331 auto *TE = const_cast<TreeEntry *>(E);
16332 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
16333 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
16334 TE->UserTreeIndices.front().EdgeIdx)) &&
16335 VecTE->isSame(TE->Scalars))
16336 // Found gather node which is absolutely the same as one of the
16337 // vectorized nodes. It may happen after reordering.
16338 continue;
16339 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16340 TE->VectorizedValue = nullptr;
16341 auto *UserI =
16342 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
16343 // If the user is a PHI node, its vector code has to be inserted right before
16344 // the block terminator. Since the node was delayed, there were some unresolved
16345 // dependencies at the moment the stub instruction was emitted. If any of
16346 // these dependencies turn out to be an operand of another PHI coming from
16347 // this same block, the position of the stub instruction becomes invalid,
16348 // because the source vector that is supposed to feed this gather node is
16349 // inserted at the end of the block [after the stub instruction]. So we need
16350 // to adjust the insertion point again to the end of the block.
16351 if (isa<PHINode>(UserI)) {
16352 // Insert before all users.
16353 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
16354 for (User *U : PrevVec->users()) {
16355 if (U == UserI)
16356 continue;
16357 auto *UI = dyn_cast<Instruction>(U);
16358 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
16359 continue;
16360 if (UI->comesBefore(InsertPt))
16361 InsertPt = UI;
16362 }
16363 Builder.SetInsertPoint(InsertPt);
16364 } else {
16365 Builder.SetInsertPoint(PrevVec);
16366 }
16367 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
16368 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
16369 if (auto *VecI = dyn_cast<Instruction>(Vec);
16370 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
16371 Builder.GetInsertPoint()->comesBefore(VecI))
16372 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
16373 Builder.GetInsertPoint());
16374 if (Vec->getType() != PrevVec->getType()) {
16375 assert(Vec->getType()->isIntOrIntVectorTy() &&
16376 PrevVec->getType()->isIntOrIntVectorTy() &&
16377 "Expected integer vector types only.");
16378 std::optional<bool> IsSigned;
16379 for (Value *V : TE->Scalars) {
16380 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
16381 auto It = MinBWs.find(BaseTE);
16382 if (It != MinBWs.end()) {
16383 IsSigned = IsSigned.value_or(false) || It->second.second;
16384 if (*IsSigned)
16385 break;
16386 }
16387 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
16388 auto It = MinBWs.find(MNTE);
16389 if (It != MinBWs.end()) {
16390 IsSigned = IsSigned.value_or(false) || It->second.second;
16391 if (*IsSigned)
16392 break;
16393 }
16394 }
16395 if (IsSigned.value_or(false))
16396 break;
16397 // Scan through gather nodes.
16398 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16399 auto It = MinBWs.find(BVE);
16400 if (It != MinBWs.end()) {
16401 IsSigned = IsSigned.value_or(false) || It->second.second;
16402 if (*IsSigned)
16403 break;
16404 }
16405 }
16406 if (IsSigned.value_or(false))
16407 break;
16408 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
16409 IsSigned =
16410 IsSigned.value_or(false) ||
16411 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
16412 continue;
16413 }
16414 if (IsSigned.value_or(false))
16415 break;
16416 }
16417 }
16418 if (IsSigned.value_or(false)) {
16419 // Final attempt - check user node.
16420 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
16421 if (It != MinBWs.end())
16422 IsSigned = It->second.second;
16423 }
16424 assert(IsSigned &&
16425 "Expected user node or perfect diamond match in MinBWs.");
16426 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
16427 }
16428 PrevVec->replaceAllUsesWith(Vec);
16429 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
16430 // Replace the stub vector node, if it was used before for one of the
16431 // buildvector nodes already.
16432 auto It = PostponedValues.find(PrevVec);
16433 if (It != PostponedValues.end()) {
16434 for (TreeEntry *VTE : It->getSecond())
16435 VTE->VectorizedValue = Vec;
16436 }
16437 eraseInstruction(PrevVec);
16438 }
16439
16440 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
16441 << " values .\n");
16442
16444 // Maps vector instruction to original insertelement instruction
16445 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
16446 // Maps extract Scalar to the corresponding extractelement instruction in the
16447 // basic block. Only one extractelement per block should be emitted.
16449 ScalarToEEs;
16450 SmallDenseSet<Value *, 4> UsedInserts;
16452 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
16454 // Extract all of the elements with the external uses.
16455 for (const auto &ExternalUse : ExternalUses) {
16456 Value *Scalar = ExternalUse.Scalar;
16457 llvm::User *User = ExternalUse.User;
16458
16459 // Skip users that we already RAUW. This happens when one instruction
16460 // has multiple uses of the same value.
16461 if (User && !is_contained(Scalar->users(), User))
16462 continue;
16463 TreeEntry *E = getTreeEntry(Scalar);
16464 assert(E && "Invalid scalar");
16465 assert(!E->isGather() && "Extracting from a gather list");
16466 // Non-instruction pointers are not deleted, just skip them.
16467 if (E->getOpcode() == Instruction::GetElementPtr &&
16468 !isa<GetElementPtrInst>(Scalar))
16469 continue;
16470
16471 Value *Vec = E->VectorizedValue;
16472 assert(Vec && "Can't find vectorizable value");
16473
16474 Value *Lane = Builder.getInt32(ExternalUse.Lane);
16475 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
16476 if (Scalar->getType() != Vec->getType()) {
16477 Value *Ex = nullptr;
16478 Value *ExV = nullptr;
16479 auto *Inst = dyn_cast<Instruction>(Scalar);
16480 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
16481 auto It = ScalarToEEs.find(Scalar);
16482 if (It != ScalarToEEs.end()) {
16483 // No need to emit many extracts, just move the only one in the
16484 // current block.
16485 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16486 : Builder.GetInsertBlock());
16487 if (EEIt != It->second.end()) {
16488 Value *PrevV = EEIt->second.first;
16489 if (auto *I = dyn_cast<Instruction>(PrevV);
16490 I && !ReplaceInst &&
16491 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
16492 Builder.GetInsertPoint()->comesBefore(I)) {
16493 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
16494 Builder.GetInsertPoint());
16495 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16496 CI->moveAfter(I);
16497 }
16498 Ex = PrevV;
16499 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16500 }
16501 }
16502 if (!Ex) {
16503 // "Reuse" the existing extract to improve final codegen.
16504 if (ReplaceInst) {
16505 // Leave the instruction as is if it is a cheaper extract and all
16506 // operands are scalar.
16507 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16508 IgnoredExtracts.insert(EE);
16509 Ex = EE;
16510 } else {
16511 auto *CloneInst = Inst->clone();
16512 CloneInst->insertBefore(Inst);
16513 if (Inst->hasName())
16514 CloneInst->takeName(Inst);
16515 Ex = CloneInst;
16516 }
16517 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16518 ES && isa<Instruction>(Vec)) {
16519 Value *V = ES->getVectorOperand();
16520 auto *IVec = cast<Instruction>(Vec);
16521 if (const TreeEntry *ETE = getTreeEntry(V))
16522 V = ETE->VectorizedValue;
16523 if (auto *IV = dyn_cast<Instruction>(V);
16524 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
16525 IV->comesBefore(IVec))
16526 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
16527 else
16528 Ex = Builder.CreateExtractElement(Vec, Lane);
16529 } else if (auto *VecTy =
16530 dyn_cast<FixedVectorType>(Scalar->getType())) {
16531 assert(SLPReVec && "FixedVectorType is not expected.");
16532 unsigned VecTyNumElements = VecTy->getNumElements();
16533 // When REVEC is enabled, we need to extract a vector.
16534 // Note: The element size of Scalar may be different from the
16535 // element size of Vec.
16536 Ex = Builder.CreateExtractVector(
16538 VecTyNumElements),
16539 Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
16540 } else {
16541 Ex = Builder.CreateExtractElement(Vec, Lane);
16542 }
16543 // If necessary, sign-extend or zero-extend ScalarRoot
16544 // to the larger type.
16545 ExV = Ex;
16546 if (Scalar->getType() != Ex->getType())
16547 ExV = Builder.CreateIntCast(
16548 Ex, Scalar->getType(),
16549 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
16550 auto *I = dyn_cast<Instruction>(Ex);
16551 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
16552 : &F->getEntryBlock(),
16553 std::make_pair(Ex, ExV));
16554 }
16555 // The then-branch of the previous if may produce constants, since operand
16556 // 0 might be a constant.
16557 if (auto *ExI = dyn_cast<Instruction>(Ex);
16558 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
16559 GatherShuffleExtractSeq.insert(ExI);
16560 CSEBlocks.insert(ExI->getParent());
16561 }
16562 return ExV;
16563 }
16564 assert(isa<FixedVectorType>(Scalar->getType()) &&
16565 isa<InsertElementInst>(Scalar) &&
16566 "In-tree scalar of vector type is not insertelement?");
16567 auto *IE = cast<InsertElementInst>(Scalar);
16568 VectorToInsertElement.try_emplace(Vec, IE);
16569 return Vec;
16570 };
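// Illustrative sketch (hypothetical IR, not from this file or its tests): when
// the tree was narrowed to i8 but the external user expects i32, the lambda
// above emits roughly
//   %ex = extractelement <4 x i8> %vec, i32 2
//   %exv = sext i8 %ex to i32   ; zext if the scalar is known non-negative
// and caches the pair in ScalarToEEs, so later external uses of the same
// scalar in this block reuse %ex/%exv instead of emitting a new extract.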
16571 // If User == nullptr, the Scalar remains as a scalar in the vectorized
16572 // instructions or is used as an extra arg. Generate an ExtractElement
16573 // instruction and update the record for this scalar in ExternallyUsedValues.
16574 if (!User) {
16575 if (!ScalarsWithNullptrUser.insert(Scalar).second)
16576 continue;
16577 assert((ExternallyUsedValues.count(Scalar) ||
16578 Scalar->hasNUsesOrMore(UsesLimit) ||
16579 ExternalUsesAsOriginalScalar.contains(Scalar) ||
16580 any_of(Scalar->users(),
16581 [&](llvm::User *U) {
16582 if (ExternalUsesAsOriginalScalar.contains(U))
16583 return true;
16584 TreeEntry *UseEntry = getTreeEntry(U);
16585 return UseEntry &&
16586 (UseEntry->State == TreeEntry::Vectorize ||
16587 UseEntry->State ==
16588 TreeEntry::StridedVectorize) &&
16589 (E->State == TreeEntry::Vectorize ||
16590 E->State == TreeEntry::StridedVectorize) &&
16591 doesInTreeUserNeedToExtract(
16592 Scalar, getRootEntryInstruction(*UseEntry),
16593 TLI, TTI);
16594 })) &&
16595 "Scalar with nullptr User must be registered in "
16596 "ExternallyUsedValues map or remain as scalar in vectorized "
16597 "instructions");
16598 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16599 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
16600 if (PHI->getParent()->isLandingPad())
16601 Builder.SetInsertPoint(
16602 PHI->getParent(),
16603 std::next(
16604 PHI->getParent()->getLandingPadInst()->getIterator()));
16605 else
16606 Builder.SetInsertPoint(PHI->getParent(),
16607 PHI->getParent()->getFirstNonPHIIt());
16608 } else {
16609 Builder.SetInsertPoint(VecI->getParent(),
16610 std::next(VecI->getIterator()));
16611 }
16612 } else {
16613 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16614 }
16615 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16616 // Required to update internally referenced instructions.
16617 if (Scalar != NewInst) {
16618 assert((!isa<ExtractElementInst>(Scalar) ||
16619 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
16620 "Extractelements should not be replaced.");
16621 Scalar->replaceAllUsesWith(NewInst);
16622 }
16623 continue;
16624 }
16625
16626 if (auto *VU = dyn_cast<InsertElementInst>(User);
16627 VU && VU->getOperand(1) == Scalar) {
16628 // Skip if the scalar is another vector op or Vec is not an instruction.
16629 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16630 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
16631 if (!UsedInserts.insert(VU).second)
16632 continue;
16633 // Need to use original vector, if the root is truncated.
16634 auto BWIt = MinBWs.find(E);
16635 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
16636 auto *ScalarTy = FTy->getElementType();
16637 auto Key = std::make_pair(Vec, ScalarTy);
16638 auto VecIt = VectorCasts.find(Key);
16639 if (VecIt == VectorCasts.end()) {
16640 IRBuilderBase::InsertPointGuard Guard(Builder);
16641 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
16642 if (IVec->getParent()->isLandingPad())
16643 Builder.SetInsertPoint(IVec->getParent(),
16644 std::next(IVec->getParent()
16645 ->getLandingPadInst()
16646 ->getIterator()));
16647 else
16648 Builder.SetInsertPoint(
16649 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16650 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
16651 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
16652 }
16653 Vec = Builder.CreateIntCast(
16654 Vec,
16656 ScalarTy,
16657 cast<FixedVectorType>(Vec->getType())->getNumElements()),
16658 BWIt->second.second);
16659 VectorCasts.try_emplace(Key, Vec);
16660 } else {
16661 Vec = VecIt->second;
16662 }
16663 }
16664
16665 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16666 if (InsertIdx) {
16667 auto *It = find_if(
16668 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
16669 // Checks if 2 insertelements are from the same buildvector.
16670 InsertElementInst *VecInsert = Data.InsertElements.front();
16672 VU, VecInsert,
16673 [](InsertElementInst *II) { return II->getOperand(0); });
16674 });
16675 unsigned Idx = *InsertIdx;
16676 if (It == ShuffledInserts.end()) {
16677 (void)ShuffledInserts.emplace_back();
16678 It = std::next(ShuffledInserts.begin(),
16679 ShuffledInserts.size() - 1);
16680 }
16681 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
16682 if (Mask.empty())
16683 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16684 Mask[Idx] = ExternalUse.Lane;
16685 It->InsertElements.push_back(cast<InsertElementInst>(User));
16686 continue;
16687 }
16688 }
16689 }
16690 }
16691
16692 // Generate extracts for out-of-tree users.
16693 // Find the insertion point for the extractelement lane.
16694 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16695 if (PHINode *PH = dyn_cast<PHINode>(User)) {
16696 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
16697 if (PH->getIncomingValue(I) == Scalar) {
16698 Instruction *IncomingTerminator =
16699 PH->getIncomingBlock(I)->getTerminator();
16700 if (isa<CatchSwitchInst>(IncomingTerminator)) {
16701 Builder.SetInsertPoint(VecI->getParent(),
16702 std::next(VecI->getIterator()));
16703 } else {
16704 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
16705 }
16706 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16707 PH->setOperand(I, NewInst);
16708 }
16709 }
16710 } else {
16711 Builder.SetInsertPoint(cast<Instruction>(User));
16712 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16713 User->replaceUsesOfWith(Scalar, NewInst);
16714 }
16715 } else {
16716 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16717 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16718 User->replaceUsesOfWith(Scalar, NewInst);
16719 }
16720
16721 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
16722 }
16723
16724 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
16725 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
16726 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
16727 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
16728 for (int I = 0, E = Mask.size(); I < E; ++I) {
16729 if (Mask[I] < VF)
16730 CombinedMask1[I] = Mask[I];
16731 else
16732 CombinedMask2[I] = Mask[I] - VF;
16733 }
16734 ShuffleInstructionBuilder ShuffleBuilder(
16735 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
16736 ShuffleBuilder.add(V1, CombinedMask1);
16737 if (V2)
16738 ShuffleBuilder.add(V2, CombinedMask2);
16739 return ShuffleBuilder.finalize({}, {}, {});
16740 };
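// Worked example for CreateShuffle above (illustrative values only): with
// VF = 4 and Mask = <0, 5, 2, 7>, indices below VF select from V1 and the
// rest from V2, so CombinedMask1 = <0, poison, 2, poison> and
// CombinedMask2 = <poison, 1, poison, 3>; the ShuffleInstructionBuilder then
// folds both operands into a single shufflevector.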
16741
16742 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
16743 bool ForSingleMask) {
16744 unsigned VF = Mask.size();
16745 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
16746 if (VF != VecVF) {
16747 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
16748 Vec = CreateShuffle(Vec, nullptr, Mask);
16749 return std::make_pair(Vec, true);
16750 }
16751 if (!ForSingleMask) {
16752 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16753 for (unsigned I = 0; I < VF; ++I) {
16754 if (Mask[I] != PoisonMaskElem)
16755 ResizeMask[Mask[I]] = Mask[I];
16756 }
16757 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
16758 }
16759 }
16760
16761 return std::make_pair(Vec, false);
16762 };
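// Worked example for ResizeToVF above (illustrative values only): if Vec has
// 8 elements but the buildvector needs VF = 4 and Mask = <0, 2, poison,
// poison>, the identity-like ResizeMask <0, poison, 2, poison> shrinks Vec to
// 4 lanes while keeping the used elements in place; if some mask index is
// already >= VF, the original Mask itself is applied and the boolean in the
// returned pair is set to true for that case.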
16763 // Perform shuffling of the vectorize tree entries for better handling of
16764 // external extracts.
16765 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16766 // Find the first and the last instruction in the list of insertelements.
16767 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
16768 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
16769 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
16770 Builder.SetInsertPoint(LastInsert);
16771 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16772 Value *NewInst = performExtractsShuffleAction<Value>(
16773 MutableArrayRef(Vector.data(), Vector.size()),
16774 FirstInsert->getOperand(0),
16775 [](Value *Vec) {
16776 return cast<VectorType>(Vec->getType())
16777 ->getElementCount()
16778 .getKnownMinValue();
16779 },
16780 ResizeToVF,
16781 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
16782 ArrayRef<Value *> Vals) {
16783 assert((Vals.size() == 1 || Vals.size() == 2) &&
16784 "Expected exactly 1 or 2 input values.");
16785 if (Vals.size() == 1) {
16786 // Do not create shuffle if the mask is a simple identity
16787 // non-resizing mask.
16788 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16789 ->getNumElements() ||
16790 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16791 return CreateShuffle(Vals.front(), nullptr, Mask);
16792 return Vals.front();
16793 }
16794 return CreateShuffle(Vals.front() ? Vals.front()
16795 : FirstInsert->getOperand(0),
16796 Vals.back(), Mask);
16797 });
16798 auto It = ShuffledInserts[I].InsertElements.rbegin();
16799 // Rebuild buildvector chain.
16800 InsertElementInst *II = nullptr;
16801 if (It != ShuffledInserts[I].InsertElements.rend())
16802 II = *It;
16804 while (It != ShuffledInserts[I].InsertElements.rend()) {
16805 assert(II && "Must be an insertelement instruction.");
16806 if (*It == II)
16807 ++It;
16808 else
16809 Inserts.push_back(cast<Instruction>(II));
16810 II = dyn_cast<InsertElementInst>(II->getOperand(0));
16811 }
16812 for (Instruction *II : reverse(Inserts)) {
16813 II->replaceUsesOfWith(II->getOperand(0), NewInst);
16814 if (auto *NewI = dyn_cast<Instruction>(NewInst))
16815 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
16816 II->moveAfter(NewI);
16817 NewInst = II;
16818 }
16819 LastInsert->replaceAllUsesWith(NewInst);
16820 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
16821 IE->replaceUsesOfWith(IE->getOperand(0),
16822 PoisonValue::get(IE->getOperand(0)->getType()));
16823 IE->replaceUsesOfWith(IE->getOperand(1),
16824 PoisonValue::get(IE->getOperand(1)->getType()));
16825 eraseInstruction(IE);
16826 }
16827 CSEBlocks.insert(LastInsert->getParent());
16828 }
16829
16830 SmallVector<Instruction *> RemovedInsts;
16831 // For each vectorized value:
16832 for (auto &TEPtr : VectorizableTree) {
16833 TreeEntry *Entry = TEPtr.get();
16834
16835 // No need to handle users of gathered values.
16836 if (Entry->isGather())
16837 continue;
16838
16839 assert(Entry->VectorizedValue && "Can't find vectorizable value");
16840
16841 // For each lane:
16842 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16843 Value *Scalar = Entry->Scalars[Lane];
16844
16845 if (Entry->getOpcode() == Instruction::GetElementPtr &&
16846 !isa<GetElementPtrInst>(Scalar))
16847 continue;
16848 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16849 EE && IgnoredExtracts.contains(EE))
16850 continue;
16851 if (isa<PoisonValue>(Scalar))
16852 continue;
16853#ifndef NDEBUG
16854 Type *Ty = Scalar->getType();
16855 if (!Ty->isVoidTy()) {
16856 for (User *U : Scalar->users()) {
16857 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
16858
16859 // It is legal to delete users in the ignorelist.
16860 assert((getTreeEntry(U) ||
16861 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16862 (isa_and_nonnull<Instruction>(U) &&
16863 isDeleted(cast<Instruction>(U)))) &&
16864 "Deleting out-of-tree value");
16865 }
16866 }
16867#endif
16868 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
16869 auto *I = cast<Instruction>(Scalar);
16870 RemovedInsts.push_back(I);
16871 }
16872 }
16873
16874 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
16875 // new vector instruction.
16876 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16877 V->mergeDIAssignID(RemovedInsts);
16878
16879 // Clear up reduction references, if any.
16880 if (UserIgnoreList) {
16881 for (Instruction *I : RemovedInsts) {
16882 const TreeEntry *IE = getTreeEntry(I);
16883 if (IE->Idx != 0 &&
16884 !(VectorizableTree.front()->isGather() &&
16885 !IE->UserTreeIndices.empty() &&
16886 (ValueToGatherNodes.lookup(I).contains(
16887 VectorizableTree.front().get()) ||
16888 any_of(IE->UserTreeIndices,
16889 [&](const EdgeInfo &EI) {
16890 return EI.UserTE == VectorizableTree.front().get() &&
16891 EI.EdgeIdx == UINT_MAX;
16892 }))) &&
16893 !(GatheredLoadsEntriesFirst.has_value() &&
16894 IE->Idx >= *GatheredLoadsEntriesFirst &&
16895 VectorizableTree.front()->isGather() &&
16896 is_contained(VectorizableTree.front()->Scalars, I)))
16897 continue;
16898 SmallVector<SelectInst *> LogicalOpSelects;
16899 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
16900 // Do not replace condition of the logical op in form select <cond>.
16901 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16902 (match(U.getUser(), m_LogicalAnd()) ||
16903 match(U.getUser(), m_LogicalOr())) &&
16904 U.getOperandNo() == 0;
16905 if (IsPoisoningLogicalOp) {
16906 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16907 return false;
16908 }
16909 return UserIgnoreList->contains(U.getUser());
16910 });
16911 // Replace conditions of the poisoning logical ops with the non-poison
16912 // constant value.
16913 for (SelectInst *SI : LogicalOpSelects)
16914 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
16915 }
16916 }
16917 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
16918 // cache correctness.
16919 // NOTE: removeInstructionsAndOperands only marks the instructions for
16920 // deletion - instructions are not deleted until later.
16921 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
16922
16923 Builder.ClearInsertionPoint();
16924 InstrElementSize.clear();
16925
16926 const TreeEntry &RootTE = *VectorizableTree.front();
16927 Value *Vec = RootTE.VectorizedValue;
16928 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16929 It != MinBWs.end() &&
16930 ReductionBitWidth != It->second.first) {
16931 IRBuilder<>::InsertPointGuard Guard(Builder);
16932 Builder.SetInsertPoint(ReductionRoot->getParent(),
16933 ReductionRoot->getIterator());
16934 Vec = Builder.CreateIntCast(
16935 Vec,
16936 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
16937 cast<VectorType>(Vec->getType())->getElementCount()),
16938 It->second.second);
16939 }
16940 return Vec;
16941}
16942
16944 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
16945 << " gather sequence instructions.\n");
16946 // LICM InsertElementInst sequences.
16947 for (Instruction *I : GatherShuffleExtractSeq) {
16948 if (isDeleted(I))
16949 continue;
16950
16951 // Check if this block is inside a loop.
16952 Loop *L = LI->getLoopFor(I->getParent());
16953 if (!L)
16954 continue;
16955
16956 // Check if it has a preheader.
16957 BasicBlock *PreHeader = L->getLoopPreheader();
16958 if (!PreHeader)
16959 continue;
16960
16961 // If the vector or the element that we insert into it is an
16962 // instruction defined inside this loop, then we can't hoist the
16963 // insertion out of the loop.
16964 if (any_of(I->operands(), [L](Value *V) {
16965 auto *OpI = dyn_cast<Instruction>(V);
16966 return OpI && L->contains(OpI);
16967 }))
16968 continue;
16969
16970 // We can hoist this instruction. Move it to the pre-header.
16971 I->moveBefore(PreHeader->getTerminator());
16972 CSEBlocks.insert(PreHeader);
16973 }
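// Illustrative sketch of the hoisting above (hypothetical IR): a gather
// sequence such as
//   loop:
//     %v0 = insertelement <2 x float> poison, float %a, i32 0
//     %v1 = insertelement <2 x float> %v0, float %b, i32 1
// is moved in front of the preheader's terminator when neither %a nor %b is
// defined inside the loop, so the vector is materialized once instead of on
// every iteration.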
16974
16975 // Make a list of all reachable blocks in our CSE queue.
16977 CSEWorkList.reserve(CSEBlocks.size());
16978 for (BasicBlock *BB : CSEBlocks)
16979 if (DomTreeNode *N = DT->getNode(BB)) {
16981 CSEWorkList.push_back(N);
16982 }
16983
16984 // Sort blocks by domination. This ensures we visit a block after all blocks
16985 // dominating it are visited.
16986 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
16987 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
16988 "Different nodes should have different DFS numbers");
16989 return A->getDFSNumIn() < B->getDFSNumIn();
16990 });
16991
16992 // Less defined shuffles can be replaced by the more defined copies.
16993 // Between two shuffles, one is less defined if it has the same vector
16994 // operands and each of its mask indices is either the same as in the other
16995 // one or undef. E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined
16996 // than shuffle %0, poison, <0, 0, 0, 0>.
16997 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
16998 Instruction *I2,
16999 SmallVectorImpl<int> &NewMask) {
17000 if (I1->getType() != I2->getType())
17001 return false;
17002 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
17003 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
17004 if (!SI1 || !SI2)
17005 return I1->isIdenticalTo(I2);
17006 if (SI1->isIdenticalTo(SI2))
17007 return true;
17008 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
17009 if (SI1->getOperand(I) != SI2->getOperand(I))
17010 return false;
17011 // Check if the second instruction is more defined than the first one.
17012 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
17013 ArrayRef<int> SM1 = SI1->getShuffleMask();
17014 // Count trailing undefs in the mask to check the final number of used
17015 // registers.
17016 unsigned LastUndefsCnt = 0;
17017 for (int I = 0, E = NewMask.size(); I < E; ++I) {
17018 if (SM1[I] == PoisonMaskElem)
17019 ++LastUndefsCnt;
17020 else
17021 LastUndefsCnt = 0;
17022 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
17023 NewMask[I] != SM1[I])
17024 return false;
17025 if (NewMask[I] == PoisonMaskElem)
17026 NewMask[I] = SM1[I];
17027 }
17028 // Check if the last undefs actually change the final number of used vector
17029 // registers.
17030 return SM1.size() - LastUndefsCnt > 1 &&
17031 TTI->getNumberOfParts(SI1->getType()) ==
17033 getWidenedType(SI1->getType()->getElementType(),
17034 SM1.size() - LastUndefsCnt));
17035 };
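// Worked example for IsIdenticalOrLessDefined above (illustrative masks
// only): for SI1 with mask <0, undef, 2, undef> and SI2 with mask
// <0, 1, 2, undef> over the same operands, every SI1 index is either undef or
// equal to SI2's, so NewMask stays <0, 1, 2, undef> and SI1 can be replaced
// by SI2, provided stripping the trailing undefs would not let the shuffle
// fit into fewer vector registers.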
17036 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
17037 // instructions. TODO: We can further optimize this scan if we split the
17038 // instructions into different buckets based on the insert lane.
17040 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
17041 assert(*I &&
17042 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
17043 "Worklist not sorted properly!");
17044 BasicBlock *BB = (*I)->getBlock();
17045 // For all instructions in blocks containing gather sequences:
17046 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
17047 if (isDeleted(&In))
17048 continue;
17049 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17050 !GatherShuffleExtractSeq.contains(&In))
17051 continue;
17052
17053 // Check if we can replace this instruction with any of the
17054 // visited instructions.
17055 bool Replaced = false;
17056 for (Instruction *&V : Visited) {
17057 SmallVector<int> NewMask;
17058 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17059 DT->dominates(V->getParent(), In.getParent())) {
17060 In.replaceAllUsesWith(V);
17061 eraseInstruction(&In);
17062 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
17063 if (!NewMask.empty())
17064 SI->setShuffleMask(NewMask);
17065 Replaced = true;
17066 break;
17067 }
17068 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17069 GatherShuffleExtractSeq.contains(V) &&
17070 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17071 DT->dominates(In.getParent(), V->getParent())) {
17072 In.moveAfter(V);
17073 V->replaceAllUsesWith(&In);
17075 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17076 if (!NewMask.empty())
17077 SI->setShuffleMask(NewMask);
17078 V = &In;
17079 Replaced = true;
17080 break;
17081 }
17082 }
17083 if (!Replaced) {
17084 assert(!is_contained(Visited, &In));
17085 Visited.push_back(&In);
17086 }
17087 }
17088 }
17089 CSEBlocks.clear();
17090 GatherShuffleExtractSeq.clear();
17091}
17092
17093BoUpSLP::ScheduleData *
17094BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
17095 ScheduleData *Bundle = nullptr;
17096 ScheduleData *PrevInBundle = nullptr;
17097 for (Value *V : VL) {
17099 continue;
17100 ScheduleData *BundleMember = getScheduleData(V);
17101 assert(BundleMember &&
17102 "no ScheduleData for bundle member "
17103 "(maybe not in same basic block)");
17104 assert(BundleMember->isSchedulingEntity() &&
17105 "bundle member already part of other bundle");
17106 if (PrevInBundle) {
17107 PrevInBundle->NextInBundle = BundleMember;
17108 } else {
17109 Bundle = BundleMember;
17110 }
17111
17112 // Group the instructions to a bundle.
17113 BundleMember->FirstInBundle = Bundle;
17114 PrevInBundle = BundleMember;
17115 }
17116 assert(Bundle && "Failed to find schedule bundle");
17117 return Bundle;
17118}
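// Conceptual sketch of the bundle built above: for VL = {%a, %b, %c} the
// ScheduleData nodes are chained via NextInBundle as
//   SD(%a) -> SD(%b) -> SD(%c)
// and every member's FirstInBundle points at SD(%a), which acts as the single
// scheduling entity for the whole bundle.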
17119
17120// Groups the instructions into a bundle (which is then a single scheduling
17121// entity) and schedules instructions until the bundle gets ready.
17122std::optional<BoUpSLP::ScheduleData *>
17123BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
17124 const InstructionsState &S) {
17125 // No need to schedule PHIs, insertelement, extractelement and extractvalue
17126 // instructions.
17127 if (isa<PHINode>(S.getMainOp()) ||
17129 return nullptr;
17130
17131 // Initialize the instruction bundle.
17132 Instruction *OldScheduleEnd = ScheduleEnd;
17133 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
17134
17135 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
17136 ScheduleData *Bundle) {
17137 // The scheduling region got new instructions at the lower end (or it is a
17138 // new region for the first bundle). This makes it necessary to
17139 // recalculate all dependencies.
17140 // It is seldom that this needs to be done a second time after adding the
17141 // initial bundle to the region.
17142 if (ScheduleEnd != OldScheduleEnd) {
17143 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
17144 if (ScheduleData *SD = getScheduleData(I))
17145 SD->clearDependencies();
17146 ReSchedule = true;
17147 }
17148 if (Bundle) {
17149 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
17150 << " in block " << BB->getName() << "\n");
17151 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
17152 }
17153
17154 if (ReSchedule) {
17155 resetSchedule();
17156 initialFillReadyList(ReadyInsts);
17157 }
17158
17159 // Now try to schedule the new bundle or (if no bundle) just calculate
17160 // dependencies. As soon as the bundle is "ready" it means that there are no
17161 // cyclic dependencies and we can schedule it. Note that it's important that we
17162 // don't "schedule" the bundle yet (see cancelScheduling).
17163 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17164 !ReadyInsts.empty()) {
17165 ScheduleData *Picked = ReadyInsts.pop_back_val();
17166 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17167 "must be ready to schedule");
17168 schedule(Picked, ReadyInsts);
17169 }
17170 };
17171
17172 // Make sure that the scheduling region contains all
17173 // instructions of the bundle.
17174 for (Value *V : VL) {
17176 continue;
17177 if (!extendSchedulingRegion(V, S)) {
17178 // The scheduling region may have got new instructions at the lower end (or
17179 // it is a new region for the first bundle), which makes it necessary to
17180 // recalculate all dependencies.
17181 // Otherwise the compiler may crash trying to incorrectly calculate
17182 // dependencies and emit instructions in the wrong order during the actual
17183 // scheduling.
17184 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
17185 return std::nullopt;
17186 }
17187 }
17188
17189 bool ReSchedule = false;
17190 for (Value *V : VL) {
17192 continue;
17193 ScheduleData *BundleMember = getScheduleData(V);
17194 assert(BundleMember &&
17195 "no ScheduleData for bundle member (maybe not in same basic block)");
17196
17197 // Make sure we don't leave the pieces of the bundle in the ready list when
17198 // whole bundle might not be ready.
17199 ReadyInsts.remove(BundleMember);
17200
17201 if (!BundleMember->IsScheduled)
17202 continue;
17203 // A bundle member was scheduled as single instruction before and now
17204 // needs to be scheduled as part of the bundle. We just get rid of the
17205 // existing schedule.
17206 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
17207 << " was already scheduled\n");
17208 ReSchedule = true;
17209 }
17210
17211 auto *Bundle = buildBundle(VL);
17212 TryScheduleBundleImpl(ReSchedule, Bundle);
17213 if (!Bundle->isReady()) {
17214 cancelScheduling(VL, S.getMainOp());
17215 return std::nullopt;
17216 }
17217 return Bundle;
17218}
17219
17220void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
17221 Value *OpValue) {
17222 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
17224 return;
17225
17226 if (doesNotNeedToBeScheduled(OpValue))
17227 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
17228 ScheduleData *Bundle = getScheduleData(OpValue);
17229 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
17230 assert(!Bundle->IsScheduled &&
17231 "Can't cancel bundle which is already scheduled");
17232 assert(Bundle->isSchedulingEntity() &&
17233 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
17234 "tried to unbundle something which is not a bundle");
17235
17236 // Remove the bundle from the ready list.
17237 if (Bundle->isReady())
17238 ReadyInsts.remove(Bundle);
17239
17240 // Un-bundle: make single instructions out of the bundle.
17241 ScheduleData *BundleMember = Bundle;
17242 while (BundleMember) {
17243 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
17244 BundleMember->FirstInBundle = BundleMember;
17245 ScheduleData *Next = BundleMember->NextInBundle;
17246 BundleMember->NextInBundle = nullptr;
17247 BundleMember->TE = nullptr;
17248 if (BundleMember->unscheduledDepsInBundle() == 0) {
17249 ReadyInsts.insert(BundleMember);
17250 }
17251 BundleMember = Next;
17252 }
17253}
17254
17255BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
17256 // Allocate a new ScheduleData for the instruction.
17257 if (ChunkPos >= ChunkSize) {
17258 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
17259 ChunkPos = 0;
17260 }
17261 return &(ScheduleDataChunks.back()[ChunkPos++]);
17262}
17263
17264bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17265 Value *V, const InstructionsState &S) {
17266 Instruction *I = dyn_cast<Instruction>(V);
17267 assert(I && "bundle member must be an instruction");
17268 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
17270 "phi nodes/insertelements/extractelements/extractvalues don't need to "
17271 "be scheduled");
17272 if (getScheduleData(I))
17273 return true;
17274 if (!ScheduleStart) {
17275 // It's the first instruction in the new region.
17276 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
17277 ScheduleStart = I;
17278 ScheduleEnd = I->getNextNode();
17279 assert(ScheduleEnd && "tried to vectorize a terminator?");
17280 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
17281 return true;
17282 }
17283 // Search up and down at the same time, because we don't know if the new
17284 // instruction is above or below the existing scheduling region.
17285 // Ignore debug info (and other "AssumeLike" intrinsics) so they are not
17286 // counted against the budget. Otherwise debug info could affect codegen.
17288 ++ScheduleStart->getIterator().getReverse();
17289 BasicBlock::reverse_iterator UpperEnd = BB->rend();
17290 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
17291 BasicBlock::iterator LowerEnd = BB->end();
17292 auto IsAssumeLikeIntr = [](const Instruction &I) {
17293 if (auto *II = dyn_cast<IntrinsicInst>(&I))
17294 return II->isAssumeLikeIntrinsic();
17295 return false;
17296 };
17297 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17298 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17299 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
17300 &*DownIter != I) {
17301 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17302 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
17303 return false;
17304 }
17305
17306 ++UpIter;
17307 ++DownIter;
17308
17309 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17310 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17311 }
17312 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
17313 assert(I->getParent() == ScheduleStart->getParent() &&
17314 "Instruction is in wrong basic block.");
17315 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
17316 ScheduleStart = I;
17317 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
17318 << "\n");
17319 return true;
17320 }
17321 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
17322 "Expected to reach top of the basic block or instruction down the "
17323 "lower end.");
17324 assert(I->getParent() == ScheduleEnd->getParent() &&
17325 "Instruction is in wrong basic block.");
17326 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
17327 nullptr);
17328 ScheduleEnd = I->getNextNode();
17329 assert(ScheduleEnd && "tried to vectorize a terminator?");
17330 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
17331 return true;
17332}
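// Conceptual sketch of the extension logic above: with the current region
// spanning [ScheduleStart, ScheduleEnd) and a new bundle member I outside it,
// the two iterators walk upward from ScheduleStart and downward from
// ScheduleEnd in lock-step (skipping assume-like intrinsics); whichever
// direction reaches I first decides whether the region grows at its top or
// its bottom, and the walk is capped by ScheduleRegionSizeLimit to bound
// compile time.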
17333
17334void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
17335 Instruction *ToI,
17336 ScheduleData *PrevLoadStore,
17337 ScheduleData *NextLoadStore) {
17338 ScheduleData *CurrentLoadStore = PrevLoadStore;
17339 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
17340 // No need to allocate data for non-schedulable instructions.
17342 continue;
17343 ScheduleData *SD = ScheduleDataMap.lookup(I);
17344 if (!SD) {
17345 SD = allocateScheduleDataChunks();
17346 ScheduleDataMap[I] = SD;
17347 }
17348 assert(!isInSchedulingRegion(SD) &&
17349 "new ScheduleData already in scheduling region");
17350 SD->init(SchedulingRegionID, I);
17351
17352 if (I->mayReadOrWriteMemory() &&
17353 (!isa<IntrinsicInst>(I) ||
17354 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
17355 cast<IntrinsicInst>(I)->getIntrinsicID() !=
17356 Intrinsic::pseudoprobe))) {
17357 // Update the linked list of memory accessing instructions.
17358 if (CurrentLoadStore) {
17359 CurrentLoadStore->NextLoadStore = SD;
17360 } else {
17361 FirstLoadStoreInRegion = SD;
17362 }
17363 CurrentLoadStore = SD;
17364 }
17365
17366 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17367 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17368 RegionHasStackSave = true;
17369 }
17370 if (NextLoadStore) {
17371 if (CurrentLoadStore)
17372 CurrentLoadStore->NextLoadStore = NextLoadStore;
17373 } else {
17374 LastLoadStoreInRegion = CurrentLoadStore;
17375 }
17376}
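// Conceptual sketch of the chain built above (hypothetical IR): for a block
// fragment
//   %x = load i32, ptr %p
//   %y = add i32 %x, 1
//   store i32 %y, ptr %q
//   call void @f()
// only the load, the store and the call are linked through NextLoadStore (in
// program order), so the dependency calculation can walk just the
// memory-accessing instructions when it checks for aliasing.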
17377
17378void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17379 bool InsertInReadyList,
17380 BoUpSLP *SLP) {
17381 assert(SD->isSchedulingEntity());
17382
17384 WorkList.push_back(SD);
17385
17386 while (!WorkList.empty()) {
17387 ScheduleData *SD = WorkList.pop_back_val();
17388 for (ScheduleData *BundleMember = SD; BundleMember;
17389 BundleMember = BundleMember->NextInBundle) {
17390 assert(isInSchedulingRegion(BundleMember));
17391 if (BundleMember->hasValidDependencies())
17392 continue;
17393
17394 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
17395 << "\n");
17396 BundleMember->Dependencies = 0;
17397 BundleMember->resetUnscheduledDeps();
17398
17399 // Handle def-use chain dependencies.
17400 for (User *U : BundleMember->Inst->users()) {
17401 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17402 BundleMember->Dependencies++;
17403 ScheduleData *DestBundle = UseSD->FirstInBundle;
17404 if (!DestBundle->IsScheduled)
17405 BundleMember->incrementUnscheduledDeps(1);
17406 if (!DestBundle->hasValidDependencies())
17407 WorkList.push_back(DestBundle);
17408 }
17409 }
17410
17411 auto MakeControlDependent = [&](Instruction *I) {
17412 auto *DepDest = getScheduleData(I);
17413 assert(DepDest && "must be in schedule window");
17414 DepDest->ControlDependencies.push_back(BundleMember);
17415 BundleMember->Dependencies++;
17416 ScheduleData *DestBundle = DepDest->FirstInBundle;
17417 if (!DestBundle->IsScheduled)
17418 BundleMember->incrementUnscheduledDeps(1);
17419 if (!DestBundle->hasValidDependencies())
17420 WorkList.push_back(DestBundle);
17421 };
17422
17423 // Any instruction which isn't safe to speculate at the beginning of the
17424 // block is control dependent on any early exit or non-willreturn call
17425 // which precedes it.
17426 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
17427 for (Instruction *I = BundleMember->Inst->getNextNode();
17428 I != ScheduleEnd; I = I->getNextNode()) {
17429 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
17430 continue;
17431
17432 // Add the dependency
17433 MakeControlDependent(I);
17434
17436 // Everything past here must be control dependent on I.
17437 break;
17438 }
17439 }
17440
17441 if (RegionHasStackSave) {
17442 // If we have an inalloca alloca instruction, it needs to be scheduled
17443 // after any preceding stacksave. We also need to prevent any alloca
17444 // from reordering above a preceding stackrestore.
17445 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17446 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17447 for (Instruction *I = BundleMember->Inst->getNextNode();
17448 I != ScheduleEnd; I = I->getNextNode()) {
17449 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17450 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17451 // Any allocas past here must be control dependent on I, and I
17452 // must be memory dependent on BundleMember->Inst.
17453 break;
17454
17455 if (!isa<AllocaInst>(I))
17456 continue;
17457
17458 // Add the dependency
17459 MakeControlDependent(I);
17460 }
17461 }
17462
17463 // In addition to the cases handled just above, we need to prevent
17464 // allocas and loads/stores from moving below a stacksave or a
17465 // stackrestore. Avoiding moving allocas below a stackrestore is currently
17466 // thought to be conservative. Moving loads/stores below a stackrestore
17467 // can lead to incorrect code.
17468 if (isa<AllocaInst>(BundleMember->Inst) ||
17469 BundleMember->Inst->mayReadOrWriteMemory()) {
17470 for (Instruction *I = BundleMember->Inst->getNextNode();
17471 I != ScheduleEnd; I = I->getNextNode()) {
17472 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
17473 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17474 continue;
17475
17476 // Add the dependency
17477 MakeControlDependent(I);
17478 break;
17479 }
17480 }
17481 }
17482
17483 // Handle the memory dependencies (if any).
17484 ScheduleData *DepDest = BundleMember->NextLoadStore;
17485 if (!DepDest)
17486 continue;
17487 Instruction *SrcInst = BundleMember->Inst;
17488 assert(SrcInst->mayReadOrWriteMemory() &&
17489 "NextLoadStore list for non-memory-affecting bundle?");
17490 MemoryLocation SrcLoc = getLocation(SrcInst);
17491 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17492 unsigned NumAliased = 0;
17493 unsigned DistToSrc = 1;
17494
17495 for (; DepDest; DepDest = DepDest->NextLoadStore) {
17496 assert(isInSchedulingRegion(DepDest));
17497
17498 // We have two limits to reduce the complexity:
17499 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
17500 // SLP->isAliased (which is the expensive part in this loop).
17501 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
17502 // the whole loop (even if the loop is fast, it's quadratic).
17503 // It's important for the loop break condition (see below) to
17504 // check this limit even between two read-only instructions.
17505 if (DistToSrc >= MaxMemDepDistance ||
17506 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17507 (NumAliased >= AliasedCheckLimit ||
17508 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17509
17510 // We increment the counter only if the locations are aliased
17511 // (instead of counting all alias checks). This gives a better
17512 // balance between reduced runtime and accurate dependencies.
17513 NumAliased++;
17514
17515 DepDest->MemoryDependencies.push_back(BundleMember);
17516 BundleMember->Dependencies++;
17517 ScheduleData *DestBundle = DepDest->FirstInBundle;
17518 if (!DestBundle->IsScheduled) {
17519 BundleMember->incrementUnscheduledDeps(1);
17520 }
17521 if (!DestBundle->hasValidDependencies()) {
17522 WorkList.push_back(DestBundle);
17523 }
17524 }
17525
17526 // Example, explaining the loop break condition: Let's assume our
17527 // starting instruction is i0 and MaxMemDepDistance = 3.
17528 //
17529 // +--------v--v--v
17530 // i0,i1,i2,i3,i4,i5,i6,i7,i8
17531 // +--------^--^--^
17532 //
17533 // MaxMemDepDistance let us stop alias-checking at i3 and we add
17534 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
17535 // Previously we already added dependencies from i3 to i6,i7,i8
17536 // (because of MaxMemDepDistance). As we added a dependency from
17537 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
17538 // and we can abort this loop at i6.
17539 if (DistToSrc >= 2 * MaxMemDepDistance)
17540 break;
17541 DistToSrc++;
17542 }
17543 }
17544 if (InsertInReadyList && SD->isReady()) {
17545 ReadyInsts.insert(SD);
17546 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
17547 << "\n");
17548 }
17549 }
17550}
17551
17552void BoUpSLP::BlockScheduling::resetSchedule() {
17553 assert(ScheduleStart &&
17554 "tried to reset schedule on block which has not been scheduled");
17555 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
17556 if (ScheduleData *SD = getScheduleData(I)) {
17557 assert(isInSchedulingRegion(SD) &&
17558 "ScheduleData not in scheduling region");
17559 SD->IsScheduled = false;
17560 SD->resetUnscheduledDeps();
17561 }
17562 }
17563 ReadyInsts.clear();
17564}
17565
17566void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17567 if (!BS->ScheduleStart)
17568 return;
17569
17570 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
17571
17572 // A key point - if we got here, pre-scheduling was able to find a valid
17573 // scheduling of the sub-graph of the scheduling window which consists
17574 // of all vector bundles and their transitive users. As such, we do not
17575 // need to reschedule anything *outside of* that subgraph.
17576
17577 BS->resetSchedule();
17578
17579 // For the real scheduling we use a more sophisticated ready-list: it is
17580 // sorted by the original instruction location. This lets the final schedule
17581 // be as close as possible to the original instruction order.
17582 // WARNING: If changing this order causes a correctness issue, that means
17583 // there is some missing dependence edge in the schedule data graph.
17584 struct ScheduleDataCompare {
17585 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
17586 return SD2->SchedulingPriority < SD1->SchedulingPriority;
17587 }
17588 };
17589 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
17590
17591 // Ensure that all dependency data is updated (for nodes in the sub-graph)
17592 // and fill the ready-list with initial instructions.
17593 int Idx = 0;
17594 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
17595 I = I->getNextNode()) {
17596 if (ScheduleData *SD = BS->getScheduleData(I)) {
17597 TreeEntry *SDTE = getTreeEntry(SD->Inst);
17598 (void)SDTE;
17600 SD->isPartOfBundle() ==
17601 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
17602 "scheduler and vectorizer bundle mismatch");
17603 SD->FirstInBundle->SchedulingPriority = Idx++;
17604
17605 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
17606 BS->calculateDependencies(SD, false, this);
17607 }
17608 }
17609 BS->initialFillReadyList(ReadyInsts);
17610
17611 Instruction *LastScheduledInst = BS->ScheduleEnd;
17612
17613 // Do the "real" scheduling.
17614 while (!ReadyInsts.empty()) {
17615 ScheduleData *Picked = *ReadyInsts.begin();
17616 ReadyInsts.erase(ReadyInsts.begin());
17617
17618 // Move the scheduled instruction(s) to their dedicated places, if not
17619 // there yet.
17620 for (ScheduleData *BundleMember = Picked; BundleMember;
17621 BundleMember = BundleMember->NextInBundle) {
17622 Instruction *PickedInst = BundleMember->Inst;
17623 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
17624 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
17625 LastScheduledInst = PickedInst;
17626 }
17627
17628 BS->schedule(Picked, ReadyInsts);
17629 }
17630
17631 // Check that we didn't break any of our invariants.
17632#ifdef EXPENSIVE_CHECKS
17633 BS->verify();
17634#endif
17635
17636#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
17637 // Check that all schedulable entities got scheduled
17638 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
17639 ScheduleData *SD = BS->getScheduleData(I);
17640 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
17641 assert(SD->IsScheduled && "must be scheduled at this point");
17642 }
17643#endif
17644
17645 // Avoid duplicate scheduling of the block.
17646 BS->ScheduleStart = nullptr;
17647}
17648
17650 // If V is a store, just return the width of the stored value (or value
17651 // truncated just before storing) without traversing the expression tree.
17652 // This is the common case.
17653 if (auto *Store = dyn_cast<StoreInst>(V))
17654 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
17655
17656 if (auto *IEI = dyn_cast<InsertElementInst>(V))
17657 return getVectorElementSize(IEI->getOperand(1));
17658
17659 auto E = InstrElementSize.find(V);
17660 if (E != InstrElementSize.end())
17661 return E->second;
17662
17663 // If V is not a store, we can traverse the expression tree to find loads
17664 // that feed it. The type of the loaded value may indicate a more suitable
17665 // width than V's type. We want to base the vector element size on the width
17666 // of memory operations where possible.
17669 if (auto *I = dyn_cast<Instruction>(V)) {
17670 Worklist.emplace_back(I, I->getParent(), 0);
17671 Visited.insert(I);
17672 }
17673
17674 // Traverse the expression tree in bottom-up order looking for loads. If we
17675 // encounter an instruction we don't yet handle, we give up.
17676 auto Width = 0u;
17677 Value *FirstNonBool = nullptr;
17678 while (!Worklist.empty()) {
17679 auto [I, Parent, Level] = Worklist.pop_back_val();
17680
17681 // We should only be looking at scalar instructions here. If the current
17682 // instruction has a vector type, skip.
17683 auto *Ty = I->getType();
17684 if (isa<VectorType>(Ty))
17685 continue;
17686 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
17687 FirstNonBool = I;
17688 if (Level > RecursionMaxDepth)
17689 continue;
17690
17691 // If the current instruction is a load, update Width to reflect the
17692 // width of the loaded value.
17693 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
17694 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
17695
17696 // Otherwise, we need to visit the operands of the instruction. We only
17697 // handle the interesting cases from buildTree here. If an operand is an
17698 // instruction we haven't yet visited and from the same basic block as the
17699 // user or the use is a PHI node, we add it to the worklist.
17702 for (Use &U : I->operands()) {
17703 if (auto *J = dyn_cast<Instruction>(U.get()))
17704 if (Visited.insert(J).second &&
17705 (isa<PHINode>(I) || J->getParent() == Parent)) {
17706 Worklist.emplace_back(J, J->getParent(), Level + 1);
17707 continue;
17708 }
17709 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
17710 FirstNonBool = U.get();
17711 }
17712 } else {
17713 break;
17714 }
17715 }
17716
17717 // If we didn't encounter a memory access in the expression tree, or if we
17718 // gave up for some reason, just return the width of V. Otherwise, return the
17719 // maximum width we found.
17720 if (!Width) {
17721 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
17722 V = FirstNonBool;
17723 Width = DL->getTypeSizeInBits(V->getType());
17724 }
17725
17726 for (Instruction *I : Visited)
17727 InstrElementSize[I] = Width;
17728
17729 return Width;
17730}
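// Worked example for getVectorElementSize (hypothetical IR):
//   %l = load i16, ptr %p
//   %e = zext i16 %l to i64
//   store i64 %e, ptr %q
// Querying the store returns 64 (the width of the stored value), while
// querying %e walks down to the load and returns 16, preferring the width of
// the memory access over the width of %e's own type.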
17731
17732bool BoUpSLP::collectValuesToDemote(
17733 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
17735 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
17736 bool &IsProfitableToDemote, bool IsTruncRoot) const {
17737 // We can always demote constants.
17738 if (all_of(E.Scalars, IsaPred<Constant>))
17739 return true;
17740
17741 unsigned OrigBitWidth =
17742 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17743 if (OrigBitWidth == BitWidth) {
17744 MaxDepthLevel = 1;
17745 return true;
17746 }
17747
17748 // Check if the node was analyzed already and must keep its original bitwidth.
17749 if (NodesToKeepBWs.contains(E.Idx))
17750 return false;
17751
17752 // If the value is not a vectorized instruction in the expression and not used
17753 // by the insertelement instruction and not used in multiple vector nodes, it
17754 // cannot be demoted.
17755 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
17756 if (isa<PoisonValue>(R))
17757 return false;
17758 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17759 });
17760 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
17761 if (isa<PoisonValue>(V))
17762 return true;
17763 if (MultiNodeScalars.contains(V))
17764 return false;
17765 // For the last shuffle of sext/zext with many uses, we need to check the
17766 // extra bit for unsigned values; otherwise we may have incorrect casting for
17767 // reused scalars.
17768 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
17769 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
17770 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17771 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17772 return true;
17773 }
17774 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
17775 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17776 if (IsSignedNode)
17777 ++BitWidth1;
17778 if (auto *I = dyn_cast<Instruction>(V)) {
17779 APInt Mask = DB->getDemandedBits(I);
17780 unsigned BitWidth2 =
17781 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17782 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17783 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
17784 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17785 break;
17786 BitWidth2 *= 2;
17787 }
17788 BitWidth1 = std::min(BitWidth1, BitWidth2);
17789 }
17790 BitWidth = std::max(BitWidth, BitWidth1);
17791 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
17792 };
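// Worked example for IsPotentiallyTruncated above (illustrative numbers
// only): for an i32 value with 25 known sign bits, BitWidth1 starts at
// 32 - 25 = 7 (plus one if the node may be signed) and may shrink further via
// the demanded-bits mask; the value then counts as truncatable only if the
// final width is at most half of the original 32 bits.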
17793 auto FinalAnalysis = [&, TTI = TTI]() {
17794 if (!IsProfitableToDemote)
17795 return false;
17796 bool Res = all_of(
17797 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
17798 // Demote gathers.
17799 if (Res && E.isGather()) {
17800 // Check possible extractelement instructions bases and final vector
17801 // length.
17802 SmallPtrSet<Value *, 4> UniqueBases;
17803 for (Value *V : E.Scalars) {
17804 auto *EE = dyn_cast<ExtractElementInst>(V);
17805 if (!EE)
17806 continue;
17807 UniqueBases.insert(EE->getVectorOperand());
17808 }
17809 const unsigned VF = E.Scalars.size();
17810 Type *OrigScalarTy = E.Scalars.front()->getType();
17811 if (UniqueBases.size() <= 2 ||
17812 TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
17814 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
17815 ToDemote.push_back(E.Idx);
17816 }
17817 return Res;
17818 };
17819 if (E.isGather() || !Visited.insert(&E).second ||
17820 any_of(E.Scalars, [&](Value *V) {
17821 return all_of(V->users(), [&](User *U) {
17822 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17823 });
17824 }))
17825 return FinalAnalysis();
17826
17827 if (any_of(E.Scalars, [&](Value *V) {
17828 return !all_of(V->users(), [=](User *U) {
17829 return getTreeEntry(U) ||
17830 (E.Idx == 0 && UserIgnoreList &&
17831 UserIgnoreList->contains(U)) ||
17832 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17833 !U->getType()->isScalableTy() &&
17834 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17835 }) && !IsPotentiallyTruncated(V, BitWidth);
17836 }))
17837 return false;
17838
17839 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
17840 bool &NeedToExit) {
17841 NeedToExit = false;
17842 unsigned InitLevel = MaxDepthLevel;
17843 for (const TreeEntry *Op : Operands) {
17844 unsigned Level = InitLevel;
17845 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
17846 ToDemote, Visited, NodesToKeepBWs, Level,
17847 IsProfitableToDemote, IsTruncRoot)) {
17848 if (!IsProfitableToDemote)
17849 return false;
17850 NeedToExit = true;
17851 if (!FinalAnalysis())
17852 return false;
17853 continue;
17854 }
17855 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17856 }
17857 return true;
17858 };
17859 auto AttemptCheckBitwidth =
17860 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
17861 // Try all bitwidth < OrigBitWidth.
17862 NeedToExit = false;
17863 unsigned BestFailBitwidth = 0;
17864 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
17865 if (Checker(BitWidth, OrigBitWidth))
17866 return true;
17867 if (BestFailBitwidth == 0 && FinalAnalysis())
17868 BestFailBitwidth = BitWidth;
17869 }
17870 if (BitWidth >= OrigBitWidth) {
17871 if (BestFailBitwidth == 0) {
17872 BitWidth = OrigBitWidth;
17873 return false;
17874 }
17875 MaxDepthLevel = 1;
17876 BitWidth = BestFailBitwidth;
17877 NeedToExit = true;
17878 return true;
17879 }
17880 return false;
17881 };
17882 auto TryProcessInstruction =
17883 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
17884 function_ref<bool(unsigned, unsigned)> Checker = {}) {
17885 if (Operands.empty()) {
17886 if (!IsTruncRoot)
17887 MaxDepthLevel = 1;
17888 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17889 std::ref(BitWidth)));
17890 } else {
17891 // Several vectorized uses? Check if we can truncate it; otherwise
17892 // exit.
17893 if (E.UserTreeIndices.size() > 1 &&
17894 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17895 std::ref(BitWidth))))
17896 return false;
17897 bool NeedToExit = false;
17898 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17899 return false;
17900 if (NeedToExit)
17901 return true;
17902 if (!ProcessOperands(Operands, NeedToExit))
17903 return false;
17904 if (NeedToExit)
17905 return true;
17906 }
17907
17908 ++MaxDepthLevel;
17909 // Record the entry that we can demote.
17910 ToDemote.push_back(E.Idx);
17911 return IsProfitableToDemote;
17912 };
17913 switch (E.getOpcode()) {
17914
17915 // We can always demote truncations and extensions. Since truncations can
17916 // seed additional demotion, we save the truncated value.
17917 case Instruction::Trunc:
17918 if (IsProfitableToDemoteRoot)
17919 IsProfitableToDemote = true;
17920 return TryProcessInstruction(BitWidth);
17921 case Instruction::ZExt:
17922 case Instruction::SExt:
17923 IsProfitableToDemote = true;
17924 return TryProcessInstruction(BitWidth);
17925
17926 // We can demote certain binary operations if we can demote both of their
17927 // operands.
17928 case Instruction::Add:
17929 case Instruction::Sub:
17930 case Instruction::Mul:
17931 case Instruction::And:
17932 case Instruction::Or:
17933 case Instruction::Xor: {
17934 return TryProcessInstruction(
17935 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17936 }
17937 case Instruction::Freeze:
17938 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
17939 case Instruction::Shl: {
17940 // If we are truncating the result of this SHL, and if it's a shift of an
17941     // in-range amount, we can always perform a SHL in a smaller type.
17942 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
17943 return all_of(E.Scalars, [&](Value *V) {
17944 if (isa<PoisonValue>(V))
17945 return true;
17946 auto *I = cast<Instruction>(V);
17947 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17948 return AmtKnownBits.getMaxValue().ult(BitWidth);
17949 });
17950 };
17951 return TryProcessInstruction(
17952 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
17953 }
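  // For example (illustrative, not taken from a test): truncating
  //   %s = shl i32 %x, %amt   ; known bits prove %amt < 16
  // to i16 is safe, because an in-range amount for the narrower type cannot
  // shift demanded bits past the truncated width.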
17954 case Instruction::LShr: {
17955 // If this is a truncate of a logical shr, we can truncate it to a smaller
17956 // lshr iff we know that the bits we would otherwise be shifting in are
17957 // already zeros.
17958 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17959 return all_of(E.Scalars, [&](Value *V) {
17960 if (isa<PoisonValue>(V))
17961 return true;
17962 auto *I = cast<Instruction>(V);
17963 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17964 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17965 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17966 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
17967 SimplifyQuery(*DL));
17968 });
17969 };
17970 return TryProcessInstruction(
17971 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17972 LShrChecker);
17973 }
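  // For example (illustrative): for
  //   %s = lshr i32 %x, %amt  ; %amt known < 16, upper 16 bits of %x known zero
  // the shift can be performed in i16, since the bits shifted in from above the
  // truncated width are already zero.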
17974 case Instruction::AShr: {
17975 // If this is a truncate of an arithmetic shr, we can truncate it to a
17976     // smaller ashr iff we know that all the bits from the sign bit of the
17977     // original type down to the sign bit of the truncated type are the same.
17978 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17979 return all_of(E.Scalars, [&](Value *V) {
17980 if (isa<PoisonValue>(V))
17981 return true;
17982 auto *I = cast<Instruction>(V);
17983 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17984 unsigned ShiftedBits = OrigBitWidth - BitWidth;
17985 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17986 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
17987 nullptr, DT);
17988 });
17989 };
17990 return TryProcessInstruction(
17991 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17992 AShrChecker);
17993 }
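  // For example (illustrative): for
  //   %s = ashr i32 %x, %amt  ; %amt known < 16, %x has at least 17 sign bits
  // bits 31..15 of %x all equal the sign bit, so performing the shift in i16
  // reproduces the truncated result.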
17994 case Instruction::UDiv:
17995 case Instruction::URem: {
17996 // UDiv and URem can be truncated if all the truncated bits are zero.
17997 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
17998 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
17999 return all_of(E.Scalars, [&](Value *V) {
18000 auto *I = cast<Instruction>(V);
18001 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18002 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
18003 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18004 });
18005 };
18006 return TryProcessInstruction(
18007 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
18008 }
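  // For example (illustrative): a udiv/urem of two i32 values whose upper 16
  // bits are known zero produces the same low 16 bits when performed in i16,
  // so the node can be demoted.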
18009
18010 // We can demote selects if we can demote their true and false values.
18011 case Instruction::Select: {
18012 return TryProcessInstruction(
18013 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18014 }
18015
18016 // We can demote phis if we can demote all their incoming operands. Note that
18017 // we don't need to worry about cycles since we ensure single use above.
18018 case Instruction::PHI: {
18019     const unsigned NumOps = E.getNumOperands();
18020     SmallVector<const TreeEntry *> Ops(NumOps);
18021     transform(seq<unsigned>(0, NumOps), Ops.begin(),
18022 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
18023
18024 return TryProcessInstruction(BitWidth, Ops);
18025 }
18026
18027 case Instruction::Call: {
18028 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
18029 if (!IC)
18030       break;
18031     Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
18032     if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
18033 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
18034 break;
18035 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
18036 function_ref<bool(unsigned, unsigned)> CallChecker;
18037 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18038 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18039 return all_of(E.Scalars, [&](Value *V) {
18040 auto *I = cast<Instruction>(V);
18041 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
18042 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18043 return MaskedValueIsZero(I->getOperand(0), Mask,
18044 SimplifyQuery(*DL)) &&
18045 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18046 }
18047 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
18048 "Expected min/max intrinsics only.");
18049 unsigned SignBits = OrigBitWidth - BitWidth;
18050 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18051 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18052 nullptr, DT);
18053 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
18054 nullptr, DT);
18055 return SignBits <= Op0SignBits &&
18056 ((SignBits != Op0SignBits &&
18057 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18058 MaskedValueIsZero(I->getOperand(0), Mask,
18059 SimplifyQuery(*DL))) &&
18060 SignBits <= Op1SignBits &&
18061 ((SignBits != Op1SignBits &&
18062 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
18063 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
18064 });
18065 };
18066 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18067 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18068 return all_of(E.Scalars, [&](Value *V) {
18069 auto *I = cast<Instruction>(V);
18070 unsigned SignBits = OrigBitWidth - BitWidth;
18071 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18072 unsigned Op0SignBits =
18073 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
18074 return SignBits <= Op0SignBits &&
18075 ((SignBits != Op0SignBits &&
18076 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18077 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
18078 });
18079 };
18080 if (ID != Intrinsic::abs) {
18081 Operands.push_back(getOperandEntry(&E, 1));
18082 CallChecker = CompChecker;
18083 } else {
18084 CallChecker = AbsChecker;
18085 }
18086 InstructionCost BestCost =
18087 std::numeric_limits<InstructionCost::CostType>::max();
18088 unsigned BestBitWidth = BitWidth;
18089 unsigned VF = E.Scalars.size();
18090 // Choose the best bitwidth based on cost estimations.
18091 auto Checker = [&](unsigned BitWidth, unsigned) {
18092 unsigned MinBW = PowerOf2Ceil(BitWidth);
18093 SmallVector<Type *> ArgTys =
18094 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
18095 auto VecCallCosts = getVectorCallCosts(
18096 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
18097 TTI, TLI, ArgTys);
18098 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
18099 if (Cost < BestCost) {
18100 BestCost = Cost;
18101 BestBitWidth = BitWidth;
18102 }
18103 return false;
18104 };
18105 [[maybe_unused]] bool NeedToExit;
18106 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18107 BitWidth = BestBitWidth;
18108 return TryProcessInstruction(BitWidth, Operands, CallChecker);
18109 }
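  // For example (illustrative): a call to @llvm.umin.i32 whose operands both
  // have their upper 16 bits known zero can be demoted to i16; for the signed
  // variants and @llvm.abs the checkers above require enough known sign bits
  // instead.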
18110
18111 // Otherwise, conservatively give up.
18112 default:
18113 break;
18114 }
18115 MaxDepthLevel = 1;
18116 return FinalAnalysis();
18117}
18118
18119static RecurKind getRdxKind(Value *V);
18120
18121 void BoUpSLP::computeMinimumValueSizes() {
18122   // We only attempt to truncate integer expressions.
18123 bool IsStoreOrInsertElt =
18124 VectorizableTree.front()->getOpcode() == Instruction::Store ||
18125 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
18126 if ((IsStoreOrInsertElt || UserIgnoreList) &&
18127 ExtraBitWidthNodes.size() <= 1 &&
18128 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
18129 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
18130 return;
18131
18132 unsigned NodeIdx = 0;
18133 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18134 NodeIdx = 1;
18135
18136 // Ensure the roots of the vectorizable tree don't form a cycle.
18137 if (VectorizableTree[NodeIdx]->isGather() ||
18138 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18139 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18140 [NodeIdx](const EdgeInfo &EI) {
18141 return EI.UserTE->Idx > NodeIdx;
18142 })))
18143 return;
18144
18145   // If the first value node for a store/insertelement is a sext/zext/trunc,
18146   // skip it and resize to the final type.
18147 bool IsTruncRoot = false;
18148 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18149 SmallVector<unsigned> RootDemotes;
18150 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
18151 if (NodeIdx != 0 &&
18152 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18153 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18154 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
18155 IsTruncRoot = true;
18156 RootDemotes.push_back(NodeIdx);
18157 IsProfitableToDemoteRoot = true;
18158 ++NodeIdx;
18159 }
18160
18161   // The reduction was already analyzed and found not profitable - exit.
18162 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
18163 return;
18164
18165 SmallVector<unsigned> ToDemote;
18166 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
18167 bool IsProfitableToDemoteRoot, unsigned Opcode,
18168 unsigned Limit, bool IsTruncRoot,
18169 bool IsSignedCmp) -> unsigned {
18170 ToDemote.clear();
18171     // If the root is a trunc and the next node is a gather/buildvector, keep
18172     // the trunc in scalars, which is free in most cases.
18173 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18174 !NodesToKeepBWs.contains(E.Idx) &&
18175 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18176 all_of(E.Scalars, [&](Value *V) {
18177 return V->hasOneUse() || isa<Constant>(V) ||
18178 (!V->hasNUsesOrMore(UsesLimit) &&
18179 none_of(V->users(), [&](User *U) {
18180 const TreeEntry *TE = getTreeEntry(U);
18181 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18182 if (TE == UserTE || !TE)
18183 return false;
18184 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18185 SelectInst>(U) ||
18186 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18187 SelectInst>(UserTE->getMainOp()))
18188 return true;
18189 unsigned UserTESz = DL->getTypeSizeInBits(
18190 UserTE->Scalars.front()->getType());
18191 auto It = MinBWs.find(TE);
18192 if (It != MinBWs.end() && It->second.first > UserTESz)
18193 return true;
18194 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18195 }));
18196 })) {
18197 ToDemote.push_back(E.Idx);
18198 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18199 auto It = MinBWs.find(UserTE);
18200 if (It != MinBWs.end())
18201 return It->second.first;
18202 unsigned MaxBitWidth =
18203 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18204 MaxBitWidth = bit_ceil(MaxBitWidth);
18205 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18206 MaxBitWidth = 8;
18207 return MaxBitWidth;
18208 }
18209
18210 unsigned VF = E.getVectorFactor();
18211 Type *ScalarTy = E.Scalars.front()->getType();
18212 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18213 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
18214 if (!TreeRootIT || !Opcode)
18215 return 0u;
18216
18217 if (any_of(E.Scalars,
18218 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
18219 return 0u;
18220
18221 unsigned NumParts = TTI->getNumberOfParts(
18222 getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18223
18224 // The maximum bit width required to represent all the values that can be
18225 // demoted without loss of precision. It would be safe to truncate the roots
18226 // of the expression to this width.
18227 unsigned MaxBitWidth = 1u;
18228
18229 // True if the roots can be zero-extended back to their original type,
18230 // rather than sign-extended. We know that if the leading bits are not
18231 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
18232 // True.
18233 // Determine if the sign bit of all the roots is known to be zero. If not,
18234 // IsKnownPositive is set to False.
18235 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
18236 if (isa<PoisonValue>(R))
18237 return true;
18238 KnownBits Known = computeKnownBits(R, *DL);
18239 return Known.isNonNegative();
18240 });
18241
18242 // We first check if all the bits of the roots are demanded. If they're not,
18243 // we can truncate the roots to this narrower type.
18244 for (Value *Root : E.Scalars) {
18245 if (isa<PoisonValue>(Root))
18246 continue;
18247 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
18248 TypeSize NumTypeBits =
18249 DL->getTypeSizeInBits(Root->getType()->getScalarType());
18250 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18251 // If we can't prove that the sign bit is zero, we must add one to the
18252 // maximum bit width to account for the unknown sign bit. This preserves
18253 // the existing sign bit so we can safely sign-extend the root back to the
18254 // original type. Otherwise, if we know the sign bit is zero, we will
18255 // zero-extend the root instead.
18256 //
18257 // FIXME: This is somewhat suboptimal, as there will be cases where adding
18258 // one to the maximum bit width will yield a larger-than-necessary
18259 // type. In general, we need to add an extra bit only if we can't
18260 // prove that the upper bit of the original type is equal to the
18261 // upper bit of the proposed smaller type. If these two bits are
18262 // the same (either zero or one) we know that sign-extending from
18263 // the smaller type will result in the same value. Here, since we
18264 // can't yet prove this, we are just making the proposed smaller
18265 // type larger to ensure correctness.
18266 if (!IsKnownPositive)
18267 ++BitWidth1;
18268
18269 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
18270 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18271 MaxBitWidth =
18272 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
18273 }
18274
18275 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18276 MaxBitWidth = 8;
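    // For example (illustrative values): an i32 root with 25 known sign bits
    // gives BitWidth1 = 32 - 25 = 7, bumped to 8 when IsKnownPositive is false;
    // if DemandedBits reports only the low 16 bits demanded, BitWidth2 = 16, so
    // this root contributes min(8, 16) = 8 to MaxBitWidth.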
18277
18278     // If the original type is large but the reduced type does not improve
18279     // register usage - ignore it.
18280 if (NumParts > 1 &&
18281         NumParts ==
18282             TTI->getNumberOfParts(getWidenedType(
18283                 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
18284 return 0u;
18285
18286 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18287 Opcode == Instruction::SExt ||
18288 Opcode == Instruction::ZExt || NumParts > 1;
18289 // Conservatively determine if we can actually truncate the roots of the
18290 // expression. Collect the values that can be demoted in ToDemote and
18291 // additional roots that require investigating in Roots.
18292     DenseSet<const TreeEntry *> Visited;
18293     unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18294 bool NeedToDemote = IsProfitableToDemote;
18295
18296 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18297 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18298 NeedToDemote, IsTruncRoot) ||
18299 (MaxDepthLevel <= Limit &&
18300 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18301 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18302 DL->getTypeSizeInBits(TreeRootIT) /
18303 DL->getTypeSizeInBits(
18304 E.getMainOp()->getOperand(0)->getType()) >
18305 2)))))
18306 return 0u;
18307 // Round MaxBitWidth up to the next power-of-two.
18308 MaxBitWidth = bit_ceil(MaxBitWidth);
18309
18310 return MaxBitWidth;
18311 };
18312
18313 // If we can truncate the root, we must collect additional values that might
18314 // be demoted as a result. That is, those seeded by truncations we will
18315 // modify.
18316 // Add reduction ops sizes, if any.
18317 if (UserIgnoreList &&
18318 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18319 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
18320     // x i1> to iN)).
18321 if (all_of(*UserIgnoreList,
18322 [](Value *V) {
18323 return isa<PoisonValue>(V) ||
18324 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18325 }) &&
18326 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18327 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18328 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18329 Builder.getInt1Ty()) {
18330 ReductionBitWidth = 1;
18331 } else {
18332 for (Value *V : *UserIgnoreList) {
18333 if (isa<PoisonValue>(V))
18334 continue;
18335 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
18336 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
18337         unsigned BitWidth1 = NumTypeBits - NumSignBits;
18338         if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
18339           ++BitWidth1;
18340         unsigned BitWidth2 = BitWidth1;
18341         if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
18342           APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
18343 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18344 }
18345 ReductionBitWidth =
18346 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18347 }
18348 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18349 ReductionBitWidth = 8;
18350
18351 ReductionBitWidth = bit_ceil(ReductionBitWidth);
18352 }
18353 }
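  // For example (illustrative IR shape): a reduction such as
  //   %z = zext <8 x i1> %m to <8 x i32>
  //   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %z)
  // only needs one bit per lane, so ReductionBitWidth is set to 1 above and the
  // reduction can later be lowered through a population count of %m.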
18354 bool IsTopRoot = NodeIdx == 0;
18355 while (NodeIdx < VectorizableTree.size() &&
18356 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18357 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18358 RootDemotes.push_back(NodeIdx);
18359 ++NodeIdx;
18360 IsTruncRoot = true;
18361 }
18362 bool IsSignedCmp = false;
18363 while (NodeIdx < VectorizableTree.size()) {
18364 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
18365 unsigned Limit = 2;
18366 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
18367 if (IsTopRoot &&
18368 ReductionBitWidth ==
18369 DL->getTypeSizeInBits(
18370 VectorizableTree.front()->Scalars.front()->getType()))
18371 Limit = 3;
18372 unsigned MaxBitWidth = ComputeMaxBitWidth(
18373 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
18374 Limit, IsTruncRoot, IsSignedCmp);
18375 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
18376 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18377 ReductionBitWidth = bit_ceil(MaxBitWidth);
18378 else if (MaxBitWidth == 0)
18379 ReductionBitWidth = 0;
18380 }
18381
18382 for (unsigned Idx : RootDemotes) {
18383 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
18384 uint32_t OrigBitWidth =
18385 DL->getTypeSizeInBits(V->getType()->getScalarType());
18386 if (OrigBitWidth > MaxBitWidth) {
18387 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
18388 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
18389 }
18390 return false;
18391 }))
18392 ToDemote.push_back(Idx);
18393 }
18394 RootDemotes.clear();
18395 IsTopRoot = false;
18396 IsProfitableToDemoteRoot = true;
18397
18398 if (ExtraBitWidthNodes.empty()) {
18399 NodeIdx = VectorizableTree.size();
18400 } else {
18401 unsigned NewIdx = 0;
18402 do {
18403 NewIdx = *ExtraBitWidthNodes.begin();
18404 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
18405 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
18406 NodeIdx = NewIdx;
18407 IsTruncRoot =
18408 NodeIdx < VectorizableTree.size() &&
18409 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18410 [](const EdgeInfo &EI) {
18411 return EI.EdgeIdx == 0 &&
18412 EI.UserTE->getOpcode() == Instruction::Trunc &&
18413 !EI.UserTE->isAltShuffle();
18414 });
18415 IsSignedCmp =
18416 NodeIdx < VectorizableTree.size() &&
18417 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18418 [&](const EdgeInfo &EI) {
18419 return EI.UserTE->getOpcode() == Instruction::ICmp &&
18420 any_of(EI.UserTE->Scalars, [&](Value *V) {
18421 auto *IC = dyn_cast<ICmpInst>(V);
18422 return IC &&
18423 (IC->isSigned() ||
18424 !isKnownNonNegative(IC->getOperand(0),
18425 SimplifyQuery(*DL)) ||
18426 !isKnownNonNegative(IC->getOperand(1),
18427 SimplifyQuery(*DL)));
18428 });
18429 });
18430 }
18431
18432 // If the maximum bit width we compute is less than the width of the roots'
18433 // type, we can proceed with the narrowing. Otherwise, do nothing.
18434 if (MaxBitWidth == 0 ||
18435 MaxBitWidth >=
18436 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
18437 ->getBitWidth()) {
18438 if (UserIgnoreList)
18439 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
18440 NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
18441 continue;
18442 }
18443
18444     // Finally, map the values we can demote to the maximum bit width we
18445     // computed.
18446 for (unsigned Idx : ToDemote) {
18447 TreeEntry *TE = VectorizableTree[Idx].get();
18448 if (MinBWs.contains(TE))
18449 continue;
18450 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
18451 if (isa<PoisonValue>(R))
18452 return false;
18453 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18454 });
18455 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
18456 }
18457 }
18458}
18459
18460 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
18461   auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
18462 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
18463 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
18464 auto *AA = &AM.getResult<AAManager>(F);
18465 auto *LI = &AM.getResult<LoopAnalysis>(F);
18466 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
18467 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
18468   auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
18469   auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
18470
18471 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
18472 if (!Changed)
18473 return PreservedAnalyses::all();
18474
18475   PreservedAnalyses PA;
18476   PA.preserveSet<CFGAnalyses>();
18477   return PA;
18478}
18479
18480 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
18481                                 TargetTransformInfo *TTI_,
18482 TargetLibraryInfo *TLI_, AAResults *AA_,
18483 LoopInfo *LI_, DominatorTree *DT_,
18484                                 AssumptionCache *AC_, DemandedBits *DB_,
18485                                 OptimizationRemarkEmitter *ORE_) {
18486   if (!RunSLPVectorization)
18487     return false;
18488 SE = SE_;
18489 TTI = TTI_;
18490 TLI = TLI_;
18491 AA = AA_;
18492 LI = LI_;
18493 DT = DT_;
18494 AC = AC_;
18495 DB = DB_;
18496 DL = &F.getDataLayout();
18497
18498 Stores.clear();
18499 GEPs.clear();
18500 bool Changed = false;
18501
18502 // If the target claims to have no vector registers don't attempt
18503 // vectorization.
18504   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
18505     LLVM_DEBUG(
18506 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
18507 return false;
18508 }
18509
18510 // Don't vectorize when the attribute NoImplicitFloat is used.
18511 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
18512 return false;
18513
18514 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
18515
18516   // Use the bottom-up SLP vectorizer to construct chains that start with
18517   // store instructions.
18518 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
18519
18520 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
18521 // delete instructions.
18522
18523 // Update DFS numbers now so that we can use them for ordering.
18524 DT->updateDFSNumbers();
18525
18526 // Scan the blocks in the function in post order.
18527 for (auto *BB : post_order(&F.getEntryBlock())) {
18528 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
18529 continue;
18530
18531 // Start new block - clear the list of reduction roots.
18532 R.clearReductionData();
18533 collectSeedInstructions(BB);
18534
18535 // Vectorize trees that end at stores.
18536 if (!Stores.empty()) {
18537 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
18538 << " underlying objects.\n");
18539 Changed |= vectorizeStoreChains(R);
18540 }
18541
18542 // Vectorize trees that end at reductions.
18543 Changed |= vectorizeChainsInBlock(BB, R);
18544
18545 // Vectorize the index computations of getelementptr instructions. This
18546 // is primarily intended to catch gather-like idioms ending at
18547 // non-consecutive loads.
18548 if (!GEPs.empty()) {
18549 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
18550 << " underlying objects.\n");
18551 Changed |= vectorizeGEPIndices(BB, R);
18552 }
18553 }
18554
18555 if (Changed) {
18556 R.optimizeGatherSequence();
18557 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
18558 }
18559 return Changed;
18560}
18561
18562std::optional<bool>
18563SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
18564 unsigned Idx, unsigned MinVF,
18565 unsigned &Size) {
18566 Size = 0;
18567 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
18568 << "\n");
18569 const unsigned Sz = R.getVectorElementSize(Chain[0]);
18570 unsigned VF = Chain.size();
18571
18572 if (!has_single_bit(Sz) ||
18573       !hasFullVectorsOrPowerOf2(
18574           *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
18575 VF) ||
18576 VF < 2 || VF < MinVF) {
18577 // Check if vectorizing with a non-power-of-2 VF should be considered. At
18578 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
18579 // all vector lanes are used.
18580 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
18581 return false;
18582 }
18583
18584 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
18585 << "\n");
18586
18587 SetVector<Value *> ValOps;
18588 for (Value *V : Chain)
18589 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
18590 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
18591 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
18592 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
18593 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
18594 bool IsAllowedSize =
18595 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
18596 ValOps.size()) ||
18597 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
18598 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
18599 (!S.getMainOp()->isSafeToRemove() ||
18600 any_of(ValOps.getArrayRef(),
18601 [&](Value *V) {
18602 return !isa<ExtractElementInst>(V) &&
18603 (V->getNumUses() > Chain.size() ||
18604 any_of(V->users(), [&](User *U) {
18605 return !Stores.contains(U);
18606 }));
18607 }))) ||
18608 (ValOps.size() > Chain.size() / 2 && !S)) {
18609 Size = (!IsAllowedSize && S) ? 1 : 2;
18610 return false;
18611 }
18612 }
18613 if (R.isLoadCombineCandidate(Chain))
18614 return true;
18615 R.buildTree(Chain);
18616   // Check if the tree is tiny and the store itself or its stored value is not vectorized.
18617 if (R.isTreeTinyAndNotFullyVectorizable()) {
18618 if (R.isGathered(Chain.front()) ||
18619 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18620 return std::nullopt;
18621 Size = R.getCanonicalGraphSize();
18622 return false;
18623 }
18624 R.reorderTopToBottom();
18625 R.reorderBottomToTop();
18626 R.transformNodes();
18627 R.buildExternalUses();
18628
18629 R.computeMinimumValueSizes();
18630
18631 Size = R.getCanonicalGraphSize();
18632 if (S && S.getOpcode() == Instruction::Load)
18633 Size = 2; // cut off masked gather small trees
18634 InstructionCost Cost = R.getTreeCost();
18635
18636 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
18637 if (Cost < -SLPCostThreshold) {
18638 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
18639
18640 using namespace ore;
18641
18642 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
18643 cast<StoreInst>(Chain[0]))
18644 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
18645 << " and with tree size "
18646 << NV("TreeSize", R.getTreeSize()));
18647
18648 R.vectorizeTree();
18649 return true;
18650 }
18651
18652 return false;
18653}
18654
18655/// Checks if the quadratic mean deviation is less than 90% of the mean size.
18656static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
18657 bool First) {
18658 unsigned Num = 0;
18659 uint64_t Sum = std::accumulate(
18660 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18661 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18662 unsigned Size = First ? Val.first : Val.second;
18663 if (Size == 1)
18664 return V;
18665 ++Num;
18666 return V + Size;
18667 });
18668 if (Num == 0)
18669 return true;
18670 uint64_t Mean = Sum / Num;
18671 if (Mean == 0)
18672 return true;
18673 uint64_t Dev = std::accumulate(
18674 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18675 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18676 unsigned P = First ? Val.first : Val.second;
18677 if (P == 1)
18678 return V;
18679 return V + (P - Mean) * (P - Mean);
18680 }) /
18681 Num;
18682 return Dev * 81 / (Mean * Mean) == 0;
18683}
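// For example (illustrative values): for sizes {8, 8, 9} the integer mean is
// 25 / 3 = 8 and Dev = (0 + 0 + 1) / 3 = 0, so 0 * 81 / 64 == 0 and the
// function returns true; for {8, 8, 10}, Dev = (0 + 0 + 4) / 3 = 1 and
// 1 * 81 / 64 == 1, so it returns false.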
18684
18685bool SLPVectorizerPass::vectorizeStores(
18686 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
18687 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
18688 &Visited) {
18689 // We may run into multiple chains that merge into a single chain. We mark the
18690 // stores that we vectorized so that we don't visit the same store twice.
18691 BoUpSLP::ValueSet VectorizedStores;
18692 bool Changed = false;
18693
18694 struct StoreDistCompare {
18695 bool operator()(const std::pair<unsigned, int> &Op1,
18696 const std::pair<unsigned, int> &Op2) const {
18697 return Op1.second < Op2.second;
18698 }
18699 };
18700 // A set of pairs (index of store in Stores array ref, Distance of the store
18701 // address relative to base store address in units).
18702 using StoreIndexToDistSet =
18703 std::set<std::pair<unsigned, int>, StoreDistCompare>;
18704 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18705     int PrevDist = -1;
18706     SmallVector<Value *> Operands;
18707     // Collect the chain into a list.
18708 for (auto [Idx, Data] : enumerate(Set)) {
18709 if (Operands.empty() || Data.second - PrevDist == 1) {
18710 Operands.push_back(Stores[Data.first]);
18711 PrevDist = Data.second;
18712 if (Idx != Set.size() - 1)
18713 continue;
18714 }
18715 auto E = make_scope_exit([&, &DataVar = Data]() {
18716 Operands.clear();
18717 Operands.push_back(Stores[DataVar.first]);
18718 PrevDist = DataVar.second;
18719 });
18720
18721 if (Operands.size() <= 1 ||
18722 !Visited
18723 .insert({Operands.front(),
18724 cast<StoreInst>(Operands.front())->getValueOperand(),
18725 Operands.back(),
18726 cast<StoreInst>(Operands.back())->getValueOperand(),
18727 Operands.size()})
18728 .second)
18729 continue;
18730
18731 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18732 unsigned EltSize = R.getVectorElementSize(Operands[0]);
18733 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
18734
18735 unsigned MaxVF =
18736 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18737 auto *Store = cast<StoreInst>(Operands[0]);
18738 Type *StoreTy = Store->getValueOperand()->getType();
18739 Type *ValueTy = StoreTy;
18740 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
18741 ValueTy = Trunc->getSrcTy();
18742       unsigned MinVF = std::max<unsigned>(
18743           2, PowerOf2Ceil(TTI->getStoreMinimumVF(
18744                  R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18745 ValueTy)));
18746
18747 if (MaxVF < MinVF) {
18748 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18749 << ") < "
18750 << "MinVF (" << MinVF << ")\n");
18751 continue;
18752 }
18753
18754       unsigned NonPowerOf2VF = 0;
18755       if (VectorizeNonPowerOf2) {
18756         // First try vectorizing with a non-power-of-2 VF. At the moment, only
18757 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
18758 // lanes are used.
18759 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
18760 if (has_single_bit(CandVF + 1)) {
18761 NonPowerOf2VF = CandVF;
18762 assert(NonPowerOf2VF != MaxVF &&
18763 "Non-power-of-2 VF should not be equal to MaxVF");
18764 }
18765 }
18766
18767 unsigned MaxRegVF = MaxVF;
18768 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
18769 if (MaxVF < MinVF) {
18770 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18771 << ") < "
18772 << "MinVF (" << MinVF << ")\n");
18773 continue;
18774 }
18775
18776 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
18777 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
18778 unsigned Size = MinVF;
18779 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
18780 VF = Size > MaxVF ? NonPowerOf2VF : Size;
18781 Size *= 2;
18782 });
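      // For example (illustrative values): with MinVF = 2 and MaxVF = 16,
      // Sz = 1 + 4 - 1 = 4 and the loop above fills CandidateVFs, largest first,
      // with {16, 8, 4, 2}; if a non-power-of-2 VF was picked, one extra leading
      // slot holds NonPowerOf2VF.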
18783 unsigned End = Operands.size();
18784 unsigned Repeat = 0;
18785 constexpr unsigned MaxAttempts = 4;
18787 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
18788 P.first = P.second = 1;
18789 });
18791 auto IsNotVectorized = [](bool First,
18792 const std::pair<unsigned, unsigned> &P) {
18793 return First ? P.first > 0 : P.second > 0;
18794 };
18795 auto IsVectorized = [](bool First,
18796 const std::pair<unsigned, unsigned> &P) {
18797 return First ? P.first == 0 : P.second == 0;
18798 };
18799 auto VFIsProfitable = [](bool First, unsigned Size,
18800 const std::pair<unsigned, unsigned> &P) {
18801 return First ? Size >= P.first : Size >= P.second;
18802 };
18803 auto FirstSizeSame = [](unsigned Size,
18804 const std::pair<unsigned, unsigned> &P) {
18805 return Size == P.first;
18806 };
18807 while (true) {
18808 ++Repeat;
18809 bool RepeatChanged = false;
18810 bool AnyProfitableGraph = false;
18811 for (unsigned Size : CandidateVFs) {
18812 AnyProfitableGraph = false;
18813 unsigned StartIdx = std::distance(
18814 RangeSizes.begin(),
18815 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
18816 std::placeholders::_1)));
18817 while (StartIdx < End) {
18818 unsigned EndIdx =
18819 std::distance(RangeSizes.begin(),
18820 find_if(RangeSizes.drop_front(StartIdx),
18821 std::bind(IsVectorized, Size >= MaxRegVF,
18822 std::placeholders::_1)));
18823 unsigned Sz = EndIdx >= End ? End : EndIdx;
18824 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
18825 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
18826 Size >= MaxRegVF)) {
18827 ++Cnt;
18828 continue;
18829 }
18830                 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
18831                 assert(all_of(Slice,
18832 [&](Value *V) {
18833 return cast<StoreInst>(V)
18834 ->getValueOperand()
18835 ->getType() ==
18836 cast<StoreInst>(Slice.front())
18837 ->getValueOperand()
18838 ->getType();
18839 }) &&
18840 "Expected all operands of same type.");
18841 if (!NonSchedulable.empty()) {
18842 auto [NonSchedSizeMax, NonSchedSizeMin] =
18843 NonSchedulable.lookup(Slice.front());
18844 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
18845 Cnt += NonSchedSizeMax;
18846 continue;
18847 }
18848 }
18849 unsigned TreeSize;
18850 std::optional<bool> Res =
18851 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18852 if (!Res) {
18853 NonSchedulable
18854 .try_emplace(Slice.front(), std::make_pair(Size, Size))
18855 .first->getSecond()
18856 .second = Size;
18857 } else if (*Res) {
18858 // Mark the vectorized stores so that we don't vectorize them
18859 // again.
18860 VectorizedStores.insert(Slice.begin(), Slice.end());
18863 AnyProfitableGraph = RepeatChanged = Changed = true;
18864 // If we vectorized initial block, no need to try to vectorize
18865 // it again.
18866 for_each(RangeSizes.slice(Cnt, Size),
18867 [](std::pair<unsigned, unsigned> &P) {
18868 P.first = P.second = 0;
18869 });
18870 if (Cnt < StartIdx + MinVF) {
18871 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18872 [](std::pair<unsigned, unsigned> &P) {
18873 P.first = P.second = 0;
18874 });
18875 StartIdx = Cnt + Size;
18876 }
18877 if (Cnt > Sz - Size - MinVF) {
18878 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
18879 [](std::pair<unsigned, unsigned> &P) {
18880 P.first = P.second = 0;
18881 });
18882 if (Sz == End)
18883 End = Cnt;
18884 Sz = Cnt;
18885 }
18886 Cnt += Size;
18887 continue;
18888 }
18889 if (Size > 2 && Res &&
18890 !all_of(RangeSizes.slice(Cnt, Size),
18891 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
18892 std::placeholders::_1))) {
18893 Cnt += Size;
18894 continue;
18895 }
18896                   // For very big VFs, check that we are not rebuilding the same
18897                   // trees, just with a larger number of elements.
18898 if (Size > MaxRegVF && TreeSize > 1 &&
18899 all_of(RangeSizes.slice(Cnt, Size),
18900 std::bind(FirstSizeSame, TreeSize,
18901 std::placeholders::_1))) {
18902 Cnt += Size;
18903 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18904 ++Cnt;
18905 continue;
18906 }
18907 if (TreeSize > 1)
18908 for_each(RangeSizes.slice(Cnt, Size),
18909 [&](std::pair<unsigned, unsigned> &P) {
18910 if (Size >= MaxRegVF)
18911 P.second = std::max(P.second, TreeSize);
18912 else
18913 P.first = std::max(P.first, TreeSize);
18914 });
18915 ++Cnt;
18916 AnyProfitableGraph = true;
18917 }
18918 if (StartIdx >= End)
18919 break;
18920 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18921 AnyProfitableGraph = true;
18922 StartIdx = std::distance(
18923 RangeSizes.begin(),
18924 find_if(RangeSizes.drop_front(Sz),
18925 std::bind(IsNotVectorized, Size >= MaxRegVF,
18926 std::placeholders::_1)));
18927 }
18928 if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size))
18929 break;
18930 }
18931 // All values vectorized - exit.
18932 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
18933 return P.first == 0 && P.second == 0;
18934 }))
18935 break;
18936         // Check if we have tried all attempts or there is no need for further attempts.
18937 if (Repeat >= MaxAttempts ||
18938 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18939 break;
18940 constexpr unsigned StoresLimit = 64;
18941 const unsigned MaxTotalNum = std::min<unsigned>(
18942 Operands.size(),
18943 static_cast<unsigned>(
18944 End -
18945 std::distance(
18946 RangeSizes.begin(),
18947 find_if(RangeSizes, std::bind(IsNotVectorized, true,
18948 std::placeholders::_1))) +
18949 1));
18950 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
18951 unsigned Limit =
18952 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
18953 CandidateVFs.clear();
18954 if (bit_floor(Limit) == VF)
18955 CandidateVFs.push_back(Limit);
18956 if (VF > MaxTotalNum || VF >= StoresLimit)
18957 break;
18958 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
18959 if (P.first != 0)
18960 P.first = std::max(P.second, P.first);
18961 });
18962         // Last attempt to vectorize the maximum number of elements, if all
18963         // previous attempts were unsuccessful because of cost issues.
18964 CandidateVFs.push_back(VF);
18965 }
18966 }
18967 };
18968
18969   // Stores a pair (first: index of the store in the Stores array ref whose
18970   // address is taken as the base; second: sorted set of pairs {index, dist},
18971   // which are indices of stores in the set and their store location distances
18972   // relative to the base address).
18973
18974   // Need to store the index of the very first store separately, since the set
18975   // may be reordered after the insertion and the first store may be moved. This
18976   // container allows us to reduce the number of calls to getPointersDiff().
18977   SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
18978   // Inserts the specified store SI with the given index Idx into the set of
18979   // stores. If a store with the same distance is already present - stop the
18980   // insertion and try to vectorize the stores found so far. If some stores from
18981   // this sequence were not vectorized - try to vectorize them together with the
18982   // new store later. But this logic is applied only to the stores that come
18983   // before the previous store with the same distance.
18984 // Example:
18985 // 1. store x, %p
18986 // 2. store y, %p+1
18987 // 3. store z, %p+2
18988 // 4. store a, %p
18989 // 5. store b, %p+3
18990 // - Scan this from the last to first store. The very first bunch of stores is
18991 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
18992 // vector).
18993 // - The next store in the list - #1 - has the same distance from store #5 as
18994 // the store #4.
18995 // - Try to vectorize sequence of stores 4,2,3,5.
18996 // - If all these stores are vectorized - just drop them.
18997 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
18998 // - Start new stores sequence.
18999 // The new bunch of stores is {1, {1, 0}}.
19000   // - Add the stores from the previous sequence that were not vectorized.
19001   // Here we consider the stores in the reversed order relative to how they are
19002   // used in the IR (Stores are reversed already, see vectorizeStoreChains()).
19003   // Store #3 can be added -> it comes after store #4 and has the same distance
19004   // as store #1.
19005   // Store #5 cannot be added - it comes before store #4.
19006   // This logic helps to improve compile time: we assume that stores coming after
19007   // a previous store with the same distance most likely have memory dependencies,
19008   // so there is no need to waste compile time trying to vectorize them.
19009 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
19010 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
19011 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19012 std::optional<int> Diff = getPointersDiff(
19013 Stores[Set.first]->getValueOperand()->getType(),
19014 Stores[Set.first]->getPointerOperand(),
19015 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
19016 /*StrictCheck=*/true);
19017 if (!Diff)
19018 continue;
19019 auto It = Set.second.find(std::make_pair(Idx, *Diff));
19020 if (It == Set.second.end()) {
19021 Set.second.emplace(Idx, *Diff);
19022 return;
19023 }
19024 // Try to vectorize the first found set to avoid duplicate analysis.
19025 TryToVectorize(Set.second);
19026 unsigned ItIdx = It->first;
19027 int ItDist = It->second;
19028 StoreIndexToDistSet PrevSet;
19029 copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
19030 [&](const std::pair<unsigned, int> &Pair) {
19031 return Pair.first > ItIdx;
19032 });
19033 Set.second.clear();
19034 Set.first = Idx;
19035 Set.second.emplace(Idx, 0);
19036 // Insert stores that followed previous match to try to vectorize them
19037 // with this store.
19038 unsigned StartIdx = ItIdx + 1;
19039 SmallBitVector UsedStores(Idx - StartIdx);
19040 // Distances to previously found dup store (or this store, since they
19041 // store to the same addresses).
19042 SmallVector<int> Dists(Idx - StartIdx, 0);
19043 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
19044         // Do not try to vectorize sequences we have already tried.
19045 if (VectorizedStores.contains(Stores[Pair.first]))
19046 break;
19047 unsigned BI = Pair.first - StartIdx;
19048 UsedStores.set(BI);
19049 Dists[BI] = Pair.second - ItDist;
19050 }
19051 for (unsigned I = StartIdx; I < Idx; ++I) {
19052 unsigned BI = I - StartIdx;
19053 if (UsedStores.test(BI))
19054 Set.second.emplace(I, Dists[BI]);
19055 }
19056 return;
19057 }
19058 auto &Res = SortedStores.emplace_back();
19059 Res.first = Idx;
19060 Res.second.emplace(Idx, 0);
19061 };
19062 Type *PrevValTy = nullptr;
19063 for (auto [I, SI] : enumerate(Stores)) {
19064 if (R.isDeleted(SI))
19065 continue;
19066 if (!PrevValTy)
19067 PrevValTy = SI->getValueOperand()->getType();
19068 // Check that we do not try to vectorize stores of different types.
19069 if (PrevValTy != SI->getValueOperand()->getType()) {
19070 for (auto &Set : SortedStores)
19071 TryToVectorize(Set.second);
19072 SortedStores.clear();
19073 PrevValTy = SI->getValueOperand()->getType();
19074 }
19075 FillStoresSet(I, SI);
19076 }
19077
19078 // Final vectorization attempt.
19079 for (auto &Set : SortedStores)
19080 TryToVectorize(Set.second);
19081
19082 return Changed;
19083}
19084
19085void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
19086 // Initialize the collections. We will make a single pass over the block.
19087 Stores.clear();
19088 GEPs.clear();
19089
19090 // Visit the store and getelementptr instructions in BB and organize them in
19091 // Stores and GEPs according to the underlying objects of their pointer
19092 // operands.
19093 for (Instruction &I : *BB) {
19094 // Ignore store instructions that are volatile or have a pointer operand
19095 // that doesn't point to a scalar type.
19096 if (auto *SI = dyn_cast<StoreInst>(&I)) {
19097 if (!SI->isSimple())
19098 continue;
19099 if (!isValidElementType(SI->getValueOperand()->getType()))
19100 continue;
19101 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
19102 }
19103
19104 // Ignore getelementptr instructions that have more than one index, a
19105 // constant index, or a pointer operand that doesn't point to a scalar
19106 // type.
19107 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
19108 if (GEP->getNumIndices() != 1)
19109 continue;
19110 Value *Idx = GEP->idx_begin()->get();
19111 if (isa<Constant>(Idx))
19112 continue;
19113 if (!isValidElementType(Idx->getType()))
19114 continue;
19115 if (GEP->getType()->isVectorTy())
19116 continue;
19117 GEPs[GEP->getPointerOperand()].push_back(GEP);
19118 }
19119 }
19120}
19121
19122bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
19123 bool MaxVFOnly) {
19124 if (VL.size() < 2)
19125 return false;
19126
19127 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
19128 << VL.size() << ".\n");
19129
19130 // Check that all of the parts are instructions of the same type,
19131 // we permit an alternate opcode via InstructionsState.
19132 InstructionsState S = getSameOpcode(VL, *TLI);
19133 if (!S)
19134 return false;
19135
19136 Instruction *I0 = S.getMainOp();
19137 // Make sure invalid types (including vector type) are rejected before
19138 // determining vectorization factor for scalar instructions.
19139 for (Value *V : VL) {
19140 Type *Ty = V->getType();
19141 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
19142       // NOTE: the following will give the user an internal LLVM type name,
19143       // which may not be useful.
19144 R.getORE()->emit([&]() {
19145 std::string TypeStr;
19146 llvm::raw_string_ostream rso(TypeStr);
19147 Ty->print(rso);
19148 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
19149 << "Cannot SLP vectorize list: type "
19150 << TypeStr + " is unsupported by vectorizer";
19151 });
19152 return false;
19153 }
19154 }
19155
19156 unsigned Sz = R.getVectorElementSize(I0);
19157 unsigned MinVF = R.getMinVF(Sz);
19158 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
19159 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19160 if (MaxVF < 2) {
19161 R.getORE()->emit([&]() {
19162 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
19163 << "Cannot SLP vectorize list: vectorization factor "
19164 << "less than 2 is not supported";
19165 });
19166 return false;
19167 }
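  // For example (illustrative values): for a list of 7 candidates with MinVF = 2,
  // MaxVF starts at bit_floor(7) = 4 (if the target allows it) and the loops
  // below first try bundles of 4 consecutive non-deleted values, then fall back
  // to bundles of 2.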
19168
19169 bool Changed = false;
19170 bool CandidateFound = false;
19171 InstructionCost MinCost = SLPCostThreshold.getValue();
19172 Type *ScalarTy = getValueType(VL[0]);
19173
19174 unsigned NextInst = 0, MaxInst = VL.size();
19175 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
19176     // No actual vectorization should happen if the number of parts is the
19177     // same as the provided vectorization factor (i.e. the scalar type is
19178     // used for vector code during codegen).
19179 auto *VecTy = getWidenedType(ScalarTy, VF);
19180 if (TTI->getNumberOfParts(VecTy) == VF)
19181 continue;
19182 for (unsigned I = NextInst; I < MaxInst; ++I) {
19183 unsigned ActualVF = std::min(MaxInst - I, VF);
19184
19185 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
19186 continue;
19187
19188 if (MaxVFOnly && ActualVF < MaxVF)
19189 break;
19190 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
19191 break;
19192
19193 SmallVector<Value *> Ops(ActualVF, nullptr);
19194 unsigned Idx = 0;
19195 for (Value *V : VL.drop_front(I)) {
19196 // Check that a previous iteration of this loop did not delete the
19197 // Value.
19198 if (auto *Inst = dyn_cast<Instruction>(V);
19199 !Inst || !R.isDeleted(Inst)) {
19200 Ops[Idx] = V;
19201 ++Idx;
19202 if (Idx == ActualVF)
19203 break;
19204 }
19205 }
19206 // Not enough vectorizable instructions - exit.
19207 if (Idx != ActualVF)
19208 break;
19209
19210 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
19211 << "\n");
19212
19213 R.buildTree(Ops);
19214 if (R.isTreeTinyAndNotFullyVectorizable())
19215 continue;
19216 R.reorderTopToBottom();
19217 R.reorderBottomToTop(
19218 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
19219 !R.doesRootHaveInTreeUses());
19220 R.transformNodes();
19221 R.buildExternalUses();
19222
19223 R.computeMinimumValueSizes();
19224 InstructionCost Cost = R.getTreeCost();
19225 CandidateFound = true;
19226 MinCost = std::min(MinCost, Cost);
19227
19228 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
19229 << " for VF=" << ActualVF << "\n");
19230 if (Cost < -SLPCostThreshold) {
19231 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
19232 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
19233 cast<Instruction>(Ops[0]))
19234 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
19235 << " and with tree size "
19236 << ore::NV("TreeSize", R.getTreeSize()));
19237
19238 R.vectorizeTree();
19239 // Move to the next bundle.
19240 I += VF - 1;
19241 NextInst = I + 1;
19242 Changed = true;
19243 }
19244 }
19245 }
19246
19247 if (!Changed && CandidateFound) {
19248 R.getORE()->emit([&]() {
19249 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
19250 << "List vectorization was possible but not beneficial with cost "
19251 << ore::NV("Cost", MinCost) << " >= "
19252 << ore::NV("Treshold", -SLPCostThreshold);
19253 });
19254 } else if (!Changed) {
19255 R.getORE()->emit([&]() {
19256 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
19257 << "Cannot SLP vectorize list: vectorization was impossible"
19258 << " with available vectorization factors";
19259 });
19260 }
19261 return Changed;
19262}
19263
19264bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
19265 if (!I)
19266 return false;
19267
19268 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
19269 return false;
19270
19271 Value *P = I->getParent();
19272
19273 // Vectorize in current basic block only.
19274 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
19275 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
19276 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
19277 R.isDeleted(Op0) || R.isDeleted(Op1))
19278 return false;
19279
19280   // First, collect all possible candidates.
19281   SmallVector<std::pair<Value *, Value *>, 4> Candidates;
19282   Candidates.emplace_back(Op0, Op1);
19283
19284 auto *A = dyn_cast<BinaryOperator>(Op0);
19285 auto *B = dyn_cast<BinaryOperator>(Op1);
19286 // Try to skip B.
19287 if (A && B && B->hasOneUse()) {
19288 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
19289 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
19290 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
19291 Candidates.emplace_back(A, B0);
19292 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
19293 Candidates.emplace_back(A, B1);
19294 }
19295 // Try to skip A.
19296 if (B && A && A->hasOneUse()) {
19297 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
19298 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
19299 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
19300 Candidates.emplace_back(A0, B);
19301 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
19302 Candidates.emplace_back(A1, B);
19303 }
19304
19305 if (Candidates.size() == 1)
19306 return tryToVectorizeList({Op0, Op1}, R);
19307
19308 // We have multiple options. Try to pick the single best.
19309 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
19310 if (!BestCandidate)
19311 return false;
19312 return tryToVectorizeList(
19313 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
19314}
19315
19316namespace {
19317
19318/// Model horizontal reductions.
19319///
19320/// A horizontal reduction is a tree of reduction instructions that has values
19321/// that can be put into a vector as its leaves. For example:
19322///
19323/// mul mul mul mul
19324/// \ / \ /
19325/// + +
19326/// \ /
19327/// +
19328/// This tree has "mul" as its leaf values and "+" as its reduction
19329/// instructions. A reduction can feed into a store or a binary operation
19330/// feeding a phi.
19331/// ...
19332/// \ /
19333/// +
19334/// |
19335/// phi +=
19336///
19337/// Or:
19338/// ...
19339/// \ /
19340/// +
19341/// |
19342/// *p =
19343///
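///
/// An illustrative IR shape (example only) recognized as an integer add
/// reduction:
///   %a0 = add i32 %x0, %x1
///   %a1 = add i32 %a0, %x2
///   %a2 = add i32 %a1, %x3
/// Here the adds are the reduction operations and %x0..%x3 are the reduced
/// values that may be gathered into a vector.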
19344class HorizontalReduction {
19345 using ReductionOpsType = SmallVector<Value *, 16>;
19346 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
19347 ReductionOpsListType ReductionOps;
19348 /// List of possibly reduced values.
19350 /// Maps reduced value to the corresponding reduction operation.
19352 WeakTrackingVH ReductionRoot;
19353 /// The type of reduction operation.
19354 RecurKind RdxKind;
19355 /// Checks if the optimization of original scalar identity operations on
19356 /// matched horizontal reductions is enabled and allowed.
19357 bool IsSupportedHorRdxIdentityOp = false;
19358
19359 static bool isCmpSelMinMax(Instruction *I) {
19360     return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
19361            RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
19362   }
19363
19364 // And/or are potentially poison-safe logical patterns like:
19365 // select x, y, false
19366 // select x, true, y
19367 static bool isBoolLogicOp(Instruction *I) {
19368 return isa<SelectInst>(I) &&
19369 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
19370 }
19371
19372 /// Checks if instruction is associative and can be vectorized.
19373 static bool isVectorizable(RecurKind Kind, Instruction *I) {
19374 if (Kind == RecurKind::None)
19375 return false;
19376
19377 // Integer ops that map to select instructions or intrinsics are fine.
19378     if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
19379         isBoolLogicOp(I))
19380 return true;
19381
19382 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19383 // FP min/max are associative except for NaN and -0.0. We do not
19384 // have to rule out -0.0 here because the intrinsic semantics do not
19385 // specify a fixed result for it.
19386 return I->getFastMathFlags().noNaNs();
19387 }
19388
19389 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19390 return true;
19391
19392 return I->isAssociative();
19393 }
19394
19395 static Value *getRdxOperand(Instruction *I, unsigned Index) {
19396 // Poison-safe 'or' takes the form: select X, true, Y
19397 // To make that work with the normal operand processing, we skip the
19398 // true value operand.
19399 // TODO: Change the code and data structures to handle this without a hack.
19400 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
19401 return I->getOperand(2);
19402 return I->getOperand(Index);
19403 }
19404
19405 /// Creates reduction operation with the current opcode.
19406 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
19407 Value *RHS, const Twine &Name, bool UseSelect) {
19408 switch (Kind) {
19409 case RecurKind::Or: {
19410       if (UseSelect &&
19411           LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
19412         return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
19413 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19414 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19415 Name);
19416 }
19417 case RecurKind::And: {
19418       if (UseSelect &&
19419           LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
19420         return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
19421 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19422 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19423 Name);
19424 }
19425 case RecurKind::Add:
19426 case RecurKind::Mul:
19427 case RecurKind::Xor:
19428 case RecurKind::FAdd:
19429 case RecurKind::FMul: {
19430 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19431 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19432 Name);
19433 }
19434 case RecurKind::FMax:
19435 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
19436 case RecurKind::FMin:
19437 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
19438 case RecurKind::FMaximum:
19439 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
19440 case RecurKind::FMinimum:
19441 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
19442 case RecurKind::SMax:
19443 if (UseSelect) {
19444 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
19445 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19446 }
19447 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
19448 case RecurKind::SMin:
19449 if (UseSelect) {
19450 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
19451 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19452 }
19453 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
19454 case RecurKind::UMax:
19455 if (UseSelect) {
19456 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
19457 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19458 }
19459 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
19460 case RecurKind::UMin:
19461 if (UseSelect) {
19462 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
19463 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19464 }
19465 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
19466 default:
19467 llvm_unreachable("Unknown reduction operation.");
19468 }
19469 }
19470
19471 /// Creates reduction operation with the current opcode with the IR flags
19472 /// from \p ReductionOps, dropping nuw/nsw flags.
19473 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
19474 Value *RHS, const Twine &Name,
19475 const ReductionOpsListType &ReductionOps) {
19476 bool UseSelect = ReductionOps.size() == 2 ||
19477 // Logical or/and.
19478 (ReductionOps.size() == 1 &&
19479 any_of(ReductionOps.front(), IsaPred<SelectInst>));
19480 assert((!UseSelect || ReductionOps.size() != 2 ||
19481 isa<SelectInst>(ReductionOps[1][0])) &&
19482 "Expected cmp + select pairs for reduction");
19483 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
19484 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
19485 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
19486 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
19487 /*IncludeWrapFlags=*/false);
19488 propagateIRFlags(Op, ReductionOps[1], nullptr,
19489 /*IncludeWrapFlags=*/false);
19490 return Op;
19491 }
19492 }
19493 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
19494 return Op;
19495 }
19496
19497public:
19498 static RecurKind getRdxKind(Value *V) {
19499 auto *I = dyn_cast<Instruction>(V);
19500 if (!I)
19501 return RecurKind::None;
19502 if (match(I, m_Add(m_Value(), m_Value())))
19503 return RecurKind::Add;
19504 if (match(I, m_Mul(m_Value(), m_Value())))
19505 return RecurKind::Mul;
19506 if (match(I, m_And(m_Value(), m_Value())) ||
19507 match(I, m_LogicalAnd(m_Value(), m_Value())))
19508 return RecurKind::And;
19509 if (match(I, m_Or(m_Value(), m_Value())) ||
19510 match(I, m_LogicalOr(m_Value(), m_Value())))
19511 return RecurKind::Or;
19512 if (match(I, m_Xor(m_Value(), m_Value())))
19513 return RecurKind::Xor;
19514 if (match(I, m_FAdd(m_Value(), m_Value())))
19515 return RecurKind::FAdd;
19516 if (match(I, m_FMul(m_Value(), m_Value())))
19517 return RecurKind::FMul;
19518
19519 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
19520 return RecurKind::FMax;
19521 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
19522 return RecurKind::FMin;
19523
19524 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
19525 return RecurKind::FMaximum;
19526 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
19527 return RecurKind::FMinimum;
19528 // This matches either cmp+select or intrinsics. SLP is expected to handle
19529 // either form.
19530 // TODO: If we are canonicalizing to intrinsics, we can remove several
19531 // special-case paths that deal with selects.
19532 if (match(I, m_SMax(m_Value(), m_Value())))
19533 return RecurKind::SMax;
19534 if (match(I, m_SMin(m_Value(), m_Value())))
19535 return RecurKind::SMin;
19536 if (match(I, m_UMax(m_Value(), m_Value())))
19537 return RecurKind::UMax;
19538 if (match(I, m_UMin(m_Value(), m_Value())))
19539 return RecurKind::UMin;
19540
19541 if (auto *Select = dyn_cast<SelectInst>(I)) {
19542 // Try harder: look for min/max pattern based on instructions producing
19543 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
19544 // During the intermediate stages of SLP, it's very common to have
19545 // pattern like this (since optimizeGatherSequence is run only once
19546 // at the end):
19547 // %1 = extractelement <2 x i32> %a, i32 0
19548 // %2 = extractelement <2 x i32> %a, i32 1
19549 // %cond = icmp sgt i32 %1, %2
19550 // %3 = extractelement <2 x i32> %a, i32 0
19551 // %4 = extractelement <2 x i32> %a, i32 1
19552 // %select = select i1 %cond, i32 %3, i32 %4
19553 CmpPredicate Pred;
19554 Instruction *L1;
19555 Instruction *L2;
19556
19557 Value *LHS = Select->getTrueValue();
19558 Value *RHS = Select->getFalseValue();
19559 Value *Cond = Select->getCondition();
19560
19561 // TODO: Support inverse predicates.
19562 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
19563 if (!isa<ExtractElementInst>(RHS) ||
19564 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19565 return RecurKind::None;
19566 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
19567 if (!isa<ExtractElementInst>(LHS) ||
19568 !L1->isIdenticalTo(cast<Instruction>(LHS)))
19569 return RecurKind::None;
19570 } else {
19571 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
19572 return RecurKind::None;
19573 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
19574 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
19575 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19576 return RecurKind::None;
19577 }
19578
19579 switch (Pred) {
19580 default:
19581 return RecurKind::None;
19582 case CmpInst::ICMP_SGT:
19583 case CmpInst::ICMP_SGE:
19584 return RecurKind::SMax;
19585 case CmpInst::ICMP_SLT:
19586 case CmpInst::ICMP_SLE:
19587 return RecurKind::SMin;
19588 case CmpInst::ICMP_UGT:
19589 case CmpInst::ICMP_UGE:
19590 return RecurKind::UMax;
19591 case CmpInst::ICMP_ULT:
19592 case CmpInst::ICMP_ULE:
19593 return RecurKind::UMin;
19594 }
19595 }
19596 return RecurKind::None;
19597 }
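// Illustrative sketch (not from the original file): as the comment above
// notes, m_SMax and the related matchers accept both the llvm.smax intrinsic
// and the equivalent "icmp + select" idiom, so one query covers both forms.
static bool isSMaxLikeSketch(Instruction *I) {
  Value *A, *B;
  // Matches "call @llvm.smax(A, B)" as well as
  // "%c = icmp sgt A, B; select %c, A, B".
  return match(I, m_SMax(m_Value(A), m_Value(B)));
}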
19598
19599 /// Get the index of the first operand.
19600 static unsigned getFirstOperandIndex(Instruction *I) {
19601 return isCmpSelMinMax(I) ? 1 : 0;
19602 }
19603
19604private:
19605 /// Total number of operands in the reduction operation.
19606 static unsigned getNumberOfOperands(Instruction *I) {
19607 return isCmpSelMinMax(I) ? 3 : 2;
19608 }
19609
19610 /// Checks if the instruction is in basic block \p BB.
19611 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
19612 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
19613 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
19614 auto *Sel = cast<SelectInst>(I);
19615 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
19616 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
19617 }
19618 return I->getParent() == BB;
19619 }
19620
19621 /// Expected number of uses for reduction operations/reduced values.
19622 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
19623 if (IsCmpSelMinMax) {
19624 // SelectInst must be used twice while the condition op must have a single
19625 // use only.
19626 if (auto *Sel = dyn_cast<SelectInst>(I))
19627 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
19628 return I->hasNUses(2);
19629 }
19630
19631 // Arithmetic reduction operation must be used once only.
19632 return I->hasOneUse();
19633 }
19634
19635 /// Initializes the list of reduction operations.
19636 void initReductionOps(Instruction *I) {
19637 if (isCmpSelMinMax(I))
19638 ReductionOps.assign(2, ReductionOpsType());
19639 else
19640 ReductionOps.assign(1, ReductionOpsType());
19641 }
19642
19643 /// Add all reduction operations for the reduction instruction \p I.
19644 void addReductionOps(Instruction *I) {
19645 if (isCmpSelMinMax(I)) {
19646 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
19647 ReductionOps[1].emplace_back(I);
19648 } else {
19649 ReductionOps[0].emplace_back(I);
19650 }
19651 }
19652
19653 static bool isGoodForReduction(ArrayRef<Value *> Data) {
19654 int Sz = Data.size();
19655 auto *I = dyn_cast<Instruction>(Data.front());
19656 return Sz > 1 || isConstant(Data.front()) ||
19657 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
19658 }
19659
19660public:
19661 HorizontalReduction() = default;
19662
19663 /// Try to find a reduction tree.
19664 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
19665 ScalarEvolution &SE, const DataLayout &DL,
19666 const TargetLibraryInfo &TLI) {
19667 RdxKind = HorizontalReduction::getRdxKind(Root);
19668 if (!isVectorizable(RdxKind, Root))
19669 return false;
19670
19671 // Analyze "regular" integer/FP types for reductions - no target-specific
19672 // types or pointers.
19673 Type *Ty = Root->getType();
19674 if (!isValidElementType(Ty) || Ty->isPointerTy())
19675 return false;
19676
19677 // Though the ultimate reduction may have multiple uses, its condition must
19678 // have only a single use.
19679 if (auto *Sel = dyn_cast<SelectInst>(Root))
19680 if (!Sel->getCondition()->hasOneUse())
19681 return false;
19682
19683 ReductionRoot = Root;
19684
19685 // Iterate through all the operands of the possible reduction tree and
19686 // gather all the reduced values, sorting them by their value id.
19687 BasicBlock *BB = Root->getParent();
19688 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19689 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
19690 1, std::make_pair(Root, 0));
19691 // Checks if the operands of the \p TreeN instruction are also reduction
19692 // operations or should be treated as reduced values or an extra argument,
19693 // which is not part of the reduction.
19694 auto CheckOperands = [&](Instruction *TreeN,
19695 SmallVectorImpl<Value *> &PossibleReducedVals,
19696 SmallVectorImpl<Instruction *> &ReductionOps,
19697 unsigned Level) {
19698 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
19699 getNumberOfOperands(TreeN)))) {
19700 Value *EdgeVal = getRdxOperand(TreeN, I);
19701 ReducedValsToOps[EdgeVal].push_back(TreeN);
19702 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19703 // If the edge is not an instruction, or it is different from the main
19704 // reduction opcode or has too many uses - possible reduced value.
19705 // Also, do not try to reduce const values, if the operation is not
19706 // foldable.
19707 if (!EdgeInst || Level > RecursionMaxDepth ||
19708 getRdxKind(EdgeInst) != RdxKind ||
19709 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19710 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19711 !isVectorizable(RdxKind, EdgeInst) ||
19712 (R.isAnalyzedReductionRoot(EdgeInst) &&
19713 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19714 PossibleReducedVals.push_back(EdgeVal);
19715 continue;
19716 }
19717 ReductionOps.push_back(EdgeInst);
19718 }
19719 };
19720 // Try to regroup reduced values so that it gets more profitable to try to
19721 // reduce them. Values are grouped by their value ids, instructions - by
19722 // instruction op id and/or alternate op id, plus do extra analysis for
19723 // loads (grouping them by the distance between pointers) and cmp
19724 // instructions (grouping them by the predicate).
19725 SmallMapVector<
19726 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
19727 8>
19728 PossibleReducedVals;
19729 initReductionOps(Root);
19730 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
19731 SmallSet<size_t, 2> LoadKeyUsed;
19732
19733 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
19734 Key = hash_combine(hash_value(LI->getParent()), Key);
19735 Value *Ptr =
19736 getUnderlyingObject(LI->getPointerOperand());
19737 if (!LoadKeyUsed.insert(Key).second) {
19738 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
19739 if (LIt != LoadsMap.end()) {
19740 for (LoadInst *RLI : LIt->second) {
19741 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
19742 LI->getType(), LI->getPointerOperand(), DL, SE,
19743 /*StrictCheck=*/true))
19744 return hash_value(RLI->getPointerOperand());
19745 }
19746 for (LoadInst *RLI : LIt->second) {
19747 if (arePointersCompatible(RLI->getPointerOperand(),
19748 LI->getPointerOperand(), TLI)) {
19749 hash_code SubKey = hash_value(RLI->getPointerOperand());
19750 return SubKey;
19751 }
19752 }
19753 if (LIt->second.size() > 2) {
19754 hash_code SubKey =
19755 hash_value(LIt->second.back()->getPointerOperand());
19756 return SubKey;
19757 }
19758 }
19759 }
19760 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
19761 .first->second.push_back(LI);
19762 return hash_value(LI->getPointerOperand());
19763 };
19764
19765 while (!Worklist.empty()) {
19766 auto [TreeN, Level] = Worklist.pop_back_val();
19767 SmallVector<Value *> PossibleRedVals;
19768 SmallVector<Instruction *> PossibleReductionOps;
19769 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19770 addReductionOps(TreeN);
19771 // Add reduction values. The values are sorted for better vectorization
19772 // results.
19773 for (Value *V : PossibleRedVals) {
19774 size_t Key, Idx;
19775 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
19776 /*AllowAlternate=*/false);
19777 ++PossibleReducedVals[Key][Idx]
19778 .insert(std::make_pair(V, 0))
19779 .first->second;
19780 }
19781 for (Instruction *I : reverse(PossibleReductionOps))
19782 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
19783 }
19784 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
19785 // Sort values by the total number of value kinds to start the reduction
19786 // from the longest possible reduced values sequences.
19787 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
19788 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
19789 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
19790 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
19791 It != E; ++It) {
19792 PossibleRedValsVect.emplace_back();
19793 auto RedValsVect = It->second.takeVector();
19794 stable_sort(RedValsVect, llvm::less_second());
19795 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
19796 PossibleRedValsVect.back().append(Data.second, Data.first);
19797 }
19798 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
19799 return P1.size() > P2.size();
19800 });
19801 int NewIdx = -1;
19802 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
19803 if (NewIdx < 0 ||
19804 (!isGoodForReduction(Data) &&
19805 (!isa<LoadInst>(Data.front()) ||
19806 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19807 getUnderlyingObject(
19808 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19809 getUnderlyingObject(
19810 cast<LoadInst>(ReducedVals[NewIdx].front())
19811 ->getPointerOperand())))) {
19812 NewIdx = ReducedVals.size();
19813 ReducedVals.emplace_back();
19814 }
19815 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
19816 }
19817 }
19818 // Sort the reduced values by number of same/alternate opcode and/or pointer
19819 // operand.
19820 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
19821 return P1.size() > P2.size();
19822 });
19823 return true;
19824 }
19825
19826 /// Attempt to vectorize the tree found by matchAssociativeReduction.
19827 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
19828 const TargetLibraryInfo &TLI, AssumptionCache *AC) {
19829 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
19830 constexpr unsigned RegMaxNumber = 4;
19831 constexpr unsigned RedValsMaxNumber = 128;
19832 // If there are a sufficient number of reduction values, reduce
19833 // to a nearby power-of-2. We can safely generate oversized
19834 // vectors and rely on the backend to split them to legal sizes.
19835 if (unsigned NumReducedVals = std::accumulate(
19836 ReducedVals.begin(), ReducedVals.end(), 0,
19837 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
19838 if (!isGoodForReduction(Vals))
19839 return Num;
19840 return Num + Vals.size();
19841 });
19842 NumReducedVals < ReductionLimit &&
19843 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
19844 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
19845 })) {
19846 for (ReductionOpsType &RdxOps : ReductionOps)
19847 for (Value *RdxOp : RdxOps)
19848 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19849 return nullptr;
19850 }
19851
19852 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
19853 TargetFolder(DL));
19854 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
19855
19856 // Track the reduced values in case they are replaced by extractelement
19857 // because of the vectorization.
19858 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
19859 ReducedVals.front().size());
19860
19861 // The compare instruction of a min/max is the insertion point for new
19862 // instructions and may be replaced with a new compare instruction.
19863 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
19864 assert(isa<SelectInst>(RdxRootInst) &&
19865 "Expected min/max reduction to have select root instruction");
19866 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19867 assert(isa<Instruction>(ScalarCond) &&
19868 "Expected min/max reduction to have compare condition");
19869 return cast<Instruction>(ScalarCond);
19870 };
19871
19872 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
19873 return isBoolLogicOp(cast<Instruction>(V));
19874 });
19875 // Return new VectorizedTree, based on previous value.
19876 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
19877 if (VectorizedTree) {
19878 // Update the final value in the reduction.
19879 Builder.SetCurrentDebugLocation(
19880 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19881 if (AnyBoolLogicOp) {
19882 auto It = ReducedValsToOps.find(VectorizedTree);
19883 auto It1 = ReducedValsToOps.find(Res);
19884 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
19885 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
19886 (It != ReducedValsToOps.end() &&
19887 any_of(It->getSecond(), [&](Instruction *I) {
19888 return isBoolLogicOp(I) &&
19889 getRdxOperand(I, 0) == VectorizedTree;
19890 }))) {
19891 ;
19892 } else if (isGuaranteedNotToBePoison(Res, AC) ||
19893 (It1 != ReducedValsToOps.end() &&
19894 any_of(It1->getSecond(), [&](Instruction *I) {
19895 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19896 }))) {
19897 std::swap(VectorizedTree, Res);
19898 } else {
19899 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
19900 }
19901 }
19902
19903 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
19904 ReductionOps);
19905 }
19906 // Initialize the final value in the reduction.
19907 return Res;
19908 };
19909 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
19910 ReductionOps.front().size());
19911 for (ReductionOpsType &RdxOps : ReductionOps)
19912 for (Value *RdxOp : RdxOps) {
19913 if (!RdxOp)
19914 continue;
19915 IgnoreList.insert(RdxOp);
19916 }
19917 // Intersect the fast-math-flags from all reduction operations.
19918 FastMathFlags RdxFMF;
19919 RdxFMF.set();
19920 for (Value *U : IgnoreList)
19921 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
19922 RdxFMF &= FPMO->getFastMathFlags();
19923 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19924
19925 // Need to track reduced vals, they may be changed during vectorization of
19926 // subvectors.
19927 for (ArrayRef<Value *> Candidates : ReducedVals)
19928 for (Value *V : Candidates)
19929 TrackedVals.try_emplace(V, V);
19930
19931 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
19932 Value *V) -> unsigned & {
19933 auto *It = MV.find(V);
19934 assert(It != MV.end() && "Unable to find given key.");
19935 return It->second;
19936 };
19937
19938 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
19939 // List of the values that were reduced in other trees as part of gather
19940 // nodes and thus requiring extract if fully vectorized in other trees.
19941 SmallPtrSet<Value *, 4> RequiredExtract;
19942 WeakTrackingVH VectorizedTree = nullptr;
19943 bool CheckForReusedReductionOps = false;
19944 // Try to vectorize elements based on their type.
19945 SmallVector<InstructionsState> States;
19946 for (ArrayRef<Value *> RV : ReducedVals)
19947 States.push_back(getSameOpcode(RV, TLI));
19948 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
19949 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
19950 InstructionsState S = States[I];
19951 SmallVector<Value *> Candidates;
19952 Candidates.reserve(2 * OrigReducedVals.size());
19953 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
19954 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19955 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19956 // Check if the reduction value was not overridden by the extractelement
19957 // instruction because of the vectorization and exclude it, if it is not
19958 // compatible with other values.
19959 // Also check if the instruction was folded to constant/other value.
19960 auto *Inst = dyn_cast<Instruction>(RdxVal);
19961 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
19962 (!S || !S.isOpcodeOrAlt(Inst))) ||
19963 (S && !Inst))
19964 continue;
19965 Candidates.push_back(RdxVal);
19966 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19967 }
19968 bool ShuffledExtracts = false;
19969 // Try to handle shuffled extractelements.
19970 if (S && S.getOpcode() == Instruction::ExtractElement &&
19971 !S.isAltShuffle() && I + 1 < E) {
19972 SmallVector<Value *> CommonCandidates(Candidates);
19973 for (Value *RV : ReducedVals[I + 1]) {
19974 Value *RdxVal = TrackedVals.at(RV);
19975 // Check if the reduction value was not overridden by the
19976 // extractelement instruction because of the vectorization and
19977 // exclude it, if it is not compatible with other values.
19978 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19979 if (!Inst)
19980 continue;
19981 CommonCandidates.push_back(RdxVal);
19982 TrackedToOrig.try_emplace(RdxVal, RV);
19983 }
19984 SmallVector<int> Mask;
19985 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
19986 ++I;
19987 Candidates.swap(CommonCandidates);
19988 ShuffledExtracts = true;
19989 }
19990 }
19991
19992 // Emit code for constant values.
19993 if (Candidates.size() > 1 && allConstant(Candidates)) {
19994 Value *Res = Candidates.front();
19995 Value *OrigV = TrackedToOrig.at(Candidates.front());
19996 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19997 for (Value *VC : ArrayRef(Candidates).drop_front()) {
19998 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
19999 Value *OrigV = TrackedToOrig.at(VC);
20000 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20001 if (auto *ResI = dyn_cast<Instruction>(Res))
20002 V.analyzedReductionRoot(ResI);
20003 }
20004 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
20005 continue;
20006 }
20007
20008 unsigned NumReducedVals = Candidates.size();
20009 if (NumReducedVals < ReductionLimit &&
20010 (NumReducedVals < 2 || !isSplat(Candidates)))
20011 continue;
20012
20013 // Check if we support repeated scalar values processing (optimization of
20014 // original scalar identity operations on matched horizontal reductions).
20015 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20016 RdxKind != RecurKind::FMul &&
20017 RdxKind != RecurKind::FMulAdd;
20018 // Gather same values.
20019 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
20020 if (IsSupportedHorRdxIdentityOp)
20021 for (Value *V : Candidates) {
20022 Value *OrigV = TrackedToOrig.at(V);
20023 ++SameValuesCounter.try_emplace(OrigV).first->second;
20024 }
20025 // Used to check if the reduced values are used the same number of times. In this
20026 // case the compiler may produce better code. E.g. if reduced values are
20027 // aabbccdd (8 x values), then the first node of the tree will have a node
20028 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
20029 // Plus, the final reduction will be performed on <8 x aabbccdd>.
20030 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
20031 // x abcd) * 2.
20032 // Currently it only handles add/fadd/xor. and/or/min/max do not require
20033 // this analysis, other operations may require an extra estimation of
20034 // the profitability.
20035 bool SameScaleFactor = false;
20036 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20037 SameValuesCounter.size() != Candidates.size();
20038 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
20039 if (OptReusedScalars) {
20040 SameScaleFactor =
20041 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20042 RdxKind == RecurKind::Xor) &&
20043 all_of(drop_begin(SameValuesCounter),
20044 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
20045 return P.second == SameValuesCounter.front().second;
20046 });
20047 Candidates.resize(SameValuesCounter.size());
20048 transform(SameValuesCounter, Candidates.begin(),
20049 [&](const auto &P) { return TrackedVals.at(P.first); });
20050 NumReducedVals = Candidates.size();
20051 // Have a reduction of the same element.
20052 if (NumReducedVals == 1) {
20053 Value *OrigV = TrackedToOrig.at(Candidates.front());
20054 unsigned Cnt = At(SameValuesCounter, OrigV);
20055 Value *RedVal =
20056 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20057 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20058 VectorizedVals.try_emplace(OrigV, Cnt);
20059 ExternallyUsedValues.insert(OrigV);
20060 continue;
20061 }
20062 }
20063
20064 unsigned MaxVecRegSize = V.getMaxVecRegSize();
20065 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
20066 const unsigned MaxElts = std::clamp<unsigned>(
20067 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
20068 RegMaxNumber * RedValsMaxNumber);
20069
20070 unsigned ReduxWidth = NumReducedVals;
20071 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
20072 unsigned NumParts, NumRegs;
20073 Type *ScalarTy = Candidates.front()->getType();
20074 ReduxWidth =
20075 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
20076 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20077 NumParts = TTI.getNumberOfParts(Tp);
20078 NumRegs =
20079 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20080 while (NumParts > NumRegs) {
20081 ReduxWidth = bit_floor(ReduxWidth - 1);
20082 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20083 NumParts = TTI.getNumberOfParts(Tp);
20084 NumRegs =
20085 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20086 }
20087 if (NumParts > NumRegs / 2)
20088 ReduxWidth = bit_floor(ReduxWidth);
20089 return ReduxWidth;
20090 };
20091 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
20092 ReduxWidth = GetVectorFactor(ReduxWidth);
20093 ReduxWidth = std::min(ReduxWidth, MaxElts);
20094
20095 unsigned Start = 0;
20096 unsigned Pos = Start;
20097 // Restarts vectorization attempt with lower vector factor.
20098 unsigned PrevReduxWidth = ReduxWidth;
20099 bool CheckForReusedReductionOpsLocal = false;
20100 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
20101 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
20102 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20103 // Check if any of the reduction ops are gathered. If so, worth
20104 // trying again with a smaller number of reduction ops.
20105 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20106 }
20107 ++Pos;
20108 if (Pos < NumReducedVals - ReduxWidth + 1)
20109 return IsAnyRedOpGathered;
20110 Pos = Start;
20111 --ReduxWidth;
20112 if (ReduxWidth > 1)
20113 ReduxWidth = GetVectorFactor(ReduxWidth);
20114 return IsAnyRedOpGathered;
20115 };
20116 bool AnyVectorized = false;
20117 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
20118 while (Pos < NumReducedVals - ReduxWidth + 1 &&
20119 ReduxWidth >= ReductionLimit) {
20120 // Dependency in tree of the reduction ops - drop this attempt, try
20121 // later.
20122 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20123 Start == 0) {
20124 CheckForReusedReductionOps = true;
20125 break;
20126 }
20127 PrevReduxWidth = ReduxWidth;
20128 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
20129 // Been analyzed already - skip.
20130 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
20131 (!has_single_bit(ReduxWidth) &&
20132 (IgnoredCandidates.contains(
20133 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
20134 IgnoredCandidates.contains(
20135 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
20136 bit_floor(ReduxWidth))))) ||
20137 V.areAnalyzedReductionVals(VL)) {
20138 (void)AdjustReducedVals(/*IgnoreVL=*/true);
20139 continue;
20140 }
20141 // Early exit if any of the reduction values were deleted during
20142 // previous vectorization attempts.
20143 if (any_of(VL, [&V](Value *RedVal) {
20144 auto *RedValI = dyn_cast<Instruction>(RedVal);
20145 if (!RedValI)
20146 return false;
20147 return V.isDeleted(RedValI);
20148 }))
20149 break;
20150 V.buildTree(VL, IgnoreList);
20151 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
20152 if (!AdjustReducedVals())
20153 V.analyzedReductionVals(VL);
20154 continue;
20155 }
20156 if (V.isLoadCombineReductionCandidate(RdxKind)) {
20157 if (!AdjustReducedVals())
20158 V.analyzedReductionVals(VL);
20159 continue;
20160 }
20161 V.reorderTopToBottom();
20162 // No need to reorder the root node at all.
20163 V.reorderBottomToTop(/*IgnoreReorder=*/true);
20164 // Keep extracted other reduction values, if they are used in the
20165 // vectorization trees.
20166 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
20167 ExternallyUsedValues);
20168 // The reduction root is used as the insertion point for new
20169 // instructions, so set it as externally used to prevent it from being
20170 // deleted.
20171 LocalExternallyUsedValues.insert(ReductionRoot);
20172 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
20173 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
20174 continue;
20175 for (Value *V : ReducedVals[Cnt])
20176 if (isa<Instruction>(V))
20177 LocalExternallyUsedValues.insert(TrackedVals[V]);
20178 }
20179 if (!IsSupportedHorRdxIdentityOp) {
20180 // Number of uses of the candidates in the vector of values.
20181 assert(SameValuesCounter.empty() &&
20182 "Reused values counter map is not empty");
20183 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20184 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20185 continue;
20186 Value *V = Candidates[Cnt];
20187 Value *OrigV = TrackedToOrig.at(V);
20188 ++SameValuesCounter.try_emplace(OrigV).first->second;
20189 }
20190 }
20191 V.transformNodes();
20192 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
20193 // Gather externally used values.
20194 SmallPtrSet<Value *, 4> Visited;
20195 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20196 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20197 continue;
20198 Value *RdxVal = Candidates[Cnt];
20199 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20200 RdxVal = It->second;
20201 if (!Visited.insert(RdxVal).second)
20202 continue;
20203 // Check if the scalar was vectorized as part of the vectorization
20204 // tree but not the top node.
20205 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
20206 LocalExternallyUsedValues.insert(RdxVal);
20207 continue;
20208 }
20209 Value *OrigV = TrackedToOrig.at(RdxVal);
20210 unsigned NumOps =
20211 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20212 if (NumOps != ReducedValsToOps.at(OrigV).size())
20213 LocalExternallyUsedValues.insert(RdxVal);
20214 }
20215 // Do not need the list of reused scalars in regular mode anymore.
20216 if (!IsSupportedHorRdxIdentityOp)
20217 SameValuesCounter.clear();
20218 for (Value *RdxVal : VL)
20219 if (RequiredExtract.contains(RdxVal))
20220 LocalExternallyUsedValues.insert(RdxVal);
20221 V.buildExternalUses(LocalExternallyUsedValues);
20222
20223 V.computeMinimumValueSizes();
20224
20225 // Estimate cost.
20226 InstructionCost TreeCost = V.getTreeCost(VL);
20227 InstructionCost ReductionCost =
20228 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20229 InstructionCost Cost = TreeCost + ReductionCost;
20230 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
20231 << " for reduction\n");
20232 if (!Cost.isValid())
20233 break;
20234 if (Cost >= -SLPCostThreshold) {
20235 V.getORE()->emit([&]() {
20236 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
20237 ReducedValsToOps.at(VL[0]).front())
20238 << "Vectorizing horizontal reduction is possible "
20239 << "but not beneficial with cost " << ore::NV("Cost", Cost)
20240 << " and threshold "
20241 << ore::NV("Threshold", -SLPCostThreshold);
20242 });
20243 if (!AdjustReducedVals()) {
20244 V.analyzedReductionVals(VL);
20245 unsigned Offset = Pos == Start ? Pos : Pos - 1;
20246 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
20247 // Add subvectors of VL to the list of the analyzed values.
20248 for (unsigned VF = getFloorFullVectorNumberOfElements(
20249 *TTI, VL.front()->getType(), ReduxWidth - 1);
20250 VF >= ReductionLimit;
20251 VF = getFloorFullVectorNumberOfElements(
20252 *TTI, VL.front()->getType(), VF - 1)) {
20253 if (has_single_bit(VF) &&
20254 V.getCanonicalGraphSize() != V.getTreeSize())
20255 continue;
20256 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
20257 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
20258 }
20259 }
20260 }
20261 continue;
20262 }
20263
20264 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
20265 << Cost << ". (HorRdx)\n");
20266 V.getORE()->emit([&]() {
20267 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
20268 ReducedValsToOps.at(VL[0]).front())
20269 << "Vectorized horizontal reduction with cost "
20270 << ore::NV("Cost", Cost) << " and with tree size "
20271 << ore::NV("TreeSize", V.getTreeSize());
20272 });
20273
20274 Builder.setFastMathFlags(RdxFMF);
20275
20276 // Emit a reduction. If the root is a select (min/max idiom), the insert
20277 // point is the compare condition of that select.
20278 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20279 Instruction *InsertPt = RdxRootInst;
20280 if (IsCmpSelMinMax)
20281 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20282
20283 // Vectorize a tree.
20284 Value *VectorizedRoot =
20285 V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20286 // Update TrackedToOrig mapping, since the tracked values might be
20287 // updated.
20288 for (Value *RdxVal : Candidates) {
20289 Value *OrigVal = TrackedToOrig.at(RdxVal);
20290 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20291 if (TransformedRdxVal != RdxVal)
20292 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20293 }
20294
20295 Builder.SetInsertPoint(InsertPt);
20296
20297 // To prevent poison from leaking across what used to be sequential,
20298 // safe, scalar boolean logic operations, the reduction operand must be
20299 // frozen.
20300 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
20301 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20302
20303 // Emit code to correctly handle reused reduced values, if required.
20304 if (OptReusedScalars && !SameScaleFactor) {
20305 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20306 SameValuesCounter, TrackedToOrig);
20307 }
20308
20309 Value *ReducedSubTree;
20310 Type *ScalarTy = VL.front()->getType();
20311 if (isa<FixedVectorType>(ScalarTy)) {
20312 assert(SLPReVec && "FixedVectorType is not expected.");
20313 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
20314 ReducedSubTree = PoisonValue::get(FixedVectorType::get(
20315 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
20316 for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
20317 // Do reduction for each lane.
20318 // e.g., do reduce add for
20319 // VL[0] = <4 x Ty> <a, b, c, d>
20320 // VL[1] = <4 x Ty> <e, f, g, h>
20321 // Lane[0] = <2 x Ty> <a, e>
20322 // Lane[1] = <2 x Ty> <b, f>
20323 // Lane[2] = <2 x Ty> <c, g>
20324 // Lane[3] = <2 x Ty> <d, h>
20325 // result[0] = reduce add Lane[0]
20326 // result[1] = reduce add Lane[1]
20327 // result[2] = reduce add Lane[2]
20328 // result[3] = reduce add Lane[3]
20329 SmallVector<int, 16> Mask =
20330 createStrideMask(I, ScalarTyNumElements, VL.size());
20331 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
20332 ReducedSubTree = Builder.CreateInsertElement(
20333 ReducedSubTree,
20334 emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
20335 }
20336 } else {
20337 ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
20338 RdxRootInst->getType());
20339 }
20340 if (ReducedSubTree->getType() != VL.front()->getType()) {
20341 assert(ReducedSubTree->getType() != VL.front()->getType() &&
20342 "Expected different reduction type.");
20343 ReducedSubTree =
20344 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20345 V.isSignedMinBitwidthRootNode());
20346 }
20347
20348 // Improved analysis for add/fadd/xor reductions with same scale factor
20349 // for all operands of reductions. We can emit scalar ops for them
20350 // instead.
20351 if (OptReusedScalars && SameScaleFactor)
20352 ReducedSubTree = emitScaleForReusedOps(
20353 ReducedSubTree, Builder, SameValuesCounter.front().second);
20354
20355 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20356 // Count vectorized reduced values to exclude them from final reduction.
20357 for (Value *RdxVal : VL) {
20358 Value *OrigV = TrackedToOrig.at(RdxVal);
20359 if (IsSupportedHorRdxIdentityOp) {
20360 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20361 continue;
20362 }
20363 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20364 if (!V.isVectorized(RdxVal))
20365 RequiredExtract.insert(RdxVal);
20366 }
20367 Pos += ReduxWidth;
20368 Start = Pos;
20369 ReduxWidth = NumReducedVals - Pos;
20370 if (ReduxWidth > 1)
20371 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20372 AnyVectorized = true;
20373 }
20374 if (OptReusedScalars && !AnyVectorized) {
20375 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20376 Value *RdxVal = TrackedVals.at(P.first);
20377 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
20378 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20379 VectorizedVals.try_emplace(P.first, P.second);
20380 }
20381 continue;
20382 }
20383 }
20384 if (VectorizedTree) {
20385 // Reorder operands of bool logical op in the natural order to avoid
20386 // possible problem with poison propagation. If not possible to reorder
20387 // (both operands are originally RHS), emit an extra freeze instruction
20388 // for the LHS operand.
20389 // I.e., if we have original code like this:
20390 // RedOp1 = select i1 ?, i1 LHS, i1 false
20391 // RedOp2 = select i1 RHS, i1 ?, i1 false
20392
20393 // Then, we swap LHS/RHS to create a new op that matches the poison
20394 // semantics of the original code.
20395
20396 // If we have original code like this and both values could be poison:
20397 // RedOp1 = select i1 ?, i1 LHS, i1 false
20398 // RedOp2 = select i1 ?, i1 RHS, i1 false
20399
20400 // Then, we must freeze LHS in the new op.
20401 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
20402 Instruction *RedOp1,
20403 Instruction *RedOp2,
20404 bool InitStep) {
20405 if (!AnyBoolLogicOp)
20406 return;
20407 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
20408 getRdxOperand(RedOp1, 0) == LHS ||
20409 isGuaranteedNotToBePoison(LHS, AC)))
20410 return;
20411 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
20412 getRdxOperand(RedOp2, 0) == RHS ||
20413 isGuaranteedNotToBePoison(RHS, AC))) {
20414 std::swap(LHS, RHS);
20415 return;
20416 }
20417 if (LHS != VectorizedTree)
20418 LHS = Builder.CreateFreeze(LHS);
20419 };
20420 // Finish the reduction.
20421 // Need to add extra arguments and not vectorized possible reduction
20422 // values.
20423 // Try to avoid dependencies between the scalar remainders after
20424 // reductions.
20425 auto FinalGen =
20426 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
20427 bool InitStep) {
20428 unsigned Sz = InstVals.size();
20429 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
20430 Sz % 2);
20431 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
20432 Instruction *RedOp = InstVals[I + 1].first;
20433 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
20434 Value *RdxVal1 = InstVals[I].second;
20435 Value *StableRdxVal1 = RdxVal1;
20436 auto It1 = TrackedVals.find(RdxVal1);
20437 if (It1 != TrackedVals.end())
20438 StableRdxVal1 = It1->second;
20439 Value *RdxVal2 = InstVals[I + 1].second;
20440 Value *StableRdxVal2 = RdxVal2;
20441 auto It2 = TrackedVals.find(RdxVal2);
20442 if (It2 != TrackedVals.end())
20443 StableRdxVal2 = It2->second;
20444 // To prevent poison from leaking across what used to be
20445 // sequential, safe, scalar boolean logic operations, the
20446 // reduction operand must be frozen.
20447 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20448 RedOp, InitStep);
20449 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20450 StableRdxVal2, "op.rdx", ReductionOps);
20451 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20452 }
20453 if (Sz % 2 == 1)
20454 ExtraReds[Sz / 2] = InstVals.back();
20455 return ExtraReds;
20456 };
20457 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
20458 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20459 VectorizedTree);
20460 SmallPtrSet<Value *, 8> Visited;
20461 for (ArrayRef<Value *> Candidates : ReducedVals) {
20462 for (Value *RdxVal : Candidates) {
20463 if (!Visited.insert(RdxVal).second)
20464 continue;
20465 unsigned NumOps = VectorizedVals.lookup(RdxVal);
20466 for (Instruction *RedOp :
20467 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
20468 ExtraReductions.emplace_back(RedOp, RdxVal);
20469 }
20470 }
20471 // Iterate through all not-vectorized reduction values/extra arguments.
20472 bool InitStep = true;
20473 while (ExtraReductions.size() > 1) {
20474 SmallVector<std::pair<Instruction *, Value *>> NewReds =
20475 FinalGen(ExtraReductions, InitStep);
20476 ExtraReductions.swap(NewReds);
20477 InitStep = false;
20478 }
20479 VectorizedTree = ExtraReductions.front().second;
20480
20481 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20482
20483 // The original scalar reduction is expected to have no remaining
20484 // uses outside the reduction tree itself. Assert that we got this
20485 // correct, replace internal uses with poison, and mark for eventual
20486 // deletion.
20487#ifndef NDEBUG
20488 SmallSet<Value *, 4> IgnoreSet;
20489 for (ArrayRef<Value *> RdxOps : ReductionOps)
20490 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20491#endif
20492 for (ArrayRef<Value *> RdxOps : ReductionOps) {
20493 for (Value *Ignore : RdxOps) {
20494 if (!Ignore)
20495 continue;
20496#ifndef NDEBUG
20497 for (auto *U : Ignore->users()) {
20498 assert(IgnoreSet.count(U) &&
20499 "All users must be in the reduction ops list.");
20500 }
20501#endif
20502 if (!Ignore->use_empty()) {
20503 Value *P = PoisonValue::get(Ignore->getType());
20504 Ignore->replaceAllUsesWith(P);
20505 }
20506 }
20507 V.removeInstructionsAndOperands(RdxOps);
20508 }
20509 } else if (!CheckForReusedReductionOps) {
20510 for (ReductionOpsType &RdxOps : ReductionOps)
20511 for (Value *RdxOp : RdxOps)
20512 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20513 }
20514 return VectorizedTree;
20515 }
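// Illustrative sketch (plain integers, hypothetical helper): the pairwise
// strategy used by FinalGen in tryToReduce() above. Combining the leftover
// scalars level by level keeps the dependency chain O(log N) deep instead of
// forming one long linear fold over the remainders.
static int pairwiseFoldSketch(SmallVectorImpl<int> &Vals) {
  while (Vals.size() > 1) {
    SmallVector<int> Next;
    for (unsigned I = 0, E = (Vals.size() / 2) * 2; I < E; I += 2)
      Next.push_back(Vals[I] + Vals[I + 1]); // One reduction op per pair.
    if (Vals.size() % 2 == 1)
      Next.push_back(Vals.back()); // Odd remainder is carried over.
    Vals.swap(Next);
  }
  return Vals.empty() ? 0 : Vals.front();
}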
20516
20517private:
20518 /// Calculate the cost of a reduction.
20519 InstructionCost getReductionCost(TargetTransformInfo *TTI,
20520 ArrayRef<Value *> ReducedVals,
20521 bool IsCmpSelMinMax, FastMathFlags FMF,
20522 const BoUpSLP &R) {
20523 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
20524 Type *ScalarTy = ReducedVals.front()->getType();
20525 unsigned ReduxWidth = ReducedVals.size();
20526 FixedVectorType *VectorTy = R.getReductionType();
20527 InstructionCost VectorCost = 0, ScalarCost;
20528 // If all of the reduced values are constant, the vector cost is 0, since
20529 // the reduction value can be calculated at the compile time.
20530 bool AllConsts = allConstant(ReducedVals);
20531 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
20532 InstructionCost Cost = 0;
20533 // Scalar cost is repeated for N-1 elements.
20534 int Cnt = ReducedVals.size();
20535 for (Value *RdxVal : ReducedVals) {
20536 if (Cnt == 1)
20537 break;
20538 --Cnt;
20539 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
20540 Cost += GenCostFn();
20541 continue;
20542 }
20543 InstructionCost ScalarCost = 0;
20544 for (User *U : RdxVal->users()) {
20545 auto *RdxOp = cast<Instruction>(U);
20546 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20547 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
20548 continue;
20549 }
20550 ScalarCost = InstructionCost::getInvalid();
20551 break;
20552 }
20553 if (ScalarCost.isValid())
20554 Cost += ScalarCost;
20555 else
20556 Cost += GenCostFn();
20557 }
20558 return Cost;
20559 };
20560 switch (RdxKind) {
20561 case RecurKind::Add:
20562 case RecurKind::Mul:
20563 case RecurKind::Or:
20564 case RecurKind::And:
20565 case RecurKind::Xor:
20566 case RecurKind::FAdd:
20567 case RecurKind::FMul: {
20568 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
20569 if (!AllConsts) {
20570 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20571 assert(SLPReVec && "FixedVectorType is not expected.");
20572 unsigned ScalarTyNumElements = VecTy->getNumElements();
20573 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
20574 VectorCost += TTI->getShuffleCost(
20575 TTI::SK_PermuteSingleSrc, VectorTy,
20576 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
20577 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
20578 CostKind);
20579 }
20580 VectorCost += TTI->getScalarizationOverhead(
20581 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
20582 /*Extract*/ false, TTI::TCK_RecipThroughput);
20583 } else {
20584 Type *RedTy = VectorTy->getElementType();
20585 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
20586 std::make_pair(RedTy, true));
20587 if (RType == RedTy) {
20588 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
20589 FMF, CostKind);
20590 } else {
20591 VectorCost = TTI->getExtendedReductionCost(
20592 RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
20593 FMF, CostKind);
20594 }
20595 }
20596 }
20597 ScalarCost = EvaluateScalarCost([&]() {
20598 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
20599 });
20600 break;
20601 }
20602 case RecurKind::FMax:
20603 case RecurKind::FMin:
20604 case RecurKind::FMaximum:
20605 case RecurKind::FMinimum:
20606 case RecurKind::SMax:
20607 case RecurKind::SMin:
20608 case RecurKind::UMax:
20609 case RecurKind::UMin: {
20610 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
20611 if (!AllConsts)
20612 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
20613 ScalarCost = EvaluateScalarCost([&]() {
20614 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
20615 return TTI->getIntrinsicInstrCost(ICA, CostKind);
20616 });
20617 break;
20618 }
20619 default:
20620 llvm_unreachable("Expected arithmetic or min/max reduction operation");
20621 }
20622
20623 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
20624 << " for reduction of " << shortBundleName(ReducedVals)
20625 << " (It is a splitting reduction)\n");
20626 return VectorCost - ScalarCost;
20627 }
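// Illustrative sketch (hypothetical helper, plain ints): how the caller in
// tryToReduce() interprets the value returned above. The reduction is only
// emitted when the vector-minus-scalar delta beats the -slp-threshold bound;
// otherwise the candidate is recorded and re-tried with a smaller width.
static bool reductionBeatsThresholdSketch(int VectorCost, int ScalarCost,
                                          int SLPThreshold) {
  int Delta = VectorCost - ScalarCost; // What getReductionCost() returns.
  return Delta < -SLPThreshold;        // Caller bails out when Delta >= -SLPThreshold.
}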
20628
20629 /// Emit a horizontal reduction of the vectorized value.
20630 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
20631 const TargetTransformInfo *TTI, Type *DestTy) {
20632 assert(VectorizedValue && "Need to have a vectorized tree node");
20633 assert(RdxKind != RecurKind::FMulAdd &&
20634 "A call to the llvm.fmuladd intrinsic is not handled yet");
20635
20636 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20637 if (FTy->getScalarType() == Builder.getInt1Ty() &&
20638 RdxKind == RecurKind::Add &&
20639 DestTy->getScalarType() != FTy->getScalarType()) {
20640 // Convert vector_reduce_add(ZExt(<n x i1>)) to
20641 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
20642 Value *V = Builder.CreateBitCast(
20643 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20644 ++NumVectorInstructions;
20645 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
20646 }
20647 ++NumVectorInstructions;
20648 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
20649 }
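// Illustrative sketch (plain host-side code, not IR): the i1 add-reduction
// rewrite above relies on "sum of the mask bits == popcount of the packed
// mask", which is what the ctpop intrinsic computes on the bitcast value.
static unsigned sumOfMaskBitsSketch(uint32_t Mask, unsigned NumLanes) {
  assert(NumLanes <= 32 && "sketch only packs up to 32 i1 lanes");
  unsigned Sum = 0;
  for (unsigned I = 0; I < NumLanes; ++I)
    Sum += (Mask >> I) & 1u; // Add-reduce the individual i1 lanes.
  return Sum;                // Equals llvm::popcount(Mask) when NumLanes == 32.
}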
20650
20651 /// Emits optimized code for unique scalar value reused \p Cnt times.
20652 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
20653 unsigned Cnt) {
20654 assert(IsSupportedHorRdxIdentityOp &&
20655 "The optimization of matched scalar identity horizontal reductions "
20656 "must be supported.");
20657 if (Cnt == 1)
20658 return VectorizedValue;
20659 switch (RdxKind) {
20660 case RecurKind::Add: {
20661 // res = mul vv, n
20662 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20663 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
20664 << VectorizedValue << ". (HorRdx)\n");
20665 return Builder.CreateMul(VectorizedValue, Scale);
20666 }
20667 case RecurKind::Xor: {
20668 // res = n % 2 ? 0 : vv
20669 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
20670 << ". (HorRdx)\n");
20671 if (Cnt % 2 == 0)
20672 return Constant::getNullValue(VectorizedValue->getType());
20673 return VectorizedValue;
20674 }
20675 case RecurKind::FAdd: {
20676 // res = fmul v, n
20677 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20678 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
20679 << VectorizedValue << ". (HorRdx)\n");
20680 return Builder.CreateFMul(VectorizedValue, Scale);
20681 }
20682 case RecurKind::And:
20683 case RecurKind::Or:
20684 case RecurKind::SMax:
20685 case RecurKind::SMin:
20686 case RecurKind::UMax:
20687 case RecurKind::UMin:
20688 case RecurKind::FMax:
20689 case RecurKind::FMin:
20690 case RecurKind::FMaximum:
20691 case RecurKind::FMinimum:
20692 // res = vv
20693 return VectorizedValue;
20694 case RecurKind::Mul:
20695 case RecurKind::FMul:
20696 case RecurKind::FMulAdd:
20697 case RecurKind::IAnyOf:
20698 case RecurKind::FAnyOf:
20699 case RecurKind::IFindLastIV:
20700 case RecurKind::FFindLastIV:
20701 case RecurKind::None:
20702 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
20703 }
20704 return nullptr;
20705 }
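// Illustrative sketch (plain integers, hypothetical helpers): the scalar
// identities that emitScaleForReusedOps() relies on when one value is
// repeated Cnt times in the reduction.
static int repeatedAddSketch(int V, unsigned Cnt) {
  return V * static_cast<int>(Cnt); // Add-reduce of Cnt copies == V * Cnt.
}
static int repeatedXorSketch(int V, unsigned Cnt) {
  return (Cnt % 2 == 0) ? 0 : V;    // Pairs cancel, so only parity matters.
}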
20706
20707 /// Emits actual operation for the scalar identity values, found during
20708 /// horizontal reduction analysis.
20709 Value *
20710 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
20711 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
20712 const DenseMap<Value *, Value *> &TrackedToOrig) {
20713 assert(IsSupportedHorRdxIdentityOp &&
20714 "The optimization of matched scalar identity horizontal reductions "
20715 "must be supported.");
20716 ArrayRef<Value *> VL = R.getRootNodeScalars();
20717 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20718 if (VTy->getElementType() != VL.front()->getType()) {
20719 VectorizedValue = Builder.CreateIntCast(
20720 VectorizedValue,
20721 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
20722 R.isSignedMinBitwidthRootNode());
20723 }
20724 switch (RdxKind) {
20725 case RecurKind::Add: {
20726 // root = mul prev_root, <1, 1, n, 1>
20727 SmallVector<Constant *> Vals;
20728 for (Value *V : VL) {
20729 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20730 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
20731 }
20732 auto *Scale = ConstantVector::get(Vals);
20733 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
20734 << VectorizedValue << ". (HorRdx)\n");
20735 return Builder.CreateMul(VectorizedValue, Scale);
20736 }
20737 case RecurKind::And:
20738 case RecurKind::Or:
20739 // No need for multiple or/and(s).
20740 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
20741 << ". (HorRdx)\n");
20742 return VectorizedValue;
20743 case RecurKind::SMax:
20744 case RecurKind::SMin:
20745 case RecurKind::UMax:
20746 case RecurKind::UMin:
20747 case RecurKind::FMax:
20748 case RecurKind::FMin:
20749 case RecurKind::FMaximum:
20750 case RecurKind::FMinimum:
20751 // No need for multiple min/max(s) of the same value.
20752 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
20753 << ". (HorRdx)\n");
20754 return VectorizedValue;
20755 case RecurKind::Xor: {
20756 // Replace values with an even number of repeats with 0, since
20757 // x xor x = 0.
20758 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
20759 // 7>, if the 4th and 6th elements have an even number of repeats.
20760 SmallVector<int> Mask(
20761 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
20762 PoisonMaskElem);
20763 std::iota(Mask.begin(), Mask.end(), 0);
20764 bool NeedShuffle = false;
20765 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
20766 Value *V = VL[I];
20767 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20768 if (Cnt % 2 == 0) {
20769 Mask[I] = VF;
20770 NeedShuffle = true;
20771 }
20772 }
20773 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
20774 : Mask) dbgs()
20775 << I << " ";
20776 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
20777 if (NeedShuffle)
20778 VectorizedValue = Builder.CreateShuffleVector(
20779 VectorizedValue,
20780 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
20781 return VectorizedValue;
20782 }
20783 case RecurKind::FAdd: {
20784 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
20785 SmallVector<Constant *> Vals;
20786 for (Value *V : VL) {
20787 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20788 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20789 }
20790 auto *Scale = ConstantVector::get(Vals);
20791 return Builder.CreateFMul(VectorizedValue, Scale);
20792 }
20793 case RecurKind::Mul:
20794 case RecurKind::FMul:
20795 case RecurKind::FMulAdd:
20796 case RecurKind::IAnyOf:
20797 case RecurKind::FAnyOf:
20798 case RecurKind::IFindLastIV:
20799 case RecurKind::FFindLastIV:
20800 case RecurKind::None:
20801 llvm_unreachable("Unexpected reduction kind for reused scalars.");
20802 }
20803 return nullptr;
20804 }
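// Illustrative sketch (plain integers, hypothetical helper): the per-lane
// scaling emitted above for RecurKind::Add. Each lane is multiplied by the
// number of times its scalar was repeated, then a single add reduction is
// performed - the vector analogue of "mul prev_root, <1, 1, n, 1>".
static int scaledAddReductionSketch(ArrayRef<int> Lanes, ArrayRef<int> Repeats) {
  assert(Lanes.size() == Repeats.size() && "one repeat count per lane");
  int Sum = 0;
  for (unsigned I = 0, E = Lanes.size(); I < E; ++I)
    Sum += Lanes[I] * Repeats[I];
  return Sum;
}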
20805};
20806} // end anonymous namespace
20807
20808 /// Gets recurrence kind from the specified value.
20809 static RecurKind getRdxKind(Value *V) {
20810 return HorizontalReduction::getRdxKind(V);
20811}
20812static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
20813 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20814 return cast<FixedVectorType>(IE->getType())->getNumElements();
20815
20816 unsigned AggregateSize = 1;
20817 auto *IV = cast<InsertValueInst>(InsertInst);
20818 Type *CurrentType = IV->getType();
20819 do {
20820 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20821 for (auto *Elt : ST->elements())
20822 if (Elt != ST->getElementType(0)) // check homogeneity
20823 return std::nullopt;
20824 AggregateSize *= ST->getNumElements();
20825 CurrentType = ST->getElementType(0);
20826 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20827 AggregateSize *= AT->getNumElements();
20828 CurrentType = AT->getElementType();
20829 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20830 AggregateSize *= VT->getNumElements();
20831 return AggregateSize;
20832 } else if (CurrentType->isSingleValueType()) {
20833 return AggregateSize;
20834 } else {
20835 return std::nullopt;
20836 }
20837 } while (true);
20838}
20839
20840 static void findBuildAggregate_rec(Instruction *LastInsertInst,
20841 TargetTransformInfo *TTI,
20842 SmallVectorImpl<Value *> &BuildVectorOpds,
20843 SmallVectorImpl<Value *> &InsertElts,
20844 unsigned OperandOffset, const BoUpSLP &R) {
20845 do {
20846 Value *InsertedOperand = LastInsertInst->getOperand(1);
20847 std::optional<unsigned> OperandIndex =
20848 getElementIndex(LastInsertInst, OperandOffset);
20849 if (!OperandIndex || R.isDeleted(LastInsertInst))
20850 return;
20851 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20852 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
20853 BuildVectorOpds, InsertElts, *OperandIndex, R);
20854
20855 } else {
20856 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20857 InsertElts[*OperandIndex] = LastInsertInst;
20858 }
20859 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20860 } while (LastInsertInst != nullptr &&
20861 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20862 LastInsertInst->hasOneUse());
20863}
20864
20865/// Recognize construction of vectors like
20866/// %ra = insertelement <4 x float> poison, float %s0, i32 0
20867/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
20868/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
20869/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
20870/// starting from the last insertelement or insertvalue instruction.
20871///
20872/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
20873/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
20874/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
20875///
20876/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
20877///
20878/// \return true if it matches.
20879 static bool findBuildAggregate(Instruction *LastInsertInst,
20880 TargetTransformInfo *TTI,
20881 SmallVectorImpl<Value *> &BuildVectorOpds,
20882 SmallVectorImpl<Value *> &InsertElts,
20883 const BoUpSLP &R) {
20884
20885 assert((isa<InsertElementInst>(LastInsertInst) ||
20886 isa<InsertValueInst>(LastInsertInst)) &&
20887 "Expected insertelement or insertvalue instruction!");
20888
20889 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
20890 "Expected empty result vectors!");
20891
20892 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
20893 if (!AggregateSize)
20894 return false;
20895 BuildVectorOpds.resize(*AggregateSize);
20896 InsertElts.resize(*AggregateSize);
20897
20898 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
20899 R);
20900 llvm::erase(BuildVectorOpds, nullptr);
20901 llvm::erase(InsertElts, nullptr);
20902 if (BuildVectorOpds.size() >= 2)
20903 return true;
20904
20905 return false;
20906}
20907
20908/// Try and get a reduction instruction from a phi node.
20909///
20910/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
20911/// if they come from either \p ParentBB or a containing loop latch.
20912///
20913/// \returns A candidate reduction value if possible, or \code nullptr \endcode
20914/// if not possible.
20915 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
20916 BasicBlock *ParentBB, LoopInfo *LI) {
20917 // There are situations where the reduction value is not dominated by the
20918 // reduction phi. Vectorizing such cases has been reported to cause
20919 // miscompiles. See PR25787.
20920 auto DominatedReduxValue = [&](Value *R) {
20921 return isa<Instruction>(R) &&
20922 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20923 };
20924
20925 Instruction *Rdx = nullptr;
20926
20927 // Return the incoming value if it comes from the same BB as the phi node.
20928 if (P->getIncomingBlock(0) == ParentBB) {
20929 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20930 } else if (P->getIncomingBlock(1) == ParentBB) {
20931 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20932 }
20933
20934 if (Rdx && DominatedReduxValue(Rdx))
20935 return Rdx;
20936
20937 // Otherwise, check whether we have a loop latch to look at.
20938 Loop *BBL = LI->getLoopFor(ParentBB);
20939 if (!BBL)
20940 return nullptr;
20941 BasicBlock *BBLatch = BBL->getLoopLatch();
20942 if (!BBLatch)
20943 return nullptr;
20944
20945 // There is a loop latch, return the incoming value if it comes from
20946 // that. This reduction pattern occasionally turns up.
20947 if (P->getIncomingBlock(0) == BBLatch) {
20948 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20949 } else if (P->getIncomingBlock(1) == BBLatch) {
20950 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20951 }
20952
20953 if (Rdx && DominatedReduxValue(Rdx))
20954 return Rdx;
20955
20956 return nullptr;
20957}
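In a typical reduction loop, P has one incoming value from the preheader (the start value) and one from the loop latch (the updated accumulator); the latch value is the candidate returned here, provided the phi's block dominates its defining block (the PR25787 guard above).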
20958
20959static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
20960 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
20961 return true;
20962 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
20963 return true;
20964 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
20965 return true;
20966 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
20967 return true;
20968 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
20969 return true;
20970 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
20971 return true;
20972 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
20973 return true;
20974 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
20975 return true;
20976 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
20977 return true;
20978 return false;
20979}
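A minimal usage sketch (hypothetical caller, not taken from this file): matchRdxBop extracts the two operands whether the root is a plain binary operator or one of the min/max intrinsics listed above.

  Value *Op0 = nullptr, *Op1 = nullptr;
  if (matchRdxBop(RootInst, Op0, Op1)) {
    // Op0/Op1 are the operands of the binop or of the min/max intrinsic call;
    // the caller picks whichever one continues the reduction chain
    // (see getNonPhiOperand below).
  }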
20980
20981/// We could have an initial reduction that is not an add.
20982/// r *= v1 + v2 + v3 + v4
20983/// In such a case start looking for a tree rooted in the first '+'.
20984 /// \returns the new root if found, which may be nullptr if not an instruction.
20985 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
20986 Instruction *Root) {
20987 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
20988 isa<IntrinsicInst>(Root)) &&
20989 "Expected binop, select, or intrinsic for reduction matching");
20990 Value *LHS =
20991 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
20992 Value *RHS =
20993 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
20994 if (LHS == Phi)
20995 return dyn_cast<Instruction>(RHS);
20996 if (RHS == Phi)
20997 return dyn_cast<Instruction>(LHS);
20998 return nullptr;
20999}
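For the r *= v1 + v2 + v3 + v4 pattern from the comment above, Root is the multiply feeding the accumulator phi and Phi is that accumulator; the function returns Root's other operand, the first '+', which then becomes the root of the reduction tree that is actually matched.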
21000
21001 /// \returns the first operand of \p I that does not match \p Phi. If the
21002 /// operand is not an instruction, returns nullptr.
21003 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
21004 Value *Op0 = nullptr;
21005 Value *Op1 = nullptr;
21006 if (!matchRdxBop(I, Op0, Op1))
21007 return nullptr;
21008 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
21009}
21010
21011 /// \returns true if \p I is a candidate instruction for reduction vectorization.
21012 static bool isReductionCandidate(Instruction *I) {
21013 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
21014 Value *B0 = nullptr, *B1 = nullptr;
21015 bool IsBinop = matchRdxBop(I, B0, B1);
21016 return IsBinop || IsSelect;
21017}
21018
21019bool SLPVectorizerPass::vectorizeHorReduction(
21020 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
21021 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
21022 if (!ShouldVectorizeHor)
21023 return false;
21024 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
21025
21026 if (Root->getParent() != BB || isa<PHINode>(Root))
21027 return false;
21028
21029 // If we can find a secondary reduction root, use that instead.
21030 auto SelectRoot = [&]() {
21031 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
21032 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
21033 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
21034 return NewRoot;
21035 return Root;
21036 };
21037
21038 // Start the analysis from the Root instruction. If a horizontal reduction is
21039 // found, try to vectorize it. If it is not a horizontal reduction, or
21040 // vectorization is not possible or not effective, and the currently analyzed
21041 // instruction is a binary operation, try to vectorize the operands using
21042 // pre-order DFS traversal order. If the operands were not vectorized, repeat
21043 // the same procedure, considering each operand as a possible root of a
21044 // horizontal reduction.
21045 // Interrupt the process if the Root instruction itself was vectorized or all
21046 // sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
21047 // If a horizontal reduction was not matched or vectorized, we collect
21048 // instructions for possible later attempts at vectorization.
21049 std::queue<std::pair<Instruction *, unsigned>> Stack;
21050 Stack.emplace(SelectRoot(), 0);
21051 SmallPtrSet<Value *, 8> VisitedInstrs;
21052 bool Res = false;
21053 auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
21054 if (R.isAnalyzedReductionRoot(Inst))
21055 return nullptr;
21056 if (!isReductionCandidate(Inst))
21057 return nullptr;
21058 HorizontalReduction HorRdx;
21059 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
21060 return nullptr;
21061 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
21062 };
21063 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
21064 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21065 FutureSeed = getNonPhiOperand(Root, P);
21066 if (!FutureSeed)
21067 return false;
21068 }
21069 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
21070 // analysis is done separately.
21071 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21072 PostponedInsts.push_back(FutureSeed);
21073 return true;
21074 };
21075
21076 while (!Stack.empty()) {
21077 Instruction *Inst;
21078 unsigned Level;
21079 std::tie(Inst, Level) = Stack.front();
21080 Stack.pop();
21081 // Do not try to analyze instruction that has already been vectorized.
21082 // This may happen when we vectorize instruction operands on a previous
21083 // iteration while stack was populated before that happened.
21084 if (R.isDeleted(Inst))
21085 continue;
21086 if (Value *VectorizedV = TryToReduce(Inst)) {
21087 Res = true;
21088 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
21089 // Try to find another reduction.
21090 Stack.emplace(I, Level);
21091 continue;
21092 }
21093 if (R.isDeleted(Inst))
21094 continue;
21095 } else {
21096 // We could not vectorize `Inst` so try to use it as a future seed.
21097 if (!TryAppendToPostponedInsts(Inst)) {
21098 assert(Stack.empty() && "Expected empty stack");
21099 break;
21100 }
21101 }
21102
21103 // Try to vectorize operands.
21104 // Continue analysis for the instruction from the same basic block only to
21105 // save compile time.
21106 if (++Level < RecursionMaxDepth)
21107 for (auto *Op : Inst->operand_values())
21108 if (VisitedInstrs.insert(Op).second)
21109 if (auto *I = dyn_cast<Instruction>(Op))
21110 // Do not try to vectorize CmpInst operands, this is done
21111 // separately.
21112 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
21113 !R.isDeleted(I) && I->getParent() == BB)
21114 Stack.emplace(I, Level);
21115 }
21116 return Res;
21117}
21118
21119bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
21120 BasicBlock *BB, BoUpSLP &R) {
21121 SmallVector<WeakTrackingVH> PostponedInsts;
21122 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
21123 Res |= tryToVectorize(PostponedInsts, R);
21124 return Res;
21125}
21126
21127bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
21128 BoUpSLP &R) {
21129 bool Res = false;
21130 for (Value *V : Insts)
21131 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
21132 Res |= tryToVectorize(Inst, R);
21133 return Res;
21134}
21135
21136bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
21137 BasicBlock *BB, BoUpSLP &R,
21138 bool MaxVFOnly) {
21139 if (!R.canMapToVector(IVI->getType()))
21140 return false;
21141
21142 SmallVector<Value *, 16> BuildVectorOpds;
21143 SmallVector<Value *, 16> BuildVectorInsts;
21144 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
21145 return false;
21146
21147 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
21148 R.getORE()->emit([&]() {
21149 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
21150 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
21151 "trying reduction first.";
21152 });
21153 return false;
21154 }
21155 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
21156 // An aggregate value is unlikely to be processed in a vector register.
21157 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21158}
21159
21160bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
21161 BasicBlock *BB, BoUpSLP &R,
21162 bool MaxVFOnly) {
21163 SmallVector<Value *, 16> BuildVectorInsts;
21164 SmallVector<Value *, 16> BuildVectorOpds;
21165 SmallVector<int> Mask;
21166 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
21167 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21168 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
21169 return false;
21170
21171 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
21172 R.getORE()->emit([&]() {
21173 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
21174 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
21175 "trying reduction first.";
21176 });
21177 return false;
21178 }
21179 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
21180 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21181}
21182
21183template <typename T>
21184 static bool tryToVectorizeSequence(
21185 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
21186 function_ref<bool(T *, T *)> AreCompatible,
21187 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
21188 bool MaxVFOnly, BoUpSLP &R) {
21189 bool Changed = false;
21190 // Sort by type, parent, operands.
21191 stable_sort(Incoming, Comparator);
21192
21193 // Try to vectorize elements based on their type.
21194 SmallVector<T *> Candidates;
21195 SmallVector<T *> VL;
21196 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
21197 VL.clear()) {
21198 // Look for the next elements with the same type, parent and operand
21199 // kinds.
21200 auto *I = dyn_cast<Instruction>(*IncIt);
21201 if (!I || R.isDeleted(I)) {
21202 ++IncIt;
21203 continue;
21204 }
21205 auto *SameTypeIt = IncIt;
21206 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21207 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21208 AreCompatible(*SameTypeIt, *IncIt))) {
21209 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21210 ++SameTypeIt;
21211 if (I && !R.isDeleted(I))
21212 VL.push_back(cast<T>(I));
21213 }
21214
21215 // Try to vectorize them.
21216 unsigned NumElts = VL.size();
21217 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
21218 << NumElts << ")\n");
21219 // The vectorization is a 3-stage attempt:
21220 // 1. Try to vectorize instructions with the same/alternate opcodes at the
21221 // maximal register size first.
21222 // 2. Try to vectorize the remaining instructions with the same type, if
21223 // possible. This may give better vectorization results than vectorizing
21224 // only instructions with the same/alternate opcodes.
21225 // 3. Make a final attempt to vectorize all instructions with the
21226 // same/alternate ops only; this may yield some extra final
21227 // vectorization.
21228 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
21229 // Success, start over because instructions might have been changed.
21230 Changed = true;
21231 VL.swap(Candidates);
21232 Candidates.clear();
21233 for (T *V : VL) {
21234 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21235 Candidates.push_back(V);
21236 }
21237 } else {
21238 /// \Returns the minimum number of elements that we will attempt to
21239 /// vectorize.
21240 auto GetMinNumElements = [&R](Value *V) {
21241 unsigned EltSize = R.getVectorElementSize(V);
21242 return std::max(2U, R.getMaxVecRegSize() / EltSize);
21243 };
21244 if (NumElts < GetMinNumElements(*IncIt) &&
21245 (Candidates.empty() ||
21246 Candidates.front()->getType() == (*IncIt)->getType())) {
21247 for (T *V : VL) {
21248 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21249 Candidates.push_back(V);
21250 }
21251 }
21252 }
21253 // Final attempt to vectorize instructions with the same types.
21254 if (Candidates.size() > 1 &&
21255 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21256 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
21257 // Success, start over because instructions might have been changed.
21258 Changed = true;
21259 } else if (MaxVFOnly) {
21260 // Try to vectorize using small vectors.
21261 SmallVector<T *> VL;
21262 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
21263 VL.clear()) {
21264 auto *I = dyn_cast<Instruction>(*It);
21265 if (!I || R.isDeleted(I)) {
21266 ++It;
21267 continue;
21268 }
21269 auto *SameTypeIt = It;
21270 while (SameTypeIt != End &&
21271 (!isa<Instruction>(*SameTypeIt) ||
21272 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21273 AreCompatible(*SameTypeIt, *It))) {
21274 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21275 ++SameTypeIt;
21276 if (I && !R.isDeleted(I))
21277 VL.push_back(cast<T>(I));
21278 }
21279 unsigned NumElts = VL.size();
21280 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21281 /*MaxVFOnly=*/false))
21282 Changed = true;
21283 It = SameTypeIt;
21284 }
21285 }
21286 Candidates.clear();
21287 }
21288
21289 // Start over at the next instruction of a different type (or the end).
21290 IncIt = SameTypeIt;
21291 }
21292 return Changed;
21293}
21294
21295 /// Compare two cmp instructions. If IsCompatibility is true, the function
21296 /// returns true if the 2 cmps have the same/swapped predicates and compatible
21297 /// corresponding operands. If IsCompatibility is false, the function implements
21298 /// a strict weak ordering relation between two cmp instructions, returning true
21299 /// if the first instruction is "less" than the second, i.e. its predicate is
21300 /// less than the predicate of the second or the operand IDs are less than the
21301 /// operand IDs of the second cmp instruction.
21302template <bool IsCompatibility>
21303static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
21304 const DominatorTree &DT) {
21305 assert(isValidElementType(V->getType()) &&
21306 isValidElementType(V2->getType()) &&
21307 "Expected valid element types only.");
21308 if (V == V2)
21309 return IsCompatibility;
21310 auto *CI1 = cast<CmpInst>(V);
21311 auto *CI2 = cast<CmpInst>(V2);
21312 if (CI1->getOperand(0)->getType()->getTypeID() <
21313 CI2->getOperand(0)->getType()->getTypeID())
21314 return !IsCompatibility;
21315 if (CI1->getOperand(0)->getType()->getTypeID() >
21316 CI2->getOperand(0)->getType()->getTypeID())
21317 return false;
21318 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21319 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21320 return !IsCompatibility;
21321 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21322 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21323 return false;
21324 CmpInst::Predicate Pred1 = CI1->getPredicate();
21325 CmpInst::Predicate Pred2 = CI2->getPredicate();
21326 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
21327 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
21328 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
21329 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
21330 if (BasePred1 < BasePred2)
21331 return !IsCompatibility;
21332 if (BasePred1 > BasePred2)
21333 return false;
21334 // Compare operands.
21335 bool CI1Preds = Pred1 == BasePred1;
21336 bool CI2Preds = Pred2 == BasePred1;
21337 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
21338 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
21339 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
21340 if (Op1 == Op2)
21341 continue;
21342 if (Op1->getValueID() < Op2->getValueID())
21343 return !IsCompatibility;
21344 if (Op1->getValueID() > Op2->getValueID())
21345 return false;
21346 if (auto *I1 = dyn_cast<Instruction>(Op1))
21347 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21348 if (IsCompatibility) {
21349 if (I1->getParent() != I2->getParent())
21350 return false;
21351 } else {
21352 // Try to compare nodes with same parent.
21353 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
21354 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
21355 if (!NodeI1)
21356 return NodeI2 != nullptr;
21357 if (!NodeI2)
21358 return false;
21359 assert((NodeI1 == NodeI2) ==
21360 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21361 "Different nodes should have different DFS numbers");
21362 if (NodeI1 != NodeI2)
21363 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21364 }
21365 InstructionsState S = getSameOpcode({I1, I2}, TLI);
21366 if (S && (IsCompatibility || !S.isAltShuffle()))
21367 continue;
21368 if (IsCompatibility)
21369 return false;
21370 if (I1->getOpcode() != I2->getOpcode())
21371 return I1->getOpcode() < I2->getOpcode();
21372 }
21373 }
21374 return IsCompatibility;
21375}
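As a concrete consequence of the predicate canonicalization above, icmp slt i32 %a, %b and icmp sgt i32 %b, %a share the same base predicate after swapping and have their operands compared in swapped order, so compareCmp<true> treats them as compatible and compareCmp<false> orders them as equivalent.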
21376
21377template <typename ItT>
21378bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
21379 BasicBlock *BB, BoUpSLP &R) {
21380 bool Changed = false;
21381 // Try to find reductions first.
21382 for (CmpInst *I : CmpInsts) {
21383 if (R.isDeleted(I))
21384 continue;
21385 for (Value *Op : I->operands())
21386 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21387 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21388 if (R.isDeleted(I))
21389 break;
21390 }
21391 }
21392 // Try to vectorize operands as vector bundles.
21393 for (CmpInst *I : CmpInsts) {
21394 if (R.isDeleted(I))
21395 continue;
21396 Changed |= tryToVectorize(I, R);
21397 }
21398 // Try to vectorize list of compares.
21399 // Sort by type, compare predicate, etc.
21400 auto CompareSorter = [&](Value *V, Value *V2) {
21401 if (V == V2)
21402 return false;
21403 return compareCmp<false>(V, V2, *TLI, *DT);
21404 };
21405
21406 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
21407 if (V1 == V2)
21408 return true;
21409 return compareCmp<true>(V1, V2, *TLI, *DT);
21410 };
21411
21411
21412 SmallVector<Value *> Vals;
21413 for (Instruction *V : CmpInsts)
21414 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
21415 Vals.push_back(V);
21416 if (Vals.size() <= 1)
21417 return Changed;
21418 Changed |= tryToVectorizeSequence<Value>(
21419 Vals, CompareSorter, AreCompatibleCompares,
21420 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21421 // Exclude possible reductions from other blocks.
21422 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
21423 return any_of(V->users(), [V](User *U) {
21424 auto *Select = dyn_cast<SelectInst>(U);
21425 return Select &&
21426 Select->getParent() != cast<Instruction>(V)->getParent();
21427 });
21428 });
21429 if (ArePossiblyReducedInOtherBlock)
21430 return false;
21431 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21432 },
21433 /*MaxVFOnly=*/true, R);
21434 return Changed;
21435}
21436
21437bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21438 BasicBlock *BB, BoUpSLP &R) {
21439 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21440 "This function only accepts Insert instructions");
21441 bool OpsChanged = false;
21442 SmallVector<WeakTrackingVH> PostponedInsts;
21443 for (auto *I : reverse(Instructions)) {
21444 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
21445 if (R.isDeleted(I) || isa<CmpInst>(I))
21446 continue;
21447 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21448 OpsChanged |=
21449 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
21450 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21451 OpsChanged |=
21452 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
21453 }
21454 // pass2 - try to vectorize reductions only
21455 if (R.isDeleted(I))
21456 continue;
21457 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
21458 if (R.isDeleted(I) || isa<CmpInst>(I))
21459 continue;
21460 // pass3 - try to match and vectorize a buildvector sequence.
21461 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21462 OpsChanged |=
21463 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
21464 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21465 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21466 /*MaxVFOnly=*/false);
21467 }
21468 }
21469 // Now try to vectorize postponed instructions.
21470 OpsChanged |= tryToVectorize(PostponedInsts, R);
21471
21472 Instructions.clear();
21473 return OpsChanged;
21474}
21475
21476bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
21477 bool Changed = false;
21478 SmallVector<Value *, 4> Incoming;
21479 SmallPtrSet<Value *, 16> VisitedInstrs;
21480 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
21481 // node. This allows us to better identify the chains that can be
21482 // vectorized.
21483 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
21484 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
21485 assert(isValidElementType(V1->getType()) &&
21486 isValidElementType(V2->getType()) &&
21487 "Expected vectorizable types only.");
21488 // It is fine to compare type IDs here, since we expect only vectorizable
21489 // types, like ints, floats and pointers; we don't care about other types.
21490 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
21491 return true;
21492 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
21493 return false;
21494 if (V1->getType()->getScalarSizeInBits() <
21495 V2->getType()->getScalarSizeInBits())
21496 return true;
21497 if (V1->getType()->getScalarSizeInBits() >
21498 V2->getType()->getScalarSizeInBits())
21499 return false;
21500 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21501 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21502 if (Opcodes1.size() < Opcodes2.size())
21503 return true;
21504 if (Opcodes1.size() > Opcodes2.size())
21505 return false;
21506 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21507 {
21508 // Instructions come first.
21509 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21510 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21511 if (I1 && I2) {
21512 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
21513 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
21514 if (!NodeI1)
21515 return NodeI2 != nullptr;
21516 if (!NodeI2)
21517 return false;
21518 assert((NodeI1 == NodeI2) ==
21519 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21520 "Different nodes should have different DFS numbers");
21521 if (NodeI1 != NodeI2)
21522 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21523 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21524 if (S && !S.isAltShuffle())
21525 continue;
21526 return I1->getOpcode() < I2->getOpcode();
21527 }
21528 if (I1)
21529 return true;
21530 if (I2)
21531 return false;
21532 }
21533 {
21534 // Non-undef constants come next.
21535 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21536 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21537 if (C1 && C2)
21538 continue;
21539 if (C1)
21540 return true;
21541 if (C2)
21542 return false;
21543 }
21544 bool U1 = isa<UndefValue>(Opcodes1[I]);
21545 bool U2 = isa<UndefValue>(Opcodes2[I]);
21546 {
21547 // Non-constant non-instructions come next.
21548 if (!U1 && !U2) {
21549 auto ValID1 = Opcodes1[I]->getValueID();
21550 auto ValID2 = Opcodes2[I]->getValueID();
21551 if (ValID1 == ValID2)
21552 continue;
21553 if (ValID1 < ValID2)
21554 return true;
21555 if (ValID1 > ValID2)
21556 return false;
21557 }
21558 if (!U1)
21559 return true;
21560 if (!U2)
21561 return false;
21562 }
21563 // Undefs come last.
21564 assert(U1 && U2 && "The only thing left should be undef & undef.");
21565 }
21566 return false;
21567 };
21568 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
21569 if (V1 == V2)
21570 return true;
21571 if (V1->getType() != V2->getType())
21572 return false;
21573 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21574 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21575 if (Opcodes1.size() != Opcodes2.size())
21576 return false;
21577 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21578 // Undefs are compatible with any other value.
21579 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21580 continue;
21581 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21582 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21583 if (R.isDeleted(I1) || R.isDeleted(I2))
21584 return false;
21585 if (I1->getParent() != I2->getParent())
21586 return false;
21587 if (getSameOpcode({I1, I2}, *TLI))
21588 continue;
21589 return false;
21590 }
21591 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21592 continue;
21593 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21594 return false;
21595 }
21596 return true;
21597 };
21598
21599 bool HaveVectorizedPhiNodes = false;
21600 do {
21601 // Collect the incoming values from the PHIs.
21602 Incoming.clear();
21603 for (Instruction &I : *BB) {
21604 auto *P = dyn_cast<PHINode>(&I);
21605 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
21606 break;
21607
21608 // No need to analyze deleted, vectorized and non-vectorizable
21609 // instructions.
21610 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21611 isValidElementType(P->getType()))
21612 Incoming.push_back(P);
21613 }
21614
21615 if (Incoming.size() <= 1)
21616 break;
21617
21618 // Find the corresponding non-phi nodes for better matching when trying to
21619 // build the tree.
21620 for (Value *V : Incoming) {
21621 SmallVectorImpl<Value *> &Opcodes =
21622 PHIToOpcodes.try_emplace(V).first->getSecond();
21623 if (!Opcodes.empty())
21624 continue;
21625 SmallVector<Value *, 4> Nodes(1, V);
21626 SmallPtrSet<Value *, 4> Visited;
21627 while (!Nodes.empty()) {
21628 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21629 if (!Visited.insert(PHI).second)
21630 continue;
21631 for (Value *V : PHI->incoming_values()) {
21632 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21633 Nodes.push_back(PHI1);
21634 continue;
21635 }
21636 Opcodes.emplace_back(V);
21637 }
21638 }
21639 }
21640
21641 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21642 Incoming, PHICompare, AreCompatiblePHIs,
21643 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21644 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21645 },
21646 /*MaxVFOnly=*/true, R);
21647 Changed |= HaveVectorizedPhiNodes;
21648 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
21649 auto *PHI = dyn_cast<PHINode>(P.first);
21650 return !PHI || R.isDeleted(PHI);
21651 }))
21652 PHIToOpcodes.clear();
21653 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
21654 } while (HaveVectorizedPhiNodes);
21655
21656 VisitedInstrs.clear();
21657
21658 InstSetVector PostProcessInserts;
21659 SmallSetVector<CmpInst *, 8> PostProcessCmps;
21660 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
21661 // also vectorizes `PostProcessCmps`.
21662 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21663 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21664 if (VectorizeCmps) {
21665 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21666 PostProcessCmps.clear();
21667 }
21668 PostProcessInserts.clear();
21669 return Changed;
21670 };
21671 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
21672 auto IsInPostProcessInstrs = [&](Instruction *I) {
21673 if (auto *Cmp = dyn_cast<CmpInst>(I))
21674 return PostProcessCmps.contains(Cmp);
21675 return isa<InsertElementInst, InsertValueInst>(I) &&
21676 PostProcessInserts.contains(I);
21677 };
21678 // Returns true if `I` is an instruction without users, like a terminator, a
21679 // store, or a function call with an ignored return value. Unused instructions
21680 // are identified based on the instruction type (except for CallInst and InvokeInst).
21681 auto HasNoUsers = [](Instruction *I) {
21682 return I->use_empty() &&
21683 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21684 };
21685 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
21686 // Skip instructions with a scalable type. The number of elements is unknown
21687 // at compile time for scalable types.
21688 if (isa<ScalableVectorType>(It->getType()))
21689 continue;
21690
21691 // Skip instructions marked for deletion.
21692 if (R.isDeleted(&*It))
21693 continue;
21694 // We may go through BB multiple times, so skip instructions we have already checked.
21695 if (!VisitedInstrs.insert(&*It).second) {
21696 if (HasNoUsers(&*It) &&
21697 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
21698 // We would like to start over since some instructions are deleted
21699 // and the iterator may become invalid.
21700 Changed = true;
21701 It = BB->begin();
21702 E = BB->end();
21703 }
21704 continue;
21705 }
21706
21707 if (isa<DbgInfoIntrinsic>(It))
21708 continue;
21709
21710 // Try to vectorize reductions that use PHINodes.
21711 if (PHINode *P = dyn_cast<PHINode>(It)) {
21712 // Check that the PHI is a reduction PHI.
21713 if (P->getNumIncomingValues() == 2) {
21714 // Try to match and vectorize a horizontal reduction.
21715 Instruction *Root = getReductionInstr(DT, P, BB, LI);
21716 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21717 Changed = true;
21718 It = BB->begin();
21719 E = BB->end();
21720 continue;
21721 }
21722 }
21723 // Try to vectorize the incoming values of the PHI, to catch reductions
21724 // that feed into PHIs.
21725 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
21726 // Skip if the incoming block is the current BB for now. Also, bypass
21727 // unreachable IR for efficiency and to avoid crashing.
21728 // TODO: Collect the skipped incoming values and try to vectorize them
21729 // after processing BB.
21730 if (BB == P->getIncomingBlock(I) ||
21731 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
21732 continue;
21733
21734 // Postponed instructions should not be vectorized here, delay their
21735 // vectorization.
21736 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21737 PI && !IsInPostProcessInstrs(PI)) {
21738 bool Res =
21739 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
21740 Changed |= Res;
21741 if (Res && R.isDeleted(P)) {
21742 It = BB->begin();
21743 E = BB->end();
21744 break;
21745 }
21746 }
21747 }
21748 continue;
21749 }
21750
21751 if (HasNoUsers(&*It)) {
21752 bool OpsChanged = false;
21753 auto *SI = dyn_cast<StoreInst>(It);
21754 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
21755 if (SI) {
21756 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
21757 // Try to vectorize chain in store, if this is the only store to the
21758 // address in the block.
21759 // TODO: This is just a temporary solution to save compile time. We need
21760 // to investigate if we can safely turn on slp-vectorize-hor-store
21761 // instead to allow lookup for reduction chains in all non-vectorized
21762 // stores (need to check side effects and compile time).
21763 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
21764 SI->getValueOperand()->hasOneUse();
21765 }
21766 if (TryToVectorizeRoot) {
21767 for (auto *V : It->operand_values()) {
21768 // Postponed instructions should not be vectorized here, delay their
21769 // vectorization.
21770 if (auto *VI = dyn_cast<Instruction>(V);
21771 VI && !IsInPostProcessInstrs(VI))
21772 // Try to match and vectorize a horizontal reduction.
21773 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21774 }
21775 }
21776 // Start vectorization of post-process list of instructions from the
21777 // top-tree instructions to try to vectorize as many instructions as
21778 // possible.
21779 OpsChanged |=
21780 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
21781 if (OpsChanged) {
21782 // We would like to start over since some instructions are deleted
21783 // and the iterator may become invalid.
21784 Changed = true;
21785 It = BB->begin();
21786 E = BB->end();
21787 continue;
21788 }
21789 }
21790
21791 if (isa<InsertElementInst, InsertValueInst>(It))
21792 PostProcessInserts.insert(&*It);
21793 else if (isa<CmpInst>(It))
21794 PostProcessCmps.insert(cast<CmpInst>(&*It));
21795 }
21796
21797 return Changed;
21798}
21799
21800bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
21801 auto Changed = false;
21802 for (auto &Entry : GEPs) {
21803 // If the getelementptr list has fewer than two elements, there's nothing
21804 // to do.
21805 if (Entry.second.size() < 2)
21806 continue;
21807
21808 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
21809 << Entry.second.size() << ".\n");
21810
21811 // Process the GEP list in chunks suitable for the target's supported
21812 // vector size. If a vector register can't hold 1 element, we are done. We
21813 // are trying to vectorize the index computations, so the maximum number of
21814 // elements is based on the size of the index expression, rather than the
21815 // size of the GEP itself (the target's pointer size).
21816 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
21817 return !R.isDeleted(GEP);
21818 });
21819 if (It == Entry.second.end())
21820 continue;
21821 unsigned MaxVecRegSize = R.getMaxVecRegSize();
21822 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
21823 if (MaxVecRegSize < EltSize)
21824 continue;
21825
21826 unsigned MaxElts = MaxVecRegSize / EltSize;
21827 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
21828 auto Len = std::min<unsigned>(BE - BI, MaxElts);
21829 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
21830
21831 // Initialize a set of candidate getelementptrs. Note that we use a
21832 // SetVector here to preserve program order. If the index computations
21833 // are vectorizable and begin with loads, we want to minimize the chance
21834 // of having to reorder them later.
21835 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
21836
21837 // Some of the candidates may have already been vectorized after we
21838 // initially collected them, or their index may have been optimized to a constant.
21839 // If so, they are marked as deleted, so remove them from the set of
21840 // candidates.
21841 Candidates.remove_if([&R](Value *I) {
21842 return R.isDeleted(cast<Instruction>(I)) ||
21843 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21844 });
21845
21846 // Remove from the set of candidates all pairs of getelementptrs with
21847 // constant differences. Such getelementptrs are likely not good
21848 // candidates for vectorization in a bottom-up phase since one can be
21849 // computed from the other. We also ensure all candidate getelementptr
21850 // indices are unique.
21851 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
21852 auto *GEPI = GEPList[I];
21853 if (!Candidates.count(GEPI))
21854 continue;
21855 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
21856 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
21857 auto *GEPJ = GEPList[J];
21858 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
21859 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21860 Candidates.remove(GEPI);
21861 Candidates.remove(GEPJ);
21862 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21863 Candidates.remove(GEPJ);
21864 }
21865 }
21866 }
21867
21868 // We break out of the above computation as soon as we know there are
21869 // fewer than two candidates remaining.
21870 if (Candidates.size() < 2)
21871 continue;
21872
21873 // Add the single, non-constant index of each candidate to the bundle. We
21874 // ensured the indices met these constraints when we originally collected
21875 // the getelementptrs.
21876 SmallVector<Value *, 16> Bundle(Candidates.size());
21877 auto BundleIndex = 0u;
21878 for (auto *V : Candidates) {
21879 auto *GEP = cast<GetElementPtrInst>(V);
21880 auto *GEPIdx = GEP->idx_begin()->get();
21881 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21882 Bundle[BundleIndex++] = GEPIdx;
21883 }
21884
21885 // Try and vectorize the indices. We are currently only interested in
21886 // gather-like cases of the form:
21887 //
21888 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
21889 //
21890 // where the loads of "a", the loads of "b", and the subtractions can be
21891 // performed in parallel. It's likely that detecting this pattern in a
21892 // bottom-up phase will be simpler and less costly than building a
21893 // full-blown top-down phase beginning at the consecutive loads.
21894 Changed |= tryToVectorizeList(Bundle, R);
21895 }
21896 }
21897 return Changed;
21898}
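For the g[a[0] - b[0]] + g[a[1] - b[1]] example in the comment above, the bundle contains the two index expressions a[0] - b[0] and a[1] - b[1]; tryToVectorizeList then attempts to turn the loads of a, the loads of b, and the subtractions into vector operations, which is exactly the gather-like case this routine targets.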
21899
21900bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
21901 bool Changed = false;
21902 // Sort by type, base pointer and value operand. Value operands must be
21903 // compatible (have the same opcode, same parent), otherwise it is
21904 // definitely not profitable to try to vectorize them.
21905 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
21906 if (V->getValueOperand()->getType()->getTypeID() <
21907 V2->getValueOperand()->getType()->getTypeID())
21908 return true;
21909 if (V->getValueOperand()->getType()->getTypeID() >
21910 V2->getValueOperand()->getType()->getTypeID())
21911 return false;
21912 if (V->getPointerOperandType()->getTypeID() <
21913 V2->getPointerOperandType()->getTypeID())
21914 return true;
21915 if (V->getPointerOperandType()->getTypeID() >
21916 V2->getPointerOperandType()->getTypeID())
21917 return false;
21918 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
21919 V2->getValueOperand()->getType()->getScalarSizeInBits())
21920 return true;
21921 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
21922 V2->getValueOperand()->getType()->getScalarSizeInBits())
21923 return false;
21924 // UndefValues are compatible with all other values.
21925 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
21926 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21927 DomTreeNodeBase<BasicBlock> *NodeI1 =
21928 DT->getNode(I1->getParent());
21929 DomTreeNodeBase<BasicBlock> *NodeI2 =
21930 DT->getNode(I2->getParent());
21931 assert(NodeI1 && "Should only process reachable instructions");
21932 assert(NodeI2 && "Should only process reachable instructions");
21933 assert((NodeI1 == NodeI2) ==
21934 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21935 "Different nodes should have different DFS numbers");
21936 if (NodeI1 != NodeI2)
21937 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21938 return I1->getOpcode() < I2->getOpcode();
21939 }
21940 return V->getValueOperand()->getValueID() <
21941 V2->getValueOperand()->getValueID();
21942 };
21943
21944 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
21945 if (V1 == V2)
21946 return true;
21947 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
21948 return false;
21949 if (V1->getPointerOperandType() != V2->getPointerOperandType())
21950 return false;
21951 // Undefs are compatible with any other value.
21952 if (isa<UndefValue>(V1->getValueOperand()) ||
21953 isa<UndefValue>(V2->getValueOperand()))
21954 return true;
21955 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
21956 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21957 if (I1->getParent() != I2->getParent())
21958 return false;
21959 return getSameOpcode({I1, I2}, *TLI).valid();
21960 }
21961 if (isa<Constant>(V1->getValueOperand()) &&
21962 isa<Constant>(V2->getValueOperand()))
21963 return true;
21964 return V1->getValueOperand()->getValueID() ==
21965 V2->getValueOperand()->getValueID();
21966 };
21967
21968 // Attempt to sort and vectorize each of the store-groups.
21969 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
21970 for (auto &Pair : Stores) {
21971 if (Pair.second.size() < 2)
21972 continue;
21973
21974 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
21975 << Pair.second.size() << ".\n");
21976
21977 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
21978 continue;
21979
21980 // Reverse the stores to do bottom-to-top analysis. This is important if there
21981 // are several stores to the same address: in that case we need to follow the
21982 // store order (reversed, to meet the memory dependencies).
21983 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
21984 Pair.second.rend());
21985 Changed |= tryToVectorizeSequence<StoreInst>(
21986 ReversedStores, StoreSorter, AreCompatibleStores,
21987 [&](ArrayRef<StoreInst *> Candidates, bool) {
21988 return vectorizeStores(Candidates, R, Attempted);
21989 },
21990 /*MaxVFOnly=*/false, R);
21991 }
21992 return Changed;
21993}
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:920
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:190
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Definition: HTTPClient.cpp:42
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instructions.
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define SV_NAME
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope exit.
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1640
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1397
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
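A minimal sketch (not taken from the pass) of how the APInt helpers above combine into a lane mask; buildDemandedLanes is a hypothetical name used only for illustration.
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;
// Hypothetical helper: describe which of 8 vector lanes are still demanded.
APInt buildDemandedLanes() {
  APInt Demanded = APInt::getAllOnes(8); // all 8 lanes demanded
  Demanded.clearBit(3);                  // lane 3 proven dead
  Demanded.setBits(0, 2);                // lanes [0, 2) explicitly demanded
  assert(!Demanded.isAllOnes() && !Demanded.isZero());
  return Demanded;
}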
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:429
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:190
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:177
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:231
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:207
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:171
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
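A short sketch, under the assumption of a plain integer bundle, showing how these ArrayRef accessors slice a scalar bundle; splitBundle is a hypothetical helper, not part of the vectorizer.
#include "llvm/ADT/ArrayRef.h"
using namespace llvm;
// Hypothetical helper: peel the first lane off a bundle and inspect the rest.
void splitBundle(ArrayRef<int> Bundle) {
  if (Bundle.empty())
    return;
  int FirstLane = Bundle.front();            // lane 0
  ArrayRef<int> Tail = Bundle.drop_front();  // lanes 1..N-1
  ArrayRef<int> Head = Bundle.take_front(2); // at most the first two lanes
  if (Bundle.size() >= 4) {
    ArrayRef<int> Mid = Bundle.slice(1, 2);  // lanes 1 and 2
    (void)Mid;
  }
  (void)FirstLane; (void)Tail; (void)Head;
}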
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
reverse_iterator rend()
Definition: BasicBlock.h:466
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:675
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
Definition: BasicBlock.h:239
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:1970
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1865
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
Definition: InstrTypes.h:1341
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on each one of them.
Definition: InstrTypes.h:2107
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:1964
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1199
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1277
unsigned arg_size() const
Definition: InstrTypes.h:1284
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1494
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:1961
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:444
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:980
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:825
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
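A tiny illustration of the predicate helpers above: swapping operands of (a < b) yields (b > a), while logically negating it yields (a >= b). This sketch only exercises the static forms of the helpers.
#include "llvm/IR/InstrTypes.h"
#include <cassert>
using namespace llvm;
void predicateAlgebra() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;
  assert(CmpInst::getSwappedPredicate(P) == CmpInst::ICMP_SGT);
  assert(CmpInst::getInversePredicate(P) == CmpInst::ICMP_SGE);
}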
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
Definition: CmpPredicate.h:22
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2307
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for the type of this constant.
Definition: Constants.h:157
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1472
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a multiple of 8.
Definition: DataLayout.h:434
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:878
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:87
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
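A small sketch of the try_emplace insert-or-find idiom with DenseMap; countScalars is a hypothetical helper used purely to demonstrate the API.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Value.h"
using namespace llvm;
// Hypothetical helper: count how many times each scalar occurs in a bundle.
DenseMap<Value *, unsigned> countScalars(ArrayRef<Value *> VL) {
  DenseMap<Value *, unsigned> Counts;
  for (Value *V : VL)
    ++Counts.try_emplace(V, 0).first->second; // insert-or-find in one probe
  return Counts;
}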
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
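A hedged sketch of a typical dominance query: a replacement value may only be substituted into a use if its defining block dominates that use. defDominatesUse is a hypothetical wrapper around the overload listed above.
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
bool defDominatesUse(const DominatorTree &DT, const Instruction *Def,
                     const Use &U) {
  return DT.dominates(Def->getParent(), U);
}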
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:132
Type * getReturnType() const
Definition: DerivedTypes.h:126
bool empty() const
Definition: Function.h:859
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2285
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1071
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2510
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:530
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1079
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2498
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2293
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1814
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:485
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1043
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2573
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition: IRBuilder.h:2185
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:330
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:239
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1873
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:866
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1760
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:879
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:890
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:505
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2403
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2434
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2151
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2277
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:871
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2532
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:490
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2448
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1670
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:188
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2301
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2224
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1833
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1613
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1403
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:596
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition: IRBuilder.h:2704
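A minimal codegen sketch, not the pass's actual emission path, showing how CreateInsertElement and CreateShuffleVector compose into a broadcast; broadcastScalar and the fixed factor of 4 are illustrative assumptions.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Broadcast a scalar into a 4-lane vector: insert into lane 0, then splat it
// with an all-zero shuffle mask.
Value *broadcastScalar(IRBuilderBase &Builder, Value *Scalar) {
  auto *VecTy = FixedVectorType::get(Scalar->getType(), 4);
  Value *Vec = PoisonValue::get(VecTy);
  Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt64(0));
  return Builder.CreateShuffleVector(Vec, PoisonValue::get(VecTy),
                                     SmallVector<int>(4, 0));
}
IRBuilderBase::CreateVectorSplat performs the same expansion in a single call; the explicit form is shown only to make the two steps visible.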
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:283
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:763
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
bool isBinaryOp() const
Definition: Instruction.h:279
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this', or nullptr if no such instruction exists.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:280
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
bool isSimple() const
Definition: Instructions.h:247
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: ArrayRef.h:310
T & front() const
front - Get the first element.
Definition: ArrayRef.h:366
iterator end() const
Definition: ArrayRef.h:360
iterator begin() const
Definition: ArrayRef.h:359
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:379
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:452
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
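A small sketch of iterating a phi's incoming edges; incomingFromOrNull is a hypothetical helper that returns nullptr for a non-predecessor block, whereas getIncomingValueForBlock expects the block to actually be an incoming block.
#include "llvm/IR/Instructions.h"
using namespace llvm;
Value *incomingFromOrNull(const PHINode &Phi, const BasicBlock *Pred) {
  for (unsigned I = 0, E = Phi.getNumIncomingValues(); I != E; ++I)
    if (Phi.getIncomingBlock(I) == Pred)
      return Phi.getIncomingValue(I);
  return nullptr;
}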
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
Definition: DerivedTypes.h:686
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
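A greatly simplified sketch of a SCEV-based consecutiveness test in the spirit of what the vectorizer relies on: two pointers are treated as adjacent when their SCEV difference is a constant equal to the element size. pointersAreConsecutive is a hypothetical helper; the real analysis also handles address spaces, wrapping and strides.
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;
bool pointersAreConsecutive(ScalarEvolution &SE, Value *PtrA, Value *PtrB,
                            uint64_t EltSizeInBytes) {
  const SCEV *Diff = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (const auto *C = dyn_cast<SCEVConstant>(Diff))
    return C->getAPInt() == EltSizeInBytes;
  return false;
}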
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
const value_type & front() const
Return the first element of the SetVector.
Definition: SetVector.h:143
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
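A few hand-written masks over a 4-element source and the predicates above that recognize them; classifyMasks is purely illustrative and the concrete mask values are assumptions for the example.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;
void classifyMasks() {
  SmallVector<int> Identity = {0, 1, 2, 3};
  SmallVector<int> Reverse = {3, 2, 1, 0};
  SmallVector<int> Splat = {0, 0, 0, 0};
  assert(ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4));
  assert(ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4));
  assert(ShuffleVectorInst::isZeroEltSplatMask(Splat, /*NumSrcElts=*/4));
  int Index = 0;
  SmallVector<int> Extract = {2, 3};
  assert(ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/4,
                                                   Index) &&
         Index == 2);
}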
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
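A short sketch of the find_first/find_next iteration idiom for SmallBitVector; visitSetLanes is a hypothetical helper shown only to demonstrate the traversal.
#include "llvm/ADT/SmallBitVector.h"
#include <cassert>
using namespace llvm;
unsigned visitSetLanes(const SmallBitVector &Lanes) {
  unsigned Visited = 0;
  for (int I = Lanes.find_first(); I != -1; I = Lanes.find_next(I))
    ++Visited; // lane I is set here
  assert(Visited == Lanes.count());
  return Visited;
}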
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
size_type size() const
Definition: SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better with strings.
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication based on element size.
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:968
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
Type * getPointerOperandType() const
Definition: Instructions.h:384
Value * getValueOperand()
Definition: Instructions.h:378
Value * getPointerOperand()
Definition: Instructions.h:381
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:51
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy>.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
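A greatly simplified version of the kind of query the cost model performs with the TargetTransformInfo hooks above: compare one vector add against VF scalar adds at reciprocal throughput. vectorAddLooksProfitable is a hypothetical helper, not the pass's actual cost computation.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
bool vectorAddLooksProfitable(const TargetTransformInfo &TTI, Type *ScalarTy,
                              unsigned VF) {
  auto *VecTy = FixedVectorType::get(ScalarTy, VF);
  constexpr auto CostKind = TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost VecCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
  InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind) * VF;
  return VecCost < ScalarCost;
}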
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:295
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition: User.h:115
op_iterator op_begin()
Definition: User.h:280
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
iterator_range< value_op_iterator > operand_values()
Definition: User.h:312
The Vector Function Database.
Definition: VectorUtils.h:31
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
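A hedged sketch combining the use-related queries above: redirect every use of a scalar to a replacement value while preserving its name. replaceScalar is a hypothetical helper and does not reflect the pass's actual extract/replace logic.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
using namespace llvm;
void replaceScalar(Instruction *Scalar, Value *Replacement) {
  if (Scalar->use_empty())
    return;                            // nothing references the scalar
  Replacement->takeName(Scalar);       // keep the old SSA name
  Scalar->replaceAllUsesWith(Replacement);
}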
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:187
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
bool erase(const ValueT &V)
Definition: DenseSet.h:97
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, Instruction *VL0, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen and returns its signedness; otherwise returns false.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds the external uses of the vectorized scalars, i.e. the vectorized scalars that have users outside of the vectorized tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed as a possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users in UserIgnoreLst for the purpose of scheduling and extraction.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty order.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
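The entries above (buildTree, buildExternalUses, computeMinimumValueSizes, getTreeCost, vectorizeTree) are normally driven in a fixed sequence. A condensed sketch of that flow follows; the helper name trySLP and the Chain parameter are hypothetical, the exact interleaving of checks and the SLPCostThreshold comparison follow the current in-tree pass and may differ between versions.
// Condensed driver sketch; R is a BoUpSLP, Chain a group of isomorphic scalars.
static bool trySLP(BoUpSLP &R, ArrayRef<Value *> Chain) {
  R.buildTree(Chain);            // an overload also takes a user-ignore list
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();        // profitable root-to-leaf order
  R.reorderBottomToTop();        // profitable leaf-to-root order
  R.transformNodes();            // target-specific node rewrites
  R.buildExternalUses();         // record scalars needing extraction
  R.computeMinimumValueSizes();  // min-bitwidth analysis
  InstructionCost Cost = R.getTreeCost();
  if (Cost >= -SLPCostThreshold) // vectorize only on a modeled gain
    return false;
  R.vectorizeTree();
  return true;
}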
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:106
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignores it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
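The matcher entries above compose into readable shape checks. Two illustrative helpers, written as they would appear in this file (which already pulls in llvm::PatternMatch); the helper names are hypothetical.
// Matches the select-of-icmp form of a signed max, e.g.
//   %c = icmp sgt i32 %a, %b
//   %m = select i1 %c, i32 %a, i32 %b
static bool matchSMaxIdiom(Value *V, Value *&A, Value *&B) {
  return match(V, m_SMax(m_Value(A), m_Value(B)));
}

// Matches a single-use zext of a load, the shape the load-combine
// heuristics above look through.
static bool matchOneUseZExtOfLoad(Value *V) {
  return match(V, m_OneUse(m_ZExt(m_Load(m_Value()))));
}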
@ GS
Definition: X86.h:210
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
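A sketch of the consecutive-access check this helper enables (the helper name is hypothetical; DL and SE are the analyses the pass already holds).
// Returns true when L1 loads the element immediately after L0.
static bool areConsecutiveLoads(LoadInst *L0, LoadInst *L1,
                                const DataLayout &DL, ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(L0->getType(), L0->getPointerOperand(),
                      L1->getType(), L1->getPointerOperand(), DL, SE,
                      /*StrictCheck=*/true);
  return Diff && *Diff == 1; // distance is measured in elements
}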
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition: LoopUtils.cpp:1278
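A minimal sketch of emitting a horizontal integer add reduction with this helper; the wrapper name is hypothetical and the builder must already be positioned at the desired insertion point.
static Value *emitAddReduction(IRBuilderBase &Builder, Value *Vec) {
  // For RecurKind::Add this lowers to a call to llvm.vector.reduce.add.
  return createSimpleReduction(Builder, Vec, RecurKind::Add);
}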
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:854
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:136
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:546
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
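Two small usage sketches for the range helpers listed here and above (all_of, enumerate); the helper names and the operand list are illustrative.
// Every scalar is an instruction with the same opcode as the first one.
static bool allSameOpcode(ArrayRef<Value *> Scalars) {
  auto *I0 = dyn_cast<Instruction>(Scalars.front());
  return I0 && all_of(Scalars, [&](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && I->getOpcode() == I0->getOpcode();
  });
}

// Pair each scalar with its lane index for debug printing.
static void dumpLanes(ArrayRef<Value *> Scalars, raw_ostream &OS) {
  for (auto [Lane, V] : enumerate(Scalars))
    OS << "lane " << Lane << ": " << *V << "\n";
}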
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7297
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1683
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition: STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:555
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1785
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
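Worked values for the rounding helpers above and the related bit_ceil / bit_floor / Log2_32 entries in this index; the wrapper names are illustrative.
// PowerOf2Ceil(6) == 8, bit_ceil(6u) == 8, bit_floor(6u) == 4, Log2_32(8) == 3.
static unsigned roundVFUp(unsigned NumElts)   { return llvm::bit_ceil(NumElts); }
static unsigned roundVFDown(unsigned NumElts) { return llvm::bit_floor(NumElts); }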
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:406
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
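A worked example of the inverse-permutation construction: treating Indices as a permutation, the resulting mask satisfies Mask[Indices[I]] = I, so Indices = {2, 0, 1} yields Mask = {1, 2, 0}. A standalone restatement of that construction (the in-file helper itself is static, and the name below is hypothetical):
static SmallVector<int> invertOrder(ArrayRef<unsigned> Indices) {
  SmallVector<int> Mask(Indices.size(), PoisonMaskElem);
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    Mask[Indices[I]] = static_cast<int>(I); // e.g. {2, 0, 1} -> {1, 2, 0}
  return Mask;
}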
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
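Worked values for the two mask builders above, plus how such a mask is typically consumed; Builder and WideVec are placeholders, and the expansions are written out by hand under the usual semantics of these helpers.
// createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4)   -> {0, 2, 4, 6}
// createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/2) -> {0, 0, 0, 1, 1, 1}
SmallVector<int, 16> Mask = createStrideMask(0, 2, 4);
Value *EvenElts = Builder.CreateShuffleVector(WideVec, Mask); // de-interleave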
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1771
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:255
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1368
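A usage sketch: after emitting a vectorized binary op, the wrap/fast-math flags of all scalar operations are intersected into it, so the vector op only keeps a flag that every scalar had; VecOp and Scalars are placeholders.
// If one scalar add lacks nsw, the vectorized add ends up without nsw.
propagateIRFlags(VecOp, Scalars);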
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1938
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2014
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then clean up.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
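A sketch of the sign-bit-based narrowing rule used by the minimum-bit-width analysis: a value of width BW with S known sign bits fits in BW - S + 1 bits (for example, an i32 that is the sext of an i8 has at least 25 sign bits, so 8 bits suffice). The helper name is hypothetical.
static unsigned minSignedBitsNeeded(Value *V, const DataLayout &DL) {
  unsigned BW = V->getType()->getScalarSizeInBits();
  return BW - ComputeNumSignBits(V, DL) + 1;
}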
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
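A usage sketch for hash_combine, together with the hash_combine_range entry further down, in the style this file hashes instruction shapes and shuffle masks; both helper names are hypothetical.
static hash_code hashInstShape(const Instruction *I) {
  // Combine opcode, arity and result type into one stable hash.
  return hash_combine(I->getOpcode(), I->getNumOperands(), I->getType());
}

static hash_code hashMask(ArrayRef<int> Mask) {
  return hash_combine_range(Mask.begin(), Mask.end());
}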
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2128
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
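Because GraphTraits and DOTGraphTraits are specialized for BoUpSLP (the entries above), the generic GraphWriter machinery can render the SLP graph; a sketch of the kind of call that sits behind the tree-viewing debug option, with R a fully built BoUpSLP.
// Renders the SLP graph via 'dot' and opens it in a viewer.
ViewGraph(&R, "slp-tree", /*ShortNames=*/false, "SLP graph");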
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:215
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
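A sketch of querying a vector variant of a library call for a fixed VF, using VFShape::get here together with the getVectorizedFunction entry further up; CI is a hypothetical CallInst and a null result means no vector mapping exists.
VFShape Shape = VFShape::get(CI->getFunctionType(),
                             ElementCount::getFixed(/*VF=*/4),
                             /*HasGlobalPred=*/false);
Function *VecVariant = VFDatabase(*CI).getVectorizedFunction(Shape);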
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1467
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1476
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.