LLVM 23.0.0git
SLPVectorizer.cpp
Go to the documentation of this file.
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <map>
99#include <memory>
100#include <optional>
101#include <set>
102#include <string>
103#include <tuple>
104#include <utility>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108using namespace slpvectorizer;
109using namespace std::placeholders;
110
111#define SV_NAME "slp-vectorizer"
112#define DEBUG_TYPE "SLP"
113
114STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115STATISTIC(NumStridedStoreChains, "Number of vectorized stride stores");
116STATISTIC(NumStoreChains, "Number of vector stores created");
117STATISTIC(NumVectorizedStores, "Number of vectorized stores");
118
119DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
120 "Controls which SLP graphs should be vectorized.");
121
122static cl::opt<bool>
123 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
124 cl::desc("Run the SLP vectorization passes"));
125
126static cl::opt<bool>
127 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
128 cl::desc("Enable vectorization for wider vector utilization"));
129
130static cl::opt<int>
132 cl::desc("Only vectorize if you gain more than this "
133 "number "));
134
135static cl::opt<bool>
136ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
137 cl::desc("Attempt to vectorize horizontal reductions"));
138
140 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
141 cl::desc(
142 "Attempt to vectorize horizontal reductions feeding into a store"));
143
145 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
146 cl::desc("Improve the code quality by splitting alternate instructions"));
147
149 "slp-inst-count-check", cl::init(true), cl::Hidden,
150 cl::desc("Reject vectorization if vector instruction count exceeds "
151 "scalar instruction count"));
152
153static cl::opt<int>
155 cl::desc("Attempt to vectorize for this register size in bits"));
156
159 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
160
161/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
163/// instructions are spread over a wide range.
164/// This limit is way higher than needed by real-world functions.
165static cl::opt<int>
166ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
167 cl::desc("Limit the size of the SLP scheduling region per block"));
168
170 "slp-min-reg-size", cl::init(128), cl::Hidden,
171 cl::desc("Attempt to vectorize for this register size in bits"));
172
174 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
175 cl::desc("Limit the recursion depth when building a vectorizable tree"));
176
178 "slp-min-tree-size", cl::init(3), cl::Hidden,
179 cl::desc("Only vectorize small trees if they are fully vectorizable"));
180
181// The maximum depth that the look-ahead score heuristic will explore.
182// The higher this value, the higher the compilation time overhead.
184 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
185 cl::desc("The maximum look-ahead depth for operand reordering scores"));
186
// The maximum depth that the look-ahead score heuristic will explore
// when probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering, this one is less frequently used,
// hence the impact of a higher value is less noticeable.
193 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
194 cl::desc("The maximum look-ahead depth for searching best rooting option"));
195
197 "slp-min-strided-loads", cl::init(2), cl::Hidden,
198 cl::desc("The minimum number of loads, which should be considered strided, "
199 "if the stride is > 1 or is runtime value"));
200
202 "slp-min-strided-stores", cl::init(2), cl::Hidden,
203 cl::desc(
204 "The minimum number of stores, which should be considered strided, "
205 "if the stride is > 1 or is runtime value"));
206
208 "slp-max-stride", cl::init(8), cl::Hidden,
209 cl::desc("The maximum stride, considered to be profitable."));
210
211static cl::opt<bool>
212 EnableStridedStores("slp-enable-strided-stores", cl::init(false),
214 cl::desc("Enable SLP trees to be built from strided "
215 "store chains."));
216
217static cl::opt<bool>
218 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
219 cl::desc("Disable tree reordering even if it is "
220 "profitable. Used for testing only."));
221
222static cl::opt<bool>
223 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
224 cl::desc("Generate strided loads even if they are not "
225 "profitable. Used for testing only."));
226
227static cl::opt<bool>
228 ViewSLPTree("view-slp-tree", cl::Hidden,
229 cl::desc("Display the SLP trees with Graphviz"));
230
232 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
233 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
234
236 "slp-postprocess-stores-operands", cl::init(false), cl::Hidden,
237 cl::desc("Force vectorization of non-vectorizable stores operands."));
238
240 "slp-non-vectorizables-as-reductions", cl::init(false), cl::Hidden,
241 cl::desc(
242 "Use non-vectorizable instructions as potential reduction roots."));
243
244/// True when \p slp-vectorize-non-power-of-2 is enabled and \p NumElts is a
245/// supported non-power-of-2 width: \p NumElts + 1 must be a power of two
246/// (e.g. 3 or 7 lanes, i.e. almost a full power-of-2 register).
247static bool isAllowedNonPowerOf2VF(unsigned NumElts) {
248 return VectorizeNonPowerOf2 && has_single_bit(NumElts + 1);
249}
250
251/// Enables vectorization of copyable elements.
253 "slp-copyable-elements", cl::init(true), cl::Hidden,
254 cl::desc("Try to replace values with the idempotent instructions for "
255 "better vectorization."));
256
258 "slp-cost-loop-trip-count", cl::init(2), cl::Hidden,
259 cl::desc("Loop trip count, considered by the cost model during "
260 "modeling (0=loops are ignored and considered flat code)"));
261
262/// Refine the loop-aware cost scaling of gather/buildvector tree entries by
263/// using the per-lane execution scale of the operand that feeds each lane,
264/// instead of a single whole-entry scale. This matches the LICM hoisting
265/// performed by optimizeGatherSequence() at codegen time: lanes whose
266/// operands are loop-invariant in an inner loop contribute the outer loop's
267/// execution scale rather than the inner loop's, which avoids over-costing
268/// buildvectors that bridge values from outer loop nests into an inner loop.
270 "slp-per-lane-gather-scale", cl::init(true), cl::Hidden,
271 cl::desc("Use per-lane execution scale for gather/buildvector tree "
272 "entries to model LICM-hoistable buildvector sequences."));
273
274// Limit the number of alias checks. The limit is chosen so that
275// it has no negative effect on the llvm benchmarks.
276static const unsigned AliasedCheckLimit = 10;
277
// Limit on the number of uses for potentially transformed instructions/values,
// used in checks to avoid a compile-time explosion.
280static constexpr int UsesLimit = 64;
281
282// Another limit for the alias checks: The maximum distance between load/store
283// instructions where alias checks are done.
284// This limit is useful for very large basic blocks.
285static const unsigned MaxMemDepDistance = 160;
286
287/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
288/// regions to be handled.
289static const int MinScheduleRegionSize = 16;
290
291/// Maximum allowed number of operands in the PHI nodes.
292static const unsigned MaxPHINumOperands = 128;
293
294/// Predicate for the element types that the SLP vectorizer supports.
295///
/// The most important things to filter here are types which are invalid in
/// LLVM vectors. We also filter target specific types which have absolutely no
298/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
299/// avoids spending time checking the cost model and realizing that they will
300/// be inevitably scalarized.
301static bool isValidElementType(Type *Ty) {
302 // TODO: Support ScalableVectorType.
304 Ty = Ty->getScalarType();
305 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
306 !Ty->isPPC_FP128Ty();
307}
308
309/// Returns the "element type" of the given value/instruction \p V.
310/// For stores, returns the stored value type; for insertelement (when ReVec is
311/// off), the inserted operand type. For compares, the default is to return the
312/// result type (i1); when \p LookThroughCmp is true, returns the type of the
313/// compared operands instead, which is needed for vector width calculations
314/// (the width is determined by the operand type, not the i1 result).
315static Type *getValueType(Value *V, bool LookThroughCmp = false) {
316 if (auto *SI = dyn_cast<StoreInst>(V))
317 return SI->getValueOperand()->getType();
318 if (LookThroughCmp)
319 if (auto *CI = dyn_cast<CmpInst>(V))
320 return CI->getOperand(0)->getType();
321 if (!SLPReVec)
322 if (auto *IE = dyn_cast<InsertElementInst>(V))
323 return IE->getOperand(1)->getType();
324 return V->getType();
325}
326
327/// \returns the number of elements for Ty.
328static unsigned getNumElements(Type *Ty) {
330 "ScalableVectorType is not supported.");
331 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
332 return VecTy->getNumElements();
333 return 1;
334}
335
336/// \returns the vector type of ScalarTy based on vectorization factor.
337static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
338 return FixedVectorType::get(ScalarTy->getScalarType(),
339 VF * getNumElements(ScalarTy));
340}
341
/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
/// forming a vector type that \p TTI splits into whole vector types during
/// legalization.
346 Type *Ty, unsigned Sz) {
347 if (!isValidElementType(Ty))
348 return bit_ceil(Sz);
349 // Find the number of elements, which forms full vectors.
350 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
351 if (NumParts == 0 || NumParts >= Sz)
352 return bit_ceil(Sz);
353 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
354}
355
/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, forming a vector type that \p TTI splits into whole vector types during
/// legalization.
359static unsigned
361 unsigned Sz) {
362 if (!isValidElementType(Ty))
363 return bit_floor(Sz);
364 // Find the number of elements, which forms full vectors.
365 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
366 if (NumParts == 0 || NumParts >= Sz)
367 return bit_floor(Sz);
368 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
369 if (RegVF > Sz)
370 return bit_floor(Sz);
371 return (Sz / RegVF) * RegVF;
372}
373
374static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
375 SmallVectorImpl<int> &Mask) {
376 // The ShuffleBuilder implementation use shufflevector to splat an "element".
377 // But the element have different meaning for SLP (scalar) and REVEC
378 // (vector). We need to expand Mask into masks which shufflevector can use
379 // directly.
380 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
381 for (unsigned I : seq<unsigned>(Mask.size()))
382 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
383 I * VecTyNumElements, VecTyNumElements)))
384 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
385 : Mask[I] * VecTyNumElements + J;
386 Mask.swap(NewMask);
387}
388
/// \returns the number of groups of shufflevectors.
/// A group has the following features:
/// 1. All of the values in a group are shufflevectors.
/// 2. The mask of every shufflevector is an isExtractSubvectorMask.
/// 3. The masks of all shufflevectors together use all of the elements of the
///    source.
394/// e.g., it is 1 group (%0)
395/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
396/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
397/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
398/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
399/// it is 2 groups (%3 and %4)
400/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
401/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
402/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
403/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
404/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
405/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
406/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
407/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
408/// it is 0 group
409/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
410/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
411/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
412/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
414 if (VL.empty())
415 return 0;
417 return 0;
418 auto *SV = cast<ShuffleVectorInst>(VL.front());
419 unsigned SVNumElements =
420 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
421 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
422 if (SVNumElements % ShuffleMaskSize != 0)
423 return 0;
424 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
425 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
426 return 0;
427 unsigned NumGroup = 0;
428 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
429 auto *SV = cast<ShuffleVectorInst>(VL[I]);
430 Value *Src = SV->getOperand(0);
431 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
432 SmallBitVector ExpectedIndex(GroupSize);
433 if (!all_of(Group, [&](Value *V) {
434 auto *SV = cast<ShuffleVectorInst>(V);
435 // From the same source.
436 if (SV->getOperand(0) != Src)
437 return false;
438 int Index;
439 if (!SV->isExtractSubvectorMask(Index))
440 return false;
441 ExpectedIndex.set(Index / ShuffleMaskSize);
442 return true;
443 }))
444 return 0;
445 if (!ExpectedIndex.all())
446 return 0;
447 ++NumGroup;
448 }
449 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
450 return NumGroup;
451}
452
453/// \returns a shufflevector mask which is used to vectorize shufflevectors
454/// e.g.,
455/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
456/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
457/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
458/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
459/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
460/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
461/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
462/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
463/// the result is
464/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
466 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
467 auto *SV = cast<ShuffleVectorInst>(VL.front());
468 unsigned SVNumElements =
469 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
470 SmallVector<int> Mask;
471 unsigned AccumulateLength = 0;
472 for (Value *V : VL) {
473 auto *SV = cast<ShuffleVectorInst>(V);
474 for (int M : SV->getShuffleMask())
475 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
476 : AccumulateLength + M);
477 AccumulateLength += SVNumElements;
478 }
479 return Mask;
480}
481
482/// \returns True if the value is a constant (but not globals/constant
483/// expressions).
484static bool isConstant(Value *V) {
486}
487
488/// Checks if \p V is one of vector-like instructions, i.e. undef,
489/// insertelement/extractelement with constant indices for fixed vector type or
490/// extractvalue instruction.
494 return false;
495 auto *I = dyn_cast<Instruction>(V);
496 if (!I || isa<ExtractValueInst>(I))
497 return true;
498 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
499 return false;
501 return isConstant(I->getOperand(1));
502 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
503 return isConstant(I->getOperand(2));
504}
505
506/// Returns power-of-2 number of elements in a single register (part), given the
507/// total number of elements \p Size and number of registers (parts) \p
508/// NumParts.
509static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
510 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
511}
512
/// Computes how many elements belong to register (part) number \p Part, given
/// \p Size elements in total and \p PartNumElems elements per full register.
/// The result is \p PartNumElems for full parts and the leftover count for the
/// trailing part.
/// Note: the leftover is computed with unsigned arithmetic, so if
/// \p Part * \p PartNumElems exceeds \p Size the subtraction wraps around and
/// \p PartNumElems is returned.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  const unsigned Remaining = Size - Part * PartNumElems;
  return Remaining < PartNumElems ? Remaining : PartNumElems;
}
520
521#if !defined(NDEBUG)
522/// Print a short descriptor of the instruction bundle suitable for debug output.
523static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
524 std::string Result;
525 raw_string_ostream OS(Result);
526 if (Idx >= 0)
527 OS << "Idx: " << Idx << ", ";
528 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
529 return Result;
530}
531#endif
532
533/// \returns true if all of the instructions in \p VL are in the same block or
534/// false otherwise.
536 auto *It = find_if(VL, IsaPred<Instruction>);
537 if (It == VL.end())
538 return false;
541 return true;
542
543 BasicBlock *BB = I0->getParent();
544 for (Value *V : iterator_range(It, VL.end())) {
545 if (isa<PoisonValue>(V))
546 continue;
547 auto *II = dyn_cast<Instruction>(V);
548 if (!II)
549 return false;
550
551 if (BB != II->getParent())
552 return false;
553 }
554 return true;
555}
556
557/// \returns True if all of the values in \p VL are constants (but not
558/// globals/constant expressions).
560 // Constant expressions and globals can't be vectorized like normal integer/FP
561 // constants.
562 return all_of(VL, isConstant);
563}
564
565/// \returns True if all of the values in \p VL are identical or some of them
566/// are UndefValue.
567static bool isSplat(ArrayRef<Value *> VL) {
568 Value *FirstNonUndef = nullptr;
569 for (Value *V : VL) {
570 if (isa<UndefValue>(V))
571 continue;
572 if (!FirstNonUndef) {
573 FirstNonUndef = V;
574 continue;
575 }
576 if (V != FirstNonUndef)
577 return false;
578 }
579 return FirstNonUndef != nullptr;
580}
581
582/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
583/// For BinaryOperator, it also checks if \p InstWithUses is used in specific
584/// patterns that make it effectively commutative (like equality comparisons
585/// with zero).
586/// In most cases, users should not call this function directly (since \p I and
587/// \p InstWithUses are the same). However, when analyzing interchangeable
588/// instructions, we need to use the converted opcode along with the original
589/// uses.
590/// \param I The instruction to check for commutativity
591/// \param ValWithUses The value whose uses are analyzed for special
592/// patterns
593static bool isCommutative(Instruction *I, Value *ValWithUses,
594 bool IsCopyable = false) {
595 if (auto *Cmp = dyn_cast<CmpInst>(I))
596 return Cmp->isCommutative();
597 if (auto *BO = dyn_cast<BinaryOperator>(I))
598 return BO->isCommutative() ||
599 (BO->getOpcode() == Instruction::Sub &&
600 ValWithUses->hasUseList() &&
601 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
602 all_of(
603 ValWithUses->uses(),
604 [&](const Use &U) {
605 // Commutative, if icmp eq/ne sub, 0
606 CmpPredicate Pred;
607 if (match(U.getUser(),
608 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
609 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
610 return true;
611 // Commutative, if abs(sub nsw, true) or abs(sub, false).
612 ConstantInt *Flag;
613 auto *I = dyn_cast<BinaryOperator>(U.get());
614 return match(U.getUser(),
615 m_Intrinsic<Intrinsic::abs>(
616 m_Specific(U.get()), m_ConstantInt(Flag))) &&
617 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
618 Flag->isOne());
619 })) ||
620 (BO->getOpcode() == Instruction::FSub &&
621 ValWithUses->hasUseList() &&
622 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
623 all_of(ValWithUses->uses(), [](const Use &U) {
624 return match(U.getUser(),
625 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
626 }));
627 return I->isCommutative();
628}
629
630/// Checks if the operand is commutative. In commutative operations, not all
631/// operands might commutable, e.g. for fmuladd only 2 first operands are
632/// commutable.
633static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
634 bool IsCopyable = false) {
635 assert(::isCommutative(I, ValWithUses, IsCopyable) &&
636 "The instruction is not commutative.");
637 if (isa<CmpInst>(I))
638 return true;
639 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
640 switch (BO->getOpcode()) {
641 case Instruction::Sub:
642 case Instruction::FSub:
643 return true;
644 default:
645 break;
646 }
647 }
648 return I->isCommutableOperand(Op);
649}
650
651/// This is a helper function to check whether \p I is commutative.
652/// This is a convenience wrapper that calls the two-parameter version of
653/// isCommutative with the same instruction for both parameters. This is
654/// the common case where the instruction being checked for commutativity
655/// is the same as the instruction whose uses are analyzed for special
656/// patterns (see the two-parameter version above for details).
657/// \param I The instruction to check for commutativity
658/// \returns true if the instruction is commutative, false otherwise
659static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
660
661/// \returns number of operands of \p I, considering commutativity. Returns 2
662/// for commutative intrinsics.
663/// \param I The instruction to check for commutativity
666 // IntrinsicInst::isCommutative returns true if swapping the first "two"
667 // arguments to the intrinsic produces the same result.
668 constexpr unsigned IntrinsicNumOperands = 2;
669 return IntrinsicNumOperands;
670 }
671 return I->getNumOperands();
672}
673
674template <typename T>
675static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
676 unsigned Offset) {
677 static_assert(std::is_same_v<T, InsertElementInst> ||
678 std::is_same_v<T, ExtractElementInst>,
679 "unsupported T");
680 int Index = Offset;
681 if (const auto *IE = dyn_cast<T>(Inst)) {
682 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
683 if (!VT)
684 return std::nullopt;
685 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
686 if (!CI)
687 return std::nullopt;
688 if (CI->getValue().uge(VT->getNumElements()))
689 return std::nullopt;
690 Index *= VT->getNumElements();
691 Index += CI->getZExtValue();
692 return Index;
693 }
694 return std::nullopt;
695}
696
697/// \returns inserting or extracting index of InsertElement, ExtractElement or
698/// InsertValue instruction, using Offset as base offset for index.
699/// \returns std::nullopt if the index is not an immediate.
700static std::optional<unsigned> getElementIndex(const Value *Inst,
701 unsigned Offset = 0) {
702 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
703 return Index;
705 return Index;
706
707 int Index = Offset;
708
709 const auto *IV = dyn_cast<InsertValueInst>(Inst);
710 if (!IV)
711 return std::nullopt;
712
713 Type *CurrentType = IV->getType();
714 for (unsigned I : IV->indices()) {
715 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
716 Index *= ST->getNumElements();
717 CurrentType = ST->getElementType(I);
718 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
719 Index *= AT->getNumElements();
720 CurrentType = AT->getElementType();
721 } else {
722 return std::nullopt;
723 }
724 Index += I;
725 }
726 return Index;
727}
728
729/// \returns true if all of the values in \p VL use the same opcode.
730/// For comparison instructions, also checks if predicates match.
731/// PoisonValues are considered matching.
732/// Interchangeable instructions are not considered.
734 auto *It = find_if(VL, IsaPred<Instruction>);
735 if (It == VL.end())
736 return true;
737 Instruction *MainOp = cast<Instruction>(*It);
738 unsigned Opcode = MainOp->getOpcode();
739 bool IsCmpOp = isa<CmpInst>(MainOp);
740 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
742 return std::all_of(It, VL.end(), [&](Value *V) {
743 if (auto *CI = dyn_cast<CmpInst>(V))
744 return BasePred == CI->getPredicate();
745 if (auto *I = dyn_cast<Instruction>(V))
746 return I->getOpcode() == Opcode;
747 return isa<PoisonValue>(V);
748 });
749}
750
namespace {
/// Selects how a shuffle mask should be analyzed for undefs/poisonous
/// elements.
enum class UseMask {
  /// The mask is expected to be for a permutation of 1-2 vectors; check the
  /// mask elements that refer to the first argument (mask indices are in
  /// range [0:VF)).
  FirstArg,
  /// The mask is expected to be for a permutation of 2 vectors; check the
  /// mask elements that refer to the second argument (mask indices are in
  /// range [VF:2*VF)).
  SecondArg,
  /// Consider undef mask elements (-1) as placeholders for future shuffle
  /// elements and mark them as ones being used in the future. Non-undef
  /// elements are considered unused since they're already marked as used in
  /// the mask.
  UndefsAsMask
};
} // namespace
767
768/// Prepares a use bitset for the given mask either for the first argument or
769/// for the second.
771 UseMask MaskArg) {
772 SmallBitVector UseMask(VF, true);
773 for (auto [Idx, Value] : enumerate(Mask)) {
774 if (Value == PoisonMaskElem) {
775 if (MaskArg == UseMask::UndefsAsMask)
776 UseMask.reset(Idx);
777 continue;
778 }
779 if (MaskArg == UseMask::FirstArg && Value < VF)
780 UseMask.reset(Value);
781 else if (MaskArg == UseMask::SecondArg && Value >= VF)
782 UseMask.reset(Value - VF);
783 }
784 return UseMask;
785}
786
787/// Checks if the given value is actually an undefined constant vector.
788/// Also, if the \p UseMask is not empty, tries to check if the non-masked
789/// elements actually mask the insertelement buildvector, if any.
790template <bool IsPoisonOnly = false>
792 const SmallBitVector &UseMask = {}) {
793 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
794 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
795 if (isa<T>(V))
796 return Res;
797 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
798 if (!VecTy)
799 return Res.reset();
800 auto *C = dyn_cast<Constant>(V);
801 if (!C) {
802 if (!UseMask.empty()) {
803 const Value *Base = V;
804 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
805 Base = II->getOperand(0);
806 if (isa<T>(II->getOperand(1)))
807 continue;
808 std::optional<unsigned> Idx = getElementIndex(II);
809 if (!Idx) {
810 Res.reset();
811 return Res;
812 }
813 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
814 Res.reset(*Idx);
815 }
816 // TODO: Add analysis for shuffles here too.
817 if (V == Base) {
818 Res.reset();
819 } else {
820 SmallBitVector SubMask(UseMask.size(), false);
821 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
822 }
823 } else {
824 Res.reset();
825 }
826 return Res;
827 }
828 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
829 if (Constant *Elem = C->getAggregateElement(I))
830 if (!isa<T>(Elem) &&
831 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
832 Res.reset(I);
833 }
834 return Res;
835}
836
837/// Checks if the vector of instructions can be represented as a shuffle, like:
838/// %x0 = extractelement <4 x i8> %x, i32 0
839/// %x3 = extractelement <4 x i8> %x, i32 3
840/// %y1 = extractelement <4 x i8> %y, i32 1
841/// %y2 = extractelement <4 x i8> %y, i32 2
842/// %x0x0 = mul i8 %x0, %x0
843/// %x3x3 = mul i8 %x3, %x3
844/// %y1y1 = mul i8 %y1, %y1
845/// %y2y2 = mul i8 %y2, %y2
846/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
847/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
848/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
849/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
850/// ret <4 x i8> %ins4
851/// can be transformed into:
852/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
853/// i32 6>
854/// %2 = mul <4 x i8> %1, %1
855/// ret <4 x i8> %2
856/// Mask will return the Shuffle Mask equivalent to the extracted elements.
857/// TODO: Can we split off and reuse the shuffle mask detection from
858/// ShuffleVectorInst/getShuffleCost?
// NOTE(review): the line holding the function name and its leading parameters
// is absent from this listing; the body below uses `VL`, `Mask` and `AC`.
859static std::optional<TargetTransformInfo::ShuffleKind>
861 AssumptionCache *AC) {
 // Nothing to analyze if VL holds no extractelement instruction at all.
862 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
863 if (It == VL.end())
864 return std::nullopt;
 // Size is the largest element count among the fixed-width vector operands of
 // the extractelement instructions in VL; other values leave it unchanged.
865 unsigned Size = accumulate(VL, 0u, [](unsigned S, Value *V) {
866 auto *EI = dyn_cast<ExtractElementInst>(V);
867 if (!EI)
868 return S;
869 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
870 if (!VTy)
871 return S;
872 return std::max(S, VTy->getNumElements());
873 });
874
 // The (at most two) distinct source vectors seen so far.
875 Value *Vec1 = nullptr;
876 Value *Vec2 = nullptr;
 // True if some extractelement reads from a vector that is not an UndefValue
 // and is guaranteed not to be poison.
877 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
878 auto *EE = dyn_cast<ExtractElementInst>(V);
879 if (!EE)
880 return false;
881 Value *Vec = EE->getVectorOperand();
882 if (isa<UndefValue>(Vec))
883 return false;
884 return isGuaranteedNotToBePoison(Vec, AC);
885 });
886 enum ShuffleMode { Unknown, Select, Permute };
887 ShuffleMode CommonShuffleMode = Unknown;
 // Every lane starts as poison; lanes skipped below keep that value.
888 Mask.assign(VL.size(), PoisonMaskElem);
889 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
890 // Undef can be represented as an undef element in a vector.
891 if (isa<UndefValue>(VL[I]))
892 continue;
 // Any remaining element must be an extractelement (cast<> asserts otherwise).
893 auto *EI = cast<ExtractElementInst>(VL[I]);
894 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
895 return std::nullopt;
896 auto *Vec = EI->getVectorOperand();
897 // We can extractelement from undef or poison vector.
 // NOTE(review): the condition guarding the following `continue` is absent
 // from this listing.
899 continue;
900 // All vector operands must have the same number of vector elements.
901 if (isa<UndefValue>(Vec)) {
 // Undef source: use the identity lane.
902 Mask[I] = I;
903 } else {
904 if (isa<UndefValue>(EI->getIndexOperand()))
905 continue;
906 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
907 if (!Idx)
908 return std::nullopt;
909 // Undefined behavior if Idx is negative or >= Size.
910 if (Idx->getValue().uge(Size))
911 continue;
912 unsigned IntIdx = Idx->getValue().getZExtValue();
913 Mask[I] = IntIdx;
914 }
 // An all-undef source (per isUndefVector) does not occupy one of the two
 // source slots when a well-defined source vector exists.
915 if (isUndefVector(Vec).all() && HasNonUndefVec)
916 continue;
917 // For correct shuffling we have to have at most 2 different vector operands
918 // in all extractelement instructions.
919 if (!Vec1 || Vec1 == Vec) {
920 Vec1 = Vec;
921 } else if (!Vec2 || Vec2 == Vec) {
922 Vec2 = Vec;
 // Lanes taken from the second source are offset by Size in the mask.
923 Mask[I] += Size;
924 } else {
925 return std::nullopt;
926 }
 // Once a permutation has been detected the mode can no longer change.
927 if (CommonShuffleMode == Permute)
928 continue;
929 // If the extract index is not the same as the operation number, it is a
930 // permutation.
931 if (Mask[I] % Size != I) {
932 CommonShuffleMode = Permute;
933 continue;
934 }
935 CommonShuffleMode = Select;
936 }
937 // If we're not crossing lanes in different vectors, consider it as blending.
938 if (CommonShuffleMode == Select && Vec2)
 // NOTE(review): the statement controlled by the `if` above is absent from
 // this listing.
940 // If Vec2 was never used, we have a permutation of a single vector, otherwise
941 // we have permutation of 2 vectors.
 // NOTE(review): the final return statement is absent from this listing.
944}
945
946/// \returns True if Extract{Value,Element} instruction extracts element Idx.
947static std::optional<unsigned> getExtractIndex(const Instruction *E) {
948 unsigned Opcode = E->getOpcode();
949 assert((Opcode == Instruction::ExtractElement ||
950 Opcode == Instruction::ExtractValue) &&
951 "Expected extractelement or extractvalue instruction.");
952 if (Opcode == Instruction::ExtractElement) {
953 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
954 if (!CI)
955 return std::nullopt;
956 // Check if the index is out of bound - we can get the source vector from
957 // operand 0
958 unsigned Idx = CI->getZExtValue();
959 auto *EE = cast<ExtractElementInst>(E);
960 const unsigned VF = ::getNumElements(EE->getVectorOperandType());
961 if (Idx >= VF)
962 return std::nullopt;
963 return Idx;
964 }
965 auto *EI = cast<ExtractValueInst>(E);
966 if (EI->getNumIndices() != 1)
967 return std::nullopt;
968 return *EI->idx_begin();
969}
970
971/// Checks if the provided value does not require scheduling. It does not
972/// require scheduling if this is not an instruction or it is an instruction
973/// that does not read/write memory and all operands are either not instructions
974/// or phi nodes or instructions from different blocks.
 /// (Forward declaration; the definition is not part of this section.)
975static bool areAllOperandsNonInsts(Value *V);
976/// Checks if the provided value does not require scheduling. It does not
977/// require scheduling if this is not an instruction or it is an instruction
978/// that does not read/write memory and all users are phi nodes or instructions
979/// from different blocks.
 /// (Forward declaration; the definition is not part of this section.)
980static bool isUsedOutsideBlock(Value *V);
981/// Checks if the specified value does not require scheduling. It does not
982/// require scheduling if all operands and all users do not need to be scheduled
983/// in the current basic block.
 /// (Forward declaration; the definition is not part of this section.)
984static bool doesNotNeedToBeScheduled(Value *V);
985
986/// \returns true if \p Opcode is allowed as part of the main/alternate
987/// instruction for SLP vectorization.
988///
989/// Example of unsupported opcode is SDIV that can potentially cause UB if the
990/// "shuffled out" lane would result in division by zero.
991static bool isValidForAlternation(unsigned Opcode) {
992 return !Instruction::isIntDivRem(Opcode);
993}
994
995namespace {
996
997/// Helper class that determines whether VL can use the same opcode.
998/// Alternate instruction is supported. In addition, it supports interchangeable
999/// instruction. An interchangeable instruction is an instruction that can be
1000/// converted to another instruction with same semantics. For example, x << 1 is
1001/// equal to x * 2. x * 1 is equal to x | 0.
1002class BinOpSameOpcodeHelper {
 /// Bitmask type holding one bit per opcode (see the enum below).
1003 using MaskType = std::uint_fast32_t;
1004 /// Sort SupportedOp because it is used by binary_search.
1005 constexpr static std::initializer_list<unsigned> SupportedOp = {
1006 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
1007 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
 // NOTE(review): the message string is `&&`-ed into the condition instead of
 // being passed as the second static_assert argument, so it is not used as
 // the diagnostic text.
1008 static_assert(llvm::is_sorted_constexpr(SupportedOp) &&
1009 "SupportedOp is not sorted.");
 /// One bit per supported opcode, plus MainOpBIT, which stands for the opcode
 /// of the reference instruction itself (see InterchangeableInfo::equal and
 /// InterchangeableInfo::getOpcode).
1010 enum : MaskType {
1011 ShlBIT = 1,
1012 AShrBIT = 1 << 1,
1013 MulBIT = 1 << 2,
1014 AddBIT = 1 << 3,
1015 SubBIT = 1 << 4,
1016 AndBIT = 1 << 5,
1017 OrBIT = 1 << 6,
1018 XorBIT = 1 << 7,
1019 MainOpBIT = 1 << 8,
1020 LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
1021 };
1022 /// Return a non-nullptr if either operand of I is a ConstantInt.
1023 /// The second return value represents the operand position. We check the
1024 /// right-hand side first (1). If the right hand side is not a ConstantInt and
1025 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
1026 /// side (0).
1027 static std::pair<ConstantInt *, unsigned>
1028 isBinOpWithConstantInt(const Instruction *I) {
1029 unsigned Opcode = I->getOpcode();
1030 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
 // Keeps SupportedOp referenced when the assert above is compiled out.
1031 (void)SupportedOp;
1032 auto *BinOp = cast<BinaryOperator>(I);
1033 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
1034 return {CI, 1};
 // For Sub, Shl and AShr only a right-hand constant is considered.
1035 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
1036 Opcode == Instruction::AShr)
1037 return {nullptr, 0};
1038 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
1039 return {CI, 0};
1040 return {nullptr, 0};
1041 }
 /// Tracks, for one reference instruction I, the set of opcodes that every
 /// instruction accepted so far can be expressed as.
1042 struct InterchangeableInfo {
 /// Reference instruction; set by the constructor or, for the alternate
 /// slot, by initializeAltOp().
1043 const Instruction *I = nullptr;
1044 /// The bit it sets represents whether MainOp can be converted to.
1045 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
1046 MulBIT | AShrBIT | ShlBIT;
1047 /// We cannot create an interchangeable instruction that does not exist in
1048 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
1049 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
1050 /// 1]. SeenBefore is used to know what operations have been seen before.
1051 MaskType SeenBefore = 0;
1052 InterchangeableInfo(const Instruction *I) : I(I) {}
1053 /// Return false allows BinOpSameOpcodeHelper to find an alternate
1054 /// instruction. Directly setting the mask will destroy the mask state,
1055 /// preventing us from determining which instruction it should convert to.
 /// On success, Mask is narrowed to its intersection with
 /// \p InterchangeableMask and \p OpcodeInMaskForm is recorded in SeenBefore.
 /// On failure, neither member is modified.
1056 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
1057 if (Mask & InterchangeableMask) {
1058 SeenBefore |= OpcodeInMaskForm;
1059 Mask &= InterchangeableMask;
1060 return true;
1061 }
1062 return false;
1063 }
 /// Accepts \p Opcode only if it is exactly I's opcode and MainOpBIT is still
 /// set in Mask; on success Mask collapses to MainOpBIT.
1064 bool equal(unsigned Opcode) {
1065 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
1066 }
 /// Returns the chosen opcode among those that are both still allowed (Mask)
 /// and actually seen (SeenBefore). Priority: I's own opcode, then Shl, AShr,
 /// Mul, Add, Sub, And, Or, Xor. Must not be called when no candidate exists
 /// (see hasDefinedOpcode()).
1067 unsigned getOpcode() const {
1068 MaskType Candidate = Mask & SeenBefore;
1069 if (Candidate & MainOpBIT)
1070 return I->getOpcode();
1071 if (Candidate & ShlBIT)
1072 return Instruction::Shl;
1073 if (Candidate & AShrBIT)
1074 return Instruction::AShr;
1075 if (Candidate & MulBIT)
1076 return Instruction::Mul;
1077 if (Candidate & AddBIT)
1078 return Instruction::Add;
1079 if (Candidate & SubBIT)
1080 return Instruction::Sub;
1081 if (Candidate & AndBIT)
1082 return Instruction::And;
1083 if (Candidate & OrBIT)
1084 return Instruction::Or;
1085 if (Candidate & XorBIT)
1086 return Instruction::Xor;
1087 llvm_unreachable("Cannot find interchangeable instruction.");
1088 }
1089
 /// Returns true if at least one opcode is both allowed and seen, i.e.
 /// getOpcode() can be called safely.
1090 bool hasDefinedOpcode() const { return (Mask & SeenBefore) > 0; }
1091
1092 /// Return true if the instruction can be converted to \p Opcode.
 /// The listed non-interchangeable binary opcodes always yield false; any
 /// opcode not listed in the switch hits llvm_unreachable.
1093 bool hasCandidateOpcode(unsigned Opcode) const {
1094 MaskType Candidate = Mask & SeenBefore;
1095 switch (Opcode) {
1096 case Instruction::Shl:
1097 return Candidate & ShlBIT;
1098 case Instruction::AShr:
1099 return Candidate & AShrBIT;
1100 case Instruction::Mul:
1101 return Candidate & MulBIT;
1102 case Instruction::Add:
1103 return Candidate & AddBIT;
1104 case Instruction::Sub:
1105 return Candidate & SubBIT;
1106 case Instruction::And:
1107 return Candidate & AndBIT;
1108 case Instruction::Or:
1109 return Candidate & OrBIT;
1110 case Instruction::Xor:
1111 return Candidate & XorBIT;
1112 case Instruction::LShr:
1113 case Instruction::FAdd:
1114 case Instruction::FSub:
1115 case Instruction::FMul:
1116 case Instruction::SDiv:
1117 case Instruction::UDiv:
1118 case Instruction::FDiv:
1119 case Instruction::SRem:
1120 case Instruction::URem:
1121 case Instruction::FRem:
1122 return false;
1123 default:
1124 break;
1125 }
1126 llvm_unreachable("Cannot find interchangeable instruction.");
1127 }
1128
 /// Returns the operand list I would have after being rewritten to the
 /// opcode of \p To. If the opcodes already match, I's operands are returned
 /// unchanged. Otherwise I must have a ConstantInt operand: the result of
 /// isBinOpWithConstantInt(I) is dereferenced without a null check.
1129 SmallVector<Value *> getOperand(const Instruction *To) const {
1130 unsigned ToOpcode = To->getOpcode();
1131 unsigned FromOpcode = I->getOpcode();
1132 if (FromOpcode == ToOpcode)
1133 return SmallVector<Value *>(I->operands());
1134 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1135 auto [CI, Pos] = isBinOpWithConstantInt(I);
1136 const APInt &FromCIValue = CI->getValue();
1137 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1138 Type *RHSType = I->getOperand(Pos)->getType();
1139 Constant *RHS;
1140 switch (FromOpcode) {
1141 case Instruction::Shl:
 // shl x, 1 -> add x, x
1142 if (ToOpcode == Instruction::Add && FromCIValue.isOne())
1143 return {I->getOperand(0), I->getOperand(0)};
1144 if (ToOpcode == Instruction::Mul) {
 // shl x, C -> mul x, (1 << C)
1145 RHS = ConstantInt::get(
1146 RHSType, APInt::getOneBitSet(FromCIValueBitWidth,
1147 FromCIValue.getZExtValue()));
1148 } else {
 // shl x, 0 -> x <op> identity(op)
1149 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1150 RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
1151 /*AllowRHSConstant=*/true);
1152 }
1153 break;
1154 case Instruction::Mul:
1155 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1156 if (ToOpcode == Instruction::Shl) {
 // mul x, 2^K -> shl x, K
1157 RHS = ConstantInt::get(
1158 RHSType, APInt(FromCIValueBitWidth, FromCIValue.logBase2()));
1159 } else {
 // mul x, 1 -> x <op> identity(op)
1160 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1161 RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
1162 /*AllowRHSConstant=*/true);
1163 }
1164 break;
1165 case Instruction::Add:
1166 case Instruction::Sub:
1167 if (FromCIValue.isZero()) {
 // x +/- 0 -> x <op> identity(op)
1168 RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
1169 /*AllowRHSConstant=*/true);
1170 } else {
 // add x, C <-> sub x, -C
1171 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1172 "Cannot convert the instruction.");
1173 APInt NegatedVal = APInt(FromCIValue);
1174 NegatedVal.negate();
1175 RHS = ConstantInt::get(RHSType, NegatedVal);
1176 }
1177 break;
1178 case Instruction::And:
 // and x, -1 -> x <op> identity(op)
1179 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1180 RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
1181 /*AllowRHSConstant=*/true);
1182 break;
1183 default:
 // Any other source opcode is convertible only when its constant is zero.
1184 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1185 RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
1186 /*AllowRHSConstant=*/true);
1187 break;
1188 }
 // The non-constant operand of I.
1189 Value *LHS = I->getOperand(1 - Pos);
1190 // If the target opcode is non-commutative (e.g., shl, sub),
1191 // force the variable to the left and the constant to the right.
1192 if (Pos == 1 || !Instruction::isCommutative(ToOpcode))
1193 return SmallVector<Value *>({LHS, RHS});
1194
1195 return SmallVector<Value *>({RHS, LHS});
1196 }
1197 };
 /// Opcode state for the main instruction and for the optional alternate one.
1198 InterchangeableInfo MainOp;
1199 InterchangeableInfo AltOp;
 /// Returns true if both MainOp's opcode and \p I's opcode pass the
 /// file-level ::isValidForAlternation check.
1200 bool isValidForAlternation(const Instruction *I) const {
1201 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1202 ::isValidForAlternation(I->getOpcode());
1203 }
 /// Installs \p I as the alternate reference instruction if none is set yet.
 /// \returns true if an alternate instruction is available afterwards.
1204 bool initializeAltOp(const Instruction *I) {
1205 if (AltOp.I)
1206 return true;
 // NOTE(review): the condition guarding the following `return false;` is
 // absent from this listing.
1208 return false;
1209 AltOp.I = I;
1210 return true;
1211 }
1212
1213public:
1214 BinOpSameOpcodeHelper(const Instruction *MainOp,
1215 const Instruction *AltOp = nullptr)
1216 : MainOp(MainOp), AltOp(AltOp) {}
 /// Tries to fold \p I into the main opcode set or, failing that, into the
 /// alternate opcode set (initializing the alternate slot with \p I if it is
 /// still empty).
 /// \returns false if \p I fits neither.
1217 bool add(const Instruction *I) {
 // NOTE(review): the opening line of the assertion whose message follows is
 // absent from this listing.
1219 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1220 unsigned Opcode = I->getOpcode();
1221 MaskType OpcodeInMaskForm;
1222 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1223 switch (Opcode) {
1224 case Instruction::Shl:
1225 OpcodeInMaskForm = ShlBIT;
1226 break;
1227 case Instruction::AShr:
1228 OpcodeInMaskForm = AShrBIT;
1229 break;
1230 case Instruction::Mul:
1231 OpcodeInMaskForm = MulBIT;
1232 break;
1233 case Instruction::Add:
1234 OpcodeInMaskForm = AddBIT;
1235 break;
1236 case Instruction::Sub:
1237 OpcodeInMaskForm = SubBIT;
1238 break;
1239 case Instruction::And:
1240 OpcodeInMaskForm = AndBIT;
1241 break;
1242 case Instruction::Or:
1243 OpcodeInMaskForm = OrBIT;
1244 break;
1245 case Instruction::Xor:
1246 OpcodeInMaskForm = XorBIT;
1247 break;
1248 default:
 // Opcodes without a dedicated bit must match MainOp's or AltOp's opcode
 // exactly.
1249 return MainOp.equal(Opcode) ||
1250 (initializeAltOp(I) && AltOp.equal(Opcode));
1251 }
 // Without a usable constant operand, I is interchangeable only with itself.
1252 MaskType InterchangeableMask = OpcodeInMaskForm;
1253 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1254 if (CI) {
1255 constexpr MaskType CanBeAll =
1256 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1257 const APInt &CIValue = CI->getValue();
1258 switch (Opcode) {
1259 case Instruction::Shl:
 // In-range shift amount: shift by 0 is a no-op (anything goes), otherwise
 // it is a multiply by a power of two; shift by 1 is additionally x + x.
1260 if (CIValue.ult(CIValue.getBitWidth()))
1261 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1262 if (CIValue.isOne())
1263 InterchangeableMask |= AddBIT;
1264 break;
1265 case Instruction::Mul:
1266 if (CIValue.isOne()) {
1267 InterchangeableMask = CanBeAll;
1268 break;
1269 }
1270 if (CIValue.isPowerOf2())
1271 InterchangeableMask = MulBIT | ShlBIT;
1272 break;
1273 case Instruction::Add:
1274 case Instruction::Sub:
1275 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1276 break;
1277 case Instruction::And:
1278 if (CIValue.isAllOnes())
1279 InterchangeableMask = CanBeAll;
1280 break;
1281 case Instruction::Xor:
1282 if (CIValue.isZero())
1283 InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
1284 break;
1285 default:
1286 if (CIValue.isZero())
1287 InterchangeableMask = CanBeAll;
1288 break;
1289 }
1290 }
1291 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1292 (initializeAltOp(I) &&
1293 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1294 }
 /// Opcode selected for the main instruction set.
1295 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
 /// Whether getMainOpcode() has a candidate to return.
1296 bool hasDefinedMainOpcode() const { return MainOp.hasDefinedOpcode(); }
1297 /// Checks if the list of potential opcodes includes \p Opcode.
1298 bool hasCandidateOpcode(unsigned Opcode) const {
1299 return MainOp.hasCandidateOpcode(Opcode);
1300 }
 /// Whether an alternate reference instruction has been set.
1301 bool hasAltOp() const { return AltOp.I; }
 /// Opcode selected for the alternate set; falls back to the main opcode when
 /// there is no alternate instruction.
1302 unsigned getAltOpcode() const {
1303 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1304 }
 /// True when there is no alternate instruction or its opcode is defined.
1305 bool hasDefinedAltOpcode() const {
1306 return !hasAltOp() || AltOp.hasDefinedOpcode();
1307 }
 /// Operands of the main reference instruction rewritten to the opcode of
 /// \p I (see InterchangeableInfo::getOperand).
1308 SmallVector<Value *> getOperand(const Instruction *I) const {
1309 return MainOp.getOperand(I);
1310 }
1311};
1312
1313/// Main data required for vectorization of instructions.
1314class InstructionsState {
1315 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1316 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1317 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1318 /// isAltShuffle).
1319 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1320 /// from getMainAltOpsNoStateVL.
1321 /// For those InstructionsState that use alternate instructions, the resulting
1322 /// vectorized output ultimately comes from a shufflevector. For example,
1323 /// given a vector list (VL):
1324 /// VL[0] = add i32 a, e
1325 /// VL[1] = sub i32 b, f
1326 /// VL[2] = add i32 c, g
1327 /// VL[3] = sub i32 d, h
1328 /// The vectorized result would be:
1329 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1330 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1331 /// result = shufflevector <4 x i32> intermediated_0,
1332 /// <4 x i32> intermediated_1,
1333 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1334 /// Since shufflevector is used in the final result, when calculating the cost
1335 /// (getEntryCost), we must account for the usage of shufflevector in
1336 /// GetVectorCost.
1337 Instruction *MainOp = nullptr;
1338 Instruction *AltOp = nullptr;
1339 /// Whether the instruction state represents copyable instructions.
1340 bool HasCopyables = false;
1341
1342public:
 /// Returns the main instruction; asserts that the state is valid.
1343 Instruction *getMainOp() const {
1344 assert(valid() && "InstructionsState is invalid.");
1345 return MainOp;
1346 }
1347
 /// Returns the alternate instruction (equal to the main one when there is no
 /// alternation); asserts that the state is valid.
1348 Instruction *getAltOp() const {
1349 assert(valid() && "InstructionsState is invalid.");
1350 return AltOp;
1351 }
1352
1353 /// The main/alternate opcodes for the list of instructions.
1354 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1355
1356 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1357
1358 /// Some of the instructions in the list have alternate opcodes.
1359 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1360
1361 /// Checks if the instruction matches either the main or alternate opcode.
1362 /// \returns
1363 /// - MainOp if \p I matches MainOp's opcode directly or can be converted
1364 /// to it
1365 /// - AltOp if \p I matches AltOp's opcode directly or can be converted to
1366 /// it
1367 /// - nullptr if \p I cannot be matched or converted to either opcode
1368 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1369 assert(MainOp && "MainOp cannot be nullptr.");
1370 if (I->getOpcode() == MainOp->getOpcode())
1371 return MainOp;
 // A zext is accepted for a select main op when there is no alternation.
1372 if (MainOp->getOpcode() == Instruction::Select &&
1373 I->getOpcode() == Instruction::ZExt && !isAltShuffle())
1374 return MainOp;
1375 // Prefer AltOp instead of interchangeable instruction of MainOp.
1376 assert(AltOp && "AltOp cannot be nullptr.");
1377 if (I->getOpcode() == AltOp->getOpcode())
1378 return AltOp;
 // Only binary operators can be converted to a different opcode.
1379 if (!I->isBinaryOp())
1380 return nullptr;
1381 BinOpSameOpcodeHelper Converter(MainOp);
1382 if (!Converter.add(I) || !Converter.add(MainOp))
1383 return nullptr;
 // I cannot be expressed with MainOp's opcode: try AltOp's opcode instead.
1384 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1385 BinOpSameOpcodeHelper AltConverter(AltOp);
1386 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1387 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1388 return AltOp;
1389 }
 // The helper needed an alternate opcode, but this state has none.
1390 if (Converter.hasAltOp() && !isAltShuffle())
1391 return nullptr;
1392 return Converter.hasAltOp() ? AltOp : MainOp;
1393 }
1394
1395 /// Checks if main/alt instructions are shift operations.
1396 bool isShiftOp() const {
1397 return getMainOp()->isShift() && getAltOp()->isShift();
1398 }
1399
1400 /// Checks if main/alt instructions are bitwise logic operations.
1401 bool isBitwiseLogicOp() const {
1402 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1403 }
1404
1405 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1406 bool isMulDivLikeOp() const {
1407 constexpr std::array<unsigned, 8> MulDiv = {
1408 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1409 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1410 Instruction::URem, Instruction::FRem};
1411 return is_contained(MulDiv, getOpcode()) &&
1412 is_contained(MulDiv, getAltOpcode());
1413 }
1414
1415 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1416 bool isAddSubLikeOp() const {
1417 constexpr std::array<unsigned, 4> AddSub = {
1418 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1419 Instruction::FSub};
1420 return is_contained(AddSub, getOpcode()) &&
1421 is_contained(AddSub, getAltOpcode());
1422 }
1423
1424 /// Checks if main/alt instructions are cmp operations.
1425 bool isCmpOp() const {
1426 return (getOpcode() == Instruction::ICmp ||
1427 getOpcode() == Instruction::FCmp) &&
1428 getAltOpcode() == getOpcode();
1429 }
1430
1431 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1432 bool valid() const { return MainOp && AltOp; }
1433
1434 explicit operator bool() const { return valid(); }
1435
1436 InstructionsState() = delete;
1437 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1438 bool HasCopyables = false)
1439 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
 /// Creates a state with null main/alt instructions, for which valid() is
 /// false.
1440 static InstructionsState invalid() { return {nullptr, nullptr}; }
1441
1442 /// Checks if the value is a copyable element.
 /// Always false when the state has no copyables, uses alternate opcodes, or
 /// its main opcode is GetElementPtr. A non-instruction value counts as
 /// copyable unless it is a PoisonValue.
1443 bool isCopyableElement(Value *V) const {
1444 assert(valid() && "InstructionsState is invalid.");
1445 if (!HasCopyables)
1446 return false;
1447 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1448 return false;
1449 auto *I = dyn_cast<Instruction>(V);
1450 if (!I)
1451 return !isa<PoisonValue>(V);
1452 if (I->getParent() != MainOp->getParent() &&
 // NOTE(review): the remainder of this condition is absent from this
 // listing.
1455 return true;
1456 if (I->getOpcode() == MainOp->getOpcode())
1457 return false;
1458 if (!I->isBinaryOp())
1459 return true;
 // A binary operator is copyable unless it can be converted to the main
 // opcode without introducing an alternate opcode.
1460 BinOpSameOpcodeHelper Converter(MainOp);
1461 return !Converter.add(I) || !Converter.add(MainOp) ||
1462 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1463 }
1464
1465 /// Checks if the value \p V is a transformed instruction, compatible either
1466 /// with main or alternate ops.
 /// As written, the only recognized transformation is `shl x, 1` standing in
 /// for an Add reference opcode.
1467 bool isExpandedBinOp(Value *V) const {
1468 assert(valid() && "InstructionsState is invalid.");
1469 if (isCopyableElement(V))
1470 return false;
1471 auto *ExpandingOp = dyn_cast<Instruction>(V);
1472 if (!ExpandingOp)
1473 return false;
1474 auto CheckForTransformedOpcode = [](const Instruction *RefOp,
1475 const Instruction *ExpandingOp) {
1476 switch (RefOp->getOpcode()) {
1477 case Instruction::Add:
1478 switch (ExpandingOp->getOpcode()) {
1479 case Instruction::Shl:
1480 return match(ExpandingOp, m_Shl(m_Value(), m_One()));
1481 default:
1482 break;
1483 }
1484 break;
1485 default:
1486 break;
1487 }
1488 return false;
1489 };
 // Note: this local shadows the MainOp member; it holds whichever of the
 // main/alt instructions matches ExpandingOp.
1490 Instruction *MainOp = getMatchingMainOpOrAltOp(ExpandingOp);
1491 assert(MainOp &&
1492 "The instruction should be compatible with either main or alt op.");
1493 return CheckForTransformedOpcode(MainOp, ExpandingOp);
1494 }
1495
1496 /// Checks if the operand at index \p Idx of instruction \p I is an expanded
1497 /// operand.
 /// \p I must satisfy isExpandedBinOp (asserted); for `shl x, 1` the expanded
 /// operand is operand 1.
1498 bool isExpandedOperand(Instruction *I, unsigned Idx) const {
1499 assert(isExpandedBinOp(I) && "Expected an expanded binop.");
1500 switch (I->getOpcode()) {
1501 case Instruction::Shl:
1502 assert(match(I, m_Shl(m_Value(), m_One())) && "Expected shl x, 1 only.");
1503 return Idx == 1;
1504 default:
1505 llvm_unreachable("Unexpected opcode for an expanded operand.");
1506 }
1507 }
1508
1509 /// Checks if the value is non-schedulable.
1510 bool isNonSchedulable(Value *V) const {
1511 assert(valid() && "InstructionsState is invalid.");
1512 auto *I = dyn_cast<Instruction>(V);
1513 if (!HasCopyables)
1514 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
 // NOTE(review): the last operand of the expression above is absent from
 // this listing.
1516 // MainOp for copyables always schedulable to correctly identify
1517 // non-schedulable copyables.
1518 if (getMainOp() == V)
1519 return false;
1520 if (isCopyableElement(V)) {
1521 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1522 auto *I = dyn_cast<Instruction>(V);
1523 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
 // NOTE(review): part of this expression is absent from this listing.
1525 // If the copyable instructions comes after MainOp
1526 // (non-schedulable, but used in the block) - cannot vectorize
1527 // it, will possibly generate use before def.
1528 !MainOp->comesBefore(I));
1529 };
1530
1531 return IsNonSchedulableCopyableElement(V);
1532 }
1533 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
 // NOTE(review): the last operand of the expression above is absent from
 // this listing.
1535 }
1536
1537 /// Checks if the state represents copyable instructions.
1538 bool areInstructionsWithCopyableElements() const {
1539 assert(valid() && "InstructionsState is invalid.");
1540 return HasCopyables;
1541 }
1542};
1543
1544std::pair<Instruction *, SmallVector<Value *>>
1545convertTo(Instruction *I, const InstructionsState &S) {
1546 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1547 assert(SelectedOp && "Cannot convert the instruction.");
1548 if (I->isBinaryOp()) {
1549 BinOpSameOpcodeHelper Converter(I);
1550 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1551 }
1552 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1553}
1554
1555} // end anonymous namespace
1556
// Forward declaration: areCompatibleCmpOps() below calls getSameOpcode()
// before its definition appears.
1557static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1558 const TargetLibraryInfo &TLI);
1559
1560/// Find an instruction with a specific opcode in VL.
1561/// \param VL Array of values to search through. Must contain only Instructions
1562/// and PoisonValues.
1563/// \param Opcode The instruction opcode to search for
1564/// \returns
1565/// - The first instruction found with matching opcode
1566/// - nullptr if no matching instruction is found
 // NOTE(review): the line holding the return type, the function name and the
 // first parameter is absent from this listing.
1568 unsigned Opcode) {
1569 for (Value *V : VL) {
 // Poison entries are skipped; everything else must be an Instruction.
1570 if (isa<PoisonValue>(V))
1571 continue;
1572 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1573 auto *Inst = cast<Instruction>(V);
1574 if (Inst->getOpcode() == Opcode)
1575 return Inst;
1576 }
1577 return nullptr;
1578}
1579
1580/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1581/// compatible instructions or constants, or just some other regular values.
1582static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1583 Value *Op1, const TargetLibraryInfo &TLI) {
1584 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1585 (isConstant(BaseOp1) && isConstant(Op1)) ||
1586 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1587 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1588 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1589 getSameOpcode({BaseOp0, Op0}, TLI) ||
1590 getSameOpcode({BaseOp1, Op1}, TLI);
1591}
1592
1593/// \returns true if a compare instruction \p CI has similar "look" and
1594/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1595/// swapped, false otherwise.
 /// Both compares must have the same operand type (asserted).
1596static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1597 const TargetLibraryInfo &TLI) {
1598 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1599 "Assessing comparisons of different types?");
1600 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1601 CmpInst::Predicate Pred = CI->getPredicate();
 // NOTE(review): the definition of `SwappedPred`, used below, is absent from
 // this listing.
1603
1604 Value *BaseOp0 = BaseCI->getOperand(0);
1605 Value *BaseOp1 = BaseCI->getOperand(1);
1606 Value *Op0 = CI->getOperand(0);
1607 Value *Op1 = CI->getOperand(1);
1608
 // Same predicate: compare operands position by position. Base predicate
 // equal to SwappedPred: compare against CI's operands in swapped order.
1609 return (BasePred == Pred &&
1610 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1611 (BasePred == SwappedPred &&
1612 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1613}
1614
1615/// \returns analysis of the Instructions in \p VL described in
1616/// InstructionsState, the Opcode that we suppose the whole list
1617/// could be vectorized even if its structure is diverse.
1618static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1619 const TargetLibraryInfo &TLI) {
1620 // Make sure these are all Instructions.
1622 return InstructionsState::invalid();
1623
1624 auto *It = find_if(VL, IsaPred<Instruction>);
1625 if (It == VL.end())
1626 return InstructionsState::invalid();
1627
1628 Instruction *MainOp = cast<Instruction>(*It);
1629 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1630 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1631 (VL.size() == 2 && InstCnt < 2))
1632 return InstructionsState::invalid();
1633
1634 bool IsCastOp = isa<CastInst>(MainOp);
1635 bool IsBinOp = isa<BinaryOperator>(MainOp);
1636 bool IsCmpOp = isa<CmpInst>(MainOp);
1637 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1639 Instruction *AltOp = MainOp;
1640 unsigned Opcode = MainOp->getOpcode();
1641 unsigned AltOpcode = Opcode;
1642
1643 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1644 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1645 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1646 UniquePreds.insert(BasePred);
1647 UniqueNonSwappedPreds.insert(BasePred);
1648 for (Value *V : VL) {
1649 auto *I = dyn_cast<CmpInst>(V);
1650 if (!I)
1651 return false;
1652 CmpInst::Predicate CurrentPred = I->getPredicate();
1653 CmpInst::Predicate SwappedCurrentPred =
1654 CmpInst::getSwappedPredicate(CurrentPred);
1655 UniqueNonSwappedPreds.insert(CurrentPred);
1656 if (!UniquePreds.contains(CurrentPred) &&
1657 !UniquePreds.contains(SwappedCurrentPred))
1658 UniquePreds.insert(CurrentPred);
1659 }
1660 // Total number of predicates > 2, but if consider swapped predicates
1661 // compatible only 2, consider swappable predicates as compatible opcodes,
1662 // not alternate.
1663 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1664 }();
1665 // Check for one alternate opcode from another BinaryOperator.
1666 // TODO - generalize to support all operators (types, calls etc.).
1667 Intrinsic::ID BaseID = 0;
1668 SmallVector<VFInfo> BaseMappings;
1669 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1670 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1671 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1672 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1673 return InstructionsState::invalid();
1674 }
1675 bool AnyPoison = InstCnt != VL.size();
1676 // Check MainOp too to be sure that it matches the requirements for the
1677 // instructions.
1678 for (Value *V : iterator_range(It, VL.end())) {
1679 auto *I = dyn_cast<Instruction>(V);
1680 if (!I)
1681 continue;
1682
1683 // Cannot combine poison and divisions.
1684 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1685 // intrinsics/functions only.
1686 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1687 return InstructionsState::invalid();
1688 unsigned InstOpcode = I->getOpcode();
1689 if (IsBinOp && isa<BinaryOperator>(I)) {
1690 if (BinOpHelper.add(I))
1691 continue;
1692 } else if (IsCastOp && isa<CastInst>(I)) {
1693 Value *Op0 = MainOp->getOperand(0);
1694 Type *Ty0 = Op0->getType();
1695 Value *Op1 = I->getOperand(0);
1696 Type *Ty1 = Op1->getType();
1697 if (Ty0 == Ty1) {
1698 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1699 continue;
1700 if (Opcode == AltOpcode) {
1701 assert(isValidForAlternation(Opcode) &&
1702 isValidForAlternation(InstOpcode) &&
1703 "Cast isn't safe for alternation, logic needs to be updated!");
1704 AltOpcode = InstOpcode;
1705 AltOp = I;
1706 continue;
1707 }
1708 }
1709 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1710 auto *BaseInst = cast<CmpInst>(MainOp);
1711 Type *Ty0 = BaseInst->getOperand(0)->getType();
1712 Type *Ty1 = Inst->getOperand(0)->getType();
1713 if (Ty0 == Ty1) {
1714 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1715 assert(InstOpcode == AltOpcode &&
1716 "Alternate instructions are only supported by BinaryOperator "
1717 "and CastInst.");
1718 // Check for compatible operands. If the corresponding operands are not
1719 // compatible - need to perform alternate vectorization.
1720 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1721 CmpInst::Predicate SwappedCurrentPred =
1722 CmpInst::getSwappedPredicate(CurrentPred);
1723
1724 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1725 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1726 continue;
1727
1728 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1729 continue;
1730 auto *AltInst = cast<CmpInst>(AltOp);
1731 if (MainOp != AltOp) {
1732 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1733 continue;
1734 } else if (BasePred != CurrentPred) {
1735 assert(
1736 isValidForAlternation(InstOpcode) &&
1737 "CmpInst isn't safe for alternation, logic needs to be updated!");
1738 AltOp = I;
1739 continue;
1740 }
1741 CmpInst::Predicate AltPred = AltInst->getPredicate();
1742 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1743 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1744 continue;
1745 }
1746 } else if (InstOpcode == Opcode) {
1747 assert(InstOpcode == AltOpcode &&
1748 "Alternate instructions are only supported by BinaryOperator and "
1749 "CastInst.");
1750 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1751 if (Gep->getNumOperands() != 2 ||
1752 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1753 return InstructionsState::invalid();
1754 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1756 return InstructionsState::invalid();
1757 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1758 auto *BaseLI = cast<LoadInst>(MainOp);
1759 if (!LI->isSimple() || !BaseLI->isSimple())
1760 return InstructionsState::invalid();
1761 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1762 auto *CallBase = cast<CallInst>(MainOp);
1763 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1764 return InstructionsState::invalid();
1765 if (Call->hasOperandBundles() &&
1767 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1768 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1769 CallBase->op_begin() +
1771 return InstructionsState::invalid();
1773 if (ID != BaseID)
1774 return InstructionsState::invalid();
1775 if (!ID) {
1776 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1777 if (Mappings.size() != BaseMappings.size() ||
1778 Mappings.front().ISA != BaseMappings.front().ISA ||
1779 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1780 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1781 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1782 Mappings.front().Shape.Parameters !=
1783 BaseMappings.front().Shape.Parameters)
1784 return InstructionsState::invalid();
1785 }
1786 }
1787 continue;
1788 }
1789 return InstructionsState::invalid();
1790 }
1791
1792 if (IsBinOp) {
1793 if (!BinOpHelper.hasDefinedMainOpcode() ||
1794 !BinOpHelper.hasDefinedAltOpcode())
1795 return InstructionsState::invalid();
1796 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1797 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1798 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1799 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1800 }
1801 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1802 "Incorrect implementation of allSameOpcode.");
1803 InstructionsState S(MainOp, AltOp);
1804 assert(all_of(VL,
1805 [&](Value *V) {
1806 return isa<PoisonValue>(V) ||
1807 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1808 }) &&
1809 "Invalid InstructionsState.");
1810 return S;
1811}
1812
1813/// \returns true if all of the values in \p VL have the same type or false
1814/// otherwise.
1816 Type *Ty = VL.consume_front()->getType();
1817 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1818}
1819
1820/// \returns True if in-tree use also needs extract. This refers to
1821/// possible scalar operand in vectorized instruction.
1822static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1823 TargetLibraryInfo *TLI,
1824 const TargetTransformInfo *TTI) {
1825 if (!UserInst)
1826 return false;
1827 unsigned Opcode = UserInst->getOpcode();
1828 switch (Opcode) {
1829 case Instruction::Load: {
1830 LoadInst *LI = cast<LoadInst>(UserInst);
1831 return (LI->getPointerOperand() == Scalar);
1832 }
1833 case Instruction::Store: {
1834 StoreInst *SI = cast<StoreInst>(UserInst);
1835 return (SI->getPointerOperand() == Scalar);
1836 }
1837 case Instruction::Call: {
1838 CallInst *CI = cast<CallInst>(UserInst);
1840 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1841 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1842 Arg.value().get() == Scalar;
1843 });
1844 }
1845 default:
1846 return false;
1847 }
1848}
1849
1850 /// \returns the AA location that is being accessed by the instruction.
1853 return MemoryLocation::get(SI);
1854 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1855 return MemoryLocation::get(LI);
1856 return MemoryLocation();
1857}
1858
1859/// \returns True if the instruction is not a volatile or atomic load/store.
1860static bool isSimple(Instruction *I) {
1861 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1862 return LI->isSimple();
1864 return SI->isSimple();
1866 return !MI->isVolatile();
1867 return true;
1868}
1869
1870/// Shuffles \p Mask in accordance with the given \p SubMask.
1871/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1872/// one but two input vectors.
1873static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1874 bool ExtendingManyInputs = false) {
1875 if (SubMask.empty())
1876 return;
1877 assert(
1878 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1879 // Check if input scalars were extended to match the size of other node.
1880 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1881 "SubMask with many inputs support must be larger than the mask.");
1882 if (Mask.empty()) {
1883 Mask.append(SubMask.begin(), SubMask.end());
1884 return;
1885 }
1886 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1887 int TermValue = std::min(Mask.size(), SubMask.size());
1888 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1889 if (SubMask[I] == PoisonMaskElem ||
1890 (!ExtendingManyInputs &&
1891 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1892 continue;
1893 NewMask[I] = Mask[SubMask[I]];
1894 }
1895 Mask.swap(NewMask);
1896}
1897
1898/// Order may have elements assigned special value (size) which is out of
1899/// bounds. Such indices only appear on places which correspond to undef values
1900/// (see canReuseExtract for details) and used in order to avoid undef values
1901/// have effect on operands ordering.
1902/// The first loop below simply finds all unused indices and then the next loop
1903/// nest assigns these indices for undef values positions.
1904/// As an example below Order has two undef positions and they have assigned
1905/// values 3 and 7 respectively:
1906/// before: 6 9 5 4 9 2 1 0
1907/// after: 6 3 5 4 7 2 1 0
1909 const size_t Sz = Order.size();
1910 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1911 SmallBitVector MaskedIndices(Sz);
1912 for (unsigned I = 0; I < Sz; ++I) {
1913 if (Order[I] < Sz)
1914 UnusedIndices.reset(Order[I]);
1915 else
1916 MaskedIndices.set(I);
1917 }
1918 if (MaskedIndices.none())
1919 return;
1920 assert(UnusedIndices.count() == MaskedIndices.count() &&
1921 "Non-synced masked/available indices.");
1922 int Idx = UnusedIndices.find_first();
1923 int MIdx = MaskedIndices.find_first();
1924 while (MIdx >= 0) {
1925 assert(Idx >= 0 && "Indices must be synced.");
1926 Order[MIdx] = Idx;
1927 Idx = UnusedIndices.find_next(Idx);
1928 MIdx = MaskedIndices.find_next(MIdx);
1929 }
1930}
1931
1932/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1933/// Opcode1.
1935 unsigned Opcode0, unsigned Opcode1) {
1936 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1937 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1938 for (unsigned Lane : seq<unsigned>(VL.size())) {
1939 if (isa<PoisonValue>(VL[Lane]))
1940 continue;
1941 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1942 OpcodeMask.set(Lane * ScalarTyNumElements,
1943 Lane * ScalarTyNumElements + ScalarTyNumElements);
1944 }
1945 return OpcodeMask;
1946}
1947
1948/// Replicates the given \p Val \p VF times.
1950 unsigned VF) {
1951 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1952 "Expected scalar constants.");
1953 SmallVector<Constant *> NewVal(Val.size() * VF);
1954 for (auto [I, V] : enumerate(Val))
1955 std::fill_n(NewVal.begin() + I * VF, VF, V);
1956 return NewVal;
1957}
1958
1960 SmallVectorImpl<int> &Mask) {
1961 Mask.clear();
1962 const unsigned E = Indices.size();
1963 Mask.resize(E, PoisonMaskElem);
1964 for (unsigned I = 0; I < E; ++I)
1965 Mask[Indices[I]] = I;
1966}
1967
1968/// Reorders the list of scalars in accordance with the given \p Mask.
1970 ArrayRef<int> Mask) {
1971 assert(!Mask.empty() && "Expected non-empty mask.");
1972 SmallVector<Value *> Prev(Scalars.size(),
1973 PoisonValue::get(Scalars.front()->getType()));
1974 Prev.swap(Scalars);
1975 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1976 if (Mask[I] != PoisonMaskElem)
1977 Scalars[Mask[I]] = Prev[I];
1978}
1979
1980/// Checks if the provided value does not require scheduling. It does not
1981/// require scheduling if this is not an instruction or it is an instruction
1982/// that does not read/write memory and all operands are either not instructions
1983/// or phi nodes or instructions from different blocks.
1985 auto *I = dyn_cast<Instruction>(V);
1986 if (!I)
1987 return true;
1988 return !mayHaveNonDefUseDependency(*I) &&
1989 all_of(I->operands(), [I](Value *V) {
1990 auto *IO = dyn_cast<Instruction>(V);
1991 if (!IO)
1992 return true;
1993 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1994 });
1995}
1996
1997/// Checks if the provided value does not require scheduling. It does not
1998/// require scheduling if this is not an instruction or it is an instruction
1999/// that does not read/write memory and all users are phi nodes or instructions
2000/// from the different blocks.
2001static bool isUsedOutsideBlock(Value *V) {
2002 auto *I = dyn_cast<Instruction>(V);
2003 if (!I)
2004 return true;
2005 // Limits the number of uses to save compile time.
2006 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
2007 all_of(I->users(), [I](User *U) {
2008 auto *IU = dyn_cast<Instruction>(U);
2009 if (!IU)
2010 return true;
2011 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
2012 });
2013}
2014
2015/// Checks if the specified value does not require scheduling. It does not
2016/// require scheduling if all operands and all users do not need to be scheduled
2017/// in the current basic block.
2020}
2021
2022/// Checks if the specified array of instructions does not require scheduling.
2023 /// It is so if either all instructions have operands that do not require
2024/// scheduling or their users do not require scheduling since they are phis or
2025/// in other basic blocks.
2027 return !VL.empty() &&
2029}
2030
2031/// Returns true if widened type of \p Ty elements with size \p Sz represents
2032/// full vector type, i.e. adding extra element results in extra parts upon type
2033/// legalization.
2035 unsigned Sz) {
2036 if (Sz <= 1)
2037 return false;
2039 return false;
2040 if (has_single_bit(Sz))
2041 return true;
2042 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
2043 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
2044 Sz % NumParts == 0;
2045}
2046
2047/// Returns number of parts, the type \p VecTy will be split at the codegen
2048 /// phase. If the type is going to be scalarized or does not use whole
2049/// registers, returns 1.
2050static unsigned
2052 Type *ScalarTy,
2053 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
2054 unsigned NumParts = TTI.getNumberOfParts(VecTy);
2055 if (NumParts == 0 || NumParts >= Limit)
2056 return 1;
2057 unsigned Sz = getNumElements(VecTy);
2058 unsigned ScalarSz = getNumElements(ScalarTy);
2059 unsigned PWSz =
2060 getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz);
2061 if (NumParts >= Sz || PWSz % NumParts != 0 ||
2062 (PWSz / NumParts) % ScalarSz != 0 ||
2063 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts))
2064 return 1;
2065 const unsigned NumElts = PWSz / NumParts;
2066 if (divideCeil(Sz, NumElts) != NumParts)
2067 return 1;
2068 return NumParts;
2069}
2070
2071/// Bottom Up SLP Vectorizer.
2073 class TreeEntry;
2074 class ScheduleEntity;
2075 class ScheduleData;
2076 class ScheduleCopyableData;
2077 class ScheduleBundle;
2080
2081public:
2082 /// If we decide to generate strided load / store, this struct contains all
2083 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
2084 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
2085 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
2086 /// StrideVal (or value obtained from StrideSCEV) has to be multiplied by the
2087 /// size of element of FixedVectorType.
2089 Value *StrideVal = nullptr;
2090 const SCEV *StrideSCEV = nullptr;
2092 };
2093
2094 /// Tracks the state we can represent the loads in the given sequence.
2102
2109
2111 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
2113 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
2114 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
2115 AC(AC), DB(DB), DL(DL), ORE(ORE),
2116 Builder(Se->getContext(), TargetFolder(*DL)) {
2117 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
2118 // Use the vector register size specified by the target unless overridden
2119 // by a command-line option.
2120 // TODO: It would be better to limit the vectorization factor based on
2121 // data type rather than just register size. For example, x86 AVX has
2122 // 256-bit registers, but it does not support integer operations
2123 // at that width (that requires AVX2).
2124 if (MaxVectorRegSizeOption.getNumOccurrences())
2125 MaxVecRegSize = MaxVectorRegSizeOption;
2126 else
2127 MaxVecRegSize =
2128 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
2129 .getFixedValue();
2130
2131 if (MinVectorRegSizeOption.getNumOccurrences())
2132 MinVecRegSize = MinVectorRegSizeOption;
2133 else
2134 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
2135 }
2136
2137 /// Vectorize the tree that starts with the elements in \p VL.
2138 /// Returns the vectorized root.
2140
2141 /// Vectorize the tree but with the list of externally used values \p
2142 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
2143 /// generated extractvalue instructions.
2144 Value *
2145 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2146 Instruction *ReductionRoot = nullptr,
2147 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
2148 VectorValuesAndScales = {});
2149
2150 /// \returns the cost incurred by unwanted spills and fills, caused by
2151 /// holding live values over call sites.
2153
2154 /// Calculates the cost of the subtrees, trims non-profitable ones and returns
2155 /// final cost.
2158
2159 /// \returns the vectorization cost of the subtree that starts at \p VL.
2160 /// A negative number means that this is profitable.
2162 ArrayRef<Value *> VectorizedVals = {},
2163 InstructionCost ReductionCost = TTI::TCC_Free,
2164 Instruction *RdxRoot = nullptr);
2165
2166 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2167 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2168 void buildTree(ArrayRef<Value *> Roots,
2169 const SmallDenseSet<Value *> &UserIgnoreLst);
2170
2171 /// Construct a vectorizable tree that starts at \p Roots.
2172 void buildTree(ArrayRef<Value *> Roots);
2173
2174 /// Return the scalars of the root node.
2176 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2177 return VectorizableTree.front()->Scalars;
2178 }
2179
2180 /// Returns the type/is-signed info for the root node in the graph without
2181 /// casting.
2182 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2183 const TreeEntry &Root = *VectorizableTree.front();
2184 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2185 !Root.Scalars.front()->getType()->isIntegerTy())
2186 return std::nullopt;
2187 auto It = MinBWs.find(&Root);
2188 if (It != MinBWs.end())
2189 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2190 It->second.first),
2191 It->second.second);
2192 if (Root.getOpcode() == Instruction::ZExt ||
2193 Root.getOpcode() == Instruction::SExt)
2194 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2195 Root.getOpcode() == Instruction::SExt);
2196 return std::nullopt;
2197 }
2198
2199 /// Checks if the root graph node can be emitted with narrower bitwidth at
2200 /// codegen and returns its signedness, if so.
2202 return MinBWs.at(VectorizableTree.front().get()).second;
2203 }
2204
2205 /// Returns reduction type after minbitwidth analysis.
2207 if (ReductionBitWidth == 0 ||
2208 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2209 ReductionBitWidth >=
2210 DL->getTypeSizeInBits(
2211 VectorizableTree.front()->Scalars.front()->getType()))
2212 return getWidenedType(
2213 VectorizableTree.front()->Scalars.front()->getType(),
2214 VectorizableTree.front()->getVectorFactor());
2215 return getWidenedType(
2217 VectorizableTree.front()->Scalars.front()->getContext(),
2218 ReductionBitWidth),
2219 VectorizableTree.front()->getVectorFactor());
2220 }
2221
2222 /// Returns true if the tree results in one of the reduced bitcasts variants.
2224 return VectorizableTree.front()->hasState() &&
2225 (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
2226 VectorizableTree.front()->CombinedOp ==
2227 TreeEntry::ReducedBitcastBSwap ||
2228 VectorizableTree.front()->CombinedOp ==
2229 TreeEntry::ReducedBitcastLoads ||
2230 VectorizableTree.front()->CombinedOp ==
2231 TreeEntry::ReducedBitcastBSwapLoads) &&
2232 VectorizableTree.front()->State == TreeEntry::Vectorize;
2233 }
2234
2235 /// Returns true if the tree results in the reduced cmp bitcast root.
2237 return VectorizableTree.front()->hasState() &&
2238 VectorizableTree.front()->CombinedOp ==
2239 TreeEntry::ReducedCmpBitcast &&
2240 VectorizableTree.front()->State == TreeEntry::Vectorize;
2241 }
2242
2243 /// Returns true if the tree is a reduction tree.
2244 bool isReductionTree() const { return UserIgnoreList != nullptr; }
2245
2246 /// Builds external uses of the vectorized scalars, i.e. the list of
2247 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2248 /// ExternallyUsedValues contains additional list of external uses to handle
2249 /// vectorization of reductions.
2250 void
2251 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2252
2253 /// Transforms graph nodes to target specific representations, if profitable.
2254 void transformNodes();
2255
2256 /// Clear the internal data structures that are created by 'buildTree'.
2257 void deleteTree() {
2258 VectorizableTree.clear();
2259 ScalarToTreeEntries.clear();
2260 DeletedNodes.clear();
2261 TransformedToGatherNodes.clear();
2262 OperandsToTreeEntry.clear();
2263 ScalarsInSplitNodes.clear();
2264 MustGather.clear();
2265 NonScheduledFirst.clear();
2266 EntryToLastInstruction.clear();
2267 LastInstructionToPos.clear();
2268 LoadEntriesToVectorize.clear();
2269 IsGraphTransformMode = false;
2270 GatheredLoadsEntriesFirst.reset();
2271 CompressEntryToData.clear();
2272 ExternalUses.clear();
2273 ExternalUsesAsOriginalScalar.clear();
2274 ExternalUsesWithNonUsers.clear();
2275 for (auto &Iter : BlocksSchedules) {
2276 BlockScheduling *BS = Iter.second.get();
2277 BS->clear();
2278 }
2279 MinBWs.clear();
2280 ReductionBitWidth = 0;
2281 BaseGraphSize = 1;
2282 CastMaxMinBWSizes.reset();
2283 ExtraBitWidthNodes.clear();
2284 InstrElementSize.clear();
2285 UserIgnoreList = nullptr;
2286 PostponedGathers.clear();
2287 ValueToGatherNodes.clear();
2288 TreeEntryToStridedPtrInfoMap.clear();
2289 CurrentLoopNest.clear();
2290 MergedLoopBTCs.clear();
2291 }
2292
2293 unsigned getTreeSize() const { return VectorizableTree.size(); }
2294
2295 /// Returns the base graph size, before any transformations.
2296 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2297
2298 /// Perform LICM and CSE on the newly generated gather sequences.
2300
2301 /// Does this non-empty order represent an identity order? Identity
2302 /// should be represented as an empty order, so this is used to
2303 /// decide if we can canonicalize a computed order. Undef elements
2304 /// (represented as size) are ignored.
2306 assert(!Order.empty() && "expected non-empty order");
2307 const unsigned Sz = Order.size();
2308 return all_of(enumerate(Order), [&](const auto &P) {
2309 return P.value() == P.index() || P.value() == Sz;
2310 });
2311 }
2312
2313 /// Checks if the specified gather tree entry \p TE can be represented as a
2314 /// shuffled vector entry + (possibly) permutation with other gathers. It
2315 /// implements the checks only for possibly ordered scalars (Loads,
2316 /// ExtractElement, ExtractValue), which can be part of the graph.
2317 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2318 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2319 /// node might be ignored.
2320 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2321 bool TopToBottom,
2322 bool IgnoreReorder);
2323
2324 /// Sort loads into increasing pointers offsets to allow greater clustering.
2325 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2326
2327 /// Gets reordering data for the given tree entry. If the entry is vectorized
2328 /// - just return ReorderIndices, otherwise check if the scalars can be
2329 /// reordered and return the most optimal order.
2330 /// \return std::nullopt if ordering is not important, empty order, if
2331 /// identity order is important, or the actual order.
2332 /// \param TopToBottom If true, include the order of vectorized stores and
2333 /// insertelement nodes, otherwise skip them.
2334 /// \param IgnoreReorder true, if the root node order can be ignored.
2335 std::optional<OrdersType>
2336 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2337
2338 /// Checks if it is profitable to reorder the current tree.
2339 /// If the tree does not contain many profitable reorderable nodes, better to
2340 /// skip it to save compile time.
2341 bool isProfitableToReorder() const;
2342
2343 /// Reorders the current graph to the most profitable order starting from the
2344 /// root node to the leaf nodes. The best order is chosen only from the nodes
2345 /// of the same size (vectorization factor). Smaller nodes are considered
2346 /// parts of subgraph with smaller VF and they are reordered independently. We
2347 /// can make it because we still need to extend smaller nodes to the wider VF
2348 /// and we can merge reordering shuffles with the widening shuffles.
2349 void reorderTopToBottom();
2350
2351 /// Reorders the current graph to the most profitable order starting from
2352 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2353 /// number of reshuffles if the leaf nodes use the same order. In this case we
2354 /// can merge the orders and just shuffle user node instead of shuffling its
2355 /// operands. Plus, even the leaf nodes have different orders, it allows to
2356 /// sink reordering in the graph closer to the root node and merge it later
2357 /// during analysis.
2358 void reorderBottomToTop(bool IgnoreReorder = false);
2359
2360 /// \return The vector element size in bits to use when vectorizing the
2361 /// expression tree ending at \p V. If V is a store, the size is the width of
2362 /// the stored value. Otherwise, the size is the width of the largest loaded
2363 /// value reaching V. This method is used by the vectorizer to calculate
2364 /// vectorization factors.
2365 unsigned getVectorElementSize(Value *V);
2366
2367 /// Compute the minimum type sizes required to represent the entries in a
2368 /// vectorizable tree.
2370
2371 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2372 unsigned getMaxVecRegSize() const {
2373 return MaxVecRegSize;
2374 }
2375
2376 // \returns minimum vector register size as set by cl::opt.
2377 unsigned getMinVecRegSize() const {
2378 return MinVecRegSize;
2379 }
2380
2381 unsigned getMinVF(unsigned Sz) const {
2382 return std::max(2U, getMinVecRegSize() / Sz);
2383 }
2384
2385 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2386 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2387 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2388 return MaxVF ? MaxVF : UINT_MAX;
2389 }
2390
2391 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2392 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2393 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2394 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2395 ///
2396 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2397 unsigned canMapToVector(Type *T) const;
2398
2399 /// \returns True if the VectorizableTree is both tiny and not fully
2400 /// vectorizable. We do not vectorize such trees.
2401 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2402
2403 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2404 /// It may happen, if all gather nodes are loads and they cannot be
2405 /// "clusterized". In this case even subgraphs cannot be vectorized more
2406 /// effectively than the base graph.
2407 bool isTreeNotExtendable() const;
2408
2409 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2410 Align Alignment, const int64_t Diff,
2411 const size_t Sz) const;
2412
2413 /// Return true if an array of scalar loads can be replaced with a strided
2414 /// load (with constant stride).
2415 ///
2416 /// It is possible that the load gets "widened". Suppose that originally each
2417 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2418 /// constant):
/// %b + 0 * %s + 0
/// %b + 0 * %s + 1
/// %b + 0 * %s + 2
2419 /// ...
2420 /// %b + 0 * %s + (w - 1)
2421 ///
2422 /// %b + 1 * %s + 0
2423 /// %b + 1 * %s + 1
2424 /// %b + 1 * %s + 2
2425 /// ...
2426 /// %b + 1 * %s + (w - 1)
2427 /// ...
2428 ///
2429 /// %b + (n - 1) * %s + 0
2430 /// %b + (n - 1) * %s + 1
2431 /// %b + (n - 1) * %s + 2
2432 /// ...
2433 /// %b + (n - 1) * %s + (w - 1)
2434 ///
2435 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2436 ///
2437 /// \param PointerOps list of pointer arguments of loads.
2438 /// \param ElemTy original scalar type of loads.
2439 /// \param Alignment alignment of the first load.
2440 /// \param SortedIndices is the order of PointerOps as returned by
2441 /// `sortPtrAccesses`
2442 /// \param Diff Pointer difference between the lowest and the highest pointer
2443 /// in `PointerOps` as returned by `getPointersDiff`.
2444 /// \param Ptr0 first pointer in `PointerOps`.
2445 /// \param PtrN last pointer in `PointerOps`.
2446 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2447 /// of `SPtrInfo` necessary to generate the strided load later.
2449 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2450 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2451 Value *Ptr0, StridedPtrInfo &SPtrInfo) const;
2452
2453 /// Return true if an array of scalar loads can be replaced with a strided
2454 /// load (with run-time stride).
2455 /// \param PointerOps list of pointer arguments of loads.
2456 /// \param ScalarTy type of loads.
2457 /// \param CommonAlignment common alignment of loads as computed by
2458 /// `computeCommonAlignment<LoadInst>`.
2459 /// \param SortedIndices is a list of indices computed by this function such
2460 /// that the sequence `PointerOps[SortedIndices[0]],
2461 /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2462 /// ordered by the coefficient of the stride. For example, if PointerOps is
2463 /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
2464 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to be
2465 /// `0, 1, 2, 3, ...` we return empty vector for `SortedIndices`.
2466 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2467 /// of `SPtrInfo` necessary to generate the strided load later.
2468 /// \param IsLoad Is this a strided load (true) or strided store (false)
2469 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2470 Align CommonAlignment,
2471 SmallVectorImpl<unsigned> &SortedIndices,
2472 StridedPtrInfo &SPtrInfo, bool IsLoad) const;
2473
2474 /// Checks if the given array of loads can be represented as a vectorized,
2475 /// scatter or just simple gather.
2476 /// \param VL list of loads.
2477 /// \param VL0 main load value.
2478 /// \param Order returned order of load instructions.
2479 /// \param PointerOps returned list of pointer operands.
2480 /// \param BestVF return best vector factor, if recursive check found better
2481 /// vectorization sequences rather than masked gather.
2482 /// \param TryRecursiveCheck used to check if long masked gather can be
2483 /// represented as a series of loads/insert subvector, if profitable.
2486 SmallVectorImpl<Value *> &PointerOps,
2487 StridedPtrInfo &SPtrInfo,
2488 unsigned *BestVF = nullptr,
2489 bool TryRecursiveCheck = true) const;
2490
2491 /// Checks whether some existing tree entry has scalars equal to \p VL.
2492 /// \p S is the common opcode of \p VL when one exists; an empty \p S means
2493 /// the values have no common opcode (mixed buildvector/gather candidates).
2494 bool hasSameNode(const InstructionsState &S, ArrayRef<Value *> VL) const {
2495 auto IsSame = [&](const TreeEntry *TE) { return TE->isSame(VL); };
2496 if (S) {
2497 // Any vectorized or gather entry equal to VL must contain S.getMainOp()
2498 // (the representative instruction, which is also the recorded scalar
2499 // for copyable-elements bundles), so probing the MainOp-indexed maps
2500 // is sufficient and avoids scanning the whole tree.
2501 return any_of(getTreeEntries(S.getMainOp()), IsSame) ||
2502 any_of(ValueToGatherNodes.lookup(S.getMainOp()), IsSame);
2503 }
2504 // No common opcode: only gather entries can match. Each non-constant
2505 // value in VL has to be in the gather entry's scalar list and is
2506 // therefore present in ValueToGatherNodes. Probe by VL members instead
2507 // of scanning the whole tree (O(tree) -> O(|VL|)).
2509 for (Value *V : VL) {
2510 // Constants/poisons are not tracked in ValueToGatherNodes.
2511 if (isConstant(V))
2512 continue;
2513 for (const TreeEntry *TE : ValueToGatherNodes.lookup(V)) {
2514 if (!Visited.insert(TE).second)
2515 continue;
2516 if (IsSame(TE))
2517 return true;
2518 }
2519 }
2520 return false;
2521 }
2522
2523 /// Registers non-vectorizable sequence of loads
2524 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2525 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2526 }
2527
2528 /// Checks if the given loads sequence is known as not vectorizable
2529 template <typename T>
2531 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2532 }
2533
2535
2536 /// This structure holds any data we need about the edges being traversed
2537 /// during buildTreeRec(). We keep track of:
2538 /// (i) the user TreeEntry index, and
2539 /// (ii) the index of the edge.
2540 struct EdgeInfo {
2541 EdgeInfo() = default;
2542 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2544 /// The user TreeEntry.
2545 TreeEntry *UserTE = nullptr;
2546 /// The operand index of the use.
2547 unsigned EdgeIdx = UINT_MAX;
2548#ifndef NDEBUG
2550 const BoUpSLP::EdgeInfo &EI) {
2551 EI.dump(OS);
2552 return OS;
2553 }
2554 /// Debug print.
2555 void dump(raw_ostream &OS) const {
2556 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2557 << " EdgeIdx:" << EdgeIdx << "}";
2558 }
2559 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2560#endif
2561 bool operator == (const EdgeInfo &Other) const {
2562 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2563 }
2564
2565 operator bool() const { return UserTE != nullptr; }
2566 };
2567 friend struct DenseMapInfo<EdgeInfo>;
2568
2569 /// A helper class used for scoring candidates for two consecutive lanes.
2571 const TargetLibraryInfo &TLI;
2572 const DataLayout &DL;
2573 ScalarEvolution &SE;
2574 const BoUpSLP &R;
2575 int NumLanes; // Total number of lanes (aka vectorization factor).
2576 int MaxLevel; // The maximum recursion depth for accumulating score.
2577
2578 public:
2580 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2581 int MaxLevel)
2582 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2583 MaxLevel(MaxLevel) {}
2584
2585 // The hard-coded scores listed here are not very important, though it shall
2586 // be higher for better matches to improve the resulting cost. When
2587 // computing the scores of matching one sub-tree with another, we are
2588 // basically counting the number of values that are matching. So even if all
2589 // scores are set to 1, we would still get a decent matching result.
2590 // However, sometimes we have to break ties. For example we may have to
2591 // choose between matching loads vs matching opcodes. This is what these
2592 // scores are helping us with: they provide the order of preference. Also,
2593 // this is important if the scalar is externally used or used in another
2594 // tree entry node in the different lane.
2595
2596 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2597 static const int ScoreConsecutiveLoads = 4;
2598 /// The same load multiple times. This should have a better score than
2599 /// `ScoreSplat` because on x86 for a 2-lane vector we can represent it
2600 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
2601 /// a vector load and 1.0 for a broadcast.
2602 static const int ScoreSplatLoads = 3;
2603 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2604 static const int ScoreReversedLoads = 3;
2605 /// A load candidate for masked gather.
2606 static const int ScoreMaskedGatherCandidate = 1;
2607 /// ExtractElementInst from same vector and consecutive indexes.
2608 static const int ScoreConsecutiveExtracts = 4;
2609 /// ExtractElementInst from same vector and reversed indices.
2610 static const int ScoreReversedExtracts = 3;
2611 /// Constants.
2612 static const int ScoreConstants = 2;
2613 /// Instructions with the same opcode.
2614 static const int ScoreSameOpcode = 2;
2615 /// Instructions with alt opcodes (e.g, add + sub).
2616 static const int ScoreAltOpcodes = 1;
2617 /// Identical instructions (a.k.a. splat or broadcast).
2618 static const int ScoreSplat = 1;
2619 /// Matching with an undef is preferable to failing.
2620 static const int ScoreUndef = 1;
2621 /// Score for failing to find a decent match.
2622 static const int ScoreFail = 0;
2623 /// Score if all users are vectorized.
2624 static const int ScoreAllUserVectorized = 1;
2625
2626 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2627 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2628 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2629 /// MainAltOps.
2631 ArrayRef<Value *> MainAltOps) const {
2632 if (!isValidElementType(V1->getType()) ||
2635
2636 if (V1 == V2) {
2637 if (isa<LoadInst>(V1)) {
2638 // Returns true if the users of V1 and V2 won't need to be extracted.
2639 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2640 // Bail out if we have too many uses to save compilation time.
2641 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2642 return false;
2643
2644 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2645 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2646 return U == U1 || U == U2 || R.isVectorized(U);
2647 });
2648 };
2649 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2650 };
2651 // A broadcast of a load can be cheaper on some targets.
2652 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2653 ElementCount::getFixed(NumLanes)) &&
2654 ((int)V1->getNumUses() == NumLanes ||
2655 AllUsersAreInternal(V1, V2)))
2657 }
2659 }
2660
2661 auto CheckSameEntryOrFail = [&]() {
2662 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2664 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2665 !TEs2.empty() &&
2666 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2668 }
2670 };
2671
2672 auto *LI1 = dyn_cast<LoadInst>(V1);
2673 auto *LI2 = dyn_cast<LoadInst>(V2);
2674 if (LI1 && LI2) {
2675 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2676 !LI2->isSimple())
2677 return CheckSameEntryOrFail();
2678
2679 std::optional<int64_t> Dist = getPointersDiff(
2680 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2681 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2682 if (!Dist || *Dist == 0) {
2683 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2684 getUnderlyingObject(LI2->getPointerOperand()) &&
2685 R.TTI->isLegalMaskedGather(
2686 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2688 return CheckSameEntryOrFail();
2689 }
2690 // The distance is too large - still may be profitable to use masked
2691 // loads/gathers.
2692 if (std::abs(*Dist) > NumLanes / 2)
2694 // This still will detect consecutive loads, but we might have "holes"
2695 // in some cases. It is ok for non-power-2 vectorization and may produce
2696 // better results. It should not affect current vectorization.
2699 }
2700
2701 auto *C1 = dyn_cast<Constant>(V1);
2702 auto *C2 = dyn_cast<Constant>(V2);
2703 if (C1 && C2)
2705
2706 // Consider constants and buildvector compatible.
2707 if ((C1 && isa<InsertElementInst>(V2)) ||
2708 (C2 && isa<InsertElementInst>(V1)))
2710
2711 // Extracts from consecutive indexes of the same vector better score as
2712 // the extracts could be optimized away.
2713 Value *EV1;
2714 ConstantInt *Ex1Idx;
2715 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2716 // Undefs are always profitable for extractelements.
2717 // Compiler can easily combine poison and extractelement <non-poison> or
2718 // undef and extractelement <poison>. But combining undef +
2719 // extractelement <non-poison-but-may-produce-poison> requires some
2720 // extra operations.
2721 if (isa<UndefValue>(V2))
2722 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2725 Value *EV2 = nullptr;
2726 ConstantInt *Ex2Idx = nullptr;
2727 if (match(V2,
2729 m_Undef())))) {
2730 // Undefs are always profitable for extractelements.
2731 if (!Ex2Idx)
2733 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2735 if (EV2 == EV1) {
2736 int Idx1 = Ex1Idx->getZExtValue();
2737 int Idx2 = Ex2Idx->getZExtValue();
2738 int Dist = Idx2 - Idx1;
2739 // The distance is too large - still may be profitable to use
2740 // shuffles.
2741 if (std::abs(Dist) == 0)
2743 if (std::abs(Dist) > NumLanes / 2)
2747 }
2749 }
2750 return CheckSameEntryOrFail();
2751 }
2752
2753 auto *I1 = dyn_cast<Instruction>(V1);
2754 auto *I2 = dyn_cast<Instruction>(V2);
2755 if (I1 && I2) {
2756 if (I1->getParent() != I2->getParent())
2757 return CheckSameEntryOrFail();
2758 Value *V;
2759 Value *Cond;
2760 // ZExt i1 to something must be considered same opcode for select i1
2761 // cmp, x, y
2762 // Required to better match the transformation after
2763 // BoUpSLP::matchesInversedZExtSelect analysis.
2764 if ((match(I1, m_ZExt(m_Value(V))) &&
2765 match(I2, m_Select(m_Value(Cond), m_Value(), m_Value())) &&
2766 V->getType() == Cond->getType()) ||
2767 (match(I2, m_ZExt(m_Value(V))) &&
2768 match(I1, m_Select(m_Value(Cond), m_Value(), m_Value())) &&
2769 V->getType() == Cond->getType()))
2771 SmallVector<Value *, 4> Ops(MainAltOps);
2772 Ops.push_back(I1);
2773 Ops.push_back(I2);
2774 InstructionsState S = getSameOpcode(Ops, TLI);
2775 // Note: Only consider instructions with <= 2 operands to avoid
2776 // complexity explosion.
2777 if (S &&
2778 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2779 !S.isAltShuffle()) &&
2780 all_of(Ops, [&S](Value *V) {
2781 return isa<PoisonValue>(V) ||
2782 cast<Instruction>(V)->getNumOperands() ==
2783 S.getMainOp()->getNumOperands();
2784 }))
2785 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2787 }
2788
2789 if (I1 && isa<PoisonValue>(V2))
2791
2792 if (isa<UndefValue>(V2))
2794
2795 return CheckSameEntryOrFail();
2796 }
2797
2798 /// Go through the operands of \p LHS and \p RHS recursively until
2799 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2800 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2801 /// of \p U1 and \p U2), except at the beginning of the recursion where
2802 /// these are set to nullptr.
2803 ///
2804 /// For example:
2805 /// \verbatim
2806 ///  A[0]  B[0]  A[1]  B[1]  C[0]  D[0]  B[1]  A[1]
2807 ///     \  /        \  /        \  /        \  /
2808 ///      +           +           +           +
2809 ///     G1          G2          G3          G4
2810 /// \endverbatim
2811 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2812 /// each level recursively, accumulating the score. It starts from matching
2813 /// the additions at level 0, then moves on to the loads (level 1). The
2814 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2815 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2816 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2817 /// Please note that the order of the operands does not matter, as we
2818 /// evaluate the score of all profitable combinations of operands. In
2819 /// other words the score of G1 and G4 is the same as G1 and G2. This
2820 /// heuristic is based on ideas described in:
2821 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2822 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2823 /// Luís F. W. Góes
2825 Instruction *U2, int CurrLevel,
2826 ArrayRef<Value *> MainAltOps) const {
2827
2828 // Get the shallow score of V1 and V2.
2829 int ShallowScoreAtThisLevel =
2830 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2831
2832 // If reached MaxLevel,
2833 // or if V1 and V2 are not instructions,
2834 // or if they are SPLAT,
2835 // or if they are not consecutive,
2836 // or if profitable to vectorize loads or extractelements, early return
2837 // the current cost.
2838 auto *I1 = dyn_cast<Instruction>(LHS);
2839 auto *I2 = dyn_cast<Instruction>(RHS);
2840 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2841 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2842 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2843 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2845 ShallowScoreAtThisLevel))
2846 return ShallowScoreAtThisLevel;
2847 assert(I1 && I2 && "Should have early exited.");
2848
2849 // Contains the I2 operand indexes that got matched with I1 operands.
2850 SmallSet<unsigned, 4> Op2Used;
2851
2852 // Recursion towards the operands of I1 and I2. We are trying all possible
2853 // operand pairs, and keeping track of the best score.
2854 if (I1->getNumOperands() != I2->getNumOperands())
2856 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2857 OpIdx1 != NumOperands1; ++OpIdx1) {
2858 // Try to pair op1I with the best operand of I2.
2859 int MaxTmpScore = 0;
2860 unsigned MaxOpIdx2 = 0;
2861 bool FoundBest = false;
2862 // If I2 is commutative try all combinations.
2863 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2864 unsigned ToIdx = isCommutative(I2)
2865 ? I2->getNumOperands()
2866 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2867 assert(FromIdx <= ToIdx && "Bad index");
2868 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2869 // Skip operands already paired with OpIdx1.
2870 if (Op2Used.count(OpIdx2))
2871 continue;
2872 // Recursively calculate the cost at each level
2873 int TmpScore =
2874 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2875 I1, I2, CurrLevel + 1, {});
2876 // Look for the best score.
2877 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2878 TmpScore > MaxTmpScore) {
2879 MaxTmpScore = TmpScore;
2880 MaxOpIdx2 = OpIdx2;
2881 FoundBest = true;
2882 }
2883 }
2884 if (FoundBest) {
2885 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2886 Op2Used.insert(MaxOpIdx2);
2887 ShallowScoreAtThisLevel += MaxTmpScore;
2888 }
2889 }
2890 return ShallowScoreAtThisLevel;
2891 }
2892 };
2893 /// A helper data structure to hold the operands of a vector of instructions.
2894 /// This supports a fixed vector length for all operand vectors.
2896 /// For each operand we need (i) the value, and (ii) the opcode that it
2897 /// would be attached to if the expression was in a left-linearized form.
2898 /// This is required to avoid illegal operand reordering.
2899 /// For example:
2900 /// \verbatim
2901 ///                         0 Op1
2902 ///                         |/
2903 /// Op1 Op2   Linearized    + Op2
2904 ///   \ /     ---------->   |/
2905 ///    -                    -
2906 ///
2907 /// Op1 - Op2            (0 + Op1) - Op2
2908 /// \endverbatim
2909 ///
2910 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2911 ///
2912 /// Another way to think of this is to track all the operations across the
2913 /// path from the operand all the way to the root of the tree and to
2914 /// calculate the operation that corresponds to this path. For example, the
2915 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2916 /// corresponding operation is a '-' (which matches the one in the
2917 /// linearized tree, as shown above).
2918 ///
2919 /// For lack of a better term, we refer to this operation as Accumulated
2920 /// Path Operation (APO).
2921 struct OperandData {
2922 OperandData() = default;
2923 OperandData(Value *V, bool APO, bool IsUsed)
2924 : V(V), APO(APO), IsUsed(IsUsed) {}
2925 /// The operand value.
2926 Value *V = nullptr;
2927 /// TreeEntries only allow a single opcode, or an alternate sequence of
2928 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2929 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2930 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2931 /// (e.g., Add/Mul)
2932 bool APO = false;
2933 /// Helper data for the reordering function.
2934 bool IsUsed = false;
2935 };
2936
/// Strategy used while reordering operands. For every lane we pick the
/// operand that fits best next to the operand of the neighboring lane, and
/// what "fits best" means depends on the kind of value we are looking for.
/// For example, if the neighboring lane has a load, we look for a load that
/// accesses a consecutive address. The enumerators below name those
/// strategies.
enum class ReorderingMode {
  /// Matching loads to consecutive memory addresses.
  Load,
  /// Matching instructions based on opcode (same or alternate).
  Opcode,
  /// Matching constants.
  Constant,
  /// Matching the same instruction multiple times (broadcast).
  Splat,
  /// We failed to create a vectorizable group.
  Failed,
};
2950
2951 using OperandDataVec = SmallVector<OperandData, 2>;
2952
2953 /// A vector of operand vectors.
2955 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2956 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2957 unsigned ArgSize = 0;
2958
2959 const TargetLibraryInfo &TLI;
2960 const DataLayout &DL;
2961 ScalarEvolution &SE;
2962 const BoUpSLP &R;
2963 const Loop *L = nullptr;
2964
2965 /// \returns the operand data at \p OpIdx and \p Lane.
2966 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2967 return OpsVec[OpIdx][Lane];
2968 }
2969
2970 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2971 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2972 return OpsVec[OpIdx][Lane];
2973 }
2974
2975 /// Clears the used flag for all entries.
2976 void clearUsed() {
2977 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2978 OpIdx != NumOperands; ++OpIdx)
2979 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2980 ++Lane)
2981 OpsVec[OpIdx][Lane].IsUsed = false;
2982 }
2983
2984 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2985 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2986 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2987 }
2988
2989 /// \param Lane lane of the operands under analysis.
2990 /// \param OpIdx operand index in \p Lane lane we're looking the best
2991 /// candidate for.
2992 /// \param Idx operand index of the current candidate value.
2993 /// \returns The additional score due to possible broadcasting of the
2994 /// elements in the lane. It is more profitable to have power-of-2 unique
2995 /// elements in the lane, it will be vectorized with higher probability
2996 /// after removing duplicates. Currently the SLP vectorizer supports only
2997 /// vectorization of the power-of-2 number of unique scalars.
2998 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2999 const SmallBitVector &UsedLanes) const {
3000 Value *IdxLaneV = getData(Idx, Lane).V;
3001 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
3002 isa<ExtractElementInst>(IdxLaneV))
3003 return 0;
3005 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
3006 if (Ln == Lane)
3007 continue;
3008 Value *OpIdxLnV = getData(OpIdx, Ln).V;
3009 if (!isa<Instruction>(OpIdxLnV))
3010 return 0;
3011 Uniques.try_emplace(OpIdxLnV, Ln);
3012 }
3013 unsigned UniquesCount = Uniques.size();
3014 auto IdxIt = Uniques.find(IdxLaneV);
3015 unsigned UniquesCntWithIdxLaneV =
3016 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
3017 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
3018 auto OpIdxIt = Uniques.find(OpIdxLaneV);
3019 unsigned UniquesCntWithOpIdxLaneV =
3020 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
3021 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
3022 return 0;
3023 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
3024 UniquesCntWithOpIdxLaneV,
3025 UniquesCntWithOpIdxLaneV -
3026 bit_floor(UniquesCntWithOpIdxLaneV)) -
3027 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
3028 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
3029 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
3030 }
3031
3032 /// \param Lane lane of the operands under analysis.
3033 /// \param OpIdx operand index in \p Lane lane we're looking the best
3034 /// candidate for.
3035 /// \param Idx operand index of the current candidate value.
3036 /// \returns The additional score for the scalar which users are all
3037 /// vectorized.
3038 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
3039 Value *IdxLaneV = getData(Idx, Lane).V;
3040 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
3041 // Do not care about number of uses for vector-like instructions
3042 // (extractelement/extractvalue with constant indices), they are extracts
3043 // themselves and already externally used. Vectorization of such
3044 // instructions does not add extra extractelement instruction, just may
3045 // remove it.
3046 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
3047 isVectorLikeInstWithConstOps(OpIdxLaneV))
3049 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
3050 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
3051 return 0;
3052 return R.areAllUsersVectorized(IdxLaneI)
3054 : 0;
3055 }
3056
3057 /// Score scaling factor for fully compatible instructions but with
3058 /// different number of external uses. Allows better selection of the
3059 /// instructions with less external uses.
3060 static const int ScoreScaleFactor = 10;
3061
3062 /// \Returns the look-ahead score, which tells us how much the sub-trees
3063 /// rooted at \p LHS and \p RHS match, the more they match the higher the
3064 /// score. This helps break ties in an informed way when we cannot decide on
3065 /// the order of the operands by just considering the immediate
3066 /// predecessors.
3067 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
3068 int Lane, unsigned OpIdx, unsigned Idx,
3069 bool &IsUsed, const SmallBitVector &UsedLanes) {
3070 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
3072 // Keep track of the instruction stack as we recurse into the operands
3073 // during the look-ahead score exploration.
3074 int Score =
3075 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
3076 /*CurrLevel=*/1, MainAltOps);
3077 if (Score) {
3078 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
3079 if (Score <= -SplatScore) {
3080 // Failed score.
3081 Score = 0;
3082 } else {
3083 Score += SplatScore;
3084 // Scale score to see the difference between different operands
3085 // and similar operands but all vectorized/not all vectorized
3086 // uses. It does not affect actual selection of the best
3087 // compatible operand in general, just allows to select the
3088 // operand with all vectorized uses.
3089 Score *= ScoreScaleFactor;
3090 Score += getExternalUseScore(Lane, OpIdx, Idx);
3091 IsUsed = true;
3092 }
3093 }
3094 return Score;
3095 }
3096
3097 /// Best defined scores per lanes between the passes. Used to choose the
3098 /// best operand (with the highest score) between the passes.
3099 /// The key - {Operand Index, Lane}.
3100 /// The value - the best score between the passes for the lane and the
3101 /// operand.
3103 BestScoresPerLanes;
3104
3105 // Search all operands in Ops[*][Lane] for the one that matches best
3106 // Ops[OpIdx][LastLane] and return its operand index.
3107 // If no good match can be found, return std::nullopt.
3108 std::optional<unsigned>
3109 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
3110 ArrayRef<ReorderingMode> ReorderingModes,
3111 ArrayRef<Value *> MainAltOps,
3112 const SmallBitVector &UsedLanes) {
3113 unsigned NumOperands = getNumOperands();
3114
3115 // The operand of the previous lane at OpIdx.
3116 Value *OpLastLane = getData(OpIdx, LastLane).V;
3117
3118 // Our strategy mode for OpIdx.
3119 ReorderingMode RMode = ReorderingModes[OpIdx];
3120 if (RMode == ReorderingMode::Failed)
3121 return std::nullopt;
3122
3123 // The linearized opcode of the operand at OpIdx, Lane.
3124 bool OpIdxAPO = getData(OpIdx, Lane).APO;
3125
3126 // The best operand index and its score.
3127 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
3128 // are using the score to differentiate between the two.
3129 struct BestOpData {
3130 std::optional<unsigned> Idx;
3131 unsigned Score = 0;
3132 } BestOp;
3133 BestOp.Score =
3134 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
3135 .first->second;
3136
3137 // Track if the operand must be marked as used. If the operand is set to
3138 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
3139 // want to reestimate the operands again on the following iterations).
3140 bool IsUsed = RMode == ReorderingMode::Splat ||
3141 RMode == ReorderingMode::Constant ||
3142 RMode == ReorderingMode::Load;
3143 // Iterate through all unused operands and look for the best.
3144 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
3145 // Get the operand at Idx and Lane.
3146 OperandData &OpData = getData(Idx, Lane);
3147 Value *Op = OpData.V;
3148 bool OpAPO = OpData.APO;
3149
3150 // Skip already selected operands.
3151 if (OpData.IsUsed)
3152 continue;
3153
3154 // Skip if we are trying to move the operand to a position with a
3155 // different opcode in the linearized tree form. This would break the
3156 // semantics.
3157 if (OpAPO != OpIdxAPO)
3158 continue;
3159
3160 // Look for an operand that matches the current mode.
3161 switch (RMode) {
3162 case ReorderingMode::Load:
3163 case ReorderingMode::Opcode: {
3164 bool LeftToRight = Lane > LastLane;
3165 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
3166 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
3167 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
3168 OpIdx, Idx, IsUsed, UsedLanes);
3169 if (Score > static_cast<int>(BestOp.Score) ||
3170 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
3171 Idx == OpIdx)) {
3172 BestOp.Idx = Idx;
3173 BestOp.Score = Score;
3174 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
3175 }
3176 break;
3177 }
3178 case ReorderingMode::Constant:
3179 if (isa<Constant>(Op) ||
3180 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
3181 BestOp.Idx = Idx;
3182 if (isa<Constant>(Op)) {
3184 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3186 }
3188 IsUsed = false;
3189 }
3190 break;
3191 case ReorderingMode::Splat:
3192 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
3193 IsUsed = Op == OpLastLane;
3194 if (Op == OpLastLane) {
3195 BestOp.Score = LookAheadHeuristics::ScoreSplat;
3196 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3198 }
3199 BestOp.Idx = Idx;
3200 }
3201 break;
3202 case ReorderingMode::Failed:
3203 llvm_unreachable("Not expected Failed reordering mode.");
3204 }
3205 }
3206
3207 if (BestOp.Idx) {
3208 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
3209 return BestOp.Idx;
3210 }
3211 // If we could not find a good match return std::nullopt.
3212 return std::nullopt;
3213 }
3214
3215 /// Helper for reorderOperandVecs.
3216 /// \returns the lane that we should start reordering from. This is the one
3217 /// which has the least number of operands that can freely move about or
3218 /// less profitable because it already has the most optimal set of operands.
3219 unsigned getBestLaneToStartReordering() const {
3220 unsigned Min = UINT_MAX;
3221 unsigned SameOpNumber = 0;
3222 // std::pair<unsigned, unsigned> is used to implement a simple voting
3223 // algorithm and choose the lane with the least number of operands that
3224 // can freely move about or less profitable because it already has the
3225 // most optimal set of operands. The first unsigned is a counter for
3226 // voting, the second unsigned is the counter of lanes with instructions
3227 // with same/alternate opcodes and same parent basic block.
3229 // Try to be closer to the original results, if we have multiple lanes
3230 // with same cost. If 2 lanes have the same cost, use the one with the
3231 // highest index.
3232 for (int I = getNumLanes(); I > 0; --I) {
3233 unsigned Lane = I - 1;
3234 OperandsOrderData NumFreeOpsHash =
3235 getMaxNumOperandsThatCanBeReordered(Lane);
3236 // Compare the number of operands that can move and choose the one with
3237 // the least number.
3238 if (NumFreeOpsHash.NumOfAPOs < Min) {
3239 Min = NumFreeOpsHash.NumOfAPOs;
3240 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3241 HashMap.clear();
3242 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3243 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3244 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3245 // Select the most optimal lane in terms of number of operands that
3246 // should be moved around.
3247 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3248 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3249 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3250 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3251 auto [It, Inserted] =
3252 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3253 if (!Inserted)
3254 ++It->second.first;
3255 }
3256 }
3257 // Select the lane with the minimum counter.
3258 unsigned BestLane = 0;
3259 unsigned CntMin = UINT_MAX;
3260 for (const auto &Data : reverse(HashMap)) {
3261 if (Data.second.first < CntMin) {
3262 CntMin = Data.second.first;
3263 BestLane = Data.second.second;
3264 }
3265 }
3266 return BestLane;
3267 }
3268
/// Per-lane statistics consulted when choosing the lane from which operand
/// reordering starts.
struct OperandsOrderData {
  /// The best number of operands sharing the same APO, i.e. operands which
  /// can be reordered among themselves.
  unsigned NumOfAPOs = UINT_MAX;

  /// Number of operands with the same/alternate instruction opcode and
  /// parent.
  unsigned NumOpsWithSameOpcodeParent = 0u;

  /// Hash describing the actual operand ordering (position id combined with
  /// opcode value). The voting mechanism uses it to find the lane with the
  /// least number of operands that can freely move about, or the lane that is
  /// less profitable because it already has the most optimal set of operands.
  /// A SmallVector<unsigned> could be used instead, but a hash code is faster
  /// and requires less memory.
  unsigned Hash = 0u;
};
3286 /// \returns the maximum number of operands that are allowed to be reordered
3287 /// for \p Lane and the number of compatible instructions(with the same
3288 /// parent/opcode). This is used as a heuristic for selecting the first lane
3289 /// to start operand reordering.
3290 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3291 unsigned CntTrue = 0;
3292 unsigned NumOperands = getNumOperands();
3293 // Operands with the same APO can be reordered. We therefore need to count
3294 // how many of them we have for each APO, like this: Cnt[APO] = x.
3295 // Since we only have two APOs, namely true and false, we can avoid using
3296 // a map. Instead we can simply count the number of operands that
3297 // correspond to one of them (in this case the 'true' APO), and calculate
3298 // the other by subtracting it from the total number of operands.
3299 // Operands with the same instruction opcode and parent are more
3300 // profitable since we don't need to move them in many cases, with a high
3301 // probability such lane already can be vectorized effectively.
3302 bool AllUndefs = true;
3303 unsigned NumOpsWithSameOpcodeParent = 0;
3304 Instruction *OpcodeI = nullptr;
3305 BasicBlock *Parent = nullptr;
3306 unsigned Hash = 0;
3307 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3308 const OperandData &OpData = getData(OpIdx, Lane);
3309 if (OpData.APO)
3310 ++CntTrue;
3311 // Use Boyer-Moore majority voting for finding the majority opcode and
3312 // the number of times it occurs.
3313 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3314 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3315 I->getParent() != Parent) {
3316 if (NumOpsWithSameOpcodeParent == 0) {
3317 NumOpsWithSameOpcodeParent = 1;
3318 OpcodeI = I;
3319 Parent = I->getParent();
3320 } else {
3321 --NumOpsWithSameOpcodeParent;
3322 }
3323 } else {
3324 ++NumOpsWithSameOpcodeParent;
3325 }
3326 }
3327 Hash = hash_combine(
3328 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3329 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3330 }
3331 if (AllUndefs)
3332 return {};
3333 OperandsOrderData Data;
3334 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3335 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3336 Data.Hash = Hash;
3337 return Data;
3338 }
3339
3340 /// Go through the instructions in VL and append their operands.
3341 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3342 const InstructionsState &S) {
3343 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3344 assert((empty() || all_of(Operands,
3345 [this](const ValueList &VL) {
3346 return VL.size() == getNumLanes();
3347 })) &&
3348 "Expected same number of lanes");
3349 assert(S.valid() && "InstructionsState is invalid.");
3350 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3351 // arguments to the intrinsic produces the same result.
3352 Instruction *MainOp = S.getMainOp();
3353 unsigned NumOperands = MainOp->getNumOperands();
3355 OpsVec.resize(ArgSize);
3356 unsigned NumLanes = VL.size();
3357 for (OperandDataVec &Ops : OpsVec)
3358 Ops.resize(NumLanes);
3359 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3360 // Our tree has just 3 nodes: the root and two operands.
3361 // It is therefore trivial to get the APO. We only need to check the
3362 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3363 // operand. The LHS operand of both add and sub is never attached to an
3364 // inverse operation in the linearized form, therefore its APO is
3365 // false. The RHS is true only if V is an inverse operation.
3366
3367 // Since operand reordering is performed on groups of commutative
3368 // operations or alternating sequences (e.g., +, -), we can safely tell
3369 // the inverse operations by checking commutativity.
3370 auto *I = dyn_cast<Instruction>(VL[Lane]);
3371 if (!I && isa<PoisonValue>(VL[Lane])) {
3372 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3373 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3374 continue;
3375 }
3376 bool IsInverseOperation = false;
3377 if (S.isCopyableElement(VL[Lane])) {
3378 // The value is a copyable element.
3379 IsInverseOperation =
3380 !isCommutative(MainOp, VL[Lane], /*IsCopyable=*/true);
3381 } else {
3382 assert(I && "Expected instruction");
3383 auto [SelectedOp, Ops] = convertTo(I, S);
3384 // We cannot check commutativity by the converted instruction
3385 // (SelectedOp) because isCommutative also examines def-use
3386 // relationships.
3387 IsInverseOperation = !isCommutative(SelectedOp, I);
3388 }
3389 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3390 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3391 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3392 }
3393 }
3394 }
3395
3396 /// \returns the number of operands, i.e. the stored ArgSize value that
/// appendOperands() also uses to size OpsVec.
3397 unsigned getNumOperands() const { return ArgSize; }
3398
3399 /// \returns the number of lanes.
3400 unsigned getNumLanes() const { return OpsVec[0].size(); }
3401
3402 /// \returns the operand value at \p OpIdx and \p Lane.
3403 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3404 return getData(OpIdx, Lane).V;
3405 }
3406
3407 /// \returns true if the data structure is empty, i.e. OpsVec holds no
/// operand vectors.
3408 bool empty() const { return OpsVec.empty(); }
3409
3410 /// Clears the data by emptying OpsVec; afterwards empty() returns true.
/// No other member is touched here.
3411 void clear() { OpsVec.clear(); }
3412
3413 /// \Returns true if there are enough operands identical to \p Op to fill
3414 /// the whole vector (it is mixed with constants or loop invariant values).
3415 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3416 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3417 assert(Op == getValue(OpIdx, Lane) &&
3418 "Op is expected to be getValue(OpIdx, Lane).");
3419 // Small number of loads - try load matching.
3420 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3421 return false;
3422 bool OpAPO = getData(OpIdx, Lane).APO;
3423 bool IsInvariant = L && L->isLoopInvariant(Op);
3424 unsigned Cnt = 0;
3425 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3426 if (Ln == Lane)
3427 continue;
3428 // This is set to true if we found a candidate for broadcast at Lane.
3429 bool FoundCandidate = false;
3430 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3431 OperandData &Data = getData(OpI, Ln);
3432 if (Data.APO != OpAPO || Data.IsUsed)
3433 continue;
3434 Value *OpILane = getValue(OpI, Lane);
3435 bool IsConstantOp = isa<Constant>(OpILane);
3436 // Consider the broadcast candidate if:
3437 // 1. Same value is found in one of the operands.
3438 if (Data.V == Op ||
3439 // 2. The operand in the given lane is not constant but there is a
3440 // constant operand in another lane (which can be moved to the
3441 // given lane). In this case we can represent it as a simple
3442 // permutation of constant and broadcast.
3443 (!IsConstantOp &&
3444 ((Lns > 2 && isa<Constant>(Data.V)) ||
3445 // 2.1. If we have only 2 lanes, need to check that value in the
3446 // next lane does not build same opcode sequence.
3447 (Lns == 2 &&
3448 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3449 isa<Constant>(Data.V)))) ||
3450 // 3. The operand in the current lane is loop invariant (can be
3451 // hoisted out) and another operand is also a loop invariant
3452 // (though not a constant). In this case the whole vector can be
3453 // hoisted out.
3454 // FIXME: need to teach the cost model about this case for better
3455 // estimation.
3456 (IsInvariant && !isa<Constant>(Data.V) &&
3457 !getSameOpcode({Op, Data.V}, TLI) &&
3458 L->isLoopInvariant(Data.V))) {
3459 FoundCandidate = true;
3460 Data.IsUsed = Data.V == Op;
3461 if (Data.V == Op)
3462 ++Cnt;
3463 break;
3464 }
3465 }
3466 if (!FoundCandidate)
3467 return false;
3468 }
3469 return getNumLanes() == 2 || Cnt > 1;
3470 }
3471
3472 /// Checks if there is at least single compatible operand in lanes other
3473 /// than \p Lane, compatible with the operand \p Op.
3474 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3475 assert(Op == getValue(OpIdx, Lane) &&
3476 "Op is expected to be getValue(OpIdx, Lane).");
3477 bool OpAPO = getData(OpIdx, Lane).APO;
3478 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3479 if (Ln == Lane)
3480 continue;
3481 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3482 const OperandData &Data = getData(OpI, Ln);
3483 if (Data.APO != OpAPO || Data.IsUsed)
3484 return true;
3485 Value *OpILn = getValue(OpI, Ln);
3486 return (L && L->isLoopInvariant(OpILn)) ||
3487 (getSameOpcode({Op, OpILn}, TLI) &&
3488 allSameBlock({Op, OpILn}));
3489 }))
3490 return true;
3491 }
3492 return false;
3493 }
3494
3495 public:
3496 /// Initialize with all the operands of the instruction vector \p RootVL.
3498 const InstructionsState &S, const BoUpSLP &R)
3499 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3500 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3501 // Append all the operands of RootVL.
3502 appendOperands(RootVL, Operands, S);
3503 }
3504
3505 /// \Returns a value vector with the operands across all lanes for the
3506 /// opearnd at \p OpIdx.
3507 ValueList getVL(unsigned OpIdx) const {
3508 ValueList OpVL(OpsVec[OpIdx].size());
3509 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3510 "Expected same num of lanes across all operands");
3511 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3512 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3513 return OpVL;
3514 }
3515
3516 // Performs operand reordering for 2 or more operands.
3517 // The original operands are in OrigOps[OpIdx][Lane].
3518 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3519 void reorder() {
3520 unsigned NumOperands = getNumOperands();
3521 unsigned NumLanes = getNumLanes();
3522 // Each operand has its own mode. We are using this mode to help us select
3523 // the instructions for each lane, so that they match best with the ones
3524 // we have selected so far.
3525 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3526
3527 // This is a greedy single-pass algorithm. We are going over each lane
3528 // once and deciding on the best order right away with no back-tracking.
3529 // However, in order to increase its effectiveness, we start with the lane
3530 // that has operands that can move the least. For example, given the
3531 // following lanes:
3532 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3533 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3534 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3535 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3536 // we will start at Lane 1, since the operands of the subtraction cannot
3537 // be reordered. Then we will visit the rest of the lanes in a circular
3538 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3539
3540 // Find the first lane that we will start our search from.
3541 unsigned FirstLane = getBestLaneToStartReordering();
3542
3543 // Initialize the modes.
3544 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3545 Value *OpLane0 = getValue(OpIdx, FirstLane);
3546 // Keep track if we have instructions with all the same opcode on one
3547 // side.
3548 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3549 // Check if OpLane0 should be broadcast.
3550 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3551 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3552 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3553 else if (isa<LoadInst>(OpILane0))
3554 ReorderingModes[OpIdx] = ReorderingMode::Load;
3555 else
3556 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3557 } else if (isa<Constant>(OpLane0)) {
3558 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3559 } else if (isa<Argument>(OpLane0)) {
3560 // Our best hope is a Splat. It may save some cost in some cases.
3561 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3562 } else {
3563 llvm_unreachable("Unexpected value kind.");
3564 }
3565 }
3566
3567 // Check that we don't have same operands. No need to reorder if operands
3568 // are just perfect diamond or shuffled diamond match. Do not do it only
3569 // for possible broadcasts.
3570 auto &&SkipReordering = [this]() {
3571 SmallPtrSet<Value *, 4> UniqueValues;
3572 ArrayRef<OperandData> Op0 = OpsVec.front();
3573 for (const OperandData &Data : Op0)
3574 UniqueValues.insert(Data.V);
3576 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3577 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3578 return !UniqueValues.contains(Data.V);
3579 }))
3580 return false;
3581 }
3582 return UniqueValues.size() != 2;
3583 };
3584
3585 // If the initial strategy fails for any of the operand indexes, then we
3586 // perform reordering again in a second pass. This helps avoid assigning
3587 // high priority to the failed strategy, and should improve reordering for
3588 // the non-failed operand indexes.
3589 for (int Pass = 0; Pass != 2; ++Pass) {
3590 // Check if no need to reorder operands since they are a perfect or
3591 // shuffled diamond match.
3592 // Need to do it to avoid extra external use cost counting for
3593 // shuffled matches, which may cause regressions.
3594 if (SkipReordering())
3595 break;
3596 // Skip the second pass if the first pass did not fail.
3597 bool StrategyFailed = false;
3598 // Mark all operand data as free to use.
3599 clearUsed();
3600 // We keep the original operand order for the FirstLane, so reorder the
3601 // rest of the lanes. We are visiting the nodes in a circular fashion,
3602 // using FirstLane as the center point and increasing the radius
3603 // distance.
3604 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3605 for (unsigned I = 0; I < NumOperands; ++I)
3606 MainAltOps[I].push_back(getData(I, FirstLane).V);
3607
3608 SmallBitVector UsedLanes(NumLanes);
3609 UsedLanes.set(FirstLane);
3610 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3611 // Visit the lane on the right and then the lane on the left.
3612 for (int Direction : {+1, -1}) {
3613 int Lane = FirstLane + Direction * Distance;
3614 if (Lane < 0 || Lane >= (int)NumLanes)
3615 continue;
3616 UsedLanes.set(Lane);
3617 int LastLane = Lane - Direction;
3618 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3619 "Out of bounds");
3620 // Look for a good match for each operand.
3621 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3622 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3623 std::optional<unsigned> BestIdx =
3624 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3625 MainAltOps[OpIdx], UsedLanes);
3626 // By not selecting a value, we allow the operands that follow to
3627 // select a better matching value. We will get a non-null value in
3628 // the next run of getBestOperand().
3629 if (BestIdx) {
3630 // Swap the current operand with the one returned by
3631 // getBestOperand().
3632 swap(OpIdx, *BestIdx, Lane);
3633 } else {
3634 // Enable the second pass.
3635 StrategyFailed = true;
3636 }
3637 // Try to get the alternate opcode and follow it during analysis.
3638 if (MainAltOps[OpIdx].size() != 2) {
3639 OperandData &AltOp = getData(OpIdx, Lane);
3640 InstructionsState OpS =
3641 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3642 if (OpS && OpS.isAltShuffle())
3643 MainAltOps[OpIdx].push_back(AltOp.V);
3644 }
3645 }
3646 }
3647 }
3648 // Skip second pass if the strategy did not fail.
3649 if (!StrategyFailed)
3650 break;
3651 }
3652 }
3653
3654#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3655 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3656 switch (RMode) {
3657 case ReorderingMode::Load:
3658 return "Load";
3659 case ReorderingMode::Opcode:
3660 return "Opcode";
3661 case ReorderingMode::Constant:
3662 return "Constant";
3663 case ReorderingMode::Splat:
3664 return "Splat";
3665 case ReorderingMode::Failed:
3666 return "Failed";
3667 }
3668 llvm_unreachable("Unimplemented Reordering Type");
3669 }
3670
3671 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3672 raw_ostream &OS) {
3673 return OS << getModeStr(RMode);
3674 }
3675
3676 /// Debug print: writes the name of \p RMode to dbgs() via printMode().
3677 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3678 printMode(RMode, dbgs());
3679 }
3680
/// Streams the textual name of \p RMode into \p OS by forwarding to
/// printMode(), and returns that call's result.
3681 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3682 return printMode(RMode, OS);
3683 }
3684
3686 const unsigned Indent = 2;
3687 unsigned Cnt = 0;
3688 for (const OperandDataVec &OpDataVec : OpsVec) {
3689 OS << "Operand " << Cnt++ << "\n";
3690 for (const OperandData &OpData : OpDataVec) {
3691 OS.indent(Indent) << "{";
3692 if (Value *V = OpData.V)
3693 OS << *V;
3694 else
3695 OS << "null";
3696 OS << ", APO:" << OpData.APO << "}\n";
3697 }
3698 OS << "\n";
3699 }
3700 return OS;
3701 }
3702
3703 /// Debug print: forwards to print() with dbgs() as the output stream.
3704 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3705#endif
3706 };
3707
3708 /// Evaluate each pair in \p Candidates and return index into \p Candidates
3709 /// for a pair which have highest score deemed to have best chance to form
3710 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
3711 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
3712 /// of the cost, considered to be good enough score.
3713 std::pair<std::optional<int>, int>
3714 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3715 int Limit = LookAheadHeuristics::ScoreFail) const {
3716 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3718 int BestScore = Limit;
3719 std::optional<int> Index;
3720 for (int I : seq<int>(0, Candidates.size())) {
3721 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3722 Candidates[I].second,
3723 /*U1=*/nullptr, /*U2=*/nullptr,
3724 /*CurrLevel=*/1, {});
3725 if (Score > BestScore) {
3726 BestScore = Score;
3727 Index = I;
3728 }
3729 }
3730 return std::make_pair(Index, BestScore);
3731 }
3732
3733 /// Checks if the instruction is marked for deletion.
3734 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3735
3736 /// Removes an instruction from its block and eventually deletes it.
3737 /// It's like Instruction::eraseFromParent() except that the actual deletion
3738 /// is delayed until BoUpSLP is destructed.
3740 DeletedInstructions.insert(I);
3741 }
3742
3743 /// Remove instructions from the parent function and clear the operands of \p
3744 /// DeadVals instructions, marking for deletion trivially dead operands.
3745 template <typename T>
3747 ArrayRef<T *> DeadVals,
3748 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
3749 VectorValuesAndScales) {
3751 for (T *V : DeadVals) {
3752 auto *I = cast<Instruction>(V);
3754 }
3755 DenseSet<Value *> Processed;
3756 for (T *V : DeadVals) {
3757 if (!V || !Processed.insert(V).second)
3758 continue;
3759 auto *I = cast<Instruction>(V);
3761 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3762 for (Use &U : I->operands()) {
3763 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3764 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3766 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3767 return Entry->VectorizedValue == OpI;
3768 })))
3769 DeadInsts.push_back(OpI);
3770 }
3771 I->dropAllReferences();
3772 }
3773 for (T *V : DeadVals) {
3774 auto *I = cast<Instruction>(V);
3775 if (!I->getParent())
3776 continue;
3777 assert((I->use_empty() || all_of(I->uses(),
3778 [&](Use &U) {
3779 return isDeleted(
3780 cast<Instruction>(U.getUser()));
3781 })) &&
3782 "trying to erase instruction with users.");
3783 I->removeFromParent();
3784 SE->forgetValue(I);
3785 }
3786 // Process the dead instruction list until empty.
3787 while (!DeadInsts.empty()) {
3788 Value *V = DeadInsts.pop_back_val();
3790 if (!VI || !VI->getParent())
3791 continue;
3793 "Live instruction found in dead worklist!");
3794 assert(VI->use_empty() && "Instructions with uses are not dead.");
3795
3796 // Don't lose the debug info while deleting the instructions.
3797 salvageDebugInfo(*VI);
3798
3799 // Null out all of the instruction's operands to see if any operand
3800 // becomes dead as we go.
3801 for (Use &OpU : VI->operands()) {
3802 Value *OpV = OpU.get();
3803 if (!OpV)
3804 continue;
3805 OpU.set(nullptr);
3806
3807 if (!OpV->use_empty())
3808 continue;
3809
3810 // If the operand is an instruction that became dead as we nulled out
3811 // the operand, and if it is 'trivially' dead, delete it in a future
3812 // loop iteration.
3813 if (auto *OpI = dyn_cast<Instruction>(OpV))
3814 if (!DeletedInstructions.contains(OpI) &&
3815 (!OpI->getType()->isVectorTy() ||
3816 none_of(
3817 VectorValuesAndScales,
3818 [&](const std::tuple<WeakTrackingVH, unsigned, bool, bool>
3819 &V) { return std::get<0>(V) == OpI; })) &&
3821 DeadInsts.push_back(OpI);
3822 }
3823
3824 VI->removeFromParent();
3825 eraseInstruction(VI);
3826 SE->forgetValue(VI);
3827 }
3828 }
3829
3830 /// Checks if the instruction was already analyzed for being possible
3831 /// reduction root.
3833 return AnalyzedReductionsRoots.count(I);
3834 }
3835 /// Register given instruction as already analyzed for being possible
3836 /// reduction root.
3838 AnalyzedReductionsRoots.insert(I);
3839 }
3840 /// Checks if the provided list of reduced values was checked already for
3841 /// vectorization.
3843 return AnalyzedReductionVals.contains(hash_value(VL));
3844 }
3845 /// Adds the list of reduced values to list of already checked values for the
3846 /// vectorization.
3848 AnalyzedReductionVals.insert(hash_value(VL));
3849 }
3850 /// Clear the list of the analyzed reduction root instructions.
3852 AnalyzedReductionsRoots.clear();
3853 AnalyzedReductionVals.clear();
3854 AnalyzedMinBWVals.clear();
3855 }
3856 /// Checks if the given value is gathered in one of the nodes.
3857 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3858 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3859 }
3860 /// Checks if the given value is gathered in one of the nodes.
/// \returns true if \p V is a member of MustGather.
3861 bool isGathered(const Value *V) const {
3862 return MustGather.contains(V);
3863 }
3864 /// Checks if the specified value was not scheduled.
/// \returns true if \p V is a member of NonScheduledFirst.
3865 bool isNotScheduled(const Value *V) const {
3866 return NonScheduledFirst.contains(V);
3867 }
3868
3869 /// Check if the value is vectorized in the tree.
3870 bool isVectorized(const Value *V) const {
3871 assert(V && "V cannot be nullptr.");
3872 ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
3873 return any_of(Entries, [&](const TreeEntry *E) {
3874 return !DeletedNodes.contains(E) && !TransformedToGatherNodes.contains(E);
3875 });
3876 }
3877
3878 /// Checks if it is legal and profitable to build SplitVectorize node for the
3879 /// given \p VL.
3880 /// \param Op1 first homogeneous scalars.
3881 /// \param Op2 second homogeneous scalars.
3882 /// \param ReorderIndices indices to reorder the scalars.
3883 /// \returns true if the node was successfully built.
3885 const InstructionsState &LocalState,
3888 OrdersType &ReorderIndices) const;
3889
3890 ~BoUpSLP();
3891
3892private:
3893 /// Determine if a node \p E can be demoted to a smaller type with a
3894 /// truncation. We collect the entries that will be demoted in ToDemote.
3895 /// \param E Node for analysis
3896 /// \param ToDemote indices of the nodes to be demoted.
3897 bool collectValuesToDemote(
3898 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3900 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3901 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3902
3903 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3904 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3905 /// they have only one user and are reorderable).
3906 /// \param ReorderableGathers List of all gather nodes that require reordering
3907 /// (e.g., gather of extractelements or partially vectorizable loads).
3908 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3909 /// reordering, subset of \p NonVectorized.
3910 void buildReorderableOperands(
3911 TreeEntry *UserTE,
3912 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3913 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3914 SmallVectorImpl<TreeEntry *> &GatherOps);
3915
3916 /// Checks if the given \p TE is a gather node with clustered reused scalars
3917 /// and reorders it per given \p Mask.
3918 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3919
3920 /// Checks if all users of \p I are the part of the vectorization tree.
3921 bool areAllUsersVectorized(
3922 Instruction *I,
3923 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3924
3925 /// Estimates the number of scalar instructions in the tree.
3926 unsigned getNumScalarInsts() const;
3927
3928 /// Estimates the number of vector instructions (including buildvectors,
3929 /// shuffles, and extracts) that the tree will produce.
3930 unsigned getNumVectorInsts() const;
3931
3932 /// Return information about the vector formed for the specified index
3933 /// of a vector of (the same) instruction.
3936
3937 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3938 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3939 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3940 return const_cast<TreeEntry *>(
3941 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3942 }
3943
3944 /// Gets the root instruction for the given node. If the node is a strided
3945 /// load/store node with the reverse order, the root instruction is the last
3946 /// one.
3947 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3948
3949 /// \returns Cast context for the given graph node.
3951 getCastContextHint(const TreeEntry &TE) const;
3952
3953 /// \returns the scale of the given tree entry to the loop iteration.
3954 /// \p Scalar is the scalar value from the entry, if using the parent for the
3955 /// external use.
3956 /// \p U is the user of the vectorized value from the entry, if using the
3957 /// parent for the external use.
3958 uint64_t getScaleToLoopIterations(const TreeEntry &TE,
3959 Value *Scalar = nullptr,
3960 Instruction *U = nullptr);
3961
3962 /// \returns the product of trip counts of the loop \p L and all of its
3963 /// enclosing loops. Unlike the state kept by getScaleToLoopIterations(),
3964 /// this helper depends only on the loop structure and is independent of
3965 /// per-entry operand invariance. Returns 1 when loop-aware cost modeling
3966 /// is disabled or \p L is null.
3967 uint64_t getLoopNestScale(const Loop *L);
3968
3969 /// \returns a refined execution scale for a gather/buildvector tree entry
3970 /// \p TE. The scale is computed as the average of per-lane execution
3971 /// scales: each lane's scale is the loop-nest scale of the loop that
3972 /// contains the lane's defining instruction (or 1 if the lane is a
3973 /// constant / loop-invariant non-instruction value). This models the
3974 /// LICM hoisting that optimizeGatherSequence() performs after vectorization
3975 /// for inserts with loop-invariant operands. Falls back to the whole-entry
3976 /// scale when per-lane information is unavailable or the feature is off.
3977 uint64_t getGatherNodeEffectiveScale(const TreeEntry &TE);
3978
3979 /// Get the loop nest for the given loop \p L.
3980 ArrayRef<const Loop *> getLoopNest(const Loop *L);
3981
3982 /// \returns the cost of the vectorizable entry.
3983 InstructionCost getEntryCost(const TreeEntry *E,
3984 ArrayRef<Value *> VectorizedVals,
3985 SmallPtrSetImpl<Value *> &CheckedExtracts);
3986
3987 /// Estimates spill/reload cost from vector register pressure for \p E at the
3988 /// point of emitting its vector result type \p FinalVecTy. \p ScalarTy is the
3989 /// scalar/slot type used to widen into \p VecTy/\p FinalVecTy and may itself
3990 /// be a FixedVectorType in ReVec mode or an adjusted type due to MinBWs.
3991 InstructionCost getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
3992 VectorType *VecTy,
3993 VectorType *FinalVecTy,
3995
3996 /// This is the recursive part of buildTree.
3997 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3998 unsigned InterleaveFactor = 0);
3999
4000 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
4001 /// be vectorized to use the original vector (or aggregate "bitcast" to a
4002 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
4003 /// returns false, setting \p CurrentOrder to either an empty vector or a
4004 /// non-identity permutation that allows to reuse extract instructions.
4005 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
4006 /// extract order.
4007 bool canReuseExtract(ArrayRef<Value *> VL,
4008 SmallVectorImpl<unsigned> &CurrentOrder,
4009 bool ResizeAllowed = false) const;
4010
4011 /// Vectorize a single entry in the tree.
4012 Value *vectorizeTree(TreeEntry *E);
4013
4014 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
4015 /// \p E.
4016 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
4017
4018 /// Create a new vector from a list of scalar values. Produces a sequence
4019 /// which exploits values reused across lanes, and arranges the inserts
4020 /// for ease of later optimization.
4021 template <typename BVTy, typename ResTy, typename... Args>
4022 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
4023
4024 /// Create a new vector from a list of scalar values. Produces a sequence
4025 /// which exploits values reused across lanes, and arranges the inserts
4026 /// for ease of later optimization.
4027 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
4028
4029 /// Returns the instruction in the bundle, which can be used as a base point
4030 /// for scheduling. Usually it is the last instruction in the bundle, except
4031 /// for the case when all operands are external (in this case, it is the first
4032 /// instruction in the list).
4033 Instruction &getLastInstructionInBundle(const TreeEntry *E);
4034
4035 /// Tries to find extractelement instructions with constant indices from fixed
4036 /// vector type and gather such instructions into a bunch, which highly likely
4037 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
4038 /// was successful, the matched scalars are replaced by poison values in \p VL
4039 /// for future analysis.
4040 std::optional<TargetTransformInfo::ShuffleKind>
4041 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
4042 SmallVectorImpl<int> &Mask) const;
4043
4044 /// Tries to find extractelement instructions with constant indices from fixed
4045 /// vector type and gather such instructions into a bunch, which highly likely
4046 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
4047 /// was successful, the matched scalars are replaced by poison values in \p VL
4048 /// for future analysis.
4050 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
4052 unsigned NumParts) const;
4053
4054 /// Checks if the gathered \p VL can be represented as a single register
4055 /// shuffle(s) of previous tree entries.
4056 /// \param TE Tree entry checked for permutation.
4057 /// \param VL List of scalars (a subset of the TE scalar), checked for
4058 /// permutations. Must form single-register vector.
4059 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
4060 /// commands to build the mask using the original vector value, without
4061 /// relying on the potential reordering.
4062 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
4063 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
4064 std::optional<TargetTransformInfo::ShuffleKind>
4065 isGatherShuffledSingleRegisterEntry(
4066 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
4067 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder,
4068 unsigned SliceSize);
4069
4070 /// Checks if the gathered \p VL can be represented as multi-register
4071 /// shuffle(s) of previous tree entries.
4072 /// \param TE Tree entry checked for permutation.
4073 /// \param VL List of scalars (a subset of the TE scalar), checked for
4074 /// permutations.
4075 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
4076 /// commands to build the mask using the original vector value, without
4077 /// relying on the potential reordering.
4078 /// \returns per-register series of ShuffleKind, if gathered values can be
4079 /// represented as shuffles of previous tree entries. \p Mask is filled with
4080 /// the shuffle mask (also on per-register base).
4082 isGatherShuffledEntry(
4083 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
4085 unsigned NumParts, bool ForOrder = false);
4086
4087 /// \returns the cost of gathering (inserting) the values in \p VL into a
4088 /// vector.
4089 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
4090 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
4091 Type *ScalarTy) const;
4092
4093 /// Set the Builder insert point to one after the last instruction in
4094 /// the bundle
4095 void setInsertPointAfterBundle(const TreeEntry *E);
4096
4097 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
4098 /// specified, the starting vector value is poison.
4099 Value *
4100 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
4101 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
4102
4103 /// \returns whether the VectorizableTree is fully vectorizable and will
4104 /// be beneficial even if the tree height is tiny.
4105 bool isFullyVectorizableTinyTree(bool ForReduction) const;
4106
4107 /// Run through the list of all gathered loads in the graph and try to find
4108 /// vector loads/masked gathers instead of regular gathers. Later these loads
4109 /// are reshuffled to build final gathered nodes.
4110 void tryToVectorizeGatheredLoads(
4111 const SmallMapVector<
4112 std::tuple<BasicBlock *, Value *, Type *>,
4113 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
4114 &GatheredLoads);
4115
4116 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
4117 /// users of \p TE and collects the stores. It returns the map from the store
4118 /// pointers to the collected stores.
4120 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
4121
4122 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
4123 /// stores in \p StoresVec can form a vector instruction. If so it returns
4124 /// true and populates \p ReorderIndices with the shuffle indices of the
4125 /// stores when compared to the sorted vector.
4126 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
4127 OrdersType &ReorderIndices) const;
4128
4129 /// Iterates through the users of \p TE, looking for scalar stores that can be
4130 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
4131 /// their order and builds an order index vector for each store bundle. It
4132 /// returns all these order vectors found.
4133 /// We run this after the tree has formed, otherwise we may come across user
4134 /// instructions that are not yet in the tree.
4136 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
4137
4138 /// Tries to reorder the gathering node for better vectorization
4139 /// opportunities.
4140 void reorderGatherNode(TreeEntry &TE);
4141
4142 /// Checks if the tree represents disjoint or reduction of shl(zext, (0, 8,
4143 /// .., 56))-like pattern.
4144 /// If the int shifts unique, also strided, but not ordered, sets \p Order.
4145 /// If the node can be represented as a bitcast + bswap, sets \p IsBSwap.
4146 /// If the root nodes are loads, sets \p ForLoads to true.
4147 bool matchesShlZExt(const TreeEntry &TE, OrdersType &Order, bool &IsBSwap,
4148 bool &ForLoads) const;
4149
4150 /// Checks if the \p SelectTE matches zext+selects, which can be inversed for
4151 /// better codegen in case like zext (icmp ne), select (icmp eq), ....
4152 bool matchesInversedZExtSelect(
4153 const TreeEntry &SelectTE,
4154 SmallVectorImpl<unsigned> &InversedCmpsIndices) const;
4155
4156 /// Checks if the tree is reduction or of bit selects, like select %cmp, <1,
4157 /// 2, 4, 8, ..>, zeroinitializer, which can be reduced just to a bitcast %cmp
4158 /// to in.
4159 bool matchesSelectOfBits(const TreeEntry &SelectTE) const;
4160
/// A node of the SLP graph: a list of scalars (\a Scalars) together with its
/// vectorization state, per-operand scalar lists and the reuse/reorder masks.
4161 class TreeEntry {
4162 public:
4163 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
4164 TreeEntry(VecTreeTy &Container) : Container(Container) {}
4165
4166 /// \returns Common mask for reorder indices and reused scalars.
/// Split-vectorize nodes yield an empty mask here; see getSplitMask().
4167 SmallVector<int> getCommonMask() const {
4168 if (State == TreeEntry::SplitVectorize)
4169 return {};
4170 SmallVector<int> Mask;
4171 inversePermutation(ReorderIndices, Mask);
4172 ::addMask(Mask, ReuseShuffleIndices);
4173 return Mask;
4174 }
4175
4176 /// \returns The mask for split nodes. Must only be called on SplitVectorize
/// nodes with non-empty ReorderIndices (asserted).
4177 SmallVector<int> getSplitMask() const {
4178 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
4179 "Expected only split vectorize node.");
// The common VF is the larger of the two parts, delimited by the offset
// stored in the last combined entry.
4180 unsigned CommonVF = std::max<unsigned>(
4181 CombinedEntriesWithIndices.back().second,
4182 Scalars.size() - CombinedEntriesWithIndices.back().second);
4183 const unsigned Scale = getNumElements(Scalars.front()->getType());
4184 CommonVF *= Scale;
4185 SmallVector<int> Mask(getVectorFactor() * Scale, PoisonMaskElem);
4186 for (auto [Idx, I] : enumerate(ReorderIndices)) {
4187 for (unsigned K : seq<unsigned>(Scale)) {
// Elements at or past the split offset are shifted by the difference
// between the common VF and the (scaled) offset.
4188 Mask[Scale * I + K] =
4189 Scale * Idx + K +
4190 (Idx >= CombinedEntriesWithIndices.back().second
4191 ? CommonVF - CombinedEntriesWithIndices.back().second * Scale
4192 : 0);
4193 }
4194 }
4195 return Mask;
4196 }
4197
4198 /// Updates (reorders) SplitVectorize node according to the given mask \p
4199 /// Mask and order \p MaskOrder.
4200 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
4201 ArrayRef<int> MaskOrder);
4202
4203 /// \returns true if the scalars in VL are equal to this entry.
4204 bool isSame(ArrayRef<Value *> VL) const {
// Compares VL against Scalars either directly (sizes match, mask does not
// apply) or element-wise through Mask, where a poison mask element only
// matches an undef value in VL.
4205 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
4206 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
4207 return std::equal(VL.begin(), VL.end(), Scalars.begin());
4208 return VL.size() == Mask.size() &&
4209 std::equal(VL.begin(), VL.end(), Mask.begin(),
4210 [Scalars](Value *V, int Idx) {
4211 return (isa<UndefValue>(V) &&
4212 Idx == PoisonMaskElem) ||
4213 (Idx != PoisonMaskElem && V == Scalars[Idx]);
4214 });
4215 };
4216 if (!ReorderIndices.empty()) {
4217 // TODO: implement matching if the nodes are just reordered, still can
4218 // treat the vector as the same if the list of scalars matches VL
4219 // directly, without reordering.
4220 SmallVector<int> Mask;
4221 inversePermutation(ReorderIndices, Mask);
4222 if (VL.size() == Scalars.size())
4223 return IsSame(Scalars, Mask);
4224 if (VL.size() == ReuseShuffleIndices.size()) {
4225 ::addMask(Mask, ReuseShuffleIndices);
4226 return IsSame(Scalars, Mask);
4227 }
4228 return false;
4229 }
4230 return IsSame(Scalars, ReuseShuffleIndices);
4231 }
4232
4233 /// \returns true if current entry has same operands as \p TE.
/// Operands are matched irrespective of their order; each operand of this
/// entry can be matched at most once.
4234 bool hasEqualOperands(const TreeEntry &TE) const {
4235 if (TE.getNumOperands() != getNumOperands())
4236 return false;
4237 SmallBitVector Used(getNumOperands());
4238 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
4239 unsigned PrevCount = Used.count();
4240 for (unsigned K = 0; K < E; ++K) {
4241 if (Used.test(K))
4242 continue;
4243 if (getOperand(K) == TE.getOperand(I)) {
4244 Used.set(K);
4245 break;
4246 }
4247 }
4248 // Check if we actually found the matching operand.
4249 if (PrevCount == Used.count())
4250 return false;
4251 }
4252 return true;
4253 }
4254
4255 /// \return Final vectorization factor for the node. Defined by the total
4256 /// number of vectorized scalars, including those, used several times in the
4257 /// entry and counted in the \a ReuseShuffleIndices, if any.
4258 unsigned getVectorFactor() const {
4259 if (!ReuseShuffleIndices.empty())
4260 return ReuseShuffleIndices.size();
4261 return Scalars.size();
4262 };
4263
4264 /// Checks if the current node is a gather node.
4265 bool isGather() const { return State == NeedToGather; }
4266
4267 /// A vector of scalars.
4268 ValueList Scalars;
4269
4270 /// The Scalars are vectorized into this value. It is initialized to Null.
4271 WeakTrackingVH VectorizedValue = nullptr;
4272
4273 /// Do we need to gather this sequence or vectorize it
4274 /// (either with vector instruction or with scatter/gather
4275 /// intrinsics for store/load)?
4276 enum EntryState {
4277 Vectorize, ///< The node is regularly vectorized.
4278 ScatterVectorize, ///< Masked scatter/gather node.
4279 StridedVectorize, ///< Strided loads (and stores)
4280 CompressVectorize, ///< (Masked) load with compress.
4281 NeedToGather, ///< Gather/buildvector node.
4282 CombinedVectorize, ///< Vectorized node, combined with its user into more
4283 ///< complex node like select/cmp to minmax, mul/add to
4284 ///< fma, etc. Must be used for the following nodes in
4285 ///< the pattern, not the very first one.
4286 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4287 ///< independently and then combines back.
4288 };
4289 EntryState State;
4290
4291 /// List of combined opcodes supported by the vectorizer.
/// The values start right after Instruction::OtherOpsEnd.
4292 enum CombinedOpcode {
4293 NotCombinedOp = -1,
4294 MinMax = Instruction::OtherOpsEnd + 1,
4295 FMulAdd,
4296 ReducedBitcast,
4297 ReducedBitcastBSwap,
4298 ReducedBitcastLoads,
4299 ReducedBitcastBSwapLoads,
4300 ReducedCmpBitcast,
4301 };
4302 CombinedOpcode CombinedOp = NotCombinedOp;
4303
4304 /// Does this sequence require some shuffling?
4305 SmallVector<int, 4> ReuseShuffleIndices;
4306
4307 /// Does this entry require reordering?
4308 SmallVector<unsigned, 4> ReorderIndices;
4309
4310 /// Points back to the VectorizableTree.
4311 ///
4312 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4313 /// to be a pointer and needs to be able to initialize the child iterator.
4314 /// Thus we need a reference back to the container to translate the indices
4315 /// to entries.
4316 VecTreeTy &Container;
4317
4318 /// The TreeEntry index containing the user of this entry.
4319 EdgeInfo UserTreeIndex;
4320
4321 /// The index of this treeEntry in VectorizableTree.
4322 unsigned Idx = 0;
4323
4324 /// For gather/buildvector/alt opcode nodes, which are combined from
4325 /// other nodes as a series of insertvector instructions.
/// Each pair is (entry index, offset), as printed by dump().
4326 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4327
4328 private:
4329 /// The operands of each instruction in each lane Operands[op_index][lane].
4330 /// Note: This helps avoid the replication of the code that performs the
4331 /// reordering of operands during buildTreeRec() and vectorizeTree().
4332 SmallVector<ValueList, 2> Operands;
4333
4334 /// Copyable elements of the entry node.
4335 SmallPtrSet<const Value *, 4> CopyableElements;
4336
4337 /// MainOp and AltOp are recorded inside. S should be obtained from
4338 /// newTreeEntry.
4339 InstructionsState S = InstructionsState::invalid();
4340
4341 /// Interleaving factor for interleaved loads Vectorize nodes.
4342 unsigned InterleaveFactor = 0;
4343
4344 /// True if the node does not require scheduling.
4345 bool DoesNotNeedToSchedule = false;
4346
4347 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
/// The operand list is grown on demand; the slot must still be empty and
/// \p OpVL must not be longer than Scalars (both asserted).
4348 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4349 if (Operands.size() < OpIdx + 1)
4350 Operands.resize(OpIdx + 1);
4351 assert(Operands[OpIdx].empty() && "Already resized?");
4352 assert(OpVL.size() <= Scalars.size() &&
4353 "Number of operands is greater than the number of scalars.");
4354 Operands[OpIdx].resize(OpVL.size());
4355 copy(OpVL, Operands[OpIdx].begin());
4356 }
4357
4358 /// Maps values to their lanes in the node.
/// Used as a cache by findLaneForValue(), hence mutable.
4359 mutable SmallDenseMap<Value *, unsigned> ValueToLane;
4360
4361 public:
4362 /// Returns interleave factor for interleave nodes.
4363 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4364 /// Sets interleaving factor for the interleaving nodes.
4365 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4366
4367 /// Marks the node as one that does not require scheduling.
4368 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4369 /// Returns true if the node is marked as one that does not require
4370 /// scheduling.
4371 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4372
4373 /// Set this bundle's operands from \p Operands.
4374 void setOperands(ArrayRef<ValueList> Operands) {
4375 for (unsigned I : seq<unsigned>(Operands.size()))
4376 setOperand(I, Operands[I]);
4377 }
4378
4379 /// Reorders operands of the node to the given mask \p Mask.
4380 void reorderOperands(ArrayRef<int> Mask) {
4381 for (ValueList &Operand : Operands)
4382 reorderScalars(Operand, Mask);
4383 }
4384
4385 /// \returns the \p OpIdx operand of this TreeEntry.
4386 ValueList &getOperand(unsigned OpIdx) {
4387 assert(OpIdx < Operands.size() && "Off bounds");
4388 return Operands[OpIdx];
4389 }
4390
4391 /// \returns the \p OpIdx operand of this TreeEntry.
4392 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4393 assert(OpIdx < Operands.size() && "Off bounds");
4394 return Operands[OpIdx];
4395 }
4396
4397 /// \returns the number of operands.
4398 unsigned getNumOperands() const { return Operands.size(); }
4399
4400 /// \return the single \p OpIdx operand, i.e. the lane 0 value of that
/// operand.
4401 Value *getSingleOperand(unsigned OpIdx) const {
4402 assert(OpIdx < Operands.size() && "Off bounds");
4403 assert(!Operands[OpIdx].empty() && "No operand available");
4404 return Operands[OpIdx][0];
4405 }
4406
4407 /// Some of the instructions in the list have alternate opcodes.
4408 bool isAltShuffle() const { return S.isAltShuffle(); }
4409
/// Forwards to InstructionsState::getMatchingMainOpOrAltOp() for \p I.
4410 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4411 return S.getMatchingMainOpOrAltOp(I);
4412 }
4413
4414 /// Chooses the correct key for scheduling data. If \p Op is an instruction
4415 /// that matches the main or the alternate operation, the key is \p Op.
4416 /// Otherwise the key is the main operation of this entry.
4417 Value *isOneOf(Value *Op) const {
4418 auto *I = dyn_cast<Instruction>(Op);
4419 if (I && getMatchingMainOpOrAltOp(I))
4420 return Op;
4421 return S.getMainOp();
4422 }
4423
/// Records \p S as the operations state of this entry; \p S must be valid.
4424 void setOperations(const InstructionsState &S) {
4425 assert(S && "InstructionsState is invalid.");
4426 this->S = S;
4427 }
4428
/// \returns the main operation of the recorded instructions state.
4429 Instruction *getMainOp() const { return S.getMainOp(); }
4430
/// \returns the alternate operation of the recorded instructions state.
4431 Instruction *getAltOp() const { return S.getAltOp(); }
4432
4433 /// The main/alternate opcodes for the list of instructions.
4434 unsigned getOpcode() const { return S.getOpcode(); }
4435
4436 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4437
/// \returns true if a valid instructions state is recorded for this entry.
4438 bool hasState() const { return S.valid(); }
4439
4440 /// Add \p V to the list of copyable elements.
4441 void addCopyableElement(Value *V) {
4442 assert(S.isCopyableElement(V) && "Not a copyable element.");
4443 CopyableElements.insert(V);
4444 }
4445
4446 /// Returns true if \p V is a copyable element.
4447 bool isCopyableElement(Value *V) const {
4448 return CopyableElements.contains(V);
4449 }
4450
4451 /// Checks if the value \p V is a transformed instruction, compatible either
4452 /// with main or alternate ops.
/// Copyable elements are never reported as expanded.
4453 bool isExpandedBinOp(Value *V) const {
4454 assert(hasState() && "InstructionsState is invalid.");
4455 if (isCopyableElement(V))
4456 return false;
4457 return S.isExpandedBinOp(V);
4458 }
4459
4460 /// Checks if the operand at index \p Idx of instruction \p I is an expanded
4461 /// operand.
4462 bool isExpandedOperand(Instruction *I, unsigned Idx) const {
4463 assert(hasState() && "InstructionsState is invalid.");
4464 if (isCopyableElement(I))
4465 return false;
4466 if (!isExpandedBinOp(I))
4467 return false;
4468 return S.isExpandedOperand(I, Idx);
4469 }
4470
4471 /// Returns true if any scalar in the list is a copyable element.
4472 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4473
4474 /// Returns the state of the operations.
4475 const InstructionsState &getOperations() const { return S; }
4476
4477 /// When ReuseShuffleIndices is empty it just returns position of \p
4478 /// V within vector of Scalars (remapped through ReorderIndices, if any).
/// Otherwise, try to remap on its reuse index. The result is cached in
/// ValueToLane.
4479 unsigned findLaneForValue(Value *V) const {
// The vector factor is stored as a provisional "not found" lane; a cache
// hit returns whatever was stored by an earlier call.
4480 auto Res = ValueToLane.try_emplace(V, getVectorFactor());
4481 if (!Res.second)
4482 return Res.first->second;
4483 unsigned &FoundLane = Res.first->getSecond();
// Visit every occurrence of V in Scalars until one maps to a lane.
4484 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4485 std::advance(It, 1)) {
4486 if (*It != V)
4487 continue;
4488 FoundLane = std::distance(Scalars.begin(), It);
4489 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4490 if (!ReorderIndices.empty())
4491 FoundLane = ReorderIndices[FoundLane];
4492 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4493 if (ReuseShuffleIndices.empty())
4494 break;
4495 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4496 RIt != ReuseShuffleIndices.end()) {
4497 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4498 break;
4499 }
4500 }
4501 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4502 return FoundLane;
4503 }
4504
4505 /// Build a shuffle mask for graph entry which represents a merge of main
4506 /// and alternate operations.
4507 void
4508 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4509 SmallVectorImpl<int> &Mask,
4510 SmallVectorImpl<Value *> *OpScalars = nullptr,
4511 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4512
4513 /// Return true if this is a non-power-of-2 node.
4514 bool isNonPowOf2Vec() const {
4515 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4516 return IsNonPowerOf2;
4517 }
4518
/// \returns the scalar at position \p Idx after applying the inverse of
/// ReorderIndices, or Scalars[Idx] if no reordering is recorded.
4519 Value *getOrdered(unsigned Idx) const {
4520 if (ReorderIndices.empty())
4521 return Scalars[Idx];
4522 SmallVector<int> Mask;
4523 inversePermutation(ReorderIndices, Mask);
4524 return Scalars[Mask[Idx]];
4525 }
4526
4527#ifndef NDEBUG
4528 /// Debug printer.
/// Prints the index, operands, scalars, state, main/alt ops, vectorized
/// value, masks, user edge and combined entries of this node to dbgs().
4529 LLVM_DUMP_METHOD void dump() const {
4530 dbgs() << Idx << ".\n";
4531 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4532 dbgs() << "Operand " << OpI << ":\n";
4533 for (const Value *V : Operands[OpI])
4534 dbgs().indent(2) << *V << "\n";
4535 }
4536 dbgs() << "Scalars: \n";
4537 for (Value *V : Scalars) {
4538 dbgs().indent(2) << *V
4539 << ((S && S.isExpandedBinOp(V)) ? " [[Expanded]]\n"
4540 : "\n");
4541 }
4542 dbgs() << "State: ";
4543 if (S && hasCopyableElements())
4544 dbgs() << "[[Copyable]] ";
4545 switch (State) {
4546 case Vectorize:
4547 if (InterleaveFactor > 0) {
4548 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4549 << "\n";
4550 } else {
4551 dbgs() << "Vectorize\n";
4552 }
4553 break;
4554 case ScatterVectorize:
4555 dbgs() << "ScatterVectorize\n";
4556 break;
4557 case StridedVectorize:
4558 dbgs() << "StridedVectorize\n";
4559 break;
4560 case CompressVectorize:
4561 dbgs() << "CompressVectorize\n";
4562 break;
4563 case NeedToGather:
4564 dbgs() << "NeedToGather\n";
4565 break;
4566 case CombinedVectorize:
4567 dbgs() << "CombinedVectorize\n";
4568 break;
4569 case SplitVectorize:
4570 dbgs() << "SplitVectorize\n";
4571 break;
4572 }
4573 if (S) {
4574 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4575 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4576 } else {
4577 dbgs() << "MainOp: NULL\n";
4578 dbgs() << "AltOp: NULL\n";
4579 }
4580 dbgs() << "VectorizedValue: ";
4581 if (VectorizedValue)
4582 dbgs() << *VectorizedValue << "\n";
4583 else
4584 dbgs() << "NULL\n";
4585 dbgs() << "ReuseShuffleIndices: ";
4586 if (ReuseShuffleIndices.empty())
4587 dbgs() << "Empty";
4588 else
4589 for (int ReuseIdx : ReuseShuffleIndices)
4590 dbgs() << ReuseIdx << ", ";
4591 dbgs() << "\n";
4592 dbgs() << "ReorderIndices: ";
4593 for (unsigned ReorderIdx : ReorderIndices)
4594 dbgs() << ReorderIdx << ", ";
4595 dbgs() << "\n";
4596 dbgs() << "UserTreeIndex: ";
4597 if (UserTreeIndex)
4598 dbgs() << UserTreeIndex;
4599 else
4600 dbgs() << "<invalid>";
4601 dbgs() << "\n";
4602 if (!CombinedEntriesWithIndices.empty()) {
4603 dbgs() << "Combined entries: ";
4604 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4605 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4606 });
4607 dbgs() << "\n";
4608 }
4609 }
4610#endif
4611 };
4612
4613#ifndef NDEBUG
4614 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4615 InstructionCost VecCost, InstructionCost ScalarCost,
4616 StringRef Banner) const {
4617 dbgs() << "SLP: " << Banner << ":\n";
4618 E->dump();
4619 dbgs() << "SLP: Costs:\n";
4620 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4621 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4622 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4623 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4624 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4625 }
4626#endif
4627
4628 /// Create a new gather TreeEntry
4629 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4630 const InstructionsState &S,
4631 const EdgeInfo &UserTreeIdx,
4632 ArrayRef<int> ReuseShuffleIndices = {}) {
4633 auto Invalid = ScheduleBundle::invalid();
4634 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4635 }
4636
4637 /// Create a new VectorizableTree entry.
4638 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4639 const InstructionsState &S,
4640 const EdgeInfo &UserTreeIdx,
4641 ArrayRef<int> ReuseShuffleIndices = {},
4642 ArrayRef<unsigned> ReorderIndices = {},
4643 unsigned InterleaveFactor = 0) {
4644 TreeEntry::EntryState EntryState =
4645 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4646 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4647 ReuseShuffleIndices, ReorderIndices);
4648 if (E && InterleaveFactor > 0)
4649 E->setInterleave(InterleaveFactor);
4650 return E;
4651 }
4652
/// Creates a new VectorizableTree entry for \p VL in state \p EntryState and
/// registers it in the bookkeeping maps of the graph.
///
/// \p Bundle must be valid exactly when \p EntryState is neither NeedToGather
/// nor SplitVectorize (asserted). When \p ReorderIndices is non-empty the
/// scalars are stored in reordered form.
/// \returns the new entry, or nullptr for a user-less gather of loads while
/// GatheredLoadsEntriesFirst is set.
4653 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4654 TreeEntry::EntryState EntryState,
4655 ScheduleBundle &Bundle, const InstructionsState &S,
4656 const EdgeInfo &UserTreeIdx,
4657 ArrayRef<int> ReuseShuffleIndices = {},
4658 ArrayRef<unsigned> ReorderIndices = {}) {
4659 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4660 EntryState == TreeEntry::SplitVectorize)) ||
4661 (Bundle && EntryState != TreeEntry::NeedToGather &&
4662 EntryState != TreeEntry::SplitVectorize)) &&
4663 "Need to vectorize gather entry?");
4664 // Gathered loads still gathered? Do not create entry, use the original one.
4665 if (GatheredLoadsEntriesFirst.has_value() &&
4666 EntryState == TreeEntry::NeedToGather && S &&
4667 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4668 !UserTreeIdx.UserTE)
4669 return nullptr;
// Append the node; its index is its position in VectorizableTree.
4670 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4671 TreeEntry *Last = VectorizableTree.back().get();
4672 Last->Idx = VectorizableTree.size() - 1;
4673 Last->State = EntryState;
// Remember which node feeds operand EdgeIdx of the user node (first one
// registered wins, since try_emplace does not overwrite).
4674 if (UserTreeIdx.UserTE)
4675 OperandsToTreeEntry.try_emplace(
4676 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4677 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4678 ReuseShuffleIndices.end());
4679 if (ReorderIndices.empty()) {
4680 Last->Scalars.assign(VL.begin(), VL.end());
4681 if (S)
4682 Last->setOperations(S);
4683 } else {
4684 // Reorder scalars and build final mask.
4685 Last->Scalars.assign(VL.size(), nullptr);
// Out-of-range indices are filled with undef of the first value's type.
4686 transform(ReorderIndices, Last->Scalars.begin(),
4687 [VL](unsigned Idx) -> Value * {
4688 if (Idx >= VL.size())
4689 return UndefValue::get(VL.front()->getType());
4690 return VL[Idx];
4691 });
// Note: this local S shadows the parameter only inside this else-branch;
// it is recomputed from the reordered scalars.
4692 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4693 if (S)
4694 Last->setOperations(S);
4695 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4696 }
4697 if (EntryState == TreeEntry::SplitVectorize) {
4698 assert(S && "Split nodes must have operations.");
4699 Last->setOperations(S);
// Register every instruction scalar of VL in ScalarsInSplitNodes, once per
// distinct value.
4700 SmallPtrSet<Value *, 4> Processed;
4701 for (Value *V : VL) {
4702 auto *I = dyn_cast<Instruction>(V);
4703 if (!I)
4704 continue;
4705 auto It = ScalarsInSplitNodes.find(V);
4706 if (It == ScalarsInSplitNodes.end()) {
4707 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4708 (void)Processed.insert(V);
4709 } else if (Processed.insert(V).second) {
4710 assert(!is_contained(It->getSecond(), Last) &&
4711 "Value already associated with the node.");
4712 It->getSecond().push_back(Last);
4713 }
4714 }
4715 } else if (!Last->isGather()) {
// Vectorized (non-gather, non-split) node: decide whether it needs
// scheduling at all.
4716 if (isa<PHINode>(S.getMainOp()) ||
4717 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4718 (!S.areInstructionsWithCopyableElements() &&
4719 doesNotNeedToSchedule(VL)) ||
4720 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4721 Last->setDoesNotNeedToSchedule();
// Register the scalars in ScalarToTreeEntries, once per distinct value;
// poison values are skipped and copyable elements are only recorded on
// the node itself.
4722 SmallPtrSet<Value *, 4> Processed;
4723 for (Value *V : VL) {
4724 if (isa<PoisonValue>(V))
4725 continue;
4726 if (S.isCopyableElement(V)) {
4727 Last->addCopyableElement(V);
4728 continue;
4729 }
4730 auto It = ScalarToTreeEntries.find(V);
4731 if (It == ScalarToTreeEntries.end()) {
4732 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4733 (void)Processed.insert(V);
4734 } else if (Processed.insert(V).second) {
4735 assert(!is_contained(It->getSecond(), Last) &&
4736 "Value already associated with the node.");
4737 It->getSecond().push_back(Last);
4738 }
4739 }
4740 // Update the scheduler bundle to point to this TreeEntry.
4741 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4742 "Bundle and VL out of sync");
4743 if (!Bundle.getBundle().empty()) {
4744#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
// Sanity check: the bundle must have exactly one member per distinct
// schedulable value of VL.
4745 auto *BundleMember = Bundle.getBundle().begin();
4746 SmallPtrSet<Value *, 4> Processed;
4747 for (Value *V : VL) {
4748 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4749 continue;
4750 ++BundleMember;
4751 }
4752 assert(BundleMember == Bundle.getBundle().end() &&
4753 "Bundle and VL out of sync");
4754#endif
4755 Bundle.setTreeEntry(Last);
4756 }
4757 } else {
4758 // Build a map for gathered scalars to the nodes where they are used.
4759 bool AllConstsOrCasts = true;
4760 for (Value *V : VL) {
4761 if (S && S.areInstructionsWithCopyableElements() &&
4762 S.isCopyableElement(V))
4763 Last->addCopyableElement(V);
4764 if (!isConstant(V)) {
4765 auto *I = dyn_cast<CastInst>(V);
4766 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
// Skip the registration only when the user is a gather node reached
// through the UINT_MAX edge.
4767 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4768 !UserTreeIdx.UserTE->isGather())
4769 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4770 }
4771 }
// Every element is a constant or an integer-typed cast.
4772 if (AllConstsOrCasts)
4773 CastMaxMinBWSizes =
4774 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4775 MustGather.insert_range(VL);
4776 }
4777
4778 if (UserTreeIdx.UserTE)
4779 Last->UserTreeIndex = UserTreeIdx;
4780 return Last;
4781 }
4782
4783 /// -- Vectorization State --
4784 /// Holds all of the tree entries. newTreeEntry() appends to this container and
/// stores the position of each new entry in TreeEntry::Idx.
4785 TreeEntry::VecTreeTy VectorizableTree;
4786
4787#ifndef NDEBUG
4788 /// Debug printer.
4789 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4790 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4791 VectorizableTree[Id]->dump();
4792 if (TransformedToGatherNodes.contains(VectorizableTree[Id].get()))
4793 dbgs() << "[[TRANSFORMED TO GATHER]]";
4794 else if (DeletedNodes.contains(VectorizableTree[Id].get()))
4795 dbgs() << "[[DELETED NODE]]";
4796 dbgs() << "\n";
4797 }
4798 }
4799#endif
4800
4801 /// Get list of vector entries, associated with the value \p V.
4802 ArrayRef<TreeEntry *> getTreeEntries(const Value *V) const {
4803 assert(V && "V cannot be nullptr.");
4804 auto It = ScalarToTreeEntries.find(V);
4805 if (It == ScalarToTreeEntries.end())
4806 return {};
4807 return It->getSecond();
4808 }
4809
4810 /// Get list of split vector entries, associated with the value \p V.
4811 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4812 assert(V && "V cannot be nullptr.");
4813 auto It = ScalarsInSplitNodes.find(V);
4814 if (It == ScalarsInSplitNodes.end())
4815 return {};
4816 return It->getSecond();
4817 }
4818
4819 /// Returns first vector node for value \p V, matching values \p VL.
4820 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4821 bool SameVF = false) const {
4822 assert(V && "V cannot be nullptr.");
4823 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4824 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4825 return TE;
4826 return nullptr;
4827 }
4828
4829 /// Contains all the outputs of legality analysis for a list of values to
4830 /// vectorize.
4831 class ScalarsVectorizationLegality {
4832 InstructionsState S;
4833 bool IsLegal;
4834 bool TryToFindDuplicates;
4835 bool TrySplitVectorize;
4836
4837 public:
4838 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4839 bool TryToFindDuplicates = true,
4840 bool TrySplitVectorize = false)
4841 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4842 TrySplitVectorize(TrySplitVectorize) {
4843 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4844 "Inconsistent state");
4845 }
4846 const InstructionsState &getInstructionsState() const { return S; };
4847 bool isLegal() const { return IsLegal; }
4848 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4849 bool trySplitVectorize() const { return TrySplitVectorize; }
4850 };
4851
4852 /// Checks if the specified list of the instructions/values can be vectorized
4853 /// in general.
4854 ScalarsVectorizationLegality
4855 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4856 const EdgeInfo &UserTreeIdx) const;
4857
4858 /// Checks if the specified list of the instructions/values can be vectorized
4859 /// and fills required data before actual scheduling of the instructions.
4860 TreeEntry::EntryState getScalarsVectorizationState(
4861 const InstructionsState &S, ArrayRef<Value *> VL,
4862 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4863 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4864
4865 /// Maps a specific scalar to its tree entry(ies).
4866 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4867
4868 /// List of deleted non-profitable nodes.
4869 SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
4870
4871 /// List of nodes, transformed to gathered, with their conservative
4872 /// gather/buildvector cost estimation.
4873 SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
4874
4875 /// Maps the operand index and entry to the corresponding tree entry.
4876 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4877 OperandsToTreeEntry;
4878
4879 /// Scalars, used in split vectorize nodes.
4880 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4881
4882 /// Maps a value to the proposed vectorizable size.
4883 SmallDenseMap<Value *, unsigned> InstrElementSize;
4884
4885 /// A list of scalars that we found that we need to keep as scalars.
4886 ValueSet MustGather;
4887
4888 /// A set of first non-schedulable values.
4889 ValueSet NonScheduledFirst;
4890
4891 /// A map between the vectorized entries and the last instructions in the
4892 /// bundles. The bundles are built in use order, not in the def order of the
4893 /// instructions. So, we cannot rely directly on the last instruction in the
4894 /// bundle being the last instruction in the program order during
4895 /// vectorization process since the basic blocks are affected, need to
4896 /// pre-gather them before.
4897 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4898
4899 /// Keeps the mapping between the last instructions and their insertion
4900 /// points, which is an instruction-after-the-last-instruction.
4901 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4902
4903 /// List of gather nodes, depending on other gather/vector nodes, which should
4904 /// be emitted after the vector instruction emission process to correctly
4905 /// handle order of the vector instructions and shuffles.
4906 SetVector<const TreeEntry *> PostponedGathers;
4907
/// Maps gathered (non-constant) scalars to the gather nodes where they are
/// used; populated by newTreeEntry().
4908 using ValueToGatherNodesMap =
4909 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4910 ValueToGatherNodesMap ValueToGatherNodes;
4911
/// Strided pointer information recorded per tree entry.
/// NOTE(review): presumably only filled for strided nodes - confirm against
/// the users of this map.
4912 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
4913
4914 /// A list of the load entries (node indices), which can be vectorized using
4915 /// strided or masked gather approach, but attempted to be represented as
4916 /// contiguous loads.
4917 SetVector<unsigned> LoadEntriesToVectorize;
4918
4919 /// True if graph nodes transforming mode is on.
4920 bool IsGraphTransformMode = false;
4921
4922 /// The index of the first gathered load entry in the VectorizableTree.
4923 std::optional<unsigned> GatheredLoadsEntriesFirst;
4924
4925 /// Maps compress entries to their mask data for the final codegen.
4926 SmallDenseMap<const TreeEntry *,
4927 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4928 CompressEntryToData;
4929
4930 /// The loop nest, used to check if only a single loop nest is vectorized, not
4931 /// multiple, to avoid side-effects from the loop-aware cost model.
4932 SmallVector<const Loop *> CurrentLoopNest;
4933
4934 /// Per-depth SCEVs trip counts at every loop level where the tree builder has
4935 /// joined diverging sibling loops.
4936 SmallVector<const SCEV *> MergedLoopBTCs;
4937
4938 /// Maps the loops to their loop nests.
4939 SmallDenseMap<const Loop *, SmallVector<const Loop *>> LoopToLoopNest;
4940
4941 /// Per-loop cache of nest scale factors: the product of trip counts of the
4942 /// loop and all of its ancestors. Shared by getLoopNestScale() and (via it)
4943 /// by getScaleToLoopIterations() and getGatherNodeEffectiveScale().
4944 SmallDenseMap<const Loop *, uint64_t> LoopNestScaleCache;
4945
4946 /// This POD struct describes one external user in the vectorized tree.
4947 struct ExternalUser {
4948 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4949 : Scalar(S), User(U), E(E), Lane(L) {}
4950
4951 /// Which scalar in our function.
4952 Value *Scalar = nullptr;
4953
4954 /// Which user that uses the scalar.
4955 llvm::User *User = nullptr;
4956
4957 /// Vector node, the value is part of.
4958 const TreeEntry &E;
4959
4960 /// Which lane does the scalar belong to.
4961 unsigned Lane;
4962 };
4963 using UserList = SmallVector<ExternalUser, 16>;
4964
4965 /// Checks if two instructions may access the same memory.
4966 ///
4967 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4968 /// is invariant in the calling loop.
4969 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4970 Instruction *Inst2) {
4971 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4972 // First check if the result is already in the cache.
4973 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4974 auto Res = AliasCache.try_emplace(Key);
4975 if (!Res.second)
4976 return Res.first->second;
4977 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4978 // Store the result in the cache.
4979 Res.first->getSecond() = Aliased;
4980 return Aliased;
4981 }
4982
// Key of AliasCache: the ordered pair (Inst1, Inst2) passed to isAliased().
4983 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4984
4985 /// Cache for alias results.
4986 /// TODO: consider moving this to the AliasAnalysis itself.
4987 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4988
4989 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4990 // globally through SLP because we don't perform any action which
4991 // invalidates capture results.
4992 BatchAAResults BatchAA;
4993
4994 /// Temporary store for deleted instructions. Instructions will be deleted
4995 /// eventually when the BoUpSLP is destructed. The deferral is required to
4996 /// ensure that there are no incorrect collisions in the AliasCache, which
4997 /// can happen if a new instruction is allocated at the same address as a
4998 /// previously deleted instruction.
4999 DenseSet<Instruction *> DeletedInstructions;
5000
5001 /// Set of the instructions that have already been analyzed for reductions.
5002 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
5003
5004 /// Set of hashes of the lists of reduction values that have already been
// analyzed.
5005 DenseSet<size_t> AnalyzedReductionVals;
5006
5007 /// Values that have already been analyzed for minimal bitwidth and found to
5008 /// be non-profitable.
5009 DenseSet<Value *> AnalyzedMinBWVals;
5010
5011 /// A list of values that need to be extracted out of the tree.
5012 /// This list holds pairs of (Internal Scalar : External User). External User
5013 /// can be nullptr, it means that this Internal Scalar will be used later,
5014 /// after vectorization.
5015 UserList ExternalUses;
5016
5017 /// A list of GEPs which can be replaced by scalar GEPs instead of
5018 /// extractelement instructions.
5019 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
5020
5021 /// A list of scalars to be extracted without a specific user because of too
5022 /// many uses.
5023 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
5024
5025 /// Values used only by @llvm.assume calls.
5026 SmallPtrSet<const Value *, 32> EphValues;
5027
5028 /// Holds all of the instructions that we gathered, shuffle instructions and
5029 /// extractelements.
5030 SetVector<Instruction *> GatherShuffleExtractSeq;
5031
5032 /// A list of blocks that we are going to CSE.
5033 DenseSet<BasicBlock *> CSEBlocks;
5034
5035 /// Hashes of the lists of loads that are known to be non-vectorizable.
5036 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
5037
5038 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
5039 /// or ScheduleBundle. ScheduleData used to gather dependecies for a single
5040 /// instructions, while ScheduleBundle represents a batch of instructions,
5041 /// going to be groupped together. ScheduleCopyableData models extra user for
5042 /// "copyable" instructions.
5043 class ScheduleEntity {
5044 friend class ScheduleBundle;
5045 friend class ScheduleData;
5046 friend class ScheduleCopyableData;
5047
5048 protected:
5049 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
5050 Kind getKind() const { return K; }
5051 ScheduleEntity(Kind K) : K(K) {}
5052
5053 private:
5054 /// Used for getting a "good" final ordering of instructions.
5055 int SchedulingPriority = 0;
5056 /// True if this instruction (or bundle) is scheduled (or considered as
5057 /// scheduled in the dry-run).
5058 bool IsScheduled = false;
5059 /// The kind of the ScheduleEntity.
5060 const Kind K = Kind::ScheduleData;
5061
5062 public:
5063 ScheduleEntity() = delete;
5064 /// Gets/sets the scheduling priority.
5065 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
5066 int getSchedulingPriority() const { return SchedulingPriority; }
5067 bool isReady() const {
5068 if (const auto *SD = dyn_cast<ScheduleData>(this))
5069 return SD->isReady();
5070 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
5071 return CD->isReady();
5072 return cast<ScheduleBundle>(this)->isReady();
5073 }
5074 /// Returns true if the dependency information has been calculated.
5075 /// Note that depenendency validity can vary between instructions within
5076 /// a single bundle.
5077 bool hasValidDependencies() const {
5078 if (const auto *SD = dyn_cast<ScheduleData>(this))
5079 return SD->hasValidDependencies();
5080 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
5081 return CD->hasValidDependencies();
5082 return cast<ScheduleBundle>(this)->hasValidDependencies();
5083 }
5084 /// Gets the number of unscheduled dependencies.
5085 int getUnscheduledDeps() const {
5086 if (const auto *SD = dyn_cast<ScheduleData>(this))
5087 return SD->getUnscheduledDeps();
5088 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
5089 return CD->getUnscheduledDeps();
5090 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
5091 }
5092 /// Increments the number of unscheduled dependencies.
5093 int incrementUnscheduledDeps(int Incr) {
5094 if (auto *SD = dyn_cast<ScheduleData>(this))
5095 return SD->incrementUnscheduledDeps(Incr);
5096 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
5097 }
5098 /// Gets the number of dependencies.
5099 int getDependencies() const {
5100 if (const auto *SD = dyn_cast<ScheduleData>(this))
5101 return SD->getDependencies();
5102 return cast<ScheduleCopyableData>(this)->getDependencies();
5103 }
5104 /// Gets the instruction.
5105 Instruction *getInst() const {
5106 if (const auto *SD = dyn_cast<ScheduleData>(this))
5107 return SD->getInst();
5108 return cast<ScheduleCopyableData>(this)->getInst();
5109 }
5110
5111 /// Gets/sets if the bundle is scheduled.
5112 bool isScheduled() const { return IsScheduled; }
5113 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
5114
5115 static bool classof(const ScheduleEntity *) { return true; }
5116
5117#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5118 void dump(raw_ostream &OS) const {
5119 if (const auto *SD = dyn_cast<ScheduleData>(this))
5120 return SD->dump(OS);
5121 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
5122 return CD->dump(OS);
5123 return cast<ScheduleBundle>(this)->dump(OS);
5124 }
5125
5126 LLVM_DUMP_METHOD void dump() const {
5127 dump(dbgs());
5128 dbgs() << '\n';
5129 }
5130#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5131 };
5132
5133#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5135 const BoUpSLP::ScheduleEntity &SE) {
5136 SE.dump(OS);
5137 return OS;
5138 }
5139#endif
5140
5141 /// Contains all scheduling relevant data for an instruction.
5142 /// A ScheduleData either represents a single instruction or a member of an
5143 /// instruction bundle (= a group of instructions which is combined into a
5144 /// vector instruction).
5145 class ScheduleData final : public ScheduleEntity {
5146 public:
5147 // The initial value for the dependency counters. It means that the
5148 // dependencies are not calculated yet.
5149 enum { InvalidDeps = -1 };
5150
5151 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
5152 static bool classof(const ScheduleEntity *Entity) {
5153 return Entity->getKind() == Kind::ScheduleData;
5154 }
5155
5156 void init(int BlockSchedulingRegionID, Instruction *I) {
5157 NextLoadStore = nullptr;
5158 IsScheduled = false;
5159 SchedulingRegionID = BlockSchedulingRegionID;
5160 clearDependencies();
5161 Inst = I;
5162 }
5163
5164 /// Verify basic self consistency properties
5165 void verify() {
5166 if (hasValidDependencies()) {
5167 assert(UnscheduledDeps <= Dependencies && "invariant");
5168 } else {
5169 assert(UnscheduledDeps == Dependencies && "invariant");
5170 }
5171
5172 if (IsScheduled) {
5173 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5174 "unexpected scheduled state");
5175 }
5176 }
5177
5178 /// Returns true if the dependency information has been calculated.
5179 /// Note that depenendency validity can vary between instructions within
5180 /// a single bundle.
5181 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
5182
5183 /// Returns true if it is ready for scheduling, i.e. it has no more
5184 /// unscheduled depending instructions/bundles.
5185 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5186
5187 /// Modifies the number of unscheduled dependencies for this instruction,
5188 /// and returns the number of remaining dependencies for the containing
5189 /// bundle.
5190 int incrementUnscheduledDeps(int Incr) {
5191 assert(hasValidDependencies() &&
5192 "increment of unscheduled deps would be meaningless");
5193 UnscheduledDeps += Incr;
5194 assert(UnscheduledDeps >= 0 &&
5195 "Expected valid number of unscheduled deps");
5196 return UnscheduledDeps;
5197 }
5198
5199 /// Sets the number of unscheduled dependencies to the number of
5200 /// dependencies.
5201 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5202
5203 /// Clears all dependency information.
5204 void clearDependencies() {
5205 clearDirectDependencies();
5206 MemoryDependencies.clear();
5207 ControlDependencies.clear();
5208 }
5209
5210 /// Clears all direct dependencies only, except for control and memory
5211 /// dependencies.
5212 /// Required for copyable elements to correctly handle control/memory deps
5213 /// and avoid extra reclaculation of such deps.
5214 void clearDirectDependencies() {
5215 Dependencies = InvalidDeps;
5216 resetUnscheduledDeps();
5217 IsScheduled = false;
5218 }
5219
5220 /// Gets the number of unscheduled dependencies.
5221 int getUnscheduledDeps() const { return UnscheduledDeps; }
5222 /// Gets the number of dependencies.
5223 int getDependencies() const { return Dependencies; }
5224 /// Initializes the number of dependencies.
5225 void initDependencies() { Dependencies = 0; }
5226 /// Increments the number of dependencies.
5227 void incDependencies() { Dependencies++; }
5228
5229 /// Gets scheduling region ID.
5230 int getSchedulingRegionID() const { return SchedulingRegionID; }
5231
5232 /// Gets the instruction.
5233 Instruction *getInst() const { return Inst; }
5234
5235 /// Gets the list of memory dependencies.
5236 ArrayRef<ScheduleData *> getMemoryDependencies() const {
5237 return MemoryDependencies;
5238 }
5239 /// Adds a memory dependency.
5240 void addMemoryDependency(ScheduleData *Dep) {
5241 MemoryDependencies.push_back(Dep);
5242 }
5243 /// Gets the list of control dependencies.
5244 ArrayRef<ScheduleData *> getControlDependencies() const {
5245 return ControlDependencies;
5246 }
5247 /// Adds a control dependency.
5248 void addControlDependency(ScheduleData *Dep) {
5249 ControlDependencies.push_back(Dep);
5250 }
5251 /// Gets/sets the next load/store instruction in the block.
5252 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
5253 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
5254
5255 void dump(raw_ostream &OS) const { OS << *Inst; }
5256
5257 LLVM_DUMP_METHOD void dump() const {
5258 dump(dbgs());
5259 dbgs() << '\n';
5260 }
5261
5262 private:
5263 Instruction *Inst = nullptr;
5264
5265 /// Single linked list of all memory instructions (e.g. load, store, call)
5266 /// in the block - until the end of the scheduling region.
5267 ScheduleData *NextLoadStore = nullptr;
5268
5269 /// The dependent memory instructions.
5270 /// This list is derived on demand in calculateDependencies().
5271 SmallVector<ScheduleData *> MemoryDependencies;
5272
5273 /// List of instructions which this instruction could be control dependent
5274 /// on. Allowing such nodes to be scheduled below this one could introduce
5275 /// a runtime fault which didn't exist in the original program.
5276 /// ex: this is a load or udiv following a readonly call which inf loops
5277 SmallVector<ScheduleData *> ControlDependencies;
5278
5279 /// This ScheduleData is in the current scheduling region if this matches
5280 /// the current SchedulingRegionID of BlockScheduling.
5281 int SchedulingRegionID = 0;
5282
5283 /// The number of dependencies. Constitutes of the number of users of the
5284 /// instruction plus the number of dependent memory instructions (if any).
5285 /// This value is calculated on demand.
5286 /// If InvalidDeps, the number of dependencies is not calculated yet.
5287 int Dependencies = InvalidDeps;
5288
5289 /// The number of dependencies minus the number of dependencies of scheduled
5290 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5291 /// for scheduling.
5292 /// Note that this is negative as long as Dependencies is not calculated.
5293 int UnscheduledDeps = InvalidDeps;
5294 };
5295
5296#ifndef NDEBUG
5298 const BoUpSLP::ScheduleData &SD) {
5299 SD.dump(OS);
5300 return OS;
5301 }
5302#endif
5303
5304 class ScheduleBundle final : public ScheduleEntity {
5305 /// The schedule data for the instructions in the bundle.
5307 /// True if this bundle is valid.
5308 bool IsValid = true;
5309 /// The TreeEntry that this instruction corresponds to.
5310 TreeEntry *TE = nullptr;
5311 ScheduleBundle(bool IsValid)
5312 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5313
5314 public:
5315 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
5316 static bool classof(const ScheduleEntity *Entity) {
5317 return Entity->getKind() == Kind::ScheduleBundle;
5318 }
5319
5320 /// Verify basic self consistency properties
5321 void verify() const {
5322 for (const ScheduleEntity *SD : Bundle) {
5323 if (SD->hasValidDependencies()) {
5324 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5325 "invariant");
5326 } else {
5327 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5328 "invariant");
5329 }
5330
5331 if (isScheduled()) {
5332 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5333 "unexpected scheduled state");
5334 }
5335 }
5336 }
5337
5338 /// Returns the number of unscheduled dependencies in the bundle.
5339 int unscheduledDepsInBundle() const {
5340 assert(*this && "bundle must not be empty");
5341 int Sum = 0;
5342 for (const ScheduleEntity *BundleMember : Bundle) {
5343 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5344 return ScheduleData::InvalidDeps;
5345 Sum += BundleMember->getUnscheduledDeps();
5346 }
5347 return Sum;
5348 }
5349
5350 /// Returns true if the dependency information has been calculated.
5351 /// Note that depenendency validity can vary between instructions within
5352 /// a single bundle.
5353 bool hasValidDependencies() const {
5354 return all_of(Bundle, [](const ScheduleEntity *SD) {
5355 return SD->hasValidDependencies();
5356 });
5357 }
5358
5359 /// Returns true if it is ready for scheduling, i.e. it has no more
5360 /// unscheduled depending instructions/bundles.
5361 bool isReady() const {
5362 assert(*this && "bundle must not be empty");
5363 return unscheduledDepsInBundle() == 0 && !isScheduled();
5364 }
5365
5366 /// Returns the bundle of scheduling data, associated with the current
5367 /// instruction.
5368 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5369 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5370 /// Adds an instruction to the bundle.
5371 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5372
5373 /// Gets/sets the associated tree entry.
5374 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5375 TreeEntry *getTreeEntry() const { return TE; }
5376
5377 static ScheduleBundle invalid() { return {false}; }
5378
5379 operator bool() const { return IsValid; }
5380
5381#ifndef NDEBUG
5382 void dump(raw_ostream &OS) const {
5383 if (!*this) {
5384 OS << "[]";
5385 return;
5386 }
5387 OS << '[';
5388 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5390 OS << "<Copyable>";
5391 OS << *SD->getInst();
5392 });
5393 OS << ']';
5394 }
5395
5396 LLVM_DUMP_METHOD void dump() const {
5397 dump(dbgs());
5398 dbgs() << '\n';
5399 }
5400#endif // NDEBUG
5401 };
5402
5403#ifndef NDEBUG
5405 const BoUpSLP::ScheduleBundle &Bundle) {
5406 Bundle.dump(OS);
5407 return OS;
5408 }
5409#endif
5410
5411 /// Contains all scheduling relevant data for the copyable instruction.
5412 /// It models the virtual instructions, supposed to replace the original
5413 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5414 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5415 /// instruction %virt = add %0, 0.
5416 class ScheduleCopyableData final : public ScheduleEntity {
5417 /// The source schedule data for the instruction.
5418 Instruction *Inst = nullptr;
5419 /// The edge information for the instruction.
5420 const EdgeInfo EI;
5421 /// This ScheduleData is in the current scheduling region if this matches
5422 /// the current SchedulingRegionID of BlockScheduling.
5423 int SchedulingRegionID = 0;
5424 /// Bundle, this data is part of.
5425 ScheduleBundle &Bundle;
5426
5427 public:
5428 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5429 const EdgeInfo &EI, ScheduleBundle &Bundle)
5430 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5431 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5432 static bool classof(const ScheduleEntity *Entity) {
5433 return Entity->getKind() == Kind::ScheduleCopyableData;
5434 }
5435
5436 /// Verify basic self consistency properties
5437 void verify() {
5438 if (hasValidDependencies()) {
5439 assert(UnscheduledDeps <= Dependencies && "invariant");
5440 } else {
5441 assert(UnscheduledDeps == Dependencies && "invariant");
5442 }
5443
5444 if (IsScheduled) {
5445 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5446 "unexpected scheduled state");
5447 }
5448 }
5449
5450 /// Returns true if the dependency information has been calculated.
5451 /// Note that depenendency validity can vary between instructions within
5452 /// a single bundle.
5453 bool hasValidDependencies() const {
5454 return Dependencies != ScheduleData::InvalidDeps;
5455 }
5456
5457 /// Returns true if it is ready for scheduling, i.e. it has no more
5458 /// unscheduled depending instructions/bundles.
5459 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5460
5461 /// Modifies the number of unscheduled dependencies for this instruction,
5462 /// and returns the number of remaining dependencies for the containing
5463 /// bundle.
5464 int incrementUnscheduledDeps(int Incr) {
5465 assert(hasValidDependencies() &&
5466 "increment of unscheduled deps would be meaningless");
5467 UnscheduledDeps += Incr;
5468 assert(UnscheduledDeps >= 0 && "invariant");
5469 return UnscheduledDeps;
5470 }
5471
5472 /// Sets the number of unscheduled dependencies to the number of
5473 /// dependencies.
5474 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5475
5476 /// Gets the number of unscheduled dependencies.
5477 int getUnscheduledDeps() const { return UnscheduledDeps; }
5478 /// Gets the number of dependencies.
5479 int getDependencies() const { return Dependencies; }
5480 /// Initializes the number of dependencies.
5481 void initDependencies() { Dependencies = 0; }
5482 /// Increments the number of dependencies.
5483 void incDependencies() { Dependencies++; }
5484
5485 /// Gets scheduling region ID.
5486 int getSchedulingRegionID() const { return SchedulingRegionID; }
5487
5488 /// Gets the instruction.
5489 Instruction *getInst() const { return Inst; }
5490
5491 /// Clears all dependency information.
5492 void clearDependencies() {
5493 Dependencies = ScheduleData::InvalidDeps;
5494 UnscheduledDeps = ScheduleData::InvalidDeps;
5495 IsScheduled = false;
5496 }
5497
5498 /// Gets the edge information.
5499 const EdgeInfo &getEdgeInfo() const { return EI; }
5500
5501 /// Gets the bundle.
5502 ScheduleBundle &getBundle() { return Bundle; }
5503 const ScheduleBundle &getBundle() const { return Bundle; }
5504
5505#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5506 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5507
5508 LLVM_DUMP_METHOD void dump() const {
5509 dump(dbgs());
5510 dbgs() << '\n';
5511 }
5512#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5513
5514 private:
5515 /// true, if it has valid dependency information. These nodes always have
5516 /// only single dependency.
5517 int Dependencies = ScheduleData::InvalidDeps;
5518
5519 /// The number of dependencies minus the number of dependencies of scheduled
5520 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5521 /// for scheduling.
5522 /// Note that this is negative as long as Dependencies is not calculated.
5523 int UnscheduledDeps = ScheduleData::InvalidDeps;
5524 };
5525
5526#ifndef NDEBUG
/// Streams \p SD to \p OS by forwarding to
/// ScheduleCopyableData::dump(raw_ostream &); only available in builds
/// without NDEBUG.
5527 friend inline raw_ostream &
5528 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5529 SD.dump(OS);
5530 return OS;
5531 }
5532#endif
5533
// Give the graph-traits specializations for BoUpSLP * access to the private
// members of this class.
// NOTE(review): presumably needed for viewing/printing the SLP tree as a
// graph - confirm against the specializations.
5534 friend struct GraphTraits<BoUpSLP *>;
5535 friend struct DOTGraphTraits<BoUpSLP *>;
5536
5537 /// Contains all scheduling data for a basic block.
5538 /// It does not schedule instructions which are not memory read/write
5539 /// instructions and whose operands are either constants, or arguments, or
5540 /// phis, or instructions from other blocks, or whose users are phis or from
5541 /// other blocks. The resulting vector instructions can be placed at the
5542 /// beginning of the basic block without scheduling (if the operands do not
5543 /// need to be scheduled) or at the end of the block (if the users are outside
5544 /// of the block). This saves some compile time and memory used by the
5545 /// compiler.
5546 /// ScheduleData is assigned to each instruction between the boundaries of
5547 /// the tree entry, even to those which are not part of the graph. This is
5548 /// required to correctly follow the dependencies between the instructions and
5549 /// to schedule them correctly. ScheduleData is not allocated for
5550 /// instructions which do not require scheduling, like phis, nodes with
5551 /// extractelements/insertelements only, or nodes with instructions with
5552 /// uses/operands outside of the block.
5553 struct BlockScheduling {
/// Creates the scheduling state for the block \p BB. ChunkSize is set to the
/// number of instructions in \p BB and ChunkPos starts out equal to
/// ChunkSize.
// NOTE(review): ChunkPos is initialized from the ChunkSize member, so
// ChunkSize has to be declared before ChunkPos - confirm at the member
// declarations.
5554 BlockScheduling(BasicBlock *BB)
5555 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5556
5557 void clear() {
5558 ScheduledBundles.clear();
5559 ScheduledBundlesList.clear();
5560 ScheduleCopyableDataMap.clear();
5561 ScheduleCopyableDataMapByInst.clear();
5562 ScheduleCopyableDataMapByInstUser.clear();
5563 ScheduleCopyableDataMapByUsers.clear();
5564 ReadyInsts.clear();
5565 ScheduleStart = nullptr;
5566 ScheduleEnd = nullptr;
5567 FirstLoadStoreInRegion = nullptr;
5568 LastLoadStoreInRegion = nullptr;
5569 RegionHasStackSave = false;
5570
5571 // Reduce the maximum schedule region size by the size of the
5572 // previous scheduling run.
5573 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5574 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5575 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5576 ScheduleRegionSize = 0;
5577
5578 // Make a new scheduling region, i.e. all existing ScheduleData is not
5579 // in the new region yet.
5580 ++SchedulingRegionID;
5581 }
5582
5583 ScheduleData *getScheduleData(Instruction *I) {
5584 if (!I)
5585 return nullptr;
5586 if (BB != I->getParent())
5587 // Avoid lookup if can't possibly be in map.
5588 return nullptr;
5589 ScheduleData *SD = ScheduleDataMap.lookup(I);
5590 if (SD && isInSchedulingRegion(*SD))
5591 return SD;
5592 return nullptr;
5593 }
5594
5595 ScheduleData *getScheduleData(Value *V) {
5596 return getScheduleData(dyn_cast<Instruction>(V));
5597 }
5598
5599 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5600 /// operand number) and value.
5601 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5602 const Value *V) const {
5603 if (ScheduleCopyableDataMap.empty())
5604 return nullptr;
5605 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5606 if (It == ScheduleCopyableDataMap.end())
5607 return nullptr;
5608 ScheduleCopyableData *SD = It->getSecond().get();
5609 if (!isInSchedulingRegion(*SD))
5610 return nullptr;
5611 return SD;
5612 }
5613
5614 /// Returns the ScheduleCopyableData for the given user \p User, operand
5615 /// number and operand \p V.
5617 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5618 const Value *V) {
5619 if (ScheduleCopyableDataMapByInstUser.empty())
5620 return {};
5621 const auto It = ScheduleCopyableDataMapByInstUser.find(
5622 std::make_pair(std::make_pair(User, OperandIdx), V));
5623 if (It == ScheduleCopyableDataMapByInstUser.end())
5624 return {};
5626 for (ScheduleCopyableData *SD : It->getSecond()) {
5627 if (isInSchedulingRegion(*SD))
5628 Res.push_back(SD);
5629 }
5630 return Res;
5631 }
5632
5633 /// Returns true if all operands of the given instruction \p User are
5634 /// replaced by copyable data.
5635 /// \param User The user instruction.
5636 /// \param Op The operand, which might be replaced by the copyable data.
5637 /// \param SLP The SLP tree.
5638 /// \param NumOps The number of operands used. If the instruction uses the
5639 /// same operand several times, check for the first use, then the second,
5640 /// etc.
/// \returns false early if no copyable data is registered at all or if
/// \p User has no tree entries.
5641 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5642 Instruction *Op, BoUpSLP &SLP,
5643 unsigned NumOps) const {
5644 assert(NumOps > 0 && "No operands");
5645 if (ScheduleCopyableDataMap.empty())
5646 return false;
// Entries whose operands may have been reordered (commutative users and
// cmps); they are counted here and resolved in the second loop below.
5647 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5648 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5649 if (Entries.empty())
5650 return false;
// Number of uses of Op seen so far among the operands of User.
5651 unsigned CurNumOps = 0;
5652 for (const Use &U : User->operands()) {
5653 if (U.get() != Op)
5654 continue;
5655 ++CurNumOps;
5656 // Check all tree entries, if they have operands replaced by copyable
5657 // data.
5658 for (TreeEntry *TE : Entries) {
5659 unsigned Inc = 0;
5660 bool IsNonSchedulableWithParentPhiNode =
5661 TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
5662 TE->UserTreeIndex.UserTE->hasState() &&
5663 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5664 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5665 // Count the number of unique phi nodes, which are the parent for
5666 // parent entry, and exit, if all the unique phis are processed.
5667 if (IsNonSchedulableWithParentPhiNode) {
5668 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5669 const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
5670 for (Value *V : ParentTE->Scalars) {
5671 auto *PHI = dyn_cast<PHINode>(V);
5672 if (!PHI)
5673 continue;
5674 if (ParentsUniqueUsers.insert(PHI).second &&
5675 is_contained(PHI->incoming_values(), User))
5676 ++Inc;
5677 }
5678 } else {
// Otherwise count how many lanes of the entry hold User.
5679 Inc = count(TE->Scalars, User);
5680 }
5681
5682 // Check if the user is commutative.
5683 // The commutatives are handled later, as their operands can be
5684 // reordered.
5685 // Same applies even for non-commutative cmps, because we can invert
5686 // their predicate potentially and, thus, reorder the operands.
5687 bool IsCommutativeUser =
5688 ::isCommutative(User) &&
5689 ::isCommutableOperand(User, User, U.getOperandNo());
5690 if (!IsCommutativeUser) {
5691 Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User);
5692 IsCommutativeUser =
5693 ::isCommutative(MainOp, User) &&
5694 ::isCommutableOperand(MainOp, User, U.getOperandNo());
5695 }
5696 // The commutative user with the same operands can be safely
5697 // considered as non-commutative, operands reordering does not change
5698 // the semantics.
5699 assert(
5700 (!IsCommutativeUser ||
5701 (((::isCommutative(User) &&
5702 ::isCommutableOperand(User, User, 0) &&
5703 ::isCommutableOperand(User, User, 1)) ||
5704 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
5705 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5706 User, 0) &&
5707 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5708 User, 1))))) &&
5709 "Expected commutative user with 2 first commutable operands");
5710 bool IsCommutativeWithSameOps =
5711 IsCommutativeUser && User->getOperand(0) == User->getOperand(1);
// Fixed operand order: the edge (TE, operand number) can be checked
// directly. Only the NumOps-th use of Op is required to have copyable
// data; earlier uses are skipped.
5712 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5713 !isa<CmpInst>(User)) {
5714 EdgeInfo EI(TE, U.getOperandNo());
5715 if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
5716 continue;
5717 return false;
5718 }
5719 PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5720 .first->getSecond() += Inc;
5721 }
5722 }
5723 if (PotentiallyReorderedEntriesCount.empty())
5724 return true;
5725 // Check the commutative/cmp entries.
5726 for (auto &P : PotentiallyReorderedEntriesCount) {
5727 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5728 bool IsNonSchedulableWithParentPhiNode =
5729 P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
5730 P.first->UserTreeIndex.UserTE->hasState() &&
5731 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5732 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
// Visit every lane of the entry that holds User.
5733 auto *It = find(P.first->Scalars, User);
5734 do {
5735 assert(It != P.first->Scalars.end() &&
5736 "User is not in the tree entry");
5737 int Lane = std::distance(P.first->Scalars.begin(), It);
5738 assert(Lane >= 0 && "Lane is not found");
5739 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5740 Lane = P.first->ReorderIndices[Lane];
5741 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5742 "Couldn't find extract lane");
5743 // Count the number of unique phi nodes, which are the parent for
5744 // parent entry, and exit, if all the unique phis are processed.
5745 if (IsNonSchedulableWithParentPhiNode) {
5746 const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
// NOTE(review): this local `User` shadows the function parameter `User`, so
// the find() inside this block searches for the parent scalar rather than
// for the original user instruction - confirm that this is intended.
5747 Value *User = ParentTE->Scalars[Lane];
5748 if (!ParentsUniqueUsers.insert(User).second) {
5749 It =
5750 find(make_range(std::next(It), P.first->Scalars.end()), User);
5751 continue;
5752 }
5753 }
// NOTE(review): the range expression of the loop below is truncated in this
// listing (a line is missing between the next two lines); restore it from
// the upstream file before compiling.
5754 for (unsigned OpIdx :
5756 P.first->getMainOp()))) {
// Each operand slot of this lane that holds Op and has copyable data
// registered cancels one counted use.
5757 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5758 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5759 --P.getSecond();
5760 }
5761 // If parent node is schedulable, it will be handled correctly.
5762 It = find(make_range(std::next(It), P.first->Scalars.end()), User);
5763 } while (It != P.first->Scalars.end());
5764 }
// Succeed only if every potentially reordered entry is left with exactly
// NumOps - 1 counted uses.
5765 return all_of(PotentiallyReorderedEntriesCount,
5766 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5767 return P.second == NumOps - 1;
5768 });
5769 }
5770
5772 getScheduleCopyableData(const Instruction *I) const {
5773 if (ScheduleCopyableDataMapByInst.empty())
5774 return {};
5775 const auto It = ScheduleCopyableDataMapByInst.find(I);
5776 if (It == ScheduleCopyableDataMapByInst.end())
5777 return {};
5779 for (ScheduleCopyableData *SD : It->getSecond()) {
5780 if (isInSchedulingRegion(*SD))
5781 Res.push_back(SD);
5782 }
5783 return Res;
5784 }
5785
5787 getScheduleCopyableDataUsers(const Instruction *User) const {
5788 if (ScheduleCopyableDataMapByUsers.empty())
5789 return {};
5790 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5791 if (It == ScheduleCopyableDataMapByUsers.end())
5792 return {};
5794 for (ScheduleCopyableData *SD : It->getSecond()) {
5795 if (isInSchedulingRegion(*SD))
5796 Res.push_back(SD);
5797 }
5798 return Res;
5799 }
5800
/// Creates the ScheduleCopyableData for the edge \p EI and the instruction
/// \p I, stamps it with \p SchedulingRegionID and \p Bundle, and registers it
/// in the lookup maps (by edge/value, by instruction, by user/operand and by
/// users). Must not be called if such data is already registered in the
/// current region.
/// \returns a reference to the newly created data, which is owned by
/// ScheduleCopyableDataMap.
5801 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5802 Instruction *I,
5803 int SchedulingRegionID,
5804 ScheduleBundle &Bundle) {
5805 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5806 ScheduleCopyableData *CD =
5807 ScheduleCopyableDataMap
5808 .try_emplace(std::make_pair(EI, I),
5809 std::make_unique<ScheduleCopyableData>(
5810 SchedulingRegionID, I, EI, Bundle))
5811 .first->getSecond()
5812 .get();
5813 ScheduleCopyableDataMapByInst[I].push_back(CD);
5814 if (EI.UserTE) {
// Walk over every lane of the user's operand that holds I.
5815 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5816 const auto *It = find(Op, I);
5817 assert(It != Op.end() && "Lane not set");
// User instructions that were already registered for this data.
5818 SmallPtrSet<Instruction *, 4> Visited;
5819 do {
5820 int Lane = std::distance(Op.begin(), It);
5821 assert(Lane >= 0 && "Lane not set");
5822 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5823 !EI.UserTE->ReorderIndices.empty())
5824 Lane = EI.UserTE->ReorderIndices[Lane];
5825 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5826 "Couldn't find extract lane");
5827 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
// Register each user instruction only once; just advance to the next lane
// that holds I.
5828 if (!Visited.insert(In).second) {
5829 It = find(make_range(std::next(It), Op.end()), I);
5830 continue;
5831 }
5832 ScheduleCopyableDataMapByInstUser
5833 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5834 .first->getSecond()
5835 .push_back(CD);
5836 ScheduleCopyableDataMapByUsers.try_emplace(I)
5837 .first->getSecond()
5838 .insert(CD);
5839 // Remove extra deps for users, becoming non-immediate users of the
5840 // instruction. It may happen, if the chain of same copyable elements
5841 // appears in the tree.
5842 if (In == I) {
5843 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5844 if (ScheduleCopyableData *UserCD =
5845 getScheduleCopyableData(UserEI, In))
5846 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5847 }
5848 It = find(make_range(std::next(It), Op.end()), I);
5849 } while (It != Op.end());
5850 } else {
// No user tree entry: only the by-users map is updated.
5851 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5852 CD);
5853 }
5854 return *CD;
5855 }
5856
5857 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5858 auto *I = dyn_cast<Instruction>(V);
5859 if (!I)
5860 return {};
5861 auto It = ScheduledBundles.find(I);
5862 if (It == ScheduledBundles.end())
5863 return {};
5864 return It->getSecond();
5865 }
5866
5867 /// Returns true if the entity is in the scheduling region.
5868 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5869 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5870 return Data->getSchedulingRegionID() == SchedulingRegionID;
5871 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5872 return CD->getSchedulingRegionID() == SchedulingRegionID;
5873 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5874 [&](const ScheduleEntity *BundleMember) {
5875 return isInSchedulingRegion(*BundleMember);
5876 });
5877 }
5878
5879 /// Marks an instruction as scheduled and puts all dependent ready
5880 /// instructions into the ready-list.
/// \param R the SLP graph; queried for the tree entries of a stand-alone
/// scheduled instruction.
/// \param S instruction state; consulted through isExpandedBinOp() for
/// bundles that have no tree entry.
/// \param EI NOTE(review): not referenced in the visible part of the body.
/// \param Data the ScheduleData or ScheduleBundle being scheduled.
/// \param ReadyList receives bundles / schedule data that become ready once
/// their unscheduled-dependency counters have been decremented.
/// NOTE(review): this listing appears to have lost several source lines (the
/// embedded numbering skips e.g. 5886, 5897, 5910, 5918, 5928, 6040, 6083,
/// 6110, 6118-6119, 6124, 6142, 6150-6151) — confirm against the original
/// file before relying on the text below.
5881 template <typename ReadyListType>
5882 void schedule(const BoUpSLP &R, const InstructionsState &S,
5883 const EdgeInfo &EI, ScheduleEntity *Data,
5884 ReadyListType &ReadyList) {
// Releases the dependencies held by one scheduled member: def-use operands
// first, then memory dependencies, then control dependencies.
5885 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5887 // Handle the def-use chain dependencies.
5888
5889 // Decrement the unscheduled counter and insert to ready list if ready.
5890 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5891 if ((IsControl || Data->hasValidDependencies()) &&
5892 Data->incrementUnscheduledDeps(-1) == 0) {
5893 // There are no more unscheduled dependencies after
5894 // decrementing, so we can put the dependent instruction
5895 // into the ready list.
5896 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5898 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
// Copyable data is checked against its own bundle only.
5899 CopyableBundle.push_back(&CD->getBundle());
5900 Bundles = CopyableBundle;
5901 } else {
5902 Bundles = getScheduleBundles(Data->getInst());
5903 }
5904 if (!Bundles.empty()) {
// Part of at least one bundle: a bundle becomes ready only when it
// has no unscheduled dependencies left as a whole.
5905 for (ScheduleBundle *Bundle : Bundles) {
5906 if (Bundle->unscheduledDepsInBundle() == 0) {
5907 assert(!Bundle->isScheduled() &&
5908 "already scheduled bundle gets ready");
5909 ReadyList.insert(Bundle);
5911 << "SLP: gets ready: " << *Bundle << "\n");
5912 }
5913 }
5914 return;
5915 }
// Not part of any bundle: the data itself goes to the ready list.
5916 assert(!Data->isScheduled() &&
5917 "already scheduled bundle gets ready");
5919 "Expected non-copyable data");
5920 ReadyList.insert(Data);
5921 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5922 }
5923 };
5924
// Variant used for stand-alone instructions: prefers the copyable data
// registered for (User, OpIdx, I) and falls back to I's ScheduleData.
5925 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5926 Instruction *I) {
5927 if (!ScheduleCopyableDataMap.empty()) {
5929 getScheduleCopyableData(User, OpIdx, I);
5930 for (ScheduleCopyableData *CD : CopyableData)
5931 DecrUnsched(CD, /*IsControl=*/false);
5932 if (!CopyableData.empty())
5933 return;
5934 }
5935 if (ScheduleData *OpSD = getScheduleData(I))
5936 DecrUnsched(OpSD, /*IsControl=*/false);
5937 };
5938
5939 // If BundleMember is a vector bundle, its operands may have been
5940 // reordered during buildTree(). We therefore need to get its operands
5941 // through the TreeEntry.
5942 if (!Bundles.empty()) {
5943 auto *In = BundleMember->getInst();
5944 // Count uses of each instruction operand.
5945 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5946 unsigned TotalOpCount = 0;
5947 if (isa<ScheduleCopyableData>(BundleMember)) {
5948 // Copyable data is used only once (uses itself).
5949 TotalOpCount = OperandsUses[In] = 1;
5950 } else {
5951 for (const Use &U : In->operands()) {
5952 if (auto *I = dyn_cast<Instruction>(U.get())) {
5953 auto Res = OperandsUses.try_emplace(I, 0);
5954 unsigned ExtraDeps = 1;
5955 // Count all expanded operands in the binops.
5956 for (ScheduleBundle *Bundle : Bundles) {
5957 if (const TreeEntry *TE = Bundle->getTreeEntry()) {
5958 if (TE->isExpandedBinOp(In))
5959 ++ExtraDeps;
5960 } else if (S.isExpandedBinOp(In)) {
5961 ++ExtraDeps;
5962 }
5963 }
5964 Res.first->getSecond() += ExtraDeps;
5965 TotalOpCount += ExtraDeps;
5966 }
5967 }
5968 }
5969 // Decrement the unscheduled counter and insert to ready list if
5970 // ready.
// Tree-entry based variant; it shadows the outer DecrUnschedForInst inside
// this branch. `Checked` prevents the same (entity, operand index) pair
// from being decremented twice.
5971 auto DecrUnschedForInst =
5972 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5973 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5974 &Checked,
5975 bool IsExpandedOperand = false) {
5976 if (!ScheduleCopyableDataMap.empty()) {
5977 const EdgeInfo EI = {UserTE, OpIdx};
5978 if (ScheduleCopyableData *CD =
5979 getScheduleCopyableData(EI, I)) {
5980 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5981 return;
5982 DecrUnsched(CD, /*IsControl=*/false);
5983 return;
5984 }
5985 }
5986 auto It = OperandsUses.find(I);
5987 assert(It != OperandsUses.end() && "Operand not found");
5988 if (It->second > 0) {
5989 if (ScheduleData *OpSD = getScheduleData(I)) {
5990 if (!IsExpandedOperand &&
5991 !Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5992 return;
5993 --It->getSecond();
5994 assert(TotalOpCount > 0 && "No more operands to decrement");
5995 --TotalOpCount;
5996 DecrUnsched(OpSD, /*IsControl=*/false);
5997 } else {
// No ScheduleData for the operand: only the bookkeeping counters are
// updated.
5998 --It->getSecond();
5999 assert(TotalOpCount > 0 && "No more operands to decrement");
6000 --TotalOpCount;
6001 }
6002 }
6003 };
6004
6005 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
6006 for (ScheduleBundle *Bundle : Bundles) {
// Without copyable data, stop as soon as every counted operand use has
// been consumed.
6007 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
6008 break;
6009 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
6010 // Need to search for the lane since the tree entry can be
6011 // reordered.
6012 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
6013 bool IsNonSchedulableWithParentPhiNode =
6014 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
6015 Bundle->getTreeEntry()->UserTreeIndex &&
6016 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
6017 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
6018 TreeEntry::SplitVectorize &&
6019 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
6020 Instruction::PHI;
// One iteration per lane of the tree entry that holds In.
6021 do {
6022 int Lane =
6023 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
6024 assert(Lane >= 0 && "Lane not set");
6025 if (isa<StoreInst>(In) &&
6026 !Bundle->getTreeEntry()->ReorderIndices.empty())
6027 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
6028 assert(Lane < static_cast<int>(
6029 Bundle->getTreeEntry()->Scalars.size()) &&
6030 "Couldn't find extract lane");
6031
6032 // Since vectorization tree is being built recursively this
6033 // assertion ensures that the tree entry has all operands set
6034 // before reaching this code. Couple of exceptions known at the
6035 // moment are extracts where their second (immediate) operand is
6036 // not added. Since immediates do not affect scheduler behavior
6037 // this is considered okay.
6038 assert(
6039 In &&
6041 In->getNumOperands() ==
6042 Bundle->getTreeEntry()->getNumOperands() ||
6043 (isa<ZExtInst>(In) && Bundle->getTreeEntry()->getOpcode() ==
6044 Instruction::Select) ||
6045 Bundle->getTreeEntry()->isCopyableElement(In)) &&
6046 "Missed TreeEntry operands?");
6047
6048 // Count the number of unique phi nodes, which are the parent for
6049 // parent entry, and exit, if all the unique phis are processed.
6050 if (IsNonSchedulableWithParentPhiNode) {
6051 const TreeEntry *ParentTE =
6052 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
6053 Value *User = ParentTE->Scalars[Lane];
6054 if (!ParentsUniqueUsers.insert(User).second) {
6055 It = std::find(std::next(It),
6056 Bundle->getTreeEntry()->Scalars.end(), In);
6057 continue;
6058 }
6059 }
6060
// Release every instruction operand of this lane, taken from the tree
// entry's operand lists.
6061 for (unsigned OpIdx :
6062 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
6063 if (auto *I = dyn_cast<Instruction>(
6064 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
6065 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
6066 << *I << "\n");
6067 DecrUnschedForInst(
6068 I, Bundle->getTreeEntry(), OpIdx, Checked,
6069 Bundle->getTreeEntry()->isExpandedOperand(In, OpIdx));
6070 }
6071 // If parent node is schedulable, it will be handled correctly.
6072 if (Bundle->getTreeEntry()->isCopyableElement(In))
6073 break;
6074 It = std::find(std::next(It),
6075 Bundle->getTreeEntry()->Scalars.end(), In);
6076 } while (It != Bundle->getTreeEntry()->Scalars.end());
6077 }
6078 } else {
6079 // If BundleMember is a stand-alone instruction, no operand reordering
6080 // has taken place, so we directly access its operands.
6081 for (Use &U : BundleMember->getInst()->operands()) {
6082 if (auto *I = dyn_cast<Instruction>(U.get())) {
6084 << "SLP: check for readiness (def): " << *I << "\n");
6085 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
6086 }
6087 }
6088 }
6089 // Handle the memory dependencies.
// Only plain ScheduleData carries memory/control dependencies here; any
// other member kind is done at this point.
6090 auto *SD = dyn_cast<ScheduleData>(BundleMember);
6091 if (!SD)
6092 return;
6093 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
6094 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
6095 if (!VisitedMemory.insert(MemoryDep).second)
6096 continue;
6097 // There are no more unscheduled dependencies after decrementing,
6098 // so we can put the dependent instruction into the ready list.
6099 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
6100 << *MemoryDep << "\n");
6101 DecrUnsched(MemoryDep);
6102 }
6103 // Handle the control dependencies.
6104 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
6105 for (ScheduleData *Dep : SD->getControlDependencies()) {
6106 if (!VisitedControl.insert(Dep).second)
6107 continue;
6108 // There are no more unscheduled dependencies after decrementing,
6109 // so we can put the dependent instruction into the ready list.
6111 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
6112 DecrUnsched(Dep, /*IsControl=*/true);
6113 }
6114 };
// Stand-alone ScheduleData: mark it scheduled and wrap it into temporary
// single-member bundles (kept alive by PseudoBundles), one per tree entry of
// the instruction that passes the filter below.
// NOTE(review): the declarations of PseudoBundles/Bundles and the first half
// of the filter condition are not visible in this listing.
6115 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
6116 SD->setScheduled(/*Scheduled=*/true);
6117 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
6120 Instruction *In = SD->getInst();
6121 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
6122 if (!Entries.empty()) {
6123 for (TreeEntry *TE : Entries) {
6125 In->getNumOperands() != TE->getNumOperands())
6126 continue;
6127 auto &BundlePtr =
6128 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
6129 BundlePtr->setTreeEntry(TE);
6130 BundlePtr->add(SD);
6131 Bundles.push_back(BundlePtr.get());
6132 }
6133 }
6134 ProcessBundleMember(SD, Bundles);
6135 } else {
// A real bundle: mark it scheduled, then process each member once all of
// the bundles the member belongs to are scheduled.
6136 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
6137 Bundle.setScheduled(/*Scheduled=*/true);
6138 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
6139 auto AreAllBundlesScheduled =
6140 [&](const ScheduleEntity *SD,
6141 ArrayRef<ScheduleBundle *> SDBundles) {
6143 return true;
6144 return !SDBundles.empty() &&
6145 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
6146 return SDBundle->isScheduled();
6147 });
6148 };
6149 for (ScheduleEntity *SD : Bundle.getBundle()) {
6152 SDBundles = getScheduleBundles(SD->getInst());
6153 if (AreAllBundlesScheduled(SD, SDBundles)) {
6154 SD->setScheduled(/*Scheduled=*/true);
// Copyable data is processed against the current bundle only.
6155 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
6156 : SDBundles);
6157 }
6158 }
6159 }
6160 }
6161
6162 /// Verify basic self consistency properties of the data structure.
6163 void verify() {
6164 if (!ScheduleStart)
6165 return;
6166
6167 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
6168 ScheduleStart->comesBefore(ScheduleEnd) &&
6169 "Not a valid scheduling region?");
6170
6171 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
6172 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
6173 if (!Bundles.empty()) {
6174 for (ScheduleBundle *Bundle : Bundles) {
6175 assert(isInSchedulingRegion(*Bundle) &&
6176 "primary schedule data not in window?");
6177 Bundle->verify();
6178 }
6179 continue;
6180 }
6181 auto *SD = getScheduleData(I);
6182 if (!SD)
6183 continue;
6184 assert(isInSchedulingRegion(*SD) &&
6185 "primary schedule data not in window?");
6186 SD->verify();
6187 }
6188
6189 assert(all_of(ReadyInsts,
6190 [](const ScheduleEntity *Bundle) {
6191 return Bundle->isReady();
6192 }) &&
6193 "item in ready list not ready?");
6194 }
6195
6196 /// Put all instructions into the ReadyList which are ready for scheduling.
6197 template <typename ReadyListType>
6198 void initialFillReadyList(ReadyListType &ReadyList) {
6199 SmallPtrSet<ScheduleBundle *, 16> Visited;
6200 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
6201 ScheduleData *SD = getScheduleData(I);
6202 if (SD && SD->hasValidDependencies() && SD->isReady()) {
6203 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
6204 !Bundles.empty()) {
6205 for (ScheduleBundle *Bundle : Bundles) {
6206 if (!Visited.insert(Bundle).second)
6207 continue;
6208 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
6209 ReadyList.insert(Bundle);
6210 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
6211 << *Bundle << "\n");
6212 }
6213 }
6214 continue;
6215 }
6216 ReadyList.insert(SD);
6218 << "SLP: initially in ready list: " << *SD << "\n");
6219 }
6220 }
6221 }
6222
6223 /// Build a bundle from the ScheduleData nodes corresponding to the
6224 /// scalar instruction for each lane.
6225 /// \param VL The list of scalar instructions.
6226 /// \param S The state of the instructions.
6227 /// \param EI The edge in the SLP graph or the user node/operand number.
6228 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
6229 const InstructionsState &S, const EdgeInfo &EI);
6230
6231 /// Checks if a bundle of instructions can be scheduled, i.e. has no
6232 /// cyclic dependencies. This is only a dry-run, no instructions are
6233 /// actually moved at this stage.
6234 /// \returns the scheduling bundle. The returned Optional value is not
6235 /// std::nullopt if \p VL is allowed to be scheduled.
6236 std::optional<ScheduleBundle *>
6237 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
6238 const InstructionsState &S, const EdgeInfo &EI);
6239
6240 /// Allocates schedule data chunk.
6241 ScheduleData *allocateScheduleDataChunks();
6242
6243 /// Extends the scheduling region so that V is inside the region.
6244 /// \returns true if the region size is within the limit.
6245 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
6246
6247 /// Initialize the ScheduleData structures for new instructions in the
6248 /// scheduling region.
6249 void initScheduleData(Instruction *FromI, Instruction *ToI,
6250 ScheduleData *PrevLoadStore,
6251 ScheduleData *NextLoadStore);
6252
6253 /// Updates the dependency information of a bundle and of all instructions/
6254 /// bundles which depend on the original bundle.
6255 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
6256 BoUpSLP *SLP,
6257 const SmallPtrSetImpl<Value *> &ExpandedOps,
6258 ArrayRef<ScheduleData *> ControlDeps = {});
6259
6260 /// Sets all instructions in the scheduling region to un-scheduled.
6261 void resetSchedule();
6262
6263 BasicBlock *BB;
6264
6265 /// Simple memory allocation for ScheduleData.
6267
6268 /// The size of a ScheduleData array in ScheduleDataChunks.
6269 int ChunkSize;
6270
6271 /// The allocator position in the current chunk, which is the last entry
6272 /// of ScheduleDataChunks.
6273 int ChunkPos;
6274
6275 /// Attaches ScheduleData to Instruction.
6276 /// Note that the mapping survives during all vectorization iterations, i.e.
6277 /// ScheduleData structures are recycled.
6278 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
6279
6280 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
6281 /// number) and the operand instruction, represented as copyable element.
6282 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
6283 std::unique_ptr<ScheduleCopyableData>>
6284 ScheduleCopyableDataMap;
6285
6286 /// Represents mapping between instruction and all related
6287 /// ScheduleCopyableData (for all uses in the tree, represented as copyable
6288 /// element). The SLP tree may contain several representations of the same
6289 /// instruction.
6290 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
6291 ScheduleCopyableDataMapByInst;
6292
6293 /// Represents mapping between user value and operand number, the operand
6294 /// value and all related ScheduleCopyableData. The relation is 1:n, because
6295 /// the same user may reference the same operand in different tree entries
6296 /// and the operand may be modelled by the different copyable data element.
6297 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
6299 ScheduleCopyableDataMapByInstUser;
6300
6301 /// Represents mapping between instruction and all related
6302 /// ScheduleCopyableData. It represents the mapping between the actual
6303 /// instruction and the last copyable data element in the chain. E.g., if
6304 /// the graph models the following instructions:
6305 /// %0 = non-add instruction ...
6306 /// ...
6307 /// %4 = add %3, 1
6308 /// %5 = add %4, 1
6309 /// %6 = insertelement poison, %0, 0
6310 /// %7 = insertelement %6, %5, 1
6311 /// And the graph is modeled as:
6312 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
6313 /// -> [1, 0] -> [%1, 0]
6314 ///
6315 /// this map will map %0 only to the copyable element <1>, which is the last
6316 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
6317 /// keep the map to <0>, not the %0.
6318 SmallDenseMap<const Instruction *,
6319 SmallSetVector<ScheduleCopyableData *, 4>>
6320 ScheduleCopyableDataMapByUsers;
6321
6322 /// Attaches ScheduleBundle to Instruction.
6323 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6324 ScheduledBundles;
6325 /// The list of ScheduleBundles.
6326 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
6327
6328 /// The ready-list for scheduling (only used for the dry-run).
6329 SetVector<ScheduleEntity *> ReadyInsts;
6330
6331 /// The first instruction of the scheduling region.
6332 Instruction *ScheduleStart = nullptr;
6333
6334 /// The first instruction _after_ the scheduling region.
6335 Instruction *ScheduleEnd = nullptr;
6336
6337 /// The first memory accessing instruction in the scheduling region
6338 /// (can be null).
6339 ScheduleData *FirstLoadStoreInRegion = nullptr;
6340
6341 /// The last memory accessing instruction in the scheduling region
6342 /// (can be null).
6343 ScheduleData *LastLoadStoreInRegion = nullptr;
6344
6345 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
6346 /// region? Used to optimize the dependence calculation for the
6347 /// common case where there isn't.
6348 bool RegionHasStackSave = false;
6349
6350 /// The current size of the scheduling region.
6351 int ScheduleRegionSize = 0;
6352
6353 /// The maximum size allowed for the scheduling region.
6354 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
6355
6356 /// The ID of the scheduling region. For a new vectorization iteration this
6357 /// is incremented which "removes" all ScheduleData from the region.
6358 /// Make sure that the initial SchedulingRegionID is greater than the
6359 /// initial SchedulingRegionID in ScheduleData (which is 0).
6360 int SchedulingRegionID = 1;
6361 };
6362
6363 /// Attaches the BlockScheduling structures to basic blocks.
6364 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6365
6366 /// Performs the "real" scheduling. Done before vectorization is actually
6367 /// performed in a basic block.
6368 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6369
6370 /// List of users to ignore during scheduling and that don't need extracting.
6371 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6372
6373 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6374 /// sorted SmallVectors of unsigned.
6375 struct OrdersTypeDenseMapInfo {
6376 static OrdersType getEmptyKey() {
6377 OrdersType V;
6378 V.push_back(~1U);
6379 return V;
6380 }
6381
6382 static OrdersType getTombstoneKey() {
6383 OrdersType V;
6384 V.push_back(~2U);
6385 return V;
6386 }
6387
6388 static unsigned getHashValue(const OrdersType &V) {
6389 return static_cast<unsigned>(hash_combine_range(V));
6390 }
6391
6392 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6393 return LHS == RHS;
6394 }
6395 };
6396
6397 // Analysis and block reference.
6398 Function *F;
6399 ScalarEvolution *SE;
6400 TargetTransformInfo *TTI;
6401 TargetLibraryInfo *TLI;
6402 LoopInfo *LI;
6403 DominatorTree *DT;
6404 AssumptionCache *AC;
6405 DemandedBits *DB;
6406 const DataLayout *DL;
6407 OptimizationRemarkEmitter *ORE;
6408
6409 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6410 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6411
6412 /// Instruction builder to construct the vectorized tree.
6413 IRBuilder<TargetFolder> Builder;
6414
6415 /// A map of scalar integer values to the smallest bit width with which they
6416 /// can legally be represented. The values map to (width, signed) pairs,
6417 /// where "width" indicates the minimum bit width and "signed" is True if the
6418 /// value must be signed-extended, rather than zero-extended, back to its
6419 /// original width.
6420 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6421
6422 /// Final size of the reduced vector, if the current graph represents the
6423 /// input for the reduction and it was possible to narrow the size of the
6424 /// reduction.
6425 unsigned ReductionBitWidth = 0;
6426
6427 /// Canonical graph size before the transformations.
6428 unsigned BaseGraphSize = 1;
6429
6430 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6431 /// type sizes, used in the tree.
6432 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6433
6434 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
6435 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
6436 DenseSet<unsigned> ExtraBitWidthNodes;
6437};
6438
// DenseMapInfo specialization for BoUpSLP::EdgeInfo. The empty and tombstone
// keys, as well as the hash, are composed from the FirstInfo trait (applied
// to EdgeInfo::UserTE) and the SecondInfo trait (applied to
// EdgeInfo::EdgeIdx).
// NOTE(review): the FirstInfo/SecondInfo aliases and the signatures of the
// two key-producing functions are not visible in this listing (the embedded
// numbering jumps 6439 -> 6443 and 6446 -> 6448) — confirm against the
// original file.
6439template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
// Empty key: both components take their own trait's empty key.
6443 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6444 SecondInfo::getEmptyKey());
6445 }
6446
// Tombstone key: both components take their own trait's tombstone key.
6448 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6449 SecondInfo::getTombstoneKey());
6450 }
6451
// Combines the hashes of the user tree entry and the edge index.
6452 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6453 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
6454 SecondInfo::getHashValue(Val.EdgeIdx));
6455 }
6456
// Key equality is delegated to EdgeInfo's operator==.
6457 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6458 const BoUpSLP::EdgeInfo &RHS) {
6459 return LHS == RHS;
6460 }
6461};
6462
// GraphTraits specialization exposing the BoUpSLP vectorizable tree as a
// graph: nodes are the entries of VectorizableTree and the child iterator
// dereferences to the UserTE of an EdgeInfo.
// NOTE(review): many lines of this struct are not visible in this listing
// (the embedded numbering skips 6467, 6473, 6476, 6478-6480, 6485, 6489,
// 6493, 6499, 6506, 6513 and 6517), including the NodeRef alias, the iterator
// class headers and several function signatures — confirm against the
// original file.
6463template <> struct llvm::GraphTraits<BoUpSLP *> {
6464 using TreeEntry = BoUpSLP::TreeEntry;
6465
6466 /// NodeRef has to be a pointer per the GraphWriter.
6468
6469 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6470
6471 /// Add the VectorizableTree to the index iterator to be able to return
6472 /// TreeEntry pointers.
6474 : public iterator_adaptor_base<
6475 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6477
6481
// Dereferencing yields the user tree entry of the current edge.
6482 NodeRef operator*() { return I->UserTE; }
6483 };
6484
// Returns the first entry of VectorizableTree (signature not visible).
6486 return R.VectorizableTree[0].get();
6487 }
6488
// Iterator pair members built from the address of N->UserTreeIndex and one
// past it, i.e. a range over that single edge (signatures not visible).
6490 return {&N->UserTreeIndex, N->Container};
6491 }
6492
6494 return {&N->UserTreeIndex + 1, N->Container};
6495 }
6496
6497 /// For the node iterator we just need to turn the TreeEntry iterator into a
6498 /// TreeEntry* iterator so that it dereferences to NodeRef.
6500 using ItTy = ContainerTy::iterator;
6501 ItTy It;
6502
6503 public:
6504 nodes_iterator(const ItTy &It2) : It(It2) {}
// It points at an owning element of the container; get() yields the raw
// TreeEntry pointer.
6505 NodeRef operator*() { return It->get(); }
6507 ++It;
6508 return *this;
6509 }
6510 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6511 };
6512
// Begin/end of the node range over VectorizableTree (signatures not
// visible).
6514 return nodes_iterator(R->VectorizableTree.begin());
6515 }
6516
6518 return nodes_iterator(R->VectorizableTree.end());
6519 }
6520
// Number of entries in the vectorizable tree.
6521 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6522};
6523
6524template <>
6526 using TreeEntry = BoUpSLP::TreeEntry;
6527
// Forwards \p IsSimple (default false) to the DefaultDOTGraphTraits base.
// NOTE(review): the struct header for this specialization is not visible in
// this listing (embedded line 6525 is missing) — confirm against the
// original file.
6528 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6529
6530 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6531 std::string Str;
6532 raw_string_ostream OS(Str);
6533 OS << Entry->Idx << ".\n";
6534 if (isSplat(Entry->Scalars))
6535 OS << "<splat> ";
6536 for (auto *V : Entry->Scalars) {
6537 OS << *V;
6538 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6539 return EU.Scalar == V;
6540 }))
6541 OS << " <extract>";
6542 OS << "\n";
6543 }
6544 return Str;
6545 }
6546
6547 static std::string getNodeAttributes(const TreeEntry *Entry,
6548 const BoUpSLP *) {
6549 if (Entry->isGather())
6550 return "color=red";
6551 if (Entry->State == TreeEntry::ScatterVectorize ||
6552 Entry->State == TreeEntry::StridedVectorize ||
6553 Entry->State == TreeEntry::CompressVectorize)
6554 return "color=blue";
6555 return "";
6556 }
6557};
6558
// Final cleanup of the instructions collected in DeletedInstructions.
// NOTE(review): the enclosing function header and the declaration of
// DeadInsts are not visible in this listing (embedded lines 6559-6560 are
// missing, as are 6576 and 6588); presumably this is the BoUpSLP destructor —
// confirm against the original file.
6561 for (auto *I : DeletedInstructions) {
6562 if (!I->getParent()) {
6563 // Temporarily insert instruction back to erase them from parent and
6564 // memory later.
6565 if (isa<PHINode>(I))
6566 // Phi nodes must be the very first instructions in the block.
6567 I->insertBefore(F->getEntryBlock(),
6568 F->getEntryBlock().getFirstNonPHIIt());
6569 else
6570 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
// Re-inserted instructions skip the operand scan and dropAllReferences()
// below.
6571 continue;
6572 }
// Remember operands that are instructions outside DeletedInstructions with
// a single user (plus a further condition on a line not visible here) in
// DeadInsts.
6573 for (Use &U : I->operands()) {
6574 auto *Op = dyn_cast<Instruction>(U.get());
6575 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6577 DeadInsts.emplace_back(Op);
6578 }
6579 I->dropAllReferences();
6580 }
// Second pass: every collected instruction must be unused by now and is
// erased from its parent.
6581 for (auto *I : DeletedInstructions) {
6582 assert(I->use_empty() &&
6583 "trying to erase instruction with users.");
6584 I->eraseFromParent();
6585 }
6586
6587 // Cleanup any dead scalar code feeding the vectorized instructions
6589
6590#ifdef EXPENSIVE_CHECKS
6591 // If we could guarantee that this call is not extremely slow, we could
6592 // remove the ifdef limitation (see PR47712).
6593 assert(!verifyFunction(*F, &dbgs()));
6594#endif
6595}
6596
6597/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6598/// contains original mask for the scalars reused in the node. Procedure
6599/// transform this mask in accordance with the given \p Mask.
// In effect: for every index I with Mask[I] != PoisonMaskElem, the old value
// Reuses[I] is written to position Mask[I]; positions that are never written
// keep their old value.
// NOTE(review): the function signature is not visible in this listing
// (embedded line 6600 is missing) — confirm against the original file.
6601 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6602 "Expected non-empty mask.");
// Prev starts as a copy of Reuses, so the swap exchanges two vectors with
// equal contents; Prev then serves as the read-only snapshot below.
6603 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6604 Prev.swap(Reuses);
6605 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6606 if (Mask[I] != PoisonMaskElem)
6607 Reuses[Mask[I]] = Prev[I];
6608}
6609
6610/// Reorders the given \p Order according to the given \p Mask. \p Order - is
6611/// the original order of the scalars. Procedure transforms the provided order
6612/// in accordance with the given \p Mask. If the resulting \p Order is just an
6613/// identity order, \p Order is cleared.
// An empty \p Order on entry is treated as the identity order. Sz (the mask
// size) is used as the placeholder for "no value" inside \p Order.
// NOTE(review): the first line of the signature is not visible in this
// listing (embedded line 6614 is missing) — confirm against the original
// file.
6615 bool BottomOrder = false) {
6616 assert(!Mask.empty() && "Expected non-empty mask.");
6617 unsigned Sz = Mask.size();
6618 if (BottomOrder) {
// Bottom order: the new order is the previous order read through the mask,
// Order[I] = PrevOrder[Mask[I]].
6619 SmallVector<unsigned> PrevOrder;
6620 if (Order.empty()) {
6621 PrevOrder.resize(Sz);
6622 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6623 } else {
6624 PrevOrder.swap(Order);
6625 }
6626 Order.assign(Sz, Sz);
6627 for (unsigned I = 0; I < Sz; ++I)
6628 if (Mask[I] != PoisonMaskElem)
6629 Order[I] = PrevOrder[Mask[I]];
// Identity up to placeholder slots: report it as "no reordering".
6630 if (all_of(enumerate(Order), [&](const auto &Data) {
6631 return Data.value() == Sz || Data.index() == Data.value();
6632 })) {
6633 Order.clear();
6634 return;
6635 }
6636 fixupOrderingIndices(Order);
6637 return;
6638 }
// Default mode: turn the order into a shuffle mask, apply \p Mask to that
// mask via reorderReuses(), then convert the result back into an order.
6639 SmallVector<int> MaskOrder;
6640 if (Order.empty()) {
6641 MaskOrder.resize(Sz);
6642 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6643 } else {
6644 inversePermutation(Order, MaskOrder);
6645 }
6646 reorderReuses(MaskOrder, Mask);
6647 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6648 Order.clear();
6649 return;
6650 }
6651 Order.assign(Sz, Sz);
6652 for (unsigned I = 0; I < Sz; ++I)
6653 if (MaskOrder[I] != PoisonMaskElem)
6654 Order[MaskOrder[I]] = I;
6655 fixupOrderingIndices(Order);
6656}
6657
6658std::optional<BoUpSLP::OrdersType>
6659BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6660 bool TopToBottom, bool IgnoreReorder) {
6661 assert(TE.isGather() && "Expected gather node only.");
6662 // Try to find subvector extract/insert patterns and reorder only such
6663 // patterns.
6664 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6665 Type *ScalarTy = GatheredScalars.front()->getType();
6666 size_t NumScalars = GatheredScalars.size();
6667 if (!isValidElementType(ScalarTy))
6668 return std::nullopt;
6669 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6670 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, ScalarTy, NumScalars);
6671 SmallVector<int> ExtractMask;
6672 SmallVector<int> Mask;
6675 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6677 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6678 /*ForOrder=*/true);
6679 // No shuffled operands - ignore.
6680 if (GatherShuffles.empty() && ExtractShuffles.empty())
6681 return std::nullopt;
6682 OrdersType CurrentOrder(NumScalars, NumScalars);
6683 if (GatherShuffles.size() == 1 &&
6684 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6685 Entries.front().front()->isSame(TE.Scalars)) {
6686 // If the full matched node in whole tree rotation - no need to consider the
6687 // matching order, rotating the whole tree.
6688 if (TopToBottom)
6689 return std::nullopt;
6690 // No need to keep the order for the same user node.
6691 if (Entries.front().front()->UserTreeIndex.UserTE ==
6692 TE.UserTreeIndex.UserTE)
6693 return std::nullopt;
6694 // No need to keep the order for the matched root node, if it can be freely
6695 // reordered.
6696 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6697 return std::nullopt;
6698 // If shuffling 2 elements only and the matching node has reverse reuses -
6699 // no need to count order, both work fine.
6700 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6701 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6702 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6703 [](const auto &P) {
6704 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6705 }))
6706 return std::nullopt;
6707
6708 // Perfect match in the graph, will reuse the previously vectorized
6709 // node. Cost is 0.
6710 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6711 return CurrentOrder;
6712 }
6713 auto IsSplatMask = [](ArrayRef<int> Mask) {
6714 int SingleElt = PoisonMaskElem;
6715 return all_of(Mask, [&](int I) {
6716 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6717 SingleElt = I;
6718 return I == PoisonMaskElem || I == SingleElt;
6719 });
6720 };
6721 // Exclusive broadcast mask - ignore.
6722 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6723 (Entries.size() != 1 ||
6724 Entries.front().front()->ReorderIndices.empty())) ||
6725 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6726 return std::nullopt;
6727 SmallBitVector ShuffledSubMasks(NumParts);
6728 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6729 ArrayRef<int> Mask, int PartSz, int NumParts,
6730 function_ref<unsigned(unsigned)> GetVF) {
6731 for (int I : seq<int>(NumParts)) {
6732 if (ShuffledSubMasks.test(I))
6733 continue;
6734 const int VF = GetVF(I);
6735 if (VF == 0)
6736 continue;
6737 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6738 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6739 // Shuffle of at least 2 vectors - ignore.
6740 if (any_of(Slice, not_equal_to(NumScalars))) {
6741 llvm::fill(Slice, NumScalars);
6742 ShuffledSubMasks.set(I);
6743 continue;
6744 }
6745 // Try to include as many elements from the mask as possible.
6746 int FirstMin = INT_MAX;
6747 int SecondVecFound = false;
6748 for (int K : seq<int>(Limit)) {
6749 int Idx = Mask[I * PartSz + K];
6750 if (Idx == PoisonMaskElem) {
6751 Value *V = GatheredScalars[I * PartSz + K];
6752 if (isConstant(V) && !isa<PoisonValue>(V)) {
6753 SecondVecFound = true;
6754 break;
6755 }
6756 continue;
6757 }
6758 if (Idx < VF) {
6759 if (FirstMin > Idx)
6760 FirstMin = Idx;
6761 } else {
6762 SecondVecFound = true;
6763 break;
6764 }
6765 }
6766 FirstMin = (FirstMin / PartSz) * PartSz;
6767 // Shuffle of at least 2 vectors - ignore.
6768 if (SecondVecFound) {
6769 llvm::fill(Slice, NumScalars);
6770 ShuffledSubMasks.set(I);
6771 continue;
6772 }
6773 for (int K : seq<int>(Limit)) {
6774 int Idx = Mask[I * PartSz + K];
6775 if (Idx == PoisonMaskElem)
6776 continue;
6777 Idx -= FirstMin;
6778 if (Idx >= PartSz) {
6779 // Cross-part / second-vector reference: this slice cannot be
6780 // ordered as a single first-vector permutation, give up.
6781 SecondVecFound = true;
6782 break;
6783 }
6784 // For the last partial slice, Limit < PartSz and Idx in [Limit,
6785 // PartSz) addresses the unused padded tail (no scalar at that
6786 // position). Skip the write but keep ordering the remaining K's.
6787 if (static_cast<unsigned>(I * PartSz + Idx) >= CurrentOrder.size())
6788 continue;
6789 if (CurrentOrder[I * PartSz + Idx] >
6790 static_cast<unsigned>(I * PartSz + K) &&
6791 CurrentOrder[I * PartSz + Idx] !=
6792 static_cast<unsigned>(I * PartSz + Idx))
6793 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6794 }
6795 // Shuffle of at least 2 vectors - ignore.
6796 if (SecondVecFound) {
6797 llvm::fill(Slice, NumScalars);
6798 ShuffledSubMasks.set(I);
6799 continue;
6800 }
6801 }
6802 };
6803 int PartSz = getPartNumElems(NumScalars, NumParts);
6804 if (!ExtractShuffles.empty())
6805 TransformMaskToOrder(
6806 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6807 if (I >= ExtractShuffles.size() || !ExtractShuffles[I])
6808 return 0U;
6809 unsigned VF = 0;
6810 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6811 for (unsigned Idx : seq<unsigned>(Sz)) {
6812 int K = I * PartSz + Idx;
6813 if (static_cast<unsigned>(K) >= ExtractMask.size())
6814 break;
6815 if (ExtractMask[K] == PoisonMaskElem)
6816 continue;
6817 if (!TE.ReuseShuffleIndices.empty())
6818 K = TE.ReuseShuffleIndices[K];
6819 if (K == PoisonMaskElem)
6820 continue;
6821 if (!TE.ReorderIndices.empty())
6822 K = std::distance(TE.ReorderIndices.begin(),
6823 find(TE.ReorderIndices, K));
6824 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6825 if (!EI)
6826 continue;
6827 VF = std::max(VF, EI->getVectorOperandType()
6828 ->getElementCount()
6829 .getKnownMinValue());
6830 }
6831 return VF;
6832 });
6833 // Check special corner case - single shuffle of the same entry.
6834 if (GatherShuffles.size() == 1 && NumParts != 1) {
6835 if (ShuffledSubMasks.any())
6836 return std::nullopt;
6837 PartSz = NumScalars;
6838 NumParts = 1;
6839 }
6840 if (!Entries.empty())
6841 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6842 if (I >= GatherShuffles.size() || !GatherShuffles[I])
6843 return 0U;
6844 return std::max(Entries[I].front()->getVectorFactor(),
6845 Entries[I].back()->getVectorFactor());
6846 });
6847 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6848 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6849 return std::nullopt;
6850 return std::move(CurrentOrder);
6851}
6852
/// Checks whether two pointer operands have a compatible shape.
/// Past the early bail-out, each pointer must be either a non-GEP or a GEP
/// with exactly two operands, and at least one of the following must hold:
///  - every GEP index (operand 1) satisfies isConstant();
///  - \p CompareOpcodes is false;
///  - both pointers are GEPs and getSameOpcode() accepts their indices.
/// NOTE(review): the condition guarding the early `return false` is not
/// visible in this listing - confirm against the full source.
6853 static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6854 const TargetLibraryInfo &TLI,
6855 bool CompareOpcodes = true) {
6858 return false;
6859 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6860 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
// A pointer that is not a GEP passes the per-pointer checks trivially (the
// `!GEPn` terms below).
6861 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6862 (!GEP2 || GEP2->getNumOperands() == 2) &&
6863 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6864 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6865 !CompareOpcodes ||
6866 (GEP1 && GEP2 &&
6867 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6868 }
6869
6870 /// Calculates minimal alignment as a common alignment.
/// Every value in \p VL is cast to \p T and queried with getAlign(); the
/// smallest alignment found is returned.
/// The first element seeds the result, so \p VL is presumably expected to be
/// non-empty - TODO confirm against the callers.
6871 template <typename T>
// Seed with the first element, then fold in the remaining ones.
6873 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6874 for (Value *V : VL)
6875 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6876 return CommonAlignment;
6877 }
6878
6879 /// Check if \p Order represents reverse order.
/// An element at position I matches when it equals Order.size() - I - 1.
/// Elements equal to Order.size() are accepted at any position.
/// \p Order must not be empty (asserted).
6881 assert(!Order.empty() &&
6882 "Order is empty. Please check it before using isReverseOrder.");
6883 unsigned Sz = Order.size();
6884 return all_of(enumerate(Order), [&](const auto &Pair) {
6885 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6886 });
6887 }
6888
6889 /// Checks if the provided list of pointers \p Pointers represents the strided
6890 /// pointers for type ElemTy. If they are not, nullptr is returned.
6891 /// Otherwise, SCEV* of the stride value is returned.
6892 /// If `PointerOps` can be rearranged into the following sequence:
6893 /// ```
6894 /// %x + c_0 * stride,
6895 /// %x + c_1 * stride,
6896 /// %x + c_2 * stride
6897 /// ...
6898 /// ```
6899 /// where each `c_i` is constant. The `Coeffs` will contain `c_0, c_1, c_2, ..`
6900 /// and the SCEV of the `stride` will be returned.
/// \param SortedIndices cleared on success; filled with the indices of
/// \p PointerOps sorted by offset only when the pointers are not already in
/// consecutive order.
/// \param Coeffs must already have PointerOps.size() elements (asserted);
/// entries are written as pointers are processed, so it may be partially
/// updated when nullptr is returned.
/// A stride that folds to a SCEVConstant is rejected (nullptr is returned).
6901 static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6902 const DataLayout &DL, ScalarEvolution &SE,
6903 SmallVectorImpl<unsigned> &SortedIndices,
6904 SmallVectorImpl<int64_t> &Coeffs) {
6905 assert(Coeffs.size() == PointerOps.size() &&
6906 "Coeffs vector needs to be of correct size");
6908 const SCEV *PtrSCEVLowest = nullptr;
6909 const SCEV *PtrSCEVHighest = nullptr;
6910 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6911 // addresses).
6912 for (Value *Ptr : PointerOps) {
6913 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6914 if (!PtrSCEV)
6915 return nullptr;
6916 SCEVs.push_back(PtrSCEV);
// The first pointer seeds both bounds.
6917 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6918 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6919 continue;
6920 }
6921 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6922 if (isa<SCEVCouldNotCompute>(Diff))
6923 return nullptr;
// PtrSCEV - Lowest is negative: PtrSCEV becomes the new lowest pointer.
6924 if (Diff->isNonConstantNegative()) {
6925 PtrSCEVLowest = PtrSCEV;
6926 continue;
6927 }
6928 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6929 if (isa<SCEVCouldNotCompute>(Diff1))
6930 return nullptr;
// Highest - PtrSCEV is negative: PtrSCEV becomes the new highest pointer.
6931 if (Diff1->isNonConstantNegative()) {
6932 PtrSCEVHighest = PtrSCEV;
6933 continue;
6934 }
6935 }
6936 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6937 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6938 if (isa<SCEVCouldNotCompute>(Dist))
6939 return nullptr;
6940 int Size = DL.getTypeStoreSize(ElemTy);
// Computes Dist / Multiplier symbolically:
//  - for a multiplication, returns the other operand when one of its first
//    two operands is exactly Multiplier, nullptr otherwise;
//  - returns the constant 1 when Dist is Multiplier itself;
//  - otherwise falls back to an exact unsigned division.
6941 auto TryGetStride = [&](const SCEV *Dist,
6942 const SCEV *Multiplier) -> const SCEV * {
6943 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6944 if (M->getOperand(0) == Multiplier)
6945 return M->getOperand(1);
6946 if (M->getOperand(1) == Multiplier)
6947 return M->getOperand(0);
6948 return nullptr;
6949 }
6950 if (Multiplier == Dist)
6951 return SE.getConstant(Dist->getType(), 1);
6952 return SE.getUDivExactExpr(Dist, Multiplier);
6953 };
6954 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6955 const SCEV *Stride = nullptr;
6956 if (Size != 1 || SCEVs.size() > 1) {
6957 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6958 Stride = TryGetStride(Dist, Sz);
6959 if (!Stride)
6960 return nullptr;
6961 }
// Only a non-constant (runtime) stride is accepted here.
6962 if (!Stride || isa<SCEVConstant>(Stride))
6963 return nullptr;
6964 // Iterate through all pointers and check if all distances are
6965 // unique multiple of Stride.
6966 using DistOrdPair = std::pair<int64_t, int>;
// Ordered by the distance only (less_first), so a repeated distance is
// rejected by emplace() below.
6967 auto Compare = llvm::less_first();
6968 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6969 int Cnt = 0;
6970 bool IsConsecutive = true;
6971 for (const auto [Idx, PtrSCEV] : enumerate(SCEVs)) {
6972 unsigned Dist = 0;
6973 if (PtrSCEV != PtrSCEVLowest) {
6974 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6975 const SCEV *Coeff = TryGetStride(Diff, Stride);
6976 if (!Coeff)
6977 return nullptr;
6978 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6979 if (!SC || isa<SCEVCouldNotCompute>(SC))
6980 return nullptr;
6981 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
// Verify that PtrSCEV == Lowest + Stride * Coeff holds exactly.
6982 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6983 SE.getMulExpr(Stride, SC)))
6984 ->isZero())
6985 return nullptr;
6986 Dist = SC->getAPInt().getZExtValue();
6987 } else {
// The lowest pointer is the base of the sequence.
6988 Coeffs[Idx] = 0;
6989 }
6990 // If the strides are not the same or repeated, we can't vectorize.
6991 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6992 return nullptr;
6993 auto Res = Offsets.emplace(Dist, Cnt);
6994 if (!Res.second)
6995 return nullptr;
6996 // Consecutive order if the inserted element is the last one.
6997 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6998 ++Cnt;
6999 }
7000 if (Offsets.size() != SCEVs.size())
7001 return nullptr;
7002 SortedIndices.clear();
7003 if (!IsConsecutive) {
7004 // Fill SortedIndices array only if it is non-consecutive.
7005 SortedIndices.resize(PointerOps.size());
7006 Cnt = 0;
7007 for (const std::pair<int64_t, int> &Pair : Offsets) {
7008 SortedIndices[Cnt] = Pair.second;
7009 ++Cnt;
7010 }
7011 }
7012 return Stride;
7013 }
7014
7015static std::pair<InstructionCost, InstructionCost>
7017 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
7018 Type *ScalarTy, VectorType *VecTy);
7019
7020 /// Returns the cost of the shuffle instructions with the given \p Kind, vector
7021 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
7022 /// subvector pattern.
/// When \p Mask is non-empty, the destination type has Mask.size() elements of
/// the scalar type of \p Tp; otherwise the destination type is \p Tp.
/// Every kind other than TTI::SK_PermuteTwoSrc is forwarded unchanged to
/// TTI.getShuffleCost().
7023 static InstructionCost
7025 VectorType *Tp, ArrayRef<int> Mask = {},
7027 int Index = 0, VectorType *SubTp = nullptr,
7029 VectorType *DstTy = Tp;
7030 if (!Mask.empty())
7031 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
7032
7033 if (Kind != TTI::SK_PermuteTwoSrc)
7034 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
7035 Args);
7036 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7037 int NumSubElts;
7039 Mask, NumSrcElts, NumSubElts, Index)) {
// The inserted subvector runs past the end of the source vector but a
// whole source-sized vector placed at Index still fits into the mask: cost
// it as a subvector insert.
// NOTE(review): this call passes TTI::TCK_RecipThroughput rather than
// CostKind - confirm that ignoring the requested cost kind is intentional.
7040 if (Index + NumSubElts > NumSrcElts &&
7041 Index + NumSrcElts <= static_cast<int>(Mask.size()))
7042 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
7043 TTI::TCK_RecipThroughput, Index, Tp);
7044 }
7045 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
7046 Args);
7047 }
7048
7049 /// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
7050 /// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
7051 /// instead of a scalar.
/// \p DemandedElts has one bit per \p ScalarTy-sized element of \p Ty
/// (asserted: getNumElements(ScalarTy) * bit width == getNumElements(Ty)).
/// For a scalar \p ScalarTy the query is forwarded to
/// TTI.getScalarizationOverhead().
7053 const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty,
7054 const APInt &DemandedElts, bool Insert, bool Extract,
7055 TTI::TargetCostKind CostKind, bool ForPoisonSrc = true,
7056 ArrayRef<Value *> VL = {},
7059 "ScalableVectorType is not supported.");
7060 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
7061 getNumElements(Ty) &&
7062 "Incorrect usage.");
7063 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
7064 assert(SLPReVec && "Only supported by REVEC.");
7065 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
7066 // of CreateInsertElement.
7067 unsigned ScalarTyNumElements = VecTy->getNumElements();
7068 InstructionCost Cost = 0;
// Accumulate a per-element cost for every demanded element; element I starts
// at index I * ScalarTyNumElements of the wide vector.
// NOTE(review): the heads of the two cost-accumulating calls below are not
// visible in this listing - confirm which cost query they use.
7069 for (unsigned I : seq(DemandedElts.getBitWidth())) {
7070 if (!DemandedElts[I])
7071 continue;
7072 if (Insert)
7074 I * ScalarTyNumElements, VecTy);
7075 if (Extract)
7077 I * ScalarTyNumElements, VecTy);
7078 }
7079 return Cost;
7080 }
7081 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
7082 CostKind, ForPoisonSrc, VL, VIC);
7083 }
7084
7085 /// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
7086 /// is a FixedVectorType, a vector will be extracted instead of a scalar.
/// The special handling applies only to Instruction::ExtractElement; any other
/// opcode, or a non-vector \p ScalarTy, is forwarded to
/// TTI.getVectorInstrCost().
7088 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
7089 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
7090 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
7091 if (Opcode == Instruction::ExtractElement) {
7092 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
7093 assert(SLPReVec && "Only supported by REVEC.");
7094 assert(isa<VectorType>(Val) && "Val must be a vector type.");
// The extracted subvector starts at Index * VecTy->getNumElements() in Val.
// NOTE(review): the head of this call is not visible in this listing.
7096 cast<VectorType>(Val), {}, CostKind,
7097 Index * VecTy->getNumElements(), VecTy);
7098 }
7099 }
7100 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
7101 ScalarUserAndIdx);
7102 }
7103
7104 /// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
7105 /// is a FixedVectorType, a vector will be extracted instead of a scalar.
/// In the vector case the result is the sum of a subvector cost at index
/// Index * (number of elements of Dst) and the cost of casting that subvector
/// (element type of \p VecTy) to \p Dst with \p Opcode.
7107 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
7108 VectorType *VecTy, unsigned Index,
7110 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
7111 assert(SLPReVec && "Only supported by REVEC.");
// Subvector of the source element type with as many elements as Dst.
7112 auto *SubTp =
7113 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
// NOTE(review): the head of the first cost term is not visible in this
// listing - confirm which cost query it uses.
7115 Index * ScalarTy->getNumElements(), SubTp) +
7116 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
7117 CostKind);
7118 }
7119 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
7120 }
7121
7122 /// Creates subvector insert. Generates shuffle using \p Generator or
7123 /// using default shuffle.
/// \p V is placed into \p Vec starting at element \p Index.
/// If both \p Vec and \p V are poison, \p Vec is returned unchanged. If only
/// \p Vec is poison, a single shuffle of \p V is emitted and \p Generator is
/// not consulted.
7125 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
7126 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
7127 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
7128 return Vec;
7129 const unsigned SubVecVF = getNumElements(V->getType());
7130 // Create shuffle, insertvector requires that index is multiple of
7131 // the subvector length.
7132 const unsigned VecVF = getNumElements(Vec->getType());
// NOTE(review): the declaration of `Mask` is not visible in this listing;
// the code below assumes it has at least Index + SubVecVF elements.
7134 if (isa<PoisonValue>(Vec)) {
// Only lanes [Index, Index + SubVecVF) are set; they select elements
// 0..SubVecVF-1 of V.
7135 auto *Begin = std::next(Mask.begin(), Index);
7136 std::iota(Begin, std::next(Begin, SubVecVF), 0);
7137 Vec = Builder.CreateShuffleVector(V, Mask);
7138 return Vec;
7139 }
// Identity mask over Vec, with lanes [Index, Index + SubVecVF) redirected to
// the second shuffle operand (indices starting at VecVF).
7140 std::iota(Mask.begin(), Mask.end(), 0);
7141 std::iota(std::next(Mask.begin(), Index),
7142 std::next(Mask.begin(), Index + SubVecVF), VecVF);
7143 if (Generator)
7144 return Generator(Vec, V, Mask);
7145 // 1. Resize V to the size of Vec.
7146 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
7147 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
7148 V = Builder.CreateShuffleVector(V, ResizeMask);
7149 // 2. Insert V into Vec.
7150 return Builder.CreateShuffleVector(Vec, V, Mask);
7151 }
7152
7153 /// Generates subvector extract using \p Generator or using default shuffle.
/// The emitted shuffle selects \p SubVecVF consecutive elements of the source
/// vector starting at \p Index.
/// NOTE(review): the visible body never uses a Generator and always emits the
/// default shuffle - the first line of this comment may be stale.
7155 unsigned SubVecVF, unsigned Index) {
// Mask = {Index, Index + 1, ..., Index + SubVecVF - 1}.
7156 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
7157 std::iota(Mask.begin(), Mask.end(), Index);
7158 return Builder.CreateShuffleVector(Vec, Mask);
7159 }
7160
7161 /// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
7162 /// with \p Order.
/// CompressMask[I] receives the distance, in \p ScalarTy elements, from the
/// first ordered pointer to the I-th ordered pointer. An empty \p Order means
/// the pointers are used in their original order.
/// If a distance cannot be computed or does not fit into `unsigned`, false is
/// returned and \p CompressMask is left partially filled.
7163 /// \return true if the mask represents strided access, false - otherwise.
7165 ArrayRef<unsigned> Order, Type *ScalarTy,
7166 const DataLayout &DL, ScalarEvolution &SE,
7167 SmallVectorImpl<int> &CompressMask) {
7168 const unsigned Sz = PointerOps.size();
7169 CompressMask.assign(Sz, PoisonMaskElem);
7170 // The first element always set.
7171 CompressMask[0] = 0;
7172 // Check if the mask represents strided access.
// 0 means "stride not yet known"; the optional is reset as soon as a position
// breaks the Pos == Stride * I pattern.
7173 std::optional<unsigned> Stride = 0;
7174 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
7175 for (unsigned I : seq<unsigned>(1, Sz)) {
7176 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
7177 std::optional<int64_t> OptPos =
7178 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
7179 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
7180 return false;
7181 unsigned Pos = static_cast<unsigned>(*OptPos);
7182 CompressMask[I] = Pos;
// Already known to be non-strided: keep filling the mask only.
7183 if (!Stride)
7184 continue;
// The first non-zero position defines the candidate stride.
7185 if (*Stride == 0) {
7186 *Stride = Pos;
7187 continue;
7188 }
7189 if (Pos != *Stride * I)
7190 Stride.reset();
7191 }
7192 return Stride.has_value();
7193 }
7194
7195 /// Checks if the \p VL can be transformed to a (masked)load + compress or
7196 /// (masked) interleaved load.
/// The decision is cost based: the vector form (GEPs + wide load + compress
/// shuffle, or an interleaved load) must be strictly cheaper than gathering
/// the scalar loads.
/// Output parameters:
///  - \p InterleaveFactor is reset to 0 on entry and set to CompressMask[1]
///    only when the interleaved-load form is selected.
///  - \p IsMasked is set when the wide load is not safe to execute
///    unconditionally.
///  - \p CompressMask receives the element positions inside the wide load; it
///    is remapped through the inverse of \p Order when \p Order is non-empty.
///  - \p LoadVecTy receives the type of the wide load.
/// \p IsMasked, \p CompressMask and \p LoadVecTy are not written on the
/// earliest `return false` paths, so callers should only read them when true
/// is returned.
7198 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
7201 const DominatorTree &DT, const TargetLibraryInfo &TLI,
7202 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
7203 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
7204 VectorType *&LoadVecTy) {
7205 InterleaveFactor = 0;
7206 Type *ScalarTy = VL.front()->getType();
7207 const size_t Sz = VL.size();
7208 auto *VecTy = getWidenedType(ScalarTy, Sz);
7210 SmallVector<int> Mask;
7211 if (!Order.empty())
7212 inversePermutation(Order, Mask);
7213 // Check external uses.
// Give up when, for a value with non-vectorized users, extracting it from the
// vector is not more expensive than the scalar instruction itself.
7214 for (const auto [I, V] : enumerate(VL)) {
7215 if (AreAllUsersVectorized(V))
7216 continue;
7217 InstructionCost ExtractCost =
7218 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
7219 Mask.empty() ? I : Mask[I]);
7220 InstructionCost ScalarCost =
7221 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
7222 if (ExtractCost <= ScalarCost)
7223 return false;
7224 }
// First and last pointers in the (optionally reordered) sequence.
7225 Value *Ptr0;
7226 Value *PtrN;
7227 if (Order.empty()) {
7228 Ptr0 = PointerOps.front();
7229 PtrN = PointerOps.back();
7230 } else {
7231 Ptr0 = PointerOps[Order.front()];
7232 PtrN = PointerOps[Order.back()];
7233 }
7234 std::optional<int64_t> Diff =
7235 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
7236 if (!Diff)
7237 return false;
7238 const size_t MaxRegSize =
7240 .getFixedValue();
7241 // Check for very large distances between elements.
7242 if (*Diff / Sz >= MaxRegSize / 8)
7243 return false;
// The wide load covers every element from Ptr0 up to and including PtrN.
7244 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
7245 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
7246 Align CommonAlignment = LI->getAlign();
7247 IsMasked = !isSafeToLoadUnconditionally(
7248 Ptr0, LoadVecTy, CommonAlignment, DL,
7249 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
7250 &TLI);
7251 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
7252 LI->getPointerAddressSpace()))
7253 return false;
7254 // TODO: perform the analysis of each scalar load for better
7255 // safe-load-unconditionally analysis.
7256 bool IsStrided =
7257 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
7258 assert(CompressMask.size() >= 2 && "At least two elements are required");
7259 SmallVector<Value *> OrderedPointerOps(PointerOps);
7260 if (!Order.empty())
7261 reorderScalars(OrderedPointerOps, Mask);
7262 auto [ScalarGEPCost, VectorGEPCost] =
7263 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
7264 Instruction::Load, CostKind, ScalarTy, LoadVecTy);
7265 // The cost of scalar loads.
7266 InstructionCost ScalarLoadsCost =
7268 [&](InstructionCost C, Value *V) {
7269 return C + TTI.getInstructionCost(cast<Instruction>(V),
7270 CostKind);
7271 }) +
7272 ScalarGEPCost;
// Baseline: scalar loads plus the cost of inserting each of them into a
// vector.
7273 APInt DemandedElts = APInt::getAllOnes(Sz);
7274 InstructionCost GatherCost =
7275 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7276 /*Insert=*/true,
7277 /*Extract=*/false, CostKind) +
7278 ScalarLoadsCost;
7279 InstructionCost LoadCost = 0;
7280 if (IsMasked) {
7281 LoadCost = TTI.getMemIntrinsicInstrCost(
7282 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
7283 CommonAlignment,
7284 LI->getPointerAddressSpace()),
7285 CostKind);
7286 } else {
7287 LoadCost =
7288 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
7289 LI->getPointerAddressSpace(), CostKind);
7290 }
7291 if (IsStrided && !IsMasked && Order.empty()) {
7292 // Check for potential segmented(interleaved) loads.
// Prefer a load rounded up to a full vector; fall back to the exact-size
// load when the rounded one is not safe to execute unconditionally.
7293 VectorType *AlignedLoadVecTy = getWidenedType(
7294 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
7295 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
7296 DL, cast<LoadInst>(VL.back()), &AC, &DT,
7297 &TLI))
7298 AlignedLoadVecTy = LoadVecTy;
// CompressMask[1] is the distance between the first two elements, i.e. the
// stride of the access, and is used as the interleave factor.
7299 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
7300 CommonAlignment,
7301 LI->getPointerAddressSpace())) {
7302 InstructionCost InterleavedCost =
7303 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
7304 Instruction::Load, AlignedLoadVecTy,
7305 CompressMask[1], {}, CommonAlignment,
7306 LI->getPointerAddressSpace(), CostKind, IsMasked);
7307 if (InterleavedCost < GatherCost) {
7308 InterleaveFactor = CompressMask[1];
7309 LoadVecTy = AlignedLoadVecTy;
7310 return true;
7311 }
7312 }
7313 }
7314 InstructionCost CompressCost = ::getShuffleCost(
7315 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
// Translate the mask from sorted order back to the original order of VL.
7316 if (!Order.empty()) {
7317 SmallVector<int> NewMask(Sz, PoisonMaskElem);
7318 for (unsigned I : seq<unsigned>(Sz)) {
7319 NewMask[I] = CompressMask[Mask[I]];
7320 }
7321 CompressMask.swap(NewMask);
7322 }
7323 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7324 return TotalVecCost < GatherCost;
7325 }
7326
7327 /// Checks if the \p VL can be transformed to a (masked)load + compress or
7328 /// (masked) interleaved load.
/// Convenience overload: forwards to the variant with output parameters and
/// discards everything except the boolean result.
7329 static bool
7332 const DataLayout &DL, ScalarEvolution &SE,
7333 AssumptionCache &AC, const DominatorTree &DT,
7334 const TargetLibraryInfo &TLI,
7335 const function_ref<bool(Value *)> AreAllUsersVectorized) {
// Throw-away storage for the outputs of the full variant.
7336 bool IsMasked;
7337 unsigned InterleaveFactor;
7338 SmallVector<int> CompressMask;
7339 VectorType *LoadVecTy;
7340 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
7341 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7342 CompressMask, LoadVecTy);
7343 }
7344
7345 /// Checks if strided loads can be generated out of \p VL loads with pointers \p
7346 /// PointerOps:
7347 /// 1. Target with strided load support is detected.
7348 /// 2. The number of loads is greater than MinProfitableStridedLoads, or the
7349 /// potential stride <= MaxProfitableStride and the potential stride is
7350 /// power-of-2 (to avoid perf regressions for the very small number of loads)
7351 /// and max distance > number of loads, or potential stride is -1.
7352 /// 3. The loads are ordered, or number of unordered loads <=
7353 /// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
7354 /// to avoid extra costs for very expensive shuffles).
7355 /// 4. Any pointer operand is an instruction with the users outside of the
7356 /// current graph (for masked gathers extra extractelement instructions
7357 /// might be required).
/// \p Diff is the distance passed in by the caller and \p Sz the number of
/// elements; \p Diff must be an exact multiple of (Sz - 1).
/// NOTE(review): both the modulo and the division below use (Sz - 1), so
/// callers presumably guarantee Sz >= 2 - confirm.
7359 Align Alignment, const int64_t Diff,
7360 const size_t Sz) const {
// NOTE(review): Diff is signed while Sz is size_t, so this modulo mixes
// signedness; the exact divisibility re-check further down is done in signed
// arithmetic.
7361 if (Diff % (Sz - 1) != 0)
7362 return false;
7363
7364 // Try to generate strided load node.
// True if some pointer operand is an instruction with a user that is neither
// vectorized nor in MustGather.
7365 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
7366 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
7367 return !isVectorized(U) && !MustGather.contains(U);
7368 });
7369 });
7370
7371 const uint64_t AbsoluteDiff = std::abs(Diff);
7372 auto *VecTy = getWidenedType(ScalarTy, Sz);
7373 if (IsAnyPointerUsedOutGraph ||
7374 (AbsoluteDiff > Sz &&
7376 (AbsoluteDiff <= MaxProfitableStride * Sz && AbsoluteDiff % Sz == 0 &&
7377 has_single_bit(AbsoluteDiff / Sz)))) ||
7378 Diff == -(static_cast<int64_t>(Sz) - 1)) {
// Signed re-check that Diff is an exact multiple of (Sz - 1).
7379 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7380 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7381 return false;
7382 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7383 return false;
7384 return true;
7385 }
7386 return false;
7387 }
7388
// Checks whether \p PointerOps, visited in \p SortedIndices order, form a
// constant-stride access pattern, optionally made of equally sized groups of
// consecutive elements. On success fills \p SPtrInfo with the stride
// (SPtrInfo.StrideVal, a signed constant of the index type of \p Ptr0) and the
// vector type of the strided load (SPtrInfo.Ty) and returns true. When groups
// are detected, each group is treated as one wider integer element.
// NOTE(review): SortedOffsetsFromBase[1] is read unconditionally, so callers
// presumably pass at least two pointers - confirm.
7390 const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
7391 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
7392 Value *Ptr0, StridedPtrInfo &SPtrInfo) const {
7393 const size_t Sz = PointerOps.size();
7394 SmallVector<int64_t> SortedOffsetsFromBase(Sz);
7395 // Go through `PointerOps` in sorted order and record offsets from
7396 // PointerOps[0]. We use PointerOps[0] rather than Ptr0 because
7397 // sortPtrAccesses only validates getPointersDiff for pairs relative to
7398 // PointerOps[0]. This is safe since only offset differences are used below.
7399 for (unsigned I : seq<unsigned>(Sz)) {
7400 Value *Ptr =
7401 SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
7402 std::optional<int64_t> Offset =
7403 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, Ptr, *DL, *SE);
7404 assert(Offset && "sortPtrAccesses should have validated this pointer");
7405 SortedOffsetsFromBase[I] = *Offset;
7406 }
7407
7408 // The code below checks that `SortedOffsetsFromBase` looks as follows:
7409 // ```
7410 // [
7411 // (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
7412 // (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
7413 // ...
7414 // (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
7415 // GroupSize - 1}), // last group
7416 // ]
7417 // ```
7418 // The distance between consecutive elements within each group should all be
7419 // the same `StrideWithinGroup`. The distance between the first elements of
7420 // consecutive groups should all be the same `StrideBetweenGroups`.
7421
7422 int64_t StrideWithinGroup =
7423 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7424 // Determine size of the first group. Later we will check that all other
7425 // groups have the same size.
// An index ends a group when its distance to the previous element differs
// from StrideWithinGroup.
7426 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
7427 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7428 StrideWithinGroup;
7429 };
7430 auto Indices = seq<unsigned>(1, Sz);
7431 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7432 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7433
7434 unsigned VecSz = Sz;
7435 Type *NewScalarTy = ScalarTy;
7436
7437 // Quick detour: at this point we can say what the type of strided load would
7438 // be if all the checks pass. Check if this type is legal for the target.
7439 bool NeedsWidening = Sz != GroupSize;
7440 const uint64_t UnitBitWidth = DL->getTypeSizeInBits(ScalarTy).getFixedValue();
7441 if (NeedsWidening) {
7442 if (Sz % GroupSize != 0)
7443 return false;
7444
// Only groups of consecutive elements can be merged into one wider element.
7445 if (StrideWithinGroup != 1)
7446 return false;
7447 VecSz = Sz / GroupSize;
7448 NewScalarTy = Type::getIntNTy(SE->getContext(), UnitBitWidth * GroupSize);
7449 } else if (ScalarTy->isVectorTy()) {
// A vector "scalar" is represented by an integer of the same bit width.
7450 NewScalarTy = Type::getIntNTy(SE->getContext(), UnitBitWidth);
7451 }
7452
7453 if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7454 return false;
7455
7456 int64_t StrideIntVal = StrideWithinGroup;
7457 if (NeedsWidening) {
7458 // Continue with checking the "shape" of `SortedOffsetsFromBase`.
7459 // Check that the strides between groups are all the same.
7460 unsigned CurrentGroupStartIdx = GroupSize;
7461 int64_t StrideBetweenGroups =
7462 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7463 StrideIntVal = StrideBetweenGroups;
7464 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7465 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7466 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7467 StrideBetweenGroups)
7468 return false;
7469 }
7470
// True if the group starting at StartIdx contains exactly GroupSize elements.
7471 auto CheckGroup = [=](const unsigned StartIdx) -> bool {
7472 auto Indices = seq<unsigned>(StartIdx + 1, Sz);
7473 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7474 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7475 return GroupEndIdx - StartIdx == GroupSize;
7476 };
7477 for (unsigned I = 0; I < Sz; I += GroupSize) {
7478 if (!CheckGroup(I))
7479 return false;
7480 }
7481 }
7482
7483 Type *StrideTy = DL->getIndexType(Ptr0->getType());
7484 SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal);
7485 SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
7486 return true;
7487 }
7488
7490 Type *BaseTy, Align CommonAlignment,
7491 SmallVectorImpl<unsigned> &SortedIndices,
7492 StridedPtrInfo &SPtrInfo,
7493 bool IsLoad) const {
7494 // If each value in `PointerOps` is of the form `%x + Offset` where `Offset`
7495 // is constant, we partition `PointerOps` sequence into subsequences of
7496 // pointers with the same offset. For each offset we record values from
7497 // `PointerOps` and their indices in `PointerOps`.
7499 OffsetToPointerOpIdxMap;
7500 for (auto [Idx, Ptr] : enumerate(PointerOps)) {
7501 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7502 if (!PtrSCEV)
7503 return false;
7504
7505 const auto *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
7506 int64_t Offset = 0;
7507 if (Add) {
7508 // `Offset` is non-zero.
7509 for (int I : seq<int>(Add->getNumOperands())) {
7510 const auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
7511 if (!SC)
7512 continue;
7513 Offset = SC->getAPInt().getSExtValue();
7514 if (Offset >= std::numeric_limits<int64_t>::max() - 1) {
7515 Offset = 0;
7516 continue;
7517 }
7518 break;
7519 }
7520 }
7521 OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
7522 OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
7523 }
7524 unsigned NumOffsets = OffsetToPointerOpIdxMap.size();
7525
7526 // Quick detour: at this point we can say what the type of strided load would
7527 // be if all the checks pass. Check if this type is legal for the target.
7528 const unsigned Sz = PointerOps.size();
7529 unsigned VecSz = Sz;
7530 Type *NewScalarTy = BaseTy;
7531 if (NumOffsets > 1) {
7532 if (Sz % NumOffsets != 0)
7533 return false;
7534 VecSz = Sz / NumOffsets;
7535 }
7536 if (NumOffsets > 1 || BaseTy->isVectorTy())
7537 NewScalarTy = Type::getIntNTy(
7538 SE->getContext(),
7539 DL->getTypeSizeInBits(BaseTy).getFixedValue() * NumOffsets);
7540 FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
7541 unsigned MinProfitableStridedOps =
7543 const unsigned BaseTyNumElts = getNumElements(BaseTy);
7544 if (Sz * BaseTyNumElts < MinProfitableStridedOps ||
7545 !TTI->isTypeLegal(StridedLoadTy) ||
7546 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7547 return false;
7548
7549 // Check if the offsets are contiguous and that each group has the required
7550 // size.
7551 SmallVector<int64_t> SortedOffsetsV(NumOffsets);
7552 for (auto [Idx, MapPair] : enumerate(OffsetToPointerOpIdxMap)) {
7553 if (MapPair.second.first.size() != VecSz)
7554 return false;
7555 SortedOffsetsV[Idx] = MapPair.first;
7556 }
7557 sort(SortedOffsetsV);
7558
7559 if (NumOffsets > 1) {
7560 int64_t BaseBytes = DL->getTypeStoreSize(BaseTy);
7561 for (int I : seq<int>(1, SortedOffsetsV.size())) {
7562 if (SortedOffsetsV[I] - SortedOffsetsV[I - 1] != BaseBytes)
7563 return false;
7564 }
7565 }
7566
7567 // Introduce some notation for the explanations below. Let `PointerOps_j`
7568 // denote the subsequence of `PointerOps` with offsets equal to
7569 // `SortedOffsetsV[j]`. Let `SortedIndices_j` be such that the sequence
7570 // ```
7571 // PointerOps_j[SortedIndices_j[0]],
7572 // PointerOps_j[SortedIndices_j[1]],
7573 // PointerOps_j[SortedIndices_j[2]],
7574 // ...
7575 // ```
7576 // is sorted. Also, let `IndicesInAllPointerOps_j` be the vector
7577 // of indices of the subsequence `PointerOps_j` in all of `PointerOps`,
7578 // i.e `PointerOps_j[i] = PointerOps[IndicesInAllPointerOps_j[i]]`.
7579 // The entire sorted `PointerOps` looks like this:
7580 // ```
7581 // PointerOps_0[SortedIndices_0[0]] = PointerOps[IndicesInAllPointerOps_0[0]],
7582 // PointerOps_1[SortedIndices_1[0]] = PointerOps[IndicesInAllPointerOps_1[0]],
7583 // PointerOps_2[SortedIndices_2[0]] = PointerOps[IndicesInAllPointerOps_2[0]],
7584 // ...
7585 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[0]] =
7586 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[0]],
7587 //
7588 // PointerOps_0[SortedIndices_0[1]] = PointerOps[IndicesInAllPointerOps_0[1]],
7589 // PointerOps_1[SortedIndices_1[1]] = PointerOps[IndicesInAllPointerOps_1[1]],
7590 // PointerOps_2[SortedIndices_2[1]] = PointerOps[IndicesInAllPointerOps_2[1]],
7591 // ...
7592 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[1]] =
7593 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[1]],
7594 //
7595 // PointerOps_0[SortedIndices_0[2]] = PointerOps[IndicesInAllPointerOps_0[2]],
7596 // PointerOps_1[SortedIndices_1[2]] = PointerOps[IndicesInAllPointerOps_1[2]],
7597 // PointerOps_2[SortedIndices_2[2]] = PointerOps[IndicesInAllPointerOps_2[2]],
7598 // ...
7599 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[2]] =
7600 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[2]],
7601 // ...
7602 // ...
7603 // ...
7604 // PointerOps_0[SortedIndices_0[VecSz - 1]] =
7605 // PointerOps[IndicesInAllPointerOps_0[VecSz - 1]],
7606 // PointerOps_1[SortedIndices_1[VecSz - 1]] =
7607 // PointerOps[IndicesInAllPointerOps_1[VecSz - 1]],
7608 // PointerOps_2[SortedIndices_2[VecSz - 1]] =
7609 // PointerOps[IndicesInAllPointerOps_2[VecSz - 1]],
7610 // ...
7611 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[VecSz - 1]] =
7612 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[VecSz - 1]],
7613 // ```
7614 // In order to be able to generate a strided load, we need the following
7615 // checks to pass:
7616 //
7617 // (1) for each `PointerOps_j` check that the distance
7618 // between adjacent pointers are all equal to the same value (stride).
7619 // (2) for each `PointerOps_j` check that coefficients calculated by
7620 // `calculateRtStride` are all the same.
7621 //
7622 // As we do that, also calculate SortedIndices. Since we should not modify
7623 // `SortedIndices` unless we know that all the checks succeed, record the
7624 // indices into `SortedIndicesDraft`.
7625 SmallVector<unsigned> SortedIndicesDraft(Sz);
7626
7627 // Given sorted indices for a particular offset (as calculated by
7628 // calculateRtStride), update the `SortedIndicesDraft` for all of PointerOps.
7629 // Let `Offset` be `SortedOffsetsV[OffsetNum]`.
7630 // \param `OffsetNum` the index of `Offset` in `SortedOffsetsV`.
7631 // \param `IndicesInAllPointerOps` vector of indices of the
7632 // subsequence `PointerOps_OffsetNum` in `PointerOps`, i.e. using the above
7633 // notation `IndicesInAllPointerOps = IndicesInAllPointerOps_OffsetNum`.
7634 // \param `SortedIndicesForOffset = SortedIndices_OffsetNum`
7635 auto UpdateSortedIndices =
7636 [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
7637 ArrayRef<unsigned> IndicesInAllPointerOps, const int64_t OffsetNum) {
7638 if (SortedIndicesForOffset.empty()) {
7639 SortedIndicesForOffset.resize(IndicesInAllPointerOps.size());
7640 std::iota(SortedIndicesForOffset.begin(),
7641 SortedIndicesForOffset.end(), 0);
7642 }
7643 for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
7644 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7645 IndicesInAllPointerOps[Idx];
7646 }
7647 };
7648
7649 int64_t LowestOffset = SortedOffsetsV[0];
7650 ArrayRef<Value *> PointerOps0 = OffsetToPointerOpIdxMap[LowestOffset].first;
7651
7652 SmallVector<int64_t> Coeffs0(VecSz);
7653 SmallVector<unsigned> SortedIndicesForOffset0;
7654 const SCEV *Stride0 = calculateRtStride(PointerOps0, BaseTy, *DL, *SE,
7655 SortedIndicesForOffset0, Coeffs0);
7656 if (!Stride0)
7657 return false;
7658 unsigned NumCoeffs0 = Coeffs0.size();
7659 if (NumCoeffs0 * NumOffsets != Sz)
7660 return false;
7661 sort(Coeffs0);
7662
7663 ArrayRef<unsigned> IndicesInAllPointerOps0 =
7664 OffsetToPointerOpIdxMap[LowestOffset].second;
7665 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7666
7667 // Now that we know what the common stride and coefficients have to be, check
7668 // the remaining `PointerOps_j`.
7669 SmallVector<int64_t> Coeffs;
7670 SmallVector<unsigned> SortedIndicesForOffset;
7671 for (int J : seq<int>(1, NumOffsets)) {
7672 Coeffs.clear();
7673 Coeffs.resize(VecSz);
7674 SortedIndicesForOffset.clear();
7675
7676 int64_t Offset = SortedOffsetsV[J];
7677 ArrayRef<Value *> PointerOpsForOffset =
7678 OffsetToPointerOpIdxMap[Offset].first;
7679 ArrayRef<unsigned> IndicesInAllPointerOps =
7680 OffsetToPointerOpIdxMap[Offset].second;
7681 const SCEV *StrideWithinGroup = calculateRtStride(
7682 PointerOpsForOffset, BaseTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
7683
7684 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7685 return false;
7686 if (Coeffs.size() != NumCoeffs0)
7687 return false;
7688 sort(Coeffs);
7689 if (Coeffs != Coeffs0)
7690 return false;
7691
7692 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7693 }
7694
7695 SortedIndices.clear();
7696 SortedIndices = std::move(SortedIndicesDraft);
7697 SPtrInfo.StrideSCEV = Stride0;
7698 SPtrInfo.Ty = StridedLoadTy;
7699 return true;
7700}
7701
7703 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7704 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7705 unsigned *BestVF, bool TryRecursiveCheck) const {
// Classifies the bundle of loads in VL and reports the result as a
// LoadsState. PointerOps is rebuilt from the pointer operands of the loads,
// and Order is cleared and then handed to sortPtrAccesses (and, if sorting
// fails, to analyzeRtStrideCandidate). If BestVF is non-null it is reset to 0
// on entry and may later be set by CheckForShuffledLoads.
// NOTE(review): the numbering of this listing skips in several places (e.g.
// 7714, 7755, 7793, 7799, 8025), so some statements, including several
// returns, are not visible here. The comments added below describe only the
// statements that are shown.

7706 // Check that a vectorized load would load the same memory as a scalar
7707 // load. For example, we don't want to vectorize loads that are smaller
7708 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7709 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7710 // from such a struct, we read/write packed bits disagreeing with the
7711 // unvectorized version.
7712 if (BestVF)
7713 *BestVF = 0;
7715 return LoadsState::Gather;
7716 Type *ScalarTy = VL0->getType();
7717
7718 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7719 return LoadsState::Gather;
7720
7721 // Make sure all loads in the bundle are simple - we can't vectorize
7722 // atomic or volatile loads.
7723 PointerOps.clear();
7724 const size_t Sz = VL.size();
7725 PointerOps.resize(Sz);
7726 auto *POIter = PointerOps.begin();
7727 for (Value *V : VL) {
7728 auto *L = dyn_cast<LoadInst>(V);
7729 if (!L || !L->isSimple())
7730 return LoadsState::Gather;
7731 *POIter = L->getPointerOperand();
7732 ++POIter;
7733 }
7734
7735 Order.clear();
7736 // Check the order of pointer operands or that all pointers are the same.
7737 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7738
7739 auto *VecTy = getWidenedType(ScalarTy, Sz);
7740 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7741 // Cache masked gather legality - both the !IsSorted path below and the
7742 // post-branch check use the same VecTy/CommonAlignment, and the underlying
7743 // TTI calls are virtual.
7744 std::optional<bool> MaskedGatherLegal;
7745 auto IsMaskedGatherLegal = [&] {
7746 if (!MaskedGatherLegal)
7747 MaskedGatherLegal =
7748 TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
7749 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment);
7750 return *MaskedGatherLegal;
7751 };
// Pointers could not be sorted: try analyzeRtStrideCandidate first, then
// require masked gather legality and mutually compatible pointers.
7752 if (!IsSorted) {
7753 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7754 SPtrInfo, /*isLoad=*/true))
7756
7757 if (!IsMaskedGatherLegal())
7758 return LoadsState::Gather;
7759
7760 if (!all_of(PointerOps, [&](Value *P) {
7761 return arePointersCompatible(P, PointerOps.front(), *TLI);
7762 }))
7763 return LoadsState::Gather;
7764
7765 } else {
// Ptr0/PtrN are the first and last pointers in sorted order; an empty Order
// means PointerOps is used in its existing order.
7766 Value *Ptr0;
7767 Value *PtrN;
7768 if (Order.empty()) {
7769 Ptr0 = PointerOps.front();
7770 PtrN = PointerOps.back();
7771 } else {
7772 Ptr0 = PointerOps[Order.front()];
7773 PtrN = PointerOps[Order.back()];
7774 }
7775 // sortPtrAccesses validates getPointersDiff for all pointers relative to
7776 // PointerOps[0], so compute the span using PointerOps[0] as intermediate:
7777 // Diff = offset(PtrN) - offset(Ptr0) relative to PointerOps[0]
7778 std::optional<int64_t> Diff0 =
7779 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, Ptr0, *DL, *SE);
7780 std::optional<int64_t> DiffN =
7781 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, PtrN, *DL, *SE);
7782 assert(Diff0 && DiffN &&
7783 "sortPtrAccesses should have validated these pointers");
7784 int64_t Diff = *DiffN - *Diff0;
7785 // Check that the sorted loads are consecutive.
7786 if (static_cast<uint64_t>(Diff) == Sz - 1)
7787 return LoadsState::Vectorize;
7788 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7789 *TLI, [&](Value *V) {
7790 return areAllUsersVectorized(
7791 cast<Instruction>(V), UserIgnoreList);
7792 }))
// The alignment handed to analyzeConstantStrideCandidate is that of the load
// that comes first in sorted order.
7794 Align Alignment =
7795 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7796 ->getAlign();
7797 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7798 Diff, Ptr0, SPtrInfo))
7800 }
// Reached from both branches above when no earlier return fired; the result
// of the legality query is cached in MaskedGatherLegal.
7801 if (!IsMaskedGatherLegal())
7802 return LoadsState::Gather;
7803 // Compare the cost of loads + shuffles against the cost of
7804 // strided/masked gather loads. Returns true if vectorized + shuffles
7805 // representation is better than just gather.
7806 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7807 unsigned *BestVF,
7808 bool ProfitableGatherPointers) {
7809 if (BestVF)
7810 *BestVF = 0;
7811 // Compare masked gather cost and loads + insert subvector costs.
7813 auto [ScalarGEPCost, VectorGEPCost] =
7814 getGEPCosts(TTI, PointerOps, PointerOps.front(), Instruction::Load,
7815 CostKind, ScalarTy, VecTy);
7816 // Estimate the cost of masked gather GEP. If not a splat, roughly
7817 // estimate as a buildvector, otherwise estimate as splat.
7818 APInt DemandedElts = APInt::getAllOnes(Sz);
7819 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7820 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7821 // Cache the underlying object of PointerOps.front() - it is invariant
7822 // across the per-V comparisons below and getUnderlyingObject walks
7823 // GEP/cast chains.
7824 const Value *FrontUO = getUnderlyingObject(PointerOps.front());
7825 if (static_cast<unsigned>(count_if(
7826 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7827 any_of(PointerOps,
7828 [&](Value *V) { return getUnderlyingObject(V) != FrontUO; }))
7829 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7830 DemandedElts, /*Insert=*/true,
7831 /*Extract=*/false, CostKind);
7832 else
7833 VectorGEPCost +=
7835 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7836 /*Insert=*/true, /*Extract=*/false, CostKind) +
7837 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7838 // The cost of scalar loads.
7839 InstructionCost ScalarLoadsCost =
7841 [&](InstructionCost C, Value *V) {
7842 return C + TTI.getInstructionCost(cast<Instruction>(V),
7843 CostKind);
7844 }) +
7845 ScalarGEPCost;
7846 // The cost of masked gather.
7847 InstructionCost MaskedGatherCost =
7848 TTI.getMemIntrinsicInstrCost(
7849 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
7851 /*VariableMask=*/false, CommonAlignment),
7852 CostKind) +
7853 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7854 InstructionCost GatherCost =
7855 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7856 /*Insert=*/true,
7857 /*Extract=*/false, CostKind) +
7858 ScalarLoadsCost;
7859 // The list of loads is small or a partial check was already performed -
7860 // directly compare masked gather cost and gather cost.
7861 constexpr unsigned ListLimit = 4;
7862 if (!TryRecursiveCheck || VL.size() < ListLimit)
7863 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7864
// NOTE: this Sz shadows the outer Sz (the number of loads, VL.size()); from
// here to the end of the lambda Sz is the scalar type size in bits.
7865 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7866 unsigned MinVF = getMinVF(2 * Sz);
7867 DemandedElts.clearAllBits();
7868 // Iterate through possible vectorization factors and check if vectorized +
7869 // shuffles is better than just gather.
7870 for (unsigned VF =
7871 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7872 VF >= MinVF;
7873 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
// Analyze VL in slices of at most VF loads; the last slice may be shorter.
7875 for (unsigned Cnt = 0, End = VL.size(); Cnt < End; Cnt += VF) {
7876 const unsigned SliceVF = std::min(VF, End - Cnt);
7877 ArrayRef<Value *> Slice = VL.slice(Cnt, SliceVF);
// NOTE: this per-slice PointerOps shadows the function parameter of the same
// name for the rest of this inner loop body.
7879 SmallVector<Value *> PointerOps;
7880 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7881 PointerOps, SPtrInfo, BestVF,
7882 /*TryRecursiveCheck=*/false);
7883 // Check that the sorted loads are consecutive.
7884 if (LS == LoadsState::Gather) {
7885 if (BestVF) {
7886 DemandedElts.setAllBits();
7887 break;
7888 }
7889 DemandedElts.setBits(Cnt, Cnt + SliceVF);
7890 continue;
7891 }
7892 // If reordering is needed - consider as high-cost masked gather for now.
7893 if ((LS == LoadsState::Vectorize ||
7896 !Order.empty() && !isReverseOrder(Order))
7898 States.emplace_back(Cnt, LS);
7899 }
7900 if (DemandedElts.isAllOnes())
7901 // All loads gathered - try smaller VF.
7902 continue;
7903 // Can be vectorized later as a series of loads/insertelements.
7904 InstructionCost VecLdCost = 0;
7905 if (!DemandedElts.isZero()) {
7906 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7907 /*Insert=*/true,
7908 /*Extract=*/false, CostKind) +
7909 ScalarGEPCost;
7910 for (unsigned Idx : seq<unsigned>(VL.size()))
7911 if (DemandedElts[Idx])
7912 VecLdCost +=
7913 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7914 }
// Add the cost of every slice recorded in States. PointerOps below is the
// outer vector covering the whole bundle; the per-slice vector declared in
// the loop above is out of scope here.
7915 for (const auto &[SliceStart, LS] : States) {
7916 const unsigned SliceVF = std::min<unsigned>(VF, VL.size() - SliceStart);
7917 auto *SubVecTy = getWidenedType(ScalarTy, SliceVF);
7918 auto *LI0 = cast<LoadInst>(VL[SliceStart]);
7919 InstructionCost VectorGEPCost =
7920 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7921 ? 0
7922 : getGEPCosts(TTI,
7923 ArrayRef(PointerOps).slice(SliceStart, SliceVF),
7924 LI0->getPointerOperand(), Instruction::Load,
7925 CostKind, ScalarTy, SubVecTy)
7926 .second;
7927 if (LS == LoadsState::ScatterVectorize) {
7928 if (static_cast<unsigned>(
7929 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7930 PointerOps.size() - 1 ||
7931 any_of(PointerOps, [&](Value *V) {
7932 return getUnderlyingObject(V) != FrontUO;
7933 }))
7934 VectorGEPCost += getScalarizationOverhead(
7935 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(SliceVF),
7936 /*Insert=*/true, /*Extract=*/false, CostKind);
7937 else
7938 VectorGEPCost +=
7940 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(SliceVF, 0),
7941 /*Insert=*/true, /*Extract=*/false, CostKind) +
7942 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7943 CostKind);
7944 }
7945 switch (LS) {
7947 VecLdCost +=
7948 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7949 LI0->getPointerAddressSpace(), CostKind,
7951 VectorGEPCost;
7952 break;
7954 VecLdCost += TTI.getMemIntrinsicInstrCost(
7956 Intrinsic::experimental_vp_strided_load,
7957 SubVecTy, LI0->getPointerOperand(),
7958 /*VariableMask=*/false, CommonAlignment),
7959 CostKind) +
7960 VectorGEPCost;
7961 break;
7963 VecLdCost += TTI.getMemIntrinsicInstrCost(
7965 Intrinsic::masked_load, SubVecTy,
7966 CommonAlignment, LI0->getPointerAddressSpace()),
7967 CostKind) +
7969 {}, CostKind);
7970 break;
7972 VecLdCost += TTI.getMemIntrinsicInstrCost(
7974 Intrinsic::masked_gather, SubVecTy,
7975 LI0->getPointerOperand(),
7976 /*VariableMask=*/false, CommonAlignment),
7977 CostKind) +
7978 VectorGEPCost;
7979 break;
7980 case LoadsState::Gather:
7981 llvm_unreachable("Gathers are not added to States");
7982 }
// Mask that takes the lanes of this slice from the second shuffle source
// (indices >= VL.size()) and keeps every other lane as is. The insert
// subvector cost is only added for slices that do not start at lane 0.
7983 SmallVector<int> ShuffleMask(VL.size());
7984 const unsigned SliceIdx = SliceStart / VF;
7985 for (int Idx : seq<int>(VL.size()))
7986 ShuffleMask[Idx] = Idx / VF == SliceIdx ? VL.size() + Idx % VF : Idx;
7987 if (SliceStart > 0)
7988 VecLdCost +=
7989 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7990 CostKind, SliceStart, SubVecTy);
7991 }
7992 // If masked gather cost is higher - better to vectorize, so
7993 // consider it as a gather node. It will be better estimated
7994 // later.
7995 if (MaskedGatherCost >= VecLdCost &&
7996 VecLdCost - GatherCost < -SLPCostThreshold) {
7997 if (BestVF)
7998 *BestVF = VF;
7999 return true;
8000 }
8001 }
8002 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
8003 };
8004 // TODO: need to improve analysis of the pointers, if not all of them are
8005 // GEPs or have > 2 operands, we end up with a gather node, which just
8006 // increases the cost.
// ProfitableGatherPointers holds when VL0 is inside a loop, there are more
// than two loads, and at most half of the pointers are loop-invariant.
8007 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
8008 bool ProfitableGatherPointers =
8009 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
8010 return L->isLoopInvariant(V);
8011 })) <= Sz / 2;
8012 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
8014 return (!GEP && doesNotNeedToBeScheduled(P)) ||
8015 (GEP && GEP->getNumOperands() == 2 &&
8016 isa<Constant, Instruction>(GEP->getOperand(1)));
8017 })) {
8018 // Check if potential masked gather can be represented as series
8019 // of loads + insertsubvectors.
8020 // If masked gather cost is higher - better to vectorize, so
8021 // consider it as a gather node. It will be better estimated
8022 // later.
8023 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
8024 ProfitableGatherPointers))
8026 }
8027
8028 return LoadsState::Gather;
8029}
8030
8032 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
8033 const DataLayout &DL, ScalarEvolution &SE,
8034 SmallVectorImpl<unsigned> &SortedIndices) {
// Groups the pointers in VL into clusters ("bases") and, if the clustering
// looks useful, appends the original indices of the pointers to
// SortedIndices cluster by cluster and returns true. SortedIndices is cleared
// first and is only filled on the successful path, so it is empty whenever
// false is returned. BBs[i] is the basic block used in the key for VL[i].
// NOTE(review): the numbering of this listing skips 8031, 8041, 8043, 8047,
// 8053 and 8108, so the start of the signature, the container type of Bases,
// the second component of each key and one statement inside the final loop
// are not visible here.
8035 assert(
8036 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
8037 "Expected list of pointer operands.");
8038 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
8039 // Ptr into, sort and return the sorted indices with values next to one
8040 // another.
8042 std::pair<BasicBlock *, Value *>,
8044 Bases;
// Seed the map with the first pointer: offset 0, original index 0.
8045 Bases
8046 .try_emplace(std::make_pair(
8048 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
8049
8050 SortedIndices.clear();
// Cnt enumerates VL.drop_front(), so the original index of Ptr is Cnt + 1.
8051 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
8052 auto Key = std::make_pair(BBs[Cnt + 1],
// Try to attach Ptr to an existing cluster under the same key: it joins the
// first cluster for which getPointersDiff (strict check) against the
// cluster's first pointer yields a value.
8054 bool Found = any_of(Bases.try_emplace(Key).first->second,
8055 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
8056 std::optional<int64_t> Diff =
8057 getPointersDiff(ElemTy, std::get<0>(Base.front()),
8058 ElemTy, Ptr, DL, SE,
8059 /*StrictCheck=*/true);
8060 if (!Diff)
8061 return false;
8062
8063 Base.emplace_back(Ptr, *Diff, Cnt + 1);
8064 return true;
8065 });
8066
8067 if (!Found) {
8068 // If we haven't found enough to usefully cluster, return early.
8069 if (Bases.size() > VL.size() / 2 - 1)
8070 return false;
8071
8072 // Not found already - add a new Base
8073 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
8074 }
8075 }
8076
// Every pointer ended up under its own key: nothing was clustered.
8077 if (Bases.size() == VL.size())
8078 return false;
8079
// A single key whose pointers form either one cluster or one cluster per
// pointer is rejected as well.
8080 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
8081 Bases.front().second.size() == VL.size()))
8082 return false;
8083
8084 // For each of the bases sort the pointers by Offset and check if any of the
8085 // base become consecutively allocated.
// ComparePointers(Ptr1, Ptr2) walks both pointers towards their underlying
// objects one step at a time (MaxLookup = 1), remembering what each side has
// visited. It gives up with false if both walks stand on the same value or
// the depth limit is exceeded; otherwise it returns true only if Ptr2's walk
// reached a value already visited from Ptr1 and not the other way round.
8086 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
8087 SmallPtrSet<Value *, 13> FirstPointers;
8088 SmallPtrSet<Value *, 13> SecondPointers;
8089 Value *P1 = Ptr1;
8090 Value *P2 = Ptr2;
8091 unsigned Depth = 0;
8092 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
8093 if (P1 == P2 || Depth > RecursionMaxDepth)
8094 return false;
8095 FirstPointers.insert(P1);
8096 SecondPointers.insert(P2);
8097 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
8098 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
8099 ++Depth;
8100 }
8101 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
8102 "Unable to find matching root.");
8103 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
8104 };
8105 for (auto &Base : Bases) {
8106 for (auto &Vec : Base.second) {
8107 if (Vec.size() > 1) {
// Despite its name, AnyConsecutive is computed with all_of: it is true only
// if every entry's offset equals the first entry's offset plus its position.
8109 int64_t InitialOffset = std::get<1>(Vec[0]);
8110 bool AnyConsecutive =
8111 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
8112 return std::get<1>(P.value()) ==
8113 int64_t(P.index()) + InitialOffset;
8114 });
8115 // Fill SortedIndices array only if it looks worth-while to sort the
8116 // ptrs.
8117 if (!AnyConsecutive)
8118 return false;
8119 }
8120 }
// Order the clusters under this key by comparing their first pointers.
8121 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
8122 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
8123 });
8124 }
8125
// Emit the original indices key by key, cluster by cluster.
8126 for (auto &T : Bases)
8127 for (const auto &Vec : T.second)
8128 for (const auto &P : Vec)
8129 SortedIndices.push_back(std::get<2>(P));
8130
8131 assert(SortedIndices.size() == VL.size() &&
8132 "Expected SortedIndices to be the size of VL");
8133 return true;
8134}
8135
// Tries to find an order for a gather node whose scalars are all simple
// loads. The pointer operand and parent block of every load are collected
// and passed to clusterSortPtrAccesses together with the type of the first
// scalar. Returns the resulting order, or std::nullopt if any scalar is not
// a simple load, if TE.Idx is contained in LoadEntriesToVectorize, or if
// clusterSortPtrAccesses returns false.
// NOTE(review): the numbering of this listing skips 8141 and 8143, so the
// declarations of Ptrs and BBs are not visible here.
8136std::optional<BoUpSLP::OrdersType>
8137BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
8138 assert(TE.isGather() && "Expected gather node only.");
8139 Type *ScalarTy = TE.Scalars[0]->getType();
8140
8142 Ptrs.reserve(TE.Scalars.size());
8144 BBs.reserve(TE.Scalars.size());
// Bail out on the first scalar that is not a simple load.
8145 for (Value *V : TE.Scalars) {
8146 auto *L = dyn_cast<LoadInst>(V);
8147 if (!L || !L->isSimple())
8148 return std::nullopt;
8149 Ptrs.push_back(L->getPointerOperand());
8150 BBs.push_back(L->getParent());
8151 }
8152
8153 BoUpSLP::OrdersType Order;
// The membership test comes first, so clusterSortPtrAccesses is not called
// for entries recorded in LoadEntriesToVectorize.
8154 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
8155 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
8156 return std::move(Order);
8157 return std::nullopt;
8158}
8159
8160/// Check if two insertelement instructions are from the same buildvector.
/// \p GetBaseOperand is called on an insertelement to step to the next
/// candidate in its chain; a result that is null or not an InsertElementInst
/// ends that chain.
/// NOTE(review): the numbering of this listing skips 8161-8162 and 8182, so
/// the start of the signature and the declaration of ReusedIdx are not
/// visible here.
8163 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
8164 // Instructions must be from the same basic block.
8165 if (VU->getParent() != V->getParent())
8166 return false;
8167 // Inserts that belong to the same buildvector produce the same vector type.
8168 if (VU->getType() != V->getType())
8169 return false;
8170 // If both inserts have multiple uses they are treated as separate nodes.
8171 if (!VU->hasOneUse() && !V->hasOneUse())
8172 return false;
8173 auto *IE1 = VU;
8174 auto *IE2 = V;
8175 std::optional<unsigned> Idx1 = getElementIndex(IE1);
8176 std::optional<unsigned> Idx2 = getElementIndex(IE2);
8177 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
8178 return false;
8179 // Go through the vector operand of insertelement instructions trying to find
8180 // either VU as the original vector for IE2 or V as the original vector for
8181 // IE1.
8183 bool IsReusedIdx = false;
// Both chains are advanced by one step per iteration. The loop ends when an
// element index is seen twice or both chains are exhausted.
8184 do {
// The chain that started at V reached VU after the other chain ended (or the
// mirrored case below): the answer is whether the reached insert has a
// single use.
8185 if (IE2 == VU && !IE1)
8186 return VU->hasOneUse();
8187 if (IE1 == V && !IE2)
8188 return V->hasOneUse();
8189 if (IE1 && IE1 != V) {
// This Idx1 shadows the outer optional; if the index of the current insert
// is unknown, the index of V (outer Idx2) is used instead.
8190 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
8191 IsReusedIdx |= ReusedIdx.test(Idx1);
8192 ReusedIdx.set(Idx1);
// Stop following this chain at an intermediate insert with several uses or
// once an index has been reused.
8193 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
8194 IE1 = nullptr;
8195 else
8196 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
8197 }
8198 if (IE2 && IE2 != VU) {
// Mirror of the step above for the chain that started at V; the fallback
// index is that of VU (outer Idx1).
8199 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
8200 IsReusedIdx |= ReusedIdx.test(Idx2);
8201 ReusedIdx.set(Idx2);
8202 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
8203 IE2 = nullptr;
8204 else
8205 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
8206 }
8207 } while (!IsReusedIdx && (IE1 || IE2));
8208 return false;
8209}
8210
8211/// Checks if the specified instruction \p I is an alternate operation for
8212/// the given \p MainOp and \p AltOp instructions.
/// Forward declaration (no body here) so that getReorderingData() below can
/// call it; as a `static` function its definition must live elsewhere in
/// this file.
8213static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
8214 Instruction *AltOp,
8215 const TargetLibraryInfo &TLI);
8216
8217std::optional<BoUpSLP::OrdersType>
8218BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
8219 bool IgnoreReorder) {
8220 // No need to reorder if need to shuffle reuses, still need to shuffle the
8221 // node.
8222 if (!TE.ReuseShuffleIndices.empty()) {
8223 if (isSplat(TE.Scalars))
8224 return std::nullopt;
8225 // Check if reuse shuffle indices can be improved by reordering.
8226 // For this, check that reuse mask is "clustered", i.e. each scalar value
8227 // is used once in each submask of size <number_of_scalars>.
8228 // Example: 4 scalar values.
8229 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
8230 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
8231 // element 3 is used twice in the second submask.
8232 unsigned Sz = TE.Scalars.size();
8233 if (TE.isGather()) {
8234 if (std::optional<OrdersType> CurrentOrder =
8235 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
8236 SmallVector<int> Mask;
8237 fixupOrderingIndices(*CurrentOrder);
8238 inversePermutation(*CurrentOrder, Mask);
8239 ::addMask(Mask, TE.ReuseShuffleIndices);
8240 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
8241 unsigned Sz = TE.Scalars.size();
8242 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
8243 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
8244 if (Idx != PoisonMaskElem)
8245 Res[Idx + K * Sz] = I + K * Sz;
8246 }
8247 return std::move(Res);
8248 }
8249 }
8250 if (Sz == 2 && TE.getVectorFactor() == 4 &&
8251 ::getNumberOfParts(*TTI,
8252 getWidenedType(getValueType(TE.Scalars.front()),
8253 2 * TE.getVectorFactor()),
8254 getValueType(TE.Scalars.front())) == 1)
8255 return std::nullopt;
8256 if (TE.ReuseShuffleIndices.size() % Sz != 0)
8257 return std::nullopt;
8258 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
8259 Sz)) {
8260 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
8261 if (TE.ReorderIndices.empty())
8262 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
8263 else
8264 inversePermutation(TE.ReorderIndices, ReorderMask);
8265 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
8266 unsigned VF = ReorderMask.size();
8267 OrdersType ResOrder(VF, VF);
8268 unsigned NumParts = divideCeil(VF, Sz);
8269 SmallBitVector UsedVals(NumParts);
8270 for (unsigned I = 0; I < VF; I += Sz) {
8271 int Val = PoisonMaskElem;
8272 unsigned UndefCnt = 0;
8273 unsigned Limit = std::min(Sz, VF - I);
8274 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
8275 [&](int Idx) {
8276 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
8277 Val = Idx;
8278 if (Idx == PoisonMaskElem)
8279 ++UndefCnt;
8280 return Idx != PoisonMaskElem && Idx != Val;
8281 }) ||
8282 Val >= static_cast<int>(NumParts) || Val == PoisonMaskElem ||
8283 UsedVals.test(Val) || UndefCnt > Sz / 2)
8284 return std::nullopt;
8285 UsedVals.set(Val);
8286 for (unsigned K = 0; K < NumParts; ++K) {
8287 unsigned Idx = Val + Sz * K;
8288 if (Idx < VF && I + K < VF)
8289 ResOrder[Idx] = I + K;
8290 }
8291 }
8292 return std::move(ResOrder);
8293 }
8294 unsigned VF = TE.getVectorFactor();
8295 // Try build correct order for extractelement instructions.
8296 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
8297 TE.ReuseShuffleIndices.end());
8298 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
8299 all_of(TE.Scalars, [Sz](Value *V) {
8300 if (isa<PoisonValue>(V))
8301 return true;
8302 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
8303 return Idx && *Idx < Sz;
8304 })) {
8305 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
8306 "by BinaryOperator and CastInst.");
8307 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
8308 if (TE.ReorderIndices.empty())
8309 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
8310 else
8311 inversePermutation(TE.ReorderIndices, ReorderMask);
8312 for (unsigned I = 0; I < VF; ++I) {
8313 int &Idx = ReusedMask[I];
8314 if (Idx == PoisonMaskElem)
8315 continue;
8316 Value *V = TE.Scalars[ReorderMask[Idx]];
8317 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
8318 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
8319 }
8320 }
8321 // Build the order of the VF size, need to reorder reuses shuffles, they are
8322 // always of VF size.
8323 OrdersType ResOrder(VF);
8324 std::iota(ResOrder.begin(), ResOrder.end(), 0);
8325 auto *It = ResOrder.begin();
8326 for (unsigned K = 0; K < VF; K += Sz) {
8327 OrdersType CurrentOrder(TE.ReorderIndices);
8328 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
8329 if (SubMask.front() == PoisonMaskElem)
8330 std::iota(SubMask.begin(), SubMask.end(), 0);
8331 reorderOrder(CurrentOrder, SubMask);
8332 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
8333 std::advance(It, Sz);
8334 }
8335 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
8336 return Data.index() == Data.value();
8337 }))
8338 return std::nullopt; // No need to reorder.
8339 return std::move(ResOrder);
8340 }
8341 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8342 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8343 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
8344 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
8345 return std::nullopt;
8346 if (TE.State == TreeEntry::SplitVectorize ||
8347 ((TE.State == TreeEntry::Vectorize ||
8348 TE.State == TreeEntry::StridedVectorize ||
8349 TE.State == TreeEntry::CompressVectorize) &&
8351 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
8352 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8353 "Alternate instructions are only supported by "
8354 "BinaryOperator and CastInst.");
8355 return TE.ReorderIndices;
8356 }
8357 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8358 TE.isAltShuffle()) {
8359 assert(TE.ReuseShuffleIndices.empty() &&
8360 "ReuseShuffleIndices should be "
8361 "empty for alternate instructions.");
8362 SmallVector<int> Mask;
8363 TE.buildAltOpShuffleMask(
8364 [&](Instruction *I) {
8365 assert(TE.getMatchingMainOpOrAltOp(I) &&
8366 "Unexpected main/alternate opcode");
8367 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
8368 },
8369 Mask);
8370 const int VF = TE.getVectorFactor();
8371 OrdersType ResOrder(VF, VF);
8372 for (unsigned I : seq<unsigned>(VF)) {
8373 if (Mask[I] == PoisonMaskElem)
8374 continue;
8375 ResOrder[Mask[I] % VF] = I;
8376 }
8377 return std::move(ResOrder);
8378 }
8379 if (!TE.ReorderIndices.empty())
8380 return TE.ReorderIndices;
8381 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8382 if (!TE.ReorderIndices.empty())
8383 return TE.ReorderIndices;
8384
8385 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
8386 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
8387 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
8388 continue;
8389 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
8390 if (!II)
8391 continue;
8392 Instruction *BVHead = nullptr;
8393 BasicBlock *BB = II->getParent();
8394 while (II && II->hasOneUse() && II->getParent() == BB) {
8395 BVHead = II;
8396 II = dyn_cast<InsertElementInst>(II->getOperand(0));
8397 }
8398 I = BVHead;
8399 }
8400
8401 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
8402 assert(BB1 != BB2 && "Expected different basic blocks.");
8403 if (!DT->isReachableFromEntry(BB1))
8404 return false;
8405 if (!DT->isReachableFromEntry(BB2))
8406 return true;
8407 auto *NodeA = DT->getNode(BB1);
8408 auto *NodeB = DT->getNode(BB2);
8409 assert(NodeA && "Should only process reachable instructions");
8410 assert(NodeB && "Should only process reachable instructions");
8411 assert((NodeA == NodeB) ==
8412 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8413 "Different nodes should have different DFS numbers");
8414 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8415 };
8416 auto PHICompare = [&](unsigned I1, unsigned I2) {
8417 Value *V1 = TE.Scalars[I1];
8418 Value *V2 = TE.Scalars[I2];
8419 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
8420 return false;
8421 if (isa<PoisonValue>(V1))
8422 return true;
8423 if (isa<PoisonValue>(V2))
8424 return false;
8425 if (V1->getNumUses() < V2->getNumUses())
8426 return true;
8427 if (V1->getNumUses() > V2->getNumUses())
8428 return false;
8429 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
8430 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
8431 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8432 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8433 FirstUserOfPhi2->getParent());
8434 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
8435 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
8436 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
8437 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
8438 if (IE1 && !IE2)
8439 return true;
8440 if (!IE1 && IE2)
8441 return false;
8442 if (IE1 && IE2) {
8443 if (UserBVHead[I1] && !UserBVHead[I2])
8444 return true;
8445 if (!UserBVHead[I1])
8446 return false;
8447 if (UserBVHead[I1] == UserBVHead[I2])
8448 return getElementIndex(IE1) < getElementIndex(IE2);
8449 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
8450 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
8451 UserBVHead[I2]->getParent());
8452 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8453 }
8454 if (EE1 && !EE2)
8455 return true;
8456 if (!EE1 && EE2)
8457 return false;
8458 if (EE1 && EE2) {
8459 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
8460 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
8461 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
8462 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
8463 if (!Inst2 && !P2)
8464 return Inst1 || P1;
8465 if (EE1->getOperand(0) == EE2->getOperand(0))
8466 return getElementIndex(EE1) < getElementIndex(EE2);
8467 if (!Inst1 && Inst2)
8468 return false;
8469 if (Inst1 && Inst2) {
8470 if (Inst1->getParent() != Inst2->getParent())
8471 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
8472 return Inst1->comesBefore(Inst2);
8473 }
8474 if (!P1 && P2)
8475 return false;
8476 assert(P1 && P2 &&
8477 "Expected either instructions or arguments vector operands.");
8478 return P1->getArgNo() < P2->getArgNo();
8479 }
8480 return false;
8481 };
8482 OrdersType Phis(TE.Scalars.size());
8483 std::iota(Phis.begin(), Phis.end(), 0);
8484 stable_sort(Phis, PHICompare);
8485 if (isIdentityOrder(Phis))
8486 return std::nullopt; // No need to reorder.
8487 return std::move(Phis);
8488 }
8489 if (TE.isGather() &&
8490 (!TE.hasState() || !TE.isAltShuffle() ||
8491 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8492 allSameType(TE.Scalars)) {
8493 // TODO: add analysis of other gather nodes with extractelement
8494 // instructions and other values/instructions, not only undefs.
8495 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8497 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
8498 all_of(TE.Scalars, [](Value *V) {
8499 auto *EE = dyn_cast<ExtractElementInst>(V);
8500 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8501 })) {
8502 // Check that gather of extractelements can be represented as
8503 // just a shuffle of a single vector.
8504 OrdersType CurrentOrder;
8505 bool Reuse =
8506 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
8507 if (Reuse || !CurrentOrder.empty())
8508 return std::move(CurrentOrder);
8509 }
8510 // If the gather node is <undef, v, .., poison> and
8511 // insertelement poison, v, 0 [+ permute]
8512 // is cheaper than
8513 // insertelement poison, v, n - try to reorder.
8514 // If rotating the whole graph, exclude the permute cost, the whole graph
8515 // might be transformed.
8516 int Sz = TE.Scalars.size();
8517 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
8518 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
8519 const auto *It = find_if_not(TE.Scalars, isConstant);
8520 if (It == TE.Scalars.begin())
8521 return OrdersType();
8522 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
8523 if (It != TE.Scalars.end()) {
8524 OrdersType Order(Sz, Sz);
8525 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8526 Order[Idx] = 0;
8527 fixupOrderingIndices(Order);
8528 SmallVector<int> Mask;
8529 inversePermutation(Order, Mask);
8530 InstructionCost PermuteCost =
8531 TopToBottom
8532 ? 0
8533 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
8534 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
8535 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
8536 PoisonValue::get(Ty), *It);
8537 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
8538 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
8539 PoisonValue::get(Ty), *It);
8540 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8541 OrdersType Order(Sz, Sz);
8542 Order[Idx] = 0;
8543 return std::move(Order);
8544 }
8545 }
8546 }
8547 if (isSplat(TE.Scalars))
8548 return std::nullopt;
8549 if (TE.Scalars.size() >= 3)
8550 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
8551 return Order;
8552 // Check if can include the order of vectorized loads. For masked gathers do
8553 // extra analysis later, so include such nodes into a special list.
8554 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8555 SmallVector<Value *> PointerOps;
8556 StridedPtrInfo SPtrInfo;
8557 OrdersType CurrentOrder;
8558 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
8559 CurrentOrder, PointerOps, SPtrInfo);
8562 return std::move(CurrentOrder);
8563 }
8564 if (std::optional<OrdersType> CurrentOrder =
8565 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
8566 return CurrentOrder;
8567 }
8568 return std::nullopt;
8569}
8570
8571/// Checks if the given mask is a "clustered" mask with the same clusters of
8572/// size \p Sz, which are not identity submasks.
8574 unsigned Sz) {
8575 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
8576 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
8577 return false;
8578 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
8579 ArrayRef<int> Cluster = Mask.slice(I, Sz);
8580 if (Cluster != FirstCluster)
8581 return false;
8582 }
8583 return true;
8584}
8585
/// Applies \p Mask to the reuse shuffle mask of \p TE. For gather nodes that
/// pass the checks below, the reorder indices are additionally folded into the
/// scalars themselves and every Sz-sized cluster of the reuse mask is reset to
/// the identity sequence, where Sz is the number of scalars in \p TE.
8586void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
8587 // Reorder reuses mask.
8588 reorderReuses(TE.ReuseShuffleIndices, Mask);
8589 const unsigned Sz = TE.Scalars.size();
8590 // For vectorized nodes and non-clustered reuses there is nothing else to do.
// NOTE(review): the original line 8592 (the middle operand of this condition)
// is absent from this listing; only its tail `Sz) ||` is visible, so the full
// early-exit condition cannot be read here.
8591 if (!TE.isGather() ||
8593 Sz) ||
8594 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
8595 return;
// Combine the inverse of the current reorder indices with the (already
// reordered) reuse mask.
8596 SmallVector<int> NewMask;
8597 inversePermutation(TE.ReorderIndices, NewMask);
8598 addMask(NewMask, TE.ReuseShuffleIndices);
8599 // Clear reorder since it is going to be applied to the new mask.
8600 TE.ReorderIndices.clear();
8601 // Try to improve gathered nodes with clustered reuses, if possible.
// Only the first Sz elements of the combined mask are used as the new order;
// its inverse is then applied to the scalars.
8602 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
8603 SmallVector<unsigned> NewOrder(Slice);
8604 inversePermutation(NewOrder, NewMask);
8605 reorderScalars(TE.Scalars, NewMask);
8606 // Fill the reuses mask with the identity submasks.
// Each consecutive group of Sz elements becomes 0, 1, ..., Sz - 1.
8607 for (auto *It = TE.ReuseShuffleIndices.begin(),
8608 *End = TE.ReuseShuffleIndices.end();
8609 It != End; std::advance(It, Sz))
8610 std::iota(It, std::next(It, Sz), 0);
8611}
8612
8614 ArrayRef<unsigned> SecondaryOrder) {
8615 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8616 "Expected same size of orders");
8617 size_t Sz = Order.size();
8618 SmallBitVector UsedIndices(Sz);
8619 for (unsigned Idx : seq<unsigned>(0, Sz)) {
8620 if (Order[Idx] != Sz)
8621 UsedIndices.set(Order[Idx]);
8622 }
8623 if (SecondaryOrder.empty()) {
8624 for (unsigned Idx : seq<unsigned>(0, Sz))
8625 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8626 Order[Idx] = Idx;
8627 } else {
8628 for (unsigned Idx : seq<unsigned>(0, Sz))
8629 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8630 !UsedIndices.test(SecondaryOrder[Idx]))
8631 Order[Idx] = SecondaryOrder[Idx];
8632 }
8633}
8634
8637 return false;
8638
8639 constexpr unsigned TinyVF = 2;
8640 constexpr unsigned TinyTree = 10;
8641 constexpr unsigned PhiOpsLimit = 12;
8642 constexpr unsigned GatherLoadsLimit = 2;
8643 if (VectorizableTree.size() <= TinyTree)
8644 return true;
8645 if (VectorizableTree.front()->hasState() &&
8646 !VectorizableTree.front()->isGather() &&
8647 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8648 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8649 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8650 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8651 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8652 VectorizableTree.front()->ReorderIndices.empty()) {
8653 // Check if the tree has only single store and single (unordered) load node,
8654 // other nodes are phis or geps/binops, combined with phis, and/or single
8655 // gather load node
8656 if (VectorizableTree.front()->hasState() &&
8657 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8658 VectorizableTree.front()->Scalars.size() == TinyVF &&
8659 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8660 return false;
8661 // Single node, which require reorder - skip.
8662 if (VectorizableTree.front()->hasState() &&
8663 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8664 VectorizableTree.front()->ReorderIndices.empty()) {
8665 const unsigned ReorderedSplitsCnt =
8666 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8667 return TE->State == TreeEntry::SplitVectorize &&
8668 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8669 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8670 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
8671 });
8672 if (ReorderedSplitsCnt <= 1 &&
8673 static_cast<unsigned>(count_if(
8674 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8675 return ((!TE->isGather() &&
8676 (TE->ReorderIndices.empty() ||
8677 (TE->UserTreeIndex.UserTE &&
8678 TE->UserTreeIndex.UserTE->State ==
8679 TreeEntry::Vectorize &&
8680 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8681 .empty()))) ||
8682 (TE->isGather() && TE->ReorderIndices.empty() &&
8683 (!TE->hasState() || TE->isAltShuffle() ||
8684 TE->getOpcode() == Instruction::Load ||
8685 TE->getOpcode() == Instruction::ZExt ||
8686 TE->getOpcode() == Instruction::SExt))) &&
8687 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8688 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
8689 return !isConstant(V) && isVectorized(V);
8690 }));
8691 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8692 return false;
8693 }
8694 bool HasPhis = false;
8695 bool HasLoad = true;
8696 unsigned GatherLoads = 0;
8697 for (const std::unique_ptr<TreeEntry> &TE :
8698 ArrayRef(VectorizableTree).drop_front()) {
8699 if (TE->State == TreeEntry::SplitVectorize)
8700 continue;
8701 if (!TE->hasState()) {
8702 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
8704 continue;
8705 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8707 continue;
8708 return true;
8709 }
8710 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8711 if (!TE->isGather()) {
8712 HasLoad = false;
8713 continue;
8714 }
8715 if (HasLoad)
8716 return true;
8717 ++GatherLoads;
8718 if (GatherLoads >= GatherLoadsLimit)
8719 return true;
8720 }
8721 if (TE->getOpcode() == Instruction::GetElementPtr ||
8722 Instruction::isBinaryOp(TE->getOpcode()))
8723 continue;
8724 if (TE->getOpcode() != Instruction::PHI &&
8725 (!TE->hasCopyableElements() ||
8726 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8727 TE->Scalars.size() / 2))
8728 return true;
8729 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8730 TE->getNumOperands() > PhiOpsLimit)
8731 return false;
8732 HasPhis = true;
8733 }
8734 return !HasPhis;
8735 }
8736 return true;
8737}
8738
8739void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8740 ArrayRef<int> MaskOrder) {
8741 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8742 SmallVector<int> NewMask(getVectorFactor());
8743 SmallVector<int> NewMaskOrder(getVectorFactor());
8744 std::iota(NewMask.begin(), NewMask.end(), 0);
8745 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8746 if (Idx == 0) {
8747 copy(Mask, NewMask.begin());
8748 copy(MaskOrder, NewMaskOrder.begin());
8749 } else {
8750 assert(Idx == 1 && "Expected either 0 or 1 index.");
8751 unsigned Offset = CombinedEntriesWithIndices.back().second;
8752 for (unsigned I : seq<unsigned>(Mask.size())) {
8753 NewMask[I + Offset] = Mask[I] + Offset;
8754 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8755 }
8756 }
8757 reorderScalars(Scalars, NewMask);
8758 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8759 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8760 ReorderIndices.clear();
8761}
8762
8764 // Maps VF to the graph nodes.
8766 // ExtractElement gather nodes which can be vectorized and need to handle
8767 // their ordering.
8769
8770 // Phi nodes can have preferred ordering based on their result users
8772
8773 // AltShuffles can also have a preferred ordering that leads to fewer
8774 // instructions, e.g., the addsub instruction in x86.
8775 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8776
8777 // Maps a TreeEntry to the reorder indices of external users.
8779 ExternalUserReorderMap;
8780 // Compute IgnoreReorder once - it depends only on UserIgnoreList and
8781 // VectorizableTree.front(), which do not change during this loop.
8782 const bool IgnoreReorder =
8783 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8784 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8785 VectorizableTree.front()->getOpcode() == Instruction::Store);
8786 // Find all reorderable nodes with the given VF.
8787 // Currently these are vectorized stores, loads, extracts + some gathering of
8788 // extracts.
8789 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8790 const std::unique_ptr<TreeEntry> &TE) {
8791 // Look for external users that will probably be vectorized.
8792 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8793 findExternalStoreUsersReorderIndices(TE.get());
8794 if (!ExternalUserReorderIndices.empty()) {
8795 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8796 ExternalUserReorderMap.try_emplace(TE.get(),
8797 std::move(ExternalUserReorderIndices));
8798 }
8799
8800 // Patterns like [fadd,fsub] can be combined into a single instruction in
8801 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8802 // to take into account their order when looking for the most used order.
8803 if (TE->hasState() && TE->isAltShuffle() &&
8804 TE->State != TreeEntry::SplitVectorize) {
8805 Type *ScalarTy = TE->Scalars[0]->getType();
8806 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8807 unsigned Opcode0 = TE->getOpcode();
8808 unsigned Opcode1 = TE->getAltOpcode();
8809 SmallBitVector OpcodeMask(
8810 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8811 // If this pattern is supported by the target then we consider the order.
8812 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8813 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8814 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8815 }
8816 // TODO: Check the reverse order too.
8817 }
8818
8819 if (std::optional<OrdersType> CurrentOrder =
8820 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
8821 // Do not include ordering for nodes used in the alt opcode vectorization,
8822 // better to reorder them during bottom-to-top stage. If follow the order
8823 // here, it causes reordering of the whole graph though actually it is
8824 // profitable just to reorder the subgraph that starts from the alternate
8825 // opcode vectorization node. Such nodes already end-up with the shuffle
8826 // instruction and it is just enough to change this shuffle rather than
8827 // rotate the scalars for the whole graph.
8828 unsigned Cnt = 0;
8829 const TreeEntry *UserTE = TE.get();
8830 while (UserTE && Cnt < RecursionMaxDepth) {
8831 if (!UserTE->UserTreeIndex)
8832 break;
8833 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8834 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8835 UserTE->UserTreeIndex.UserTE->Idx != 0)
8836 return;
8837 UserTE = UserTE->UserTreeIndex.UserTE;
8838 ++Cnt;
8839 }
8840 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8841 if (!(TE->State == TreeEntry::Vectorize ||
8842 TE->State == TreeEntry::StridedVectorize ||
8843 TE->State == TreeEntry::SplitVectorize ||
8844 TE->State == TreeEntry::CompressVectorize) ||
8845 !TE->ReuseShuffleIndices.empty())
8846 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8847 if (TE->State == TreeEntry::Vectorize &&
8848 TE->getOpcode() == Instruction::PHI)
8849 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8850 }
8851 });
8852
8853 // Reorder the graph nodes according to their vectorization factor.
8854 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8855 !VFToOrderedEntries.empty() && VF > 1; --VF) {
8856 auto It = VFToOrderedEntries.find(VF);
8857 if (It == VFToOrderedEntries.end())
8858 continue;
8859 // Try to find the most profitable order. We just are looking for the most
8860 // used order and reorder scalar elements in the nodes according to this
8861 // mostly used order.
8862 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8863 // Delete VF entry upon exit.
8864 llvm::scope_exit Cleanup([&]() { VFToOrderedEntries.erase(It); });
8865
8866 // All operands are reordered and used only in this node - propagate the
8867 // most used order to the user node.
8870 OrdersUses;
8871 for (const TreeEntry *OpTE : OrderedEntries) {
8872 // No need to reorder these nodes: they still need to be extended and to use a
8873 // shuffle, so it is enough to merge the reordering shuffle and the reuse shuffle.
8874 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8875 OpTE->State != TreeEntry::SplitVectorize)
8876 continue;
8877 // Count number of orders uses.
8878 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8879 &PhisToOrders]() -> const OrdersType & {
8880 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8881 auto It = GathersToOrders.find(OpTE);
8882 if (It != GathersToOrders.end())
8883 return It->second;
8884 }
8885 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8886 auto It = AltShufflesToOrders.find(OpTE);
8887 if (It != AltShufflesToOrders.end())
8888 return It->second;
8889 }
8890 if (OpTE->State == TreeEntry::Vectorize &&
8891 OpTE->getOpcode() == Instruction::PHI) {
8892 auto It = PhisToOrders.find(OpTE);
8893 if (It != PhisToOrders.end())
8894 return It->second;
8895 }
8896 return OpTE->ReorderIndices;
8897 }();
8898 // First consider the order of the external scalar users.
8899 auto It = ExternalUserReorderMap.find(OpTE);
8900 if (It != ExternalUserReorderMap.end()) {
8901 const auto &ExternalUserReorderIndices = It->second;
8902 // If the OpTE vector factor != number of scalars - use natural order,
8903 // it is an attempt to reorder node with reused scalars but with
8904 // external uses.
8905 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8906 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8907 ExternalUserReorderIndices.size();
8908 } else {
8909 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8910 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8911 }
8912 // No other useful reorder data in this entry.
8913 if (Order.empty())
8914 continue;
8915 }
8916 // Stores actually store the mask, not the order, need to invert.
8917 if (OpTE->State == TreeEntry::Vectorize &&
8918 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8919 assert(!OpTE->isAltShuffle() &&
8920 "Alternate instructions are only supported by BinaryOperator "
8921 "and CastInst.");
8922 SmallVector<int> Mask;
8923 inversePermutation(Order, Mask);
8924 unsigned E = Order.size();
8925 OrdersType CurrentOrder(E, E);
8926 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8927 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8928 });
8929 fixupOrderingIndices(CurrentOrder);
8930 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8931 } else {
8932 ++OrdersUses.try_emplace(Order, 0).first->second;
8933 }
8934 }
8935 if (OrdersUses.empty())
8936 continue;
8937 // Choose the most used order.
8938 unsigned IdentityCnt = 0;
8939 unsigned FilledIdentityCnt = 0;
8940 OrdersType IdentityOrder(VF, VF);
8941 for (auto &Pair : OrdersUses) {
8942 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8943 if (!Pair.first.empty())
8944 FilledIdentityCnt += Pair.second;
8945 IdentityCnt += Pair.second;
8946 combineOrders(IdentityOrder, Pair.first);
8947 }
8948 }
8949 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8950 unsigned Cnt = IdentityCnt;
8951 for (auto &Pair : OrdersUses) {
8952 // Prefer identity order. But, if filled identity found (non-empty order)
8953 // with same number of uses, as the new candidate order, we can choose
8954 // this candidate order.
8955 if (Cnt < Pair.second ||
8956 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8957 Cnt == Pair.second && !BestOrder.empty() &&
8958 isIdentityOrder(BestOrder))) {
8959 combineOrders(Pair.first, BestOrder);
8960 BestOrder = Pair.first;
8961 Cnt = Pair.second;
8962 } else {
8963 combineOrders(BestOrder, Pair.first);
8964 }
8965 }
8966 // Set order of the user node.
8967 if (isIdentityOrder(BestOrder))
8968 continue;
8969 fixupOrderingIndices(BestOrder);
8970 SmallVector<int> Mask;
8971 inversePermutation(BestOrder, Mask);
8972 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8973 unsigned E = BestOrder.size();
8974 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8975 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8976 });
8977 // Do an actual reordering, if profitable.
8978 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8979 // Just do the reordering for the nodes with the given VF.
8980 if (TE->Scalars.size() != VF) {
8981 if (TE->ReuseShuffleIndices.size() == VF) {
8982 assert(TE->State != TreeEntry::SplitVectorize &&
8983 "Split vectorized not expected.");
8984 // Need to reorder the reuses masks of the operands with smaller VF to
8985 // be able to find the match between the graph nodes and scalar
8986 // operands of the given node during vectorization/cost estimation.
8987 assert(
8988 (!TE->UserTreeIndex ||
8989 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8990 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8991 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8992 "All users must be of VF size.");
8993 if (SLPReVec) {
8994 assert(SLPReVec && "Only supported by REVEC.");
8995 // ShuffleVectorInst does not do reorderOperands (and it should not
8996 // because ShuffleVectorInst supports only a limited set of
8997 // patterns). Only do reorderNodeWithReuses if the user is not
8998 // ShuffleVectorInst.
8999 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
9000 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
9001 continue;
9002 }
9003 // Update ordering of the operands with the smaller VF than the given
9004 // one.
9005 reorderNodeWithReuses(*TE, Mask);
9006 // Update orders in user split vectorize nodes.
9007 if (TE->UserTreeIndex &&
9008 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
9009 TE->UserTreeIndex.UserTE->reorderSplitNode(
9010 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
9011 }
9012 continue;
9013 }
9014 if ((TE->State == TreeEntry::SplitVectorize &&
9015 TE->ReuseShuffleIndices.empty()) ||
9016 ((TE->State == TreeEntry::Vectorize ||
9017 TE->State == TreeEntry::StridedVectorize ||
9018 TE->State == TreeEntry::CompressVectorize) &&
9020 InsertElementInst>(TE->getMainOp()) ||
9021 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
9022 assert(
9023 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
9024 TE->ReuseShuffleIndices.empty())) &&
9025 "Alternate instructions are only supported by BinaryOperator "
9026 "and CastInst.");
9027 // Build correct orders for extract{element,value}, loads,
9028 // stores and alternate (split) nodes.
9029 reorderOrder(TE->ReorderIndices, Mask);
9030 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
9031 TE->reorderOperands(Mask);
9032 } else {
9033 // Reorder the node and its operands.
9034 TE->reorderOperands(Mask);
9035 assert(TE->ReorderIndices.empty() &&
9036 "Expected empty reorder sequence.");
9037 reorderScalars(TE->Scalars, Mask);
9038 }
9039 if (!TE->ReuseShuffleIndices.empty()) {
9040 // Apply reversed order to keep the original ordering of the reused
9041 // elements to avoid extra reorder indices shuffling.
9042 OrdersType CurrentOrder;
9043 reorderOrder(CurrentOrder, MaskOrder);
9044 SmallVector<int> NewReuses;
9045 inversePermutation(CurrentOrder, NewReuses);
9046 addMask(NewReuses, TE->ReuseShuffleIndices);
9047 TE->ReuseShuffleIndices.swap(NewReuses);
9048 } else if (TE->UserTreeIndex &&
9049 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
9050 // Update orders in user split vectorize nodes.
9051 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
9052 Mask, MaskOrder);
9053 }
9054 }
9055}
9056
9057void BoUpSLP::buildReorderableOperands(
9058 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
9059 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
9060 SmallVectorImpl<TreeEntry *> &GatherOps) {
9061 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
9062 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
9063 return OpData.first == I &&
9064 (OpData.second->State == TreeEntry::Vectorize ||
9065 OpData.second->State == TreeEntry::StridedVectorize ||
9066 OpData.second->State == TreeEntry::CompressVectorize ||
9067 OpData.second->State == TreeEntry::SplitVectorize);
9068 }))
9069 continue;
9070 // Do not request operands, if they do not exist.
9071 if (UserTE->hasState()) {
9072 if (UserTE->getOpcode() == Instruction::ExtractElement ||
9073 UserTE->getOpcode() == Instruction::ExtractValue)
9074 continue;
9075 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
9076 continue;
9077 if (UserTE->getOpcode() == Instruction::Store && I == 1 &&
9078 (UserTE->State == TreeEntry::Vectorize ||
9079 UserTE->State == TreeEntry::StridedVectorize))
9080 continue;
9081 if (UserTE->getOpcode() == Instruction::Load &&
9082 (UserTE->State == TreeEntry::Vectorize ||
9083 UserTE->State == TreeEntry::StridedVectorize ||
9084 UserTE->State == TreeEntry::CompressVectorize))
9085 continue;
9086 }
9087 TreeEntry *TE = getOperandEntry(UserTE, I);
9088 assert(TE && "Expected operand entry.");
9089 if (!TE->isGather()) {
9090 // Add the node to the list of the ordered nodes with the identity
9091 // order.
9092 Edges.emplace_back(I, TE);
9093 // Add ScatterVectorize nodes to the list of operands, where just
9094 // reordering of the scalars is required. Similar to the gathers, so
9095 // simply add to the list of gathered ops.
9096 // If there are reused scalars, process this node as a regular vectorize
9097 // node, just reorder reuses mask.
9098 if (TE->State == TreeEntry::ScatterVectorize &&
9099 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
9100 GatherOps.push_back(TE);
9101 continue;
9102 }
9103 if (ReorderableGathers.contains(TE))
9104 GatherOps.push_back(TE);
9105 }
9106}
9107
9108void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
9109 struct TreeEntryCompare {
9110 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
9111 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
9112 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
9113 return LHS->Idx < RHS->Idx;
9114 }
9115 };
9117 DenseSet<const TreeEntry *> GathersToOrders;
9118 // Find all reorderable leaf nodes with the given VF.
9119 // Currently these are vectorized loads, extracts without alternate operands +
9120 // some gathering of extracts.
9122 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9123 if (TE->State != TreeEntry::Vectorize &&
9124 TE->State != TreeEntry::StridedVectorize &&
9125 TE->State != TreeEntry::CompressVectorize &&
9126 TE->State != TreeEntry::SplitVectorize)
9127 NonVectorized.insert(TE.get());
9128 if (std::optional<OrdersType> CurrentOrder =
9129 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
9130 Queue.push(TE.get());
9131 if (!(TE->State == TreeEntry::Vectorize ||
9132 TE->State == TreeEntry::StridedVectorize ||
9133 TE->State == TreeEntry::CompressVectorize ||
9134 TE->State == TreeEntry::SplitVectorize) ||
9135 !TE->ReuseShuffleIndices.empty())
9136 GathersToOrders.insert(TE.get());
9137 }
9138 }
9139
9140 // 1. Propagate order to the graph nodes, which use only reordered nodes.
9141 // I.e., if the node has operands, that are reordered, try to make at least
9142 // one operand order in the natural order and reorder others + reorder the
9143 // user node itself.
9144 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
9145 while (!Queue.empty()) {
9146 // 1. Filter out only reordered nodes.
9147 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
9148 TreeEntry *TE = Queue.top();
9149 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
9150 Queue.pop();
9151 SmallVector<TreeEntry *> OrderedOps(1, TE);
9152 while (!Queue.empty()) {
9153 TE = Queue.top();
9154 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
9155 break;
9156 Queue.pop();
9157 OrderedOps.push_back(TE);
9158 }
9159 for (TreeEntry *TE : OrderedOps) {
9160 if (!(TE->State == TreeEntry::Vectorize ||
9161 TE->State == TreeEntry::StridedVectorize ||
9162 TE->State == TreeEntry::CompressVectorize ||
9163 TE->State == TreeEntry::SplitVectorize ||
9164 (TE->isGather() && GathersToOrders.contains(TE))) ||
9165 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
9166 !Visited.insert(TE).second)
9167 continue;
9168 // Build a map between user nodes and their operands order to speedup
9169 // search. The graph currently does not provide this dependency directly.
9170 Users.first = TE->UserTreeIndex.UserTE;
9171 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
9172 }
9173 if (Users.first) {
9174 auto &Data = Users;
9175 if (Data.first->State == TreeEntry::SplitVectorize) {
9176 assert(
9177 Data.second.size() <= 2 &&
9178 "Expected not greater than 2 operands for split vectorize node.");
9179 if (any_of(Data.second,
9180 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
9181 continue;
9182 // Update orders in user split vectorize nodes.
9183 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
9184 "Expected exactly 2 entries.");
9185 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
9186 TreeEntry &OpTE = *VectorizableTree[P.first];
9187 OrdersType Order = OpTE.ReorderIndices;
9188 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
9189 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
9190 continue;
9191 const auto BestOrder =
9192 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
9193 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
9194 continue;
9195 Order = *BestOrder;
9196 }
9197 fixupOrderingIndices(Order);
9198 SmallVector<int> Mask;
9199 inversePermutation(Order, Mask);
9200 const unsigned E = Order.size();
9201 SmallVector<int> MaskOrder(E, PoisonMaskElem);
9202 transform(Order, MaskOrder.begin(), [E](unsigned I) {
9203 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9204 });
9205 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
9206 // Clear ordering of the operand.
9207 if (!OpTE.ReorderIndices.empty()) {
9208 OpTE.ReorderIndices.clear();
9209 } else if (!OpTE.ReuseShuffleIndices.empty()) {
9210 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
9211 } else {
9212 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
9213 reorderScalars(OpTE.Scalars, Mask);
9214 }
9215 }
9216 if (Data.first->ReuseShuffleIndices.empty() &&
9217 !Data.first->ReorderIndices.empty()) {
9218 // Insert user node to the list to try to sink reordering deeper in
9219 // the graph.
9220 Queue.push(Data.first);
9221 }
9222 continue;
9223 }
9224 // Check that operands are used only in the User node.
9225 SmallVector<TreeEntry *> GatherOps;
9226 buildReorderableOperands(Data.first, Data.second, NonVectorized,
9227 GatherOps);
9228 // All operands are reordered and used only in this node - propagate the
9229 // most used order to the user node.
9232 OrdersUses;
9233 // Do the analysis for each tree entry only once, otherwise the order of
9234 // the same node my be considered several times, though might be not
9235 // profitable.
9238 for (const auto &Op : Data.second) {
9239 TreeEntry *OpTE = Op.second;
9240 if (!VisitedOps.insert(OpTE).second)
9241 continue;
9242 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
9243 continue;
9244 const auto Order = [&]() -> const OrdersType {
9245 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
9246 return getReorderingData(*OpTE, /*TopToBottom=*/false,
9247 IgnoreReorder)
9248 .value_or(OrdersType(1));
9249 return OpTE->ReorderIndices;
9250 }();
9251 // The order is partially ordered, skip it in favor of fully non-ordered
9252 // orders.
9253 if (Order.size() == 1)
9254 continue;
9255
9256 // Check that the reordering does not increase number of shuffles, i.e.
9257 // same-values-nodes has same parents or their parents has same parents.
9258 if (!Order.empty() && !isIdentityOrder(Order)) {
9259 Value *Root = OpTE->hasState()
9260 ? OpTE->getMainOp()
9261 : *find_if_not(OpTE->Scalars, isConstant);
9262 auto GetSameNodesUsers = [&](Value *Root) {
9264 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
9265 if (TE != OpTE && TE->UserTreeIndex &&
9266 TE->getVectorFactor() == OpTE->getVectorFactor() &&
9267 TE->Scalars.size() == OpTE->Scalars.size() &&
9268 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
9269 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
9270 Res.insert(TE->UserTreeIndex.UserTE);
9271 }
9272 for (const TreeEntry *TE : getTreeEntries(Root)) {
9273 if (TE != OpTE && TE->UserTreeIndex &&
9274 TE->getVectorFactor() == OpTE->getVectorFactor() &&
9275 TE->Scalars.size() == OpTE->Scalars.size() &&
9276 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
9277 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
9278 Res.insert(TE->UserTreeIndex.UserTE);
9279 }
9280 return Res.takeVector();
9281 };
9282 auto GetNumOperands = [](const TreeEntry *TE) {
9283 if (TE->State == TreeEntry::SplitVectorize)
9284 return TE->getNumOperands();
9285 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
9286 return CI->arg_size();
9287 return TE->getNumOperands();
9288 };
9289 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
9290 const TreeEntry *TE) {
9292 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
9294 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
9297 continue;
9298 const TreeEntry *Op = getOperandEntry(TE, Idx);
9299 if (Op->isGather() && Op->hasState()) {
9300 const TreeEntry *VecOp =
9301 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
9302 if (VecOp)
9303 Op = VecOp;
9304 }
9305 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
9306 return false;
9307 }
9308 return true;
9309 };
9310 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
9311 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
9312 if (!RevisitedOps.insert(UTE).second)
9313 return false;
9314 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
9315 !UTE->ReuseShuffleIndices.empty() ||
9316 (UTE->UserTreeIndex &&
9317 UTE->UserTreeIndex.UserTE == Data.first) ||
9318 (Data.first->UserTreeIndex &&
9319 Data.first->UserTreeIndex.UserTE == UTE) ||
9320 (IgnoreReorder && UTE->UserTreeIndex &&
9321 UTE->UserTreeIndex.UserTE->Idx == 0) ||
9322 NodeShouldBeReorderedWithOperands(UTE);
9323 }))
9324 continue;
9325 for (TreeEntry *UTE : Users) {
9327 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
9329 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
9332 continue;
9333 const TreeEntry *Op = getOperandEntry(UTE, Idx);
9334 Visited.erase(Op);
9335 Queue.push(const_cast<TreeEntry *>(Op));
9336 }
9337 }
9338 }
9339 unsigned NumOps = count_if(
9340 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
9341 return P.second == OpTE;
9342 });
9343 // Stores actually store the mask, not the order, need to invert.
9344 if (OpTE->State == TreeEntry::Vectorize &&
9345 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
9346 assert(!OpTE->isAltShuffle() &&
9347 "Alternate instructions are only supported by BinaryOperator "
9348 "and CastInst.");
9349 SmallVector<int> Mask;
9350 inversePermutation(Order, Mask);
9351 unsigned E = Order.size();
9352 OrdersType CurrentOrder(E, E);
9353 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
9354 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9355 });
9356 fixupOrderingIndices(CurrentOrder);
9357 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
9358 } else {
9359 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
9360 }
9361 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
9362 const auto AllowsReordering = [&](const TreeEntry *TE) {
9363 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9364 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9365 (IgnoreReorder && TE->Idx == 0))
9366 return true;
9367 if (TE->isGather()) {
9368 if (GathersToOrders.contains(TE))
9369 return !getReorderingData(*TE, /*TopToBottom=*/false,
9370 IgnoreReorder)
9371 .value_or(OrdersType(1))
9372 .empty();
9373 return true;
9374 }
9375 return false;
9376 };
9377 if (OpTE->UserTreeIndex) {
9378 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9379 if (!VisitedUsers.insert(UserTE).second)
9380 continue;
9381 // May reorder user node if it requires reordering, has reused
9382 // scalars, is an alternate op vectorize node or its op nodes require
9383 // reordering.
9384 if (AllowsReordering(UserTE))
9385 continue;
9386 // Check if users allow reordering.
9387 // Currently look up just 1 level of operands to avoid increase of
9388 // the compile time.
9389 // Profitable to reorder if definitely more operands allow
9390 // reordering rather than those with natural order.
9392 if (static_cast<unsigned>(count_if(
9393 Ops, [UserTE, &AllowsReordering](
9394 const std::pair<unsigned, TreeEntry *> &Op) {
9395 return AllowsReordering(Op.second) &&
9396 Op.second->UserTreeIndex.UserTE == UserTE;
9397 })) <= Ops.size() / 2)
9398 ++Res.first->second;
9399 }
9400 }
9401 if (OrdersUses.empty()) {
9402 Visited.insert_range(llvm::make_second_range(Data.second));
9403 continue;
9404 }
9405 // Choose the most used order.
9406 unsigned IdentityCnt = 0;
9407 unsigned VF = Data.second.front().second->getVectorFactor();
9408 OrdersType IdentityOrder(VF, VF);
9409 for (auto &Pair : OrdersUses) {
9410 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
9411 IdentityCnt += Pair.second;
9412 combineOrders(IdentityOrder, Pair.first);
9413 }
9414 }
9415 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
9416 unsigned Cnt = IdentityCnt;
9417 for (auto &Pair : OrdersUses) {
9418 // Prefer identity order. But, if filled identity found (non-empty
9419 // order) with same number of uses, as the new candidate order, we can
9420 // choose this candidate order.
9421 if (Cnt < Pair.second) {
9422 combineOrders(Pair.first, BestOrder);
9423 BestOrder = Pair.first;
9424 Cnt = Pair.second;
9425 } else {
9426 combineOrders(BestOrder, Pair.first);
9427 }
9428 }
9429 // Set order of the user node.
9430 if (isIdentityOrder(BestOrder)) {
9431 Visited.insert_range(llvm::make_second_range(Data.second));
9432 continue;
9433 }
9434 fixupOrderingIndices(BestOrder);
9435 // Erase operands from OrderedEntries list and adjust their orders.
9436 VisitedOps.clear();
9437 SmallVector<int> Mask;
9438 inversePermutation(BestOrder, Mask);
9439 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
9440 unsigned E = BestOrder.size();
9441 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
9442 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9443 });
9444 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
9445 TreeEntry *TE = Op.second;
9446 if (!VisitedOps.insert(TE).second)
9447 continue;
9448 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
9449 reorderNodeWithReuses(*TE, Mask);
9450 continue;
9451 }
9452 // Gathers are processed separately.
9453 if (TE->State != TreeEntry::Vectorize &&
9454 TE->State != TreeEntry::StridedVectorize &&
9455 TE->State != TreeEntry::CompressVectorize &&
9456 TE->State != TreeEntry::SplitVectorize &&
9457 (TE->State != TreeEntry::ScatterVectorize ||
9458 TE->ReorderIndices.empty()))
9459 continue;
9460 assert((BestOrder.size() == TE->ReorderIndices.size() ||
9461 TE->ReorderIndices.empty()) &&
9462 "Non-matching sizes of user/operand entries.");
9463 reorderOrder(TE->ReorderIndices, Mask);
9464 if (IgnoreReorder && TE == VectorizableTree.front().get())
9465 IgnoreReorder = false;
9466 }
9467 // For gathers just need to reorder its scalars.
9468 for (TreeEntry *Gather : GatherOps) {
9469 assert(Gather->ReorderIndices.empty() &&
9470 "Unexpected reordering of gathers.");
9471 if (!Gather->ReuseShuffleIndices.empty()) {
9472 // Just reorder reuses indices.
9473 reorderReuses(Gather->ReuseShuffleIndices, Mask);
9474 continue;
9475 }
9476 reorderScalars(Gather->Scalars, Mask);
9477 Visited.insert(Gather);
9478 }
9479 // Reorder operands of the user node and set the ordering for the user
9480 // node itself.
9481 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
9482 return TE.isAltShuffle() &&
9483 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9484 TE.ReorderIndices.empty());
9485 };
9486 if (Data.first->State != TreeEntry::Vectorize ||
9488 Data.first->getMainOp()) ||
9489 IsNotProfitableAltCodeNode(*Data.first))
9490 Data.first->reorderOperands(Mask);
9491 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
9492 IsNotProfitableAltCodeNode(*Data.first) ||
9493 Data.first->State == TreeEntry::CompressVectorize) {
9494 reorderScalars(Data.first->Scalars, Mask);
9495 reorderOrder(Data.first->ReorderIndices, MaskOrder,
9496 /*BottomOrder=*/true);
9497 if (Data.first->ReuseShuffleIndices.empty() &&
9498 !Data.first->ReorderIndices.empty() &&
9499 !IsNotProfitableAltCodeNode(*Data.first)) {
9500 // Insert user node to the list to try to sink reordering deeper in
9501 // the graph.
9502 Queue.push(Data.first);
9503 }
9504 } else {
9505 reorderOrder(Data.first->ReorderIndices, Mask);
9506 }
9507 }
9508 }
9509 // If the reordering is unnecessary, just remove the reorder.
9510 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
9511 VectorizableTree.front()->ReuseShuffleIndices.empty())
9512 VectorizableTree.front()->ReorderIndices.clear();
9513}
9514
9515Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
9516 if (Entry.hasState() &&
9517 (Entry.getOpcode() == Instruction::Store ||
9518 Entry.getOpcode() == Instruction::Load) &&
9519 Entry.State == TreeEntry::StridedVectorize &&
9520 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
9521 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
9522 return dyn_cast<Instruction>(Entry.Scalars.front());
9523}
9524
// Collects, into ExternalUses, every vectorized scalar that still has to be
// extracted from its vector because something outside the tree uses it.
// ExternallyUsedValues lists scalars that must be treated as externally used
// regardless of their visible users.
// NOTE(review): the first line of this signature (return type and function
// name) is missing from this listing; only the last parameter is visible.
9526 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// Use-count threshold for the "too many users" shortcut below.
9527 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
// Maps a scalar to the index of the ExternalUses record created for it.
9528 DenseMap<Value *, unsigned> ScalarToExtUses;
9529 // Collect the values that we need to extract from the tree.
9530 for (auto &TEPtr : VectorizableTree) {
9531 TreeEntry *Entry = TEPtr.get();
9532
9533 // No need to handle users of gathered values.
// Split-vectorize, deleted and transformed-to-gather nodes are skipped too.
9534 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9535 DeletedNodes.contains(Entry) ||
9536 TransformedToGatherNodes.contains(Entry))
9537 continue;
9538
9539 // For each lane:
9540 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9541 Value *Scalar = Entry->Scalars[Lane];
// Only real instructions that are not copyable elements of this node are
// considered.
9542 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
9543 continue;
9544
9545 // All uses must be replaced already? No need to do it again.
// A record with a null User already covers every use of the scalar.
9546 auto It = ScalarToExtUses.find(Scalar);
9547 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
9548 continue;
9549
// Too many uses to inspect one by one: record a single user-less extract.
9550 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
9551 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9552 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
9553 << " from " << *Scalar << "for many users.\n");
9554 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9555 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9556 ExternalUsesWithNonUsers.insert(Scalar);
9557 continue;
9558 }
9559
9560 // Check if the scalar is externally used as an extra arg.
9561 const auto ExtI = ExternallyUsedValues.find(Scalar);
9562 if (ExtI != ExternallyUsedValues.end()) {
9563 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9564 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
9565 << FoundLane << " from " << *Scalar << ".\n");
9566 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
9567 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9568 continue;
9569 }
// Otherwise inspect the users one by one.
9570 for (User *U : Scalar->users()) {
9571 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
9572
// Non-instruction users and users already marked deleted are not counted.
9573 Instruction *UserInst = dyn_cast<Instruction>(U);
9574 if (!UserInst || isDeleted(UserInst))
9575 continue;
9576
9577 // Ignore users in the user ignore list.
9578 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
9579 continue;
9580
9581 // Skip in-tree scalars that become vectors
// The user counts as in-tree only if at least one of its entries is neither
// deleted nor transformed to a gather.
9582 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
9583 any_of(UseEntries, [this](const TreeEntry *UseEntry) {
9584 return !DeletedNodes.contains(UseEntry) &&
9585 !TransformedToGatherNodes.contains(UseEntry);
9586 })) {
9587 // Some in-tree scalars will remain as scalar in vectorized
9588 // instructions. If that is the case, the one in FoundLane will
9589 // be used.
// The extra per-entry check only runs for calls and for pointer operands of
// loads/stores; any other in-tree user is treated as internal right away.
9590 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9591 isa<LoadInst, StoreInst>(UserInst)) ||
9592 isa<CallInst>(UserInst)) ||
9593 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9594 if (DeletedNodes.contains(UseEntry) ||
9595 TransformedToGatherNodes.contains(UseEntry))
9596 return true;
// NOTE(review): the line naming the callee of the call that follows the `||`
// below is missing from this listing.
9597 return UseEntry->State == TreeEntry::ScatterVectorize ||
9599 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9600 TTI);
9601 })) {
9602 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
9603 << ".\n");
9604 assert(none_of(UseEntries,
9605 [](TreeEntry *UseEntry) {
9606 return UseEntry->isGather();
9607 }) &&
9608 "Bad state");
9609 continue;
9610 }
// The in-tree user still needs the scalar: fall back to a user-less record.
// If one already exists for this scalar, clear its User and stop scanning.
9611 U = nullptr;
9612 if (It != ScalarToExtUses.end()) {
9613 ExternalUses[It->second].User = nullptr;
9614 break;
9615 }
9616 }
9617
// With UsesLimit or more uses, a single user-less record is emitted instead
// of one record per user.
9618 if (U && Scalar->hasNUsesOrMore(UsesLimit))
9619 U = nullptr;
9620 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9621 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
9622 << " from lane " << FoundLane << " from " << *Scalar
9623 << ".\n");
9624 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9625 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9626 ExternalUsesWithNonUsers.insert(Scalar);
// A user-less record covers all remaining users of this scalar.
9627 if (!U)
9628 break;
9629 }
9630 }
9631 }
9632}
9633
// Groups the simple store users of TE's scalars by (parent block, stored
// value type, underlying pointer object) and returns one list of stores per
// group. Each group receives at most one store per lane.
// NOTE(review): the return-type line above this one and the type of
// PtrToStoresMap (declared just below) are missing from this listing.
9635BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
9638 PtrToStoresMap;
9639 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
9640 Value *V = TE->Scalars[Lane];
9641 // Don't iterate over the users of constant data.
9642 if (!isa<Instruction>(V))
9643 continue;
9644 // To save compilation time we don't visit if we have too many users.
// Note that this stops the scan of all remaining lanes, not just this one.
9645 if (V->hasNUsesOrMore(UsesLimit))
9646 break;
9647
9648 // Collect stores per pointer object.
9649 for (User *U : V->users()) {
9650 auto *SI = dyn_cast<StoreInst>(U);
9651 // Test whether we can handle the store. V might be a global, which could
9652 // be used in a different function.
9653 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
9654 !isValidElementType(SI->getValueOperand()->getType()))
9655 continue;
9656 // Skip entry if already
9657 if (isVectorized(U))
9658 continue;
9659
9660 Value *Ptr =
9661 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
9662 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
9663 SI->getValueOperand()->getType(), Ptr}];
9664 // For now just keep one store per pointer object per lane.
9665 // TODO: Extend this to support multiple stores per pointer per lane
// A group that already holds more than Lane stores has its store for this
// lane.
9666 if (StoresVec.size() > Lane)
9667 continue;
// Every store added after the first must have a computable pointer distance
// to the group's first store.
9668 if (!StoresVec.empty()) {
9669 std::optional<int64_t> Diff = getPointersDiff(
9670 SI->getValueOperand()->getType(), SI->getPointerOperand(),
9671 SI->getValueOperand()->getType(),
9672 StoresVec.front()->getPointerOperand(), *DL, *SE,
9673 /*StrictCheck=*/true);
9674 // We failed to compare the pointers so just abandon this store.
9675 if (!Diff)
9676 continue;
9677 }
9678 StoresVec.push_back(SI);
9679 }
9680 }
// Move each group's store list into the result, in the map's iteration order.
9681 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
9682 unsigned I = 0;
9683 for (auto &P : PtrToStoresMap) {
9684 Res[I].swap(P.second);
9685 ++I;
9686 }
9687 return Res;
9688}
9689
9690bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9691 OrdersType &ReorderIndices) const {
9692 // We check whether the stores in StoreVec can form a vector by sorting them
9693 // and checking whether they are consecutive.
9694
9695 // To avoid calling getPointersDiff() while sorting we create a vector of
9696 // pairs {store, offset from first} and sort this instead.
9698 StoreInst *S0 = StoresVec[0];
9699 StoreOffsetVec.emplace_back(0, 0);
9700 Type *S0Ty = S0->getValueOperand()->getType();
9701 Value *S0Ptr = S0->getPointerOperand();
9702 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
9703 StoreInst *SI = StoresVec[Idx];
9704 std::optional<int64_t> Diff =
9705 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
9706 SI->getPointerOperand(), *DL, *SE,
9707 /*StrictCheck=*/true);
9708 StoreOffsetVec.emplace_back(*Diff, Idx);
9709 }
9710
9711 // Check if the stores are consecutive by checking if their difference is 1.
9712 if (StoreOffsetVec.size() != StoresVec.size())
9713 return false;
9714 sort(StoreOffsetVec, llvm::less_first());
9715 unsigned Idx = 0;
9716 int64_t PrevDist = 0;
9717 for (const auto &P : StoreOffsetVec) {
9718 if (Idx > 0 && P.first != PrevDist + 1)
9719 return false;
9720 PrevDist = P.first;
9721 ++Idx;
9722 }
9723
9724 // Calculate the shuffle indices according to their offset against the sorted
9725 // StoreOffsetVec.
9726 ReorderIndices.assign(StoresVec.size(), 0);
9727 bool IsIdentity = true;
9728 for (auto [I, P] : enumerate(StoreOffsetVec)) {
9729 ReorderIndices[P.second] = I;
9730 IsIdentity &= P.second == I;
9731 }
9732 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9733 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9734 // same convention here.
9735 if (IsIdentity)
9736 ReorderIndices.clear();
9737
9738 return true;
9739}
9740
9741#ifndef NDEBUG
// Debug-build-only helper: prints every index in Order to dbgs(), each
// followed by ", ", and ends the line with a newline.
// NOTE(review): the line carrying this function's signature is missing from
// this listing; only the body is visible.
9743 for (unsigned Idx : Order)
9744 dbgs() << Idx << ", ";
9745 dbgs() << "\n";
9746}
9747#endif
9748
9750BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9751 unsigned NumLanes = TE->Scalars.size();
9752
9753 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9754
9755 // Holds the reorder indices for each candidate store vector that is a user of
9756 // the current TreeEntry.
9757 SmallVector<OrdersType, 1> ExternalReorderIndices;
9758
9759 // Now inspect the stores collected per pointer and look for vectorization
9760 // candidates. For each candidate calculate the reorder index vector and push
9761 // it into `ExternalReorderIndices`
9762 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9763 // If we have fewer than NumLanes stores, then we can't form a vector.
9764 if (StoresVec.size() != NumLanes)
9765 continue;
9766
9767 // If the stores are not consecutive then abandon this StoresVec.
9768 OrdersType ReorderIndices;
9769 if (!canFormVector(StoresVec, ReorderIndices))
9770 continue;
9771
9772 // We now know that the scalars in StoresVec can form a vector instruction,
9773 // so set the reorder indices.
9774 ExternalReorderIndices.push_back(ReorderIndices);
9775 }
9776 return ExternalReorderIndices;
9777}
9778
// Rebuilds the vectorization tree from Roots, remembering UserIgnoreLst as the
// set of users to ignore. Any previous tree is deleted first. Nothing is built
// when the roots do not all share one type.
// NOTE(review): the first line of this signature (return type, function name
// and the Roots parameter) is missing from this listing.
9780 const SmallDenseSet<Value *> &UserIgnoreLst) {
9781 deleteTree();
9782 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9783 "TreeEntryToStridedPtrInfoMap is not cleared");
// Only the address is stored, so UserIgnoreLst has to outlive its uses.
9784 UserIgnoreList = &UserIgnoreLst;
9785 if (!allSameType(Roots))
9786 return;
// Recursive build starting at depth 0 with a default-constructed EdgeInfo.
9787 buildTreeRec(Roots, 0, EdgeInfo());
9788}
9789
// Rebuilds the vectorization tree from Roots without touching UserIgnoreList.
// Any previous tree is deleted first. Nothing is built when the roots do not
// all share one type.
// NOTE(review): the line carrying this function's signature is missing from
// this listing; only the body is visible.
9791 deleteTree();
9792 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9793 "TreeEntryToStridedPtrInfoMap is not cleared");
9794 if (!allSameType(Roots))
9795 return;
// Recursive build starting at depth 0 with a default-constructed EdgeInfo.
9796 buildTreeRec(Roots, 0, EdgeInfo());
9797}
9798
9799/// Tries to find subvector of loads and builds new vector of only loads if can
9800/// be profitable.
// Works in two phases: first the loads of VL are clustered by their pointer
// distance to the first load of a cluster, then every cluster is merged into
// matching groups of GatheredLoads or appended as new groups.
// NOTE(review): several lines of this definition are missing from this
// listing, including the one with the function name, some parameters (SE is
// used below but its declaration is not visible) and the declarations of
// ClusteredLoads, DataLoads and parts of the lambda signatures.
9802 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9804 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9805 bool AddNew = true) {
9806 if (VL.empty())
9807 return;
9808 Type *ScalarTy = getValueType(VL.front());
9809 if (!isValidElementType(ScalarTy))
9810 return;
// Per cluster: distance -> load, used to detect an already occupied distance.
9812 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
// Phase 1: put each usable load into the first cluster it fits in.
9813 for (Value *V : VL) {
9814 auto *LI = dyn_cast<LoadInst>(V);
9815 if (!LI)
9816 continue;
// Deleted, already vectorized and non-simple loads are not candidates.
9817 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9818 continue;
9819 bool IsFound = false;
9820 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9821 assert(LI->getParent() == Data.front().first->getParent() &&
9822 LI->getType() == Data.front().first->getType() &&
9823 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9824 getUnderlyingObject(Data.front().first->getPointerOperand(),
9826 "Expected loads with the same type, same parent and same "
9827 "underlying pointer.");
// Distance from this load to the cluster's first load; unknown means the load
// does not belong to this cluster.
9828 std::optional<int64_t> Dist = getPointersDiff(
9829 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9830 Data.front().first->getPointerOperand(), DL, SE,
9831 /*StrictCheck=*/true);
9832 if (!Dist)
9833 continue;
// A different load already sits at this distance: try the next cluster.
9834 auto It = Map.find(*Dist);
9835 if (It != Map.end() && It->second != LI)
9836 continue;
9837 if (It == Map.end()) {
9838 Data.emplace_back(LI, *Dist);
9839 Map.try_emplace(*Dist, LI);
9840 }
9841 IsFound = true;
9842 break;
9843 }
// No cluster accepted the load: it starts a new one at distance 0.
9844 if (!IsFound) {
9845 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9846 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9847 }
9848 }
// Searches GatheredLoads, from index Start on, for a group the cluster Loads
// can be merged into. On success returns an iterator to that group, stores
// the distance between the two first loads in Offset, advances Start past the
// group and leaves in ToAdd the indices of Loads that are new to the group.
// Returns GatheredLoads.end() with ToAdd cleared when nothing matches.
9849 auto FindMatchingLoads =
9852 &GatheredLoads,
9854 int64_t &Offset, unsigned &Start) {
9855 if (Loads.empty())
9856 return GatheredLoads.end();
9857 LoadInst *LI = Loads.front().first;
9858 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9859 if (Idx < Start)
9860 continue;
9861 ToAdd.clear();
9862 if (LI->getParent() != Data.front().first->getParent() ||
9863 LI->getType() != Data.front().first->getType())
9864 continue;
9865 std::optional<int64_t> Dist =
9867 Data.front().first->getType(),
9868 Data.front().first->getPointerOperand(), DL, SE,
9869 /*StrictCheck=*/true);
9870 if (!Dist)
9871 continue;
// Distances and loads already present in the candidate group.
9872 SmallSet<int64_t, 4> DataDists;
9874 for (std::pair<LoadInst *, int64_t> P : Data) {
9875 DataDists.insert(P.second);
9876 DataLoads.insert(P.first);
9877 }
9878 // Found matching gathered loads - check if all loads are unique or
9879 // can be effectively vectorized.
// A load is new when neither it nor its rebased distance is in the group;
// loads already in the group are recorded in Repeated.
9880 unsigned NumUniques = 0;
9881 for (auto [Cnt, Pair] : enumerate(Loads)) {
9882 bool Used = DataLoads.contains(Pair.first);
9883 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9884 ++NumUniques;
9885 ToAdd.insert(Cnt);
9886 } else if (Used) {
9887 Repeated.insert(Cnt);
9888 }
9889 }
// Accept when every load is new, or when at least two loads and at least half
// of them are not new and adding the new ones yields a power-of-two size or
// crosses into the next power-of-two bucket.
9890 if (NumUniques > 0 &&
9891 (Loads.size() == NumUniques ||
9892 (Loads.size() - NumUniques >= 2 &&
9893 Loads.size() - NumUniques >= Loads.size() / 2 &&
9894 (has_single_bit(Data.size() + NumUniques) ||
9895 bit_ceil(Data.size()) <
9896 bit_ceil(Data.size() + NumUniques))))) {
9897 Offset = *Dist;
9898 Start = Idx + 1;
9899 return std::next(GatheredLoads.begin(), Idx);
9900 }
9901 }
9902 ToAdd.clear();
9903 return GatheredLoads.end();
9904 };
// Phase 2: merge every cluster into all matching groups of GatheredLoads.
9905 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9906 unsigned Start = 0;
9907 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9908 int64_t Offset = 0;
9909 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9910 Offset, Start);
9911 while (It != GatheredLoads.end()) {
9912 assert(!LocalToAdd.empty() && "Expected some elements to add.");
// Distances are rebased by Offset so they are relative to the group's first
// load.
9913 for (unsigned Idx : LocalToAdd)
9914 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9915 ToAdd.insert_range(LocalToAdd);
9916 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9917 Start);
9918 }
// Some loads of the cluster were neither added to a group nor already there.
9919 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9920 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9921 })) {
// Appends to Loads every entry of the cluster that is in neither ToAdd nor
// Repeated.
9922 auto AddNewLoads =
9924 for (unsigned Idx : seq<unsigned>(Data.size())) {
9925 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9926 continue;
9927 Loads.push_back(Data[Idx]);
9928 }
9929 };
// When AddNew is false, the leftovers are also appended to every existing
// group with the same parent block and load type.
9930 if (!AddNew) {
9931 LoadInst *LI = Data.front().first;
9932 It = find_if(
9933 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9934 return PD.front().first->getParent() == LI->getParent() &&
9935 PD.front().first->getType() == LI->getType();
9936 });
9937 while (It != GatheredLoads.end()) {
9938 AddNewLoads(*It);
9939 It = std::find_if(
9940 std::next(It), GatheredLoads.end(),
9941 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9942 return PD.front().first->getParent() == LI->getParent() &&
9943 PD.front().first->getType() == LI->getType();
9944 });
9945 }
9946 }
// Two new groups are appended regardless of AddNew: the whole cluster, and
// then only its leftover loads.
9947 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9948 AddNewLoads(GatheredLoads.emplace_back());
9949 }
9950 }
9951}
9952
9953void BoUpSLP::tryToVectorizeGatheredLoads(
9954 const SmallMapVector<
9955 std::tuple<BasicBlock *, Value *, Type *>,
9956 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9957 &GatheredLoads) {
9958 GatheredLoadsEntriesFirst = VectorizableTree.size();
9959
9960 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9961 LoadEntriesToVectorize.size());
9962 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9963 Set.insert_range(VectorizableTree[Idx]->Scalars);
9964
9965 // Sort loads by distance.
9966 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9967 const std::pair<LoadInst *, int64_t> &L2) {
9968 return L1.second > L2.second;
9969 };
9970
9971 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9972 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9973 Loads.size());
9974 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9975 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9976 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9977 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9978 };
9979
9980 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9981 BoUpSLP::ValueSet &VectorizedLoads,
9982 SmallVectorImpl<LoadInst *> &NonVectorized,
9983 bool Final, unsigned MaxVF) {
9985 unsigned StartIdx = 0;
9986 SmallVector<int> CandidateVFs;
9987 if (isAllowedNonPowerOf2VF(MaxVF))
9988 CandidateVFs.push_back(MaxVF);
9989 for (int NumElts = getFloorFullVectorNumberOfElements(
9990 *TTI, Loads.front()->getType(), MaxVF);
9991 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9992 *TTI, Loads.front()->getType(), NumElts - 1)) {
9993 CandidateVFs.push_back(NumElts);
9994 if (VectorizeNonPowerOf2 && NumElts > 2)
9995 CandidateVFs.push_back(NumElts - 1);
9996 }
9997
9998 if (Final && CandidateVFs.empty())
9999 return Results;
10000
10001 unsigned BestVF = Final ? CandidateVFs.back() : 0;
10002 for (unsigned NumElts : CandidateVFs) {
10003 if (Final && NumElts > BestVF)
10004 continue;
10005 SmallVector<unsigned> MaskedGatherVectorized;
10006 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
10007 ++Cnt) {
10008 ArrayRef<LoadInst *> Slice =
10009 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
10010 if (VectorizedLoads.count(Slice.front()) ||
10011 VectorizedLoads.count(Slice.back()) ||
10013 continue;
10014 // Check if it is profitable to try vectorizing gathered loads. It is
10015 // profitable if we have more than 3 consecutive loads or if we have
10016 // less but all users are vectorized or deleted.
10017 bool AllowToVectorize = false;
10018 // Check if it is profitable to vectorize 2-elements loads.
10019 if (NumElts == 2) {
10020 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
10021 Slice.front()->getType(), ElementCount::getFixed(NumElts));
10022 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
10023 for (LoadInst *LI : Slice) {
10024 // If single use/user - allow to vectorize.
10025 if (LI->hasOneUse())
10026 continue;
10027 // 1. Check if number of uses equals number of users.
10028 // 2. All users are deleted.
10029 // 3. The load broadcasts are not allowed or the load is not
10030 // broadcasted.
10031 if (static_cast<unsigned int>(std::distance(
10032 LI->user_begin(), LI->user_end())) != LI->getNumUses())
10033 return false;
10034 if (!IsLegalBroadcastLoad)
10035 continue;
10036 if (LI->hasNUsesOrMore(UsesLimit))
10037 return false;
10038 for (User *U : LI->users()) {
10039 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
10040 continue;
10041 for (const TreeEntry *UTE : getTreeEntries(U)) {
10042 for (int I : seq<int>(UTE->getNumOperands())) {
10043 if (all_of(UTE->getOperand(I), [LI](Value *V) {
10044 return V == LI || isa<PoisonValue>(V);
10045 }))
10046 // Found legal broadcast - do not vectorize.
10047 return false;
10048 }
10049 }
10050 }
10051 }
10052 return true;
10053 };
10054 AllowToVectorize = CheckIfAllowed(Slice);
10055 } else {
10056 AllowToVectorize =
10057 NumElts >= 3 ||
10058 any_of(ValueToGatherNodes.at(Slice.front()),
10059 [=](const TreeEntry *TE) {
10060 return TE->Scalars.size() == 2 &&
10061 ((TE->Scalars.front() == Slice.front() &&
10062 TE->Scalars.back() == Slice.back()) ||
10063 (TE->Scalars.front() == Slice.back() &&
10064 TE->Scalars.back() == Slice.front()));
10065 });
10066 }
10067 if (AllowToVectorize) {
10068 SmallVector<Value *> PointerOps;
10069 OrdersType CurrentOrder;
10070 // Try to build vector load.
10071 ArrayRef<Value *> Values(
10072 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
10073 StridedPtrInfo SPtrInfo;
10074 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
10075 PointerOps, SPtrInfo, &BestVF);
10076 if (LS != LoadsState::Gather ||
10077 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
10078 if (LS == LoadsState::ScatterVectorize) {
10079 if (MaskedGatherVectorized.empty() ||
10080 Cnt >= MaskedGatherVectorized.back() + NumElts)
10081 MaskedGatherVectorized.push_back(Cnt);
10082 continue;
10083 }
10084 if (LS != LoadsState::Gather) {
10085 Results.emplace_back(Values, LS);
10086 VectorizedLoads.insert_range(Slice);
10087 // If we vectorized initial block, no need to try to vectorize it
10088 // again.
10089 if (Cnt == StartIdx)
10090 StartIdx += NumElts;
10091 }
10092 // Check if the whole array was vectorized already - exit.
10093 if (StartIdx >= Loads.size())
10094 break;
10095 // Erase last masked gather candidate, if another candidate within
10096 // the range is found to be better.
10097 if (!MaskedGatherVectorized.empty() &&
10098 Cnt < MaskedGatherVectorized.back() + NumElts)
10099 MaskedGatherVectorized.pop_back();
10100 Cnt += NumElts - 1;
10101 continue;
10102 }
10103 }
10104 if (!AllowToVectorize || BestVF == 0)
10106 }
10107 // Mark masked gathers candidates as vectorized, if any.
10108 for (unsigned Cnt : MaskedGatherVectorized) {
10109 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
10110 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
10111 ArrayRef<Value *> Values(
10112 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
10113 Results.emplace_back(Values, LoadsState::ScatterVectorize);
10114 VectorizedLoads.insert_range(Slice);
10115 // If we vectorized initial block, no need to try to vectorize it again.
10116 if (Cnt == StartIdx)
10117 StartIdx += NumElts;
10118 }
10119 }
10120 for (LoadInst *LI : Loads) {
10121 if (!VectorizedLoads.contains(LI))
10122 NonVectorized.push_back(LI);
10123 }
10124 return Results;
10125 };
10126 auto ProcessGatheredLoads =
10127 [&, &TTI = *TTI](
10129 bool Final = false) {
10130 SmallVector<LoadInst *> NonVectorized;
10131 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
10132 GatheredLoads) {
10133 if (LoadsDists.size() <= 1) {
10134 NonVectorized.push_back(LoadsDists.back().first);
10135 continue;
10136 }
10138 LoadsDists);
10139 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
10140 stable_sort(LocalLoadsDists, LoadSorter);
10142 unsigned MaxConsecutiveDistance = 0;
10143 unsigned CurrentConsecutiveDist = 1;
10144 int64_t LastDist = LocalLoadsDists.front().second;
10145 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
10146 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
10147 if (isVectorized(L.first))
10148 continue;
10149 assert(LastDist >= L.second &&
10150 "Expected first distance always not less than second");
10151 if (static_cast<uint64_t>(LastDist - L.second) ==
10152 CurrentConsecutiveDist) {
10153 ++CurrentConsecutiveDist;
10154 MaxConsecutiveDistance =
10155 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
10156 Loads.push_back(L.first);
10157 continue;
10158 }
10159 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
10160 !Loads.empty())
10161 Loads.pop_back();
10162 CurrentConsecutiveDist = 1;
10163 LastDist = L.second;
10164 Loads.push_back(L.first);
10165 }
10166 if (Loads.size() <= 1)
10167 continue;
10168 if (AllowMaskedGather)
10169 MaxConsecutiveDistance = Loads.size();
10170 else if (MaxConsecutiveDistance < 2)
10171 continue;
10172 BoUpSLP::ValueSet VectorizedLoads;
10173 SmallVector<LoadInst *> SortedNonVectorized;
10175 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
10176 Final, MaxConsecutiveDistance);
10177 if (!Results.empty() && !SortedNonVectorized.empty() &&
10178 OriginalLoads.size() == Loads.size() &&
10179 MaxConsecutiveDistance == Loads.size() &&
10181 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
10182 return P.second == LoadsState::ScatterVectorize;
10183 })) {
10184 VectorizedLoads.clear();
10185 SmallVector<LoadInst *> UnsortedNonVectorized;
10187 UnsortedResults =
10188 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
10189 UnsortedNonVectorized, Final,
10190 OriginalLoads.size());
10191 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
10192 SortedNonVectorized.swap(UnsortedNonVectorized);
10193 Results.swap(UnsortedResults);
10194 }
10195 }
10196 for (auto [Slice, _] : Results) {
10197 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
10198 << Slice.size() << ")\n");
10199 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
10200 for (Value *L : Slice)
10201 if (!isVectorized(L))
10202 SortedNonVectorized.push_back(cast<LoadInst>(L));
10203 continue;
10204 }
10205
10206 // Select maximum VF as a maximum of user gathered nodes and
10207 // distance between scalar loads in these nodes.
10208 unsigned MaxVF = Slice.size();
10209 unsigned UserMaxVF = 0;
10210 unsigned InterleaveFactor = 0;
10211 if (MaxVF == 2) {
10212 UserMaxVF = MaxVF;
10213 } else {
10214 // Found distance between segments of the interleaved loads.
10215 std::optional<unsigned> InterleavedLoadsDistance = 0;
10216 unsigned Order = 0;
10217 std::optional<unsigned> CommonVF = 0;
10218 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
10219 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
10220 for (auto [Idx, V] : enumerate(Slice)) {
10221 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
10222 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
10223 unsigned Pos =
10224 EntryToPosition.try_emplace(E, Idx).first->second;
10225 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
10226 if (CommonVF) {
10227 if (*CommonVF == 0) {
10228 CommonVF = E->Scalars.size();
10229 continue;
10230 }
10231 if (*CommonVF != E->Scalars.size())
10232 CommonVF.reset();
10233 }
10234 // Check if the load is the part of the interleaved load.
10235 if (Pos != Idx && InterleavedLoadsDistance) {
10236 if (!DeinterleavedNodes.contains(E) &&
10237 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
10238 if (isa<Constant>(V))
10239 return false;
10240 if (isVectorized(V))
10241 return true;
10242 const auto &Nodes = ValueToGatherNodes.at(V);
10243 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
10244 !is_contained(Slice, V);
10245 })) {
10246 InterleavedLoadsDistance.reset();
10247 continue;
10248 }
10249 DeinterleavedNodes.insert(E);
10250 if (*InterleavedLoadsDistance == 0) {
10251 InterleavedLoadsDistance = Idx - Pos;
10252 continue;
10253 }
10254 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
10255 (Idx - Pos) / *InterleavedLoadsDistance < Order)
10256 InterleavedLoadsDistance.reset();
10257 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
10258 }
10259 }
10260 }
10261 DeinterleavedNodes.clear();
10262 // Check if the large load represents interleaved load operation.
10263 if (InterleavedLoadsDistance.value_or(0) > 1 &&
10264 CommonVF.value_or(0) != 0) {
10265 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
10266 unsigned VF = *CommonVF;
10267 OrdersType Order;
10268 SmallVector<Value *> PointerOps;
10269 StridedPtrInfo SPtrInfo;
10270 // Segmented load detected - vectorize at maximum vector factor.
10271 if (InterleaveFactor <= Slice.size() &&
10272 TTI.isLegalInterleavedAccessType(
10273 getWidenedType(Slice.front()->getType(), VF),
10274 InterleaveFactor,
10275 cast<LoadInst>(Slice.front())->getAlign(),
10276 cast<LoadInst>(Slice.front())
10278 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
10279 SPtrInfo) == LoadsState::Vectorize) {
10280 UserMaxVF = InterleaveFactor * VF;
10281 } else {
10282 InterleaveFactor = 0;
10283 }
10284 }
10285 // Cannot represent the loads as consecutive vectorizable nodes -
10286 // just exit.
10287 unsigned ConsecutiveNodesSize = 0;
10288 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
10289 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10290 [&, Slice = Slice](const auto &P) {
10291 const auto *It = find_if(Slice, [&](Value *V) {
10292 return std::get<1>(P).contains(V);
10293 });
10294 if (It == Slice.end())
10295 return false;
10296 const TreeEntry &TE =
10297 *VectorizableTree[std::get<0>(P)];
10298 ArrayRef<Value *> VL = TE.Scalars;
10299 OrdersType Order;
10300 SmallVector<Value *> PointerOps;
10301 StridedPtrInfo SPtrInfo;
10303 VL, VL.front(), Order, PointerOps, SPtrInfo);
10304 if (State == LoadsState::ScatterVectorize ||
10306 return false;
10307 ConsecutiveNodesSize += VL.size();
10308 size_t Start = std::distance(Slice.begin(), It);
10309 size_t Sz = Slice.size() - Start;
10310 return Sz < VL.size() ||
10311 Slice.slice(Start, VL.size()) != VL;
10312 }))
10313 continue;
10314 // Try to build long masked gather loads.
10315 UserMaxVF = bit_ceil(UserMaxVF);
10316 if (InterleaveFactor == 0 &&
10317 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
10318 [&, Slice = Slice](unsigned Idx) {
10319 OrdersType Order;
10320 SmallVector<Value *> PointerOps;
10321 StridedPtrInfo SPtrInfo;
10322 return canVectorizeLoads(
10323 Slice.slice(Idx * UserMaxVF, UserMaxVF),
10324 Slice[Idx * UserMaxVF], Order, PointerOps,
10325 SPtrInfo) == LoadsState::ScatterVectorize;
10326 }))
10327 UserMaxVF = MaxVF;
10328 if (Slice.size() != ConsecutiveNodesSize)
10329 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
10330 }
10331 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10332 bool IsVectorized = true;
10333 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
10334 ArrayRef<Value *> SubSlice =
10335 Slice.slice(I, std::min(VF, E - I));
10336 if (isVectorized(SubSlice.front()))
10337 continue;
10338 // Check if the subslice is to be-vectorized entry, which is not
10339 // equal to entry.
10340 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10341 [&](const auto &P) {
10342 return !SubSlice.equals(
10343 VectorizableTree[std::get<0>(P)]
10344 ->Scalars) &&
10345 set_is_subset(SubSlice, std::get<1>(P));
10346 }))
10347 continue;
10348 unsigned Sz = VectorizableTree.size();
10349 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
10350 if (Sz == VectorizableTree.size()) {
10351 IsVectorized = false;
10352 // Try non-interleaved vectorization with smaller vector
10353 // factor.
10354 if (InterleaveFactor > 0) {
10355 VF = 2 * (MaxVF / InterleaveFactor);
10356 InterleaveFactor = 0;
10357 }
10358 continue;
10359 }
10360 }
10361 if (IsVectorized)
10362 break;
10363 }
10364 }
10365 NonVectorized.append(SortedNonVectorized);
10366 }
10367 return NonVectorized;
10368 };
10369 for (const auto &GLs : GatheredLoads) {
10370 const auto &Ref = GLs.second;
10371 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
10372 if (!Ref.empty() && !NonVectorized.empty() &&
10373 accumulate(
10374 Ref, 0u,
10375 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10376 -> unsigned { return S + LoadsDists.size(); }) !=
10377 NonVectorized.size() &&
10378 IsMaskedGatherSupported(NonVectorized)) {
10380 FinalGatheredLoads;
10381 for (LoadInst *LI : NonVectorized) {
10382 // Reinsert non-vectorized loads to other list of loads with the same
10383 // base pointers.
10384 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
10385 FinalGatheredLoads,
10386 /*AddNew=*/false);
10387 }
10388 // Final attempt to vectorize non-vectorized loads.
10389 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
10390 }
10391 }
10392 // Try to vectorize postponed load entries, previously marked as gathered.
10393 for (unsigned Idx : LoadEntriesToVectorize) {
10394 const TreeEntry &E = *VectorizableTree[Idx];
10395 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
10396 // Avoid reordering, if possible.
10397 if (!E.ReorderIndices.empty()) {
10398 // Build a mask out of the reorder indices and reorder scalars per this
10399 // mask.
10400 SmallVector<int> ReorderMask;
10401 inversePermutation(E.ReorderIndices, ReorderMask);
10402 reorderScalars(GatheredScalars, ReorderMask);
10403 }
10404 buildTreeRec(GatheredScalars, 0, EdgeInfo());
10405 }
10406 // If no new entries created, consider it as no gathered loads entries must be
10407 // handled.
10408 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10409 VectorizableTree.size())
10410 GatheredLoadsEntriesFirst.reset();
10411}
10412
10413/// Generates key/subkey pair for the given value to provide effective sorting
10414/// of the values and better detection of the vectorizable values sequences. The
10415/// keys/subkeys can be used for better sorting of the values themselves (keys)
10416/// and in values subgroups (subkeys).
///
/// \param V Value to compute the hashes for.
/// \param TLI Forwarded to the recursive call made for cast operands; any
/// other use would be on lines not present in this listing.
/// \param LoadsSubkeyGenerator Invoked as (Key, LI) for simple loads; its
/// result is hashed to form the subkey.
/// \param AllowAlternate When true, instructions handled by the
/// isValidForAlternation branch get a key that only distinguishes binary
/// operators from everything else; when false the opcode is mixed into the
/// key instead.
/// \returns The pair {Key, SubKey}.
///
/// NOTE(review): this listing skips several source lines (the embedded
/// numbering jumps, e.g. 10431 -> 10433, 10441 -> 10443, 10469 -> 10472).
/// Conditions that live on those lines are deliberately not described here.
10417static std::pair<size_t, size_t> generateKeySubkey(
10418 Value *V, const TargetLibraryInfo *TLI,
10419 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
10420 bool AllowAlternate) {
// Defaults: the key starts from the value ID (offset by 2) and the subkey
// from the hash of 0; the branches below refine one or both.
10421 hash_code Key = hash_value(V->getValueID() + 2);
10422 hash_code SubKey = hash_value(0);
10423 // Sort the loads by the distance between the pointers.
10424 if (auto *LI = dyn_cast<LoadInst>(V)) {
10425 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
10426 if (LI->isSimple())
10427 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
10428 else
// Non-simple load: both key and subkey become the hash of the load itself.
10429 Key = SubKey = hash_value(LI);
10430 } else if (isVectorLikeInstWithConstOps(V)) {
10431 // Sort extracts by the vector operands.
// NOTE(review): the line guarding the next assignment (10432) is missing
// from this listing.
10433 Key = hash_value(Value::UndefValueVal + 1);
10434 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
// Only extracts whose vector operand is not entirely undef and whose index
// is not undef are grouped by their vector operand.
10435 if (!isUndefVector(EI->getVectorOperand()).all() &&
10436 !isa<UndefValue>(EI->getIndexOperand()))
10437 SubKey = hash_value(EI->getVectorOperand());
10438 }
10439 } else if (auto *I = dyn_cast<Instruction>(V)) {
10440 // Sort other instructions just by the opcodes except for CMPInst.
10441 // For CMP also sort by the predicate kind.
// NOTE(review): the first part of the following condition (10442) is missing
// from this listing.
10443 isValidForAlternation(I->getOpcode())) {
10444 if (AllowAlternate)
10445 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
10446 else
10447 Key = hash_combine(hash_value(I->getOpcode()), Key);
10448 SubKey = hash_combine(
10449 hash_value(I->getOpcode()), hash_value(I->getType()),
10451 ? I->getType()
10452 : cast<CastInst>(I)->getOperand(0)->getType()));
10453 // For casts, look through the only operand to improve compile time.
10454 if (isa<CastInst>(I)) {
// The recursion always allows alternation, and only the operand's key
// (OpVals.first) is mixed into both Key and SubKey.
10455 std::pair<size_t, size_t> OpVals =
10456 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
10457 /*AllowAlternate=*/true);
10458 Key = hash_combine(OpVals.first, Key);
10459 SubKey = hash_combine(OpVals.first, SubKey);
10460 }
10461 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
10462 CmpInst::Predicate Pred = CI->getPredicate();
// For commutative compares use the smaller of the predicate and its inverse.
10463 if (CI->isCommutative())
10464 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
// NOTE(review): SwapPred is defined on a line (10465) missing from this
// listing.
10466 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
10467 hash_value(SwapPred),
10468 hash_value(CI->getOperand(0)->getType()));
10469 } else if (auto *Call = dyn_cast<CallInst>(I)) {
// NOTE(review): the definition of ID and the condition of the first branch
// (10470-10471) are missing from this listing.
10472 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
10473 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
// Calls that have vector mappings are grouped by callee.
10474 SubKey = hash_combine(hash_value(I->getOpcode()),
10475 hash_value(Call->getCalledFunction()));
10476 } else {
// Fallback: the subkey is tied to this particular call instruction.
10478 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
10479 }
// Mix the Begin/End/Tag of every operand bundle into the subkey.
10480 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
10481 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
10482 hash_value(Op.Tag), SubKey);
10483 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
// Two-operand GEPs with a ConstantInt index are grouped by base pointer; any
// other GEP gets a per-instruction subkey.
10484 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
10485 SubKey = hash_value(Gep->getPointerOperand());
10486 else
10487 SubKey = hash_value(Gep);
10488 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
10489 !isa<ConstantInt>(I->getOperand(1))) {
10490 // Do not try to vectorize instructions with potentially high cost.
10491 SubKey = hash_value(I);
10492 } else {
10493 SubKey = hash_value(I->getOpcode());
10494 }
// Every instruction handled by this branch also gets its parent block number
// mixed into the key.
10495 Key = hash_combine(hash_value(I->getParent()->getNumber()), Key);
10496 }
10497 return std::make_pair(Key, SubKey);
10498}
10499
10500/// Checks if the specified instruction \p I is a main operation for the given
10501/// \p MainOp and \p AltOp instructions.
/// This is a forward declaration only; the definition is not part of this
/// section of the file.
10502static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10503 Instruction *AltOp, const TargetLibraryInfo &TLI);
10504
10505/// Builds the arguments types vector for the given call instruction with the
10506/// given \p ID for the specified vector factor.
///
/// In the visible code each call argument contributes one entry to the result:
/// it is either kept at its original (scalar) type, replaced by an integer of
/// \p MinBW bits widened to \p VF elements (when \p MinBW > 0), or widened to
/// \p VF elements of its own type.
/// \returns The collected types, in argument order.
///
/// NOTE(review): this listing omits lines 10507-10508 (the start of the
/// signature) and 10513-10514 (the conditions that select the first two
/// cases), so those conditions are not described here.
10509 const unsigned VF, unsigned MinBW,
10510 const TargetTransformInfo *TTI) {
10511 SmallVector<Type *> ArgTys;
10512 for (auto [Idx, Arg] : enumerate(CI->args())) {
// Case 1: the argument keeps its own type (guarding condition not visible).
10515 ArgTys.push_back(Arg->getType());
10516 continue;
10517 }
// Case 2: a non-zero MinBW overrides the element type with an integer of
// that many bits.
10518 if (MinBW > 0) {
10519 ArgTys.push_back(
10520 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
10521 continue;
10522 }
10523 }
// Default: widen the argument's own type to VF elements.
10524 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
10525 }
10526 return ArgTys;
10527}
10528
10529/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
10530/// function (if possible) calls. Returns invalid cost for the corresponding
10531/// calls, if they cannot be vectorized/will be scalarized.
///
/// \returns The pair {IntrinsicCost, LibCost}: first is the cost of the vector
/// intrinsic call, second is the cost of the vector library call.
///
/// NOTE(review): this listing omits lines 10533-10534 (function name and
/// leading parameters), 10537, 10547 and 10560; the definition of ID and the
/// body of the final `if` are therefore not visible here.
10532static std::pair<InstructionCost, InstructionCost>
10535 ArrayRef<Type *> ArgTys) {
10536 auto Shape = VFShape::get(CI->getFunctionType(),
10538 false /*HasGlobalPred*/);
10539 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
// The library cost stays invalid unless the call is not marked nobuiltin and
// a vectorized function exists for the requested shape.
10540 auto LibCost = InstructionCost::getInvalid();
10541 if (!CI->isNoBuiltin() && VecFunc) {
10542 // Calculate the cost of the vector library call.
10543 // If the corresponding vector call is cheaper, return its cost.
10544 LibCost =
10545 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
10546 }
10548
10549 // Calculate the cost of the vector intrinsic call.
10550 FastMathFlags FMF;
10551 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
10552 FMF = FPCI->getFastMathFlags();
// ScalarLimit stands in for the library cost when the latter is invalid, both
// as the last IntrinsicCostAttributes argument and in the comparison below.
// NOTE(review): presumably that argument is a scalarization-cost bound -
// confirm against the IntrinsicCostAttributes constructor.
10553 const InstructionCost ScalarLimit = 10000;
10554 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10555 LibCost.isValid() ? LibCost : ScalarLimit);
10556 auto IntrinsicCost =
10557 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
// The intrinsic cost is checked against the library cost when that is valid,
// otherwise against ScalarLimit. NOTE(review): the statement executed when
// the check fires (10560) is missing from this listing.
10558 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10559 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10561
10562 return {IntrinsicCost, LibCost};
10563}
10564
10565/// Find the innermost loop starting from \p L, for which at least a single
10566/// value in \p VL is not invariant.
///
/// A value counts as invariant for a loop when it is a constant, is not an
/// instruction, or the loop's isLoopInvariant() accepts it. The search walks
/// outwards through getParentLoop().
/// \returns The first loop on that walk with a non-invariant value in \p VL,
/// or null if every loop up to the outermost one treats all values as
/// invariant.
///
/// NOTE(review): the first line of the signature (10567) is missing from this
/// listing.
10568 ArrayRef<Value *> VL) {
10569 assert(L && "Expected valid loop");
// True if every value in VL is invariant with respect to loop L.
10570 auto IsLoopInvariant = [&](const Loop *L, ArrayRef<Value *> VL) {
10571 return all_of(VL, [&](Value *V) {
10572 return isa<Constant>(V) || !isa<Instruction>(V) || L->isLoopInvariant(V);
10573 });
10574 };
// Climb to the parent while the current loop sees only invariant values; L
// becomes null once the outermost loop has been passed.
10575 while (L && IsLoopInvariant(L, VL))
10576 L = L->getParentLoop();
10577 return L;
10578}
10579
10580/// Get the loop nest for the given loop.
10581ArrayRef<const Loop *> BoUpSLP::getLoopNest(const Loop *L) {
10582 assert(L && "Expected valid loop");
10583 if (LoopAwareTripCount == 0)
10584 return {};
10585 SmallVector<const Loop *> &Res =
10586 LoopToLoopNest.try_emplace(L).first->getSecond();
10587 if (!Res.empty())
10588 return Res;
10589 SmallVector<const Loop *> LoopNest;
10590 while (L) {
10591 LoopNest.push_back(L);
10592 L = L->getParentLoop();
10593 }
10594 Res.assign(LoopNest.rbegin(), LoopNest.rend());
10595 return Res;
10596}
10597
10598BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10599 const InstructionsState &S, ArrayRef<Value *> VL,
10600 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
10601 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10602 assert(S.getMainOp() &&
10603 "Expected instructions with same/alternate opcodes only.");
10604
10605 unsigned ShuffleOrOp =
10606 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10607 Instruction *VL0 = S.getMainOp();
10608 switch (ShuffleOrOp) {
10609 case Instruction::PHI: {
10610 // Too many operands - gather, most probably won't be vectorized.
10611 if (VL0->getNumOperands() > MaxPHINumOperands)
10612 return TreeEntry::NeedToGather;
10613 // Check for terminator values (e.g. invoke).
10614 for (Value *V : VL) {
10615 auto *PHI = dyn_cast<PHINode>(V);
10616 if (!PHI)
10617 continue;
10618 for (Value *Incoming : PHI->incoming_values()) {
10620 if (Term && Term->isTerminator()) {
10622 << "SLP: Need to swizzle PHINodes (terminator use).\n");
10623 return TreeEntry::NeedToGather;
10624 }
10625 }
10626 }
10627
10628 return TreeEntry::Vectorize;
10629 }
10630 case Instruction::ExtractElement:
10631 if (any_of(VL, [&](Value *V) {
10632 auto *EI = dyn_cast<ExtractElementInst>(V);
10633 if (!EI)
10634 return true;
10635 return isVectorized(EI->getOperand(0));
10636 }))
10637 return TreeEntry::NeedToGather;
10638 [[fallthrough]];
10639 case Instruction::ExtractValue: {
10640 bool Reuse = canReuseExtract(VL, CurrentOrder);
10641 if (Reuse || !CurrentOrder.empty())
10642 return TreeEntry::Vectorize;
10643 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10644 return TreeEntry::NeedToGather;
10645 }
10646 case Instruction::InsertElement: {
10647 // Check that we have a buildvector and not a shuffle of 2 or more
10648 // different vectors.
10649 ValueSet SourceVectors;
10650 for (Value *V : VL) {
10651 if (isa<PoisonValue>(V)) {
10652 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10653 return TreeEntry::NeedToGather;
10654 }
10655 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10656 assert(getElementIndex(V) != std::nullopt &&
10657 "Non-constant or undef index?");
10658 }
10659
10660 if (count_if(VL, [&SourceVectors](Value *V) {
10661 return !SourceVectors.contains(V);
10662 }) >= 2) {
10663 // Found 2nd source vector - cancel.
10664 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10665 "different source vectors.\n");
10666 return TreeEntry::NeedToGather;
10667 }
10668
10669 if (any_of(VL, [&SourceVectors](Value *V) {
10670 // The last InsertElement can have multiple uses.
10671 return SourceVectors.contains(V) && !V->hasOneUse();
10672 })) {
10673 assert(SLPReVec && "Only supported by REVEC.");
10674 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10675 "multiple uses.\n");
10676 return TreeEntry::NeedToGather;
10677 }
10678
10679 return TreeEntry::Vectorize;
10680 }
10681 case Instruction::Load: {
10682 // Check that a vectorized load would load the same memory as a scalar
10683 // load. For example, we don't want to vectorize loads that are smaller
10684 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10685 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10686 // from such a struct, we read/write packed bits disagreeing with the
10687 // unvectorized version.
10688 auto IsGatheredNode = [&]() {
10689 if (!GatheredLoadsEntriesFirst)
10690 return false;
10691 return all_of(VL, [&](Value *V) {
10692 if (isa<PoisonValue>(V))
10693 return true;
10694 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10695 return TE->Idx >= *GatheredLoadsEntriesFirst;
10696 });
10697 });
10698 };
10699 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10701 return TreeEntry::Vectorize;
10703 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10704 // Delay slow vectorized nodes for better vectorization attempts.
10705 LoadEntriesToVectorize.insert(VectorizableTree.size());
10706 return TreeEntry::NeedToGather;
10707 }
10708 return IsGatheredNode() ? TreeEntry::NeedToGather
10709 : TreeEntry::CompressVectorize;
10711 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10712 // Delay slow vectorized nodes for better vectorization attempts.
10713 LoadEntriesToVectorize.insert(VectorizableTree.size());
10714 return TreeEntry::NeedToGather;
10715 }
10716 return IsGatheredNode() ? TreeEntry::NeedToGather
10717 : TreeEntry::ScatterVectorize;
10719 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10720 // Delay slow vectorized nodes for better vectorization attempts.
10721 LoadEntriesToVectorize.insert(VectorizableTree.size());
10722 return TreeEntry::NeedToGather;
10723 }
10724 return IsGatheredNode() ? TreeEntry::NeedToGather
10725 : TreeEntry::StridedVectorize;
10726 case LoadsState::Gather:
10727#ifndef NDEBUG
10728 Type *ScalarTy = VL0->getType();
10729 if (DL->getTypeSizeInBits(ScalarTy) !=
10730 DL->getTypeAllocSizeInBits(ScalarTy))
10731 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10732 else if (any_of(VL, [](Value *V) {
10733 auto *LI = dyn_cast<LoadInst>(V);
10734 return !LI || !LI->isSimple();
10735 }))
10736 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10737 else
10738 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10739#endif // NDEBUG
10741 return TreeEntry::NeedToGather;
10742 }
10743 llvm_unreachable("Unexpected state of loads");
10744 }
10745 case Instruction::ZExt:
10746 case Instruction::SExt:
10747 case Instruction::FPToUI:
10748 case Instruction::FPToSI:
10749 case Instruction::FPExt:
10750 case Instruction::PtrToInt:
10751 case Instruction::IntToPtr:
10752 case Instruction::SIToFP:
10753 case Instruction::UIToFP:
10754 case Instruction::Trunc:
10755 case Instruction::FPTrunc:
10756 case Instruction::BitCast: {
10757 Type *SrcTy = VL0->getOperand(0)->getType();
10758 for (Value *V : VL) {
10759 if (isa<PoisonValue>(V))
10760 continue;
10761 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10762 if (Ty != SrcTy || !isValidElementType(Ty)) {
10763 LLVM_DEBUG(
10764 dbgs() << "SLP: Gathering casts with different src types.\n");
10765 return TreeEntry::NeedToGather;
10766 }
10767 }
10768 return TreeEntry::Vectorize;
10769 }
10770 case Instruction::ICmp:
10771 case Instruction::FCmp: {
10772 // Check that all of the compares have the same predicate.
10773 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10775 Type *ComparedTy = VL0->getOperand(0)->getType();
10776 for (Value *V : VL) {
10777 if (isa<PoisonValue>(V))
10778 continue;
10779 auto *Cmp = cast<CmpInst>(V);
10780 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10781 Cmp->getOperand(0)->getType() != ComparedTy) {
10782 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10783 return TreeEntry::NeedToGather;
10784 }
10785 }
10786 return TreeEntry::Vectorize;
10787 }
10788 case Instruction::Select:
10789 if (SLPReVec) {
10790 SmallPtrSet<Type *, 4> CondTypes;
10791 for (Value *V : VL) {
10792 Value *Cond;
10793 if (!match(V, m_Select(m_Value(Cond), m_Value(), m_Value())) &&
10794 !match(V, m_ZExt(m_Value(Cond))))
10795 continue;
10796 CondTypes.insert(Cond->getType());
10797 }
10798 if (CondTypes.size() > 1) {
10799 LLVM_DEBUG(
10800 dbgs()
10801 << "SLP: Gathering select with different condition types.\n");
10802 return TreeEntry::NeedToGather;
10803 }
10804 }
10805 [[fallthrough]];
10806 case Instruction::FNeg:
10807 case Instruction::Add:
10808 case Instruction::FAdd:
10809 case Instruction::Sub:
10810 case Instruction::FSub:
10811 case Instruction::Mul:
10812 case Instruction::FMul:
10813 case Instruction::UDiv:
10814 case Instruction::SDiv:
10815 case Instruction::FDiv:
10816 case Instruction::URem:
10817 case Instruction::SRem:
10818 case Instruction::FRem:
10819 case Instruction::Shl:
10820 case Instruction::LShr:
10821 case Instruction::AShr:
10822 case Instruction::And:
10823 case Instruction::Or:
10824 case Instruction::Xor:
10825 case Instruction::Freeze:
10826 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10827 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10828 auto *I = dyn_cast<Instruction>(V);
10829 return I && I->isBinaryOp() && !I->isFast();
10830 }))
10831 return TreeEntry::NeedToGather;
10832 return TreeEntry::Vectorize;
10833 case Instruction::GetElementPtr: {
10834 // We don't combine GEPs with complicated (nested) indexing.
10835 for (Value *V : VL) {
10836 auto *I = dyn_cast<GetElementPtrInst>(V);
10837 if (!I)
10838 continue;
10839 if (I->getNumOperands() != 2) {
10840 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10841 return TreeEntry::NeedToGather;
10842 }
10843 }
10844
10845 // We can't combine several GEPs into one vector if they operate on
10846 // different types.
10847 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10848 for (Value *V : VL) {
10849 auto *GEP = dyn_cast<GEPOperator>(V);
10850 if (!GEP)
10851 continue;
10852 Type *CurTy = GEP->getSourceElementType();
10853 if (Ty0 != CurTy) {
10854 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10855 return TreeEntry::NeedToGather;
10856 }
10857 }
10858
10859 // We don't combine GEPs with non-constant indexes.
10860 Type *Ty1 = VL0->getOperand(1)->getType();
10861 for (Value *V : VL) {
10862 auto *I = dyn_cast<GetElementPtrInst>(V);
10863 if (!I)
10864 continue;
10865 auto *Op = I->getOperand(1);
10866 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10867 (Op->getType() != Ty1 &&
10868 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10869 Op->getType()->getScalarSizeInBits() >
10870 DL->getIndexSizeInBits(
10871 V->getType()->getPointerAddressSpace())))) {
10872 LLVM_DEBUG(
10873 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10874 return TreeEntry::NeedToGather;
10875 }
10876 }
10877
10878 return TreeEntry::Vectorize;
10879 }
10880 case Instruction::Store: {
10881 // Check if the stores are consecutive or if we need to swizzle them.
10882 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10883 // Avoid types that are padded when being allocated as scalars, while
10884 // being packed together in a vector (such as i1).
10885 if (DL->getTypeSizeInBits(ScalarTy) !=
10886 DL->getTypeAllocSizeInBits(ScalarTy)) {
10887 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10888 return TreeEntry::NeedToGather;
10889 }
10890 // Make sure all stores in the bundle are simple - we can't vectorize
10891 // atomic or volatile stores.
10892 for (Value *V : VL) {
10893 auto *SI = cast<StoreInst>(V);
10894 if (!SI->isSimple()) {
10895 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10896 return TreeEntry::NeedToGather;
10897 }
10898 PointerOps.push_back(SI->getPointerOperand());
10899 }
10900
10901 // Check the order of pointer operands.
10902 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10903 Value *Ptr0;
10904 Value *PtrN;
10905 if (CurrentOrder.empty()) {
10906 Ptr0 = PointerOps.front();
10907 PtrN = PointerOps.back();
10908 } else {
10909 Ptr0 = PointerOps[CurrentOrder.front()];
10910 PtrN = PointerOps[CurrentOrder.back()];
10911 }
10912 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL);
10913 std::optional<int64_t> Dist =
10914 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10915 // Check that the sorted pointer operands are consecutive.
10916 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10917 return TreeEntry::Vectorize;
10918 if (EnableStridedStores &&
10919 analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
10920 CurrentOrder, *Dist, Ptr0, SPtrInfo))
10921 return TreeEntry::StridedVectorize;
10922 }
10923
10924 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10925 return TreeEntry::NeedToGather;
10926 }
10927 case Instruction::Call: {
10928 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10929 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10930 auto *I = dyn_cast<Instruction>(V);
10931 return I && !I->isFast();
10932 }))
10933 return TreeEntry::NeedToGather;
10934 // Check if the calls are all to the same vectorizable intrinsic or
10935 // library function.
10936 CallInst *CI = cast<CallInst>(VL0);
10938
10939 VFShape Shape = VFShape::get(
10940 CI->getFunctionType(),
10941 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10942 false /*HasGlobalPred*/);
10943 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10944
10945 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10946 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10947 return TreeEntry::NeedToGather;
10948 }
10949 Function *F = CI->getCalledFunction();
10950 unsigned NumArgs = CI->arg_size();
10951 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10952 for (unsigned J = 0; J != NumArgs; ++J)
10954 ScalarArgs[J] = CI->getArgOperand(J);
10955 for (Value *V : VL) {
10956 CallInst *CI2 = dyn_cast<CallInst>(V);
10957 if (!CI2 || CI2->getCalledFunction() != F ||
10958 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10959 (VecFunc &&
10960 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10962 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10963 << "\n");
10964 return TreeEntry::NeedToGather;
10965 }
10966 // Some intrinsics have scalar arguments and should be same in order for
10967 // them to be vectorized.
10968 for (unsigned J = 0; J != NumArgs; ++J) {
10970 Value *A1J = CI2->getArgOperand(J);
10971 if (ScalarArgs[J] != A1J) {
10973 << "SLP: mismatched arguments in call:" << *CI
10974 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10975 return TreeEntry::NeedToGather;
10976 }
10977 }
10978 }
10979 // Verify that the bundle operands are identical between the two calls.
10980 if (CI->hasOperandBundles() &&
10981 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10982 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10983 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10984 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10985 << "!=" << *V << '\n');
10986 return TreeEntry::NeedToGather;
10987 }
10988 }
10989 SmallVector<Type *> ArgTys =
10990 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10991 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10992 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10993 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10994 return TreeEntry::NeedToGather;
10995
10996 return TreeEntry::Vectorize;
10997 }
10998 case Instruction::ShuffleVector: {
10999 if (!S.isAltShuffle()) {
11000 // REVEC can support non alternate shuffle.
11002 return TreeEntry::Vectorize;
11003 // If this is not an alternate sequence of opcode like add-sub
11004 // then do not vectorize this instruction.
11005 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
11006 return TreeEntry::NeedToGather;
11007 }
11008
11009 return TreeEntry::Vectorize;
11010 }
11011 default:
11012 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
11013 return TreeEntry::NeedToGather;
11014 }
11015}
11016
11017namespace {
11018/// Allows to correctly handle operands of the phi nodes based on the \p Main
11019/// PHINode order of incoming basic blocks/values.
// Operands[I][Idx] receives the value that Phis[Idx] takes through the I-th
// incoming edge of Main; lanes that are not PHI nodes keep the lane value
// itself.
11020class PHIHandler {
11021 DominatorTree &DT;
11022 PHINode *Main = nullptr;
// NOTE(review): the constructor below initializes members named Phis and
// Operands, but their declarations are not part of this listing (the embedded
// numbering skips 11023-11024) - confirm against the full file.
11025
11026public:
11027 PHIHandler() = delete;
// Sizes Operands as one list per incoming value of Main, each list holding
// Phis.size() null slots that buildOperands() fills in.
11028 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
11029 : DT(DT), Main(Main), Phis(Phis),
11030 Operands(Main->getNumIncomingValues(),
11031 SmallVector<Value *>(Phis.size(), nullptr)) {}
// Fills the operand lists. Incoming blocks of Main that are not reachable from
// the entry block get poison in every lane.
11032 void buildOperands() {
// Up to FastLimit incoming values: resolve every lane directly per incoming
// index, without building the block map used by the general path below.
11033 constexpr unsigned FastLimit = 4;
11034 if (Main->getNumIncomingValues() <= FastLimit) {
11035 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
11036 BasicBlock *InBB = Main->getIncomingBlock(I);
11037 if (!DT.isReachableFromEntry(InBB)) {
11038 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
11039 continue;
11040 }
11041 // Prepare the operand vector.
11042 for (auto [Idx, V] : enumerate(Phis)) {
11043 auto *P = dyn_cast<PHINode>(V);
11044 if (!P) {
// NOTE(review): the opening line of the assertion is missing from this listing
// (numbering skips 11045); only its message string is shown.
11046 "Expected isa instruction or poison value.");
11047 Operands[I][Idx] = V;
11048 continue;
11049 }
// If P lists the same block at the same index, read the value by index;
// otherwise look the value up by block.
11050 if (P->getIncomingBlock(I) == InBB)
11051 Operands[I][Idx] = P->getIncomingValue(I);
11052 else
11053 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
11054 }
11055 }
11056 return;
11057 }
// General path: map every reachable incoming block of Main to all incoming
// indices of Main that refer to it.
11058 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
11059 Blocks;
11060 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
11061 BasicBlock *InBB = Main->getIncomingBlock(I);
11062 if (!DT.isReachableFromEntry(InBB)) {
11063 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
11064 continue;
11065 }
11066 Blocks.try_emplace(InBB).first->second.push_back(I);
11067 }
11068 for (auto [Idx, V] : enumerate(Phis)) {
// A poison lane is its own operand for every incoming value.
11069 if (isa<PoisonValue>(V)) {
11070 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
11071 Operands[I][Idx] = V;
11072 continue;
11073 }
11074 auto *P = cast<PHINode>(V);
11075 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
11076 BasicBlock *InBB = P->getIncomingBlock(I);
11077 if (InBB == Main->getIncomingBlock(I)) {
// Keep a poison that is already stored in this slot.
11078 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
11079 continue;
11080 Operands[I][Idx] = P->getIncomingValue(I);
11081 continue;
11082 }
// Blocks that were not recorded above (not found in the map) are skipped.
11083 auto *It = Blocks.find(InBB);
11084 if (It == Blocks.end())
11085 continue;
// Store under the first Main index recorded for this block; the remaining
// indices of the same block are filled by the loop below.
11086 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
11087 }
11088 }
// For blocks listed more than once in Main, copy the operand list of the first
// index to every other index of the same block.
11089 for (const auto &P : Blocks) {
11090 ArrayRef<unsigned> IncomingValues = P.second;
11091 if (IncomingValues.size() <= 1)
11092 continue;
11093 unsigned BasicI = IncomingValues.consume_front();
11094 for (unsigned I : IncomingValues) {
11095 assert(all_of(enumerate(Operands[I]),
11096 [&](const auto &Data) {
11097 return !Data.value() ||
11098 Data.value() == Operands[BasicI][Data.index()];
11099 }) &&
11100 "Expected empty operands list.");
11101 Operands[I] = Operands[BasicI];
11102 }
11103 }
11104 }
// Returns the operand list built for the I-th incoming value of Main.
11105 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
11106};
11107} // namespace
11108
11109/// Returns main/alternate instructions for the given \p VL. Unlike
11110/// getSameOpcode supports non-compatible instructions for better SplitVectorize
11111/// node support.
11112/// \returns the first main/alt instructions if \p VL holds only poisons and
11113/// instructions with exactly 2 opcodes. Returns pair of nullptr otherwise.
// Instructions that share an opcode must also share the parent block of the
// first instruction seen with that opcode; otherwise the empty pair is
// returned.
// NOTE(review): the declarator line (function name and parameter list) is not
// part of this listing (the embedded numbering skips 11115); the body reads a
// value list named VL.
11114static std::pair<Instruction *, Instruction *>
11116 Instruction *MainOp = nullptr;
11117 Instruction *AltOp = nullptr;
11118 for (Value *V : VL) {
// Poison lanes do not take part in the opcode classification.
11119 if (isa<PoisonValue>(V))
11120 continue;
// Any other non-instruction value makes the list unsupported.
11121 auto *I = dyn_cast<Instruction>(V);
11122 if (!I)
11123 return {};
// The first instruction seen defines the main opcode.
11124 if (!MainOp) {
11125 MainOp = I;
11126 continue;
11127 }
11128 if (MainOp->getOpcode() == I->getOpcode()) {
11129 if (I->getParent() != MainOp->getParent())
11130 return {};
11131 continue;
11132 }
// The first instruction with a different opcode defines the alternate opcode.
11133 if (!AltOp) {
11134 AltOp = I;
11135 continue;
11136 }
11137 if (AltOp->getOpcode() == I->getOpcode()) {
11138 if (I->getParent() != AltOp->getParent())
11139 return {};
11140 continue;
11141 }
// A third distinct opcode: give up.
11142 return {};
11143 }
// No second opcode was found (this also covers lists without instructions).
11144 if (!AltOp)
11145 return {};
11146 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
11147 "Expected different main and alt instructions.");
11148 return std::make_pair(MainOp, AltOp);
11149}
11150
11151/// Checks that every instruction appears once in the list and if not, packs
11152/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
11153/// unique scalars is extended by poison values to the whole register size.
11154///
11155/// \returns false if \p VL could not be uniquified, in which case \p VL is
11156/// unchanged and \p ReuseShuffleIndices is empty.
// On a true return either VL is kept as is and ReuseShuffleIndices is cleared,
// or VL is replaced by the unique values and ReuseShuffleIndices keeps the
// reuse mask built by the first loop.
// NOTE(review): the first declarator line (return type, function name and the
// VL parameter) is not part of this listing (the embedded numbering skips
// 11157). Further lines are skipped inside the body (11196, 11227,
// 11246-11248, 11264, 11292, 11294, 11313, 11332, 11342), so the statements
// around those gaps are incomplete as shown - confirm against the full file.
11158 SmallVectorImpl<int> &ReuseShuffleIndices,
11159 const TargetTransformInfo &TTI,
11160 const TargetLibraryInfo &TLI,
11161 const InstructionsState &S,
11162 const BoUpSLP::EdgeInfo &UserTreeIdx,
11163 const BoUpSLP &R, bool BuildGatherOnly = true) {
11164 // Check that every instruction appears once in this bundle.
// ReuseShuffleIndices[i] becomes the position of VL[i] inside UniqueValues
// (PoisonMaskElem for poison lanes). UniquePositions only tracks non-constant
// values.
11165 SmallVector<Value *> UniqueValues;
11166 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
11167 for (Value *V : VL) {
11168 if (isConstant(V)) {
11169 // Constants are always considered distinct, even if the same constant
11170 // appears multiple times in VL.
11171 ReuseShuffleIndices.emplace_back(
11172 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
11173 UniqueValues.emplace_back(V);
11174 continue;
11175 }
11176 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
11177 ReuseShuffleIndices.emplace_back(Res.first->second);
11178 if (Res.second)
11179 UniqueValues.emplace_back(V);
11180 }
11181
11182 // Check if we need to schedule the scalars. If no, can keep original scalars
11183 // and avoid extra shuffles.
11184 bool RequireScheduling = S && S.getOpcode() != Instruction::PHI &&
11185 !isVectorLikeInstWithConstOps(S.getMainOp()) &&
11186 (S.areInstructionsWithCopyableElements() ||
11187 !doesNotNeedToSchedule(UniqueValues));
11188 // Compute this flag BEFORE the tail-poison erase below - it must reflect
11189 // the state of the original VL (for the InsertsCost call), not the
11190 // potentially-shrunk UniqueValues.
11191 bool AreAllValuesNonConst = UniquePositions.size() == UniqueValues.size();
11192 // Drop tail poisons, if the values can be vectorized.
11193 if (RequireScheduling) {
11194 const auto EndIt =
11195 find_if_not(make_range(UniqueValues.rbegin(), UniqueValues.rend()),
11197 assert(EndIt != UniqueValues.rend() && "Expected at least one non-poison.");
11198 UniqueValues.erase(EndIt.base(), UniqueValues.end());
11199 }
// Nothing was deduplicated or dropped: keep VL and report success with an
// empty reuse mask.
11200 unsigned NumUniqueScalarValues = UniqueValues.size();
11201 if (NumUniqueScalarValues == VL.size()) {
11202 ReuseShuffleIndices.clear();
11203 return true;
11204 }
11205
11206 // For VL=4 with 3 unique values: keep originals. A <3 x T> vector is
11207 // always widened to <4 x T> on hardware, so the packing just adds an
11208 // extra expand shuffle. Does not apply to loads (a <3 x T> load is a
11209 // single memory access) or PHIs (benefit from compact packing in loops).
11210 constexpr unsigned SmallVecWidth = 4;
11211 constexpr unsigned SmallVecUniqueThreshold = 3;
11212 if (VL.size() == SmallVecWidth &&
11213 NumUniqueScalarValues == SmallVecUniqueThreshold && !BuildGatherOnly &&
11214 !(S && (S.getOpcode() == Instruction::Load ||
11215 S.getOpcode() == Instruction::PHI))) {
11216 // Keep originals with identity reuse - no packing, no extra shuffle.
11217 ReuseShuffleIndices.clear();
11218 return true;
11219 }
11220
11221 // Checks if unique inserts + shuffle is more profitable than just inserts or
11222 // vectorized values.
// The lambda returns the pair {PackProfitable, UseOriginal}, in that order, as
// named by the structured binding at its call site further down.
11223 auto EstimatePackPlusShuffleVsInserts = [&]() {
11224 // Single instruction/argument insert - no shuffle.
11225 if (UniquePositions.size() == 1 &&
11226 (NumUniqueScalarValues == 1 ||
11228 return std::make_pair(false, false);
11229 // For large gathers with power-of-2 VL where packing would produce
11230 // non-power-of-2, reject if most scalars are constants - the packing
11231 // overhead (non-power-of-2 split + shuffles) outweighs the benefit.
11232 constexpr unsigned MinVLForConstGatherCheck = 4;
11233 if (BuildGatherOnly && VL.size() > MinVLForConstGatherCheck &&
11234 has_single_bit(static_cast<unsigned>(VL.size())) &&
11235 !has_single_bit(NumUniqueScalarValues) &&
11236 UniquePositions.size() * 2 < NumUniqueScalarValues)
11237 return std::make_pair(false, false);
// Helper: classifies the given loads with R.canVectorizeLoads() and reports
// whether the resulting state is acceptable; a Gather result counts only when
// IncludeGather is set.
11238 auto CheckLoads = [&](ArrayRef<Value *> Loads, bool IncludeGather) {
11239 assert(S && S.getOpcode() == Instruction::Load && "Expected load.");
11240 BoUpSLP::OrdersType Order;
11241 SmallVector<Value *> PointerOps;
11242 BoUpSLP::StridedPtrInfo SPtrInfo;
11243 BoUpSLP::LoadsState Res = R.canVectorizeLoads(Loads, S.getMainOp(), Order,
11244 PointerOps, SPtrInfo);
11245 return (IncludeGather && Res == BoUpSLP::LoadsState::Gather) ||
11249 };
11250 // Operand of the root tree entry on the vectorize path: always pack the
11251 // scalars (PackProfitable=true). Choose between keeping the original VL
11252 // and packing the unique values:
11253 // - For loads, prefer the originals only when both the deduplicated and
11254 // the full sequence can be vectorized non-Gather, or when the reuse
11255 // mask is the identity (the shuffle is free).
11256 // - For everything else (including !S, where RequireScheduling is forced
11257 // to false above), keep originals iff no scheduling is required.
11258 bool IsRootOperand =
11259 UserTreeIdx.UserTE && UserTreeIdx.UserTE->Idx == 0 && !BuildGatherOnly;
11260 if (IsRootOperand) {
11261 if (S && S.getOpcode() == Instruction::Load) {
11262 bool UseOrig = (CheckLoads(UniqueValues, /*IncludeGather=*/true) &&
11263 CheckLoads(VL, /*IncludeGather=*/false)) ||
11265 ReuseShuffleIndices, ReuseShuffleIndices.size());
11266 return std::make_pair(true, UseOrig);
11267 }
11268 return std::make_pair(true, !RequireScheduling);
11269 }
// Mark the lanes of VL that hold a non-constant value; these are the lanes
// passed as demanded elements to the scalarization-overhead query below.
11270 APInt DemandedElts = APInt::getZero(VL.size());
11271 for (auto [Idx, Val] : enumerate(ReuseShuffleIndices))
11272 if (Val != PoisonMaskElem && UniquePositions.contains(UniqueValues[Val]))
11273 DemandedElts.setBit(Idx);
11274 Type *ScalarTy = ::getValueType(UniqueValues.front());
11275 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11276 auto *UniquesVecTy = getWidenedType(ScalarTy, NumUniqueScalarValues);
11277 const unsigned NumParts = ::getNumberOfParts(TTI, VecTy, ScalarTy);
11278 const unsigned UniquesNumParts =
11279 ::getNumberOfParts(TTI, UniquesVecTy, ScalarTy);
11280 // No need to schedule scalars and only single register used? Use original
11281 // scalars, do not pack.
11282 if (!RequireScheduling) {
11283 if (VL.size() / NumUniqueScalarValues == 1 &&
11284 (NumParts <= 1 || UniquesNumParts >= NumParts))
11285 return std::make_pair(true, true);
11286 // For PHI operands, prefer packing with reuse shuffle - the PHI
11287 // carries the vector through the loop cheaply.
11288 if (S && S.getOpcode() == Instruction::PHI && NumUniqueScalarValues > 1 &&
11289 UniquesNumParts <= NumParts)
11290 return std::make_pair(true, false);
11291 }
11293 InstructionCost ReusesCost = ::getShuffleCost(
11295 NumUniqueScalarValues > VL.size() / 2 ? ArrayRef<int>()
11296 : ArrayRef(ReuseShuffleIndices),
11297 CostKind, /*Index=*/0, UniquesVecTy);
11298 // For vectorizable (non-gather) nodes with low duplication, prefer keeping
11299 // the original values over packing uniques + reshuffling:
11300 // - A single duplicate (non-load) adds negligible overhead.
11301 // - When most values are already unique (>50%), or exactly half are unique
11302 // for some ops (GEPs, non-alt-shuffle casts), the reshuffle cost may
11303 // exceed the savings from a smaller packed vector - check against a
11304 // per-register-part threshold (stricter for wider vectors).
11305 if (S && !BuildGatherOnly) {
11306 bool HasOneDup = S.getOpcode() != Instruction::Load &&
11307 NumUniqueScalarValues + 1 == VL.size();
11308 bool MostlyUnique = NumUniqueScalarValues * 2 > VL.size();
11309 bool IsHalfUniqueValues =
11310 NumUniqueScalarValues * 2 == VL.size() &&
11311 (S.getOpcode() == Instruction::GetElementPtr ||
11312 (isa<CastInst>(S.getMainOp()) && !S.isAltShuffle()));
11314 NumParts * (VL.size() > SmallVecWidth ? 1 : 2);
11315 if (HasOneDup ||
11316 ((MostlyUnique || IsHalfUniqueValues) && ReusesCost > CostThreshold))
11317 return std::make_pair(true, true);
11318 }
11319 // For loads, check if either the deduplicated or the full (with
11320 // duplicates) set can be scatter/compress-vectorized. Prefer the unique
11321 // loads (pack + reshuffle) when possible, otherwise use the originals.
11322 if (S && S.getOpcode() == Instruction::Load) {
11323 bool UniquesVectorized =
11324 CheckLoads(UniqueValues, /*IncludeGather=*/false);
11325 if (UniquesVectorized || CheckLoads(VL, /*IncludeGather=*/false))
11326 return std::make_pair(true, !UniquesVectorized);
11327 }
// Generic cost comparison: inserting all original scalars (InsertsCost) versus
// inserting only the unique ones and then applying the reuse shuffle
// (UniquesCost + ReusesCost).
11328 bool CanSkipBVCost =
11329 (!BuildGatherOnly && !RequireScheduling) || R.hasSameNode(S, VL);
11330 InstructionCost InsertsCost =
11331 CanSkipBVCost
11333 : ::getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
11334 /*Insert=*/true, /*Extract=*/false,
11335 CostKind, AreAllValuesNonConst, VL);
// Constants among the unique values are not counted as demanded inserts.
11336 APInt UniquesDemandedElts = APInt::getAllOnes(NumUniqueScalarValues);
11337 for (const auto [Idx, V] : enumerate(UniqueValues))
11338 if (isConstant(V))
11339 UniquesDemandedElts.clearBit(Idx);
11340 InstructionCost UniquesCost =
11341 CanSkipBVCost
11343 : ::getScalarizationOverhead(TTI, ScalarTy, UniquesVecTy,
11344 UniquesDemandedElts, /*Insert=*/true,
11345 /*Extract=*/false, CostKind,
11346 AreAllValuesNonConst, UniqueValues);
11347 UniquesCost += ReusesCost;
11348 if (UniquesCost <= InsertsCost)
11349 return std::make_pair(true, false);
11350 InstructionCost CostDiff = UniquesCost - InsertsCost;
11351 if (CostDiff < TTI::TCC_Expensive ||
11352 (R.getTreeSize() == 0 && R.isReductionTree() &&
11353 CostDiff == TTI::TCC_Expensive))
11354 return std::make_pair(S && (!S.isAltShuffle() || !BuildGatherOnly),
11355 false);
11356 // Otherwise, use original values, if values do not require scheduling and
11357 // pass still try to vectorize them.
11358 bool KeepOriginal = !BuildGatherOnly && !RequireScheduling;
11359 return std::make_pair(KeepOriginal, KeepOriginal);
11360 };
11361
11362 const auto [PackProfitable, UseOriginal] = EstimatePackPlusShuffleVsInserts();
11363
11364 if (PackProfitable) {
11365 if (UseOriginal) {
11366 // Prefer original scalars - avoid shuffling.
11367 ReuseShuffleIndices.clear();
11368 } else {
11369 // Better to use uniques + reshuffle.
11370 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
11371 VL = std::move(UniqueValues);
11372 }
11373 return true;
11374 }
11375
11376 // Buildvector/gather of the original scalars.
11377 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
11378 ReuseShuffleIndices.clear();
11379 return false;
11380}
11381
// Decides whether the bundle VL should be built as a split node. While doing
// so it partitions VL into Op1 (non-instruction lanes plus the lanes
// classified as the main operation of LocalState) and Op2 (all remaining
// lanes), and fills ReorderIndices with the lane order "Op1 lanes first, then
// Op2 lanes" (cleared again when that order is the identity).
// Returns false when the split is rejected; Op1, Op2 and ReorderIndices may
// already have been modified in that case.
// NOTE(review): the first declarator line and the parameter lines declaring
// VL, Op1 and Op2 are not part of this listing (the embedded numbering skips
// 11382 and 11384-11385). Further lines are skipped inside the body (11389,
// 11438-11439, 11473), so the definitions of UOp1, UOp2 and Kind are not
// visible here - confirm against the full file.
11383 const InstructionsState &LocalState,
11386 OrdersType &ReorderIndices) const {
11387 constexpr unsigned SmallNodeSize = 4;
11388 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
11390 return false;
11391
11392 // Check if this is a duplicate of another split entry.
11393 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
11394 << ".\n");
11395 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
11396 if (E->isSame(VL)) {
11397 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
11398 << *LocalState.getMainOp() << ".\n");
11399 return false;
11400 }
// Also reject when every non-poison lane of VL is already a scalar of that
// existing split entry.
11401 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11402 if (all_of(VL, [&](Value *V) {
11403 return isa<PoisonValue>(V) || Values.contains(V);
11404 })) {
11405 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11406 return false;
11407 }
11408 }
11409
// Partition the lanes: Op1Indices has a bit set for every lane that goes to
// Op1. ReorderIndices starts out filled with the out-of-range value VL.size().
11410 ReorderIndices.assign(VL.size(), VL.size());
11411 SmallBitVector Op1Indices(VL.size());
11412 for (auto [Idx, V] : enumerate(VL)) {
11413 auto *I = dyn_cast<Instruction>(V);
11414 if (!I) {
11415 Op1.push_back(V);
11416 Op1Indices.set(Idx);
11417 continue;
11418 }
// With distinct main/alt opcodes a lane goes to Op1 when it matches the main
// instruction; with equal opcodes, when it is not the alternate instruction.
11419 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11420 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
11421 *TLI)) ||
11422 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11423 !isAlternateInstruction(I, LocalState.getMainOp(),
11424 LocalState.getAltOp(), *TLI))) {
11425 Op1.push_back(V);
11426 Op1Indices.set(Idx);
11427 continue;
11428 }
11429 Op2.push_back(V);
11430 }
11431 Type *ScalarTy = getValueType(VL.front());
11432 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
11433 unsigned Opcode0 = LocalState.getOpcode();
11434 unsigned Opcode1 = LocalState.getAltOpcode();
11435 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
11436 // Enable split node, only if all nodes do not form legal alternate
11437 // instruction (like X86 addsub).
11440 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
11441 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
11442 return false;
// Build the reorder: Op1 lanes occupy positions [0, Op1.size()), Op2 lanes
// follow, each group keeping its original relative order.
11443 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
11444 for (unsigned Idx : seq<unsigned>(VL.size())) {
11445 if (Op1Indices.test(Idx)) {
11446 ReorderIndices[Op1Cnt] = Idx;
11447 ++Op1Cnt;
11448 } else {
11449 ReorderIndices[Op2Cnt] = Idx;
11450 ++Op2Cnt;
11451 }
11452 }
11453 if (isIdentityOrder(ReorderIndices))
11454 ReorderIndices.clear();
11455 // When VL fills a power-of-2 register but the split halves do not, the
11456 // reorder shuffle makes the split unprofitable - reject.
11457 else if (hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), VL.size()) &&
11458 (!hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(),
11459 Op1.size()) ||
11460 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(),
11461 Op2.size())))
11462 return false;
// Mask stays empty when no reordering is needed.
11463 SmallVector<int> Mask;
11464 if (!ReorderIndices.empty())
11465 inversePermutation(ReorderIndices, Mask);
11466 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11467 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
11468 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
11469 // Check non-profitable single register ops, which better to be represented
11470 // as alternate ops.
11471 if (NumParts >= VL.size())
11472 return false;
// InsertCost: inserting the Op2 subvector at offset Op1.size() of the full
// vector. NewShuffleCost: two-source permute with Mask on the larger of the
// two sub-vector types.
11474 InstructionCost InsertCost = ::getShuffleCost(
11475 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
11476 FixedVectorType *SubVecTy =
11477 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
11478 InstructionCost NewShuffleCost =
11479 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
11480 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11481 (Mask.empty() || InsertCost >= NewShuffleCost))
11482 return false;
// For binary/cast/unary main+alt pairs, compare the cost of the alternate-op
// form (both opcodes on the full vector plus a blending shuffle) with the cost
// of the split form (each opcode on its own sub-vector plus the subvector
// insert).
11483 if ((LocalState.getMainOp()->isBinaryOp() &&
11484 LocalState.getAltOp()->isBinaryOp() &&
11485 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11486 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11487 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11488 (LocalState.getMainOp()->isUnaryOp() &&
11489 LocalState.getAltOp()->isUnaryOp())) {
11490 InstructionCost OriginalVecOpsCost =
11491 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
11492 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
// Blend mask: Op1 lanes select from the first source, other non-poison lanes
// from the second source (index offset by VL.size()).
11493 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
11494 for (unsigned Idx : seq<unsigned>(VL.size())) {
11495 if (isa<PoisonValue>(VL[Idx]))
11496 continue;
11497 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
11498 }
11499 InstructionCost OriginalCost =
11500 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
11501 VecTy, OriginalMask, Kind);
11502 InstructionCost NewVecOpsCost =
11503 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
11504 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
// The reorder shuffle cost is only added when the first tree entry has a state
// and its opcode is Store.
11505 InstructionCost NewCost =
11506 NewVecOpsCost + InsertCost +
11507 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11508 VectorizableTree.front()->getOpcode() == Instruction::Store
11509 ? NewShuffleCost
11510 : 0);
11511 // If not profitable to split - exit.
11512 if (NewCost >= OriginalCost)
11513 return false;
11514 }
11515 return true;
11516}
11517
11518namespace {
11519/// Class accepts incoming list of values, checks if it is able to model
11520/// "copyable" values as compatible operations, and generates the list of values
11521/// for scheduling and list of operands for the new nodes.
11522class InstructionsCompatibilityAnalysis {
11523 DominatorTree &DT;
11524 const DataLayout &DL;
11525 const TargetTransformInfo &TTI;
11526 const TargetLibraryInfo &TLI;
11527 unsigned MainOpcode = 0;
11528 Instruction *MainOp = nullptr;
11529
11530 /// Checks if the opcode is supported as the main opcode for copyable
11531 /// elements.
// Accepted opcodes: Add, Sub, LShr, Shl, SDiv, UDiv, And, Or, Xor, FAdd, FSub,
// FMul and FDiv. Every other opcode (for example Mul, AShr or the remainder
// opcodes) yields false.
11532 static bool isSupportedOpcode(const unsigned Opcode) {
11533 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11534 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11535 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11536 Opcode == Instruction::And || Opcode == Instruction::Or ||
11537 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11538 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11539 Opcode == Instruction::FDiv;
11540 }
11541
11542 /// Identifies the best candidate value, which represents main opcode
11543 /// operation.
11544 /// Candidates are the instructions of \p VL located in the reachable parent
11545 /// block with the highest dominator-tree DFS-in number.
// The candidates are grouped by opcode. Groups in which isUsedOutsideBlock
// holds for every member are preferred over the others; among equally ranked
// groups a group at least as large as the current best wins. MainOp becomes
// the first member of the winning group accepted by IsSupportedInstruction.
// MainOp is reset to nullptr first; MainOpcode is only updated when a main
// instruction was found.
11546 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
11547 BasicBlock *Parent = nullptr;
11548 // Checks if the instruction has supported opcode.
// NOTE(review): I is dereferenced in the AnyUndef test before the `I &&` null
// test on the return line; the callers visible here only pass instructions
// stored in Candidates.
11549 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
11550 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
11551 return false;
11552 return I && isSupportedOpcode(I->getOpcode()) &&
11553 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
11554 };
11555 // Exclude operands instructions immediately to improve compile time, it
11556 // will be unable to schedule anyway.
11557 SmallDenseSet<Value *, 8> Operands;
11558 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11559 bool AnyUndef = false;
11560 for (Value *V : VL) {
// Non-instruction lanes only contribute to the AnyUndef flag.
11561 auto *I = dyn_cast<Instruction>(V);
11562 if (!I) {
11563 AnyUndef |= isa<UndefValue>(V);
11564 continue;
11565 }
11566 if (!DT.isReachableFromEntry(I->getParent()))
11567 continue;
// The first reachable instruction fixes the initial candidate block.
11568 if (Candidates.empty()) {
11569 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11570 Parent = I->getParent();
11571 Operands.insert(I->op_begin(), I->op_end());
11572 continue;
11573 }
11574 if (Parent == I->getParent()) {
11575 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11576 Operands.insert(I->op_begin(), I->op_end());
11577 continue;
11578 }
11579 auto *NodeA = DT.getNode(Parent);
11580 auto *NodeB = DT.getNode(I->getParent());
11581 assert(NodeA && "Should only process reachable instructions");
11582 assert(NodeB && "Should only process reachable instructions");
11583 assert((NodeA == NodeB) ==
11584 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11585 "Different nodes should have different DFS numbers");
// A block with a higher DFS-in number replaces the current candidate block and
// discards everything collected so far; instructions from blocks with a lower
// number are ignored.
11586 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11587 Candidates.clear();
11588 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11589 Parent = I->getParent();
11590 Operands.clear();
11591 Operands.insert(I->op_begin(), I->op_end());
11592 }
11593 }
11594 unsigned BestOpcodeNum = 0;
11595 MainOp = nullptr;
11596 bool UsedOutside = false;
11597 for (const auto &P : Candidates) {
11598 bool PUsedOutside = all_of(P.second, isUsedOutsideBlock);
// Once a used-outside group was accepted, other kinds of groups are skipped.
11599 if (UsedOutside && !PUsedOutside)
11600 continue;
// The first used-outside group is not compared against the size of an earlier
// group of the other kind.
11601 if (!UsedOutside && PUsedOutside)
11602 BestOpcodeNum = 0;
11603 if (P.second.size() < BestOpcodeNum)
11604 continue;
11605 // If have inner dependencies - skip.
11606 if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
11607 return Operands.contains(I);
11608 }))
11609 continue;
11610 UsedOutside = PUsedOutside;
11611 for (Instruction *I : P.second) {
11612 if (IsSupportedInstruction(I, AnyUndef)) {
11613 MainOp = I;
11614 BestOpcodeNum = P.second.size();
11615 break;
11616 }
11617 }
11618 }
11619 if (MainOp) {
11620 // Do not match, if any copyable is a terminator from the same block as
11621 // the main operation.
11622 if (any_of(VL, [&](Value *V) {
11623 auto *I = dyn_cast<Instruction>(V);
11624 return I && I->getParent() == MainOp->getParent() &&
11625 I->isTerminator();
11626 })) {
11627 MainOp = nullptr;
11628 return;
11629 }
11630 MainOpcode = MainOp->getOpcode();
11631 }
11632 }
11633
11634 /// Returns the idempotent value for the \p MainOp with the detected \p
11635 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
11636 /// the operand itself, since V or V == V.
// The value is obtained from ConstantExpr::getBinOpIdentity() for MainOpcode
// and the type of MainOp. The third argument is true exactly when MainOp is
// not commutative - presumably this permits an identity that is only valid as
// the right-hand operand; confirm against the getBinOpIdentity documentation.
// Asserts that MainOpcode is one of the opcodes accepted by isSupportedOpcode.
11637 Value *selectBestIdempotentValue() const {
11638 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11639 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
11640 !MainOp->isCommutative());
11641 }
11642
11643 /// Returns the value and operands for the \p V, considering if it is original
11644 /// instruction and its actual operands should be returned, or it is a
11645 /// copyable element and it should be represented as idempotent instruction.
// Three cases:
//  - poison: both operands are the poison value itself;
//  - not a copyable element of S: the operand list produced by convertTo();
//  - copyable element: V itself paired with the value from
//    selectBestIdempotentValue().
11646 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
11647 if (isa<PoisonValue>(V))
11648 return {V, V};
11649 if (!S.isCopyableElement(V))
11650 return convertTo(cast<Instruction>(V), S).second;
11651 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11652 return {V, selectBestIdempotentValue()};
11653 }
11654
11655 /// Builds operands for the original instructions.
11656 void
11657 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
11658 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
11659
11660 unsigned ShuffleOrOp =
11661 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11662 Instruction *VL0 = S.getMainOp();
11663
11664 switch (ShuffleOrOp) {
11665 case Instruction::PHI: {
11666 auto *PH = cast<PHINode>(VL0);
11667
11668 // Keeps the reordered operands to avoid code duplication.
11669 PHIHandler Handler(DT, PH, VL);
11670 Handler.buildOperands();
11671 Operands.assign(PH->getNumOperands(), {});
11672 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
11673 Operands[I].assign(Handler.getOperands(I).begin(),
11674 Handler.getOperands(I).end());
11675 return;
11676 }
11677 case Instruction::ExtractValue:
11678 case Instruction::ExtractElement:
11679 // This is a special case, as it does not gather, but at the same time
11680 // we are not extending buildTree_rec() towards the operands.
11681 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
11682 return;
11683 case Instruction::InsertElement:
11684 Operands.assign(2, {VL.size(), nullptr});
11685 for (auto [Idx, V] : enumerate(VL)) {
11686 auto *IE = cast<InsertElementInst>(V);
11687 for (auto [OpIdx, Ops] : enumerate(Operands))
11688 Ops[Idx] = IE->getOperand(OpIdx);
11689 }
11690 return;
11691 case Instruction::Load:
11692 Operands.assign(
11693 1, {VL.size(),
11694 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
11695 for (auto [V, Op] : zip(VL, Operands.back())) {
11696 auto *LI = dyn_cast<LoadInst>(V);
11697 if (!LI)
11698 continue;
11699 Op = LI->getPointerOperand();
11700 }
11701 return;
11702 case Instruction::ZExt:
11703 case Instruction::SExt:
11704 case Instruction::FPToUI:
11705 case Instruction::FPToSI:
11706 case Instruction::FPExt:
11707 case Instruction::PtrToInt:
11708 case Instruction::IntToPtr:
11709 case Instruction::SIToFP:
11710 case Instruction::UIToFP:
11711 case Instruction::Trunc:
11712 case Instruction::FPTrunc:
11713 case Instruction::BitCast:
11714 case Instruction::ICmp:
11715 case Instruction::FCmp:
11716 case Instruction::FNeg:
11717 case Instruction::Add:
11718 case Instruction::FAdd:
11719 case Instruction::Sub:
11720 case Instruction::FSub:
11721 case Instruction::Mul:
11722 case Instruction::FMul:
11723 case Instruction::UDiv:
11724 case Instruction::SDiv:
11725 case Instruction::FDiv:
11726 case Instruction::URem:
11727 case Instruction::SRem:
11728 case Instruction::FRem:
11729 case Instruction::Shl:
11730 case Instruction::LShr:
11731 case Instruction::AShr:
11732 case Instruction::And:
11733 case Instruction::Or:
11734 case Instruction::Xor:
11735 case Instruction::Freeze:
11736 case Instruction::Store:
11737 case Instruction::ShuffleVector:
11738 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11739 for (auto [Idx, V] : enumerate(VL)) {
11740 auto *I = dyn_cast<Instruction>(V);
11741 if (!I) {
11742 for (auto [OpIdx, Ops] : enumerate(Operands))
11743 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11744 continue;
11745 }
11746 auto [Op, ConvertedOps] = convertTo(I, S);
11747 for (auto [OpIdx, Ops] : enumerate(Operands))
11748 Ops[Idx] = ConvertedOps[OpIdx];
11749 }
11750 return;
11751 case Instruction::Select:
11752 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11753 for (auto [Idx, V] : enumerate(VL)) {
11754 auto *I = dyn_cast<Instruction>(V);
11755 if (!I) {
11756 for (auto [OpIdx, Ops] : enumerate(Operands))
11757 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11758 continue;
11759 }
11760 if (isa<ZExtInst>(I)) {
11761 // Special case for select + zext i1 to avoid explosion of different
11762 // types. We want to keep the condition as i1 to be able to match
11763 // different selects together and reuse the vectorized condition
11764 // rather than trying to gather it.
11765 Operands[0][Idx] = I->getOperand(0);
11766 Operands[1][Idx] = ConstantInt::get(I->getType(), 1);
11767 Operands[2][Idx] = ConstantInt::getNullValue(I->getType());
11768 continue;
11769 }
11770 auto [Op, ConvertedOps] = convertTo(I, S);
11771 for (auto [OpIdx, Ops] : enumerate(Operands))
11772 Ops[Idx] = ConvertedOps[OpIdx];
11773 }
11774 return;
11775 case Instruction::GetElementPtr: {
11776 Operands.assign(2, {VL.size(), nullptr});
11777 // Need to cast all indices to the same type before vectorization to
11778 // avoid crash.
11779 // Required to be able to find correct matches between different gather
11780 // nodes and reuse the vectorized values rather than trying to gather them
11781 // again.
11782 const unsigned IndexIdx = 1;
11783 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
11784 Type *Ty =
11785 all_of(VL,
11786 [&](Value *V) {
11788 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
11789 })
11790 ? VL0Ty
11791 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
11792 ->getPointerOperandType()
11793 ->getScalarType());
11794 for (auto [Idx, V] : enumerate(VL)) {
11796 if (!GEP) {
11797 Operands[0][Idx] = V;
11798 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11799 continue;
11800 }
11801 Operands[0][Idx] = GEP->getPointerOperand();
11802 auto *Op = GEP->getOperand(IndexIdx);
11803 auto *CI = dyn_cast<ConstantInt>(Op);
11804 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
11805 CI, Ty, CI->getValue().isSignBitSet(), DL)
11806 : Op;
11807 }
11808 return;
11809 }
11810 case Instruction::Call: {
11811 auto *CI = cast<CallInst>(VL0);
11813 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
11815 continue;
11816 auto &Ops = Operands.emplace_back();
11817 for (Value *V : VL) {
11818 auto *I = dyn_cast<Instruction>(V);
11819 Ops.push_back(I ? I->getOperand(Idx)
11820 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
11821 }
11822 }
11823 return;
11824 }
11825 default:
11826 break;
11827 }
11828 llvm_unreachable("Unexpected vectorization of the instructions.");
11829 }
11830
11831 /// Check if the specified \p VL list of values is better to represent as
11832 /// uniform with copyables, as modeled via \p CopyableS, or as alternate (or
11833 /// uniform with compatible ops), modeled via \p S.
11834 /// Performs the analysis of the operands, choosing the preferred main
11835 /// instruction and checking the matching of the operands for the main
11836 /// instruction and copyable elements.
/// \param VL The list of scalars being analyzed.
/// \param R The vectorizer instance; queried through isVectorized() and
/// findBestRootPair().
/// \param S The state that models \p VL as alternate/compatible operations.
/// \param CopyableS The state that models \p VL as uniform with copyables.
/// \returns false if the modeling via \p S should be kept, true if the
/// copyable modeling passed every per-lane comparison below.
/// NOTE(review): this listing skips several source lines (the declarations of
/// Ops, VOps, MainOps and Candidates, and parts of two expressions, are not
/// visible) - confirm the details against the complete file.
11837 bool isCopyablePreferable(ArrayRef<Value *> VL, const BoUpSLP &R,
11838 const InstructionsState &S,
11839 const InstructionsState &CopyableS) {
11840 // If all elements are vectorized already - keep as is.
11841 if (all_of(VL, [&](Value *V) {
11842 return isa<PoisonValue>(V) || R.isVectorized(V);
11843 }))
11844 return false;
11845 Instruction *SMain = S.getMainOp();
11846 Instruction *SAlt = S.isAltShuffle() ? S.getAltOp() : nullptr;
11847 const bool IsCommutative = ::isCommutative(SMain);
11848 const bool IsAltCommutative =
11849 S.isAltShuffle() ? ::isCommutative(SAlt) : false;
11850 const bool IsMainCommutative = ::isCommutative(MainOp);
11852 buildOriginalOperands(S, SMain, Ops);
11853 // Support only binary operations for now.
11854 if (Ops.size() != 2)
11855 return false;
11856 // Try to find better candidate for S main instruction, which operands have
11857 // better matching.
// CheckOperands(Op, SMainOp) holds when Op is a binary operator and either
// SMainOp is not a binary operator, or at least one operand of Op is an
// instruction with the same opcode as SMainOp.
11858 auto CheckOperands = [](Value *Op, Value *SMainOp) {
11859 auto *OpI = dyn_cast<BinaryOperator>(Op);
11860 if (!OpI)
11861 return false;
11862 auto *SMainOpI = dyn_cast<BinaryOperator>(SMainOp);
11863 if (!SMainOpI)
11864 return true;
11865 return any_of(OpI->operands(), [&](Value *V) {
11866 auto *I = dyn_cast<Instruction>(V);
11867 return I && I->getOpcode() == SMainOpI->getOpcode();
11868 });
11869 };
// Operand values of the lanes that match the main instruction of S. The set
// is consulted in the main loop to reject alternate-opcode lanes whose
// first-lane operand is a binary operator contained in it.
11870 SmallPtrSet<Value *, 8> Operands;
11871 for (Value *V : VL) {
11872 auto *I = dyn_cast<Instruction>(V);
11873 if (!I || I == SMain)
11874 continue;
11875 Instruction *MatchingOp = S.getMatchingMainOpOrAltOp(I);
11876 if (MatchingOp != SMain)
11877 continue;
11879 buildOriginalOperands(S, I, VOps);
11880 Operands.insert(I->op_begin(), I->op_end());
11881 assert(VOps.size() == 2 && Ops.size() == 2 &&
11882 "Expected binary operations only.");
// The first lane that satisfies CheckOperands (directly, or crosswise for a
// commutative main op) replaces SMain, and its operands replace Ops.
11883 if (CheckOperands(VOps[0][0], Ops[0][0]) ||
11884 CheckOperands(VOps[1][0], Ops[1][0]) ||
11885 (IsCommutative && (CheckOperands(VOps[0][0], Ops[1][0]) ||
11886 CheckOperands(VOps[1][0], Ops[0][0])))) {
11887 SMain = I;
11888 Ops.swap(VOps);
11889 break;
11890 }
11891 }
11893 buildOriginalOperands(S, MainOp, MainOps);
11894
// Pairs the first operand of the reference lane (Ops[0][0]) with Op0 and,
// for commutative operations, also with Op1.
// NOTE(review): one line of the parameter list is missing from this listing.
11895 auto BuildFirstOperandCandidates =
11896 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
11898 bool IsCommutative) {
11899 Candidates.emplace_back(Ops[0][0], Op0);
11900 if (IsCommutative)
11901 Candidates.emplace_back(Ops[0][0], Op1);
11902 };
11903
// Same for the second operand of the reference lane (Ops[1][0]). PrevBestIdx
// suppresses one of the pairings: 1 drops the pairing with Op1, 0 drops the
// pairing with Op0; presumably so that the operand already selected for the
// first position is not reused.
11904 auto BuildSecondOperandCandidates =
11905 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
11906 ArrayRef<BoUpSLP::ValueList> Ops, int PrevBestIdx, Value *Op0,
11907 Value *Op1, bool IsCommutative) {
11908 if (PrevBestIdx != 1)
11909 Candidates.emplace_back(Ops[1][0], Op1);
11910 if (PrevBestIdx != 0 && IsCommutative)
11911 Candidates.emplace_back(Ops[1][0], Op0);
11912 };
11913
// Asks R.findBestRootPair() for the best pair among Candidates and returns
// its (optional) index. Score receives the reported score; IsConst is derived
// from isConstant() on both values of the selected pair. If the selection is
// constant, the first candidate whose two values are the same non-constant
// value (a splat) is chosen instead.
// NOTE(review): parts of the IsConst and Score expressions are missing from
// this listing.
11914 auto FindBestCandidate =
11915 [&](ArrayRef<std::pair<Value *, Value *>> Candidates, bool &IsConst,
11916 int &Score) {
11917 auto Res = R.findBestRootPair(Candidates);
11918 Score = Res.second;
11919 IsConst =
11921 isConstant(Candidates[Res.first.value_or(0)].first) &&
11922 isConstant(Candidates[Res.first.value_or(0)].second);
11923 if (IsConst) {
11924 // Check if there are splat candidates and consider them better
11925 // option.
11926 for (const auto [Idx, P] : enumerate(Candidates)) {
11927 if (!isConstant(P.first) && !isConstant(P.second) &&
11928 P.second == P.first) {
11929 Res.first = Idx;
11930 IsConst = false;
11931 Score = isa<LoadInst>(Candidates[Res.first.value_or(0)].first)
11934 break;
11935 }
11936 }
11937 }
11938 return Res.first;
11939 };
11940
// Compare, lane by lane, how well the operands match under the copyable
// modeling (MainOps vs. CopyableOps) and under the original modeling
// (Ops vs. VOps). Any lane for which the original modeling wins makes the
// function return false.
11941 for (Value *V : VL) {
11942 auto *I = dyn_cast<Instruction>(V);
11943 if (!I || (I == MainOp && (!S.isAltShuffle() || I == SMain)) ||
11944 (!S.isAltShuffle() && I == SMain))
11945 continue;
11947 buildOriginalOperands(S, I == SMain ? MainOp : I, VOps);
11948 SmallVector<Value *> CopyableOps =
11949 getOperands(CopyableS, I == MainOp ? SMain : I);
// Both modelings produce identical operands for this lane - nothing to
// compare.
11950 if (CopyableOps.size() == VOps.size() &&
11951 all_of(zip(CopyableOps, VOps), [&](const auto &P) {
11952 return std::get<0>(P) == std::get<1>(P)[0];
11953 }))
11954 continue;
11956 BuildFirstOperandCandidates(Candidates, MainOps, CopyableOps[0],
11957 CopyableOps[1], IsMainCommutative);
// Candidates added so far belong to the copyable modeling; everything at
// index >= OpSize belongs to the original modeling.
11958 const unsigned OpSize = Candidates.size();
11959 Instruction *MatchingOp =
11960 S.getMatchingMainOpOrAltOp(I) == S.getMainOp() ? SMain : SAlt;
11961 const bool IsCommutativeInst =
11962 (MatchingOp == SMain ? IsCommutative : IsAltCommutative) ||
11963 ::isCommutative(I, MatchingOp);
11964 if (S.isAltShuffle() && MatchingOp == SAlt &&
11965 any_of(VOps, [&](const BoUpSLP::ValueList &Ops) {
11966 auto *I = dyn_cast<BinaryOperator>(Ops[0]);
11967 return I && Operands.contains(I);
11968 }))
11969 return false;
11970 if (S.isAltShuffle() && MatchingOp == SMain)
11971 Operands.insert(I->op_begin(), I->op_end());
11972 BuildFirstOperandCandidates(Candidates, Ops, VOps[0][0], VOps[1][0],
11973 IsCommutativeInst);
11974 bool IsBestConst;
11975 int Score;
11976 std::optional<int> BestOp =
11977 FindBestCandidate(Candidates, IsBestConst, Score);
// No result at all is also treated as "original is better" here.
11978 const bool IsOriginalBetter =
11979 static_cast<unsigned>(BestOp.value_or(OpSize)) >= OpSize;
11980 Candidates.clear();
// Repeat the comparison for the second operand, passing the index chosen
// for the first operand (relative to its own modeling) as PrevBestIdx.
11981 BuildSecondOperandCandidates(
11982 Candidates, MainOps, IsOriginalBetter ? -1 : *BestOp, CopyableOps[0],
11983 CopyableOps[1], IsMainCommutative);
11984 const unsigned SecondOpSize = Candidates.size();
11985 BuildSecondOperandCandidates(
11986 Candidates, Ops,
11987 IsOriginalBetter ? BestOp.value_or(OpSize - 1) - OpSize : -1,
11988 VOps[0][0], VOps[1][0], IsCommutativeInst);
11989 bool IsSecondBestConst;
11990 int SecondScore;
11991 std::optional<int> SecondBestOp =
11992 FindBestCandidate(Candidates, IsSecondBestConst, SecondScore);
11993 // No best candidates.
11994 if (!BestOp && !SecondBestOp)
11995 return false;
11996 // Original better in both ops combinations.
11997 const bool IsSecondOriginalBetter =
11998 static_cast<unsigned>(SecondBestOp.value_or(SecondOpSize)) >=
11999 SecondOpSize;
12000 if (IsOriginalBetter && IsSecondOriginalBetter)
12001 return false;
12002 // Original is better in second combination, but in the first combination
12003 // no best candidates.
12004 if (!BestOp && IsSecondOriginalBetter)
12005 return false;
12006 // Original is better in first combination, but in the second combination
12007 // no best candidates.
12008 if (!SecondBestOp && IsOriginalBetter)
12009 return false;
12010 // Copyable is best in the first combination, but it is constant, but
12011 // original is better in second non-constant combination.
12012 if (!IsOriginalBetter && IsBestConst && IsSecondOriginalBetter &&
12013 !IsSecondBestConst)
12014 return false;
12015 // Copyable is best in the second combination, but it is constant, but
12016 // original is better in the first non-constant combination.
12017 if (BestOp && IsOriginalBetter && !IsBestConst &&
12018 !IsSecondOriginalBetter && IsSecondBestConst)
12019 return false;
12020 // Original combination score is better.
// NOTE(review): two lines of this condition are missing from this listing.
12021 if (((Score > SecondScore ||
12023 Score == SecondScore)) &&
12024 IsOriginalBetter) ||
12025 (IsSecondOriginalBetter &&
12026 (SecondScore > Score ||
12028 Score == SecondScore))))
12029 return false;
12030 }
12031 return true;
12032 }
12033
12034public:
/// Creates the analysis helper, initializing the DT, DL, TTI and TLI members
/// from the dominator tree, data layout, target transform info and target
/// library info passed by the caller.
12035 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
12036 const TargetTransformInfo &TTI,
12037 const TargetLibraryInfo &TLI)
12038 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
12039
/// Builds the InstructionsState that describes how \p VL may be vectorized.
/// The analysis starts from getSameOpcode(), handles a mix of selects and
/// zext i1 instructions, and otherwise tries to model \p VL as a main
/// instruction plus copyable elements (see findAndSetMainInstruction()).
/// \param VL The list of scalars to analyze.
/// \param R The vectorizer instance, used for findBestRootPair() queries and
/// passed on to the helpers.
/// \param WithProfitabilityCheck If false, the state with copyable elements
/// is returned as soon as it is preferred over the original one, without the
/// operand and cost checks that follow.
/// \param SkipSameCodeCheck If true, getSameOpcode() is not queried and the
/// analysis starts from an invalid state.
/// \returns the selected state; this may be the original state, which is
/// invalid when no common opcode was found.
/// NOTE(review): this listing skips a few source lines (part of the select +
/// zext predicate, the condition guarding the early `return S` before
/// findAndSetMainInstruction(), and the declaration of Kind) - confirm against
/// the complete file.
12040 InstructionsState buildInstructionsState(ArrayRef<Value *> VL,
12041 const BoUpSLP &R,
12042 bool WithProfitabilityCheck = false,
12043 bool SkipSameCodeCheck = false) {
12044 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
12045 ? InstructionsState::invalid()
12046 : getSameOpcode(VL, TLI);
12047 // Check if series of selects + zext i1 %x to iN can be combined into
12048 // selects + select %x, i32 1, i32 0.
12049 Instruction *SelectOp = nullptr;
12050 if (!S && allSameBlock(VL) && all_of(VL, [&](Value *V) {
12051 if (match(V, m_Select(m_Value(), m_Value(), m_Value()))) {
// Remember the first select; it becomes the main instruction of the
// resulting state.
12052 if (!SelectOp)
12053 SelectOp = cast<Instruction>(V);
12054 return true;
12055 }
12056 auto *ZExt = dyn_cast<ZExtInst>(V);
12057 return (ZExt && ZExt->getSrcTy()->isIntegerTy(1)) ||
12059 })) {
12060 if (SelectOp)
12061 return InstructionsState(SelectOp, SelectOp);
12062 }
12063 if (S && S.isAltShuffle()) {
12064 Type *ScalarTy = S.getMainOp()->getType();
12065 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
12066 unsigned Opcode0 = S.getOpcode();
12067 unsigned Opcode1 = S.getAltOpcode();
12068 SmallBitVector OpcodeMask(
12069 getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
12070 // If this pattern is supported by the target then we consider the order.
12071 if (TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
12072 return S;
// Keep S as is when copyable elements are disabled, the main instruction is
// not a binary operator, or every instruction in VL already has the main
// opcode (Shl is also accepted when the main opcode is Add).
12073 } else if (S && (!VectorizeCopyableElements ||
12074 !isa<BinaryOperator>(S.getMainOp()) ||
12075 all_of(VL, [&](Value *V) {
12076 auto *I = dyn_cast<Instruction>(V);
12077 return !I || I->getOpcode() == S.getOpcode() ||
12078 (S.getOpcode() == Instruction::Add &&
12079 I->getOpcode() == Instruction::Shl);
12080 }))) {
12081 return S;
12082 }
12084 return S;
// Try to pick a main instruction for modeling VL with copyable elements; if
// none is found, MainOp stays null and the original state is returned.
12085 findAndSetMainInstruction(VL, R);
12086 if (!MainOp)
12087 return S;
12088 InstructionsState OrigS = S;
12089 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
12090 if (OrigS && !isCopyablePreferable(VL, R, OrigS, S))
12091 return OrigS;
12092 if (!WithProfitabilityCheck)
12093 return S;
12094 // Check if it is profitable to vectorize the instruction.
12095 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
// Adds the pair (V1, V2) as a root-pair candidate unless V1 is a PHI
// different from V2, or both are instructions with the same opcode located
// in different blocks. If neither value is an instruction, (V1, V1) is
// added instead.
12096 auto BuildCandidates =
12097 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
12098 Value *V2) {
12099 if (V1 != V2 && isa<PHINode>(V1))
12100 return;
12101 auto *I1 = dyn_cast<Instruction>(V1);
12102 auto *I2 = dyn_cast<Instruction>(V2);
12103 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
12104 I1->getParent() != I2->getParent())
12105 return;
12106 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
12107 };
// Two-element case: both operand positions must yield a root pair (either
// directly or, for a commutative main op, crosswise), and the vector cost
// must not exceed the scalar cost; otherwise the original state is kept.
12108 if (VL.size() == 2) {
12109 // Check if the operands allow better vectorization.
12110 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
12111 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
12112 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
12113 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
12114 R.findBestRootPair(Candidates1).first &&
12115 R.findBestRootPair(Candidates2).first;
12116 if (!Res && isCommutative(MainOp)) {
12117 Candidates1.clear();
12118 Candidates2.clear();
12119 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
12120 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
12121 Res = !Candidates1.empty() && !Candidates2.empty() &&
12122 R.findBestRootPair(Candidates1).first &&
12123 R.findBestRootPair(Candidates2).first;
12124 }
12125 if (!Res)
12126 return OrigS;
12128 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
12129 InstructionCost VectorCost;
12130 FixedVectorType *VecTy =
12131 getWidenedType(S.getMainOp()->getType(), VL.size());
// Only the opcodes listed here are expected as a main opcode with
// copyables; anything else hits llvm_unreachable.
12132 switch (MainOpcode) {
12133 case Instruction::Add:
12134 case Instruction::Sub:
12135 case Instruction::LShr:
12136 case Instruction::Shl:
12137 case Instruction::SDiv:
12138 case Instruction::UDiv:
12139 case Instruction::And:
12140 case Instruction::Or:
12141 case Instruction::Xor:
12142 case Instruction::FAdd:
12143 case Instruction::FMul:
12144 case Instruction::FSub:
12145 case Instruction::FDiv:
12146 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
12147 break;
12148 default:
12149 llvm_unreachable("Unexpected instruction.");
12150 }
12151 if (VectorCost > ScalarCost)
12152 return OrigS;
12153 return S;
12154 }
12155 assert(Operands.size() == 2 && "Unexpected number of operands!");
12156 unsigned CopyableNum =
12157 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
// Fewer than half of the lanes are copyable - accept without further
// checks.
12158 if (CopyableNum < VL.size() / 2)
12159 return S;
12160 // Too many phi copyables - exit.
12161 const unsigned Limit = VL.size() / 24;
12162 if ((CopyableNum >= VL.size() - Limit ||
12163 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
12164 CopyableNum >= MaxPHINumOperands) &&
12165 all_of(VL, [&](Value *V) {
12166 return isa<PHINode>(V) || !S.isCopyableElement(V);
12167 }))
12168 return OrigS;
12169 // Check profitability if number of copyables > VL.size() / 2.
12170 // 1. Reorder operands for better matching.
12171 if (isCommutative(MainOp)) {
12172 Value *BestFrontOp = nullptr;
12173 for (auto [OpL, OpR] : zip(Operands.front(), Operands.back())) {
12174 // Make instructions the first operands.
12175 if (!isa<Instruction>(OpL) && isa<Instruction>(OpR)) {
12176 BestFrontOp = OpR;
12177 std::swap(OpL, OpR);
12178 continue;
12179 }
12180 // Make constants the second operands.
12181 if ((isa<Constant>(OpL) && !match(OpR, m_Zero())) ||
12182 match(OpL, m_Zero())) {
12183 if (isa<Instruction>(OpR))
12184 BestFrontOp = OpR;
12185 std::swap(OpL, OpR);
12186 continue;
12187 }
12188 if (isa<Instruction>(OpL))
12189 BestFrontOp = OpL;
12190 }
12191 // If some of the RHS operands better match most of LHS - swap such
12192 // operands to increase matching rate.
12193 if (auto *BestLHS = dyn_cast_if_present<Instruction>(BestFrontOp)) {
12194 const unsigned BestOpcode = BestLHS->getOpcode();
12195 for (auto [OpL, OpR] : zip(Operands.front(), Operands.back())) {
12196 auto *OpRI = dyn_cast<Instruction>(OpR);
12197 if (!OpRI)
12198 continue;
12199 if (OpRI->getOpcode() == BestOpcode)
12200 std::swap(OpL, OpR);
12201 }
12202 }
12203 }
12204 // 2. Check, if operands can be vectorized.
// More than one instruction left among the second operands - keep the
// original state.
12205 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
12206 return OrigS;
// Returns true if Ops is all-constant, a splat, an "almost" splat, or can
// itself be described by a valid state in which at most half of VL.size()
// elements are copyable.
12207 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
12208 if (allConstant(Ops) || isSplat(Ops))
12209 return true;
12210 // Check if it is "almost" splat, i.e. has >= 4 elements and only single
12211 // one is different.
12212 constexpr unsigned Limit = 4;
12213 if (Operands.front().size() >= Limit) {
12214 SmallDenseMap<const Value *, unsigned> Counters;
12215 for (Value *V : Ops) {
12216 if (isa<UndefValue>(V))
12217 continue;
12218 ++Counters[V];
12219 }
12220 if (Counters.size() == 2 &&
12221 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
12222 return C.second == 1;
12223 }))
12224 return true;
12225 }
12226 // First operand not a constant or splat? Last attempt - check for
12227 // potential vectorization.
// A fresh analysis object is used for the nested query, which runs with the
// default arguments (no profitability check).
12228 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12229 InstructionsState OpS = Analysis.buildInstructionsState(Ops, R);
12230 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
12231 return false;
12232 unsigned CopyableNum =
12233 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
12234 return CopyableNum <= VL.size() / 2;
12235 };
12236 if (!CheckOperand(Operands.front()))
12237 return OrigS;
12238
12239 return S;
12240 }
12241
/// Builds the operand lists for the scalars in \p VL under the state \p S.
/// For a state with copyable elements the operands of every lane are taken
/// from getOperands(); if the main instruction is commutative with exactly two
/// operands, the operand order of the non-copyable lanes is then normalized.
/// For any other state buildOriginalOperands() is used.
/// Note that for states with copyable elements this also overwrites MainOp and
/// MainOpcode (presumably members of the analysis object - they are not
/// declared in this function).
/// \returns the operands, indexed as Operands[OperandIdx][LaneIdx].
/// NOTE(review): the declaration of the local Operands container is missing
/// from this listing.
12242 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
12243 ArrayRef<Value *> VL) {
12244 assert(S && "Invalid state!");
12246 if (S.areInstructionsWithCopyableElements()) {
12247 MainOp = S.getMainOp();
12248 MainOpcode = S.getOpcode();
12249 const bool IsCommutative =
12250 isCommutative(MainOp) && MainOp->getNumOperands() == 2;
12251 Operands.assign(MainOp->getNumOperands(),
12252 BoUpSLP::ValueList(VL.size(), nullptr));
12253 // Populate operands for every lane.
12254 for (auto [Idx, V] : enumerate(VL)) {
12255 SmallVector<Value *> OperandsForValue = getOperands(S, V);
12256 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
12257 Operands[OperandIdx][Idx] = Operand;
12258 }
12259 // Operand-order normalization below swaps OpIdx 0 and OpIdx 1
12260 // of non-copyable lanes. That is only safe when the main op is
12261 // commutative (e.g. 0 - X is not X - 0, so `sub` must be
12262 // excluded).
12263 if (IsCommutative) {
12264 // Count (ID0, ID1) pair frequencies for operand normalization.
12265 // Pairs and their inverses are tracked under a canonical key
12266 // so that (Load, Add) and (Add, Load) contribute to the same
12267 // bucket.
// FwdCount counts lanes whose value IDs appear in (min, max) order,
// RevCount counts lanes with the opposite order.
12268 struct PairInfo {
12269 unsigned FwdCount = 0;
12270 unsigned RevCount = 0;
12271 };
12272 SmallMapVector<std::pair<unsigned, unsigned>, PairInfo, 8> PairCounts;
12273 unsigned MajID0 = 0, MajID1 = 0;
12274 for (auto [Idx, V] : enumerate(VL)) {
12275 if (S.isCopyableElement(V) || isa<PoisonValue>(V))
12276 continue;
12277 unsigned ID0 = Operands[0][Idx]->getValueID();
12278 unsigned ID1 = Operands[1][Idx]->getValueID();
// Both operands have the same value ID - the lane has no orientation to
// vote for.
12279 if (ID0 == ID1)
12280 continue;
12281 unsigned MinID = std::min(ID0, ID1);
12282 unsigned MaxID = std::max(ID0, ID1);
12283 auto [It, Inserted] =
12284 PairCounts.try_emplace(std::make_pair(MinID, MaxID));
12285 PairInfo &Info = It->second;
12286 if (ID0 < ID1)
12287 ++Info.FwdCount;
12288 else
12289 ++Info.RevCount;
12290 }
12291 // Find the most frequent (ID0, ID1) pair across non-copyable
12292 // lanes. Select the orientation (original or inverse) that
12293 // has more votes as the majority pattern.
// Ties between buckets keep the earlier bucket (strict '>'); ties between
// orientations prefer the forward one ('>=').
12294 unsigned BestCount = 0;
12295 for (const auto &P : PairCounts) {
12296 const PairInfo &Info = P.second;
12297 unsigned Total = Info.FwdCount + Info.RevCount;
12298 if (Total > BestCount) {
12299 BestCount = Total;
12300 if (Info.FwdCount >= Info.RevCount) {
12301 MajID0 = P.first.first;
12302 MajID1 = P.first.second;
12303 } else {
12304 MajID0 = P.first.second;
12305 MajID1 = P.first.first;
12306 }
12307 }
12308 }
12309 // Normalize non-copyable lanes in two steps:
12310 // 1) Swap lanes whose operand types are the exact inverse of
12311 // the majority pattern, making the non-copyable lanes
12312 // consistent.
12313 // 2) Independently, if a strict majority of non-copyable lanes
12314 // have loads at OpIdx 1, swap those lanes to put loads at
12315 // OpIdx 0 for better downstream vectorization.
// LAt0/LAt1 count loads at OpIdx 0/1 after step 1; TotalNC counts the
// non-copyable, non-poison lanes.
12316 unsigned LAt0 = 0, LAt1 = 0, TotalNC = 0;
12317 for (auto [Idx, V] : enumerate(VL)) {
12318 if (S.isCopyableElement(V) || isa<PoisonValue>(V))
12319 continue;
12320 // Step 1: swap exact-inverse lanes.
12321 if (BestCount > 0) {
12322 unsigned ID0 = Operands[0][Idx]->getValueID();
12323 unsigned ID1 = Operands[1][Idx]->getValueID();
12324 if (ID0 == MajID1 && ID1 == MajID0)
12325 std::swap(Operands[0][Idx], Operands[1][Idx]);
12326 }
12327 ++TotalNC;
12328 LAt0 += isa<LoadInst>(Operands[0][Idx]);
12329 LAt1 += isa<LoadInst>(Operands[1][Idx]);
12330 }
12331 // Step 2: if most non-copyable lanes have loads at OpIdx 1,
12332 // swap those lanes to put loads at OpIdx 0.
12333 if (TotalNC > 1 && LAt1 > LAt0 && LAt1 * 2 > TotalNC) {
12334 for (auto [Idx, V] : enumerate(VL)) {
12335 if (S.isCopyableElement(V) || isa<PoisonValue>(V))
12336 continue;
12337 if (!isa<LoadInst>(Operands[0][Idx]) &&
12338 isa<LoadInst>(Operands[1][Idx]))
12339 std::swap(Operands[0][Idx], Operands[1][Idx]);
12340 }
12341 }
12342 }
12343 } else {
12344 buildOriginalOperands(S, VL, Operands);
12345 }
12346 return Operands;
12347 }
12348};
12349} // namespace
12350
/// Decides whether the scalars in \p VL may become a vectorized tree node.
/// \param VL The list of scalars; must be all constants or all of one type.
/// \param Depth Current recursion depth, compared against RecursionMaxDepth.
/// \param UserTreeIdx The edge to the user tree entry; its UserTE is
/// inspected to detect a ScatterVectorize user.
/// \returns a ScalarsVectorizationLegality built from the instructions state
/// and the IsLegal flag, optionally with the TryToFindDuplicates and
/// TrySplitVectorize flags.
/// NOTE(review): this listing skips several source lines (parts of the
/// conditions for unreachable/EH blocks, catchswitch blocks, scalable
/// vectors, the recursion-depth check and the C,S,B,O check, plus the
/// declaration of Kind) - confirm against the complete file.
12351BoUpSLP::ScalarsVectorizationLegality
12352BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
12353 const EdgeInfo &UserTreeIdx) const {
12354 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
12355
12356 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
12357 InstructionsState S = Analysis.buildInstructionsState(
12358 VL, *this, /*WithProfitabilityCheck=*/true);
12359
12360 bool AreScatterAllGEPSameBlock = false;
// No common state was found. The only case still accepted is a list of
// pointers used by a ScatterVectorize node, where all GEPs have two operands
// and live in one block, the remaining values do not need scheduling, and
// sortPtrAccesses() succeeds.
12361 if (!S) {
12362 SmallVector<unsigned> SortedIndices;
12363 BasicBlock *BB = nullptr;
12364 bool IsScatterVectorizeUserTE =
12365 UserTreeIdx.UserTE &&
12366 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
12367 AreScatterAllGEPSameBlock =
12368 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
12369 VL.size() > 2 &&
12370 all_of(VL,
12371 [&BB](Value *V) {
12372 auto *I = dyn_cast<GetElementPtrInst>(V);
12373 if (!I)
12374 return doesNotNeedToBeScheduled(V);
12375 if (!BB)
12376 BB = I->getParent();
12377 return BB == I->getParent() && I->getNumOperands() == 2;
12378 }) &&
12379 BB &&
12380 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
12381 *SE, SortedIndices));
12382 if (!AreScatterAllGEPSameBlock) {
12383 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
12384 "C,S,B,O, small shuffle. \n";
12385 dbgs() << "[";
12386 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
12387 dbgs() << "]\n");
12388 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
12389 /*TryToFindDuplicates=*/true,
12390 /*TrySplitVectorize=*/true);
12391 }
12392 // Reset S to make it GetElementPtr kind of node.
12393 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
12394 assert(It != VL.end() && "Expected at least one GEP.");
12395 S = getSameOpcode(*It, *TLI);
12396 }
12397 assert(S && "Must be valid.");
12398
12399 // Don't handle vectors.
12400 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
12401 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
12402 // Do not try to pack to avoid extra instructions here.
12403 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
12404 /*TryToFindDuplicates=*/false);
12405 }
12406
12407 // Check that all of the users of the scalars that we want to vectorize are
12408 // schedulable.
12409 BasicBlock *BB = S.getMainOp()->getParent();
12410
// NOTE(review): the first line of this condition is missing from this
// listing.
12412 !DT->isReachableFromEntry(BB)) {
12413 // Don't go into unreachable blocks. They may contain instructions with
12414 // dependency cycles which confuse the final scheduling.
12415 // Do not vectorize EH and non-returning blocks, not profitable in most
12416 // cases.
12417 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
12418 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12419 }
12420
12421 // Don't go into catchswitch blocks, which can happen with PHIs.
12422 // Such blocks can only have PHIs and the catchswitch. There is no
12423 // place to insert a shuffle if we need to, so just avoid that issue.
// NOTE(review): the condition line of this check is missing from this
// listing.
12425 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
12426 // Do not try to pack to avoid extra instructions here.
12427 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
12428 /*TryToFindDuplicates=*/false);
12429 }
12430
12431 // Don't handle scalable vectors
12432 if (S.getOpcode() == Instruction::ExtractElement &&
12434 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
12435 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
12436 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12437 }
12438
12439 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
12440 // a load), in which case peek through to include it in the tree, without
12441 // ballooning over-budget.
12442 if (Depth >= RecursionMaxDepth &&
12443 (S.isAltShuffle() || VL.size() < 4 ||
12444 !(match(S.getMainOp(), m_Load(m_Value())) ||
12445 all_of(VL, [&S](const Value *I) {
12446 return match(I,
12448 cast<Instruction>(I)->getOpcode() == S.getOpcode();
12449 })))) {
12450 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
12451 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12452 }
12453
12454 // Check if this is a duplicate of another entry.
12455 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
12456 // Cache invariants to avoid recomputing for every V in VL (and every E).
12457 const bool IsPHIWithLoop =
12458 S.getOpcode() == Instruction::PHI &&
12459 LI->getLoopFor(S.getMainOp()->getParent()) != nullptr;
12460 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
12461 if (E->isSame(VL)) {
12462 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
12463 << ".\n");
12464 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12465 }
// Every value of VL is poison, already a scalar of E, or (for PHIs inside
// a loop) an already vectorized PHI - nothing new would be vectorized.
12466 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
12467 if (all_of(VL, [&](Value *V) {
12468 return isa<PoisonValue>(V) || Values.contains(V) ||
12469 (IsPHIWithLoop && isa<PHINode>(V) && isVectorized(V));
12470 })) {
12471 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
12472 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12473 }
12474 }
12475
// NOTE(review): as written, AreAllSameInsts is (!X || X) and is therefore
// always true, so the !AreAllSameInsts test below cannot fire.
12476 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
12477 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
12478 if (!AreAllSameInsts || isSplat(VL) ||
12480 S.getMainOp()) &&
12482 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O conditions. \n";
12483 dbgs() << "[";
12484 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
12485 dbgs() << "]\n");
12486 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12487 }
12488
12489 // Don't vectorize ephemeral values.
12490 if (!EphValues.empty()) {
12491 for (Value *V : VL) {
12492 if (EphValues.count(V)) {
12493 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
12494 << ") is ephemeral.\n");
12495 // Do not try to pack to avoid extra instructions here.
12496 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
12497 /*TryToFindDuplicates=*/false);
12498 }
12499 }
12500 }
12501
12502 // We now know that this is a vector of instructions of the same type from
12503 // the same block.
12504
12505 // Check that none of the instructions in the bundle are already in the tree
12506 // and the node may be not profitable for the vectorization as the small
12507 // alternate node.
12508 if (S.isAltShuffle()) {
// Builds two lane masks. Vectorized starts as all-ones and gets the bit
// cleared for lanes that are already vectorized; Extracted gets the bit set
// for lanes that are not vectorized, have more than one user and whose users
// are not all vectorized. Lanes that are not instructions, do not need
// scheduling, or only use extractelement operands are left untouched.
12509 auto GetNumVectorizedExtracted = [&]() {
12510 APInt Extracted = APInt::getZero(VL.size());
12511 APInt Vectorized = APInt::getAllOnes(VL.size());
12512 for (auto [Idx, V] : enumerate(VL)) {
12513 auto *I = dyn_cast<Instruction>(V);
12514 if (!I || doesNotNeedToBeScheduled(I) ||
12515 all_of(I->operands(), [&](const Use &U) {
12516 return isa<ExtractElementInst>(U.get());
12517 }))
12518 continue;
12519 if (isVectorized(I))
12520 Vectorized.clearBit(Idx);
12521 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
12522 Extracted.setBit(Idx);
12523 }
12524 return std::make_pair(Vectorized, Extracted);
12525 };
12526 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
// With only two lanes, any already vectorized lane is enough to prefer
// scalarization; larger bundles go through the cost estimate below.
12528 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
12529 if (!Vectorized.isAllOnes() && !PreferScalarize) {
12530 // Rough cost estimation, if the vector code (+ potential extracts) is
12531 // more profitable than the scalar + buildvector.
12532 Type *ScalarTy = VL.front()->getType();
12533 auto *VecTy = getWidenedType(ScalarTy, VL.size());
12534 InstructionCost VectorizeCostEstimate =
12535 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
12536 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
12537 /*Insert=*/false, /*Extract=*/true, Kind);
12538 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
12539 *TTI, ScalarTy, VecTy, Vectorized,
12540 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
12541 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
12542 }
12543 if (PreferScalarize) {
12544 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
12545 "node is not profitable.\n");
12546 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12547 }
12548 }
12549
12550 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
12551 if (UserIgnoreList && !UserIgnoreList->empty()) {
12552 for (Value *V : VL) {
12553 if (UserIgnoreList->contains(V)) {
12554 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
12555 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12556 }
12557 }
12558 }
12559
12560 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
12561}
12562
/// Recursively builds the vectorization tree for the bundle of scalars
/// \p VLRef.
///
/// The bundle ends up as one of:
///  - a gather entry (constants mixed with PHIs, illegal bundle, failed
///    duplicate check, NeedToGather state, loop-nest mismatch, or a bundle
///    that cannot be scheduled),
///  - a split node (see the TrySplitNode lambda), or
///  - a vectorized entry whose kind is selected by the switch on the main
///    opcode, after which the function recurses into the operands.
///
/// \param VLRef scalars of the candidate bundle; asserted to be either all
///        constants or all of the same type.
/// \param Depth current recursion depth; passed to the legality check and to
///        the recursive calls made for operands.
/// \param UserTreeIdx edge (user tree entry + operand index) the new entry is
///        attached to.
/// \param InterleaveFactor only forwarded to the entry created for loads in
///        the TreeEntry::Vectorize state.
12563void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
12564 const EdgeInfo &UserTreeIdx,
12565 unsigned InterleaveFactor) {
12566 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
12567
12568 SmallVector<int> ReuseShuffleIndices;
// Local mutable copy of the bundle: it is passed (together with
// ReuseShuffleIndices) to tryToFindDuplicates() further down.
12569 SmallVector<Value *> VL(VLRef);
12570
12571 // Tries to build split node.
// Returns true if a SplitVectorize entry (plus its two operand nodes) was
// created for VL, false if canBuildSplitNode() rejected the bundle.
12572 auto TrySplitNode = [&](const InstructionsState &LocalState) {
12573 SmallVector<Value *> Op1, Op2;
12574 OrdersType ReorderIndices;
12575 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
12576 return false;
12577
12578 auto Invalid = ScheduleBundle::invalid();
12579 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
12580 UserTreeIdx, {}, ReorderIndices);
12581 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
// Adds one half of the split node. In both branches the index of the node
// about to be created (VectorizableTree.size()) is recorded together with
// the half's starting offset: 0 for the first half, Op1.size() for the
// second.
12582 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
12583 InstructionsState S = getSameOpcode(Op, *TLI);
12584 if (S && (isa<LoadInst>(S.getMainOp()) ||
12585 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
12586 // Build gather node for loads, they will be gathered later.
12587 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
12588 Idx == 0 ? 0 : Op1.size());
12589 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
12590 } else {
12591 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
12592 Idx == 0 ? 0 : Op1.size());
12593 buildTreeRec(Op, Depth, {TE, Idx});
12594 }
12595 };
12596 AddNode(Op1, 0);
12597 AddNode(Op2, 1);
12598 return true;
12599 };
12600
// True if VL holds at least one non-poison constant and every remaining
// value is poison or a PHI node.
12601 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
12602 bool AreConsts = false;
12603 for (Value *V : VL) {
12604 if (isa<PoisonValue>(V))
12605 continue;
12606 if (isa<Constant>(V)) {
12607 AreConsts = true;
12608 continue;
12609 }
12610 if (!isa<PHINode>(V))
12611 return false;
12612 }
12613 return AreConsts;
12614 };
12615 if (AreOnlyConstsWithPHIs(VL)) {
12616 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
12617 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
12618 return;
12619 }
12620
12621 ScalarsVectorizationLegality Legality =
12622 getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
12623 InstructionsState S = Legality.getInstructionsState();
12624 if (!Legality.isLegal()) {
12625 if (Legality.trySplitVectorize()) {
12626 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
12627 // Last chance to try to vectorize alternate node.
12628 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
12629 return;
12630 }
12631 if (Legality.tryToFindDuplicates())
12632 (void)tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
12633 UserTreeIdx, *this);
12634
12635 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12636 return;
12637 }
12638
12639 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
12640 if (S.isAltShuffle() && TrySplitNode(S))
12641 return;
12642
12643 // Check that every instruction appears once in this bundle.
12644 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
12645 *this, /*BuildGatherOnly=*/false)) {
12646 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12647 return;
12648 }
12649
12650 // Perform specific checks for each particular instruction kind.
12651 bool IsScatterVectorizeUserTE =
12652 UserTreeIdx.UserTE &&
12653 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
12654 OrdersType CurrentOrder;
12655 SmallVector<Value *> PointerOps;
12656 StridedPtrInfo SPtrInfo;
12657 TreeEntry::EntryState State = getScalarsVectorizationState(
12658 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
12659 if (State == TreeEntry::NeedToGather) {
12660 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12661 return;
12662 }
12663
12664 // Check the loop nest. We need to be sure we handle a single loop nest at a
12665 // time to avoid incorrect cost estimation because of the loop aware cost
12666 // model.
12667 if (VectorizableTree.empty()) {
12668 assert(CurrentLoopNest.empty() && "Expected empty loop nest");
12669 // Process the first node? Initial fill of the loop nest.
12670 BasicBlock *Parent = S.getMainOp()->getParent();
12671 if (const Loop *L = LI->getLoopFor(Parent)) {
// NOTE(review): listing line 12672 is absent from this copy (numbering jumps
// 12671 -> 12673). Presumably it reassigns L, which would explain the
// otherwise redundant null re-check below - restore it from the original
// source.
12673 if (L)
12674 CurrentLoopNest.assign(getLoopNest(L));
12675 }
12676 } else if (!UserTreeIdx ||
12677 UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize ||
12678 UserTreeIdx.UserTE->isGather() ||
12679 UserTreeIdx.UserTE->getMainOp()->getParent() !=
12680 S.getMainOp()->getParent()) {
12681 BasicBlock *Parent = S.getMainOp()->getParent();
12682 if (const Loop *L = LI->getLoopFor(Parent)) {
12683 // Check that the new loop nest shares the same outer structure as the
12684 // tree's current loop nest. Completely disjoint nests (different
12685 // outermost loops) are forced to gather because their scales cannot be
12686 // meaningfully combined. Sibling inner loops (inside a common outer
12687 // loop or outside any loops at all) are allowed: the cost model scales
12688 // each entry by its own loop via getScaleToLoopIterations(), so a tree
12689 // that spans sibling inner loops (e.g. a PHI at their merge block) can
12690 // still be costed correctly. Contract CurrentLoopNest to the longest
12691 // common prefix with the new entry's nest so subsequent entries in yet
12692 // another sibling can also be admitted.
// NOTE(review): listing line 12693 is absent from this copy (numbering jumps
// 12692 -> 12694); presumably the same reassignment of L as at the gap
// above - restore it from the original source.
12694 if (L) {
12695 SmallVector<const Loop *> NewLoopNest(getLoopNest(L));
// CommonLen = length of the common leading part of both loop nests.
12696 unsigned CommonLen = 0;
12697 for (const auto [L1, L2] : zip(CurrentLoopNest, NewLoopNest)) {
12698 if (L1 != L2)
12699 break;
12700 ++CommonLen;
12701 }
// Checks, for every depth in [StartDepth, min(new nest size, recorded
// constraints)), that a recorded (non-null) backedge-taken-count constraint
// is matched exactly by the new nest's loop at that depth.
12702 auto ValidateMergedBTCs = [&](unsigned StartDepth) -> bool {
12703 unsigned EndDepth =
12704 std::min<unsigned>(NewLoopNest.size(), MergedLoopBTCs.size());
12705 for (unsigned D = StartDepth; D < EndDepth; ++D) {
12706 const SCEV *Constraint = MergedLoopBTCs[D];
12707 if (!Constraint)
12708 continue;
12709 const SCEV *NewBTC = SE->getBackedgeTakenCount(NewLoopNest[D]);
12710 if (isa<SCEVCouldNotCompute>(NewBTC) || NewBTC != Constraint)
12711 return false;
12712 }
12713 return true;
12714 };
12715 auto BailOutToGather = [&]() {
// NOTE(review): listing line 12716 is absent from this copy; the line below
// starts mid-statement (presumably the opening of an LLVM_DEBUG(dbgs() ...)
// call was lost) - restore it from the original source.
12717 << "SLP: Sibling loops have different trip counts.\n");
12718 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12719 };
12720 if (CurrentLoopNest.empty()) {
12721 if (!ValidateMergedBTCs(0)) {
12722 BailOutToGather();
12723 return;
12724 }
12725 CurrentLoopNest.assign(NewLoopNest);
12726 } else if (CommonLen < CurrentLoopNest.size() &&
12727 CommonLen < NewLoopNest.size()) {
12728 // Divergence below the common prefix: the tree now spans sibling
12729 // loops at depth CommonLen. Admitting them into one tree makes
12730 // the profitability decision JOINT across both siblings, so a
12731 // very hot sibling could otherwise let an unprofitable cold
12732 // sibling ride along "for free" (per-entry scaling of the cold
12733 // sibling's entries would be dwarfed by the hot one). Require
12734 // SCEV-proven equal backedge-taken counts for the diverging
12735 // siblings before joining; otherwise force gather.
12736 const Loop *SibA = CurrentLoopNest[CommonLen];
12737 const Loop *SibB = NewLoopNest[CommonLen];
12738 const SCEV *BecA = SE->getBackedgeTakenCount(SibA);
12739 const SCEV *BecB = SE->getBackedgeTakenCount(SibB);
12740 if (isa<SCEVCouldNotCompute>(BecA) || BecA != BecB) {
12741 BailOutToGather();
12742 return;
12743 }
12744 if (!ValidateMergedBTCs(CommonLen + 1)) {
12745 BailOutToGather();
12746 return;
12747 }
// Record the agreed count as the constraint for depth CommonLen and shrink
// the tracked nest to the common prefix.
12748 if (MergedLoopBTCs.size() <= CommonLen)
12749 MergedLoopBTCs.resize(CommonLen + 1, nullptr);
12750 MergedLoopBTCs[CommonLen] = BecA;
12751 CurrentLoopNest.truncate(CommonLen);
12752 } else if (NewLoopNest.size() > CurrentLoopNest.size()) {
// The new nest is longer than the current one: append its extra loops.
12753 if (!ValidateMergedBTCs(CurrentLoopNest.size())) {
12754 BailOutToGather();
12755 return;
12756 }
12757 CurrentLoopNest.append(
12758 std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
12759 NewLoopNest.end());
12760 }
12761 // Otherwise NewLoopNest is a prefix of CurrentLoopNest: keep as-is.
12762 }
12763 }
12764 }
12765
12766 Instruction *VL0 = S.getMainOp();
12767 BasicBlock *BB = VL0->getParent();
// Create the per-block scheduling data on first use.
12768 auto &BSRef = BlocksSchedules[BB];
12769 if (!BSRef)
12770 BSRef = std::make_unique<BlockScheduling>(BB);
12771
12772 BlockScheduling &BS = *BSRef;
12773
12774 SetVector<Value *> UniqueValues(llvm::from_range, VL);
12775 std::optional<ScheduleBundle *> BundlePtr =
12776 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
12777#ifdef EXPENSIVE_CHECKS
12778 // Make sure we didn't break any internal invariants
12779 BS.verify();
12780#endif
12781 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
12782 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
12783 // Last chance to try to vectorize alternate node.
12784 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
12785 return;
12786 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12787 NonScheduledFirst.insert(VL.front());
12788 if (S.getOpcode() == Instruction::Load &&
12789 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
// NOTE(review): listing line 12790 is absent from this copy; it held the
// statement guarded by the `if` above. As written here the `return` below
// would become that guarded statement - restore the line from the original
// source.
12791 return;
12792 }
12793 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
12794 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
// A null bundle pointer inside the optional is replaced by a local empty
// bundle so that a reference can be passed to newTreeEntry().
12795 ScheduleBundle Empty;
12796 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
12797 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
12798
// Alternate-opcode bundles are dispatched through the ShuffleVector case.
12799 unsigned ShuffleOrOp =
12800 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
// Builds the operand nodes of TE; operands whose common (non-alternate)
// opcode is PHI are deferred until all other operands have been built.
12801 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
12802 // Postpone PHI nodes creation
12803 SmallVector<unsigned> PHIOps;
12804 for (unsigned I : seq<unsigned>(Operands.size())) {
12805 ArrayRef<Value *> Op = Operands[I];
12806 if (Op.empty())
12807 continue;
12808 InstructionsState S = getSameOpcode(Op, *TLI);
12809 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
12810 buildTreeRec(Op, Depth + 1, {TE, I});
12811 else
12812 PHIOps.push_back(I);
12813 }
12814 for (unsigned I : PHIOps)
12815 buildTreeRec(Operands[I], Depth + 1, {TE, I});
12816 };
12817 switch (ShuffleOrOp) {
12818 case Instruction::PHI: {
12819 TreeEntry *TE =
12820 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
12821 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
12822 TE->dump());
12823
12824 TE->setOperands(Operands);
12825 CreateOperandNodes(TE, Operands);
12826 return;
12827 }
12828 case Instruction::ExtractValue:
12829 case Instruction::ExtractElement: {
12830 if (CurrentOrder.empty()) {
12831 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
12832 } else {
12833 LLVM_DEBUG({
12834 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
12835 "with order";
12836 for (unsigned Idx : CurrentOrder)
12837 dbgs() << " " << Idx;
12838 dbgs() << "\n";
12839 });
12840 fixupOrderingIndices(CurrentOrder);
12841 }
12842 // Insert new order with initial value 0, if it does not exist,
12843 // otherwise return the iterator to the existing one.
12844 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12845 ReuseShuffleIndices, CurrentOrder);
12846 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
12847 "(ExtractValueInst/ExtractElementInst).\n";
12848 TE->dump());
12849 // This is a special case, as it does not gather, but at the same time
12850 // we are not extending buildTreeRec() towards the operands.
12851 TE->setOperands(Operands);
12852 return;
12853 }
12854 case Instruction::InsertElement: {
12855 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
12856
// The comparator makes the queue pop the smallest insert index first; each
// queue item is (insert index, position in VL).
12857 auto OrdCompare = [](const std::pair<int, int> &P1,
12858 const std::pair<int, int> &P2) {
12859 return P1.first > P2.first;
12860 };
12861 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
12862 decltype(OrdCompare)>
12863 Indices(OrdCompare);
12864 for (int I = 0, E = VL.size(); I < E; ++I) {
12865 unsigned Idx = *getElementIndex(VL[I]);
12866 Indices.emplace(Idx, I);
12867 }
// This local CurrentOrder shadows the function-level one. It maps each
// position in VL to its rank by insert index and is cleared when that
// mapping is the identity.
12868 OrdersType CurrentOrder(VL.size(), VL.size());
12869 bool IsIdentity = true;
12870 for (int I = 0, E = VL.size(); I < E; ++I) {
12871 CurrentOrder[Indices.top().second] = I;
12872 IsIdentity &= Indices.top().second == I;
12873 Indices.pop();
12874 }
12875 if (IsIdentity)
12876 CurrentOrder.clear();
12877 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12878 {}, CurrentOrder);
12879 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
12880 TE->dump());
12881
12882 TE->setOperands(Operands);
// Only operand 1 is recursed into.
12883 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
12884 return;
12885 }
12886 case Instruction::Load: {
12887 // Check that a vectorized load would load the same memory as a scalar
12888 // load. For example, we don't want to vectorize loads that are smaller
12889 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
12890 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
12891 // from such a struct, we read/write packed bits disagreeing with the
12892 // unvectorized version.
12893 TreeEntry *TE = nullptr;
12894 fixupOrderingIndices(CurrentOrder);
12895 switch (State) {
12896 case TreeEntry::Vectorize:
12897 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12898 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
12899 if (CurrentOrder.empty())
12900 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
12901 TE->dump());
12902 else
// NOTE(review): listing line 12903 is absent from this copy; the line below
// starts mid-statement (presumably the opening of an LLVM_DEBUG(dbgs() ...)
// call was lost) - restore it from the original source.
12904 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
12905 TE->dump());
12906 break;
12907 case TreeEntry::CompressVectorize:
12908 // Vectorizing non-consecutive loads with (masked)load + compress.
12909 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
12910 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12911 LLVM_DEBUG(
12912 dbgs()
12913 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
12914 TE->dump());
12915 break;
12916 case TreeEntry::StridedVectorize:
12917 // Vectorizing non-consecutive loads as a strided load; the stride info
// computed by getScalarsVectorizationState() is recorded for the new entry.
12918 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
12919 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12920 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
12921 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
12922 TE->dump());
12923 break;
12924 case TreeEntry::ScatterVectorize:
12925 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
12926 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
12927 UserTreeIdx, ReuseShuffleIndices);
12928 LLVM_DEBUG(
12929 dbgs()
12930 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
12931 TE->dump());
12932 break;
12933 case TreeEntry::CombinedVectorize:
12934 case TreeEntry::SplitVectorize:
12935 case TreeEntry::NeedToGather:
12936 llvm_unreachable("Unexpected loads state.");
12937 }
// Reordered (non-scatter) loads: permute the single (pointer) operand with
// the inverse of CurrentOrder before storing it in the entry.
12938 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
12939 assert(Operands.size() == 1 && "Expected a single operand only");
12940 SmallVector<int> Mask;
12941 inversePermutation(CurrentOrder, Mask);
12942 reorderScalars(Operands.front(), Mask);
12943 }
12944 TE->setOperands(Operands);
// Only scatter loads recurse, and they recurse into the pointer operands.
12945 if (State == TreeEntry::ScatterVectorize)
12946 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
12947 return;
12948 }
12949 case Instruction::ZExt:
12950 case Instruction::SExt:
12951 case Instruction::FPToUI:
12952 case Instruction::FPToSI:
12953 case Instruction::FPExt:
12954 case Instruction::PtrToInt:
12955 case Instruction::IntToPtr:
12956 case Instruction::SIToFP:
12957 case Instruction::UIToFP:
12958 case Instruction::Trunc:
12959 case Instruction::FPTrunc:
12960 case Instruction::BitCast: {
// Track the widest and narrowest bit widths seen on zext/sext/trunc casts
// (running max of the wide side, running min of the narrow side).
12961 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
12962 std::make_pair(std::numeric_limits<unsigned>::min(),
12963 std::numeric_limits<unsigned>::max()));
12964 if (ShuffleOrOp == Instruction::ZExt ||
12965 ShuffleOrOp == Instruction::SExt) {
12966 CastMaxMinBWSizes = std::make_pair(
12967 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
12968 PrevMaxBW),
12969 std::min<unsigned>(
12970 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
12971 PrevMinBW));
12972 } else if (ShuffleOrOp == Instruction::Trunc) {
12973 CastMaxMinBWSizes = std::make_pair(
12974 std::max<unsigned>(
12975 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
12976 PrevMaxBW),
12977 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
12978 PrevMinBW));
12979 }
12980 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12981 ReuseShuffleIndices);
12982 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
12983 TE->dump());
12984
12985 TE->setOperands(Operands);
// NOTE(review): cast operands are built with the same Depth (not Depth + 1),
// unlike most other cases in this switch - confirm this is intentional.
12986 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
12987 buildTreeRec(TE->getOperand(I), Depth, {TE, I});
12988 if (ShuffleOrOp == Instruction::Trunc) {
12989 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12990 } else if (ShuffleOrOp == Instruction::SIToFP ||
12991 ShuffleOrOp == Instruction::UIToFP) {
// For int->fp casts the operand node is recorded in ExtraBitWidthNodes when
// at least half of the operand's bits are known sign bits (or, for
// instruction operands, leading zeros of the demanded-bits mask).
12992 unsigned NumSignBits =
12993 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
12994 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
12995 APInt Mask = DB->getDemandedBits(OpI);
12996 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
12997 }
12998 if (NumSignBits * 2 >=
12999 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
13000 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
13001 }
13002 return;
13003 }
13004 case Instruction::ICmp:
13005 case Instruction::FCmp: {
13006 // Check that all of the compares have the same predicate.
13007 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
13008 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13009 ReuseShuffleIndices);
13010 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
13011 TE->dump());
13012
13013 VLOperands Ops(VL, Operands, S, *this);
13014 if (cast<CmpInst>(VL0)->isCommutative()) {
13015 // Commutative predicate - collect + sort operands of the instructions
13016 // so that each side is more likely to have the same opcode.
// NOTE(review): listing line 13017 is absent from this copy; the string
// below is the tail of a statement whose beginning (presumably an assert on
// the predicate) was lost - restore it from the original source.
13018 "Commutative Predicate mismatch");
13019 Ops.reorder();
13020 Operands.front() = Ops.getVL(0);
13021 Operands.back() = Ops.getVL(1);
13022 } else {
13023 // Collect operands - commute if it uses the swapped predicate.
13024 for (auto [Idx, V] : enumerate(VL)) {
13025 if (isa<PoisonValue>(V))
13026 continue;
13027 auto *Cmp = cast<CmpInst>(V);
13028 if (Cmp->getPredicate() != P0)
13029 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
13030 }
13031 }
13032 TE->setOperands(Operands);
// NOTE(review): as for casts, compare operands are built with the same
// Depth (not Depth + 1) - confirm this is intentional.
13033 buildTreeRec(Operands.front(), Depth, {TE, 0});
13034 buildTreeRec(Operands.back(), Depth, {TE, 1});
13035 if (ShuffleOrOp == Instruction::ICmp) {
// Record operand nodes for which at least half of the bits are sign bits.
13036 unsigned NumSignBits0 =
13037 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
13038 if (NumSignBits0 * 2 >=
13039 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
13040 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
13041 unsigned NumSignBits1 =
13042 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
13043 if (NumSignBits1 * 2 >=
13044 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
13045 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
13046 }
13047 return;
13048 }
13049 case Instruction::Select:
13050 case Instruction::FNeg:
13051 case Instruction::Add:
13052 case Instruction::FAdd:
13053 case Instruction::Sub:
13054 case Instruction::FSub:
13055 case Instruction::Mul:
13056 case Instruction::FMul:
13057 case Instruction::UDiv:
13058 case Instruction::SDiv:
13059 case Instruction::FDiv:
13060 case Instruction::URem:
13061 case Instruction::SRem:
13062 case Instruction::FRem:
13063 case Instruction::Shl:
13064 case Instruction::LShr:
13065 case Instruction::AShr:
13066 case Instruction::And:
13067 case Instruction::Or:
13068 case Instruction::Xor:
13069 case Instruction::Freeze: {
13070 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13071 ReuseShuffleIndices);
13072 LLVM_DEBUG(
13073 dbgs() << "SLP: added a new TreeEntry "
13074 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
13075 TE->dump());
13076
// Commutative binary operators get their two operand lists reordered via
// VLOperands before they are stored in the entry.
13077 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
13078 VLOperands Ops(VL, Operands, S, *this);
13079 Ops.reorder();
13080 Operands[0] = Ops.getVL(0);
13081 Operands[1] = Ops.getVL(1);
13082 }
13083 TE->setOperands(Operands);
13084 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
13085 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
13086 return;
13087 }
13088 case Instruction::GetElementPtr: {
13089 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13090 ReuseShuffleIndices);
13091 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
13092 TE->dump());
13093 TE->setOperands(Operands);
13094
13095 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
13096 buildTreeRec(Operands[I], Depth + 1, {TE, I});
13097 return;
13098 }
13099 case Instruction::Store: {
13100 assert(CurrentOrder.empty() &&
13101 "Expected ordered store during tree building");
13102 if (State == TreeEntry::StridedVectorize) {
13103 TreeEntry *TE =
13104 newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
13105 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
13106 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
13107 LLVM_DEBUG(
13108 dbgs() << "SLP: added a new TreeEntry (strided StoreInst).\n";
13109 TE->dump());
13110 TE->setOperands(Operands);
13111 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
13112 return;
13113 }
13114 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13115 ReuseShuffleIndices, CurrentOrder);
13116 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
13117 TE->dump());
13118 TE->setOperands(Operands);
// Only operand 0 of the stores is recursed into.
13119 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
13120 return;
13121 }
13122 case Instruction::Call: {
13123 // Check if the calls are all to the same vectorizable intrinsic or
13124 // library function.
13125 CallInst *CI = cast<CallInst>(VL0);
// NOTE(review): listing line 13126 is absent from this copy; it presumably
// defined a value used by the (also missing) check at the gap inside the
// loop below - restore it from the original source.
13127
13128 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13129 ReuseShuffleIndices);
13130 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
13131 TE->dump());
13132 if (isCommutative(VL0)) {
13133 VLOperands Ops(VL, Operands, S, *this);
13134 Ops.reorder();
13135 Operands[0] = Ops.getVL(0);
13136 Operands[1] = Ops.getVL(1);
13137 }
13138 TE->setOperands(Operands);
13139 for (unsigned I : seq<unsigned>(CI->arg_size())) {
13140 // For scalar operands no need to create an entry since no need to
13141 // vectorize it.
// NOTE(review): listing line 13142 is absent from this copy; it held the
// condition guarding the `continue` below (per the comment above, a check
// for scalar operands). As written here the `continue` is unconditional -
// restore the line from the original source.
13143 continue;
13144 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
13145 }
13146 return;
13147 }
13148 case Instruction::ShuffleVector: {
13149 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13150 ReuseShuffleIndices);
13151 if (S.isAltShuffle()) {
13152 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
13153 TE->dump());
13154 } else {
13155 assert(SLPReVec && "Only supported by REVEC.");
13156 LLVM_DEBUG(
13157 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
13158 TE->dump());
13159 }
13160
13161 // Reorder operands if reordering would enable vectorization.
13162 auto *CI = dyn_cast<CmpInst>(VL0);
13163 if (CI && any_of(VL, [](Value *V) {
13164 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
13165 })) {
13166 auto *MainCI = cast<CmpInst>(S.getMainOp());
13167 auto *AltCI = cast<CmpInst>(S.getAltOp());
13168 CmpInst::Predicate MainP = MainCI->getPredicate();
13169 CmpInst::Predicate AltP = AltCI->getPredicate();
13170 assert(MainP != AltP &&
13171 "Expected different main/alternate predicates.");
13172 // Collect operands - commute if it uses the swapped predicate or
13173 // alternate operation.
13174 for (auto [Idx, V] : enumerate(VL)) {
13175 if (isa<PoisonValue>(V))
13176 continue;
13177 auto *Cmp = cast<CmpInst>(V);
13178
13179 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
13180 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
13181 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
13182 } else {
13183 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
13184 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
13185 }
13186 }
13187 TE->setOperands(Operands);
13188 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
13189 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
13190 return;
13191 }
13192
13193 if (isa<BinaryOperator>(VL0) || CI) {
13194 VLOperands Ops(VL, Operands, S, *this);
13195 Ops.reorder();
13196 Operands[0] = Ops.getVL(0);
13197 Operands[1] = Ops.getVL(1);
13198 }
13199 TE->setOperands(Operands);
13200 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
13201 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
13202 return;
13203 }
13204 default:
13205 break;
13206 }
// Every handled opcode returns from inside the switch; reaching this point
// means an opcode passed the earlier checks without having a case above.
13207 llvm_unreachable("Unexpected vectorization of the instructions.");
13208}
13209
/// Computes how many scalar elements the (possibly nested) aggregate or
/// vector type \p T flattens to.
///
/// \returns the flattened element count N, or 0 when \p T cannot be mapped:
/// an empty type is encountered, a struct is not homogeneous, the innermost
/// element type is rejected by isValidElementType(), or the store size of the
/// widened vector type is outside [MinVecRegSize, MaxVecRegSize] or differs
/// from the store size of \p T.
13210unsigned BoUpSLP::canMapToVector(Type *T) const {
13211 unsigned N = 1;
13212 Type *EltTy = T;
13213
// NOTE(review): listing line 13214 is absent from this copy (numbering jumps
// 13213 -> 13215). It is presumably the header of the loop that peels one
// struct/array/vector level per iteration and is closed by the `}` at
// listing line 13232 - restore it from the original source.
13215 if (EltTy->isEmptyTy())
13216 return 0;
13217 if (auto *ST = dyn_cast<StructType>(EltTy)) {
13218 // Check that struct is homogeneous.
13219 for (const auto *Ty : ST->elements())
13220 if (Ty != *ST->element_begin())
13221 return 0;
13222 N *= ST->getNumElements();
13223 EltTy = *ST->element_begin();
13224 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
13225 N *= AT->getNumElements();
13226 EltTy = AT->getElementType();
13227 } else {
// Neither struct nor array: cast<> asserts that this is a fixed vector.
13228 auto *VT = cast<FixedVectorType>(EltTy);
13229 N *= VT->getNumElements();
13230 EltTy = VT->getElementType();
13231 }
13232 }
13233
13234 if (!isValidElementType(EltTy))
13235 return 0;
// The flattened vector must fit the vector register size limits and occupy
// exactly as many stored bits as the original type.
13236 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
13237 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
13238 VTSize != DL->getTypeStoreSizeInBits(T))
13239 return 0;
13240 return N;
13241}
13242
/// Checks whether the extract instructions in \p VL all read from the same
/// source vector/aggregate so that the source can be reused directly.
///
/// \param VL candidate scalars; non-instruction values are skipped.
/// \param CurrentOrder output. Cleared on entry. Left empty when the extracts
///        are already in identity order, when a check fails before the order
///        is computed, or when an extract index is used twice. Otherwise
///        CurrentOrder[ExtIdx] holds the position in \p VL of the extract
///        with (MinIdx-adjusted) index ExtIdx, and unused slots keep the
///        sentinel value VL.size().
/// \param ResizeAllowed when false, the source must have exactly VL.size()
///        elements.
/// \returns true only when the extracts are in identity order; false on any
/// failed check and also when a non-identity order was stored in
/// \p CurrentOrder.
13243bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
13244 SmallVectorImpl<unsigned> &CurrentOrder,
13245 bool ResizeAllowed) const {
// NOTE(review): listing line 13246 is absent from this copy; it defined the
// iterator `It` used below (presumably a search in VL for the first extract
// instruction) - restore it from the original source.
13247 assert(It != VL.end() && "Expected at least one extract instruction.");
13248 auto *E0 = cast<Instruction>(*It);
13249 assert(
// NOTE(review): listing line 13250 is absent from this copy; it held the
// condition of this assert (presumably an opcode check on the extracts) -
// restore it from the original source.
13251 "Invalid opcode");
13252 // Check if all of the extracts come from the same vector and from the
13253 // correct offset.
13254 Value *Vec = E0->getOperand(0);
13255
13256 CurrentOrder.clear();
13257
13258 // We have to extract from a vector/aggregate with the same number of elements.
13259 unsigned NElts;
13260 if (E0->getOpcode() == Instruction::ExtractValue) {
13261 NElts = canMapToVector(Vec->getType());
13262 if (!NElts)
13263 return false;
13264 // Check if load can be rewritten as load of vector.
13265 LoadInst *LI = dyn_cast<LoadInst>(Vec);
13266 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
13267 return false;
13268 } else {
13269 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
13270 }
13271
13272 unsigned E = VL.size();
13273 if (!ResizeAllowed && NElts != E)
13274 return false;
// First pass: collect the extract index of every lane (poison-marked when
// the lane is skipped) and track the smallest/largest index seen.
13275 SmallVector<int> Indices(E, PoisonMaskElem);
13276 unsigned MinIdx = NElts, MaxIdx = 0;
13277 for (auto [I, V] : enumerate(VL)) {
13278 auto *Inst = dyn_cast<Instruction>(V);
13279 if (!Inst)
13280 continue;
13281 if (Inst->getOperand(0) != Vec)
13282 return false;
13283 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
13284 if (isa<UndefValue>(EE->getIndexOperand()))
13285 continue;
13286 std::optional<unsigned> Idx = getExtractIndex(Inst);
13287 if (!Idx)
13288 return false;
13289 const unsigned ExtIdx = *Idx;
// Out-of-range indices are ignored; the lane stays poison-marked.
13290 if (ExtIdx >= NElts)
13291 continue;
13292 Indices[I] = ExtIdx;
13293 if (MinIdx > ExtIdx)
13294 MinIdx = ExtIdx;
13295 if (MaxIdx < ExtIdx)
13296 MaxIdx = ExtIdx;
13297 }
// The used indices must span at most E consecutive positions. If they all
// fit into the first E positions, no offset is applied.
13298 if (MaxIdx - MinIdx + 1 > E)
13299 return false;
13300 if (MaxIdx + 1 <= E)
13301 MinIdx = 0;
13302
13303 // Check that all of the indices extract from the correct offset.
13304 bool ShouldKeepOrder = true;
13305 // Assign to all items the initial value E so we can check if the extract
13306 // instruction index was used already.
13307 // Also, later we can check that all the indices are used and we have a
13308 // consecutive access in the extract instructions, by checking that no
13309 // element of CurrentOrder still has value E.
13310 CurrentOrder.assign(E, E);
13311 for (unsigned I = 0; I < E; ++I) {
13312 if (Indices[I] == PoisonMaskElem)
13313 continue;
13314 const unsigned ExtIdx = Indices[I] - MinIdx;
// A slot that no longer holds the sentinel means the same index is
// extracted twice: give up.
13315 if (CurrentOrder[ExtIdx] != E) {
13316 CurrentOrder.clear();
13317 return false;
13318 }
13319 ShouldKeepOrder &= ExtIdx == I;
13320 CurrentOrder[ExtIdx] = I;
13321 }
13322 if (ShouldKeepOrder)
13323 CurrentOrder.clear();
13324
13325 return ShouldKeepOrder;
13326}
13327
13328bool BoUpSLP::areAllUsersVectorized(
13329 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
13330 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
13331 all_of(I->users(), [this](User *U) {
13332 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
13333 (isa<ExtractElementInst>(U) && MustGather.contains(U));
13334 });
13335}
13336
// NOTE(review): listing line 13337 is absent from this copy; it held the
// beginning of this forward declaration (return type, name and first
// parameter). Presumably this declares canConvertToFMA(), which
// getNumScalarInsts() below calls with a matching argument list - restore
// the line from the original source.
13338 const InstructionsState &S,
13339 DominatorTree &DT, const DataLayout &DL,
13340 TargetTransformInfo &TTI,
13341 const TargetLibraryInfo &TLI);
13342
/// Estimates the number of scalar instructions represented by the current
/// vectorizable tree.
///
/// Deleted nodes and CombinedVectorize entries are skipped. Gather nodes
/// (including nodes transformed to gathers) contribute only the scalars
/// selected by the check inside the gather loop. For every other entry each
/// non-copyable instruction counts once, integer/FP div/rem count twice, and
/// calls may count an extra time. Finally, for entries that are not combined,
/// one is subtracted per scalar that is expected to fuse with its operand
/// (min/max selects, FMA candidates).
///
/// \returns the resulting instruction count.
13343unsigned BoUpSLP::getNumScalarInsts() const {
13344 unsigned Count = 0;
13345 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
13346 const TreeEntry &TE = *Ptr;
13347 if (DeletedNodes.contains(&TE))
13348 continue;
13349 if (TE.isGather() || TransformedToGatherNodes.contains(&TE)) {
13350 // Count extractelement scalars in gathers - they exist in the scalar
13351 // code regardless of vectorization. ExtractElement instructions
13352 // become free when the vector input is used directly.
13353 for (Value *V : TE.Scalars)
// NOTE(review): listing line 13354 is absent from this copy; it held the
// condition guarding the increment below (per the comment above, presumably
// a check that V is an extractelement). As written here every scalar would
// be counted - restore the line from the original source.
13355 ++Count;
13356 continue;
13357 }
13358 // CombinedVectorize entries (e.g. the fmul child of an FMulAdd, or the
13359 // cmp child of a MinMax select) are absorbed into the parent on both
13360 // scalar and vector sides. The backend fuses fadd+fmul → fma and
13361 // select+cmp → smin/smax even for scalar code, so skip to avoid
13362 // double-counting.
13363 if (TE.State == TreeEntry::CombinedVectorize)
13364 continue;
13365 // Each vectorize entry represents a bundle of scalar instructions.
13366 // Count per-entry without cross-entry deduplication, since shared
13367 // scalars across entries still represent separate work in scalar code.
13368 for (Value *V : TE.Scalars) {
13369 if (!isa<Instruction>(V) ||
13370 (TE.hasCopyableElements() && TE.isCopyableElement(V)))
13371 continue;
13372 ++Count;
13373 // Calculate calls/divs/rems twice, they may cost higher, so better to
13374 // include their count twice to mimic slightly real cost here.
13375 auto *I = dyn_cast<Instruction>(V);
13376 if (I && (I->isIntDivRem() || I->isFPDivRem()))
13377 ++Count;
13378 if (auto *CI = dyn_cast<CallInst>(V)) {
// NOTE(review): listing line 13379 is absent from this copy; it defined
// `BaseID` used below (presumably the intrinsic ID of the call) - restore it
// from the original source.
13380 if (!isTriviallyVectorizable(BaseID))
13381 ++Count;
13382 }
13383 }
13384 // Even when the whole node is not combined, individual scalar
13385 // instructions may be fused by the backend. Each fused pair (e.g.
13386 // fadd+fmul → fma, select+cmp → smin/smax) becomes a single scalar
13387 // instruction, absorbing the operand instruction. Subtract 1 for each
13388 // such match to avoid over-counting the scalar side.
13389 if (TE.CombinedOp == TreeEntry::NotCombinedOp && TE.hasState()) {
13390 unsigned Opcode = TE.getOpcode();
13391 if (Opcode == Instruction::Select) {
13392 for (Value *V : TE.Scalars) {
13393 if (TE.hasCopyableElements() && TE.isCopyableElement(V))
13394 continue;
13395 auto *SI = dyn_cast<SelectInst>(V);
13396 if (!SI)
13397 continue;
13398 auto [ID, _] = canConvertToMinOrMaxIntrinsic({V});
// NOTE(review): listing line 13399 is absent from this copy; it opened the
// block closed by the `}` at listing line 13402 (presumably a check that ID
// names a min/max intrinsic) - restore it from the original source.
13400 assert(Count > 0 && "Underflow in scalar inst count (minmax)");
13401 --Count;
13402 }
13403 }
13404 } else if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
13405 for (Value *V : TE.Scalars) {
13406 if (TE.hasCopyableElements() && TE.isCopyableElement(V))
13407 continue;
13408 auto *I = dyn_cast<Instruction>(V);
// In alternate-opcode nodes only the fadd/fsub lanes are FMA candidates.
13409 if (!I || (TE.isAltShuffle() && I->getOpcode() != Instruction::FAdd &&
13410 I->getOpcode() != Instruction::FSub))
13411 continue;
13412 if (canConvertToFMA(I, InstructionsState(I, I), *DT, *DL, *TTI, *TLI)
13413 .isValid()) {
13414 assert(Count > 0 && "Underflow in scalar inst count (fma)");
13415 --Count;
13416 }
13417 }
13418 }
13419 }
13420 }
13421 return Count;
13422}
13423
/// Estimates the number of vector instructions the current vectorizable tree
/// would produce.
///
/// Deleted nodes and CombinedVectorize entries are skipped. Gather nodes are
/// either free (an equal-VF tree entry with the same values exists), cost one
/// instruction (an equal-VF entry with the reversed values exists),
/// contribute their extractelement source vectors to a set counted once at
/// the end, or cost one instruction per non-constant scalar. Vectorized
/// entries cost one instruction (two for split nodes), plus one if they carry
/// reorder or reuse indices; InsertElement/ExtractElement entries are free.
/// Finally one instruction is added per distinct externally used scalar that
/// passes the filters in the last loop.
///
/// \returns the resulting instruction count.
13424unsigned BoUpSLP::getNumVectorInsts() const {
13425 unsigned Count = 0;
13426 SmallPtrSet<Value *, 4> GatherExtractSourceVecs;
13427 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
13428 const TreeEntry &TE = *Ptr;
13429 if (DeletedNodes.contains(&TE))
13430 continue;
13431 if (TE.State == TreeEntry::CombinedVectorize)
13432 continue;
13433 bool IsGatherOrTransformed =
13434 TE.isGather() || TransformedToGatherNodes.contains(&TE);
13435 if (IsGatherOrTransformed) {
13436 if (TE.hasState()) {
// A different entry with the same scalars and vector factor exists: the
// gather adds no instruction.
13437 if (const TreeEntry *E =
13438 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
13439 E && E != &TE && E->getVectorFactor() == TE.getVectorFactor())
13440 continue;
// An entry with the scalars in reversed order exists: count one instruction.
13441 SmallVector<Value *> RevScalars(TE.Scalars.rbegin(), TE.Scalars.rend());
13442 if (const TreeEntry *E =
13443 getSameValuesTreeEntry(TE.getMainOp(), RevScalars);
13444 E && E->getVectorFactor() == TE.getVectorFactor()) {
13445 ++Count;
13446 continue;
13447 }
13448 }
13449 // ExtractElement gathers from the same source vector become a single
13450 // shufflevector. Collect source vectors globally across all gather
13451 // entries and count once at the end.
13452 if (all_of(TE.Scalars,
// NOTE(review): listing line 13453 is absent from this copy; it held the
// predicate of the all_of() call above and the opening `{` of the branch
// closed at listing line 13457 - restore it from the original source.
13454 for (Value *V : TE.Scalars)
13455 if (auto *EE = dyn_cast<ExtractElementInst>(V))
13456 GatherExtractSourceVecs.insert(EE->getVectorOperand());
13457 } else {
13458 for (Value *V : TE.Scalars) {
13459 if (!isConstant(V))
13460 ++Count;
13461 }
13462 }
13463 continue;
13464 }
13465 // InsertElement/ExtractElement vectorize entries don't produce real
13466 // vector instructions - InsertElement at root IS the result, and
13467 // ExtractElement entries reference the input vector directly.
13468 if (TE.getOpcode() == Instruction::InsertElement ||
13469 TE.getOpcode() == Instruction::ExtractElement)
13470 continue;
13471 if (TE.State == TreeEntry::SplitVectorize)
13472 Count += 2;
13473 else
13474 ++Count;
// One extra instruction for entries that carry reorder or reuse indices.
13475 if (!TE.ReorderIndices.empty() || !TE.ReuseShuffleIndices.empty())
13476 ++Count;
13477 }
13478 Count += GatherExtractSourceVecs.size();
13479 // Count extract instructions from ExternalUses, skipping insertelements
13480 // (those get folded into shuffles, not real extracts).
13481 SmallPtrSet<Value *, 8> CountedExtracts;
13482 for (const ExternalUser &EU : ExternalUses) {
// NOTE(review): listing line 13483 is absent from this copy; it held the
// condition guarding the `continue` below (per the comment above, presumably
// the insertelement-user check). As written here the `continue` is
// unconditional - restore the line from the original source.
13484 continue;
13485 if (EU.User && EphValues.count(EU.User))
13486 continue;
13487 if (ExternalUsesAsOriginalScalar.contains(EU.Scalar))
13488 continue;
// Each externally used scalar is counted at most once.
13489 if (!CountedExtracts.insert(EU.Scalar).second)
13490 continue;
13491 ++Count;
13492 }
13493 return Count;
13494}
13495
13496void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
13497 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
13498 SmallVectorImpl<Value *> *OpScalars,
13499 SmallVectorImpl<Value *> *AltScalars) const {
13500 unsigned Sz = Scalars.size();
13501 Mask.assign(Sz, PoisonMaskElem);
13502 SmallVector<int> OrderMask;
13503 if (!ReorderIndices.empty())
13504 inversePermutation(ReorderIndices, OrderMask);
13505 for (unsigned I = 0; I < Sz; ++I) {
13506 unsigned Idx = I;
13507 if (!ReorderIndices.empty())
13508 Idx = OrderMask[I];
13509 if (isa<PoisonValue>(Scalars[Idx]))
13510 continue;
13511 auto *OpInst = cast<Instruction>(Scalars[Idx]);
13512 if (IsAltOp(OpInst)) {
13513 Mask[I] = Sz + Idx;
13514 if (AltScalars)
13515 AltScalars->push_back(OpInst);
13516 } else {
13517 Mask[I] = Idx;
13518 if (OpScalars)
13519 OpScalars->push_back(OpInst);
13520 }
13521 }
13522 if (!ReuseShuffleIndices.empty()) {
13523 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
13524 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
13525 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
13526 });
13527 Mask.swap(NewMask);
13528 }
13529}
13530
 // NOTE(review): the embedded listing numbering jumps from 13530 to 13532, so
 // the first line of this signature (return type, function name and the
 // leading parameters, which the body shows include I and MainOp) is missing
 // from this copy - restore it from the upstream file.
 //
 // Returns true when the InstructionsState built from (MainOp, AltOp) maps I to
 // MainOp. TLI is not used by the visible body.
13532 Instruction *AltOp,
13533 const TargetLibraryInfo &TLI) {
13534 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
13535}
13536
 // NOTE(review): the embedded listing numbering jumps from 13536 to 13538, so
 // the first line of this signature (return type, function name and the
 // leading parameters, which the body shows include I and MainOp) is missing
 // from this copy - restore it from the upstream file.
 //
 // Returns true when I should be treated as the alternate operation of the
 // (MainOp, AltOp) pair. Compare instructions are classified by predicate
 // (also considering the swapped predicate); everything else is delegated to
 // InstructionsState::getMatchingMainOpOrAltOp.
13538 Instruction *AltOp,
13539 const TargetLibraryInfo &TLI) {
13540 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
13541 auto *AltCI = cast<CmpInst>(AltOp);
13542 CmpInst::Predicate MainP = MainCI->getPredicate();
13543 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
13544 assert(MainP != AltP && "Expected different main/alternate predicates.");
13545 auto *CI = cast<CmpInst>(I);
 // A compare that is the same as (or a swapped form of) the main compare is
 // not alternate; one matching the alternate compare is.
13546 if (isCmpSameOrSwapped(MainCI, CI, TLI))
13547 return false;
13548 if (isCmpSameOrSwapped(AltCI, CI, TLI))
13549 return true;
13550 CmpInst::Predicate P = CI->getPredicate();
 // NOTE(review): the embedded listing numbering jumps from 13550 to 13552, so
 // the declaration of SwappedP (used below) is missing from this copy -
 // restore it from the upstream file.
13552
13553 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
13554 "CmpInst expected to match either main or alternate predicate or "
13555 "their swap.");
 // Alternate iff neither the predicate nor its swapped form equals MainP.
13556 return MainP != P && MainP != SwappedP;
13557 }
13558 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
13559}
13560
13561TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) const {
13562 assert(!Ops.empty());
13563 const auto *Op0 = Ops.front();
13564
13565 const bool IsConstant = all_of(Ops, [](Value *V) {
13566 // TODO: We should allow undef elements here
13567 return isConstant(V) && !isa<UndefValue>(V);
13568 });
13569 const bool IsUniform = all_of(Ops, [=](Value *V) {
13570 // TODO: We should allow undef elements here
13571 return V == Op0;
13572 });
13573 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
13574 // TODO: We should allow undef elements here
13575 if (auto *CI = dyn_cast<ConstantInt>(V))
13576 return CI->getValue().isPowerOf2();
13577 return false;
13578 });
13579 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
13580 // TODO: We should allow undef elements here
13581 if (auto *CI = dyn_cast<ConstantInt>(V))
13582 return CI->getValue().isNegatedPowerOf2();
13583 return false;
13584 });
13585
13587 if (IsConstant && IsUniform)
13589 else if (IsConstant)
13591 else if (IsUniform)
13593
13595 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
13596 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
13597
13598 return {VK, VP};
13599}
13600
13601namespace {
13602/// The base class for shuffle instruction emission and shuffle cost estimation.
13603class BaseShuffleAnalysis {
13604protected:
13605 Type *ScalarTy = nullptr;
13606
 // Records the scalar element type; getVF() reads it to convert element counts
 // into vector factors.
13607 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
13608
13609 /// V is expected to be a vectorized value.
13610 /// When REVEC is disabled, there is no difference between VF and
13611 /// VNumElements.
13612 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
13613 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
13614 /// of 8.
13615 unsigned getVF(Value *V) const {
13616 assert(V && "V cannot be nullptr");
13617 assert(isa<FixedVectorType>(V->getType()) &&
13618 "V does not have FixedVectorType");
13619 assert(ScalarTy && "ScalarTy cannot be nullptr");
13620 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13621 unsigned VNumElements =
13622 cast<FixedVectorType>(V->getType())->getNumElements();
13623 assert(VNumElements > ScalarTyNumElements &&
13624 "the number of elements of V is not large enough");
13625 assert(VNumElements % ScalarTyNumElements == 0 &&
13626 "the number of elements of V is not a vectorized value");
13627 return VNumElements / ScalarTyNumElements;
13628 }
13629
13630 /// Checks if the mask is an identity mask.
13631 /// \param IsStrict if is true the function returns false if mask size does
13632 /// not match vector size.
 // Returns true if Mask behaves as an identity with respect to a vector of
 // type VecTy. A mask whose size equals the vector width and that
 // ShuffleVectorInst reports as identity always qualifies; the relaxed forms
 // below are accepted only when IsStrict is false.
13633 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
13634 bool IsStrict) {
13635 int Limit = Mask.size();
13636 int VF = VecTy->getNumElements();
 // Out-parameter of isExtractSubvectorMask: start index of the extracted
 // subvector.
13637 int Index = -1;
13638 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
13639 return true;
13640 if (!IsStrict) {
13641 // Consider extract subvector starting from index 0.
13642 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13643 Index == 0)
13644 return true;
13645 // All VF-size submasks are identity (e.g.
13646 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
13647 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
13648 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
 // NOTE(review): the embedded listing numbering jumps from 13649 to 13651, so
 // the right-hand operand of the `||` below is missing from this copy
 // (presumably an identity check on Slice) - restore it from the upstream
 // file.
13649 return all_of(Slice, equal_to(PoisonMaskElem)) ||
13651 }))
13652 return true;
13653 }
13654 return false;
13655 }
13656
13657 /// Tries to combine 2 different masks into single one.
13658 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
13659 /// change the size of the vector, \p LocalVF is the original size of the
13660 /// shuffled vector.
13661 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
13662 ArrayRef<int> ExtMask) {
13663 unsigned VF = Mask.size();
13664 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
13665 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
13666 if (ExtMask[I] == PoisonMaskElem)
13667 continue;
13668 int MaskedIdx = Mask[ExtMask[I] % VF];
13669 NewMask[I] =
13670 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
13671 }
13672 Mask.swap(NewMask);
13673 }
13674
13675 /// Looks through shuffles trying to reduce final number of shuffles in the
13676 /// code. The function looks through the previously emitted shuffle
13677 /// instructions and properly mark indices in mask as undef.
13678 /// For example, given the code
13679 /// \code
13680 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
13681 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
13682 /// \endcode
13683 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
13684 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
13685 /// <0, 1, 2, 3> for the shuffle.
13686 /// If 2 operands are of different size, the smallest one will be resized and
13687 /// the mask recalculated properly.
13688 /// For example, given the code
13689 /// \code
13690 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
13691 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
13692 /// \endcode
13693 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
13694 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
13695 /// <0, 1, 2, 3> for the shuffle.
13696 /// So, it tries to transform permutations to simple vector merge, if
13697 /// possible.
13698 /// \param V The input vector which must be shuffled using the given \p Mask.
13699 /// If the better candidate is found, \p V is set to this best candidate
13700 /// vector.
13701 /// \param Mask The input mask for the shuffle. If the best candidate is found
13702 /// during looking-through-shuffles attempt, it is updated accordingly.
13703 /// \param SinglePermute true if the shuffle operation is originally a
13704 /// single-value-permutation. In this case the look-through-shuffles procedure
13705 /// may look for resizing shuffles as the best candidates.
13706 /// \return true if the shuffle results in the non-resizing identity shuffle
13707 /// (and thus can be ignored), false - otherwise.
 // Walks up a chain of shufflevector instructions starting at V, folding each
 // shuffle's mask into Mask, and leaves in V / Mask the vector and mask chosen
 // for the final shuffle. Returns true when the remaining shuffle is an
 // identity that the caller may skip.
13708 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
13709 bool SinglePermute) {
13710 Value *Op = V;
 // Fallback candidate (and the mask that was current when it was recorded),
 // used if the walk does not end on an identity.
13711 ShuffleVectorInst *IdentityOp = nullptr;
13712 SmallVector<int> IdentityMask;
13713 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
13714 // Exit if not a fixed vector type or changing size shuffle.
13715 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
13716 if (!SVTy)
13717 break;
13718 // Remember the identity or broadcast mask, if it is not a resizing
13719 // shuffle. If no better candidates are found, this Op and Mask will be
13720 // used in the final shuffle.
13721 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
 // NOTE(review): the embedded listing numbering jumps from 13723 to 13725, so
 // one line of the condition below is missing from this copy - restore it
 // from the upstream file.
13722 if (!IdentityOp || !SinglePermute ||
13723 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
13725 IdentityMask.size()))) {
13726 IdentityOp = SV;
13727 // Store current mask in the IdentityMask so that later we do not lose
13728 // this info if IdentityOp is selected as the best candidate for the
13729 // permutation.
13730 IdentityMask.assign(Mask);
13731 }
13732 }
13733 // Remember the broadcast mask. If no better candidates are found, this Op
13734 // and Mask will be used in the final shuffle.
13735 // Zero splat can be used as identity too, since it might be used with
13736 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
13737 // E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is
13738 // expensive, the analysis finds out that the source vector is just a
13739 // broadcast, this original mask can be transformed to identity mask <0,
13740 // 1, 2, 3>.
13741 // \code
13742 // %0 = shuffle %v, poison, zeroinitializer
13743 // %res = shuffle %0, poison, <3, 1, 2, 0>
13744 // \endcode
13745 // may be transformed to
13746 // \code
13747 // %0 = shuffle %v, poison, zeroinitializer
13748 // %res = shuffle %0, poison, <0, 1, 2, 3>
13749 // \endcode
13750 if (SV->isZeroEltSplat()) {
13751 IdentityOp = SV;
13752 IdentityMask.assign(Mask);
13753 }
 // LocalVF is the element count of SV's first operand when that is a fixed
 // vector, otherwise the current mask size.
13754 int LocalVF = Mask.size();
13755 if (auto *SVOpTy =
13756 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
13757 LocalVF = SVOpTy->getNumElements();
 // ExtMask: the current mask translated through SV's own shuffle mask;
 // out-of-range and poison entries stay poison.
13758 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
13759 for (auto [Idx, I] : enumerate(Mask)) {
13760 if (I == PoisonMaskElem ||
13761 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
13762 continue;
13763 ExtMask[Idx] = SV->getMaskValue(I);
13764 }
 // Determine whether the lanes actually read from each operand of SV are all
 // poison.
13765 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
13766 SV->getOperand(0),
13767 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
13768 .all();
13769 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
13770 SV->getOperand(1),
13771 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
13772 .all();
 // Both operands contribute defined lanes: the walk cannot continue past SV.
13773 if (!IsOp1Undef && !IsOp2Undef) {
13774 // Update mask and mark undef elems.
13775 for (int &I : Mask) {
13776 if (I == PoisonMaskElem)
13777 continue;
 // NOTE(review): the embedded listing numbering jumps from 13778 to 13780, so
 // the right-hand side of the comparison below is missing from this copy -
 // restore it from the upstream file.
13778 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
13780 I = PoisonMaskElem;
13781 }
13782 break;
13783 }
 // Fold SV's mask into Mask and continue with whichever operand is used.
13784 SmallVector<int> ShuffleMask(SV->getShuffleMask());
13785 combineMasks(LocalVF, ShuffleMask, Mask);
13786 Mask.swap(ShuffleMask);
13787 if (IsOp2Undef)
13788 Op = SV->getOperand(0);
13789 else
13790 Op = SV->getOperand(1);
13791 }
 // NOTE(review): the embedded listing numbering jumps from 13793 to 13795, so
 // the end of the condition below (and the opening of its block) is missing
 // from this copy - restore it from the upstream file.
13792 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
13793 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
13795 if (IdentityOp) {
13796 V = IdentityOp;
13797 assert(Mask.size() == IdentityMask.size() &&
13798 "Expected masks of same sizes.");
13799 // Clear known poison elements.
13800 for (auto [I, Idx] : enumerate(Mask))
13801 if (Idx == PoisonMaskElem)
13802 IdentityMask[I] = PoisonMaskElem;
13803 Mask.swap(IdentityMask);
13804 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
 // NOTE(review): the embedded listing numbering jumps from 13809 to 13811, so
 // one line of the returned expression is missing from this copy - restore it
 // from the upstream file.
13805 return SinglePermute &&
13806 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
13807 /*IsStrict=*/true) ||
13808 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
13809 Shuffle->isZeroEltSplat() &&
13811 all_of(enumerate(Mask), [&](const auto &P) {
13812 return P.value() == PoisonMaskElem ||
13813 Shuffle->getShuffleMask()[P.index()] == 0;
13814 })));
13815 }
 // No fallback candidate: use the last operand reached; a shuffle is needed.
13816 V = Op;
13817 return false;
13818 }
 // The walk ended on an identity: the final operand can be used directly.
13819 V = Op;
13820 return true;
13821 }
13822
13823 /// Smart shuffle instruction emission, walks through shuffles trees and
13824 /// tries to find the best matching vector for the actual shuffle
13825 /// instruction.
 // Emits (through Builder) the shuffle of V1 and optional V2 described by
 // Mask, after looking through existing shufflevector instructions to find
 // simpler sources. The result is whatever Builder returns from
 // createIdentity / createShuffleVector / createPoison.
13826 template <typename T, typename ShuffleBuilderTy, typename... Args>
13827 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
13828 ShuffleBuilderTy &Builder, Type *ScalarTy,
13829 Args... Arguments) {
13830 assert(V1 && "Expected at least one vector value.");
13831 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13832 SmallVector<int> NewMask(Mask);
 // When ScalarTy itself has several elements, widen the mask indices to
 // element granularity; Mask is then re-pointed at the widened copy.
13833 if (ScalarTyNumElements != 1) {
13834 assert(SLPReVec && "FixedVectorType is not expected.");
13835 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
13836 Mask = NewMask;
13837 }
13838 if (V2)
13839 Builder.resizeToMatch(V1, V2);
13840 int VF = Mask.size();
13841 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
13842 VF = FTy->getNumElements();
 // NOTE(review): the embedded listing numbering jumps from 13842 to 13844, so
 // the start of the condition that opens the two-source path is missing from
 // this copy - restore it from the upstream file.
13844 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
13845 .all()) {
13846 // Peek through shuffles.
13847 Value *Op1 = V1;
13848 Value *Op2 = V2;
 // This VF shadows the outer VF for the rest of the two-source path.
13849 int VF =
13850 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
 // Split Mask into one mask per source: indices below VF address V1, the
 // rest address V2 (rebased by VF).
13851 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
13852 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
13853 for (int I = 0, E = Mask.size(); I < E; ++I) {
13854 if (Mask[I] < VF)
13855 CombinedMask1[I] = Mask[I];
13856 else
13857 CombinedMask2[I] = Mask[I] - VF;
13858 }
 // Iterate until neither operand changes any more.
13859 Value *PrevOp1;
13860 Value *PrevOp2;
13861 do {
13862 PrevOp1 = Op1;
13863 PrevOp2 = Op2;
13864 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
13865 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
13866 // Check if we have 2 resizing shuffles - need to peek through operands
13867 // again.
13868 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
13869 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
13870 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
13871 for (auto [Idx, I] : enumerate(CombinedMask1)) {
13872 if (I == PoisonMaskElem)
13873 continue;
13874 ExtMask1[Idx] = SV1->getMaskValue(I);
13875 }
13876 SmallBitVector UseMask1 = buildUseMask(
13877 cast<FixedVectorType>(SV1->getOperand(1)->getType())
13878 ->getNumElements(),
13879 ExtMask1, UseMask::SecondArg);
13880 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
13881 for (auto [Idx, I] : enumerate(CombinedMask2)) {
13882 if (I == PoisonMaskElem)
13883 continue;
13884 ExtMask2[Idx] = SV2->getMaskValue(I);
13885 }
13886 SmallBitVector UseMask2 = buildUseMask(
13887 cast<FixedVectorType>(SV2->getOperand(1)->getType())
13888 ->getNumElements(),
13889 ExtMask2, UseMask::SecondArg);
 // Both shuffles resize from the same source type and only read their first
 // operand: step to those first operands and fold the masks.
13890 if (SV1->getOperand(0)->getType() ==
13891 SV2->getOperand(0)->getType() &&
13892 SV1->getOperand(0)->getType() != SV1->getType() &&
13893 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
13894 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
13895 Op1 = SV1->getOperand(0);
13896 Op2 = SV2->getOperand(0);
13897 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
13898 int LocalVF = ShuffleMask1.size();
13899 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
13900 LocalVF = FTy->getNumElements();
13901 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
13902 CombinedMask1.swap(ShuffleMask1);
13903 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
13904 LocalVF = ShuffleMask2.size();
13905 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
13906 LocalVF = FTy->getNumElements();
13907 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
13908 CombinedMask2.swap(ShuffleMask2);
13909 }
13910 }
13911 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
13912 Builder.resizeToMatch(Op1, Op2);
 // NOTE(review): the embedded listing numbering jumps from 13915 to 13917, so
 // the start of the second std::max argument is missing from this copy
 // (presumably the matching expression for Op2) - restore it from the
 // upstream file.
13913 VF = std::max(cast<VectorType>(Op1->getType())
13914 ->getElementCount()
13915 .getKnownMinValue(),
13917 ->getElementCount()
13918 .getKnownMinValue());
 // Merge the second mask back into the first; lanes taken from Op2 are
 // offset by VF unless both operands ended up being the same value.
13919 for (int I = 0, E = Mask.size(); I < E; ++I) {
13920 if (CombinedMask2[I] != PoisonMaskElem) {
13921 assert(CombinedMask1[I] == PoisonMaskElem &&
13922 "Expected undefined mask element");
13923 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
13924 }
13925 }
 // NOTE(review): the embedded listing numbering jumps from 13928 to 13930, so
 // one line of the condition below is missing from this copy - restore it
 // from the upstream file.
13926 if (Op1 == Op2 &&
13927 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
13928 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
13930 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
13931 ArrayRef(CombinedMask1))))
13932 return Builder.createIdentity(Op1);
13933 return Builder.createShuffleVector(
13934 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
13935 CombinedMask1);
13936 }
 // Single-source path.
13937 if (isa<PoisonValue>(V1))
13938 return Builder.createPoison(
13939 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
13940 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
13941 assert(V1 && "Expected non-null value after looking through shuffles.");
13942
13943 if (!IsIdentity)
13944 return Builder.createShuffleVector(V1, NewMask, Arguments...);
13945 return Builder.createIdentity(V1);
13946 }
13947
13948 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
13949 /// shuffle emission.
13950 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
13951 ArrayRef<int> Mask) {
13952 for (unsigned I : seq<unsigned>(CommonMask.size()))
13953 if (Mask[I] != PoisonMaskElem)
13954 CommonMask[I] = I;
13955 }
13956};
13957} // namespace
13958
13959/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
// Returns {ScalarCost, VecCost} for the pointer computations in Ptrs relative
// to BasePtr. Opcode selects between the wide load/store case and the
// gather-style case described below.
13960static std::pair<InstructionCost, InstructionCost>
// NOTE(review): the embedded listing numbering jumps from 13960 to 13962, so
// the signature line carrying the function name and its leading parameters
// (the body shows they include TTI and Ptrs) is missing from this copy -
// restore it from the upstream file.
13962 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
13963 Type *ScalarTy, VectorType *VecTy) {
13964 InstructionCost ScalarCost = 0;
13965 InstructionCost VecCost = 0;
13966 // Here we differentiate two cases: (1) when Ptrs represent a regular
13967 // vectorization tree node (as they are pointer arguments of scattered
13968 // loads) or (2) when Ptrs are the arguments of loads or stores being
13969 // vectorized as plain wide unit-stride load/store since all the
13970 // loads/stores are known to be from/to adjacent locations.
13971 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
13972 // Case 2: estimate costs for pointer related costs when vectorizing to
13973 // a wide load/store.
13974 // Scalar cost is estimated as a set of pointers with known relationship
13975 // between them.
13976 // For vector code we will use BasePtr as argument for the wide load/store
13977 // but we also need to account all the instructions which are going to
13978 // stay in vectorized code due to uses outside of these scalar
13979 // loads/stores.
13980 ScalarCost = TTI.getPointersChainCost(
13981 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
13982 CostKind);
13983
 // Pointers that remain in the vectorized code: BasePtr itself, non-GEP
 // pointers, and GEPs with more than one use.
13984 SmallVector<const Value *> PtrsRetainedInVecCode;
13985 for (Value *V : Ptrs) {
13986 if (V == BasePtr) {
13987 PtrsRetainedInVecCode.push_back(V);
13988 continue;
13989 }
13990 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
13991 // For simplicity assume Ptr to stay in vectorized code if it's not a
13992 // GEP instruction. We don't care since its cost is considered free.
13993 // TODO: We should check for any uses outside of vectorizable tree
13994 // rather than just single use.
13995 if (!Ptr || !Ptr->hasOneUse())
13996 PtrsRetainedInVecCode.push_back(V);
13997 }
13998
13999 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
14000 // If all pointers stay in vectorized code then we don't have
14001 // any savings on that.
14002 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
14003 }
14004 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
14005 TTI::PointersChainInfo::getKnownStride(),
14006 VecTy, CostKind);
14007 } else {
14008 // Case 1: Ptrs are the arguments of loads that we are going to transform
14009 // into masked gather load intrinsic.
14010 // All the scalar GEPs will be removed as a result of vectorization.
14011 // For any external uses of some lanes extract element instructions will
14012 // be generated (which cost is estimated separately).
 // Unknown stride only if every pointer is a GEP with at least one
 // non-constant index; otherwise the stride is treated as known.
14013 TTI::PointersChainInfo PtrsInfo =
14014 all_of(Ptrs,
14015 [](const Value *V) {
14016 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
14017 return Ptr && !Ptr->hasAllConstantIndices();
14018 })
14019 ? TTI::PointersChainInfo::getUnknownStride()
14020 : TTI::PointersChainInfo::getKnownStride();
14021
14022 ScalarCost =
14023 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
 // The vector cost is that of a single representative GEP: BasePtr if it is
 // a GEP, otherwise the first GEP found in Ptrs. With no GEP at all, VecCost
 // stays 0.
14024 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
14025 if (!BaseGEP) {
14026 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
14027 if (It != Ptrs.end())
14028 BaseGEP = cast<GEPOperator>(*It);
14029 }
14030 if (BaseGEP) {
14031 SmallVector<const Value *> Indices(BaseGEP->indices());
14032 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
14033 BaseGEP->getPointerOperand(), Indices, VecTy,
14034 CostKind);
14035 }
14036 }
14037
14038 return std::make_pair(ScalarCost, VecCost);
14039}
14040
// Tries to reorder the scalars of the gather node TE so that scalars with the
// same key/subkey become adjacent. The new order is recorded in
// TE.ReorderIndices and TE.Scalars is permuted accordingly; a simple cost
// comparison at the end may undo the reordering.
14041void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
14042 assert(TE.isGather() && TE.ReorderIndices.empty() &&
14043 "Expected gather node without reordering.");
14044 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
14045 SmallSet<size_t, 2> LoadKeyUsed;
14046
14047 // Do not reorder nodes if it is small (just 2 elements), all-constant or all
14048 // instructions have same opcode already.
14049 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
14050 all_of(TE.Scalars, isConstant))
14051 return;
14052
 // Nothing to do if an earlier tree entry already holds the same scalars.
14053 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
14054 return VectorizableTree[Idx]->isSame(TE.Scalars);
14055 }))
14056 return;
14057
 // Subkey generator for loads: loads are grouped per (key, underlying object)
 // in LoadsMap, and a load related to an already recorded one reuses that
 // load's pointer hash as its subkey.
14058 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
14059 Key = hash_combine(hash_value(LI->getParent()->getNumber()), Key);
14060 Value *Ptr =
14061 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
14062 if (LoadKeyUsed.contains(Key)) {
14063 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
14064 if (LIt != LoadsMap.end()) {
14065 for (LoadInst *RLI : LIt->second) {
14066 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
14067 LI->getType(), LI->getPointerOperand(), *DL, *SE,
14068 /*StrictCheck=*/true))
14069 return hash_value(RLI->getPointerOperand());
14070 }
14071 for (LoadInst *RLI : LIt->second) {
 // NOTE(review): the embedded listing numbering jumps from 14071 to 14073, so
 // the start of the condition below is missing from this copy - restore it
 // from the upstream file.
14073 LI->getPointerOperand(), *TLI)) {
14074 hash_code SubKey = hash_value(RLI->getPointerOperand());
14075 return SubKey;
14076 }
14077 }
14078 if (LIt->second.size() > 2) {
14079 hash_code SubKey =
14080 hash_value(LIt->second.back()->getPointerOperand());
14081 return SubKey;
14082 }
14083 }
14084 }
14085 LoadKeyUsed.insert(Key);
14086 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
14087 return hash_value(LI->getPointerOperand());
14088 };
 // SortedValues: key -> subkey -> unique values, in first-seen order.
 // KeyToIndex: value -> all positions at which it occurs in TE.Scalars.
14089 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
14090 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
14091 bool IsOrdered = true;
14092 unsigned NumInstructions = 0;
14093 // Try to "cluster" scalar instructions, to be able to build extra vectorized
14094 // nodes.
14095 for (auto [I, V] : enumerate(TE.Scalars)) {
14096 size_t Key = 1, Idx = 1;
 // NOTE(review): the embedded listing numbering jumps from 14097 to 14099, so
 // one line of the condition below is missing from this copy - restore it
 // from the upstream file.
14097 if (auto *Inst = dyn_cast<Instruction>(V);
14099 !isDeleted(Inst) && !isVectorized(V)) {
14100 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
14101 /*AllowAlternate=*/false);
14102 ++NumInstructions;
14103 }
14104 auto &Container = SortedValues[Key];
 // NOTE(review): the embedded listing numbering jumps from 14105 to 14108, so
 // two lines of the condition below are missing from this copy - restore them
 // from the upstream file.
14105 if (IsOrdered && !KeyToIndex.contains(V) &&
14108 ((Container.contains(Idx) &&
14109 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
14110 (!Container.empty() && !Container.contains(Idx) &&
14111 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
14112 IsOrdered = false;
 // Record V in its cluster on first occurrence only; always record position.
14113 auto &KTI = KeyToIndex[V];
14114 if (KTI.empty())
14115 Container[Idx].push_back(V);
14116 KTI.push_back(I);
14117 }
 // NOTE(review): the embedded listing numbering jumps from 14117 to 14119, so
 // a line is missing from this copy (presumably the declaration of SubVectors,
 // which is used below) - restore it from the upstream file.
14119 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
14120 if (!IsOrdered && NumInstructions > 1) {
14121 unsigned Cnt = 0;
14122 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
 // Lay the clusters out one after another, filling ReorderIndices and
 // rewriting TE.Scalars in the new order.
14123 for (const auto &D : SortedValues) {
14124 for (const auto &P : D.second) {
14125 unsigned Sz = 0;
14126 for (Value *V : P.second) {
14127 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
14128 for (auto [K, Idx] : enumerate(Indices)) {
14129 TE.ReorderIndices[Cnt + K] = Idx;
14130 TE.Scalars[Cnt + K] = V;
14131 }
14132 Sz += Indices.size();
14133 Cnt += Indices.size();
14134 }
 // Instruction clusters of more than one element are recorded as subvectors
 // and removed from DemandedElts; constant clusters are removed as well.
14135 if (Sz > 1 && isa<Instruction>(P.second.front())) {
14136 const unsigned SubVF = getFloorFullVectorNumberOfElements(
14137 *TTI, TE.Scalars.front()->getType(), Sz);
14138 SubVectors.emplace_back(Cnt - Sz, SubVF);
14139 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
14140 DemandedElts.clearBit(I);
14141 } else if (!P.second.empty() && isConstant(P.second.front())) {
14142 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
14143 DemandedElts.clearBit(I);
14144 }
14145 }
14146 }
14147 }
14148 // Reuses always require shuffles, so consider it as profitable.
14149 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
14150 return;
14151 // Do simple cost estimation.
 // NOTE(review): the embedded listing numbering jumps from 14151 to 14154, so
 // two lines are missing from this copy (presumably the declarations of
 // CostKind and Cost, which are used below) - restore them from the upstream
 // file.
14154 auto *ScalarTy = TE.Scalars.front()->getType();
14155 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
14156 for (auto [Idx, Sz] : SubVectors) {
 // NOTE(review): the embedded listing numbering jumps from 14156 to 14158, so
 // the start of the statement below is missing from this copy - restore it
 // from the upstream file.
14158 Idx, getWidenedType(ScalarTy, Sz));
14159 }
14160 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
14161 /*Insert=*/true,
14162 /*Extract=*/false, CostKind);
14163 int Sz = TE.Scalars.size();
14164 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
14165 TE.ReorderIndices.end());
14166 for (unsigned I : seq<unsigned>(Sz)) {
14167 Value *V = TE.getOrdered(I);
14168 if (isa<PoisonValue>(V)) {
14169 ReorderMask[I] = PoisonMaskElem;
14170 } else if (isConstant(V) || DemandedElts[I]) {
14171 ReorderMask[I] = I + TE.ReorderIndices.size();
14172 }
14173 }
 // NOTE(review): the embedded listing numbering jumps from 14175 to 14178, so
 // two lines of the call below (the shuffle-kind selection) are missing from
 // this copy - restore them from the upstream file.
14174 Cost += ::getShuffleCost(*TTI,
14175 any_of(ReorderMask, [&](int I) { return I >= Sz; })
14178 VecTy, ReorderMask);
 // Cost of the alternative: a plain build vector of the non-constant
 // elements, blended with the constants when there are any.
14179 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
14180 ReorderMask.assign(Sz, PoisonMaskElem);
14181 for (unsigned I : seq<unsigned>(Sz)) {
14182 Value *V = TE.getOrdered(I);
14183 if (isConstant(V)) {
14184 DemandedElts.clearBit(I);
14185 if (!isa<PoisonValue>(V))
14186 ReorderMask[I] = I;
14187 } else {
14188 ReorderMask[I] = I + Sz;
14189 }
14190 }
14191 InstructionCost BVCost =
14192 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
14193 /*Insert=*/true, /*Extract=*/false, CostKind);
14194 if (!DemandedElts.isAllOnes())
14195 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
 // Reordering is not cheaper: restore the original scalar order and drop
 // the reorder indices.
14196 if (Cost >= BVCost) {
14197 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
14198 reorderScalars(TE.Scalars, Mask);
14199 TE.ReorderIndices.clear();
14200 }
14201}
14202
14203/// Check if we can convert fadd/fsub sequence to FMAD.
14204/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
 // NOTE(review): the embedded listing numbering jumps from 14204 to 14206 and
 // from 14207 to 14209, so two signature lines are missing from this copy: the
 // one with the return type, function name and the leading parameter (the body
 // shows it is named VL), and the one declaring the TTI parameter - restore
 // them from the upstream file.
14206 const InstructionsState &S,
14207 DominatorTree &DT, const DataLayout &DL,
14209 const TargetLibraryInfo &TLI) {
14210 assert(all_of(VL,
14211 [](Value *V) {
14212 return V->getType()->getScalarType()->isFloatingPointTy();
14213 }) &&
14214 "Can only convert to FMA for floating point types");
14215 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
14216
 // True when the intersection of the fast-math flags of the relevant
 // instructions in the list still allows contraction. Non-instructions,
 // copyable elements and instructions matching neither the main nor the
 // alternate op are ignored.
14217 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
14218 FastMathFlags FMF;
14219 FMF.set();
14220 for (Value *V : VL) {
14221 auto *I = dyn_cast<Instruction>(V);
14222 if (!I)
14223 continue;
14224 if (S.isCopyableElement(I))
14225 continue;
14226 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
14227 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
14228 continue;
14229 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
14230 FMF &= FPCI->getFastMathFlags();
14231 }
14232 return FMF.allowContract();
14233 };
 // NOTE(review): the embedded listing numbering jumps from 14234 to 14236, so
 // the statement executed when this check fails is missing from this copy
 // (presumably a return of an invalid cost) - restore it from the upstream
 // file.
14234 if (!CheckForContractable(VL))
14236 // fmul also should be contractable
14237 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
14238 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
14239
 // Only the first operand list is examined for the fmul.
14240 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
 // NOTE(review): the embedded listing numbering skips 14242, 14245 and 14247,
 // so the statements executed by the three `if`s below are missing from this
 // copy (presumably early returns of an invalid cost) - restore them from the
 // upstream file.
14241 if (!OpS.valid())
14243
14244 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
14246 if (!CheckForContractable(Operands.front()))
14248 // Compare the costs.
14249 InstructionCost FMulPlusFAddCost = 0;
14250 InstructionCost FMACost = 0;
 // NOTE(review): the embedded listing numbering jumps from 14250 to 14252, so
 // a line is missing from this copy (presumably the declaration of CostKind,
 // which is used below) - restore it from the upstream file.
14252 FastMathFlags FMF;
14253 FMF.set();
 // Accumulate the cost of the add/sub instructions and intersect their
 // fast-math flags (copyable elements do not contribute flags).
14254 for (Value *V : VL) {
14255 auto *I = dyn_cast<Instruction>(V);
14256 if (!I)
14257 continue;
14258 if (!S.isCopyableElement(I))
14259 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
14260 FMF &= FPCI->getFastMathFlags();
14261 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
14262 }
14263 unsigned NumOps = 0;
14264 for (auto [V, Op] : zip(VL, Operands.front())) {
14265 if (S.isCopyableElement(V))
14266 continue;
14267 auto *I = dyn_cast<Instruction>(Op);
 // The operand cannot be fused (not an instruction, has other uses, or is a
 // copyable element): both instructions stay on the FMA side as they are.
14268 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
14269 if (auto *OpI = dyn_cast<Instruction>(V))
14270 FMACost += TTI.getInstructionCost(OpI, CostKind);
14271 if (I)
14272 FMACost += TTI.getInstructionCost(I, CostKind);
14273 continue;
14274 }
 // Fusable pair: counts as one fmuladd later; the fmul's cost goes to the
 // unfused side.
14275 ++NumOps;
14276 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
14277 FMF &= FPCI->getFastMathFlags();
14278 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
14279 }
14280 Type *Ty = VL.front()->getType();
14281 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
14282 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
 // The FMA cost is returned only when strictly cheaper; otherwise the result
 // is an invalid cost.
14283 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
14284}
14285
/// Checks whether the Shl node \p TE, whose first operand is a vectorized ZExt
/// node and whose second operand is a gather of constant shift amounts, is
/// cheaper to model as a bitcast of the narrow source vector to one wide
/// integer (optionally combined with a permute or a bswap, or costed as a
/// direct load of the wide integer) than as vector zext + shl + or-reduction.
///
/// \param TE Shl tree entry to analyze (asserted to have the Shl opcode).
/// \param Order [out] Permutation that brings the shift amounts into the
/// (0, Stride, 2 * Stride, ...) form. Left empty when the amounts are already
/// in that form or when the bswap form is selected. May be left partially
/// filled when the function returns false.
/// \param IsBSwap [out] Set when the reversed order is costed as a bswap.
/// \param ForLoads [out] Set when the cost was computed for a load of the wide
/// integer type instead of a bitcast of the narrow vector.
/// \returns true if the bitcast-based cost is strictly lower than the cost of
/// the original vector form.
14286bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
14287 bool &IsBSwap, bool &ForLoads) const {
14288 assert(TE.hasState() && TE.getOpcode() == Instruction::Shl &&
14289 "Expected Shl node.");
14290 IsBSwap = false;
14291 ForLoads = false;
// Only plain vectorized nodes are supported: no reordering, no reused
// scalars, no minimized bitwidth, and every scalar must have a single use.
14292 if (TE.State != TreeEntry::Vectorize || !TE.ReorderIndices.empty() ||
14293 !TE.ReuseShuffleIndices.empty() || MinBWs.contains(&TE) ||
14294 any_of(TE.Scalars, [](Value *V) { return !V->hasOneUse(); }))
14295 return false;
14296 Type *ScalarTy = TE.getMainOp()->getType();
14297 // TODO: Check if same can be done for the vector types.
14298 if (!ScalarTy->isIntegerTy())
14299 return false;
14300 if (ScalarTy->isVectorTy())
14301 return false;
// Sz is the bit width of the shifted (result) scalar type.
14302 const unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
14303 const TreeEntry *LhsTE = getOperandEntry(&TE, /*Idx=*/0);
14304 const TreeEntry *RhsTE = getOperandEntry(&TE, /*Idx=*/1);
14305 // Lhs should be zext i<stride> to I<sz>.
14306 if (!(LhsTE->State == TreeEntry::Vectorize &&
14307 LhsTE->getOpcode() == Instruction::ZExt &&
14308 LhsTE->ReorderIndices.empty() && LhsTE->ReuseShuffleIndices.empty() &&
14309 !MinBWs.contains(LhsTE) &&
14310 all_of(LhsTE->Scalars, [](Value *V) { return V->hasOneUse(); })))
14311 return false;
// Stride is the bit width of the ZExt source type. It must be a power of 2
// that is smaller than Sz and divides it, and the ZExt vector factor must be
// a power of 2 as well.
14312 Type *SrcScalarTy = cast<ZExtInst>(LhsTE->getMainOp())->getSrcTy();
14313 unsigned Stride = DL->getTypeSizeInBits(SrcScalarTy);
14314 if (!isPowerOf2_64(Stride) || Stride >= Sz || Sz % Stride != 0 ||
14315 !isPowerOf2_64(LhsTE->getVectorFactor()))
14316 return false;
// Rhs (the shift amounts) must be a plain gather node.
14317 if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
14318 RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(RhsTE)))
14319 return false;
14320 Order.clear();
14321 unsigned CurrentValue = 0;
14322 // Rhs should be (0, Stride, 2 * Stride, ..., N-Stride), where N <= Sz.
// CurrentValue is advanced before the comparison, so CurrentValue - Stride is
// the shift amount expected for the current lane. Undef lanes match anything.
14323 if (all_of(RhsTE->Scalars,
14324 [&](Value *V) {
14325 CurrentValue += Stride;
14326 if (isa<UndefValue>(V))
14327 return true;
14328 auto *C = dyn_cast<Constant>(V);
14329 if (!C)
14330 return false;
14331 return C->getUniqueInteger() == CurrentValue - Stride;
14332 }) &&
14333 CurrentValue <= Sz) {
14334 Order.clear();
14335 } else {
// Order[Idx] == VF marks a lane whose position has not been assigned yet.
14336 const unsigned VF = RhsTE->getVectorFactor();
14337 Order.assign(VF, VF);
14338 // Track which logical positions we've seen; reject duplicate shift amounts.
14339 SmallBitVector SeenPositions(VF);
14340 // Check if need to reorder Rhs to make it in form (0, Stride, 2 * Stride,
14341 // ..., N-Stride), where N <= Sz.
14342 if (VF * Stride > Sz)
14343 return false;
14344 for (const auto [Idx, V] : enumerate(RhsTE->Scalars)) {
14345 if (isa<UndefValue>(V))
14346 continue;
14347 auto *C = dyn_cast<Constant>(V);
14348 if (!C)
14349 return false;
// The shift amount must be non-negative, below Sz and a multiple of Stride.
14350 const APInt &Val = C->getUniqueInteger();
14351 if (Val.isNegative() || Val.uge(Sz) || Val.getZExtValue() % Stride != 0)
14352 return false;
14353 unsigned Pos = Val.getZExtValue() / Stride;
14354 // TODO: Support Pos >= VF, in this case need to shift the final value.
14355 if (Order[Idx] != VF || Pos >= VF)
14356 return false;
14357 if (SeenPositions.test(Pos))
14358 return false;
14359 SeenPositions.set(Pos);
14360 Order[Idx] = Pos;
14361 }
14362 // One of the indices not set - exit.
// Undef lanes are skipped by the loop above, so any undef shift amount on
// this path leaves its slot equal to VF and is rejected here.
14363 if (is_contained(Order, VF))
14364 return false;
14365 }
// NOTE(review): CostKind is used below but its declaration is not visible in
// this listing (listing line 14366 is absent) - presumably declared there;
// confirm against the original file.
// SrcType is the wide integer holding all narrow source elements:
// Stride * (ZExt vector factor) bits.
14367 auto *SrcType = IntegerType::getIntNTy(ScalarTy->getContext(),
14368 Stride * LhsTE->getVectorFactor());
14369 FastMathFlags FMF;
14370 SmallPtrSet<Value *, 4> CheckedExtracts;
14371 auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor());
14372 auto *SrcVecTy = getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
14373 TTI::CastContextHint CastCtx =
14374 getCastContextHint(*getOperandEntry(LhsTE, /*Idx=*/0));
// Cost of the original vector form: or-reduction + vector shl + vector zext.
14375 InstructionCost VecCost =
14376 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF, CostKind) +
14377 TTI->getArithmeticInstrCost(Instruction::Shl, VecTy, CostKind,
14378 getOperandInfo(LhsTE->Scalars)) +
14379 TTI->getCastInstrCost(
14380 Instruction::ZExt, VecTy,
14381 getWidenedType(SrcScalarTy, LhsTE->getVectorFactor()), CastCtx,
14382 CostKind);
// Cost of the replacement: bitcast of the narrow source vector to SrcType.
14383 InstructionCost BitcastCost = TTI->getCastInstrCost(
14384 Instruction::BitCast, SrcType, SrcVecTy, CastCtx, CostKind);
// If the lanes must be reordered first, add a single-source permute.
14385 if (!Order.empty()) {
14386 fixupOrderingIndices(Order);
14387 SmallVector<int> Mask;
14388 inversePermutation(Order, Mask);
14389 BitcastCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, SrcVecTy,
14390 Mask, CostKind);
14391 }
14392 // Check if the combination can be modeled as a bitcast+byteswap operation.
14393 constexpr unsigned ByteSize = 8;
14394 if (!Order.empty() && isReverseOrder(Order) &&
14395 DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
14396 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
14397 InstructionCost BSwapCost =
14398 TTI->getCastInstrCost(Instruction::BitCast, SrcType, SrcVecTy, CastCtx,
14399 CostKind) +
14400 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
// Prefer bitcast + bswap when it is not more expensive than bitcast +
// permute; the explicit order is no longer needed in that case.
14401 if (BSwapCost <= BitcastCost) {
14402 BitcastCost = BSwapCost;
14403 IsBSwap = true;
14404 Order.clear();
14405 // Check for loads in the ZExt node.
14406 const TreeEntry *SrcTE = getOperandEntry(LhsTE, /*Idx=*/0);
14407 if (SrcTE->State == TreeEntry::Vectorize &&
14408 SrcTE->ReorderIndices.empty() && SrcTE->ReuseShuffleIndices.empty() &&
14409 SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
14410 all_of(SrcTE->Scalars, [](Value *V) { return V->hasOneUse(); })) {
14411 auto *LI = cast<LoadInst>(SrcTE->getMainOp());
14412 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
// Wide-integer load + bswap. If chosen, the vector load it replaces is
// charged to the original vector form.
14413 InstructionCost BSwapCost =
14414 TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
14415 LI->getPointerAddressSpace(), CostKind) +
14416 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14417 if (BSwapCost <= BitcastCost) {
14418 VecCost +=
14419 TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
14420 LI->getPointerAddressSpace(), CostKind);
14421 BitcastCost = BSwapCost;
14422 ForLoads = true;
14423 }
14424 }
14425 }
14426 } else if (Order.empty() && DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
14427 // Check for loads in the ZExt node.
14428 const TreeEntry *SrcTE = getOperandEntry(LhsTE, /*Idx=*/0);
14429 if (SrcTE->State == TreeEntry::Vectorize && SrcTE->ReorderIndices.empty() &&
14430 SrcTE->ReuseShuffleIndices.empty() &&
14431 SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
14432 all_of(SrcTE->Scalars, [](Value *V) { return V->hasOneUse(); })) {
14433 auto *LI = cast<LoadInst>(SrcTE->getMainOp());
// Already in order: the bitcast cost is replaced (unconditionally) by the
// cost of a wide-integer load, and the vector load is charged to VecCost.
14434 BitcastCost =
14435 TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
14436 LI->getPointerAddressSpace(), CostKind);
14437 VecCost +=
14438 TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
14439 LI->getPointerAddressSpace(), CostKind);
14440 ForLoads = true;
14441 }
14442 }
// If the wide integer type differs from the result type, a scalar zext is
// added to the replacement cost.
// NOTE(review): the call below is cut off in this listing (listing line 14445
// is absent); the remaining arguments are not visible here.
14443 if (SrcType != ScalarTy) {
14444 BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
14446 }
14447 return BitcastCost < VecCost;
14448}
14449
/// Checks whether the select node \p SelectTE has a condition node made of
/// compares with a main predicate and its inverse, so that the lanes using the
/// inverse predicate can be handled by one uniform vector compare, and whether
/// that vector compare is cheaper than building the condition from scalars.
///
/// \param SelectTE Select tree entry to analyze (asserted to have the Select
/// opcode).
/// \param InversedCmpsIndices [out] Receives the lane indices of the compares
/// that use the inverse of the main predicate. Indices are appended (the
/// vector is not cleared here) and may already have been appended when the
/// function returns false.
/// \returns true if the vector compare cost is strictly lower than the
/// buildvector cost of the scalar compares.
14450bool BoUpSLP::matchesInversedZExtSelect(
14451 const TreeEntry &SelectTE,
14452 SmallVectorImpl<unsigned> &InversedCmpsIndices) const {
14453 assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
14454 "Expected select node.");
// NOTE(review): ZExts is used below but its declaration is not visible in
// this listing (listing line 14455 is absent); confirm against the original
// file.
// Collect the scalars of the select node that are ZExt instructions, together
// with their lane index; at least one is required.
14456 for (auto [Idx, V] : enumerate(SelectTE.Scalars)) {
14457 auto *Inst = dyn_cast<Instruction>(V);
14458 if (!Inst || Inst->getOpcode() != Instruction::ZExt)
14459 continue;
14460 ZExts.emplace_back(Inst, Idx);
14461 }
14462 if (ZExts.empty())
14463 return false;
14464 const auto *CmpTE = getOperandEntry(&SelectTE, 0);
14465 const auto *Op1TE = getOperandEntry(&SelectTE, 1);
14466 const auto *Op2TE = getOperandEntry(&SelectTE, 2);
14467 // Compares must be alternate vectorized, and other operands must be gathers
14468 // or copyables.
14469 // TODO: investigate opportunity for reordered/reused nodes.
14470 if (CmpTE->State != TreeEntry::Vectorize || !CmpTE->isAltShuffle() ||
14471 (CmpTE->getOpcode() != Instruction::ICmp &&
14472 CmpTE->getOpcode() != Instruction::FCmp) ||
14473 !CmpTE->ReorderIndices.empty() || !CmpTE->ReuseShuffleIndices.empty() ||
14474 !Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
14475 !Op2TE->ReorderIndices.empty() || !Op2TE->ReuseShuffleIndices.empty())
14476 return false;
14477 // The operands must be buildvectors/copyables.
14478 if (!Op1TE->isGather() || !Op2TE->isGather())
14479 return false;
14480 // TODO: investigate opportunity for the vector nodes with copyables.
// MatchCmp is built around Pred; the loop below reads Pred after each
// successful match, i.e. match() is expected to rebind Pred to the predicate
// of the matched compare. MainPred keeps the main operation's predicate.
14481 auto *Cmp = CmpTE->getMainOp();
14482 CmpPredicate Pred;
14483 auto MatchCmp = m_Cmp(Pred, m_Value(), m_Value());
14484 if (!match(Cmp, MatchCmp))
14485 return false;
14486 CmpPredicate MainPred = Pred;
14487 CmpPredicate InversedPred(CmpInst::getInversePredicate(Pred),
14488 Pred.hasSameSign());
// Classify every compare lane: lanes compatible with the main predicate are
// fine as is; all other lanes must be compatible with the inverse predicate
// and have a single use, and their index is recorded.
14489 for (const auto [Idx, V] : enumerate(CmpTE->Scalars)) {
14490 if (!match(V, MatchCmp))
14491 continue;
14492 if (CmpPredicate::getMatching(MainPred, Pred))
14493 continue;
14494 if (!CmpPredicate::getMatching(InversedPred, Pred))
14495 return false;
14496 if (!V->hasOneUse())
14497 return false;
14498 InversedCmpsIndices.push_back(Idx);
14499 }
14500
// Nothing to do if no lane uses the inverse predicate.
14501 if (InversedCmpsIndices.empty())
14502 return false;
14503 VectorType *VecTy =
14504 getWidenedType(Cmp->getOperand(0)->getType(), CmpTE->getVectorFactor());
14505 Type *CmpTy = CmpInst::makeCmpResultType(VecTy);
14506
// NOTE(review): CostKind is used below but its declaration is not visible in
// this listing (listing line 14507 is absent) - presumably declared there.
// VecCost: one vector compare with the main predicate.
14508 InstructionCost VecCost =
14509 TTI->getCmpSelInstrCost(CmpTE->getOpcode(), VecTy, CmpTy, MainPred,
14510 CostKind, getOperandInfo(CmpTE->getOperand(0)),
14511 getOperandInfo(CmpTE->getOperand(1)));
// BVCost: inserting every lane of the compare result vector plus the cost of
// each scalar compare instruction.
14512 InstructionCost BVCost =
14513 ::getScalarizationOverhead(*TTI, Cmp->getType(), cast<VectorType>(CmpTy),
14514 APInt::getAllOnes(CmpTE->getVectorFactor()),
14515 /*Insert=*/true, /*Extract=*/false, CostKind);
14516 for (Value *V : CmpTE->Scalars) {
14517 auto *I = dyn_cast<Instruction>(V);
14518 if (!I)
14519 continue;
14520 BVCost += TTI->getInstructionCost(I, CostKind);
14521 }
14522 return VecCost < BVCost;
14523}
14524
/// Checks whether the select node \p SelectTE selects between the per-lane bit
/// constants (1, 2, 4, ...) and zero, and is the root of an 'or' reduction, so
/// that select + or-reduction can be replaced by a bitcast of the condition
/// vector to an integer (plus a zext when the integer types differ), and
/// whether that replacement is not more expensive.
///
/// \param SelectTE Select tree entry to analyze (asserted to have the Select
/// opcode).
/// \returns true if the pattern matches and the bitcast-based cost is less
/// than or equal to the select + or-reduction cost.
14525bool BoUpSLP::matchesSelectOfBits(const TreeEntry &SelectTE) const {
14526 assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
14527 "Expected select node.");
// Only little-endian targets are handled.
14528 if (DL->isBigEndian())
14529 return false;
14530 if (!SelectTE.ReorderIndices.empty() || !SelectTE.ReuseShuffleIndices.empty())
14531 return false;
// The select must be the root node (Idx == 0) and UserIgnoreList must be set.
14532 if (!UserIgnoreList || SelectTE.Idx != 0)
14533 return false;
14534 if (any_of(SelectTE.Scalars, [](Value *V) { return !V->hasOneUse(); }))
14535 return false;
14536 // Check that all reduction operands are 'or' instructions.
14537 if (any_of(*UserIgnoreList,
14538 [](Value *V) { return !match(V, m_Or(m_Value(), m_Value())); }))
14539 return false;
14540 const TreeEntry *Op1TE = getOperandEntry(&SelectTE, 1);
14541 const TreeEntry *Op2TE = getOperandEntry(&SelectTE, 2);
14542 if (!Op1TE->isGather() || !Op2TE->isGather())
14543 return false;
14544 // No need to check for zeroes reordering.
// (Op2TE->ReorderIndices is deliberately not checked: Op2 is required to be
// all zeroes below.)
14545 if (!Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
14546 !Op2TE->ReuseShuffleIndices.empty())
14547 return false;
14548 Type *ScalarTy = Op1TE->Scalars.front()->getType();
14549 if (!ScalarTy->isIntegerTy())
14550 return false;
14551 // Check that second operand is all zeroes.
14552 if (any_of(Op2TE->Scalars, [](Value *V) { return !match(V, m_ZeroInt()); }))
14553 return false;
14554 // Check that first operand is 1,2,4,...
// I.e. the constant in lane i must be exactly 2^i.
14555 if (any_of(enumerate(Op1TE->Scalars), [](const auto &P) {
14556 uint64_t V;
14557 return !(match(P.value(), m_ConstantInt(V)) && isPowerOf2_64(V) &&
14558 Log2_64(V) == P.index());
14559 }))
14560 return false;
14561 // Check if bitcast is cheaper than select.
// DstTy is an integer with one bit per vector lane (bit width == vector
// factor).
14562 auto *DstTy = IntegerType::getIntNTy(ScalarTy->getContext(),
14563 SelectTE.getVectorFactor());
14564 VectorType *OpTy = getWidenedType(DstTy, SelectTE.getVectorFactor());
14565 Type *CmpTy = CmpInst::makeCmpResultType(OpTy);
14566 VectorType *VecTy = getWidenedType(ScalarTy, SelectTE.getVectorFactor());
// If the node has a minimized bit width recorded, cost the select and the
// reduction on that narrower element type.
14567 auto It = MinBWs.find(&SelectTE);
14568 if (It != MinBWs.end()) {
14569 auto *EffectiveScalarTy =
14570 IntegerType::get(F->getContext(), It->second.first);
14571 VecTy = getWidenedType(EffectiveScalarTy, SelectTE.getVectorFactor());
14572 }
// NOTE(review): CostKind is used below but its declaration is not visible in
// this listing (listing line 14573 is absent) - presumably declared there.
14574 InstructionCost BitcastCost = TTI->getCastInstrCost(
14575 Instruction::BitCast, DstTy, CmpTy, TTI::CastContextHint::None, CostKind);
// NOTE(review): the zext cost call below is cut off in this listing (listing
// line 14578 is absent); the remaining arguments are not visible here.
14576 if (DstTy != ScalarTy) {
14577 BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, DstTy,
14579 }
14580 FastMathFlags FMF;
// Cost of the original form: vector select + 'or' reduction.
// NOTE(review): one argument line of getCmpSelInstrCost is absent from this
// listing (listing line 14583).
14581 InstructionCost SelectCost =
14582 TTI->getCmpSelInstrCost(Instruction::Select, VecTy, CmpTy,
14584 getOperandInfo(Op1TE->Scalars),
14585 getOperandInfo(Op2TE->Scalars)) +
14586 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF, CostKind);
14587 return BitcastCost <= SelectCost;
14588}
14589
14592 BaseGraphSize = VectorizableTree.size();
14593 // Turn graph transforming mode on and off, when done.
14594 class GraphTransformModeRAAI {
14595 bool &SavedIsGraphTransformMode;
14596
14597 public:
14598 GraphTransformModeRAAI(bool &IsGraphTransformMode)
14599 : SavedIsGraphTransformMode(IsGraphTransformMode) {
14600 IsGraphTransformMode = true;
14601 }
14602 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
14603 } TransformContext(IsGraphTransformMode);
14604 // Operands are profitable if they are:
14605 // 1. At least one constant
14606 // or
14607 // 2. Splats
14608 // or
14609 // 3. Results in good vectorization opportunity, i.e. may generate vector
14610 // nodes and reduce cost of the graph.
14611 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
14612 const InstructionsState &S) {
14614 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
14615 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
14616 I2->getOperand(Op));
14617 return all_of(Candidates, [this](
14618 ArrayRef<std::pair<Value *, Value *>> Cand) {
14619 return all_of(Cand,
14620 [](const std::pair<Value *, Value *> &P) {
14621 return isa<Constant>(P.first) ||
14622 isa<Constant>(P.second) || P.first == P.second;
14623 }) ||
14625 });
14626 };
14627
14628 // Try to reorder gather nodes for better vectorization opportunities.
14629 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
14630 TreeEntry &E = *VectorizableTree[Idx];
14631 if (E.isGather())
14632 reorderGatherNode(E);
14633 }
14634
14635 // Better to use full gathered loads analysis, if there are only 2 gathered
14636 // load nodes, each having fewer than 16 elements.
14637 constexpr unsigned VFLimit = 16;
14638 bool ForceLoadGather =
14639 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14640 return TE->isGather() && TE->hasState() &&
14641 TE->getOpcode() == Instruction::Load &&
14642 TE->getVectorFactor() < VFLimit;
14643 }) == 2;
14644
14645 // Checks if the scalars are used in other node.
14646 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
14647 function_ref<bool(Value *)> CheckContainer) {
14648 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
14649 if (isa<PoisonValue>(V))
14650 return true;
14651 auto *I = dyn_cast<Instruction>(V);
14652 if (!I)
14653 return false;
14654 return is_contained(TE->Scalars, I) || CheckContainer(I);
14655 });
14656 };
14657 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
14658 if (E.hasState()) {
14659 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
14660 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
14661 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
14662 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
14663 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
14664 return is_contained(TEs, TE);
14665 });
14666 });
14667 }))
14668 return true;
14669 ;
14670 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
14671 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
14672 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
14673 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
14674 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
14675 return is_contained(TEs, TE);
14676 });
14677 });
14678 }))
14679 return true;
14680 } else {
14681 // Check if the gather node is a full copy of a split node.
14682 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
14683 if (It != E.Scalars.end()) {
14684 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
14685 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
14686 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
14687 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
14688 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
14689 return is_contained(TEs, TE);
14690 });
14691 });
14692 }))
14693 return true;
14694 }
14695 }
14696 return false;
14697 };
14698 // The tree may grow here, so iterate over nodes, built before.
14699 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
14700 TreeEntry &E = *VectorizableTree[Idx];
14701 if (E.isGather()) {
14702 ArrayRef<Value *> VL = E.Scalars;
14703 const unsigned Sz = getVectorElementSize(VL.front());
14704 unsigned MinVF = getMinVF(2 * Sz);
14705 // Do not try partial vectorization for small nodes (<= 2), nodes with the
14706 // same opcode and same parent block or all constants.
14707 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
14708 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
14709 // We use allSameOpcode instead of isAltShuffle because we don't
14710 // want to use interchangeable instruction here.
14711 !allSameOpcode(VL) || !allSameBlock(VL)) ||
14712 allConstant(VL) || isSplat(VL))
14713 continue;
14714 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
14715 continue;
14716 // Check if the node is a copy of other vector nodes.
14717 if (CheckForSameVectorNodes(E))
14718 continue;
14719 // Try to find vectorizable sequences and transform them into a series of
14720 // insertvector instructions.
14721 unsigned StartIdx = 0;
14722 unsigned End = VL.size();
14723 SmallBitVector Processed(End);
14724 for (unsigned VF = getFloorFullVectorNumberOfElements(
14725 *TTI, VL.front()->getType(), VL.size() - 1);
14726 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
14727 *TTI, VL.front()->getType(), VF - 1)) {
14728 if (StartIdx + VF > End)
14729 continue;
14731 bool AllStrided = true;
14732 // Walk the range in steps of VF, but allow the trailing slice to be
14733 // shorter (SliceVF < VF) so non-power-of-2 tails can be vectorized.
14734 // Processed only records slice starts (Cnt), and downstream consumers
14735 // (test(Cnt), AddCombinedNode's range set/StartIdx update) operate on
14736 // start positions, so partial coverage is consistent.
14737 for (unsigned Cnt = StartIdx; Cnt < End; Cnt += VF) {
14738 const unsigned SliceVF = std::min(VF, End - Cnt);
14739 if (SliceVF <= 1)
14740 continue;
14741 ArrayRef<Value *> Slice = VL.slice(Cnt, SliceVF);
14742 // If any instruction is vectorized already - do not try again.
14743 // Reuse the existing node, if it fully matches the slice.
14744 if ((Processed.test(Cnt) || isVectorized(Slice.front())) &&
14745 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
14746 continue;
14747 // Constant already handled effectively - skip.
14748 if (allConstant(Slice))
14749 continue;
14750 // Do not try to vectorize small splats (less than vector register and
14751 // only with the single non-undef element).
14752 bool IsSplat = isSplat(Slice);
14753 bool IsTwoRegisterSplat = true;
14754 if (IsSplat && VF == 2) {
14755 unsigned NumRegs2VF = ::getNumberOfParts(
14756 *TTI, getWidenedType(getValueType(Slice.front()), 2 * VF),
14757 getValueType(Slice.front()));
14758 IsTwoRegisterSplat = NumRegs2VF == 2;
14759 }
14760 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
14761 count(Slice, Slice.front()) ==
14762 static_cast<long>(isa<UndefValue>(Slice.front()) ? SliceVF - 1
14763 : 1)) {
14764 if (IsSplat)
14765 continue;
14766 InstructionsState S = getSameOpcode(Slice, *TLI);
14767 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
14768 (S.getOpcode() == Instruction::Load &&
14770 continue;
14771 if (VF == 2) {
14772 // Cache the cost check lazily - both branches below may need it.
14773 std::optional<bool> MainOpIsCheap;
14774 auto IsMainOpCheap = [&] {
14775 if (!MainOpIsCheap)
14776 MainOpIsCheap =
14777 TTI->getInstructionCost(S.getMainOp(), CostKind) <
14779 return *MainOpIsCheap;
14780 };
14781 // Try to vectorize reduced values or if all users are vectorized.
14782 // For expensive instructions extra extracts might be profitable.
14783 if ((!UserIgnoreList || E.Idx != 0) && IsMainOpCheap() &&
14784 !all_of(Slice, [&](Value *V) {
14785 if (isa<PoisonValue>(V))
14786 return true;
14787 return areAllUsersVectorized(cast<Instruction>(V),
14788 UserIgnoreList);
14789 }))
14790 continue;
14791 if (S.getOpcode() == Instruction::Load) {
14792 OrdersType Order;
14793 SmallVector<Value *> PointerOps;
14794 StridedPtrInfo SPtrInfo;
14795 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
14796 PointerOps, SPtrInfo);
14797 AllStrided &= Res == LoadsState::StridedVectorize ||
14799 Res == LoadsState::Gather;
14800 // Do not vectorize gathers.
14801 if (Res == LoadsState::ScatterVectorize ||
14802 Res == LoadsState::Gather) {
14803 if (Res == LoadsState::Gather) {
14805 // If reductions and the scalars from the root node are
14806 // analyzed - mark as non-vectorizable reduction.
14807 if (UserIgnoreList && E.Idx == 0)
14808 analyzedReductionVals(Slice);
14809 }
14810 continue;
14811 }
14812 } else if (S.getOpcode() == Instruction::ExtractElement ||
14813 (IsMainOpCheap() &&
14814 !CheckOperandsProfitability(
14815 S.getMainOp(),
14818 S))) {
14819 // Do not vectorize extractelements (handled effectively
14820 // already). Do not vectorize non-profitable instructions (with
14821 // low cost and non-vectorizable operands.)
14822 continue;
14823 }
14824 }
14825 }
14826 Slices.emplace_back(Cnt, Slice.size());
14827 }
14828 // Do not try to vectorize if all slices are strided or gathered with
14829 // vector factor 2 and there are more than 2 slices. Better to handle
14830 // them in gathered loads analysis, may result in better vectorization.
14831 if (VF == 2 && AllStrided && Slices.size() > 2)
14832 continue;
14833 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
14834 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
14835 Processed.set(Cnt, Cnt + Sz);
14836 if (StartIdx == Cnt)
14837 StartIdx = Cnt + Sz;
14838 if (End == Cnt + Sz)
14839 End = Cnt;
14840 };
14841 for (auto [Cnt, Sz] : Slices) {
14842 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
14843 const TreeEntry *SameTE = nullptr;
14844 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
14845 It != Slice.end()) {
14846 // If any instruction is vectorized already - do not try again.
14847 SameTE = getSameValuesTreeEntry(*It, Slice);
14848 }
14849 unsigned PrevSize = VectorizableTree.size();
14850 [[maybe_unused]] unsigned PrevEntriesSize =
14851 LoadEntriesToVectorize.size();
14852 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
14853 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
14854 VectorizableTree[PrevSize]->isGather() &&
14855 VectorizableTree[PrevSize]->hasState() &&
14856 VectorizableTree[PrevSize]->getOpcode() !=
14857 Instruction::ExtractElement &&
14858 !isSplat(Slice)) {
14859 if (UserIgnoreList && E.Idx == 0 && VF == 2)
14860 analyzedReductionVals(Slice);
14861 VectorizableTree.pop_back();
14862 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
14863 "LoadEntriesToVectorize expected to remain the same");
14864 continue;
14865 }
14866 AddCombinedNode(PrevSize, Cnt, Sz);
14867 }
14868 }
14869 // Restore ordering, if no extra vectorization happened.
14870 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
14871 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
14872 reorderScalars(E.Scalars, Mask);
14873 E.ReorderIndices.clear();
14874 }
14875 }
14876 if (!E.hasState())
14877 continue;
14878 switch (E.getOpcode()) {
14879 case Instruction::Load: {
14880 // No need to reorder masked gather loads, just reorder the scalar
14881 // operands.
14882 if (E.State != TreeEntry::Vectorize)
14883 break;
14884 Type *ScalarTy = E.getMainOp()->getType();
14885 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
14886 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
14887 // Check if profitable to represent consecutive load + reverse as strided
14888 // load with stride -1.
14889 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
14890 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
14891 SmallVector<int> Mask;
14892 inversePermutation(E.ReorderIndices, Mask);
14893 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
14894 InstructionCost OriginalVecCost =
14895 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
14896 BaseLI->getPointerAddressSpace(), CostKind,
14898 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
14899 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
14900 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
14901 VecTy, BaseLI->getPointerOperand(),
14902 /*VariableMask=*/false, CommonAlignment,
14903 BaseLI),
14904 CostKind);
14905 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
14906 // Strided load is more profitable than consecutive load + reverse -
14907 // transform the node to strided load.
14908 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
14909 ->getPointerOperand()
14910 ->getType());
14911 StridedPtrInfo SPtrInfo;
14912 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
14913 SPtrInfo.Ty = VecTy;
14914 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
14915 E.State = TreeEntry::StridedVectorize;
14916 }
14917 }
14918 break;
14919 }
14920 case Instruction::Store: {
14921 Type *ScalarTy =
14922 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
14923 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
14924 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
14925 // Check if profitable to represent reverse + consecutive store as strided
14926 // store with stride -1.
14927 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
14928 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
14929 SmallVector<int> Mask;
14930 inversePermutation(E.ReorderIndices, Mask);
14931 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
14932 InstructionCost OriginalVecCost =
14933 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
14934 BaseSI->getPointerAddressSpace(), CostKind,
14936 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
14937 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
14938 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
14939 VecTy, BaseSI->getPointerOperand(),
14940 /*VariableMask=*/false, CommonAlignment,
14941 BaseSI),
14942 CostKind);
14943 if (StridedCost < OriginalVecCost) {
14944 // Strided store is more profitable than reverse + consecutive store -
14945 // transform the node to strided store.
14946 E.State = TreeEntry::StridedVectorize;
14947 Type *StrideTy = DL->getIndexType(cast<StoreInst>(E.Scalars.front())
14948 ->getPointerOperand()
14949 ->getType());
14950 StridedPtrInfo SPtrInfo;
14951 SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, -1);
14952 SPtrInfo.Ty = VecTy;
14953 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
14954 }
14955 } else if (!E.ReorderIndices.empty()) {
14956 // Check for interleaved stores.
14957 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
14958 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
14959 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
14960 if (Mask.size() < 4)
14961 return 0u;
14962 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
14964 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
14965 TTI.isLegalInterleavedAccessType(
14966 VecTy, Factor, BaseSI->getAlign(),
14967 BaseSI->getPointerAddressSpace()))
14968 return Factor;
14969 }
14970
14971 return 0u;
14972 };
14973 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
14974 unsigned InterleaveFactor = IsInterleaveMask(Mask);
14975 if (InterleaveFactor != 0)
14976 E.setInterleave(InterleaveFactor);
14977 }
14978 break;
14979 }
14980 case Instruction::Select: {
14981 if (E.State != TreeEntry::Vectorize)
14982 break;
14983 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
14984 if (MinMaxID != Intrinsic::not_intrinsic) {
14985 // This node is a minmax node.
14986 E.CombinedOp = TreeEntry::MinMax;
14987 TreeEntry *CondEntry = getOperandEntry(&E, 0);
14988 if (SelectOnly && CondEntry->UserTreeIndex &&
14989 CondEntry->State == TreeEntry::Vectorize) {
14990 // The condition node is part of the combined minmax node.
14991 CondEntry->State = TreeEntry::CombinedVectorize;
14992 }
14993 break;
14994 }
14995 // Check for zext + selects, which can be reordered.
14996 SmallVector<unsigned> InversedCmpsIndices;
14997 if (matchesInversedZExtSelect(E, InversedCmpsIndices)) {
14998 auto *CmpTE = getOperandEntry(&E, 0);
14999 auto *Op1TE = getOperandEntry(&E, 1);
15000 auto *Op2TE = getOperandEntry(&E, 2);
15001 // State now is uniform, not alternate opcode.
15002 CmpTE->setOperations(
15003 InstructionsState(CmpTE->getMainOp(), CmpTE->getMainOp()));
15004 // Update mapping between the swapped values and their internal matching
15005 // nodes.
15006 auto UpdateGatherEntry = [&](TreeEntry *OldTE, TreeEntry *NewTE,
15007 Value *V) {
15008 if (isConstant(V))
15009 return;
15010 auto It = ValueToGatherNodes.find(V);
15011 assert(It != ValueToGatherNodes.end() &&
15012 "Expected to find the value in the map.");
15013 auto &C = It->getSecond();
15014 if (!is_contained(OldTE->Scalars, V))
15015 C.remove(OldTE);
15016 C.insert(NewTE);
15017 };
15018 ValueList &Op1 = E.getOperand(1);
15019 ValueList &Op2 = E.getOperand(2);
15020 for (const unsigned Idx : InversedCmpsIndices) {
15021 Value *V1 = Op1TE->Scalars[Idx];
15022 Value *V2 = Op2TE->Scalars[Idx];
15023 std::swap(Op1TE->Scalars[Idx], Op2TE->Scalars[Idx]);
15024 std::swap(Op1[Idx], Op2[Idx]);
15025 UpdateGatherEntry(Op1TE, Op2TE, V1);
15026 UpdateGatherEntry(Op2TE, Op1TE, V2);
15027 }
15028 OperandsToTreeEntry.emplace_or_assign(std::make_pair(&E, 1), Op1TE);
15029 OperandsToTreeEntry.emplace_or_assign(std::make_pair(&E, 2), Op2TE);
15030 // NB: Fallback to check if select can be converted to cmp bitcast.
15031 }
15032 if (matchesSelectOfBits(E)) {
15033 // This node is a (reduced or) cmp bitcast node.
15034 const TreeEntry::CombinedOpcode Code = TreeEntry::ReducedCmpBitcast;
15035 E.CombinedOp = Code;
15036 auto *Op1TE = getOperandEntry(&E, 1);
15037 auto *Op2TE = getOperandEntry(&E, 2);
15038 Op1TE->State = TreeEntry::CombinedVectorize;
15039 Op1TE->CombinedOp = Code;
15040 Op2TE->State = TreeEntry::CombinedVectorize;
15041 Op2TE->CombinedOp = Code;
15042 break;
15043 }
15044 break;
15045 }
15046 case Instruction::FSub:
15047 case Instruction::FAdd: {
15048 // Check if possible to convert (a*b)+c to fma.
15049 if (E.State != TreeEntry::Vectorize ||
15050 !E.getOperations().isAddSubLikeOp() ||
15051 E.getOperations().isAltShuffle())
15052 break;
15053 const TreeEntry *LHS = getOperandEntry(&E, 0);
15054 const TreeEntry *RHS = getOperandEntry(&E, 1);
15055 auto IsOneUseVectorFMulOperand = [](const TreeEntry *TE) {
15056 return TE->State == TreeEntry::Vectorize &&
15057 TE->ReorderIndices.empty() && TE->ReuseShuffleIndices.empty() &&
15058 TE->getOpcode() == Instruction::FMul && !TE->isAltShuffle() &&
15059 all_of(TE->Scalars, [&](Value *V) {
15060 return (TE->hasCopyableElements() &&
15061 TE->isCopyableElement(V)) ||
15062 V->hasOneUse();
15063 });
15064 };
15065 if (!IsOneUseVectorFMulOperand(LHS) &&
15066 (E.getOpcode() == Instruction::FSub ||
15067 !IsOneUseVectorFMulOperand(RHS)))
15068 break;
15069 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
15070 .isValid())
15071 break;
15072 // This node is a fmuladd node.
15073 E.CombinedOp = TreeEntry::FMulAdd;
15074 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
15075 if (FMulEntry->UserTreeIndex &&
15076 FMulEntry->State == TreeEntry::Vectorize) {
15077 // The FMul node is part of the combined fmuladd node.
15078 FMulEntry->State = TreeEntry::CombinedVectorize;
15079 }
15080 break;
15081 }
15082 case Instruction::Shl: {
15083 if (E.Idx != 0 || DL->isBigEndian())
15084 break;
15085 if (!UserIgnoreList)
15086 break;
15087 // Check that all reduction operands are disjoint or instructions.
15088 if (any_of(*UserIgnoreList, [](Value *V) {
15089 return !match(V, m_DisjointOr(m_Value(), m_Value()));
15090 }))
15091 break;
15092 OrdersType Order;
15093 bool IsBSwap;
15094 bool ForLoads;
15095 if (!matchesShlZExt(E, Order, IsBSwap, ForLoads))
15096 break;
15097 // This node is a (reduced disjoint or) bitcast node.
15098 TreeEntry::CombinedOpcode Code =
15099 IsBSwap ? (ForLoads ? TreeEntry::ReducedBitcastBSwapLoads
15100 : TreeEntry::ReducedBitcastBSwap)
15101 : (ForLoads ? TreeEntry::ReducedBitcastLoads
15102 : TreeEntry::ReducedBitcast);
15103 E.CombinedOp = Code;
15104 E.ReorderIndices = std::move(Order);
15105 TreeEntry *ZExtEntry = getOperandEntry(&E, 0);
15106 assert(ZExtEntry->UserTreeIndex &&
15107 ZExtEntry->State == TreeEntry::Vectorize &&
15108 ZExtEntry->getOpcode() == Instruction::ZExt &&
15109 "Expected ZExt node.");
15110 // The ZExt node is part of the combined node.
15111 ZExtEntry->State = TreeEntry::CombinedVectorize;
15112 ZExtEntry->CombinedOp = Code;
15113 if (ForLoads) {
15114 TreeEntry *LoadsEntry = getOperandEntry(ZExtEntry, 0);
15115 assert(LoadsEntry->UserTreeIndex &&
15116 LoadsEntry->State == TreeEntry::Vectorize &&
15117 LoadsEntry->getOpcode() == Instruction::Load &&
15118 "Expected Load node.");
15119 // The Load node is part of the combined node.
15120 LoadsEntry->State = TreeEntry::CombinedVectorize;
15121 LoadsEntry->CombinedOp = Code;
15122 }
15123 TreeEntry *ConstEntry = getOperandEntry(&E, 1);
15124 assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
15125 "Expected ZExt node.");
15126 // The ConstNode node is part of the combined node.
15127 ConstEntry->State = TreeEntry::CombinedVectorize;
15128 ConstEntry->CombinedOp = Code;
15129 break;
15130 }
15131 default:
15132 break;
15133 }
15134 }
15135
15136 if (LoadEntriesToVectorize.empty()) {
15137 // Single load node - exit.
15138 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
15139 VectorizableTree.front()->getOpcode() == Instruction::Load)
15140 return;
15141 // Small graph with small VF - exit.
15142 constexpr unsigned SmallTree = 3;
15143 constexpr unsigned SmallVF = 2;
15144 if ((VectorizableTree.size() <= SmallTree &&
15145 VectorizableTree.front()->Scalars.size() == SmallVF) ||
15146 (VectorizableTree.size() <= 2 && UserIgnoreList))
15147 return;
15148
15149 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15150 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
15151 getCanonicalGraphSize() <= SmallTree &&
15152 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15153 [](const std::unique_ptr<TreeEntry> &TE) {
15154 return TE->isGather() && TE->hasState() &&
15155 TE->getOpcode() == Instruction::Load &&
15156 !allSameBlock(TE->Scalars);
15157 }) == 1)
15158 return;
15159 }
15160
15161 // A list of loads to be gathered during the vectorization process. We can
15162 // try to vectorize them at the end, if profitable.
15163 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
15165 GatheredLoads;
15166
15167 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15168 TreeEntry &E = *TE;
15169 if (E.isGather() &&
15170 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
15171 (!E.hasState() && any_of(E.Scalars,
15172 [&](Value *V) {
15173 return isa<LoadInst>(V) &&
15174 !isVectorized(V) &&
15175 !isDeleted(cast<Instruction>(V));
15176 }))) &&
15177 !isSplat(E.Scalars)) {
15178 for (Value *V : E.Scalars) {
15179 auto *LI = dyn_cast<LoadInst>(V);
15180 if (!LI)
15181 continue;
15182 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
15183 continue;
15185 *this, V, *DL, *SE, *TTI,
15186 GatheredLoads[std::make_tuple(
15187 LI->getParent(),
15188 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
15189 LI->getType())]);
15190 }
15191 }
15192 }
15193 // Try to vectorize gathered loads if this is not just a gather of loads.
15194 if (!GatheredLoads.empty())
15195 tryToVectorizeGatheredLoads(GatheredLoads);
15196}
15197
15198/// Merges shuffle masks and emits final shuffle instruction, if required. It
15199/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
15200/// when the actual shuffle instruction is generated only if this is actually
15201/// required. Otherwise, the shuffle instruction emission is delayed till the
15202/// end of the process, to reduce the number of emitted instructions and further
15203/// analysis/transformations.
15204class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
15205 bool IsFinalized = false;
15206 SmallVector<int> CommonMask;
15208 /// Captures the original scalar VL of a single, "clean" gather() call so
15209 /// the values can be forwarded as the Args operand to getShuffleCost() for
15210 /// the final permutation in finalize(). This lets the target cost model
15211 /// recognize patterns such as broadcast-of-load (e.g. on X86,
15212 /// vbroadcast{ss,sd} folds the broadcast and the load into one instruction
15213 /// under AVX/AVX2 and is reported as TCC_Free by getShuffleCost). The
15214 /// state machine is:
15215 /// * engaged + empty: tracking active, no qualifying gather seen yet.
15216 /// * engaged + non-empty: exactly one qualifying gather observed and its
15217 /// VL still corresponds to InVectors.front().
15218 /// * disengaged: the cached VL is no longer trustworthy (multiple
15219 /// gather() calls, or a state-mutating add() happened).
15220 std::optional<SmallVector<Value *>> BVValues = SmallVector<Value *>();
15221 const TargetTransformInfo &TTI;
15222 InstructionCost Cost = 0;
15223 SmallDenseSet<Value *> VectorizedVals;
15224 BoUpSLP &R;
15225 SmallPtrSetImpl<Value *> &CheckedExtracts;
15226 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15227 /// While set, still trying to estimate the cost for the same nodes and we
15228 /// can delay actual cost estimation (virtual shuffle instruction emission).
15229 /// May help better estimate the cost if same nodes must be permuted + allows
15230 /// to move most of the long shuffles cost estimation to TTI.
15231 bool SameNodesEstimated = true;
15232
/// Builds an all-ones constant of type \p Ty.
/// Non-pointer types are forwarded directly to Constant::getAllOnesValue().
/// Types whose scalar type is a pointer take a separate path: a scalar
/// constant `Res` is built using an integer type sized by
/// DL.getTypeStoreSizeInBits() of the scalar pointer type together with the
/// scalar pointer type itself, and is splatted when \p Ty is a vector type.
15233 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
15234 if (Ty->getScalarType()->isPointerTy()) {
// NOTE(review): listing lines 15235-15236, which declare `Res` and open its
// initializer, are absent from this excerpt; presumably an all-ones integer
// is converted to the pointer type - confirm against the full file.
15237 IntegerType::get(Ty->getContext(),
15238 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
15239 Ty->getScalarType());
// Vector of pointers: replicate the scalar constant across every lane.
15240 if (auto *VTy = dyn_cast<VectorType>(Ty))
15241 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
15242 return Res;
15243 }
15244 return Constant::getAllOnesValue(Ty);
15245 }
15246
/// Estimates the cost of materializing the scalars \p VL as a vector.
/// \param VL   scalars to place into the vector lanes.
/// \param Root optional value; the constant and splat shortcuts below are only
///             taken when it is null.
/// \returns TTI::TCC_Free when \p VL is all-undef, or all-constant with no
///          \p Root; a dedicated insert(+shuffle) cost for a splat with no
///          \p Root; otherwise the generic gather cost.
15247 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
15248 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
15249 return TTI::TCC_Free;
15250 auto *VecTy = getWidenedType(ScalarTy, VL.size());
// GatherCost is never modified in the visible code, so it contributes 0 to the
// final sum; Gathers is an unmodified copy of VL.
15251 InstructionCost GatherCost = 0;
15252 SmallVector<Value *> Gathers(VL);
15253 if (!Root && isSplat(VL)) {
15254 // Found the broadcasting of the single scalar, calculate the cost as
15255 // the broadcast.
15256 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
15257 assert(It != VL.end() && "Expected at least one non-undef value.");
15258 // Add broadcast for non-identity shuffle only.
// A shuffle is needed only when the splatted value occurs more than once and
// either it is not in lane 0 or some lane after lane 0 is not undef.
15259 bool NeedShuffle =
15260 count(VL, *It) > 1 &&
15261 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
15262 if (!NeedShuffle) {
// Single occupied lane: cost is one insertion at the position of *It. When
// ScalarTy is itself a fixed vector the insertion is modelled as an
// SK_InsertSubvector shuffle at the scaled lane offset.
15263 if (isa<FixedVectorType>(ScalarTy)) {
15264 assert(SLPReVec && "FixedVectorType is not expected.");
15265 return TTI.getShuffleCost(
15266 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
15267 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
15268 cast<FixedVectorType>(ScalarTy));
15269 }
15270 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
15271 CostKind, std::distance(VL.begin(), It),
15272 PoisonValue::get(VecTy), *It);
15273 }
15274
// Real broadcast: insert into lane 0, then shuffle with a mask that selects
// lane 0 for every non-poison element and leaves poison elements as poison.
15275 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
15276 transform(VL, ShuffleMask.begin(), [](Value *V) {
15277 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
15278 });
15279 InstructionCost InsertCost =
15280 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
15281 PoisonValue::get(VecTy), *It);
// NOTE(review): listing line 15283 (the shuffle-kind argument) is absent from
// this excerpt; presumably a broadcast kind - confirm against the full file.
15282 return InsertCost + ::getShuffleCost(TTI,
15284 VecTy, ShuffleMask, CostKind,
15285 /*Index=*/0, /*SubTp=*/nullptr,
15286 /*Args=*/*It);
15287 }
// Generic path. NOTE(review): listing line 15290 (the value used when every
// element of Gathers is undef) is absent from this excerpt - confirm.
15288 return GatherCost +
15289 (all_of(Gathers, IsaPred<UndefValue>)
15291 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
15292 ScalarTy));
15293 };
15294
15295 /// Compute the cost of creating a vector containing the extracted values from
15296 /// \p VL.
/// \param VL           scalars; only ExtractElementInst entries with a fixed
///                     vector operand contribute to the source width.
/// \param Mask         shuffle mask, consumed in slices of EltsPerVector.
/// \param ShuffleKinds per-part shuffle kind; parts without a value are
///                     skipped.
/// \param NumParts     number of parts \p VL is split into.
/// \returns the accumulated shuffle/extract cost.
// NOTE(review): listing line 15297 (the return type) is absent from this
// excerpt; the body returns a local InstructionCost.
15298 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
15299 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
15300 unsigned NumParts) {
15301 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
// NumElts: the largest element count among the fixed-vector operands of the
// extractelement instructions in VL (0 if there are none).
15302 unsigned NumElts = accumulate(VL, 0, [](unsigned Sz, Value *V) {
15303 auto *EE = dyn_cast<ExtractElementInst>(V);
15304 if (!EE)
15305 return Sz;
15306 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
15307 if (!VecTy)
15308 return Sz;
15309 return std::max(Sz, VecTy->getNumElements());
15310 });
15311 // FIXME: this must be moved to TTI for better estimation.
15312 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
// Per-part helper. Returns std::nullopt when the source is no wider than one
// part or when the part's mask touches more than two register-sized chunks.
// Otherwise it rewrites Mask in place to chunk-relative indices (second chunk
// offset by EltsPerVector) and records chunk start indices / sizes.
// NOTE(review): listing line 15314 (a parameter, apparently the `Indices`
// container used below) and line 15329 (the declaration of `ShuffleKind`) are
// absent from this excerpt - confirm against the full file.
15313 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
15315 SmallVectorImpl<unsigned> &SubVecSizes)
15316 -> std::optional<TTI::ShuffleKind> {
15317 if (NumElts <= EltsPerVector)
15318 return std::nullopt;
// Smallest non-poison mask index, aligned down to a chunk boundary.
15319 int OffsetReg0 = alignDown(accumulate(Mask, INT_MAX,
15320 [](int S, int I) {
15321 if (I == PoisonMaskElem)
15322 return S;
15323 return std::min(S, I);
15324 }),
15325 EltsPerVector);
15326 int OffsetReg1 = OffsetReg0;
15327 DenseSet<int> RegIndices;
15328 // Check that if trying to permute same single/2 input vectors.
15330 int FirstRegId = -1;
15331 Indices.assign(1, OffsetReg0);
15332 for (auto [Pos, I] : enumerate(Mask)) {
15333 if (I == PoisonMaskElem)
15334 continue;
15335 int Idx = I - OffsetReg0;
15336 int RegId =
15337 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
15338 if (FirstRegId < 0)
15339 FirstRegId = RegId;
15340 RegIndices.insert(RegId);
15341 if (RegIndices.size() > 2)
15342 return std::nullopt;
15343 if (RegIndices.size() == 2) {
15344 ShuffleKind = TTI::SK_PermuteTwoSrc;
// First time a second chunk is seen: compute its aligned offset from the
// remaining mask elements that do not belong to the first chunk.
15345 if (Indices.size() == 1) {
15346 OffsetReg1 = alignDown(
15347 std::accumulate(
15348 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
15349 [&](int S, int I) {
15350 if (I == PoisonMaskElem)
15351 return S;
15352 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
15353 ((I - OffsetReg0) % NumElts) / EltsPerVector;
15354 if (RegId == FirstRegId)
15355 return S;
15356 return std::min(S, I);
15357 }),
15358 EltsPerVector);
15359 unsigned Index = OffsetReg1 % NumElts;
15360 Indices.push_back(Index);
15361 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
15362 }
15363 Idx = I - OffsetReg1;
15364 }
// In-place rewrite of the mask element (I is a reference into Mask).
15365 I = (Idx % NumElts) % EltsPerVector +
15366 (RegId == FirstRegId ? 0 : EltsPerVector);
15367 }
15368 return ShuffleKind;
15369 };
15370 InstructionCost Cost = 0;
15371
15372 // Process extracts in blocks of EltsPerVector to check if the source vector
15373 // operand can be re-used directly. If not, add the cost of creating a
15374 // shuffle to extract the values into a vector register.
15375 for (unsigned Part : seq<unsigned>(NumParts)) {
15376 if (!ShuffleKinds[Part])
15377 continue;
15378 ArrayRef<int> MaskSlice = Mask.slice(
15379 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
15380 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
15381 copy(MaskSlice, SubMask.begin());
// NOTE(review): listing line 15382 (declaration of `Indices`) is absent from
// this excerpt.
15383 SmallVector<unsigned, 2> SubVecSizes;
15384 std::optional<TTI::ShuffleKind> RegShuffleKind =
15385 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
15386 if (!RegShuffleKind) {
// No per-register decomposition: charge the whole-part shuffle on the full
// source width. NOTE(review): listing line 15388 (the start of the second
// operand of `||`) is absent from this excerpt; presumably a negated
// identity-mask check - confirm.
15387 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
15389 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
15390 Cost +=
15391 ::getShuffleCost(TTI, *ShuffleKinds[Part],
15392 getWidenedType(ScalarTy, NumElts), MaskSlice);
15393 continue;
15394 }
// Per-register shuffle; a single-source identity is not charged.
15395 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
15396 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
15397 Cost +=
15398 ::getShuffleCost(TTI, *RegShuffleKind,
15399 getWidenedType(ScalarTy, EltsPerVector), SubMask);
15400 }
15401 const unsigned BaseVF = getFullVectorNumberOfElements(
15402 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
// NOTE(review): zip() stops at the shorter range; Indices starts with one
// entry while SubVecSizes starts empty, so the first chunk index is only
// visited when a second chunk was recorded - confirm this pairing is intended.
// Listing line 15406 (the head of the call whose arguments follow) is absent;
// the assert message suggests an SK_ExtractSubvector cost is added - confirm.
15403 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
15404 assert((Idx + SubVecSize) <= BaseVF &&
15405 "SK_ExtractSubvector index out of range");
15407 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
15408 Idx, getWidenedType(ScalarTy, SubVecSize));
15409 }
15410 // Second attempt to check, if just a permute is better estimated than
15411 // subvector extract.
15412 SubMask.assign(NumElts, PoisonMaskElem);
15413 copy(MaskSlice, SubMask.begin());
15414 InstructionCost OriginalCost = ::getShuffleCost(
15415 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
// NOTE(review): Cost is the running total over all parts processed so far, not
// only this part's contribution, so this may discard earlier parts' cost when
// NumParts > 1 - confirm the comparison is intended.
15416 if (OriginalCost < Cost)
15417 Cost = OriginalCost;
15418 }
15419 return Cost;
15420 }
15421 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
15422 /// mask \p Mask, register number \p Part, that includes \p SliceSize
15423 /// elements.
/// While SameNodesEstimated is set and the pending inputs in InVectors are the
/// same nodes, no cost is added: the slice of \p Mask for \p Part is only
/// recorded in CommonMask so it can be costed later in one go. Otherwise the
/// pending shuffle is costed first and the new nodes are then combined with
/// the result. Updates Cost, CommonMask, InVectors (through createShuffle)
/// and SameNodesEstimated.
15424 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
15425 ArrayRef<int> Mask, unsigned Part,
15426 unsigned SliceSize) {
15427 if (SameNodesEstimated) {
15428 // Delay the cost estimation if the same nodes are reshuffling.
15429 // If we already requested the cost of reshuffling of E1 and E2 before, no
15430 // need to estimate another cost with the sub-Mask, instead include this
15431 // sub-Mask into the CommonMask to estimate it later and avoid double cost
15432 // estimation.
// NOTE(review): cast<> is applied to InVectors entries unconditionally here,
// so this path relies on them holding TreeEntry pointers - confirm callers
// guarantee that while SameNodesEstimated is true.
15433 if ((InVectors.size() == 2 &&
15434 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
15435 cast<const TreeEntry *>(InVectors.back()) == E2) ||
15436 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
15437 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
15438 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
15439 [](int Idx) { return Idx == PoisonMaskElem; }) &&
15440 "Expected all poisoned elements.");
15441 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
15442 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
15443 return;
15444 }
15445 // Found non-matching nodes - need to estimate the cost for the matched
15446 // and transform mask.
15447 Cost += createShuffle(InVectors.front(),
15448 InVectors.size() == 1 ? nullptr : InVectors.back(),
15449 CommonMask);
15450 transformMaskAfterShuffle(CommonMask, CommonMask);
15451 } else if (InVectors.size() == 2) {
// Two pending inputs: fold them into one before adding the new nodes.
15452 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
15453 transformMaskAfterShuffle(CommonMask, CommonMask);
15454 }
15455 SameNodesEstimated = false;
15456 if (!E2 && InVectors.size() == 1) {
// Single new node: treat it as the second shuffle operand next to the pending
// input. VF is the larger of E1's factor and the pending input's factor.
15457 unsigned VF = E1.getVectorFactor();
15458 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
15459 VF = std::max(VF, getVF(V1));
15460 } else {
15461 const auto *E = cast<const TreeEntry *>(InVectors.front());
15462 VF = std::max(VF, E->getVectorFactor());
15463 }
// Lanes not yet defined in CommonMask take their element from E1, hence the
// +VF offset into the second operand.
15464 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
15465 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
15466 CommonMask[Idx] = Mask[Idx] + VF;
15467 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
15468 transformMaskAfterShuffle(CommonMask, CommonMask);
15469 } else {
// P must be saved before the next call: createShuffle() overwrites
// InVectors.front() with a placeholder constant.
15470 auto P = InVectors.front();
15471 Cost += createShuffle(&E1, E2, Mask);
15472 unsigned VF = Mask.size();
15473 if (Value *V1 = dyn_cast<Value *>(P)) {
15474 VF = std::max(VF,
15475 getNumElements(V1->getType()));
15476 } else {
15477 const auto *E = cast<const TreeEntry *>(P);
15478 VF = std::max(VF, E->getVectorFactor());
15479 }
// NOTE(review): InVectors.front() was already read above, so InVectors.empty()
// looks like it can never be true at this point - confirm.
15480 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
15481 if (Mask[Idx] != PoisonMaskElem)
15482 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
15483 Cost += createShuffle(P, InVectors.front(), CommonMask);
15484 transformMaskAfterShuffle(CommonMask, CommonMask);
15485 }
15486 }
15487
/// Builder object handed to BaseShuffleAnalysis::createShuffle by the
/// enclosing estimator. Instead of emitting instructions, each create*
/// method returns an InstructionCost obtained from TTI (or TCC_Free).
15488 class ShuffleCostBuilder {
15489 const TargetTransformInfo &TTI;
15490
/// True when \p Mask is empty, or has exactly \p VF elements and passes the
/// check on the elided lines with Index == 0.
// NOTE(review): listing lines 15495-15496 are absent from this excerpt;
// presumably an identity / extract-subvector mask test that sets `Index` -
// confirm against the full file.
15491 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
15492 int Index = -1;
15493 return Mask.empty() ||
15494 (VF == Mask.size() &&
15497 Index == 0);
15498 }
15499
15500 public:
15501 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
15502 ~ShuffleCostBuilder() = default;
/// Cost of a two-source shuffle. Only \p V1's type is consulted; the second
/// operand is ignored. Empty/identity masks are free, anything else is
/// costed as SK_PermuteTwoSrc.
15503 InstructionCost createShuffleVector(Value *V1, Value *,
15504 ArrayRef<int> Mask) const {
15505 // Empty mask or identity mask are free.
15506 unsigned VF =
15507 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
15508 if (isEmptyOrIdentity(Mask, VF))
15509 return TTI::TCC_Free;
15510 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
15511 cast<VectorType>(V1->getType()), Mask);
15512 }
/// Cost of a single-source shuffle of \p V1; \p VL is forwarded as the last
/// argument of ::getShuffleCost. Empty/identity masks are free.
// NOTE(review): listing line 15521 (the leading arguments, including the
// shuffle kind) is absent from this excerpt - confirm against the full file.
15513 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask,
15514 ArrayRef<Value *> VL) const {
15515 // Empty mask or identity mask are free.
15516 unsigned VF =
15517 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
15518 if (isEmptyOrIdentity(Mask, VF))
15519 return TTI::TCC_Free;
15520 return ::getShuffleCost(
15522 TTI::TCK_RecipThroughput, /*Index=*/0, /*SubTp=*/nullptr, VL);
15523 }
/// Identity and poison values cost nothing in this model.
15524 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
15525 InstructionCost createPoison(Type *Ty, unsigned VF) const {
15526 return TTI::TCC_Free;
15527 }
/// Intentionally a no-op: nothing is resized when only costs are computed.
15528 void resizeToMatch(Value *&, Value *&) const {}
15529 };
15530
15531 /// Smart shuffle instruction emission, walks through shuffles trees and
15532 /// tries to find the best matching vector for the actual shuffle
15533 /// instruction.
/// In this cost-only variant each operand is either a vector Value or a
/// TreeEntry. The function (1) remaps the mask and picks a common vector
/// factor depending on which kinds of operands were passed, (2) adds the
/// cost of any cast needed to bring an operand to ScalarTy, (3) replaces
/// tree-entry operands by placeholder constants of the widened type, and
/// (4) delegates to BaseShuffleAnalysis::createShuffle with a
/// ShuffleCostBuilder.
/// Side effect: InVectors.front() is overwritten with a placeholder constant
/// and a second InVectors entry, if any, is dropped.
/// \returns the cast cost plus the shuffle cost.
// NOTE(review): listing lines 15534 (return type) and 15536 (the declaration
// of parameter `P2`) are absent from this excerpt; the body returns an
// InstructionCost expression and uses P2 as a PointerUnion like P1.
15535 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
15537 ArrayRef<int> Mask, ArrayRef<Value *> VL = {}) {
15538 ShuffleCostBuilder Builder(TTI);
// Local working copy; note that it shadows the class member CommonMask.
15539 SmallVector<int> CommonMask(Mask);
15540 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
15541 unsigned CommonVF = Mask.size();
15542 InstructionCost ExtraCost = 0;
// Cast cost for a tree entry whose scalar type (narrowed via R.MinBWs when an
// entry exists there) differs from ScalarTy. Free for all-constant gathers or
// when the types already match.
15543 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
15544 unsigned VF) -> InstructionCost {
15545 if (E.isGather() && allConstant(E.Scalars))
15546 return TTI::TCC_Free;
15547 Type *EScalarTy = E.Scalars.front()->getType();
15548 bool IsSigned = true;
15549 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
15550 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
15551 IsSigned = It->second.second;
15552 }
15553 if (EScalarTy != ScalarTy) {
// Trunc unless the destination is wider, in which case sign/zero extend.
15554 unsigned CastOpcode = Instruction::Trunc;
15555 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
15556 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
15557 if (DstSz > SrcSz)
15558 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
15559 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
15560 getWidenedType(EScalarTy, VF),
15561 TTI::CastContextHint::None, CostKind);
15562 }
15563 return TTI::TCC_Free;
15564 };
// Same as above for an already-materialized vector value; signedness is
// derived from isKnownNonNegative(). Constants are free.
15565 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
15566 if (isa<Constant>(V))
15567 return TTI::TCC_Free;
15568 auto *VecTy = cast<VectorType>(V->getType());
15569 Type *EScalarTy = VecTy->getElementType();
15570 if (EScalarTy != ScalarTy) {
15571 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
15572 unsigned CastOpcode = Instruction::Trunc;
15573 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
15574 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
15575 if (DstSz > SrcSz)
15576 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
15577 return TTI.getCastInstrCost(
15578 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
15579 VecTy, TTI::CastContextHint::None, CostKind);
15580 }
15581 return TTI::TCC_Free;
15582 };
// Case 1: both operands are tree entries.
15583 if (!V1 && !V2 && !P2.isNull()) {
15584 // Shuffle 2 entry nodes.
15585 const TreeEntry *E = cast<const TreeEntry *>(P1);
15586 unsigned VF = E->getVectorFactor();
15587 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
15588 CommonVF = std::max(VF, E2->getVectorFactor());
15589 assert(all_of(Mask,
15590 [=](int Idx) {
15591 return Idx < 2 * static_cast<int>(CommonVF);
15592 }) &&
15593 "All elements in mask must be less than 2 * CommonVF.");
15594 if (E->Scalars.size() == E2->Scalars.size()) {
// Same scalar count: compose the mask with each entry's own common mask and
// index the second operand relative to the scalar count.
15595 SmallVector<int> EMask = E->getCommonMask();
15596 SmallVector<int> E2Mask = E2->getCommonMask();
15597 if (!EMask.empty() || !E2Mask.empty()) {
15598 for (int &Idx : CommonMask) {
15599 if (Idx == PoisonMaskElem)
15600 continue;
15601 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
15602 Idx = EMask[Idx];
15603 else if (Idx >= static_cast<int>(CommonVF))
15604 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
15605 E->Scalars.size();
15606 }
15607 }
15608 CommonVF = E->Scalars.size();
15609 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
15610 GetNodeMinBWAffectedCost(*E2, CommonVF);
15611 } else {
15612 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
15613 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
15614 }
// Placeholder operands: a null constant and an all-ones constant of the
// widened type stand in for the two entries (presumably chosen to be distinct
// values - TODO confirm).
15615 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
15616 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
// Case 2: a single tree entry.
15617 } else if (!V1 && P2.isNull()) {
15618 // Shuffle single entry node.
15619 const TreeEntry *E = cast<const TreeEntry *>(P1);
15620 unsigned VF = E->getVectorFactor();
15621 CommonVF = VF;
15622 assert(
15623 all_of(Mask,
15624 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
15625 "All elements in mask must be less than CommonVF.");
15626 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
15627 SmallVector<int> EMask = E->getCommonMask();
15628 assert(!EMask.empty() && "Expected non-empty common mask.");
15629 for (int &Idx : CommonMask) {
15630 if (Idx != PoisonMaskElem)
15631 Idx = EMask[Idx];
15632 }
15633 CommonVF = E->Scalars.size();
// NOTE(review): listing line 15636 (part of this condition, the call that
// takes `Factor` as its last argument) is absent from this excerpt; presumably
// a de-interleave mask check - confirm against the full file.
15634 } else if (unsigned Factor = E->getInterleaveFactor();
15635 Factor > 0 && E->Scalars.size() != Mask.size() &&
15637 Factor)) {
15638 // Deinterleaved nodes are free.
15639 std::iota(CommonMask.begin(), CommonMask.end(), 0);
15640 }
15641 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
15642 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
15643 // Not identity/broadcast? Try to see if the original vector is better.
// Applies only when the entry is reordered, sizes agree, the mask is not an
// identity and not a pure lane-0 broadcast; the inverse of the entry's
// reorder is then folded into the mask.
15644 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
15645 CommonVF == CommonMask.size() &&
15646 any_of(enumerate(CommonMask),
15647 [](const auto &&P) {
15648 return P.value() != PoisonMaskElem &&
15649 static_cast<unsigned>(P.value()) != P.index();
15650 }) &&
15651 any_of(CommonMask,
15652 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
15653 SmallVector<int> ReorderMask;
15654 inversePermutation(E->ReorderIndices, ReorderMask);
15655 ::addMask(CommonMask, ReorderMask);
15656 }
// Case 3: a single vector value.
15657 } else if (V1 && P2.isNull()) {
15658 // Shuffle single vector.
15659 ExtraCost += GetValueMinBWAffectedCost(V1);
15660 CommonVF = getVF(V1);
15661 assert(
15662 all_of(Mask,
15663 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
15664 "All elements in mask must be less than CommonVF.");
// Case 4: first operand is a vector value, second is a tree entry.
15665 } else if (V1 && !V2) {
15666 // Shuffle vector and tree node.
15667 unsigned VF = getVF(V1);
15668 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
15669 CommonVF = std::max(VF, E2->getVectorFactor());
15670 assert(all_of(Mask,
15671 [=](int Idx) {
15672 return Idx < 2 * static_cast<int>(CommonVF);
15673 }) &&
15674 "All elements in mask must be less than 2 * CommonVF.");
15675 if (E2->Scalars.size() == VF && VF != CommonVF) {
15676 SmallVector<int> E2Mask = E2->getCommonMask();
15677 assert(!E2Mask.empty() && "Expected non-empty common mask.");
15678 for (int &Idx : CommonMask) {
15679 if (Idx == PoisonMaskElem)
15680 continue;
15681 if (Idx >= static_cast<int>(CommonVF))
15682 Idx = E2Mask[Idx - CommonVF] + VF;
15683 }
15684 CommonVF = VF;
15685 }
15686 ExtraCost += GetValueMinBWAffectedCost(V1);
15687 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
15688 ExtraCost += GetNodeMinBWAffectedCost(
15689 *E2, std::min(CommonVF, E2->getVectorFactor()));
15690 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
// Case 5: first operand is a tree entry, second is a vector value.
15691 } else if (!V1 && V2) {
15692 // Shuffle vector and tree node.
15693 unsigned VF = getVF(V2);
15694 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
15695 CommonVF = std::max(VF, E1->getVectorFactor());
15696 assert(all_of(Mask,
15697 [=](int Idx) {
15698 return Idx < 2 * static_cast<int>(CommonVF);
15699 }) &&
15700 "All elements in mask must be less than 2 * CommonVF.");
15701 if (E1->Scalars.size() == VF && VF != CommonVF) {
15702 SmallVector<int> E1Mask = E1->getCommonMask();
15703 assert(!E1Mask.empty() && "Expected non-empty common mask.");
// NOTE(review): lanes >= CommonVF select from the second operand (V2) but are
// remapped through E1Mask here - confirm this is intended.
15704 for (int &Idx : CommonMask) {
15705 if (Idx == PoisonMaskElem)
15706 continue;
15707 if (Idx >= static_cast<int>(CommonVF))
15708 Idx = E1Mask[Idx - CommonVF] + VF;
15709 else
15710 Idx = E1Mask[Idx];
15711 }
15712 CommonVF = VF;
15713 }
15714 ExtraCost += GetNodeMinBWAffectedCost(
15715 *E1, std::min(CommonVF, E1->getVectorFactor()));
15716 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
15717 ExtraCost += GetValueMinBWAffectedCost(V2);
15718 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
// Case 6: both operands are vector values.
15719 } else {
15720 assert(V1 && V2 && "Expected both vectors.");
15721 unsigned VF = getVF(V1);
15722 CommonVF = std::max(VF, getVF(V2));
15723 assert(all_of(Mask,
15724 [=](int Idx) {
15725 return Idx < 2 * static_cast<int>(CommonVF);
15726 }) &&
15727 "All elements in mask must be less than 2 * CommonVF.");
15728 ExtraCost +=
15729 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
// Operands are replaced by placeholders when their types disagree with each
// other, or individually when their element type is not ScalarTy.
15730 if (V1->getType() != V2->getType()) {
15731 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
15732 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
15733 } else {
15734 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
15735 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
15736 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
15737 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
15738 }
15739 }
// The result of this shuffle becomes the single pending input: a placeholder
// constant with one lane per element of the (local) mask.
15740 InVectors.front() =
15741 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
15742 if (InVectors.size() == 2)
15743 InVectors.pop_back();
15744 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
15745 V1, V2, CommonMask, Builder, ScalarTy, VL);
15746 }
15747
15748public:
// Constructor (continuation). Forwards ScalarTy to BaseShuffleAnalysis, binds
// the TTI, R and CheckedExtracts references, and copies the VectorizedVals
// range into the member set of the same name.
// NOTE(review): listing line 15749, which carries the constructor name and
// its leading parameters (ScalarTy and TTI are used below), is absent from
// this excerpt - confirm against the full file.
15750 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
15751 SmallPtrSetImpl<Value *> &CheckedExtracts)
15752 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
15753 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
15754 CheckedExtracts(CheckedExtracts) {}
/// Adjusts the running Cost for a node \p E whose scalars are extractelement
/// instructions that can be expressed as a shuffle of their source vectors.
/// \param E            node whose (reordered) scalars are inspected.
/// \param Mask         shuffle mask; poison lanes are skipped.
/// \param ShuffleKinds per-part shuffle kinds, forwarded to
///                     computeExtractCost().
/// \param NumParts     number of parts the scalars are split into.
/// \param UseVecBaseAsInput set to true only when there are several parts and
///                     more than one distinct source vector.
/// \returns nullptr for an empty \p Mask; otherwise the vector operand of the
///          last inspected extractelement, or a null-constant placeholder
///          when UseVecBaseAsInput is set.
/// Side effects: updates Cost, CheckedExtracts, InVectors, CommonMask and
/// clears SameNodesEstimated.
15755 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
15756 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
15757 unsigned NumParts, bool &UseVecBaseAsInput) {
15758 UseVecBaseAsInput = false;
15759 if (Mask.empty())
15760 return nullptr;
15761 Value *VecBase = nullptr;
// Work on a copy of the scalars with the node's reordering applied.
15762 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
15763 if (!E->ReorderIndices.empty()) {
15764 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
15765 E->ReorderIndices.end());
15766 reorderScalars(VL, ReorderMask);
15767 }
15768 // Check if it can be considered reused if same extractelements were
15769 // vectorized already.
// Only tree entries with an index below E->Idx are searched; a match is an
// ExtractElement (non-alternate) or gather entry whose scalars agree with VL
// in every lane that is neither poison in Mask nor undef in VL.
15770 bool PrevNodeFound = any_of(
15771 ArrayRef(R.VectorizableTree).take_front(E->Idx),
15772 [&](const std::unique_ptr<TreeEntry> &TE) {
15773 return ((TE->hasState() && !TE->isAltShuffle() &&
15774 TE->getOpcode() == Instruction::ExtractElement) ||
15775 TE->isGather()) &&
15776 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
15777 return VL.size() > Data.index() &&
15778 (Mask[Data.index()] == PoisonMaskElem ||
15779 isa<UndefValue>(VL[Data.index()]) ||
15780 Data.value() == VL[Data.index()]);
15781 });
15782 });
15783 SmallPtrSet<Value *, 4> UniqueBases;
15784 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
// Per source vector: the set of lanes whose extracts are expected to go away.
15785 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
15786 for (unsigned Part : seq<unsigned>(NumParts)) {
15787 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
15788 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
15789 for (auto [I, V] :
15790 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
15791 // Ignore non-extractelement scalars.
15792 if (isa<UndefValue>(V) ||
15793 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
15794 continue;
15795 // If all users of instruction are going to be vectorized and this
15796 // instruction itself is not going to be vectorized, consider this
15797 // instruction as dead and remove its cost from the final cost of the
15798 // vectorized tree.
15799 // Also, avoid adjusting the cost for extractelements with multiple uses
15800 // in different graph entries.
15801 auto *EE = cast<ExtractElementInst>(V);
15802 VecBase = EE->getVectorOperand();
15803 UniqueBases.insert(VecBase);
15804 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
// No credit is taken when: the extract was already handled, some user stays
// scalar, an owning entry is deleted or turned into a gather, the use counts
// in E and its user entry differ (for EdgeIdx == UINT_MAX), a GEP user stays
// scalar, or the extract belongs to other tree entries but not to E.
15805 if (!CheckedExtracts.insert(V).second ||
15806 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
15807 any_of(VEs,
15808 [&](const TreeEntry *TE) {
15809 return R.DeletedNodes.contains(TE) ||
15810 R.TransformedToGatherNodes.contains(TE);
15811 }) ||
15812 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
15813 !R.isVectorized(EE) &&
15814 count_if(E->Scalars, [&](Value *V) { return V == EE; }) !=
15815 count_if(E->UserTreeIndex.UserTE->Scalars,
15816 [&](Value *V) { return V == EE; })) ||
15817 any_of(EE->users(),
15818 [&](User *U) {
15819 return isa<GetElementPtrInst>(U) &&
15820 !R.areAllUsersVectorized(cast<Instruction>(U),
15821 &VectorizedVals);
15822 }) ||
15823 (!VEs.empty() && !is_contained(VEs, E)))
15824 continue;
// Extracts without a known constant index cannot be credited.
15825 std::optional<unsigned> EEIdx = getExtractIndex(EE);
15826 if (!EEIdx)
15827 continue;
15828 unsigned Idx = *EEIdx;
15829 // Take credit for instruction that will become dead.
15830 if (EE->hasOneUse() || !PrevNodeFound) {
15831 Instruction *Ext = EE->user_back();
// NOTE(review): listing line 15833 (the second half of this condition) and
// line 15842 (the trailing arguments of getCastInstrCost) are absent from this
// excerpt - confirm against the full file.
15832 if (isa<SExtInst, ZExtInst>(Ext) &&
15834 // Use getExtractWithExtendCost() to calculate the cost of
15835 // extractelement/ext pair.
15836 Cost -= TTI.getExtractWithExtendCost(
15837 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
15838 Idx, CostKind);
15839 // Add back the cost of s|zext which is subtracted separately.
15840 Cost += TTI.getCastInstrCost(
15841 Ext->getOpcode(), Ext->getType(), EE->getType(),
15843 continue;
15844 }
15845 }
// Otherwise mark the lane as demanded for its source vector.
15846 APInt &DemandedElts =
15847 VectorOpsToExtracts
15848 .try_emplace(VecBase,
15849 APInt::getZero(getNumElements(VecBase->getType())))
15850 .first->getSecond();
15851 DemandedElts.setBit(Idx);
15852 }
15853 }
// NOTE(review): listing line 15855 (the head of the statement whose arguments
// follow) is absent from this excerpt; presumably it subtracts the
// extract-only scalarization overhead of each source vector from Cost -
// confirm against the full file.
15854 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
15856 DemandedElts, /*Insert=*/false,
15857 /*Extract=*/true, CostKind);
15858 // Check that gather of extractelements can be represented as just a
15859 // shuffle of a single/two vectors the scalars are extracted from.
15860 // Found the bunch of extractelement instructions that must be gathered
15861 // into a vector and can be represented as a permutation elements in a
15862 // single input vector or of 2 input vectors.
15863 // Done for reused if same extractelements were vectorized already.
15864 if (!PrevNodeFound)
15865 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
// E becomes the only pending input and Mask the pending common mask.
15866 InVectors.assign(1, E);
15867 CommonMask.assign(Mask.begin(), Mask.end());
15868 transformMaskAfterShuffle(CommonMask, CommonMask);
15869 SameNodesEstimated = false;
15870 if (NumParts != 1 && UniqueBases.size() != 1) {
15871 UseVecBaseAsInput = true;
15872 VecBase =
15873 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
15874 }
15875 return VecBase;
15876 }
// Cost-estimation variant of the "delay" query: the visible body ignores its
// (unnamed) entry argument and unconditionally yields std::nullopt, i.e. an
// entry is never postponed while costs are being analysed.
// NOTE(review): listing line 15881 (remaining parameter(s) and the opening
// brace) is absent from this extract - restore it from upstream before use.
15877 /// Checks if the specified entry \p E needs to be delayed because of its
15878 /// dependency nodes.
15879 std::optional<InstructionCost>
15880 needToDelay(const TreeEntry *,
15882 // No need to delay the cost estimation during analysis.
15883 return std::nullopt;
15884 }
// Puts the estimator back into its pristine state: not finalized, no common
// mask, no input vectors, zero accumulated cost, no remembered vectorized
// values, and SameNodesEstimated set back to true.
// NOTE(review): the declaration line (listing line 15886) is missing from this
// extract; the function name and signature cannot be confirmed from here.
15885 /// Reset the builder to handle perfect diamond match.
15887 IsFinalized = false;
15888 CommonMask.clear();
15889 InVectors.clear();
15890 Cost = 0;
15891 VectorizedVals.clear();
15892 SameNodesEstimated = true;
15893 }
15894 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
15895 BVValues.reset();
15896 if (&E1 == &E2) {
15897 assert(all_of(Mask,
15898 [&](int Idx) {
15899 return Idx < static_cast<int>(E1.getVectorFactor());
15900 }) &&
15901 "Expected single vector shuffle mask.");
15902 add(E1, Mask);
15903 return;
15904 }
15905 if (InVectors.empty()) {
15906 CommonMask.assign(Mask.begin(), Mask.end());
15907 InVectors.assign({&E1, &E2});
15908 return;
15909 }
15910 assert(!CommonMask.empty() && "Expected non-empty common mask.");
15911 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
15912 unsigned NumParts =
15913 ::getNumberOfParts(TTI, MaskVecTy, ScalarTy, Mask.size());
15914 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
15915 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
15916 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
15917 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
15918 }
15919 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
15920 BVValues.reset();
15921 if (InVectors.empty()) {
15922 CommonMask.assign(Mask.begin(), Mask.end());
15923 InVectors.assign(1, &E1);
15924 return;
15925 }
15926 assert(!CommonMask.empty() && "Expected non-empty common mask.");
15927 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
15928 unsigned NumParts =
15929 ::getNumberOfParts(TTI, MaskVecTy, ScalarTy, Mask.size());
15930 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
15931 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
15932 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
15933 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
15934 if (!SameNodesEstimated && InVectors.size() == 1)
15935 InVectors.emplace_back(&E1);
15936 }
15937 /// Adds 2 input vectors and the mask for their shuffling.
15938 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
15939 // May come only for shuffling of 2 vectors with extractelements, already
15940 // handled in adjustExtracts.
15941 assert(InVectors.size() == 1 &&
15942 all_of(enumerate(CommonMask),
15943 [&](auto P) {
15944 if (P.value() == PoisonMaskElem)
15945 return Mask[P.index()] == PoisonMaskElem;
15946 auto *EI = cast<ExtractElementInst>(
15947 cast<const TreeEntry *>(InVectors.front())
15948 ->getOrdered(P.index()));
15949 return EI->getVectorOperand() == V1 ||
15950 EI->getVectorOperand() == V2;
15951 }) &&
15952 "Expected extractelement vectors.");
15953 }
15954 /// Adds another one input vector and the mask for the shuffling.
15955 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
15956 if (BVValues && !isa<Constant>(V1))
15957 BVValues.reset();
15958 if (InVectors.empty()) {
15959 assert(CommonMask.empty() && !ForExtracts &&
15960 "Expected empty input mask/vectors.");
15961 CommonMask.assign(Mask.begin(), Mask.end());
15962 InVectors.assign(1, V1);
15963 return;
15964 }
15965 if (ForExtracts) {
15966 // No need to add vectors here, already handled them in adjustExtracts.
15967 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
15968 !CommonMask.empty() &&
15969 all_of(enumerate(CommonMask),
15970 [&](auto P) {
15971 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
15972 ->getOrdered(P.index());
15973 if (P.value() == PoisonMaskElem)
15974 return P.value() == Mask[P.index()] ||
15975 isa<UndefValue>(Scalar);
15976 if (isa<Constant>(V1))
15977 return true;
15978 auto *EI = cast<ExtractElementInst>(Scalar);
15979 return EI->getVectorOperand() == V1;
15980 }) &&
15981 "Expected only tree entry for extractelement vectors.");
15982 return;
15983 }
15984 assert(!InVectors.empty() && !CommonMask.empty() &&
15985 "Expected only tree entries from extracts/reused buildvectors.");
15986 unsigned VF = getVF(V1);
15987 if (InVectors.size() == 2) {
15988 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
15989 transformMaskAfterShuffle(CommonMask, CommonMask);
15990 VF = std::max<unsigned>(VF, CommonMask.size());
15991 } else if (const auto *InTE =
15992 InVectors.front().dyn_cast<const TreeEntry *>()) {
15993 VF = std::max(VF, InTE->getVectorFactor());
15994 } else {
15995 VF = std::max(
15996 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
15997 ->getNumElements());
15998 }
15999 InVectors.push_back(V1);
16000 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16001 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
16002 CommonMask[Idx] = Mask[Idx] + VF;
16003 }
// Charges the buildvector cost of \p VL (relative to the optional \p Root) to
// Cost and returns a constant placeholder vector; the visible no-Root path
// builds it from poison/undef/null constants, one per inspected lane.
// \p MaskVF, when non-zero, limits how many leading elements of VL are
// inspected (VF = min(VL.size(), MaskVF)).
// NOTE(review): listing lines 16015, 16040, 16041 and 16045 are missing from
// this extract (the declaration of Vals and the head of the Root-based return
// among them); restore them from upstream before building.
16004 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
16005 Value *Root = nullptr) {
16006 Cost += getBuildVectorCost(VL, Root);
// BVValues remembers VL only when this is the very first thing fed to the
// estimator (no values stored yet and no input vectors); otherwise it is
// dropped.
16007 if (BVValues) {
16008 if (BVValues->empty() && InVectors.empty())
16009 BVValues->assign(VL.begin(), VL.end());
16010 else
16011 BVValues.reset();
16012 }
16013 if (!Root) {
16014 // FIXME: Need to find a way to avoid use of getNullValue here.
16016 unsigned VF = VL.size();
16017 if (MaskVF != 0)
16018 VF = std::min(VF, MaskVF);
16019 Type *VLScalarTy = VL.front()->getType();
// Poison and undef lanes are kept as such; every other lane becomes a zero
// constant of the scalar element type.
16020 for (Value *V : VL.take_front(VF)) {
16021 Type *ScalarTy = VLScalarTy->getScalarType();
16022 if (isa<PoisonValue>(V)) {
16023 Vals.push_back(PoisonValue::get(ScalarTy));
16024 continue;
16025 }
16026 if (isa<UndefValue>(V)) {
16027 Vals.push_back(UndefValue::get(ScalarTy));
16028 continue;
16029 }
16030 Vals.push_back(Constant::getNullValue(ScalarTy));
16031 }
16032 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
16033 assert(SLPReVec && "FixedVectorType is not expected.");
16034 // When REVEC is enabled, we need to expand vector types into scalar
16035 // types.
16036 Vals = replicateMask(Vals, VecTy->getNumElements());
16037 }
16038 return ConstantVector::get(Vals);
16039 }
// Root-based path: only the tail of the return expression is visible here. It
// uses Root's fixed element count and an all-ones value of the scalar type.
16042 cast<FixedVectorType>(Root->getType())->getNumElements()),
16043 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
16044 }
// Completes the cost estimation and returns the accumulated Cost plus the cost
// of the last pending shuffle, if any. Visible steps, in order:
//   1. mark the estimator as finalized;
//   2. if an Action callback is given, fold the pending inputs into one
//      shuffle, run the callback and make its value the only first input;
//   3. if SubVectors is non-empty, fold pending inputs, then add permutation,
//      cast and insertion costs for each sub-vector entry;
//   4. compose ExtMask on top of the common mask;
//   5. return Cost, plus a final createShuffle when a mask remains.
// NOTE(review): listing lines 16047, 16051, 16052, 16090, 16111, 16113, 16114
// and 16142 are missing from this extract (return type/name, the Action
// parameter type, several cost-call heads and the declaration of VL).
16046 /// Finalize emission of the shuffles.
16048 ArrayRef<int> ExtMask,
16049 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
16050 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
16053 Action = {}) {
16054 IsFinalized = true;
16055 if (Action) {
16056 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
16057 if (InVectors.size() == 2)
16058 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
16059 else
16060 Cost += createShuffle(Vec, nullptr, CommonMask);
16061 transformMaskAfterShuffle(CommonMask, CommonMask);
16062 assert(VF > 0 &&
16063 "Expected vector length for the final value before action.");
// The first input must already be a Value here (cast<> asserts otherwise).
16064 Value *V = cast<Value *>(Vec);
// The callback handed to Action only accounts the shuffle cost and hands back
// its first operand.
16065 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
16066 Cost += createShuffle(V1, V2, Mask);
16067 return V1;
16068 });
16069 InVectors.front() = V;
16070 }
16071 if (!SubVectors.empty()) {
16072 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
16073 if (InVectors.size() == 2)
16074 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
16075 else
16076 Cost += createShuffle(Vec, nullptr, CommonMask);
16077 transformMaskAfterShuffle(CommonMask, CommonMask);
16078 // Add subvectors permutation cost.
16079 if (!SubVectorsMask.empty()) {
16080 assert(SubVectorsMask.size() <= CommonMask.size() &&
16081 "Expected same size of masks for subvectors and common mask.");
// SVMask starts as SubVectorsMask padded with poison; lanes used by the common
// mask are redirected past CommonMask.size() (second-source indexing).
16082 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
16083 copy(SubVectorsMask, SVMask.begin());
16084 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
16085 if (I2 != PoisonMaskElem) {
16086 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
16087 I1 = I2 + CommonMask.size();
16088 }
16089 }
16091 getWidenedType(ScalarTy, CommonMask.size()),
16092 SVMask, CostKind);
16093 }
16094 for (auto [E, Idx] : SubVectors) {
// Element type of the sub-vector entry, narrowed to its MinBWs width (and
// signedness) when one is recorded for the entry.
16095 Type *EScalarTy = E->Scalars.front()->getType();
16096 bool IsSigned = true;
16097 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
16098 EScalarTy =
16099 IntegerType::get(EScalarTy->getContext(), It->second.first);
16100 IsSigned = It->second.second;
16101 }
// A differing element type costs a vector cast: trunc by default, sext/zext
// when the destination is wider than the source.
16102 if (ScalarTy != EScalarTy) {
16103 unsigned CastOpcode = Instruction::Trunc;
16104 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
16105 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
16106 if (DstSz > SrcSz)
16107 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
16108 Cost += TTI.getCastInstrCost(
16109 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
16110 getWidenedType(EScalarTy, E->getVectorFactor()),
16112 }
16115 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
16116 getWidenedType(ScalarTy, E->getVectorFactor()));
// After insertion the lanes [Idx, Idx + VF of E) map to themselves.
16117 if (!CommonMask.empty()) {
16118 std::iota(std::next(CommonMask.begin(), Idx),
16119 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
16120 Idx);
16121 }
16122 }
16123 }
16124
// Apply ExtMask on top of the common mask: result[I] = CommonMask[ExtMask[I]],
// or take ExtMask verbatim when no common mask exists.
16125 if (!ExtMask.empty()) {
16126 if (CommonMask.empty()) {
16127 CommonMask.assign(ExtMask.begin(), ExtMask.end());
16128 } else {
16129 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
16130 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
16131 if (ExtMask[I] == PoisonMaskElem)
16132 continue;
16133 NewMask[I] = CommonMask[ExtMask[I]];
16134 }
16135 CommonMask.swap(NewMask);
16136 }
16137 }
16138 if (CommonMask.empty()) {
16139 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
16140 return Cost;
16141 }
// When buildvector values were remembered, they are passed to the final
// createShuffle call through VL.
16143 if (BVValues)
16144 VL = *BVValues;
16145 return Cost +
16146 createShuffle(InVectors.front(),
16147 InVectors.size() == 2 ? InVectors.back() : nullptr,
16148 CommonMask, VL);
16149 }
16150
// Sanity check: the object must either have been finalized or never have
// accumulated a common mask.
// NOTE(review): the declaration line (listing line 16151) is missing from this
// extract; presumably this is the estimator's destructor - confirm upstream.
16152 assert((IsFinalized || CommonMask.empty()) &&
16153 "Shuffle construction must be finalized.");
16154 }
16155};
16156
16157const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
16158 unsigned Idx) const {
16159 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
16160 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
16161 return Op;
16162}
16163
// Derives a TTI cast-context hint from how tree entry \p TE is vectorized.
// The visible conditions distinguish, in order: scatter/strided entries,
// compress entries, and plain (non-alternate) vectorized loads - the latter
// further split by "no reordering" vs. "reordering that is a reverse mask".
// NOTE(review): every `return` statement (listing lines 16167, 16169, 16173,
// 16177 and 16179) is missing from this extract, so the concrete hint chosen
// in each case cannot be confirmed from here - restore them from upstream.
16164TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
16165 if (TE.State == TreeEntry::ScatterVectorize ||
16166 TE.State == TreeEntry::StridedVectorize)
16168 if (TE.State == TreeEntry::CompressVectorize)
16170 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
16171 !TE.isAltShuffle()) {
16172 if (TE.ReorderIndices.empty())
// The reorder indices are inverted into a shuffle mask before the reverse-mask
// test.
16174 SmallVector<int> Mask;
16175 inversePermutation(TE.ReorderIndices, Mask);
16176 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
16178 }
16180}
16181
16182/// Get the assumed loop trip count for the loop \p L.
16183static unsigned getLoopTripCount(const Loop *L, ScalarEvolution &SE) {
16184 if (LoopAwareTripCount == 0)
16185 return 1;
16186 unsigned Scale = SE.getSmallConstantTripCount(L);
16187 if (Scale == 0)
16188 Scale = getLoopEstimatedTripCount(const_cast<Loop *>(L)).value_or(0);
16189 if (Scale != 0) {
16190 // Multiple exiting blocks - choose the minimum between trip count (scale)
16191 // and LoopAwareTripCount, since the multiple exit loops can be terminated
16192 // early.
16193 if (!L->getExitingBlock())
16194 return std::min<unsigned>(LoopAwareTripCount, Scale);
16195 return Scale;
16196 }
16197 return LoopAwareTripCount;
16198}
16199
16200uint64_t BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
16201 Instruction *U) {
16202 BasicBlock *Parent = nullptr;
16203 if (U) {
16204 Parent = U->getParent();
16205 } else if (TE.isGather() || TE.State == TreeEntry::SplitVectorize) {
16206 EdgeInfo EI = TE.UserTreeIndex;
16207 while (EI.UserTE) {
16208 if (EI.UserTE->isGather() ||
16209 EI.UserTE->State == TreeEntry::SplitVectorize) {
16210 EI = EI.UserTE->UserTreeIndex;
16211 continue;
16212 }
16213 if (EI.UserTE->State == TreeEntry::Vectorize &&
16214 EI.UserTE->getOpcode() == Instruction::PHI) {
16215 auto *PH = cast<PHINode>(EI.UserTE->getMainOp());
16216 Parent = PH->getIncomingBlock(EI.EdgeIdx);
16217 } else {
16218 Parent = EI.UserTE->getMainOp()->getParent();
16219 }
16220 break;
16221 }
16222 if (!Parent)
16223 return 1;
16224 } else {
16225 Parent = TE.getMainOp()->getParent();
16226 }
16227 const Loop *L = LI->getLoopFor(Parent);
16228 if (!L)
16229 return 1;
16230 // The entry's cost is paid once per execution of the innermost loop in
16231 // which some of its operands are variant. Operands that are invariant in
16232 // all enclosing loops are executed once (LICM will hoist them out).
16233 return getLoopNestScale(findInnermostNonInvariantLoop(
16234 L, Scalar ? ArrayRef(Scalar) : ArrayRef(TE.Scalars)));
16235}
16236
16237uint64_t BoUpSLP::getLoopNestScale(const Loop *L) {
16238 if (!L || LoopAwareTripCount == 0)
16239 return 1;
16240 if (auto It = LoopNestScaleCache.find(L); It != LoopNestScaleCache.end())
16241 return It->second;
16242 // Collect loops from L outward up to (but not including) the first cached
16243 // ancestor or the function top, then walk back inward multiplying trip
16244 // counts. Use uint64_t to avoid silent overflow on deep/large nests.
16245 SmallVector<const Loop *> Chain;
16246 for (const Loop *Cur = L; Cur; Cur = Cur->getParentLoop()) {
16247 if (LoopNestScaleCache.contains(Cur))
16248 break;
16249 Chain.push_back(Cur);
16250 }
16251 assert(!Chain.empty() && "Early-return above should have handled cache hit.");
16252 uint64_t Scale = 1;
16253 if (const Loop *Parent = Chain.back()->getParentLoop())
16254 Scale = LoopNestScaleCache.lookup(Parent);
16255 // Walk from the outermost uncached loop inward, accumulating trip counts.
16256 // Use SaturatingMultiply to clamp at uint64_t max on deep/large nests
16257 // rather than wrapping around.
16258 for (const Loop *Cur : reverse(Chain)) {
16259 uint64_t TC = std::max<uint64_t>(1, getLoopTripCount(Cur, *SE));
16260 Scale = SaturatingMultiply(Scale, TC);
16261 LoopNestScaleCache.try_emplace(Cur, std::max<uint64_t>(1, Scale));
16262 }
16263 return std::max<uint64_t>(1, Scale);
16264}
16265
16266uint64_t BoUpSLP::getGatherNodeEffectiveScale(const TreeEntry &TE) {
16267 // Only meaningful for gather/buildvector-like entries; the per-lane
16268 // insertelements that make up such an entry are LICM-hoistable by
16269 // optimizeGatherSequence() when their operand is loop-invariant.
16270 assert((TE.isGather() || TE.State == TreeEntry::SplitVectorize) &&
16271 "Expected gather/split tree entry.");
16272
16273 uint64_t BaseScale = getScaleToLoopIterations(TE);
16274 if (!PerLaneGatherScale || LoopAwareTripCount == 0 || BaseScale <= 1)
16275 return BaseScale;
16276
16277 // Average the per-lane execution scales: for each lane, reuse the same
16278 // scale helper the rest of the cost model uses, but ask it about that
16279 // one lane's value. Lanes that are loop-invariant in the current nest
16280 // collapse to their outer-loop scale (or 1 for fully invariant/constant
16281 // lanes), which matches the LICM hoisting performed by
16282 // optimizeGatherSequence(). Cap per-lane contributions by BaseScale so a
16283 // refinement can never raise the cost above the whole-entry scale.
16284 // Each lane contributes at most BaseScale, so Sum is bounded above by
16285 // N * BaseScale. If BaseScale is near uint64_t max (saturated by
16286 // getLoopNestScale on a deep nest) Sum can still overflow uint64_t,
16287 // which would silently wrap and produce a wrong average. Use
16288 // SaturatingAdd and bail out to BaseScale on overflow: the true average
16289 // is bounded above by BaseScale anyway, so this preserves the
16290 // refinement's invariant that it can never raise cost.
16291 uint64_t Sum = 0;
16292 unsigned N = 0;
16293 bool Overflow = false;
16294 for (Value *V : TE.Scalars) {
16295 if (isConstant(V))
16296 continue;
16297 ++N;
16298 uint64_t LaneScale = std::min(getScaleToLoopIterations(TE, V), BaseScale);
16299 Sum = SaturatingAdd(Sum, LaneScale, &Overflow);
16300 if (Overflow)
16301 return BaseScale;
16302 }
16303 if (N == 0)
16304 return BaseScale;
16305 // Ceil-divide so we never round the effective scale down below 1.
16306 uint64_t Numerator = SaturatingAdd(Sum, uint64_t(N - 1), &Overflow);
16307 if (Overflow)
16308 return BaseScale;
16309 uint64_t Avg = Numerator / N;
16310 return std::clamp<uint64_t>(Avg, 1, BaseScale);
16311}
16312
// Estimates the extra spill/reload cost of vectorizing entry \p E from a
// per-register-class pressure count: the number of registers ("parts") needed
// by the entry's vector operands plus its result (\p VecTy, and additionally
// \p FinalVecTy when it differs). For every register class whose demand
// exceeds TTI's register count, each excess register is charged a reload cost
// and, except in the case guarded near the end, a spill cost.
// Returns 0 for entries without state and for store, extractelement,
// extractvalue, freeze and non-scatter load entries.
// NOTE(review): listing lines 16313 and 16316 (return type and the trailing
// parameter that introduces CostKind, plus the opening brace) are missing from
// this extract - restore them from upstream.
16314BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
16315 VectorType *VecTy, VectorType *FinalVecTy,
16317 InstructionCost SpillsReloads = 0;
16318
16319 // Estimate vector register pressure per target register class: operand
16320 // vectors plus the result. The same vector operand is counted once via
16321 // CountedOpEntries deduplication. PHIs take the max operand pressure across
16322 // incoming slots (only one predecessor is live at a time) plus the result.
16323 // All-constant operand bundles are skipped.
16324 if (!E->hasState() || E->getOpcode() == Instruction::Store ||
16325 E->getOpcode() == Instruction::ExtractElement ||
16326 E->getOpcode() == Instruction::ExtractValue ||
16327 E->getOpcode() == Instruction::Freeze ||
16328 (E->getOpcode() == Instruction::Load &&
16329 E->State != TreeEntry::ScatterVectorize))
16330 return SpillsReloads;
16331
16332 const bool IsPHI =
16333 E->State == TreeEntry::Vectorize && E->getOpcode() == Instruction::PHI;
16334 SmallPtrSet<const TreeEntry *, 8> CountedOpEntries;
// Register class id -> number of registers demanded in that class.
16335 SmallDenseMap<unsigned, unsigned> PressureByClass;
16336 auto AddPartsToClass = [&](unsigned RegClass, unsigned Parts) {
16337 assert(Parts != 0 && "Expected non-zero number of parts (registers).");
16338 PressureByClass[RegClass] += Parts;
16339 };
16340
// Yields (element type, widened vector type) for a tree entry, with the
// element type narrowed to the entry's MinBWs width when one is recorded.
16341 auto GetEntryVecTy =
16342 [&](const TreeEntry *TE) -> std::pair<Type *, VectorType *> {
16343 Type *ScalarTy = getValueType(TE->Scalars.front());
16344 auto BWIt = MinBWs.find(TE);
16345 if (BWIt != MinBWs.end()) {
16346 auto *VTy = dyn_cast<FixedVectorType>(ScalarTy);
16347 ScalarTy = IntegerType::get(F->getContext(), BWIt->second.first);
16348 if (VTy)
16349 ScalarTy = getWidenedType(ScalarTy, VTy->getNumElements());
16350 }
16351 return std::make_pair(ScalarTy,
16352 getWidenedType(ScalarTy, TE->getVectorFactor()));
16353 };
16354
// Split entries: the operands are the combined sub-entries.
16355 if (E->State == TreeEntry::SplitVectorize) {
16356 for (const auto &[Idx, _] : E->CombinedEntriesWithIndices) {
16357 const TreeEntry *OpTE = VectorizableTree[Idx].get();
16358
16359 if (!CountedOpEntries.insert(OpTE).second)
16360 continue;
16361 auto [ScalarTy, OpVecTy] = GetEntryVecTy(OpTE);
16362 const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy, ScalarTy);
16363 if (Parts == 0)
16364 continue;
16365 const unsigned RC =
16366 TTI->getRegisterClassForType(/*Vector=*/true, OpVecTy);
16367 AddPartsToClass(RC, Parts);
16368 }
16369 } else if (IsPHI) {
16370 // Only one predecessor is live at a time - take the max operand pressure
16371 // across incoming slots.
16372 SmallDenseMap<unsigned, unsigned> MaxOpPressureByClass;
16373 for (unsigned Idx : seq<unsigned>(E->getNumOperands())) {
16374 const TreeEntry *OpTE = getOperandEntry(E, Idx);
16375 auto [ScalarTy, OpVecTy] = GetEntryVecTy(OpTE);
16376 const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy, ScalarTy);
16377 if (Parts == 0)
16378 continue;
16379 const unsigned RC =
16380 TTI->getRegisterClassForType(/*Vector=*/true, OpVecTy);
16381 MaxOpPressureByClass[RC] = std::max(MaxOpPressureByClass[RC], Parts);
16382 }
16383 for (auto [RC, Parts] : MaxOpPressureByClass)
16384 AddPartsToClass(RC, Parts);
16385 } else {
16386 for (unsigned Idx : seq<unsigned>(E->getNumOperands())) {
16387 // InsertElement operand 0 is the vector being inserted into, which is
16388 // built incrementally and does not occupy an extra register.
16389 if (E->getOpcode() == Instruction::InsertElement && Idx == 0)
16390 continue;
16391 ArrayRef<Value *> Ops = E->getOperand(Idx);
16392 if (Ops.empty() || allConstant(Ops) || isSplat(Ops))
16393 continue;
16394 Value *Op = Ops.front();
16395 if (!Op)
16396 continue;
16397 const TreeEntry *OpTE = getOperandEntry(E, Idx);
16398
16399 if (!CountedOpEntries.insert(OpTE).second)
16400 continue;
// NOTE(review): this branch sizes the operand from the original scalar type,
// whereas the split/PHI branches go through GetEntryVecTy (MinBWs-aware) -
// confirm the difference is intentional.
16401 auto *OpVecTy = getWidenedType(Op->getType(), Ops.size());
16402 const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy, Op->getType());
16403 if (Parts == 0)
16404 continue;
16405 const unsigned RC =
16406 TTI->getRegisterClassForType(/*Vector=*/true, OpVecTy);
16407 AddPartsToClass(RC, Parts);
16408 }
16409 }
16410
// The result registers are added for everything but loads; a differing final
// vector type is counted on top of VecTy.
16411 if (E->getOpcode() != Instruction::Load) {
16412 const unsigned ResParts = ::getNumberOfParts(*TTI, VecTy, ScalarTy);
16413 if (ResParts != 0) {
16414 const unsigned RC = TTI->getRegisterClassForType(/*Vector=*/true, VecTy);
16415 AddPartsToClass(RC, ResParts);
16416 }
16417 if (VecTy != FinalVecTy) {
16418 const unsigned FinalResParts =
16419 ::getNumberOfParts(*TTI, FinalVecTy, ScalarTy);
16420 if (FinalResParts != 0) {
16421 const unsigned RC =
16422 TTI->getRegisterClassForType(/*Vector=*/true, FinalVecTy);
16423 AddPartsToClass(RC, FinalResParts);
16424 }
16425 }
16426 }
16427
// Charge every register class whose demand exceeds what the target offers;
// classes reported with zero registers are ignored.
16428 for (auto [RegClass, UsedRegs] : PressureByClass) {
16429 const unsigned NumAvailRegs = TTI->getNumberOfRegisters(RegClass);
16430 if (NumAvailRegs == 0 || UsedRegs <= NumAvailRegs)
16431 continue;
16432 const unsigned SpillCount = UsedRegs - NumAvailRegs;
16433 InstructionCost SingleRegSpillReload =
16434 TTI->getRegisterClassReloadCost(RegClass, CostKind);
16435 // No need to spill cost only for the root entry (Idx == 0), for reduction
16436 // and non-returning instructions, like void calls.
// NOTE(review): as written, the spill cost is skipped only when all three
// hold at once (root entry, UserIgnoreList set, void-typed first scalar) -
// confirm this matches the intent of the comment above.
16437 if (E->Idx > 0 || !UserIgnoreList || !E->Scalars[0]->getType()->isVoidTy())
16438 SingleRegSpillReload +=
16439 TTI->getRegisterClassSpillCost(RegClass, CostKind);
16440 SpillsReloads += SingleRegSpillReload * SpillCount;
16441 }
16442 return SpillsReloads;
16443}
16444
16446BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
16447 SmallPtrSetImpl<Value *> &CheckedExtracts) {
16448 ArrayRef<Value *> VL = E->Scalars;
16449
16450 Type *ScalarTy = getValueType(VL[0]);
16451 if (SLPReVec && E->State == TreeEntry::Vectorize &&
16452 E->getOpcode() == Instruction::InsertElement &&
16453 !E->getOperand(1).back()->getType()->isVectorTy())
16454 ScalarTy = ScalarTy->getScalarType();
16455 if (!isValidElementType(ScalarTy))
16456 return InstructionCost::getInvalid();
16458
16459 // If we have computed a smaller type for the expression, update VecTy so
16460 // that the costs will be accurate.
16461 auto It = MinBWs.find(E);
16462 Type *OrigScalarTy = ScalarTy;
16463 if (It != MinBWs.end()) {
16464 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
16465 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
16466 if (VecTy)
16467 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
16468 } else if (E->Idx == 0 && isReducedBitcastRoot()) {
16469 const TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
16470 ScalarTy = cast<CastInst>(ZExt->getMainOp())->getSrcTy();
16471 }
16472 auto *VecTy = getWidenedType(ScalarTy, VL.size());
16473 unsigned EntryVF = E->getVectorFactor();
16474 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
16475
16476 const InstructionCost SpillsReloads =
16477 getVectorSpillReloadCost(E, ScalarTy, VecTy, FinalVecTy, CostKind);
16478 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
16479 if (allConstant(VL))
16480 return 0;
16481 if (isa<InsertElementInst>(VL[0]))
16482 return InstructionCost::getInvalid();
16483 return SpillsReloads +
16484 processBuildVector<ShuffleCostEstimator, InstructionCost>(
16485 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
16486 }
16487 if (E->State == TreeEntry::SplitVectorize) {
16488 assert(E->CombinedEntriesWithIndices.size() == 2 &&
16489 "Expected exactly 2 combined entries.");
16490 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
16491 InstructionCost VectorCost = 0;
16492 if (E->ReorderIndices.empty()) {
16493 VectorCost = ::getShuffleCost(
16494 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
16495 E->CombinedEntriesWithIndices.back().second,
16497 ScalarTy,
16498 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
16499 ->getVectorFactor()));
16500 } else {
16501 unsigned CommonVF =
16502 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
16503 ->getVectorFactor(),
16504 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
16505 ->getVectorFactor());
16506 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
16507 getWidenedType(ScalarTy, CommonVF),
16508 E->getSplitMask(), CostKind);
16509 }
16510 VectorCost += SpillsReloads;
16511 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
16512 return VectorCost;
16513 }
16514 InstructionCost CommonCost = 0;
16515 SmallVector<int> Mask;
16516 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
16517 (E->State != TreeEntry::StridedVectorize ||
16518 !isReverseOrder(E->ReorderIndices))) {
16519 SmallVector<int> NewMask;
16520 if (E->getOpcode() == Instruction::Store) {
16521 // For stores the order is actually a mask.
16522 NewMask.resize(E->ReorderIndices.size());
16523 copy(E->ReorderIndices, NewMask.begin());
16524 } else {
16525 inversePermutation(E->ReorderIndices, NewMask);
16526 }
16527 ::addMask(Mask, NewMask);
16528 }
16529 if (!E->ReuseShuffleIndices.empty())
16530 ::addMask(Mask, E->ReuseShuffleIndices);
16531 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16532 CommonCost = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy,
16533 Mask, CostKind, /*Index=*/0, VecTy);
16534 assert((E->State == TreeEntry::Vectorize ||
16535 E->State == TreeEntry::ScatterVectorize ||
16536 E->State == TreeEntry::StridedVectorize ||
16537 E->State == TreeEntry::CompressVectorize) &&
16538 "Unhandled state");
16539 assert(E->getOpcode() &&
16540 ((allSameType(VL) && allSameBlock(VL)) ||
16541 (E->getOpcode() == Instruction::GetElementPtr &&
16542 E->getMainOp()->getType()->isPointerTy()) ||
16543 E->hasCopyableElements()) &&
16544 "Invalid VL");
16545 Instruction *VL0 = E->getMainOp();
16546 unsigned ShuffleOrOp =
16547 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
16548 if (E->CombinedOp != TreeEntry::NotCombinedOp)
16549 ShuffleOrOp = E->CombinedOp;
16550 SmallSetVector<Value *, 16> UniqueValues;
16551 SmallVector<unsigned, 16> UniqueIndexes;
16552 for (auto [Idx, V] : enumerate(VL))
16553 if (UniqueValues.insert(V))
16554 UniqueIndexes.push_back(Idx);
16555 const unsigned Sz = UniqueValues.size();
16556 SmallBitVector UsedScalars(Sz, false);
16557 for (unsigned I = 0; I < Sz; ++I) {
16558 if (isa<Instruction>(UniqueValues[I]) &&
16559 !E->isCopyableElement(UniqueValues[I]) &&
16560 getTreeEntries(UniqueValues[I]).front() == E)
16561 continue;
16562 UsedScalars.set(I);
16563 }
16564 auto GetCastContextHint = [&](Value *V) {
16565 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
16566 return getCastContextHint(*OpTEs.front());
16567 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
16568 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
16569 !SrcState.isAltShuffle())
16572 };
16573 auto GetCostDiff =
16574 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
16575 function_ref<InstructionCost(InstructionCost)> VectorCost) {
16576 // Calculate the cost of this instruction.
16577 InstructionCost ScalarCost = 0;
16578 if (isa<CastInst, CallInst>(VL0)) {
16579 // For some of the instructions no need to calculate cost for each
16580 // particular instruction, we can use the cost of the single
16581 // instruction x total number of scalar instructions.
16582 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
16583 } else {
16584 for (unsigned I = 0; I < Sz; ++I) {
16585 if (UsedScalars.test(I))
16586 continue;
16587 ScalarCost += ScalarEltCost(I);
16588 }
16589 }
16590
16591 InstructionCost VecCost = VectorCost(CommonCost);
16592 // Check if the current node must be resized, if the parent node is not
16593 // resized.
16594 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
16595 E->Idx != 0 &&
16596 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
16597 const EdgeInfo &EI = E->UserTreeIndex;
16598 if (!EI.UserTE->hasState() ||
16599 EI.UserTE->getOpcode() != Instruction::Select ||
16600 EI.EdgeIdx != 0) {
16601 auto UserBWIt = MinBWs.find(EI.UserTE);
16602 Type *UserScalarTy =
16603 (EI.UserTE->isGather() ||
16604 EI.UserTE->State == TreeEntry::SplitVectorize)
16605 ? EI.UserTE->Scalars.front()->getType()
16606 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
16607 if (UserBWIt != MinBWs.end())
16608 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
16609 UserBWIt->second.first);
16610 if (ScalarTy != UserScalarTy) {
16611 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16612 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
16613 unsigned VecOpcode;
16614 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
16615 if (BWSz > SrcBWSz)
16616 VecOpcode = Instruction::Trunc;
16617 else
16618 VecOpcode =
16619 It->second.second ? Instruction::SExt : Instruction::ZExt;
16620 TTI::CastContextHint CCH = GetCastContextHint(VL0);
16621 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
16622 CostKind);
16623 }
16624 }
16625 }
16626 VecCost += SpillsReloads;
16627 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
16628 ScalarCost, "Calculated costs for Tree"));
16629 return VecCost - ScalarCost;
16630 };
16631 // Calculate cost difference from vectorizing set of GEPs.
16632 // Negative value means vectorizing is profitable.
16633 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
16634 assert((E->State == TreeEntry::Vectorize ||
16635 E->State == TreeEntry::StridedVectorize ||
16636 E->State == TreeEntry::CompressVectorize) &&
16637 "Entry state expected to be Vectorize, StridedVectorize or "
16638 "MaskedLoadCompressVectorize here.");
16639 InstructionCost ScalarCost = 0;
16640 InstructionCost VecCost = 0;
16641 std::tie(ScalarCost, VecCost) = getGEPCosts(
16642 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
16643 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
16644 "Calculated GEPs cost for Tree"));
16645
16646 return VecCost - ScalarCost + SpillsReloads;
16647 };
16648
16649 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
16650 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
16651 if (MinMaxID == Intrinsic::not_intrinsic)
16652 return InstructionCost::getInvalid();
16653 Type *CanonicalType = Ty;
16654 if (CanonicalType->isPtrOrPtrVectorTy())
16655 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
16656 CanonicalType->getContext(),
16657 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
16658
16659 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
16660 {CanonicalType, CanonicalType});
16662 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
16663 // If the selects are the only uses of the compares, they will be
16664 // dead and we can adjust the cost by removing their cost.
16665 if (VI && SelectOnly) {
16666 assert((!Ty->isVectorTy() || SLPReVec) &&
16667 "Expected only for scalar type.");
16668 auto *CI = cast<CmpInst>(VI->getOperand(0));
16669 IntrinsicCost -= TTI->getCmpSelInstrCost(
16670 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
16671 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
16672 {TTI::OK_AnyValue, TTI::OP_None}, CI);
16673 }
16674 return IntrinsicCost;
16675 };
16676 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
16677 Instruction *VI) {
16678 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
16679 return Cost;
16680 };
16681 switch (ShuffleOrOp) {
16682 case Instruction::PHI: {
16683 // Count reused scalars.
16684 InstructionCost ScalarCost = 0;
16685 SmallPtrSet<const TreeEntry *, 4> CountedOps;
16686 for (Value *V : UniqueValues) {
16687 auto *PHI = dyn_cast<PHINode>(V);
16688 if (!PHI)
16689 continue;
16690
16691 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
16692 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
16693 Value *Op = PHI->getIncomingValue(I);
16694 Operands[I] = Op;
16695 }
16696 if (const TreeEntry *OpTE =
16697 getSameValuesTreeEntry(Operands.front(), Operands))
16698 if (CountedOps.insert(OpTE).second &&
16699 !OpTE->ReuseShuffleIndices.empty())
16700 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
16701 OpTE->Scalars.size());
16702 }
16703
16704 return CommonCost - ScalarCost + SpillsReloads;
16705 }
16706 case Instruction::ExtractValue:
16707 case Instruction::ExtractElement: {
16708 APInt DemandedElts;
16709 VectorType *SrcVecTy = nullptr;
16710 auto GetScalarCost = [&](unsigned Idx) {
16711 if (isa<PoisonValue>(UniqueValues[Idx]))
16713
16714 auto *I = cast<Instruction>(UniqueValues[Idx]);
16715 if (!SrcVecTy) {
16716 if (ShuffleOrOp == Instruction::ExtractElement) {
16717 auto *EE = cast<ExtractElementInst>(I);
16718 SrcVecTy = EE->getVectorOperandType();
16719 } else {
16720 auto *EV = cast<ExtractValueInst>(I);
16721 Type *AggregateTy = EV->getAggregateOperand()->getType();
16722 unsigned NumElts;
16723 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
16724 NumElts = ATy->getNumElements();
16725 else
16726 NumElts = AggregateTy->getStructNumElements();
16727 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
16728 }
16729 }
16730 if (I->hasOneUse()) {
16731 Instruction *Ext = I->user_back();
16732 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
16734 // Use getExtractWithExtendCost() to calculate the cost of
16735 // extractelement/ext pair.
16736 InstructionCost Cost = TTI->getExtractWithExtendCost(
16737 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
16738 CostKind);
16739 // Subtract the cost of s|zext which is subtracted separately.
16740 Cost -= TTI->getCastInstrCost(
16741 Ext->getOpcode(), Ext->getType(), I->getType(),
16743 return Cost;
16744 }
16745 }
16746 if (DemandedElts.isZero())
16747 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
16748 DemandedElts.setBit(*getExtractIndex(I));
16750 };
16751 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
16752 return CommonCost - (DemandedElts.isZero()
16754 : TTI.getScalarizationOverhead(
16755 SrcVecTy, DemandedElts, /*Insert=*/false,
16756 /*Extract=*/true, CostKind));
16757 };
16758 return GetCostDiff(GetScalarCost, GetVectorCost);
16759 }
16760 case Instruction::InsertElement: {
16761 assert(E->ReuseShuffleIndices.empty() &&
16762 "Unique insertelements only are expected.");
16763 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
16764 unsigned const NumElts = SrcVecTy->getNumElements();
16765 unsigned const NumScalars = VL.size();
16766
16767 unsigned NumOfParts =
16768 ::getNumberOfParts(*TTI, SrcVecTy, VL0->getOperand(1)->getType());
16769
16770 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
16771 unsigned OffsetBeg = *getElementIndex(VL.front());
16772 unsigned OffsetEnd = OffsetBeg;
16773 InsertMask[OffsetBeg] = 0;
16774 for (auto [I, V] : enumerate(VL.drop_front())) {
16775 unsigned Idx = *getElementIndex(V);
16776 if (OffsetBeg > Idx)
16777 OffsetBeg = Idx;
16778 else if (OffsetEnd < Idx)
16779 OffsetEnd = Idx;
16780 InsertMask[Idx] = I + 1;
16781 }
16782 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
16783 if (NumOfParts > 0 && NumOfParts < NumElts)
16784 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
16785 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
16786 VecScalarsSz;
16787 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
16788 unsigned InsertVecSz = std::min<unsigned>(
16789 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
16790 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
16791 bool IsWholeSubvector =
16792 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
16793 // Check if we can safely insert a subvector. If it is not possible, just
16794 // generate a whole-sized vector and shuffle the source vector and the new
16795 // subvector.
16796 if (OffsetBeg + InsertVecSz > VecSz) {
16797 // Align OffsetBeg to generate correct mask.
16798 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
16799 InsertVecSz = VecSz;
16800 }
16801
16802 APInt DemandedElts = APInt::getZero(NumElts);
16803 // TODO: Add support for Instruction::InsertValue.
16804 SmallVector<int> Mask;
16805 if (!E->ReorderIndices.empty()) {
16806 inversePermutation(E->ReorderIndices, Mask);
16807 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
16808 } else {
16809 Mask.assign(VecSz, PoisonMaskElem);
16810 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
16811 }
16812 bool IsIdentity = true;
16813 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
16814 Mask.swap(PrevMask);
16815 for (unsigned I = 0; I < NumScalars; ++I) {
16816 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
16817 DemandedElts.setBit(InsertIdx);
16818 IsIdentity &= InsertIdx - OffsetBeg == I;
16819 Mask[InsertIdx - OffsetBeg] = I;
16820 }
16821 assert(Offset < NumElts && "Failed to find vector index offset");
16822
16824 Cost -=
16825 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
16826 /*Insert*/ true, /*Extract*/ false, CostKind);
16827
16828 // First cost - resize to actual vector size if not identity shuffle or
16829 // need to shift the vector.
16830 // Do not calculate the cost if the actual size is the register size and
16831 // we can merge this shuffle with the following SK_Select.
16832 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
16833 if (!IsIdentity)
16835 InsertVecTy, Mask);
16836 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
16837 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
16838 }));
16839 // Second cost - permutation with subvector, if some elements are from the
16840 // initial vector or inserting a subvector.
16841 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
16842 // subvector of ActualVecTy.
16843 SmallBitVector InMask =
16844 isUndefVector(FirstInsert->getOperand(0),
16845 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
16846 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
16847 if (InsertVecSz != VecSz) {
16848 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
16849 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
16850 CostKind, OffsetBeg - Offset, InsertVecTy);
16851 } else {
16852 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
16853 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
16854 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
16855 I <= End; ++I)
16856 if (Mask[I] != PoisonMaskElem)
16857 Mask[I] = I + VecSz;
16858 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
16859 Mask[I] =
16860 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
16861 Cost +=
16862 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
16863 }
16864 }
16865 return Cost + SpillsReloads;
16866 }
16867 case Instruction::ZExt:
16868 case Instruction::SExt:
16869 case Instruction::FPToUI:
16870 case Instruction::FPToSI:
16871 case Instruction::FPExt:
16872 case Instruction::PtrToInt:
16873 case Instruction::IntToPtr:
16874 case Instruction::SIToFP:
16875 case Instruction::UIToFP:
16876 case Instruction::Trunc:
16877 case Instruction::FPTrunc:
16878 case Instruction::BitCast: {
16879 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
16880 Type *SrcScalarTy = VL0->getOperand(0)->getType();
16881 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
16882 unsigned Opcode = ShuffleOrOp;
16883 unsigned VecOpcode = Opcode;
16884 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
16885 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
16886 // Check if the values are candidates to demote.
16887 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
16888 if (SrcIt != MinBWs.end()) {
16889 SrcBWSz = SrcIt->second.first;
16890 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
16891 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
16892 SrcVecTy =
16893 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
16894 }
16895 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
16896 if (BWSz == SrcBWSz) {
16897 VecOpcode = Instruction::BitCast;
16898 } else if (BWSz < SrcBWSz) {
16899 VecOpcode = Instruction::Trunc;
16900 } else if (It != MinBWs.end()) {
16901 assert(BWSz > SrcBWSz && "Invalid cast!");
16902 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16903 } else if (SrcIt != MinBWs.end()) {
16904 assert(BWSz > SrcBWSz && "Invalid cast!");
16905 VecOpcode =
16906 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
16907 }
16908 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
16909 !SrcIt->second.second) {
16910 VecOpcode = Instruction::UIToFP;
16911 }
16912 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
16913 assert(Idx == 0 && "Expected 0 index only");
16914 return TTI->getCastInstrCost(Opcode, VL0->getType(),
16915 VL0->getOperand(0)->getType(),
16917 };
16918 auto GetVectorCost = [=](InstructionCost CommonCost) {
16919 // Do not count cost here if minimum bitwidth is in effect and it is just
16920 // a bitcast (here it is just a noop).
16921 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
16922 return CommonCost;
16923 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
16924 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
16925
16926 bool IsArithmeticExtendedReduction =
16927 E->Idx == 0 && UserIgnoreList &&
16928 all_of(*UserIgnoreList, [](Value *V) {
16929 auto *I = cast<Instruction>(V);
16930 return is_contained({Instruction::Add, Instruction::FAdd,
16931 Instruction::Mul, Instruction::FMul,
16932 Instruction::And, Instruction::Or,
16933 Instruction::Xor},
16934 I->getOpcode());
16935 });
16936 if (IsArithmeticExtendedReduction &&
16937 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
16938 return CommonCost;
16939 return CommonCost +
16940 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
16941 VecOpcode == Opcode ? VI : nullptr);
16942 };
16943 return GetCostDiff(GetScalarCost, GetVectorCost);
16944 }
// Cost of compare and select bundles. FCmp/ICmp first retarget
// ScalarTy/VecTy to the compared operand type and then fall through into the
// Select costing below.
// NOTE(review): the embedded listing numbers skip 16961-16962, 16965,
// 16973-16974, 16980 and 16985-16986, and the visible expressions are left
// unterminated at exactly those points, so source lines appear to have been
// dropped from this listing. Restore them from the upstream file before
// relying on this block.
16945 case Instruction::FCmp:
16946 case Instruction::ICmp:
16947 // Override ScalarTy/VecTy with the compared operand type (not i1). The
16948 // cost of a compare instruction is determined by the operand width, and
16949 // getCmpSelInstrCost expects the compared type as its first type arg.
16950 OrigScalarTy = ScalarTy = getValueType(VL0, /*LookThroughCmp=*/true);
16951 VecTy = getWidenedType(ScalarTy, VL.size());
16952 [[fallthrough]];
16953 case Instruction::Select: {
// VecPred is the predicate used for the vector cost query; it starts from
// the compare matched on VL0 (directly or as a select condition).
16954 CmpPredicate VecPred, SwappedVecPred;
16955 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
16956 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
16957 match(VL0, MatchCmp))
16958 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
16959 else
// NOTE(review): the two arms of this conditional expression are missing.
16960 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
// Per-lane scalar cost. As a side effect it resets VecPred/SwappedVecPred
// when a lane does not match a compare or its predicate differs from both
// VecPred and SwappedVecPred.
16963 auto GetScalarCost = [&](unsigned Idx) {
// NOTE(review): the statement guarded by this poison check is missing.
16964 if (isa<PoisonValue>(UniqueValues[Idx]))
16966
// Non-select lanes are costed directly through the generic instruction cost.
16967 if (!isa<SelectInst>(UniqueValues[Idx]))
16968 return TTI->getInstructionCost(cast<Instruction>(UniqueValues[Idx]),
16969 CostKind);
16970
16971 auto *VI = cast<Instruction>(UniqueValues[Idx]);
// NOTE(review): the two arms of this conditional expression are missing.
16972 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
16975 Value *LHS = nullptr, *RHS = nullptr;
16976 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
// NOTE(review): the second operand of the `||` below is missing.
16977 bool IsSelect =
16978 ShuffleOrOp == Instruction::Select &&
16979 (match(VI, m_Select(MatchCmp, m_Value(LHS), m_Value(RHS))) ||
16981 if ((!IsSelect && !match(VI, MatchCmp)) ||
16982 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
16983 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
// NOTE(review): the two arms of this conditional expression are missing.
16984 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
16987
16988 // Check if operands are of i1 types, like a condition expression.
16989 // TODO: consider implementing this in TTI.
16990 InstructionCost ScalarCost = InstructionCost::getInvalid();
16991 if (IsSelect && LHS->getType() == VI->getOperand(0)->getType()) {
16992 assert(LHS->getType() == RHS->getType() &&
16993 "Expected same type for LHS/RHS");
16994 // select i1 v, i1 true, i1 b -> or i1 v, i1 b
16995 if (match(LHS, m_AllOnes())) {
16996 ScalarCost = TTI->getArithmeticInstrCost(
16997 Instruction::Or, LHS->getType(), CostKind,
16998 getOperandInfo(VI->getOperand(0)), getOperandInfo(RHS));
16999 } else if (match(RHS, m_Zero())) {
17000 // select i1 v, i1 b, i1 false -> and i1 v, i1 b
17001 ScalarCost = TTI->getArithmeticInstrCost(
17002 Instruction::And, LHS->getType(), CostKind,
17003 getOperandInfo(VI->getOperand(0)), getOperandInfo(LHS));
17004 }
17005 }
// Fall back to the generic cmp/select cost when neither logical form applied.
17006 if (!ScalarCost.isValid()) {
17007 // For selects, the "condition type" arg is the condition operand's
17008 // type; for standalone compares, it is the result type (i1).
17009 ScalarCost = TTI->getCmpSelInstrCost(
17010 E->getOpcode(), OrigScalarTy,
17011 ShuffleOrOp == Instruction::Select ? VL0->getOperand(0)->getType()
17012 : VL0->getType(),
17013 CurrentPred, CostKind,
17014 getOperandInfo(
17015 VI->getOperand(ShuffleOrOp == Instruction::Select ? 1 : 0)),
17016 getOperandInfo(
17017 VI->getOperand(ShuffleOrOp == Instruction::Select ? 2 : 1)),
17018 VI);
17019 }
// A valid min/max intrinsic cost replaces the cmp/select cost.
17020 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
17021 if (IntrinsicCost.isValid())
17022 ScalarCost = IntrinsicCost;
17023
17024 return ScalarCost;
17025 };
// Vector cost: logical or/and for i1 selects with all-ones/zero arms,
// otherwise the generic cmp/select cost, plus CommonCost.
17026 auto GetVectorCost = [&](InstructionCost CommonCost) {
17027 // For selects, the condition type may differ from the result type
17028 // (e.g. condition is <N x i1> while result is <N x i32>). For
17029 // compares, the result type IS the mask (i1/vNi1). Construct the
17030 // right type so getCmpSelInstrCost sees the actual mask/result width.
17031 auto *MaskTy = getWidenedType(ShuffleOrOp == Instruction::Select
17032 ? VL0->getOperand(0)->getType()
17033 : VL0->getType(),
17034 VL.size());
17035
17036 InstructionCost VecCost = InstructionCost::getInvalid();
17037 if (ShuffleOrOp == Instruction::Select) {
17038 ArrayRef<Value *> Cond = E->getOperand(0);
17039 ArrayRef<Value *> LHS = E->getOperand(1);
17040 ArrayRef<Value *> RHS = E->getOperand(2);
17041 // select <VF x i1>, <VF x i1>, <VF x i1>?
17042 // TODO: consider implementing this in TTI.
17043 if (Cond.front()->getType() == LHS.front()->getType()) {
17044 // select <VF x i1> v, <VF x i1> true, <VF x i1> b -> or <VF x i1> v,
17045 // <VF x i1> b
17046 if (all_of(LHS, [&](Value *V) { return match(V, m_AllOnes()); })) {
17047 VecCost = TTI->getArithmeticInstrCost(
17048 Instruction::Or, VecTy, CostKind, getOperandInfo(Cond),
17049 getOperandInfo(RHS));
17050 } else if (all_of(RHS,
17051 [&](Value *V) { return match(V, m_Zero()); })) {
17052 // select <VF x i1> v, <VF x i1> b, <VF x i1> false -> and <VF x i1>
17053 // v, <VF x i1> b
17054 VecCost = TTI->getArithmeticInstrCost(
17055 Instruction::And, VecTy, CostKind, getOperandInfo(Cond),
17056 getOperandInfo(LHS));
17057 }
17058 }
17059 }
17060 if (!VecCost.isValid()) {
17061 VecCost = TTI->getCmpSelInstrCost(
17062 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind,
17063 getOperandInfo(
17064 E->getOperand(ShuffleOrOp == Instruction::Select ? 1 : 0)),
17065 getOperandInfo(
17066 E->getOperand(ShuffleOrOp == Instruction::Select ? 2 : 1)),
17067 VL0);
17068 if (isa<SelectInst>(VL0)) {
17069 unsigned CondNumElements = getNumElements(MaskTy);
17070 unsigned VecTyNumElements = getNumElements(VecTy);
17071 assert(VecTyNumElements >= CondNumElements &&
17072 VecTyNumElements % CondNumElements == 0 &&
17073 "Cannot vectorize Instruction::Select");
17074 if (CondNumElements != VecTyNumElements) {
17075 // When the return type is i1 but the source is fixed vector type,
17076 // we need to duplicate the condition value.
17077 VecCost += ::getShuffleCost(
17078 *TTI, TTI::SK_PermuteSingleSrc, MaskTy,
17079 createReplicatedMask(VecTyNumElements / CondNumElements,
17080 CondNumElements));
17081 }
17082 }
17083 }
17084 return VecCost + CommonCost;
17085 };
17086 return GetCostDiff(GetScalarCost, GetVectorCost);
17087 }
17088 case TreeEntry::MinMax: {
17089 auto GetScalarCost = [&](unsigned Idx) {
17090 return GetMinMaxCost(OrigScalarTy);
17091 };
17092 auto GetVectorCost = [&](InstructionCost CommonCost) {
17093 InstructionCost VecCost = GetMinMaxCost(VecTy);
17094 return VecCost + CommonCost;
17095 };
17096 return GetCostDiff(GetScalarCost, GetVectorCost);
17097 }
17098 case TreeEntry::FMulAdd: {
17099 auto GetScalarCost = [&](unsigned Idx) {
17100 if (isa<PoisonValue>(UniqueValues[Idx]))
17102 return GetFMulAddCost(E->getOperations(),
17103 cast<Instruction>(UniqueValues[Idx]));
17104 };
17105 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
17106 FastMathFlags FMF;
17107 FMF.set();
17108 for (Value *V : E->Scalars) {
17109 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
17110 FMF &= FPCI->getFastMathFlags();
17111 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
17112 FMF &= FPCIOp->getFastMathFlags();
17113 }
17114 }
17115 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
17116 {VecTy, VecTy, VecTy}, FMF);
17117 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
17118 return VecCost + CommonCost;
17119 };
17120 return GetCostDiff(GetScalarCost, GetVectorCost);
17121 }
// Cost of the ReducedBitcast / ReducedBitcastBSwap combined entries.
// NOTE(review): the embedded listing numbers skip 17126, 17129, 17150 and
// 17156, and the visible statements are left incomplete at those points, so
// source lines appear to have been dropped from this listing. Restore them
// from the upstream file before relying on this block.
17122 case TreeEntry::ReducedBitcast:
17123 case TreeEntry::ReducedBitcastBSwap: {
// Scalar side: cost of the lane's instruction (named Shl here) plus, when
// its first operand is an instruction (named ZExt here), that operand's cost.
17124 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
// NOTE(review): the statement guarded by this poison check is missing.
17125 if (isa<PoisonValue>(UniqueValues[Idx]))
17127 auto *Shl = dyn_cast<Instruction>(UniqueValues[Idx]);
// NOTE(review): the statement guarded by this null check is missing.
17128 if (!Shl)
17130 InstructionCost ScalarCost = TTI.getInstructionCost(Shl, CostKind);
17131 auto *ZExt = dyn_cast<Instruction>(Shl->getOperand(0));
17132 if (!ZExt)
17133 return ScalarCost;
17134 ScalarCost += TTI.getInstructionCost(ZExt, CostKind);
17135 return ScalarCost;
17136 };
// Vector side: a bitcast from the widened zext source type to ScalarTy; the
// BSwap variant additionally pays for a bswap on an integer as wide as all
// source elements together and, if that type differs from ScalarTy, a zext.
17137 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
17138 const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
17139 TTI::CastContextHint CastCtx =
17140 getCastContextHint(*getOperandEntry(LhsTE, /*Idx=*/0));
17141 Type *SrcScalarTy = cast<ZExtInst>(LhsTE->getMainOp())->getSrcTy();
17142 auto *SrcVecTy = getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
17143 InstructionCost BitcastCost = TTI.getCastInstrCost(
17144 Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx, CostKind);
17145 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
17146 auto *SrcType = IntegerType::getIntNTy(
17147 ScalarTy->getContext(),
17148 DL->getTypeSizeInBits(SrcScalarTy) * EntryVF);
17149 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
// NOTE(review): the declaration that this initializer belongs to is missing.
17151 TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
17152 BitcastCost += IntrinsicCost;
17153 if (SrcType != ScalarTy) {
// NOTE(review): the trailing arguments of this call are missing.
17154 BitcastCost +=
17155 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
17157 }
17158 }
17159 return BitcastCost + CommonCost;
17160 };
17161 return GetCostDiff(GetScalarCost, GetVectorCost);
17162 }
// Cost of the ReducedBitcastLoads / ReducedBitcastBSwapLoads combined
// entries.
// NOTE(review): the embedded listing numbers skip 17167, 17170, 17194 and
// 17200, and the visible statements are left incomplete at those points, so
// source lines appear to have been dropped from this listing. Restore them
// from the upstream file before relying on this block.
17163 case TreeEntry::ReducedBitcastLoads:
17164 case TreeEntry::ReducedBitcastBSwapLoads: {
// Scalar side: cost of the lane's instruction (named Shl here), plus its
// first operand (named ZExt) and that operand's first operand (named Load)
// whenever those are instructions.
17165 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
// NOTE(review): the statement guarded by this poison check is missing.
17166 if (isa<PoisonValue>(UniqueValues[Idx]))
17168 auto *Shl = dyn_cast<Instruction>(UniqueValues[Idx]);
// NOTE(review): the statement guarded by this null check is missing.
17169 if (!Shl)
17171 InstructionCost ScalarCost = TTI.getInstructionCost(Shl, CostKind);
17172 auto *ZExt = dyn_cast<Instruction>(Shl->getOperand(0));
17173 if (!ZExt)
17174 return ScalarCost;
17175 ScalarCost += TTI.getInstructionCost(ZExt, CostKind);
17176 auto *Load = dyn_cast<Instruction>(ZExt->getOperand(0));
17177 if (!Load)
17178 return ScalarCost;
17179 ScalarCost += TTI.getInstructionCost(Load, CostKind);
17180 return ScalarCost;
17181 };
// Vector side: one load of an integer as wide as all loaded elements
// together, using the alignment and address space of the main load; the
// BSwap variant additionally pays for a bswap and, if that integer type
// differs from ScalarTy, a zext.
17182 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
17183 const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
17184 const TreeEntry *LoadTE = getOperandEntry(LhsTE, /*Idx=*/0);
17185 auto *LI0 = cast<LoadInst>(LoadTE->getMainOp());
17186 auto *SrcType = IntegerType::getIntNTy(
17187 ScalarTy->getContext(),
17188 DL->getTypeSizeInBits(LI0->getType()) * EntryVF);
17189 InstructionCost LoadCost =
17190 TTI.getMemoryOpCost(Instruction::Load, SrcType, LI0->getAlign(),
17191 LI0->getPointerAddressSpace(), CostKind);
17192 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
17193 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
// NOTE(review): the declaration that this initializer belongs to is missing.
17195 TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
17196 LoadCost += IntrinsicCost;
17197 if (SrcType != ScalarTy) {
// NOTE(review): the trailing arguments of this call are missing.
17198 LoadCost +=
17199 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
17201 }
17202 }
17203 return LoadCost + CommonCost;
17204 };
17205 return GetCostDiff(GetScalarCost, GetVectorCost);
17206 }
// Cost of the ReducedCmpBitcast combined entry.
// NOTE(review): the embedded listing numbers skip 17210, 17213, 17223 and
// 17227, and the visible statements are left incomplete at those points, so
// source lines appear to have been dropped from this listing. Restore them
// from the upstream file before relying on this block.
17207 case TreeEntry::ReducedCmpBitcast: {
// Scalar side: generic instruction cost of the lane's instruction (named
// Sel here).
17208 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
// NOTE(review): the statement guarded by this poison check is missing.
17209 if (isa<PoisonValue>(UniqueValues[Idx]))
17211 auto *Sel = dyn_cast<Instruction>(UniqueValues[Idx]);
// NOTE(review): the statement guarded by this null check is missing.
17212 if (!Sel)
17214 InstructionCost ScalarCost = TTI.getInstructionCost(Sel, CostKind);
17215 return ScalarCost;
17216 };
// Vector side: a bitcast from the compare-result type of VecTy to an integer
// with one bit per vector lane, plus a zext when that integer type differs
// from ScalarTy.
17217 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
17218 Type *CmpTy = CmpInst::makeCmpResultType(VecTy);
17219 auto *DstTy =
17220 IntegerType::getIntNTy(ScalarTy->getContext(), E->getVectorFactor());
// NOTE(review): the trailing arguments of this call are missing.
17221 InstructionCost BitcastCost =
17222 TTI.getCastInstrCost(Instruction::BitCast, DstTy, CmpTy,
17224 if (DstTy != ScalarTy) {
// NOTE(review): the trailing arguments of this call are missing.
17225 BitcastCost +=
17226 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, DstTy,
17228 }
17229 return BitcastCost + CommonCost;
17230 };
17231 return GetCostDiff(GetScalarCost, GetVectorCost);
17232 }
17233 case Instruction::FNeg:
17234 case Instruction::Add:
17235 case Instruction::FAdd:
17236 case Instruction::Sub:
17237 case Instruction::FSub:
17238 case Instruction::Mul:
17239 case Instruction::FMul:
17240 case Instruction::UDiv:
17241 case Instruction::SDiv:
17242 case Instruction::FDiv:
17243 case Instruction::URem:
17244 case Instruction::SRem:
17245 case Instruction::FRem:
17246 case Instruction::Shl:
17247 case Instruction::LShr:
17248 case Instruction::AShr:
17249 case Instruction::And:
17250 case Instruction::Or:
17251 case Instruction::Xor: {
17252 auto GetScalarCost = [&](unsigned Idx) {
17253 if (isa<PoisonValue>(UniqueValues[Idx]))
17255
17256 // We cannot retrieve the operand from UniqueValues[Idx] because an
17257 // interchangeable instruction may be used. The order and the actual
17258 // operand might differ from what is retrieved from UniqueValues[Idx].
17259 unsigned Lane = UniqueIndexes[Idx];
17260 Value *Op1 = E->getOperand(0)[Lane];
17261 Value *Op2;
17262 SmallVector<const Value *, 2> Operands(1, Op1);
17263 if (isa<UnaryOperator>(UniqueValues[Idx])) {
17264 Op2 = Op1;
17265 } else {
17266 Op2 = E->getOperand(1)[Lane];
17267 Operands.push_back(Op2);
17268 }
17271 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
17272 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
17273 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
17274 I && (ShuffleOrOp == Instruction::FAdd ||
17275 ShuffleOrOp == Instruction::FSub)) {
17276 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
17277 if (IntrinsicCost.isValid())
17278 ScalarCost = IntrinsicCost;
17279 }
17280 return ScalarCost;
17281 };
17282 auto GetVectorCost = [=](InstructionCost CommonCost) {
17283 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
17284 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
17285 ArrayRef<Value *> Ops = E->getOperand(I);
17286 if (all_of(Ops, [&](Value *Op) {
17287 auto *CI = dyn_cast<ConstantInt>(Op);
17288 return CI && CI->getValue().countr_one() >= It->second.first;
17289 }))
17290 return CommonCost;
17291 }
17292 }
17293 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
17294 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
17295 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
17296 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
17297 Op2Info, {}, nullptr, TLI) +
17298 CommonCost;
17299 };
17300 return GetCostDiff(GetScalarCost, GetVectorCost);
17301 }
17302 case Instruction::GetElementPtr: {
17303 return CommonCost + GetGEPCostDiff(VL, VL0);
17304 }
17305 case Instruction::Load: {
17306 auto GetScalarCost = [&](unsigned Idx) {
17307 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
17308 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
17309 VI->getAlign(), VI->getPointerAddressSpace(),
17311 };
17312 auto *LI0 = cast<LoadInst>(VL0);
17313 auto GetVectorCost = [&](InstructionCost CommonCost) {
17314 InstructionCost VecLdCost;
17315 switch (E->State) {
17316 case TreeEntry::Vectorize:
17317 if (unsigned Factor = E->getInterleaveFactor()) {
17318 VecLdCost = TTI->getInterleavedMemoryOpCost(
17319 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
17320 LI0->getPointerAddressSpace(), CostKind);
17321
17322 } else {
17323 VecLdCost = TTI->getMemoryOpCost(
17324 Instruction::Load, VecTy, LI0->getAlign(),
17325 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
17326 }
17327 break;
17328 case TreeEntry::StridedVectorize: {
17329 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
17330 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
17331 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
17332 Align CommonAlignment =
17333 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
17334 VecLdCost = TTI->getMemIntrinsicInstrCost(
17335 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
17336 StridedLoadTy, LI0->getPointerOperand(),
17337 /*VariableMask=*/false, CommonAlignment),
17338 CostKind);
17339 if (StridedLoadTy != VecTy)
17340 VecLdCost +=
17341 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
17342 getCastContextHint(*E), CostKind);
17343
17344 break;
17345 }
17346 case TreeEntry::CompressVectorize: {
17347 bool IsMasked;
17348 unsigned InterleaveFactor;
17349 SmallVector<int> CompressMask;
17350 VectorType *LoadVecTy;
17351 SmallVector<Value *> Scalars(VL);
17352 if (!E->ReorderIndices.empty()) {
17353 SmallVector<int> Mask(E->ReorderIndices.begin(),
17354 E->ReorderIndices.end());
17355 reorderScalars(Scalars, Mask);
17356 }
17357 SmallVector<Value *> PointerOps(Scalars.size());
17358 for (auto [I, V] : enumerate(Scalars))
17359 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
17360 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
17361 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
17362 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
17363 CompressMask, LoadVecTy);
17364 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
17365 InterleaveFactor, IsMasked);
17366 Align CommonAlignment = LI0->getAlign();
17367 if (InterleaveFactor) {
17368 VecLdCost = TTI->getInterleavedMemoryOpCost(
17369 Instruction::Load, LoadVecTy, InterleaveFactor, {},
17370 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
17371 } else if (IsMasked) {
17372 VecLdCost = TTI->getMemIntrinsicInstrCost(
17373 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
17374 CommonAlignment,
17375 LI0->getPointerAddressSpace()),
17376 CostKind);
17377 // TODO: include this cost into CommonCost.
17378 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
17379 LoadVecTy, CompressMask, CostKind);
17380 } else {
17381 VecLdCost = TTI->getMemoryOpCost(
17382 Instruction::Load, LoadVecTy, CommonAlignment,
17383 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
17384 // TODO: include this cost into CommonCost.
17385 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
17386 LoadVecTy, CompressMask, CostKind);
17387 }
17388 break;
17389 }
17390 case TreeEntry::ScatterVectorize: {
17391 Align CommonAlignment =
17392 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
17393 VecLdCost = TTI->getMemIntrinsicInstrCost(
17394 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
17395 LI0->getPointerOperand(),
17396 /*VariableMask=*/false, CommonAlignment),
17397 CostKind);
17398 break;
17399 }
17400 case TreeEntry::CombinedVectorize:
17401 case TreeEntry::SplitVectorize:
17402 case TreeEntry::NeedToGather:
17403 llvm_unreachable("Unexpected vectorization state.");
17404 }
17405 return VecLdCost + CommonCost;
17406 };
17407
17408 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
17409 // If this node generates masked gather load then it is not a terminal node.
17410 // Hence address operand cost is estimated separately.
17411 if (E->State == TreeEntry::ScatterVectorize)
17412 return Cost;
17413
17414 // Estimate cost of GEPs since this tree node is a terminator.
17415 SmallVector<Value *> PointerOps(VL.size());
17416 for (auto [I, V] : enumerate(VL))
17417 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
17418 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
17419 }
17420 case Instruction::Store: {
17421 bool IsReorder = !E->ReorderIndices.empty();
17422 auto GetScalarCost = [=](unsigned Idx) {
17423 auto *VI = cast<StoreInst>(VL[Idx]);
17424 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
17425 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
17426 VI->getAlign(), VI->getPointerAddressSpace(),
17427 CostKind, OpInfo, VI);
17428 };
17429 auto *BaseSI =
17430 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
17431 auto GetVectorCost = [=](InstructionCost CommonCost) {
17432 // We know that we can merge the stores. Calculate the cost.
17433 InstructionCost VecStCost;
17434 if (E->State == TreeEntry::StridedVectorize) {
17435 Align CommonAlignment =
17436 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
17437 VecStCost = TTI->getMemIntrinsicInstrCost(
17438 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
17439 VecTy, BaseSI->getPointerOperand(),
17440 /*VariableMask=*/false, CommonAlignment),
17441 CostKind);
17442 } else {
17443 assert(E->State == TreeEntry::Vectorize &&
17444 "Expected either strided or consecutive stores.");
17445 if (unsigned Factor = E->getInterleaveFactor()) {
17446 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
17447 "No reused shuffles expected");
17448 CommonCost = 0;
17449 VecStCost = TTI->getInterleavedMemoryOpCost(
17450 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
17451 BaseSI->getPointerAddressSpace(), CostKind);
17452 } else {
17453 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
17454 VecStCost = TTI->getMemoryOpCost(
17455 Instruction::Store, VecTy, BaseSI->getAlign(),
17456 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
17457 }
17458 }
17459 return VecStCost + CommonCost;
17460 };
17461 SmallVector<Value *> PointerOps(VL.size());
17462 for (auto [I, V] : enumerate(VL)) {
17463 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
17464 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
17465 }
17466
17467 return GetCostDiff(GetScalarCost, GetVectorCost) +
17468 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
17469 }
17470 case Instruction::Call: {
17471 auto GetScalarCost = [&](unsigned Idx) {
17472 auto *CI = cast<CallInst>(UniqueValues[Idx]);
17475 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
17476 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
17477 }
17478 return TTI->getCallInstrCost(CI->getCalledFunction(),
17480 CI->getFunctionType()->params(), CostKind);
17481 };
17482 auto GetVectorCost = [=](InstructionCost CommonCost) {
17483 auto *CI = cast<CallInst>(VL0);
17486 CI, ID, VecTy->getNumElements(),
17487 It != MinBWs.end() ? It->second.first : 0, TTI);
17488 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
17489 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
17490 };
17491 return GetCostDiff(GetScalarCost, GetVectorCost);
17492 }
17493 case Instruction::ShuffleVector: {
17494 if (!SLPReVec || E->isAltShuffle())
17495 assert(E->isAltShuffle() &&
17496 ((Instruction::isBinaryOp(E->getOpcode()) &&
17497 Instruction::isBinaryOp(E->getAltOpcode())) ||
17498 (Instruction::isCast(E->getOpcode()) &&
17499 Instruction::isCast(E->getAltOpcode())) ||
17500 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
17501 "Invalid Shuffle Vector Operand");
17502 // Try to find the previous shuffle node with the same operands and same
17503 // main/alternate ops.
17504 auto TryFindNodeWithEqualOperands = [=]() {
17505 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17506 if (TE.get() == E)
17507 break;
17508 if (TE->hasState() && TE->isAltShuffle() &&
17509 ((TE->getOpcode() == E->getOpcode() &&
17510 TE->getAltOpcode() == E->getAltOpcode()) ||
17511 (TE->getOpcode() == E->getAltOpcode() &&
17512 TE->getAltOpcode() == E->getOpcode())) &&
17513 TE->hasEqualOperands(*E))
17514 return true;
17515 }
17516 return false;
17517 };
17518 auto GetScalarCost = [&](unsigned Idx) {
17519 if (isa<PoisonValue>(UniqueValues[Idx]))
17521
17522 auto *VI = cast<Instruction>(UniqueValues[Idx]);
17523 assert(E->getMatchingMainOpOrAltOp(VI) &&
17524 "Unexpected main/alternate opcode");
17525 (void)E;
17526 return TTI->getInstructionCost(VI, CostKind);
17527 };
17528 // Need to clear CommonCost since the final shuffle cost is included into
17529 // vector cost.
17530 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
17531 // VecCost is equal to sum of the cost of creating 2 vectors
17532 // and the cost of creating shuffle.
17533 InstructionCost VecCost = 0;
17534 if (TryFindNodeWithEqualOperands()) {
17535 LLVM_DEBUG({
17536 dbgs() << "SLP: diamond match for alternate node found.\n";
17537 E->dump();
17538 });
17539 // No need to add new vector costs here since we're going to reuse
17540 // same main/alternate vector ops, just do different shuffling.
17541 } else if (Instruction::isBinaryOp(E->getOpcode())) {
17542 VecCost =
17543 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
17544 VecCost +=
17545 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
17546 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
17547 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
17548 VecCost = TTIRef.getCmpSelInstrCost(
17549 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
17550 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
17551 VL0);
17552 VecCost += TTIRef.getCmpSelInstrCost(
17553 E->getOpcode(), VecTy, MaskTy,
17554 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
17555 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
17556 E->getAltOp());
17557 } else {
17558 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
17559 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
17560 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
17561 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
17562 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
17563 unsigned SrcBWSz =
17564 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
17565 if (SrcIt != MinBWs.end()) {
17566 SrcBWSz = SrcIt->second.first;
17567 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
17568 SrcTy = getWidenedType(SrcSclTy, VL.size());
17569 }
17570 if (BWSz <= SrcBWSz) {
17571 if (BWSz < SrcBWSz)
17572 VecCost =
17573 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
17575 LLVM_DEBUG({
17576 dbgs()
17577 << "SLP: alternate extension, which should be truncated.\n";
17578 E->dump();
17579 });
17580 return VecCost;
17581 }
17582 }
17583 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
17585 VecCost +=
17586 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
17588 }
17589 SmallVector<int> Mask;
17590 E->buildAltOpShuffleMask(
17591 [&](Instruction *I) {
17592 assert(E->getMatchingMainOpOrAltOp(I) &&
17593 "Unexpected main/alternate opcode");
17594 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
17595 *TLI);
17596 },
17597 Mask);
17599 FinalVecTy, Mask, CostKind);
17600 // Patterns like [fadd,fsub] can be combined into a single instruction
17601 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
17602 // need to take into account their order when looking for the most used
17603 // order.
17604 unsigned Opcode0 = E->getOpcode();
17605 unsigned Opcode1 = E->getAltOpcode();
17606 SmallBitVector OpcodeMask(
17607 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
17608 // If this pattern is supported by the target then we consider the
17609 // order.
17610 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
17611 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
17612 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
17613 return AltVecCost < VecCost ? AltVecCost : VecCost;
17614 }
17615 // TODO: Check the reverse order too.
17616 return VecCost;
17617 };
17618 if (SLPReVec && !E->isAltShuffle())
17619 return GetCostDiff(
17620 GetScalarCost, [&](InstructionCost) -> InstructionCost {
17621 // If a group uses mask in order, the shufflevector can be
17622 // eliminated by instcombine. Then the cost is 0.
17624 "Not supported shufflevector usage.");
17625 auto *SV = cast<ShuffleVectorInst>(VL.front());
17626 unsigned SVNumElements =
17627 cast<FixedVectorType>(SV->getOperand(0)->getType())
17628 ->getNumElements();
17629 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
17630 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
17631 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
17632 int NextIndex = 0;
17633 if (!all_of(Group, [&](Value *V) {
17635 "Not supported shufflevector usage.");
17636 auto *SV = cast<ShuffleVectorInst>(V);
17637 int Index;
17638 [[maybe_unused]] bool IsExtractSubvectorMask =
17639 SV->isExtractSubvectorMask(Index);
17640 assert(IsExtractSubvectorMask &&
17641 "Not supported shufflevector usage.");
17642 if (NextIndex != Index)
17643 return false;
17644 NextIndex += SV->getShuffleMask().size();
17645 return true;
17646 }))
17647 return ::getShuffleCost(
17649 calculateShufflevectorMask(E->Scalars));
17650 }
17651 return TTI::TCC_Free;
17652 });
17653 return GetCostDiff(GetScalarCost, GetVectorCost);
17654 }
17655 case Instruction::Freeze:
17656 return CommonCost;
17657 default:
17658 llvm_unreachable("Unknown instruction");
17659 }
17660}
17661
// Returns true if the tree, which must consist of exactly one or two entries,
// is accepted as vectorizable as a whole even though it is tiny. Any other
// tree size yields false. \p ForReduction additionally admits a single gather
// root in the one-entry case (see the size-1 check below).
17662bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
17663 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
17664 << VectorizableTree.size() << " is fully vectorizable .\n");
17665
// Predicate on a gather entry \p TE: none of its scalars may be in EphValues,
// and it must match at least one of the alternatives below (all constants, a
// splat, fewer than \p Limit scalars, an ExtractElement-state entry checked
// with isFixedVectorShuffle, a non-alternate Load-state entry, or an entry
// containing at least one LoadInst).
17666 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
17667 SmallVector<int> Mask;
17668 return TE->isGather() &&
17669 !any_of(TE->Scalars,
17670 [this](Value *V) { return EphValues.contains(V); }) &&
17671 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
17672 TE->Scalars.size() < Limit ||
17673 (((TE->hasState() &&
17674 TE->getOpcode() == Instruction::ExtractElement) ||
// NOTE(review): listing line 17675 is absent here and the parentheses of this
// expression do not balance without it - restore it from upstream before
// relying on the exact grouping of the ExtractElement/shuffle alternative.
17676 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
17677 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
17678 !TE->isAltShuffle()) ||
17679 any_of(TE->Scalars, IsaPred<LoadInst>));
17680 };
17681
17682 // We only handle trees of heights 1 and 2.
// Single entry: accept Vectorize/StridedVectorize/CompressVectorize roots, or,
// for reductions only, a gather root that passes AreVectorizableGathers and
// has a vector factor greater than 2. The root's own scalar count is passed as
// Limit, so the "fewer than Limit scalars" alternative can never hold here.
17683 if (VectorizableTree.size() == 1 &&
17684 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
17685 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
17686 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
17687 (ForReduction &&
17688 AreVectorizableGathers(VectorizableTree[0].get(),
17689 VectorizableTree[0]->Scalars.size()) &&
17690 VectorizableTree[0]->getVectorFactor() > 2)))
17691 return true;
17692
17693 if (VectorizableTree.size() != 2)
17694 return false;
17695
17696 // Handle splat and all-constants stores. Also try to vectorize tiny trees
17697 // with the second gather nodes if they have less scalar operands rather than
17698 // the initial tree element (may be profitable to shuffle the second gather)
17699 // or they are extractelements, which form shuffle.
17700 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
17701 AreVectorizableGathers(VectorizableTree[1].get(),
17702 VectorizableTree[0]->Scalars.size()))
17703 return true;
17704
17705 // Gathering cost would be too much for tiny trees.
// Reject a gathered root outright; reject a gathered second entry unless the
// root is a ScatterVectorize, StridedVectorize or CompressVectorize node.
17706 if (VectorizableTree[0]->isGather() ||
17707 (VectorizableTree[1]->isGather() &&
17708 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
17709 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
17710 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
17711 return false;
17712
17713 return true;
17714}
17715
// Decides whether the current vectorizable tree should be rejected.
// Returns true when the tree is to be skipped: it is empty, the
// VectorizedGraphs debug counter says not to execute, it matches one of the
// shape heuristics below, or it is smaller than MinTreeSize and none of the
// tiny-tree exceptions at the end applies. Returns false when vectorization
// may proceed.
// \p ForReduction turns off every check inside the !ForReduction guards,
// narrows the 3-node check to roots with a vector factor of at most 2, and is
// forwarded to isFullyVectorizableTinyTree().
17716bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// Honor the VectorizedGraphs debug counter: when it says not to execute,
// report the tree as not vectorizable.
17717 if (!DebugCounter::shouldExecute(VectorizedGraphs))
17718 return true;
17719
17720 // Graph is empty - do nothing.
17721 if (VectorizableTree.empty()) {
17722 assert(ExternalUses.empty() && "We shouldn't have any external users");
17723
17724 return true;
17725 }
17726
17727 // Cache values from the root node and the cost-threshold options to avoid
17728 // re-querying them inside hot predicates below.
17729 const unsigned TreeSize = VectorizableTree.size();
17730 const TreeEntry &Front = *VectorizableTree.front();
17731 const bool FrontIsGather = Front.isGather();
17732 const bool FrontHasState = Front.hasState();
17733 const unsigned FrontOpcode = FrontHasState ? Front.getOpcode() : 0u;
17734 const bool ThresholdSet = SLPCostThreshold.getNumOccurrences() > 0;
17735 const bool ThresholdNonNegative = SLPCostThreshold >= 0;
17736
// Shared size thresholds used by the heuristics below.
17737 constexpr unsigned Limit = 4;
17738 constexpr unsigned LargeTree = 20;
17739 constexpr unsigned LimitTreeSize = 36;
17740
17741 // The remaining size-1/size-<=MinTreeSize early bail-outs only apply to
17742 // non-reduction trees; group them under a single guard to avoid 3 separate
17743 // !ForReduction short-circuits when reducing.
17744 if (!ForReduction) {
17745 // Single gather node: bail out for ExtractElement or any node containing a
17746 // real Instruction scalar.
17747 if (TreeSize == 1 && FrontIsGather) {
17748 if (FrontHasState && FrontOpcode == Instruction::ExtractElement)
17749 return true;
17750 if (any_of(Front.Scalars, IsaPred<Instruction>))
17751 return true;
17752 }
// A tree of at most MinTreeSize entries made up solely of gather and
// split-vectorize nodes is skipped.
17753 if (TreeSize <= MinTreeSize &&
17754 all_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
17755 return TE->isGather() || TE->State == TreeEntry::SplitVectorize;
17756 }))
17757 return true;
// With a negative cost threshold, a lone ExtractElement-state node is skipped
// when its vector factor is 2 or when no instruction scalar has all of its
// users vectorized.
17758 if (TreeSize == 1 && SLPCostThreshold < 0 && FrontHasState &&
17759 FrontOpcode == Instruction::ExtractElement &&
17760 (Front.getVectorFactor() == 2 ||
17761 all_of(
17762 Front.Scalars,
17763 [&](Value *V) {
17764 auto *I = dyn_cast<Instruction>(V);
17765 return !I || !areAllUsersVectorized(I, UserIgnoreList);
17766 })))
17767 return true;
17768 }
17769 // No need to vectorize inserts of gathered values.
17770 if (TreeSize == 2 && isa<InsertElementInst>(Front.Scalars[0]) &&
17771 VectorizableTree[1]->isGather() &&
17772 (VectorizableTree[1]->getVectorFactor() <= 2 ||
17773 !(isSplat(VectorizableTree[1]->Scalars) ||
17774 allConstant(VectorizableTree[1]->Scalars))))
17775 return true;
17776
17777 // The tree with only 3 nodes, where 2 last are gathers/buildvectors, not
17778 // profitable for vectorization.
17779 if (TreeSize == 3 && SLPCostThreshold == 0 &&
17780 (!ForReduction || Front.getVectorFactor() <= 2) &&
17781 all_of(ArrayRef(VectorizableTree).drop_front(),
17782 [&](const std::unique_ptr<TreeEntry> &TE) {
17783 return TE->isGather() && TE->getVectorFactor() <= Limit &&
17784 !all_of(
17785 TE->Scalars,
// NOTE(review): listing line 17786 (the predicate argument that closes this
// inner all_of call) is absent here - restore it from upstream.
17787 }))
17788 return true;
17789
17790 // All remaining bail-out heuristics require !ForReduction. Group them under
17791 // a single guard so reduction trees skip them with one branch instead of one
17792 // per check.
17793 if (!ForReduction) {
17794 // If the graph includes only PHI nodes and gathers, it is definitely not
17795 // profitable for the vectorization, we can skip it, if the cost threshold
17796 // is default. The cost of vectorized PHI nodes is almost always 0 + the
17797 // cost of gathers/buildvectors.
17798 if (!ThresholdSet &&
17799 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17800 const bool IsGather = TE->isGather();
17801 const bool HasState = TE->hasState();
17802 const unsigned Op = HasState ? TE->getOpcode() : 0u;
17803 if (IsGather && (!HasState || Op != Instruction::ExtractElement) &&
17804 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit)
17805 return true;
17806 return HasState && Op == Instruction::PHI;
17807 }))
17808 return true;
17809
17810 // Do not vectorize small tree of phis only, if all vector phis are also
17811 // gathered.
17812 if (ThresholdSet && TreeSize <= Limit) {
// Set as a side effect of the Compatible predicate when a PHI entry in the
// Vectorize state is visited.
17813 bool HasVectorPhi = false;
17814 auto Compatible = [&](const std::unique_ptr<TreeEntry> &TE) {
17815 const bool IsGather = TE->isGather();
17816 const bool HasState = TE->hasState();
17817 const unsigned Op = HasState ? TE->getOpcode() : 0u;
17818 if (IsGather && (!HasState || Op != Instruction::ExtractElement) &&
17819 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit)
17820 return true;
17821 if (!HasState)
17822 return false;
17823 if (Op == Instruction::InsertElement)
17824 return true;
17825 if (Op != Instruction::PHI)
17826 return false;
17827 if (TE->State == TreeEntry::Vectorize)
17828 HasVectorPhi = true;
17829 return all_of(TE->Scalars, [&](Value *V) {
17830 return isa<PoisonValue>(V) || MustGather.contains(V);
17831 });
17832 };
17833 if (all_of(VectorizableTree, Compatible) && HasVectorPhi)
17834 return true;
17835 }
17836
17837 // PHI nodes only and gathers cannot be vectorized, skip.
17838 if (ThresholdNonNegative) {
17839 const bool IsLargeTree = TreeSize >= LargeTree;
17840 bool HasSingleLoad = false;
17841 if (all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17842 const bool IsGather = TE->isGather();
17843 const bool HasState = TE->hasState();
17844 const unsigned Op = HasState ? TE->getOpcode() : 0u;
17845 // HasSingleLoad/PrevLoad are only consulted in the
17846 // IsLargeTree branch; skip the bookkeeping otherwise.
17847 if (IsLargeTree) {
17848 const bool PrevLoad = HasSingleLoad;
17849 HasSingleLoad |=
17850 HasState && !IsGather &&
17851 (Op == Instruction::Load || TE->hasCopyableElements()) &&
17852 (TE->getVectorFactor() > 2 || TE->ReorderIndices.empty());
17853 if (HasState) {
17854 if (Op == Instruction::PHI)
17855 return true;
17856 if (TE->getVectorFactor() <= Limit &&
17857 (Op == Instruction::Store ||
17858 (Op == Instruction::Load && !PrevLoad)))
17859 return true;
17860 }
17861 } else if (HasState && Op == Instruction::PHI) {
17862 return true;
17863 }
17864 return IsGather && (!HasState || Op != Instruction::ExtractElement);
17865 }))
17866 return true;
17867
17868 // Single non-phi vector node - skip the tree.
17869 if (TreeSize >= 5 && Front.getVectorFactor() <= 2 &&
17870 Front.Scalars.front()->getType()->isIntegerTy()) {
17871 bool VectorNodeFound = false;
17872 bool AnyNonConst = false;
17873 if (all_of(VectorizableTree,
17874 [&](const std::unique_ptr<TreeEntry> &TE) {
17875 if (TE->State == TreeEntry::Vectorize && TE->hasState()) {
17876 const unsigned Op = TE->getOpcode();
17877 if (Op == Instruction::PHI ||
17878 !TE->ReorderIndices.empty())
17879 return true;
17880 if (VectorNodeFound)
17881 return false;
17882 VectorNodeFound = true;
17883 return true;
17884 }
17885 // Once AnyNonConst is true, skip the O(n) allConstant
17886 // walk for subsequent entries.
17887 if (!AnyNonConst)
17888 AnyNonConst = !allConstant(TE->Scalars);
17889 return TE->isGather() ||
17890 TE->State == TreeEntry::SplitVectorize;
17891 }) &&
17892 AnyNonConst)
17893 return true;
17894 }
17895 }
17896
17897 // Common predicate for "phis, buildvectors, split nodes and small nodes
17898 // with reuses" used by the two checks below. Cheap checks are evaluated
17899 // before expensive Scalars walks.
17900 auto IsBenignNode = [&](const TreeEntry &TE) {
17901 if (TE.State == TreeEntry::SplitVectorize)
17902 return true;
17903 const bool IsGather = TE.isGather();
17904 const bool HasState = TE.hasState();
17905 if (HasState) {
17906 const unsigned Op = TE.getOpcode();
17907 if (Op == Instruction::PHI)
17908 return true;
17909 const unsigned ScalarsSize = TE.Scalars.size();
17910 if (TE.Idx == 0 && ScalarsSize == 2 && Op == Instruction::ICmp &&
17911 TreeSize > LimitTreeSize)
17912 return true;
17913 if (ScalarsSize == 2 &&
17914 (!TE.ReuseShuffleIndices.empty() || !TE.ReorderIndices.empty() ||
17915 TE.isAltShuffle()))
17916 return true;
17917 if (TE.hasCopyableElements() &&
17918 static_cast<unsigned>(count_if(
17919 TE.Scalars, IsaPred<PHINode, Constant>)) >= ScalarsSize / 2)
17920 return true;
17921 }
17922 return IsGather && none_of(TE.Scalars, IsaPred<ExtractElementInst>);
17923 };
17924
17925 // If the tree contains only phis, buildvectors, split nodes and
17926 // small nodes with reuses, we can skip it.
17927 if (!ThresholdSet) {
// Non-gather Load/Store entries are collected here instead of being tested
// with IsBenignNode; the second half of the condition below decides whether
// their presence still permits skipping the tree.
17928 SmallVector<const TreeEntry *> StoreLoadNodes;
17929 unsigned NumGathers = 0;
17930 if (all_of(VectorizableTree,
17931 [&](const std::unique_ptr<TreeEntry> &TE) {
17932 const bool IsGather = TE->isGather();
17933 if (!IsGather && TE->hasState()) {
17934 const unsigned Op = TE->getOpcode();
17935 if (Op == Instruction::Load || Op == Instruction::Store) {
17936 StoreLoadNodes.push_back(TE.get());
17937 return true;
17938 }
17939 }
17940 if (IsGather)
17941 ++NumGathers;
17942 return IsBenignNode(*TE);
17943 }) &&
17944 (StoreLoadNodes.empty() ||
17945 (TreeSize > LimitTreeSize * StoreLoadNodes.size() &&
17946 (NumGathers > 0 ||
17947 none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
17948 return TE->getOpcode() == Instruction::Store ||
17949 all_of(TE->Scalars, [&](Value *V) {
17950 return !isa<LoadInst>(V) ||
17951 areAllUsersVectorized(cast<Instruction>(V));
17952 });
17953 })))))
17954 return true;
17955 }
17956
17957 // If the tree contains only phis, buildvectors, split nodes and
17958 // small nodes with reuses, we can skip it.
// This variant applies to trees larger than LimitTreeSize and tolerates at
// most one non-gather, non-split, non-PHI entry with state (VectorNode).
17959 if (ThresholdNonNegative && TreeSize > LimitTreeSize) {
17960 const TreeEntry *VectorNode = nullptr;
17961 if (all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17962 if (!TE->isGather() && TE->hasState() &&
17963 TE->State != TreeEntry::SplitVectorize &&
17964 TE->getOpcode() != Instruction::PHI) {
17965 if (VectorNode)
17966 return false;
17967 VectorNode = TE.get();
17968 return true;
17969 }
17970 return IsBenignNode(*TE);
17971 }))
17972 return true;
17973 }
17974
17975 // If the tree contains only buildvector, 2 non-buildvectors (with root
17976 // user tree node) and other buildvectors, we can skip it.
17977 if (ThresholdSet && TreeSize >= Limit &&
17978 Front.State == TreeEntry::SplitVectorize &&
17979 count_if(ArrayRef(VectorizableTree).drop_front(),
17980 [](const std::unique_ptr<TreeEntry> &TE) {
17981 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
17982 TE->UserTreeIndex.UserTE->Idx == 0;
17983 }) == 2)
17984 return true;
17985
17986 // If the tree contains only vectorization of the phi node from the
17987 // buildvector - skip it.
17988 if (ThresholdSet && TreeSize > 2 && Front.State == TreeEntry::Vectorize &&
17989 FrontOpcode == Instruction::InsertElement &&
17990 VectorizableTree[1]->State == TreeEntry::Vectorize &&
17991 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
17992 all_of(ArrayRef(VectorizableTree).drop_front(2),
17993 [](const std::unique_ptr<TreeEntry> &TE) {
17994 return TE->isGather();
17995 }))
17996 return true;
17997 }
17998
17999 // We can vectorize the tree if its size is greater than or equal to the
18000 // minimum size specified by the MinTreeSize command line option.
18001 if (TreeSize >= MinTreeSize)
18002 return false;
18003
18004 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
18005 // can vectorize it if we can prove it fully vectorizable.
18006 if (isFullyVectorizableTinyTree(ForReduction))
18007 return false;
18008
18009 // Check if any of the gather node forms an insertelement buildvector
18010 // somewhere. TreeSize >= 1 is guaranteed, so the multi-node case reduces to
18011 // a simple TreeSize > 1 short-circuit.
18012 const bool IsAllowedSingleBVNode =
18013 TreeSize > 1 || (FrontHasState && !Front.isAltShuffle() &&
18014 FrontOpcode != Instruction::PHI &&
18015 FrontOpcode != Instruction::GetElementPtr &&
18016 allSameBlock(Front.Scalars));
18017 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18018 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
18019 return isa<ExtractElementInst, Constant>(V) ||
18020 (IsAllowedSingleBVNode &&
18021 !V->hasNUsesOrMore(UsesLimit) &&
18022 any_of(V->users(), IsaPred<InsertElementInst>));
18023 });
18024 }))
18025 return false;
18026
// A gathered alternate-opcode node at the end of the tree may still let the
// tree through, depending on the insert-only scalarization overhead that TTI
// reports for its widened type.
18027 const TreeEntry &Back = *VectorizableTree.back();
18028 if (Back.isGather() && Back.hasState() && Back.isAltShuffle()) {
18029 const unsigned BackVF = Back.getVectorFactor();
18030 if (BackVF > 2 && allSameBlock(Back.Scalars) &&
18031 !Back.Scalars.front()->getType()->isVectorTy() &&
18032 TTI->getScalarizationOverhead(
18033 getWidenedType(Back.Scalars.front()->getType(), BackVF),
18034 APInt::getAllOnes(BackVF),
18035 /*Insert=*/true, /*Extract=*/false,
// NOTE(review): listing line 18036 (the remaining argument(s) of this call and
// the comparison that completes the condition) is absent here - restore it
// from upstream.
18037 return false;
18038 }
18039
18040 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
18041 // vectorizable.
18042 return true;
18043}
18044
// NOTE(review): the lines that open this function (listing lines 18045-18046:
// its signature and the condition guarding the first block below) are absent
// from this listing - presumably this is BoUpSLP::isTreeNotExtendable; confirm
// against upstream before editing.
//
// Inside the guarded block: return true only when the root entry is a
// non-power-of-2 vector, the canonical graph has at most SmallTree entries,
// and exactly one entry past the canonical graph is a gathered Load-state node
// whose scalars are not all in the same block. Otherwise return false.
18047 constexpr unsigned SmallTree = 3;
18048 if (VectorizableTree.front()->isNonPowOf2Vec() &&
18049 getCanonicalGraphSize() <= SmallTree &&
18050 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
18051 [](const std::unique_ptr<TreeEntry> &TE) {
18052 return TE->isGather() && TE->hasState() &&
18053 TE->getOpcode() == Instruction::Load &&
18054 !allSameBlock(TE->Scalars);
18055 }) == 1)
18056 return true;
18057 return false;
18058 }
// Walk every tree entry. Any split-vectorize entry, or any gather entry that
// matches the condition below, makes the result false immediately. Res becomes
// true once a remaining gather entry is neither a splat nor all-constant.
18059 bool Res = false;
18060 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
18061 TreeEntry &E = *VectorizableTree[Idx];
18062 if (E.State == TreeEntry::SplitVectorize)
18063 return false;
// Only gather entries are examined further.
18064 if (!E.isGather())
18065 continue;
18066 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
18067 (!E.hasState() &&
// NOTE(review): listing line 18068 (the second operand of this && and its
// closing parenthesis) is absent here - restore it from upstream.
18069 (isa<ExtractElementInst>(E.Scalars.front()) &&
18070 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
18071 return false;
18072 if (isSplat(E.Scalars) || allConstant(E.Scalars))
18073 continue;
18074 Res = true;
18075 }
18076 return Res;
18077}
18078
18080 // Walk the vectorizable tree from the root towards its leaves, tracking
18081 // which vectorized operand values would be live across each tree edge
18082 // (i.e. between the last instruction of an operand entry and the last
18083 // instruction of its user entry). When the live range crosses a call
18084 // instruction that is not part of the vectorized tree, query TTI for the
18085 // cost of keeping the value live across it (for example, if spills and
18086 // fills are required).
18087
18088 const TreeEntry *Root = VectorizableTree.front().get();
18089 if (Root->isGather())
18090 return 0;
18091
18092 InstructionCost Cost = 0;
18094 EntriesToOperands;
18095 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
18096 SmallPtrSet<const Instruction *, 8> LastInstructions;
18097 SmallPtrSet<const TreeEntry *, 8> ScalarOrPseudoEntries;
18098 for (const auto &TEPtr : VectorizableTree) {
18099 if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
18100 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
18101 TEPtr->CombinedOp == TreeEntry::ReducedBitcastLoads ||
18102 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
18103 TEPtr->CombinedOp == TreeEntry::ReducedCmpBitcast) {
18104 ScalarOrPseudoEntries.insert(TEPtr.get());
18105 continue;
18106 }
18107 if (!TEPtr->isGather()) {
18108 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
18109 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
18110 LastInstructions.insert(LastInst);
18111 }
18112 if (TEPtr->UserTreeIndex)
18113 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
18114 }
18115
18116 // Cache NoCallIntrinsicOrDoesNotReturn results - the same intrinsic call may
18117 // be queried many times during the spill cost scan, and each computation
18118 // involves two potentially expensive TTI virtual calls.
18119 SmallDenseMap<const IntrinsicInst *, bool> NoCallIntrinsicCache;
18120 auto NoCallIntrinsicOrDoesNotReturn = [this, &NoCallIntrinsicCache](
18121 const Instruction *I) {
18122 const auto *CB = dyn_cast<CallBase>(I);
18123 if (!CB)
18124 return false;
18125 if (CB->doesNotReturn())
18126 return true;
18127 const auto *II = dyn_cast<IntrinsicInst>(CB);
18128 if (!II)
18129 return false;
18130 if (II->isAssumeLikeIntrinsic())
18131 return true;
18132 auto [It, Inserted] = NoCallIntrinsicCache.try_emplace(II);
18133 if (!Inserted)
18134 return It->second;
18135 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
18136 InstructionCost IntrCost =
18137 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
18138 InstructionCost CallCost = TTI->getCallInstrCost(
18139 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
18140 bool Res = IntrCost < CallCost;
18141 It->second = Res;
18142 return Res;
18143 };
18144
18145 // Maps last instruction in the entry to the last instruction for the one of
18146 // operand entries and the flag. If the flag is true, there are no calls in
18147 // between these instructions.
18149 CheckedInstructions;
18150 unsigned Budget = 0;
18151 const unsigned BudgetLimit =
18152 ScheduleRegionSizeBudget / VectorizableTree.size();
18153 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
18154 const Instruction *Last) {
18155 assert(First->getParent() == Last->getParent() &&
18156 "Expected instructions in same block.");
18157 if (auto It = CheckedInstructions.find(Last);
18158 It != CheckedInstructions.end()) {
18159 const Instruction *Checked = It->second.getPointer();
18160 const bool NoCallsInCachedRange = It->second.getInt() != 0;
18161 if (Checked == First)
18162 return NoCallsInCachedRange;
18163 if (Checked->comesBefore(First))
18164 // In every cached state (full clean scan, call-found, or
18165 // budget-exhausted) the region strictly above `Checked` up to `Last`
18166 // was inspected and proved call-free. Since `First` is above
18167 // `Checked`, the queried range [First, Last] is contained in that
18168 // call-free region, regardless of whether bit is 0 or 1.
18169 return true;
18170 Last = Checked;
18171 } else if (Last == First || Last->comesBefore(First)) {
18172 // Empty range.
18173 return true;
18174 }
18176 ++First->getIterator().getReverse(),
18177 PrevInstIt =
18178 Last->getIterator().getReverse();
18179 SmallVector<const Instruction *> LastInstsInRange;
18180 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
18181 // Debug information does not impact spill cost.
18182 // Vectorized calls, represented as vector intrinsics, do not impact spill
18183 // cost.
18184 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
18185 CB && !NoCallIntrinsicOrDoesNotReturn(CB) && !isVectorized(CB)) {
18186 for (const Instruction *LastInst : LastInstsInRange)
18187 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
18188 return false;
18189 }
18190 if (LastInstructions.contains(&*PrevInstIt))
18191 LastInstsInRange.push_back(&*PrevInstIt);
18192
18193 ++PrevInstIt;
18194 ++Budget;
18195 }
18196 // If we reached the scan's lower bound (`PrevInstIt == InstIt`) then the
18197 // whole [First, Last] range was inspected and found call-free, even if
18198 // Budget just overflowed at the very last step; do not mislabel such a
18199 // completed scan as "has call".
18200 const bool Completed = PrevInstIt == InstIt;
18201 const bool NoCallsInRange = Completed || Budget <= BudgetLimit;
18202 for (const Instruction *LastInst : LastInstsInRange)
18203 CheckedInstructions.try_emplace(
18204 LastInst, Completed ? First : &*PrevInstIt, NoCallsInRange ? 1 : 0);
18205 return NoCallsInRange;
18206 };
18207 auto AddCosts = [&](const TreeEntry *Op) {
18208 if (ScalarOrPseudoEntries.contains(Op))
18209 return;
18210 Type *ScalarTy = Op->Scalars.front()->getType();
18211 auto It = MinBWs.find(Op);
18212 if (It != MinBWs.end())
18213 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
18214 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
18215 uint64_t Scale = getScaleToLoopIterations(*Op);
18216 InstructionCost KeepLiveCost = TTI->getCostOfKeepingLiveOverCall(VecTy);
18217 KeepLiveCost *= Scale;
18218 Cost += KeepLiveCost;
18219 if (ScalarTy->isVectorTy()) {
18220 // Handle revec dead vector instructions.
18221 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy) *
18222 Scale;
18223 }
18224 };
18225 // Memoize the relationship between blocks, i.e. if there is (at least one)
18226 // non-vectorized call between the blocks. This allows to skip the analysis of
18227 // the same block paths multiple times.
18229 ParentOpParentToPreds;
18230 // Memoize whether a basic block contains a non-terminator no-return call.
18231 // Such blocks are dead-end paths in normal control flow (execution does not
18232 // exit them past the no-return call), so the block is excluded from the
18233 // spill cost analysis. Terminator no-return calls (invoke/callbr) are not
18234 // block-killing because they still have live CFG successors (e.g. the
18235 // unwind destination of an invoke).
18236 SmallDenseMap<const BasicBlock *, bool> BlockHasNoReturnCallCache;
18237 auto BlockHasNoReturnCall = [&](const BasicBlock *BB) {
18238 auto [It, Inserted] = BlockHasNoReturnCallCache.try_emplace(BB, false);
18239 if (!Inserted)
18240 return It->second;
18241 for (const Instruction &I : *BB) {
18242 const auto *CB = dyn_cast<CallBase>(&I);
18243 if (CB && CB->doesNotReturn() && !CB->isTerminator()) {
18244 It->second = true;
18245 return true;
18246 }
18247 }
18248 return false;
18249 };
18250 // Memoize whether a loop's body (all blocks of the loop, including
18251 // sub-loops) contains any non-vec call.
18252 SmallDenseMap<const Loop *, bool> LoopBodyHasNonVecCall;
18253 auto LoopBodyHasCall = [&](const Loop *L) {
18254 if (auto It = LoopBodyHasNonVecCall.find(L);
18255 It != LoopBodyHasNonVecCall.end())
18256 return It->second;
18257 for (BasicBlock *BB : L->blocks()) {
18259 continue;
18260 // Blocks containing a no-return call are dead-end paths and never
18261 // actually flow back through the loop's back-edge, so their calls do
18262 // not keep loop-invariant vector values live across calls.
18263 if (BlockHasNoReturnCall(BB))
18264 continue;
18265 for (const Instruction &I : *BB) {
18266 const auto *CB = dyn_cast<CallBase>(&I);
18267 if (!CB || NoCallIntrinsicOrDoesNotReturn(CB) || isVectorized(CB))
18268 continue;
18269 LoopBodyHasNonVecCall.try_emplace(L, true);
18270 return true;
18271 }
18272 }
18273 LoopBodyHasNonVecCall.try_emplace(L, false);
18274 return false;
18275 };
18276 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
18277 BasicBlock *OpParent) {
18278 auto Key = std::make_pair(Root, OpParent);
18279 if (auto It = ParentOpParentToPreds.find(Key);
18280 It != ParentOpParentToPreds.end())
18281 return It->second;
18282 bool Res = false;
18283 scope_exit Cleanup([&]() { ParentOpParentToPreds.try_emplace(Key, Res); });
18284 // If Op is loop-invariant, a call anywhere in the loop body forces a spill,
18285 // even when a call-free forward path from Root back to OpParent exists on
18286 // the first iteration. Find the outermost such enclosing loop and reject if
18287 // its body contains a non-vec call.
18288 const Loop *L = LI->getLoopFor(Root);
18289 const Loop *Outermost = nullptr;
18290 while (L && !L->contains(OpParent)) {
18291 Outermost = L;
18292 L = L->getParentLoop();
18293 }
18294 if (Outermost && LoopBodyHasCall(Outermost))
18295 return Res;
18297 if (Pred)
18298 Worklist.push_back(Pred);
18299 else
18300 Worklist.append(pred_begin(Root), pred_end(Root));
18302 // With "at least one call-free path" semantics we can only reliably
18303 // memoize the exact (Root, OpParent) query. Pairs for intermediate
18304 // blocks that were visited during the BFS are not necessarily
18305 // call-free-reachable to OpParent themselves - we may have reached
18306 // OpParent through a *sibling* path that bypassed them.
18307 // We return `true` (no spill cost) if at least one backward path from
18308 // some predecessor of Root back to OpParent is call-free. Only when
18309 // *every* such path goes through a non-vec call do we charge the spill
18310 // cost: only then is it actually necessary to keep the vectorized value
18311 // live across a call and therefore spill/reload it.
18312 //
18313 // A BB is only explored further (its predecessors added to the worklist)
18314 // when it is itself call-free and not strictly dominated by Root (blocks
18315 // dominated by Root are only reachable via loop back-edges - they sit
18316 // *after* Root in forward execution and must not be counted).
18317 //
18318 // If we ever pop OpParent from the worklist, we have reached it through
18319 // a chain of call-free, non-dominated blocks: a call-free path exists
18320 // and we return true. If the worklist is exhausted without reaching
18321 // OpParent, every admissible path is blocked by a call and we return
18322 // false so the caller charges the spill cost.
18323 while (!Worklist.empty()) {
18324 BasicBlock *BB = Worklist.pop_back_val();
18325 if (BB == OpParent) {
18326 Res = true;
18327 return Res;
18328 }
18329 if (!Visited.insert(BB).second)
18330 continue;
18331 // Blocks strictly dominated by Root are reached only *after* Root in
18332 // forward execution (via loop back-edges); skip them and their
18333 // dominated predecessors.
18334 if (DT->properlyDominates(Root, BB))
18335 continue;
18336 // A block containing a no-return call cannot reach Root via the
18337 // forward edge being analyzed: execution does not continue past the
18338 // no-return call, so the BB -> ... -> Root path is dead. Drop the
18339 // block from the analysis without following its predecessors.
18340 if (BlockHasNoReturnCall(BB))
18341 continue;
18342 auto Pair = std::make_pair(BB, OpParent);
18343 if (auto It = ParentOpParentToPreds.find(Pair);
18344 It != ParentOpParentToPreds.end()) {
18345 if (It->second) {
18346 // BB is known to reach OpParent via a call-free path.
18347 Res = true;
18348 return Res;
18349 }
18350 // BB is known to be blocked from OpParent by calls; keep checking
18351 // other paths.
18352 continue;
18353 }
18354 unsigned BlockSize = BB->size();
18355 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
18356 continue;
18357 Budget += BlockSize;
18358 if (Budget > BudgetLimit)
18359 return Res;
18360 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
18361 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
18362 BB->getTerminator()))
18363 continue;
18364 Worklist.append(pred_begin(BB), pred_end(BB));
18365 }
18366 // Worklist drained without ever reaching OpParent: every path between
18367 // Root and OpParent is blocked by a non-vec call.
18368 return Res;
18369 };
18370 SmallVector<const TreeEntry *> LiveEntries(1, Root);
18371 auto FindNonScalarParentEntry = [&](const TreeEntry *E) -> const TreeEntry * {
18372 assert(ScalarOrPseudoEntries.contains(E) &&
18373 "Expected scalar or pseudo entry.");
18374 const TreeEntry *Entry = E;
18375 while (Entry->UserTreeIndex) {
18376 Entry = Entry->UserTreeIndex.UserTE;
18377 if (!ScalarOrPseudoEntries.contains(Entry))
18378 return Entry;
18379 }
18380 return nullptr;
18381 };
18382 while (!LiveEntries.empty()) {
18383 const TreeEntry *Entry = LiveEntries.pop_back_val();
18384 const auto OpIt = EntriesToOperands.find(Entry);
18385 if (OpIt == EntriesToOperands.end())
18386 continue;
18387 ArrayRef<const TreeEntry *> Operands = OpIt->second;
18388 if (Operands.empty())
18389 continue;
18390 if (ScalarOrPseudoEntries.contains(Entry)) {
18391 Entry = FindNonScalarParentEntry(Entry);
18392 if (!Entry) {
18393 for (const TreeEntry *Op : Operands) {
18394 if (!Op->isGather())
18395 LiveEntries.push_back(Op);
18396 }
18397 continue;
18398 }
18399 }
18400 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
18401 BasicBlock *Parent = LastInst->getParent();
18402 for (const TreeEntry *Op : Operands) {
18403 if (!Op->isGather())
18404 LiveEntries.push_back(Op);
18405 if (ScalarOrPseudoEntries.contains(Op))
18406 continue;
18407 if (Entry->State == TreeEntry::SplitVectorize ||
18408 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
18409 (Op->isGather() && allConstant(Op->Scalars)))
18410 continue;
18411 Budget = 0;
18412 BasicBlock *Pred = nullptr;
18413 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
18414 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
18415 BasicBlock *OpParent;
18416 Instruction *OpLastInst;
18417 if (Op->isGather()) {
18418 assert(Entry->getOpcode() == Instruction::PHI &&
18419 "Expected phi node only.");
18420 OpParent = cast<PHINode>(Entry->getMainOp())
18421 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
18422 OpLastInst = OpParent->getTerminator();
18423 for (Value *V : Op->Scalars) {
18424 auto *Inst = dyn_cast<Instruction>(V);
18425 if (!Inst)
18426 continue;
18427 if (isVectorized(V)) {
18428 OpParent = Inst->getParent();
18429 OpLastInst = Inst;
18430 break;
18431 }
18432 }
18433 } else {
18434 OpLastInst = EntriesToLastInstruction.at(Op);
18435 OpParent = OpLastInst->getParent();
18436 }
18437 // Check the call instructions within the same basic blocks.
18438 if (OpParent == Parent) {
18439 if (Entry->getOpcode() == Instruction::PHI) {
18440 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
18441 AddCosts(Op);
18442 continue;
18443 }
18444 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
18445 AddCosts(Op);
18446 continue;
18447 }
18448 // Check for call instruction in between blocks.
18449 // 1. Check entry's block to the head.
18450 if (Entry->getOpcode() != Instruction::PHI &&
18451 !CheckForNonVecCallsInSameBlock(
18452 &*Parent->getFirstNonPHIOrDbgOrAlloca(), LastInst)) {
18453 AddCosts(Op);
18454 continue;
18455 }
18456 // 2. Check op's block from the end.
18457 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
18458 OpParent->getTerminator())) {
18459 AddCosts(Op);
18460 continue;
18461 }
18462 // 3. Check the predecessors of entry's block till op's block.
18463 if (!CheckPredecessors(Parent, Pred, OpParent)) {
18464 AddCosts(Op);
18465 continue;
18466 }
18467 }
18468 }
18469
18470 return Cost;
18471}
18472
18473/// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the
18474/// buildvector sequence.
18476 const InsertElementInst *IE2) {
18477 if (IE1 == IE2)
18478 return false;
18479 const auto *I1 = IE1;
18480 const auto *I2 = IE2;
18481 const InsertElementInst *PrevI1;
18482 const InsertElementInst *PrevI2;
18483 unsigned Idx1 = *getElementIndex(IE1);
18484 unsigned Idx2 = *getElementIndex(IE2);
18485 do {
18486 if (I2 == IE1)
18487 return true;
18488 if (I1 == IE2)
18489 return false;
18490 PrevI1 = I1;
18491 PrevI2 = I2;
18492 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
18493 getElementIndex(I1).value_or(Idx2) != Idx2)
18494 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
18495 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
18496 getElementIndex(I2).value_or(Idx1) != Idx1)
18497 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
18498 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
18499 llvm_unreachable("Two different buildvectors not expected.");
18500}
18501
18502namespace {
18503/// Returns incoming Value *, if the requested type is Value * too, or a default
18504/// value, otherwise.
18505struct ValueSelect {
18506 template <typename U>
18507 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
18508 return V;
18509 }
18510 template <typename U>
18511 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
18512 return U();
18513 }
18514};
18515} // namespace
18516
18517/// Does the analysis of the provided shuffle masks and performs the requested
18518/// actions on the vectors with the given shuffle masks. It tries to do it in
18519/// several steps.
18520/// 1. If the Base vector is not undef vector, resizing the very first mask to
18521/// have common VF and perform action for 2 input vectors (including non-undef
18522/// Base). Other shuffle masks are combined with the resulting after the 1 stage
18523/// and processed as a shuffle of 2 elements.
18524/// 2. If the Base is undef vector and have only 1 shuffle mask, perform the
18525/// action only for 1 vector with the given mask, if it is not the identity
18526/// mask.
18527/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
18528/// vectors, combing the masks properly between the steps.
18529template <typename T>
18531 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
18532 function_ref<unsigned(T *)> GetVF,
18533 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
18535 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
18536 SmallVector<int> Mask(ShuffleMask.begin()->second);
18537 auto VMIt = std::next(ShuffleMask.begin());
18538 T *Prev = nullptr;
18539 SmallBitVector UseMask =
18540 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
18541 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
18542 if (!IsBaseUndef.all()) {
18543 // Base is not undef, need to combine it with the next subvectors.
18544 std::pair<T *, bool> Res =
18545 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
18546 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
18547 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
18548 if (Mask[Idx] == PoisonMaskElem)
18549 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
18550 else
18551 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
18552 }
18553 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
18554 assert((!V || GetVF(V) == Mask.size()) &&
18555 "Expected base vector of VF number of elements.");
18556 Prev = Action(Mask, {nullptr, Res.first});
18557 } else if (ShuffleMask.size() == 1) {
18558 // Base is undef and only 1 vector is shuffled - perform the action only for
18559 // single vector, if the mask is not the identity mask.
18560 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
18561 /*ForSingleMask=*/true);
18562 if (Res.second)
18563 // Identity mask is found.
18564 Prev = Res.first;
18565 else
18566 Prev = Action(Mask, {ShuffleMask.begin()->first});
18567 } else {
18568 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
18569 // shuffles step by step, combining shuffle between the steps.
18570 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
18571 unsigned Vec2VF = GetVF(VMIt->first);
18572 if (Vec1VF == Vec2VF) {
18573 // No need to resize the input vectors since they are of the same size, we
18574 // can shuffle them directly.
18575 ArrayRef<int> SecMask = VMIt->second;
18576 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
18577 if (SecMask[I] != PoisonMaskElem) {
18578 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
18579 Mask[I] = SecMask[I] + Vec1VF;
18580 }
18581 }
18582 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
18583 } else {
18584 // Vectors of different sizes - resize and reshuffle.
18585 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
18586 /*ForSingleMask=*/false);
18587 std::pair<T *, bool> Res2 =
18588 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
18589 ArrayRef<int> SecMask = VMIt->second;
18590 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
18591 if (Mask[I] != PoisonMaskElem) {
18592 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
18593 if (Res1.second)
18594 Mask[I] = I;
18595 } else if (SecMask[I] != PoisonMaskElem) {
18596 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
18597 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
18598 }
18599 }
18600 Prev = Action(Mask, {Res1.first, Res2.first});
18601 }
18602 VMIt = std::next(VMIt);
18603 }
18604 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
18605 // Perform requested actions for the remaining masks/vectors.
18606 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
18607 // Shuffle other input vectors, if any.
18608 std::pair<T *, bool> Res =
18609 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
18610 ArrayRef<int> SecMask = VMIt->second;
18611 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
18612 if (SecMask[I] != PoisonMaskElem) {
18613 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
18614 "Multiple uses of scalars.");
18615 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
18616 } else if (Mask[I] != PoisonMaskElem) {
18617 Mask[I] = I;
18618 }
18619 }
18620 Prev = Action(Mask, {Prev, Res.first});
18621 }
18622 return Prev;
18623}
18624
18626 ArrayRef<Value *> VectorizedVals) {
18628 SmallPtrSet<Value *, 4> CheckedExtracts;
18629 SmallSetVector<TreeEntry *, 4> GatheredLoadsNodes;
18631 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
18632 << VectorizableTree.size() << ".\n");
18633 // The V-only-dependent part of the predicate. Same V is commonly seen in
18634 // multiple TEs (shared scalars), so cache the result across calls.
18635 // DeletedNodes is read-only during this cost loop, so caching is safe.
18636 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
18637 SmallDenseMap<Value *, bool> ExternalUseVCache;
18638 auto IsExternallyUsedV = [&](Value *V) {
18639 auto [It, Inserted] = ExternalUseVCache.try_emplace(V);
18640 if (!Inserted)
18641 return It->second;
18642 bool Res = false;
18643 if (V->hasOneUse() || V->getType()->isVoidTy()) {
18644 // Res stays false.
18645 } else if (V->hasNUsesOrMore(NumVectScalars)) {
18646 Res = true;
18647 } else if (auto *I = dyn_cast<Instruction>(V)) {
18648 Res = any_of(I->users(), [&](const User *U) {
18649 // store/insertelt v, [cast]U will likely be vectorized.
18650 if (match(U,
18651 m_InsertElt(m_Value(), m_OneUse(m_CastOrSelf(m_Specific(I))),
18652 m_ConstantInt())))
18653 return false;
18654 if (match(U, m_InsertElt(m_Value(), m_Specific(I), m_ConstantInt())))
18655 return false;
18656 if (match(U, m_Store(m_OneUse(m_CastOrSelf(m_Specific(I))), m_Value())))
18657 return false;
18658 if (match(U, m_Store(m_Specific(I), m_Value())))
18659 return false;
18660 ArrayRef<TreeEntry *> Entries = getTreeEntries(U);
18661 if (Entries.empty() && !MustGather.contains(U))
18662 return true;
18663 if (any_of(Entries,
18664 [&](TreeEntry *TE) { return DeletedNodes.contains(TE); }))
18665 return true;
18666 return any_of(ValueToGatherNodes.lookup(U), [&](const TreeEntry *TE) {
18667 return DeletedNodes.contains(TE);
18668 });
18669 });
18670 }
18671 It->second = Res;
18672 return Res;
18673 };
18674 auto IsExternallyUsed = [&](const TreeEntry &TE, Value *V) {
18675 assert(TE.hasState() && !TE.isGather() &&
18676 TE.State != TreeEntry::SplitVectorize && "Expected vector node.");
18677 if (TE.hasCopyableElements() && TE.isCopyableElement(V))
18678 return false;
18679 return IsExternallyUsedV(V);
18680 };
18682 InstructionCost Cost = 0;
18684 uint64_t PrevScale = 0;
18685 BasicBlock *PrevVecParent = nullptr;
18686 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
18687 TreeEntry &TE = *Ptr;
18688 // No need to count the cost for combined entries, they are combined and
18689 // just skip their cost.
18690 if (TE.State == TreeEntry::CombinedVectorize) {
18691 LLVM_DEBUG(
18692 dbgs() << "SLP: Skipping cost for combined node that starts with "
18693 << *TE.Scalars[0] << ".\n";
18694 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
18695 NodesCosts.try_emplace(&TE);
18696 continue;
18697 }
18698 if (TE.hasState() &&
18699 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
18700 if (const TreeEntry *E =
18701 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
18702 E && E->getVectorFactor() == TE.getVectorFactor()) {
18703 // Some gather nodes might be absolutely the same as some vectorizable
18704 // nodes after reordering, need to handle it.
18705 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
18706 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
18707 << "SLP: Current total cost = " << Cost << "\n");
18708 NodesCosts.try_emplace(&TE);
18709 continue;
18710 }
18711 }
18712
18713 // Exclude cost of gather loads nodes which are not used. These nodes were
18714 // built as part of the final attempt to vectorize gathered loads.
18715 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
18716 "Expected gather nodes with users only.");
18717
18718 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
18719 uint64_t Scale = 0;
18720 bool CostIsFree = C == 0;
18721 // For gather/buildvector (and split-vectorize) entries, prefer the
18722 // per-lane refined scale that accounts for LICM-hoistable insertelements
18723 // when an operand is invariant in the current loop nest but defined in
18724 // an outer loop. This prevents over-costing cross-loop-nest buildvectors.
18725 const bool IsGatherLike =
18726 TE.isGather() || TE.State == TreeEntry::SplitVectorize;
18727 if (!CostIsFree && !TE.isGather() && TE.hasState()) {
18728 if (PrevVecParent == TE.getMainOp()->getParent()) {
18729 Scale = PrevScale;
18730 C *= Scale;
18731 EntryToScale.try_emplace(&TE, Scale);
18732 }
18733 }
18734 if (!CostIsFree && !Scale) {
18735 Scale = IsGatherLike ? getGatherNodeEffectiveScale(TE)
18736 : getScaleToLoopIterations(TE);
18737 C *= Scale;
18738 EntryToScale.try_emplace(&TE, Scale);
18739 if (!TE.isGather() && TE.hasState()) {
18740 PrevVecParent = TE.getMainOp()->getParent();
18741 PrevScale = Scale;
18742 }
18743 }
18744 Cost += C;
18745 NodesCosts.try_emplace(&TE, C);
18746 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
18747 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
18748 << "SLP: Current total cost = " << Cost << "\n");
18749 // Add gathered loads nodes to the set for later processing.
18750 if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
18751 TE.getOpcode() == Instruction::Load)
18752 GatheredLoadsNodes.insert(&TE);
18753 if (!TE.isGather() && TE.State != TreeEntry::SplitVectorize &&
18754 !(TE.Idx == 0 && (TE.getOpcode() == Instruction::InsertElement ||
18755 TE.getOpcode() == Instruction::Store))) {
18756 // Calculate costs of external uses.
18757 APInt DemandedElts = APInt::getZero(TE.getVectorFactor());
18758 for (Value *V : TE.Scalars) {
18759 if (IsExternallyUsed(TE, V))
18760 DemandedElts.setBit(TE.findLaneForValue(V));
18761 }
18762 if (!DemandedElts.isZero()) {
18763 Type *ScalarTy = TE.Scalars.front()->getType();
18764 auto It = MinBWs.find(&TE);
18765 if (It != MinBWs.end())
18766 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
18767 auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor());
18769 *TTI, ScalarTy, VecTy, DemandedElts, /*Insert=*/false,
18770 /*Extract=*/true, CostKind);
18771 if (ExtCost.isValid() && ExtCost != 0) {
18772 if (!Scale)
18773 Scale = getScaleToLoopIterations(TE);
18774 ExtCost *= Scale;
18775 EntryToScale.try_emplace(&TE, Scale);
18776 }
18777 ExtractCosts.try_emplace(&TE, ExtCost);
18778 }
18779 }
18780 }
18781 // Bail out if the cost threshold is negative and cost already below it.
18782 if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
18784 return Cost;
18785 // The narrow non-profitable tree in loop? Skip, may cause regressions.
18786 constexpr unsigned PartLimit = 2;
18787 const unsigned Sz =
18788 getVectorElementSize(VectorizableTree.front()->Scalars.front());
18789 const unsigned MinVF = getMinVF(Sz);
18790 if (Cost >= -SLPCostThreshold &&
18791 VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
18792 (!VectorizableTree.front()->hasState() ||
18793 (VectorizableTree.front()->getOpcode() != Instruction::Store &&
18794 LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
18795 return Cost;
18796 // Store the cost + external uses estimation as the first element of the
18797 // tuple, just the cost as the second element of the tuple. Required to return
18798 // correct cost estimation for the tree, extracts are calculated separately.
18799 // Extracts, calculated here, are just quick estimations.
18801 std::tuple<InstructionCost, InstructionCost, SmallVector<unsigned>>>
18802 SubtreeCosts(VectorizableTree.size());
18803 auto UpdateParentNodes =
18804 [&](const TreeEntry *UserTE, const TreeEntry *TE,
18806 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
18807 &VisitedUser,
18808 bool AddToList = true) {
18809 while (UserTE &&
18810 VisitedUser.insert(std::make_pair(TE, UserTE)).second) {
18811 std::get<0>(SubtreeCosts[UserTE->Idx]) += TotalCost;
18812 std::get<1>(SubtreeCosts[UserTE->Idx]) += Cost;
18813 if (AddToList)
18814 std::get<2>(SubtreeCosts[UserTE->Idx]).push_back(TE->Idx);
18815 UserTE = UserTE->UserTreeIndex.UserTE;
18816 }
18817 };
18818 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
18819 TreeEntry &TE = *Ptr;
18820 InstructionCost C = NodesCosts.at(&TE);
18821 InstructionCost ExtractCost = ExtractCosts.lookup(&TE);
18822 std::get<0>(SubtreeCosts[TE.Idx]) += C + ExtractCost;
18823 std::get<1>(SubtreeCosts[TE.Idx]) += C;
18824 if (const TreeEntry *UserTE = TE.UserTreeIndex.UserTE) {
18825 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
18826 VisitedUser;
18827 UpdateParentNodes(UserTE, &TE, C + ExtractCost, C, VisitedUser);
18828 }
18829 }
18830 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4> Visited;
18831 for (TreeEntry *TE : GatheredLoadsNodes) {
18832 InstructionCost TotalCost = std::get<0>(SubtreeCosts[TE->Idx]);
18833 InstructionCost Cost = std::get<1>(SubtreeCosts[TE->Idx]);
18834 for (Value *V : TE->Scalars) {
18835 for (const TreeEntry *BVTE : ValueToGatherNodes.lookup(V))
18836 UpdateParentNodes(BVTE, TE, TotalCost, Cost, Visited,
18837 /*AddToList=*/false);
18838 }
18839 }
18840 Visited.clear();
18841 using CostIndicesTy =
18842 std::pair<TreeEntry *, std::tuple<InstructionCost, InstructionCost,
18843 SmallVector<unsigned>>>;
18844 struct FirstGreater {
18845 bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
18846 return std::get<0>(LHS.second) < std::get<0>(RHS.second) ||
18847 (std::get<0>(LHS.second) == std::get<0>(RHS.second) &&
18848 LHS.first->Idx < RHS.first->Idx);
18849 }
18850 };
18851 PriorityQueue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
18852 Worklist;
18853 for (const auto [Idx, P] : enumerate(SubtreeCosts))
18854 Worklist.emplace(VectorizableTree[Idx].get(), P);
18855
18856 // Narrow store trees with non-profitable immediate values - exit.
18857 if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
18858 VectorizableTree.front()->hasState() &&
18859 VectorizableTree.front()->getOpcode() == Instruction::Store &&
18860 (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
18861 return Cost;
18862
18863 bool Changed = false;
18864 bool PreferTrimmedTree = false;
18865 while (!Worklist.empty() && std::get<0>(Worklist.top().second) > 0) {
18866 TreeEntry *TE = Worklist.top().first;
18867 if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE) ||
18868 // Exit early if the parent node is split node and any of scalars is
18869 // used in other split nodes.
18870 (TE->UserTreeIndex &&
18871 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
18872 any_of(TE->Scalars, [&](Value *V) {
18873 ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
18874 return Entries.size() > 1;
18875 }))) {
18876 Worklist.pop();
18877 continue;
18878 }
18879 // Skip inversed compare nodes, they cannot be transformed to buildvectors.
18880 if (TE->State == TreeEntry::Vectorize && !TE->isAltShuffle() &&
18881 (TE->getOpcode() == Instruction::ICmp ||
18882 TE->getOpcode() == Instruction::FCmp) &&
18883 any_of(TE->Scalars, [&](Value *V) {
18884 auto *I = dyn_cast<CmpInst>(V);
18885 if (!I)
18886 return false;
18887 return I->getPredicate() !=
18888 cast<CmpInst>(TE->getMainOp())->getPredicate();
18889 })) {
18890 Worklist.pop();
18891 continue;
18892 }
18893
18894 // Calculate the gather cost of the root node.
18895 InstructionCost TotalSubtreeCost = std::get<0>(Worklist.top().second);
18896 InstructionCost SubtreeCost = std::get<1>(Worklist.top().second);
18897 if (TotalSubtreeCost < TE->Scalars.size()) {
18898 Worklist.pop();
18899 continue;
18900 }
18901 if (!TransformedToGatherNodes.empty()) {
18902 for (unsigned Idx : std::get<2>(Worklist.top().second)) {
18903 auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].get());
18904 if (It != TransformedToGatherNodes.end()) {
18905 TotalSubtreeCost -= std::get<0>(SubtreeCosts[Idx]);
18906 SubtreeCost -= std::get<1>(SubtreeCosts[Idx]);
18907 TotalSubtreeCost += It->second;
18908 SubtreeCost += It->second;
18909 }
18910 }
18911 }
18912 if (TotalSubtreeCost < 0 || TotalSubtreeCost < TE->Scalars.size()) {
18913 Worklist.pop();
18914 continue;
18915 }
18916 const unsigned EntryVF = TE->getVectorFactor();
18917 APInt DemandedElts = APInt::getZero(EntryVF);
18918 for (auto [Idx, V] : enumerate(TE->Scalars)) {
18919 if (!isConstant(V))
18920 DemandedElts.setBit(Idx);
18921 }
18922
18923 Type *ScalarTy = getValueType(TE->Scalars.front());
18924 auto It = MinBWs.find(TE);
18925 if (It != MinBWs.end())
18926 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
18927 auto *VecTy = getWidenedType(ScalarTy, EntryVF);
18929 *TTI, ScalarTy, VecTy, DemandedElts,
18930 /*Insert=*/true, /*Extract=*/false, CostKind);
18931 SmallVector<int> Mask;
18932 if (!TE->ReorderIndices.empty() &&
18933 TE->State != TreeEntry::CompressVectorize &&
18934 (TE->State != TreeEntry::StridedVectorize ||
18935 !isReverseOrder(TE->ReorderIndices))) {
18936 SmallVector<int> NewMask;
18937 if (TE->getOpcode() == Instruction::Store) {
18938 // For stores the order is actually a mask.
18939 NewMask.resize(TE->ReorderIndices.size());
18940 copy(TE->ReorderIndices, NewMask.begin());
18941 } else {
18942 inversePermutation(TE->ReorderIndices, NewMask);
18943 }
18944 ::addMask(Mask, NewMask);
18945 }
18946 if (!TE->ReuseShuffleIndices.empty())
18947 ::addMask(Mask, TE->ReuseShuffleIndices);
18948 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF))
18949 GatherCost +=
18950 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy, Mask);
18951 // If all scalars are reused in gather node(s) or other vector nodes, there
18952 // might be extra cost for inserting them.
18953 if ((!TE->hasState() || !TE->isAltShuffle()) &&
18954 all_of(TE->Scalars, [&](Value *V) {
18955 return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
18956 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
18957 }))
18958 GatherCost *= 2;
18959 // Erase subtree if it is non-profitable.
18960 ArrayRef<unsigned> Nodes = std::get<2>(Worklist.top().second);
18961 // Prefer trimming equal-cost alternate-shuffle subtrees rooted at binary
18962 // ops: alt-shuffles introduce runtime shuffle overhead that the cost model
18963 // may underestimate. Skip if the subtree contains ExtractElement nodes,
18964 // since those operate on already-materialized vectors where the cost model
18965 // is more accurate.
18966 auto IsEqualCostAltShuffleToTrim = [&]() {
18967 return TotalSubtreeCost == GatherCost && TE->hasState() &&
18968 TE->isAltShuffle() && Instruction::isBinaryOp(TE->getOpcode()) &&
18969 none_of(Nodes, [&](unsigned Idx) {
18970 return VectorizableTree[Idx]->hasState() &&
18971 VectorizableTree[Idx]->getOpcode() ==
18972 Instruction::ExtractElement;
18973 });
18974 };
18975 // Non-power-of-2 entries may have inflated costs - add a margin of 1
18976 // before trimming to avoid over-pruning.
18977 bool HasNonPowerOf2 = any_of(Nodes, [&](unsigned Idx) {
18978 return !has_single_bit(VectorizableTree[Idx]->Scalars.size());
18979 });
18980 InstructionCost TrimMargin = HasNonPowerOf2 ? 1 : 0;
18981 if (TotalSubtreeCost > GatherCost + TrimMargin ||
18982 IsEqualCostAltShuffleToTrim()) {
18983 PreferTrimmedTree |= TotalSubtreeCost == GatherCost;
18984 // If the remaining tree is just a buildvector - exit, it will cause
18985 // endless attempts to vectorize. When the tree is already profitable,
18986 // skip trimming this node and let the post-loop logic (including
18987 // gathered loads processing) decide.
18988 if (VectorizableTree.front()->hasState() &&
18989 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
18990 TE->Idx == 1) {
18991 if (Cost < -SLPCostThreshold) {
18992 LLVM_DEBUG(dbgs() << "SLP: Skipping trim of node " << TE->Idx
18993 << " - tree already profitable with cost " << Cost
18994 << ".\n");
18995 Worklist.pop();
18996 continue;
18997 }
18998 return InstructionCost::getInvalid();
18999 }
19000
19001 LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
19002 << TE->Idx << " with cost "
19003 << std::get<0>(Worklist.top().second)
19004 << " and gather cost " << GatherCost << ".\n");
19005 if (TE->UserTreeIndex) {
19006 TransformedToGatherNodes.try_emplace(TE, GatherCost);
19007 NodesCosts.erase(TE);
19008 } else {
19009 DeletedNodes.insert(TE);
19010 TransformedToGatherNodes.erase(TE);
19011 NodesCosts.erase(TE);
19012 }
19013 for (unsigned Idx : Nodes) {
19014 TreeEntry &ChildTE = *VectorizableTree[Idx];
19015 DeletedNodes.insert(&ChildTE);
19016 TransformedToGatherNodes.erase(&ChildTE);
19017 NodesCosts.erase(&ChildTE);
19018 }
19019 Changed = true;
19020 }
19021 Worklist.pop();
19022 }
19023 if (!Changed)
19024 return std::get<1>(SubtreeCosts.front());
19025
19026 SmallPtrSet<TreeEntry *, 4> GatheredLoadsToDelete;
19027 InstructionCost LoadsExtractsCost = 0;
19028 // Check if all loads of gathered loads nodes are marked for deletion. In this
19029 // case the whole gathered loads subtree must be deleted.
19030 // Also, try to account for extracts, which might be required, if only part of
19031 // gathered load must be vectorized. Keep partially vectorized nodes, if
19032 // extracts are cheaper than gathers.
19033 for (TreeEntry *TE : GatheredLoadsNodes) {
19034 if (DeletedNodes.contains(TE) || TransformedToGatherNodes.contains(TE))
19035 continue;
19036 GatheredLoadsToDelete.insert(TE);
19037 APInt DemandedElts = APInt::getZero(TE->getVectorFactor());
19038 // All loads are removed from gathered? Need to delete the subtree.
19039 SmallDenseMap<const TreeEntry *, SmallVector<Value *>> ValuesToInsert;
19040 for (Value *V : TE->Scalars) {
19041 unsigned Pos = TE->findLaneForValue(V);
19042 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
19043 if (DeletedNodes.contains(BVE))
19044 continue;
19045 DemandedElts.setBit(Pos);
19046 ValuesToInsert.try_emplace(BVE).first->second.push_back(V);
19047 }
19048 }
19049 if (!DemandedElts.isZero()) {
19050 Type *ScalarTy = TE->Scalars.front()->getType();
19051 auto It = MinBWs.find(TE);
19052 if (It != MinBWs.end())
19053 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
19054 auto *VecTy = getWidenedType(ScalarTy, TE->getVectorFactor());
19056 *TTI, ScalarTy, VecTy, DemandedElts,
19057 /*Insert=*/false, /*Extract=*/true, CostKind);
19058 InstructionCost BVCost = 0;
19059 for (const auto &[BVE, Values] : ValuesToInsert) {
19060 APInt BVDemandedElts = APInt::getZero(BVE->getVectorFactor());
19061 SmallVector<Value *> BVValues(BVE->getVectorFactor(),
19062 PoisonValue::get(ScalarTy));
19063 for (Value *V : Values) {
19064 unsigned Pos = BVE->findLaneForValue(V);
19065 BVValues[Pos] = V;
19066 BVDemandedElts.setBit(Pos);
19067 }
19068 auto *BVVecTy = getWidenedType(ScalarTy, BVE->getVectorFactor());
19070 *TTI, ScalarTy, BVVecTy, BVDemandedElts,
19071 /*Insert=*/true, /*Extract=*/false, CostKind,
19072 BVDemandedElts.isAllOnes(), BVValues);
19073 }
19074 if (ExtractsCost < BVCost) {
19075 LoadsExtractsCost += ExtractsCost;
19076 GatheredLoadsToDelete.erase(TE);
19077 continue;
19078 }
19079 LoadsExtractsCost += BVCost;
19080 }
19081 NodesCosts.erase(TE);
19082 }
19083
19084 // Deleted all subtrees rooted at gathered loads nodes.
19085 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19086 if (TE->UserTreeIndex &&
19087 GatheredLoadsToDelete.contains(TE->UserTreeIndex.UserTE)) {
19088 DeletedNodes.insert(TE.get());
19089 NodesCosts.erase(TE.get());
19090 GatheredLoadsToDelete.insert(TE.get());
19091 }
19092 if (GatheredLoadsToDelete.contains(TE.get()))
19093 DeletedNodes.insert(TE.get());
19094 }
19095
19096 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19097 if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(TE.get())) {
19098 assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
19099 continue;
19100 }
19101 if (DeletedNodes.contains(TE.get()))
19102 continue;
19103 if (!NodesCosts.contains(TE.get())) {
19105 getEntryCost(TE.get(), VectorizedVals, CheckedExtracts);
19106 if (!C.isValid() || C == 0) {
19107 NodesCosts.try_emplace(TE.get(), C);
19108 continue;
19109 }
19110 uint64_t Scale = EntryToScale.lookup(TE.get());
19111 if (!Scale) {
19112 const bool IsGatherLike =
19113 TE->isGather() || TE->State == TreeEntry::SplitVectorize;
19114 Scale = IsGatherLike ? getGatherNodeEffectiveScale(*TE.get())
19115 : getScaleToLoopIterations(*TE.get());
19116 }
19117 C *= Scale;
19118 NodesCosts.try_emplace(TE.get(), C);
19119 }
19120 }
19121
19122 LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
19123 InstructionCost NewCost = 0;
19124 for (const auto &P : NodesCosts) {
19125 NewCost += P.second;
19126 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
19127 << shortBundleName(P.first->Scalars, P.first->Idx)
19128 << ".\n"
19129 << "SLP: Current total cost = " << NewCost << "\n");
19130 }
19131 if (NewCost + LoadsExtractsCost > Cost ||
19132 (!PreferTrimmedTree && NewCost + LoadsExtractsCost == Cost)) {
19133 DeletedNodes.clear();
19134 TransformedToGatherNodes.clear();
19135 NewCost = Cost;
19136 } else {
19137 // If the remaining tree is just a buildvector - exit, it will cause
19138 // endless attempts to vectorize.
19139 if (VectorizableTree.size() >= 2 && VectorizableTree.front()->hasState() &&
19140 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
19141 TransformedToGatherNodes.contains(VectorizableTree[1].get()))
19142 return InstructionCost::getInvalid();
19143 if (VectorizableTree.size() >= 3 && VectorizableTree.front()->hasState() &&
19144 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
19145 VectorizableTree[1]->hasState() &&
19146 VectorizableTree[1]->State == TreeEntry::Vectorize &&
19147 (VectorizableTree[1]->getOpcode() == Instruction::ZExt ||
19148 VectorizableTree[1]->getOpcode() == Instruction::SExt ||
19149 VectorizableTree[1]->getOpcode() == Instruction::Trunc) &&
19150 TransformedToGatherNodes.contains(VectorizableTree[2].get()))
19151 return InstructionCost::getInvalid();
19152 }
19153 return NewCost;
19154}
19155
19156namespace {
19157/// Data type for handling buildvector sequences with the reused scalars from
19158/// other tree entries.
19159template <typename T> struct ShuffledInsertData {
19160 /// List of insertelements to be replaced by shuffles.
19161 SmallVector<InsertElementInst *> InsertElements;
19162 /// The parent vectors and shuffle mask for the given list of inserts.
19163 MapVector<T, SmallVector<int>> ValueMasks;
19164};
19165} // namespace
19166
// Computes the final cost of the current vectorizable tree. Starting from the
// incoming TreeCost, it adds the reduction cost (scaled when UserIgnoreList is
// set), the cost of extracts for externally used scalars, the cost of the
// final shuffles for insertelement external users, the casts required when
// the minimum bitwidth differs from the original/reduction width and, if the
// tree is still profitable, the spill cost.
//
// VectorizedVals - when non-empty, the cost of resizing the reduced value is
//                  added (see "Add reduced value cost, if resized").
// ReductionCost  - cost of the reduction; added to the total.
// RdxRoot        - reduction root; forwarded to the scale computation and
//                  used to decide whether the reduction cost is printed.
//
// Returns the accumulated cost; returns early with the current cost when the
// tree is not profitable and no insertelement external user can change that.
//
// NOTE(review): the first signature line (return type and function name) and
// a number of interior lines are missing from this listing; this is presumably
// BoUpSLP::getTreeCost - confirm against the upstream file.
19168 ArrayRef<Value *> VectorizedVals,
19169 InstructionCost ReductionCost,
19170 Instruction *RdxRoot) {
19171 // Reject vectorization if the vector code would produce more instructions
19172 // than the scalar code. The cost model may underestimate overhead from
19173 // shuffles, inserts, and extracts.
19174 // FIXME: remove this as soon as correct fractional model is landed for all
19175 // targets.
19176 if (SLPInstCountCheck && VectorizableTree.front()->getVectorFactor() == 2 &&
19177 SLPCostThreshold == 0 &&
19178 (!SLPReVec ||
19180 VectorizableTree.front()->Scalars.front()->getType()))) {
19181 unsigned NumScalar = getNumScalarInsts();
19182 unsigned NumVector = getNumVectorInsts();
19183 LLVM_DEBUG(dbgs() << "SLP: Inst count check: vector=" << NumVector
19184 << " scalar=" << NumScalar << "\n");
19185 if (NumVector > NumScalar) {
19186 LLVM_DEBUG(dbgs() << "SLP: Rejecting tree: vector inst count "
19187 << NumVector << " > scalar inst count " << NumScalar
19188 << ".\n");
19190 }
19191 }
19192 InstructionCost Cost = TreeCost;
19193
19195 EntryToScale;
// Multiplies C by the scale returned by getScaleToLoopIterations for the
// (entry, scalar, user) triple. The scale is cached in EntryToScale, where a
// stored value of 0 means "not computed yet". Invalid or zero costs are
// returned unchanged.
19196 auto ScaleCost = [&](InstructionCost C, const TreeEntry &TE,
19197 Value *Scalar = nullptr, Instruction *U = nullptr) {
19198 if (!C.isValid() || C == 0)
19199 return C;
19200 uint64_t &Scale =
19201 EntryToScale.try_emplace(std::make_tuple(&TE, Scalar, U), 0)
19202 .first->getSecond();
19203 if (!Scale)
19204 Scale = getScaleToLoopIterations(TE, Scalar, U);
19205 LLVM_DEBUG(dbgs() << "Scale " << Scale << " For entry " << TE.Idx << "\n");
19206 return C * Scale;
19207 };
19208 Instruction *ReductionRoot = RdxRoot;
19209 if (UserIgnoreList) {
19210 // Scale reduction cost to the factor of the loop nest trip count.
19211 ReductionCost = ScaleCost(ReductionCost, *VectorizableTree.front().get(),
19212 /*Scalar=*/nullptr, ReductionRoot);
19213 }
19214
19215 // Add the cost for reduction.
19216 Cost += ReductionCost;
19217
19218 // Skip trees, which are non-profitable even if there are insertelements with
19219 // external uses.
19220 constexpr unsigned CostLimit = 100;
19221 if (Cost >= -SLPCostThreshold + CostLimit &&
19222 (VectorizableTree.size() - DeletedNodes.size()) *
19223 VectorizableTree.front()->getVectorFactor() <
19224 CostLimit)
19225 return Cost;
19226
19227 if (Cost >= -SLPCostThreshold &&
19228 none_of(ExternalUses, [](const ExternalUser &EU) {
19229 return isa_and_nonnull<InsertElementInst>(EU.User);
19230 }))
19231 return Cost;
19232
19233 SmallPtrSet<Value *, 16> ExtractCostCalculated;
19234 InstructionCost ExtractCost = 0;
19236 SmallVector<APInt> DemandedElts;
19237 SmallDenseSet<Value *, 4> UsedInserts;
19239 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
19241 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
19242 // Keep track {Scalar, Index, User} tuple.
19243 // On AArch64, this helps in fusing a mov instruction, associated with
19244 // extractelement, with fmul in the backend so that extractelement is free.
19246 bool AllUsersGEPSWithStoresLoads = true;
19247 SmallBitVector UsedLanes(VectorizableTree.front()->getVectorFactor());
19249 Type *UserScalarTy = nullptr;
// First pass over the external uses: records a {Scalar, User, Lane} tuple for
// each use and, for uses of the root entry (Idx == 0), marks the used lane and
// checks that the user has exactly one use whose value type matches the type
// seen for the other users. Matching users are collected in Pointers; any
// mismatch clears AllUsersGEPSWithStoresLoads and stops the scan.
// NOTE(review): the lines defining User and completing the condition are
// missing from this listing; going by the flag name the user is presumably
// expected to be a GEP feeding a load/store - confirm.
// NOTE(review): the early break also ends the ScalarUserAndIdx collection for
// the remaining external uses - confirm this is intended.
19250 for (ExternalUser &EU : ExternalUses) {
19251 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
19252 if (EU.E.Idx == 0) {
19253 UsedLanes.set(EU.Lane);
19255 if (User && User->hasOneUse() &&
19257 Type *LocalTy = getValueType(User->user_back());
19258 if (!UserScalarTy && !isa<ScalableVectorType>(LocalTy)) {
19259 UserScalarTy = LocalTy;
19260 } else if (UserScalarTy != LocalTy) {
19261 AllUsersGEPSWithStoresLoads = false;
19262 break;
19263 }
19264 Pointers.push_back(User);
19265 } else {
19266 AllUsersGEPSWithStoresLoads = false;
19267 break;
19268 }
19269 }
19270 }
19271 AllUsersGEPSWithStoresLoads &= UsedLanes.all();
19272 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
// Main pass over the external uses: each use is either skipped as free or
// already accounted, recorded as an insertelement to be folded into a final
// shuffle, or charged an extract (with an extend when the entry has a reduced
// bitwidth). The per-use cost is scaled and accumulated in ExtractCost.
19273 for (ExternalUser &EU : ExternalUses) {
19274 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
19275 << EU.E.Idx << " in lane " << EU.Lane << "\n");
19276 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
19277 else dbgs() << " User: nullptr\n");
19278 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
19279
19280 // Uses by ephemeral values are free (because the ephemeral value will be
19281 // removed prior to code generation, and so the extraction will be
19282 // removed as well).
19283 if (EphValues.count(EU.User))
19284 continue;
19285
19286 // Check if the scalar for the given user or all users is accounted already.
19287 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
19288 (EU.User &&
19289 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
19290 continue;
19291
19292 // Used in unreachable blocks or in EH pads (rarely executed) or is
19293 // terminated with unreachable instruction.
19294 if (BasicBlock *UserParent =
19295 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
19296 UserParent &&
19297 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
19298 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
19299 continue;
19300
19301 // We only add extract cost once for the same scalar.
19302 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
19303 !ExtractCostCalculated.insert(EU.Scalar).second)
19304 continue;
19305
19306 // No extract cost for vector "scalar" if REVEC is disabled
19307 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
19308 continue;
19309
19310 // If found user is an insertelement, do not calculate extract cost but try
19311 // to detect it as a final shuffled/identity match.
19312 // TODO: what if a user is insertvalue when REVEC is enabled?
19313 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
19314 VU && VU->getOperand(1) == EU.Scalar) {
19315 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
19316 if (!UsedInserts.insert(VU).second)
19317 continue;
19318 std::optional<unsigned> InsertIdx = getElementIndex(VU);
19319 if (InsertIdx) {
19320 const TreeEntry *ScalarTE = &EU.E;
19321 auto *It = find_if(
19322 ShuffledInserts,
19323 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
19324 // Checks if 2 insertelements are from the same buildvector.
19325 InsertElementInst *VecInsert = Data.InsertElements.front();
19327 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
19328 Value *Op0 = II->getOperand(0);
19329 if (isVectorized(II) && !isVectorized(Op0))
19330 return nullptr;
19331 return Op0;
19332 });
19333 });
19334 int VecId = -1;
19335 if (It == ShuffledInserts.end()) {
19336 auto &Data = ShuffledInserts.emplace_back();
19337 Data.InsertElements.emplace_back(VU);
19338 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
19339 VecId = ShuffledInserts.size() - 1;
19340 auto It = MinBWs.find(ScalarTE);
19341 if (It != MinBWs.end() &&
19342 VectorCasts
19343 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
19344 .second) {
19345 unsigned BWSz = It->second.first;
19346 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
19347 unsigned VecOpcode;
19348 if (DstBWSz < BWSz)
19349 VecOpcode = Instruction::Trunc;
19350 else
19351 VecOpcode =
19352 It->second.second ? Instruction::SExt : Instruction::ZExt;
19354 InstructionCost C = TTI->getCastInstrCost(
19355 VecOpcode, FTy,
19356 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
19357 FTy->getNumElements()),
19359 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
19360 << " for extending externally used vector with "
19361 "non-equal minimum bitwidth.\n");
19362 Cost += C;
19363 }
19364 } else {
19365 if (isFirstInsertElement(VU, It->InsertElements.front()))
19366 It->InsertElements.front() = VU;
19367 VecId = std::distance(ShuffledInserts.begin(), It);
19368 }
19369 int InIdx = *InsertIdx;
19370 SmallVectorImpl<int> &Mask =
19371 ShuffledInserts[VecId].ValueMasks[ScalarTE];
19372 if (Mask.empty())
19373 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
19374 Mask[InIdx] = EU.Lane;
19375 DemandedElts[VecId].setBit(InIdx);
19376 continue;
19377 }
19378 }
19379 }
19380
19382 // If we plan to rewrite the tree in a smaller type, we will need to sign
19383 // extend the extracted value back to the original type. Here, we account
19384 // for the extract and the added cost of the sign extend if needed.
19385 InstructionCost ExtraCost = TTI::TCC_Free;
19386 auto *ScalarTy = EU.Scalar->getType();
19387 const unsigned BundleWidth = EU.E.getVectorFactor();
19388 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
19389 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
19390 const TreeEntry *Entry = &EU.E;
19391 auto It = MinBWs.find(Entry);
19392 if (It != MinBWs.end()) {
19393 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
19394 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
19395 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
19396 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
19397 ? Instruction::ZExt
19398 : Instruction::SExt;
19399 VecTy = getWidenedType(MinTy, BundleWidth);
19400 ExtraCost =
19401 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
19402 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
19403 << ExtraCost << "\n");
19404 } else {
19405 ExtraCost =
19406 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
19407 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
19408 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
19409 << *VecTy << ": " << ExtraCost << "\n");
19410 }
19411 // Leave the scalar instructions as is if they are cheaper than extracts.
19412 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
19413 Entry->getOpcode() == Instruction::Load) {
19414 // Checks if the user of the external scalar is phi in loop body.
19415 auto IsPhiInLoop = [&](const ExternalUser &U) {
19416 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
19417 auto *I = cast<Instruction>(U.Scalar);
19418 const Loop *L = LI->getLoopFor(Phi->getParent());
19419 return L && (Phi->getParent() == I->getParent() ||
19420 L == LI->getLoopFor(I->getParent()));
19421 }
19422 return false;
19423 };
19424 if (!ValueToExtUses) {
19425 ValueToExtUses.emplace();
19426 for (const auto &P : enumerate(ExternalUses)) {
19427 // Ignore phis in loops.
19428 if (IsPhiInLoop(P.value()))
19429 continue;
19430
19431 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
19432 }
19433 }
19434 // Can use original instruction, if no operands vectorized or they are
19435 // marked as externally used already.
19436 auto *Inst = cast<Instruction>(EU.Scalar);
19437 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
19438 auto OperandIsScalar = [&](Value *V) {
19439 if (!isVectorized(V)) {
19440 // Some extractelements might be not vectorized, but
19441 // transformed into shuffle and removed from the function,
19442 // consider it here.
19443 if (auto *EE = dyn_cast<ExtractElementInst>(V))
19444 return !EE->hasOneUse() || !MustGather.contains(EE);
19445 return true;
19446 }
19447 return ValueToExtUses->contains(V);
19448 };
19449 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
19450 bool CanBeUsedAsScalarCast = false;
19451 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
19452 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
19453 Op && all_of(Op->operands(), OperandIsScalar)) {
19454 InstructionCost OpCost =
19455 (isVectorized(Op) && !ValueToExtUses->contains(Op))
19456 ? TTI->getInstructionCost(Op, CostKind)
19457 : 0;
19458 if (ScalarCost + OpCost <= ExtraCost) {
19459 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
19460 ScalarCost += OpCost;
19461 }
19462 }
19463 }
19464 if (CanBeUsedAsScalar) {
19465 bool KeepScalar = ScalarCost <= ExtraCost;
19466 // Try to keep original scalar if the user is the phi node from the same
19467 // block as the root phis, currently vectorized. It allows to keep
19468 // better ordering info of PHIs, being vectorized currently.
19469 bool IsProfitablePHIUser =
19470 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
19471 VectorizableTree.front()->Scalars.size() > 2)) &&
19472 VectorizableTree.front()->hasState() &&
19473 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
19474 !Inst->hasNUsesOrMore(UsesLimit) &&
19475 none_of(Inst->users(),
19476 [&](User *U) {
19477 auto *PHIUser = dyn_cast<PHINode>(U);
19478 return (!PHIUser ||
19479 PHIUser->getParent() != VectorizableTree.front()
19480 ->getMainOp()
19481 ->getParent()) &&
19482 !isVectorized(U);
19483 }) &&
19484 count_if(Entry->Scalars, [&](Value *V) {
19485 return ValueToExtUses->contains(V);
19486 }) <= 2;
19487 if (IsProfitablePHIUser) {
19488 KeepScalar = true;
19489 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
19490 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
19491 (!GatheredLoadsEntriesFirst.has_value() ||
19492 Entry->Idx < *GatheredLoadsEntriesFirst)) {
19493 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
19494 return ValueToExtUses->contains(V);
19495 });
19496 auto It = ExtractsCount.find(Entry);
19497 if (It != ExtractsCount.end()) {
19498 assert(ScalarUsesCount >= It->getSecond().size() &&
19499 "Expected total number of external uses not less than "
19500 "number of scalar uses.");
19501 ScalarUsesCount -= It->getSecond().size();
19502 }
19503 // Keep original scalar if number of externally used instructions in
19504 // the same entry is not power of 2. It may help to do some extra
19505 // vectorization for now.
19506 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
19507 }
19508 if (KeepScalar) {
19509 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
19510 for (Value *V : Inst->operands()) {
19511 auto It = ValueToExtUses->find(V);
19512 if (It != ValueToExtUses->end()) {
19513 // Replace all uses to avoid compiler crash.
19514 ExternalUses[It->second].User = nullptr;
19515 }
19516 }
19517 ExtraCost = ScalarCost;
19518 if (!IsPhiInLoop(EU))
19519 ExtractsCount[Entry].insert(Inst);
19520 if (CanBeUsedAsScalarCast) {
19521 ScalarOpsFromCasts.insert(Inst->getOperand(0));
19522 // Update the users of the operands of the cast operand to avoid
19523 // compiler crash.
19524 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
19525 for (Value *V : IOp->operands()) {
19526 auto It = ValueToExtUses->find(V);
19527 if (It != ValueToExtUses->end()) {
19528 // Replace all uses to avoid compiler crash.
19529 ExternalUses[It->second].User = nullptr;
19530 }
19531 }
19532 }
19533 }
19534 }
19535 }
19536 }
19537
19538 ExtraCost = ScaleCost(ExtraCost, *Entry, EU.Scalar,
19539 cast_or_null<Instruction>(EU.User));
19540
19541 ExtractCost += ExtraCost;
19542 }
19543 // Charge the pointer-chain cost difference once for the root entry when
19544 // every external use of its scalars is a GEP feeding a single load/store
19545 // (see the detection loop above). Vectorizing the root in this pattern
19546 // forces lane extracts (or a vector GEP with unknown stride) to drive the
19547 // address computation, which is typically more expensive than keeping the
19548 // indices scalar in a unit-stride address chain. Add the delta once rather
19549 // than per external use.
19550 if (AllUsersGEPSWithStoresLoads && !Pointers.empty()) {
19551 const TreeEntry &RootEntry = *VectorizableTree.front();
19552 const bool AnyRootKeptAsScalar = any_of(RootEntry.Scalars, [&](Value *V) {
19553 return ExternalUsesAsOriginalScalar.contains(V);
19554 });
19555 const Value *CommonBase = nullptr;
19556 bool HaveCommonBase = true;
19557 for (const Value *P : Pointers) {
19558 const Value *Op = getUnderlyingObject(P);
19559 if (!CommonBase)
19560 CommonBase = Op;
19561 else if (CommonBase != Op) {
19562 HaveCommonBase = false;
19563 break;
19564 }
19565 }
19566 if (!AnyRootKeptAsScalar && HaveCommonBase) {
19568 auto *VecTy = getWidenedType(UserScalarTy, RootEntry.Scalars.size());
19569 InstructionCost ScalarGEPCost = TTI->getPointersChainCost(
19570 Pointers, CommonBase, TTI::PointersChainInfo::getUnitStride(),
19571 UserScalarTy, CostKind);
19572 InstructionCost VectorGEPCost = TTI->getPointersChainCost(
19573 Pointers, CommonBase, TTI::PointersChainInfo::getUnknownStride(),
19574 VecTy, CostKind);
19575 ExtractCost += ScaleCost(VectorGEPCost - ScalarGEPCost, RootEntry);
19576 }
19577 }
19578 // Insert externals for extract of operands of casts to be emitted as scalars
19579 // instead of extractelement.
19580 for (Value *V : ScalarOpsFromCasts) {
19581 ExternalUsesAsOriginalScalar.insert(V);
19582 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
19583 const auto *It = find_if_not(TEs, [&](TreeEntry *TE) {
19584 return TransformedToGatherNodes.contains(TE) ||
19585 DeletedNodes.contains(TE);
19586 });
19587 if (It != TEs.end()) {
19588 const TreeEntry *UserTE = *It;
19589 ExternalUses.emplace_back(V, nullptr, *UserTE,
19590 UserTE->findLaneForValue(V));
19591 }
19592 }
19593 }
19594 // Add reduced value cost, if resized.
19595 if (!VectorizedVals.empty()) {
19596 const TreeEntry &Root = *VectorizableTree.front();
19597 auto BWIt = MinBWs.find(&Root);
19598 if (BWIt != MinBWs.end()) {
19599 Type *DstTy = Root.Scalars.front()->getType();
19600 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
19601 unsigned SrcSz =
19602 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
19603 if (OriginalSz != SrcSz) {
19604 unsigned Opcode = Instruction::Trunc;
19605 if (OriginalSz > SrcSz)
19606 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
19607 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
19608 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
19609 assert(SLPReVec && "Only supported by REVEC.");
19610 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
19611 }
19612 InstructionCost CastCost =
19613 TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
19616 CastCost = ScaleCost(CastCost, Root, /*Scalar=*/nullptr, ReductionRoot);
19617 Cost += CastCost;
19618 }
19619 }
19620 }
19621
19622 // Buildvector with externally used scalars, which should remain as scalars,
19623 // should not be vectorized, the compiler may hang.
19624 if (SLPCostThreshold < 0 && VectorizableTree.size() > 1 &&
19625 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
19626 VectorizableTree[1]->hasState() &&
19627 VectorizableTree[1]->State == TreeEntry::Vectorize &&
19628 all_of(VectorizableTree[1]->Scalars, [&](Value *V) {
19629 return ExternalUsesAsOriginalScalar.contains(V);
19630 }))
19632
19633 Cost += ExtractCost;
// Callback used by the shuffle-combining step below: adds to Cost the shuffle
// cost needed to bring the vector of TE to the size of Mask. Returns TE paired
// with true when the mask refers to lanes >= Mask.size() (the cost is then
// taken for a mask resized to the entry's vector factor), and with false
// otherwise.
// NOTE(review): part of the first condition and the cost calls are missing
// from this listing.
19634 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
19635 bool ForSingleMask) {
19636 InstructionCost C = 0;
19637 unsigned VF = Mask.size();
19638 unsigned VecVF = TE->getVectorFactor();
19639 bool HasLargeIndex =
19640 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
19641 if ((VF != VecVF && HasLargeIndex) ||
19643
19644 if (HasLargeIndex) {
19645 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
19646 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
19647 OrigMask.begin());
19649 getWidenedType(TE->getMainOp()->getType(), VecVF),
19650 OrigMask);
19651 LLVM_DEBUG(
19652 dbgs() << "SLP: Adding cost " << C
19653 << " for final shuffle of insertelement external users.\n";
19654 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
19655 Cost += C;
19656 return std::make_pair(TE, true);
19657 }
19658
19659 if (!ForSingleMask) {
19660 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
19661 for (unsigned I = 0; I < VF; ++I) {
19662 if (Mask[I] != PoisonMaskElem)
19663 ResizeMask[Mask[I]] = Mask[I];
19664 }
19665 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
19668 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
19669 LLVM_DEBUG(
19670 dbgs() << "SLP: Adding cost " << C
19671 << " for final shuffle of insertelement external users.\n";
19672 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
19673
19674 Cost += C;
19675 }
19676 }
19677 return std::make_pair(TE, false);
19678 };
19679 // Calculate the cost of the reshuffled vectors, if any.
19680 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
19681 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
19682 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
19683 unsigned VF = 0;
// Callback that adds to Cost the scaled cost of the final shuffle: for one
// entry the cost is added only when the mask is not an identity; for two
// entries a two-source permutation (SK_PermuteTwoSrc) is costed. Afterwards VF
// is set to the mask size and the last entry is returned.
19684 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
19686 assert((TEs.size() == 1 || TEs.size() == 2) &&
19687 "Expected exactly 1 or 2 tree entries.");
19688 if (TEs.size() == 1) {
19689 if (VF == 0)
19690 VF = TEs.front()->getVectorFactor();
19691 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
19692 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
19693 !all_of(enumerate(Mask), [=](const auto &Data) {
19694 return Data.value() == PoisonMaskElem ||
19695 (Data.index() < VF &&
19696 static_cast<int>(Data.index()) == Data.value());
19697 })) {
19700 C = ScaleCost(C, *TEs.front());
19701 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
19702 << " for final shuffle of insertelement "
19703 "external users.\n";
19704 TEs.front()->dump();
19705 dbgs() << "SLP: Current total cost = " << Cost << "\n");
19706 Cost += C;
19707 }
19708 } else {
19709 if (VF == 0) {
19710 if (TEs.front() &&
19711 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
19712 VF = TEs.front()->getVectorFactor();
19713 else
19714 VF = Mask.size();
19715 }
19716 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
19718 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
19719 C = ScaleCost(C, *TEs.back());
19720 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
19721 << " for final shuffle of vector node and external "
19722 "insertelement users.\n";
19723 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
19724 dbgs() << "SLP: Current total cost = " << Cost << "\n");
19725 Cost += C;
19726 }
19727 VF = Mask.size();
19728 return TEs.back();
19729 };
19731 MutableArrayRef(Vector.data(), Vector.size()), Base,
19732 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
19733 EstimateShufflesCost);
// InsertCost is the insert-only scalarization overhead for the demanded lanes
// of the original buildvector; it is subtracted from the total, presumably
// because these insertelements are replaced by the shuffles costed above.
19734 InstructionCost InsertCost = TTI->getScalarizationOverhead(
19736 ShuffledInserts[I].InsertElements.front()->getType()),
19737 DemandedElts[I],
19738 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
19739 Cost -= InsertCost;
19740 }
19741
19742 // Add the cost for reduced value resize (if required).
19743 if (ReductionBitWidth != 0) {
19744 assert(UserIgnoreList && "Expected reduction tree.");
19745 const TreeEntry &E = *VectorizableTree.front();
19746 auto It = MinBWs.find(&E);
19747 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
19748 unsigned SrcSize = It->second.first;
19749 unsigned DstSize = ReductionBitWidth;
19750 unsigned Opcode = Instruction::Trunc;
19751 if (SrcSize < DstSize) {
19752 bool IsArithmeticExtendedReduction =
19753 all_of(*UserIgnoreList, [](Value *V) {
19754 auto *I = cast<Instruction>(V);
19755 return is_contained({Instruction::Add, Instruction::FAdd,
19756 Instruction::Mul, Instruction::FMul,
19757 Instruction::And, Instruction::Or,
19758 Instruction::Xor},
19759 I->getOpcode());
19760 });
19761 if (IsArithmeticExtendedReduction)
19762 Opcode =
19763 Instruction::BitCast; // Handle it by getExtendedReductionCost
19764 else
19765 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19766 }
19767 if (Opcode != Instruction::BitCast) {
19768 auto *SrcVecTy =
19769 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
19770 auto *DstVecTy =
19771 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
19772 TTI::CastContextHint CCH = getCastContextHint(E);
19773 switch (E.getOpcode()) {
19774 case Instruction::SExt:
19775 case Instruction::ZExt:
19776 case Instruction::Trunc: {
19777 const TreeEntry *OpTE = getOperandEntry(&E, 0);
19778 CCH = getCastContextHint(*OpTE);
19779 break;
19780 }
19781 default:
19782 break;
19783 }
19784 InstructionCost CastCost =
19785 TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
19787 CastCost = ScaleCost(CastCost, *VectorizableTree.front().get(),
19788 /*Scalar=*/nullptr, ReductionRoot);
19789 Cost += CastCost;
19790 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
19791 << " for final resize for reduction from " << SrcVecTy
19792 << " to " << DstVecTy << "\n";
19793 dbgs() << "SLP: Current total cost = " << Cost << "\n");
19794 }
19795 }
19796 }
19797
// The spill cost is computed and added only while the tree is still
// profitable (Cost below -SLPCostThreshold); otherwise the debug output below
// reports it as "<skipped>".
19798 std::optional<InstructionCost> SpillCost;
19799 if (Cost < -SLPCostThreshold) {
19800 SpillCost = getSpillCost();
19801 Cost += *SpillCost;
19802 }
19803#ifndef NDEBUG
19804 SmallString<256> Str;
19805 {
19806 raw_svector_ostream OS(Str);
19807 OS << "SLP: Spill Cost = ";
19808 if (SpillCost)
19809 OS << *SpillCost;
19810 else
19811 OS << "<skipped>";
19812 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n";
19813 if (ReductionRoot)
19814 OS << "SLP: Reduction Cost = " << ReductionCost << ".\n";
19815 OS << "SLP: Total Cost = " << Cost << ".\n";
19816 }
19817 LLVM_DEBUG(dbgs() << Str);
19818 if (ViewSLPTree)
19819 ViewGraph(this, "SLP" + F->getName(), false, Str);
19820#endif
19821
19822 return Cost;
19823}
19824
19825/// Tries to find extractelement instructions with constant indices from fixed
19826/// vector type and gather such instructions into a bunch, which highly likely
19827/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
19828/// successful, the matched scalars are replaced by poison values in \p VL for
19829/// future analysis.
19830std::optional<TTI::ShuffleKind>
19831BoUpSLP::tryToGatherSingleRegisterExtractElements(
19833 // Scan list of gathered scalars for extractelements that can be represented
19834 // as shuffles.
19836 SmallVector<int> UndefVectorExtracts;
19837 for (int I = 0, E = VL.size(); I < E; ++I) {
19838 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
19839 if (!EI) {
19840 if (isa<UndefValue>(VL[I]))
19841 UndefVectorExtracts.push_back(I);
19842 continue;
19843 }
19844 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
19845 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
19846 continue;
19847 std::optional<unsigned> Idx = getExtractIndex(EI);
19848 // Undefined index.
19849 if (!Idx) {
19850 UndefVectorExtracts.push_back(I);
19851 continue;
19852 }
19853 if (Idx >= VecTy->getNumElements()) {
19854 UndefVectorExtracts.push_back(I);
19855 continue;
19856 }
19857 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
19858 ExtractMask.reset(*Idx);
19859 if (isUndefVector</*IsPoisonOnly=*/true>(EI->getVectorOperand(),
19860 ExtractMask)
19861 .all()) {
19862 UndefVectorExtracts.push_back(I);
19863 continue;
19864 }
19865 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
19866 }
19867 // Sort the vector operands by the maximum number of uses in extractelements.
19869 VectorOpToIdx.takeVector();
19870 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
19871 return P1.second.size() > P2.second.size();
19872 });
19873 // Find the best pair of the vectors or a single vector.
19874 const int UndefSz = UndefVectorExtracts.size();
19875 unsigned SingleMax = 0;
19876 unsigned PairMax = 0;
19877 if (!Vectors.empty()) {
19878 SingleMax = Vectors.front().second.size() + UndefSz;
19879 if (Vectors.size() > 1) {
19880 auto *ItNext = std::next(Vectors.begin());
19881 PairMax = SingleMax + ItNext->second.size();
19882 }
19883 }
19884 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
19885 return std::nullopt;
19886 // Check if better to perform a shuffle of 2 vectors or just of a single
19887 // vector.
19888 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
19889 SmallVector<Value *> GatheredExtracts(
19890 VL.size(), PoisonValue::get(VL.front()->getType()));
19891 if (SingleMax >= PairMax && SingleMax) {
19892 for (int Idx : Vectors.front().second)
19893 std::swap(GatheredExtracts[Idx], VL[Idx]);
19894 } else if (!Vectors.empty()) {
19895 for (unsigned Idx : {0, 1})
19896 for (int Idx : Vectors[Idx].second)
19897 std::swap(GatheredExtracts[Idx], VL[Idx]);
19898 }
19899 // Add extracts from undefs too.
19900 for (int Idx : UndefVectorExtracts)
19901 std::swap(GatheredExtracts[Idx], VL[Idx]);
19902 // Check that gather of extractelements can be represented as just a
19903 // shuffle of a single/two vectors the scalars are extracted from.
19904 std::optional<TTI::ShuffleKind> Res =
19905 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
19906 if (!Res || all_of(Mask, equal_to(PoisonMaskElem))) {
19907 // TODO: try to check other subsets if possible.
19908 // Restore the original VL if attempt was not successful.
19909 copy(SavedVL, VL.begin());
19910 return std::nullopt;
19911 }
19912 // Restore unused scalars from mask, if some of the extractelements were not
19913 // selected for shuffle.
19914 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
19915 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
19916 isa<UndefValue>(GatheredExtracts[I])) {
19917 std::swap(VL[I], GatheredExtracts[I]);
19918 continue;
19919 }
19920 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
19921 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
19922 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
19923 is_contained(UndefVectorExtracts, I))
19924 continue;
19925 }
19926 return Res;
19927}
19928
19929/// Tries to find extractelement instructions with constant indices from fixed
19930/// vector type and gather such instructions into a bunch, which highly likely
19931/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
19932/// successful, the matched scalars are replaced by poison values in \p VL for
19933/// future analysis.
19935BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
19936 SmallVectorImpl<int> &Mask,
19937 unsigned NumParts) const {
19938 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
19939 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
19940 Mask.assign(VL.size(), PoisonMaskElem);
19941 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
19942 for (unsigned Part : seq<unsigned>(NumParts)) {
19943 // Scan list of gathered scalars for extractelements that can be represented
19944 // as shuffles.
19945 const unsigned PartOffset = Part * SliceSize;
19946 const unsigned PartSize = getNumElems(VL.size(), SliceSize, Part);
19947 // It may happen in case of revec, need to check no access out of bounds.
19948 if (PartOffset + PartSize > VL.size())
19949 break;
19951 MutableArrayRef(VL).slice(PartOffset, PartSize);
19952 SmallVector<int> SubMask;
19953 std::optional<TTI::ShuffleKind> Res =
19954 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
19955 ShufflesRes[Part] = Res;
19956 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
19957 if (SubVL.size() != SliceSize)
19958 break;
19959 }
19960 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
19961 return Res.has_value();
19962 }))
19963 ShufflesRes.clear();
19964 return ShufflesRes;
19965}
19966
19967std::optional<TargetTransformInfo::ShuffleKind>
19968BoUpSLP::isGatherShuffledSingleRegisterEntry(
19969 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
19970 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder,
19971 unsigned SliceSize) {
19972 Entries.clear();
19973 if (TE->Idx == 0)
19974 return std::nullopt;
19975 const unsigned MaskBase = Part * SliceSize;
19976 // TODO: currently checking only for Scalars in the tree entry, need to count
19977 // reused elements too for better cost estimation.
19978 auto GetUserEntry = [&](const TreeEntry *TE) {
19979 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
19980 TE = TE->UserTreeIndex.UserTE;
19981 if (TE == VectorizableTree.front().get())
19982 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
19983 return TE->UserTreeIndex;
19984 };
19985 auto HasGatherUser = [&](const TreeEntry *TE) {
19986 while (TE->Idx != 0 && TE->UserTreeIndex) {
19987 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
19988 return true;
19989 TE = TE->UserTreeIndex.UserTE;
19990 }
19991 return false;
19992 };
19993 const EdgeInfo TEUseEI = GetUserEntry(TE);
19994 if (!TEUseEI || (TEUseEI.UserTE->Idx == 0 && TEUseEI.UserTE->isGather() &&
19995 !TEUseEI.UserTE->hasState()))
19996 return std::nullopt;
19997 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
19998 const BasicBlock *TEInsertBlock = nullptr;
19999 // Main node of PHI entries keeps the correct order of operands/incoming
20000 // blocks.
20001 if (auto *PHI = dyn_cast_or_null<PHINode>(
20002 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
20003 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
20004 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
20005 TEInsertPt = TEInsertBlock->getTerminator();
20006 } else {
20007 TEInsertBlock = TEInsertPt->getParent();
20008 }
20009 if (!DT->isReachableFromEntry(TEInsertBlock))
20010 return std::nullopt;
20011 auto *NodeUI = DT->getNode(TEInsertBlock);
20012 assert(NodeUI && "Should only process reachable instructions");
20013 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
20014 auto CheckOrdering = [&](const Instruction *InsertPt) {
20015 // Argument InsertPt is an instruction where vector code for some other
20016 // tree entry (one that shares one or more scalars with TE) is going to be
20017 // generated. This lambda returns true if insertion point of vector code
20018 // for the TE dominates that point (otherwise dependency is the other way
20019 // around). The other node is not limited to be of a gather kind. Gather
20020 // nodes are not scheduled and their vector code is inserted before their
20021 // first user. If user is PHI, that is supposed to be at the end of a
20022 // predecessor block. Otherwise it is the last instruction among scalars of
20023 // the user node. So, instead of checking dependency between instructions
20024 // themselves, we check dependency between their insertion points for vector
20025 // code (since each scalar instruction ends up as a lane of a vector
20026 // instruction).
20027 const BasicBlock *InsertBlock = InsertPt->getParent();
20028 auto *NodeEUI = DT->getNode(InsertBlock);
20029 if (!NodeEUI)
20030 return false;
20031 assert((NodeUI == NodeEUI) ==
20032 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
20033 "Different nodes should have different DFS numbers");
20034 // Check the order of the gather nodes users.
20035 if (TEInsertPt->getParent() != InsertBlock &&
20036 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
20037 return false;
20038 if (TEInsertPt->getParent() == InsertBlock &&
20039 TEInsertPt->comesBefore(InsertPt))
20040 return false;
20041 return true;
20042 };
20043 // Find all tree entries used by the gathered values. If no common entries
20044 // found - not a shuffle.
20045 // Here we build a set of tree nodes for each gathered value and trying to
20046 // find the intersection between these sets. If we have at least one common
20047 // tree node for each gathered value - we have just a permutation of the
20048 // single vector. If we have 2 different sets, we're in situation where we
20049 // have a permutation of 2 input vectors.
20051 SmallDenseMap<Value *, int> UsedValuesEntry;
20052 SmallPtrSet<const Value *, 16> VisitedValue;
20053 bool IsReusedNodeFound = false;
20054 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
20055 // The node is reused - exit.
20056 if (IsReusedNodeFound)
20057 return false;
20058 if ((TEPtr->getVectorFactor() != VL.size() &&
20059 TEPtr->Scalars.size() != VL.size()) ||
20060 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
20061 return false;
20062 IsReusedNodeFound =
20063 equal(TE->Scalars, TEPtr->Scalars) &&
20064 equal(TE->ReorderIndices, TEPtr->ReorderIndices) &&
20065 equal(TE->ReuseShuffleIndices, TEPtr->ReuseShuffleIndices);
20066 UsedTEs.clear();
20067 UsedTEs.emplace_back().insert(TEPtr);
20068 for (Value *V : VL) {
20069 if (isConstant(V))
20070 continue;
20071 UsedValuesEntry.try_emplace(V, 0);
20072 }
20073 return true;
20074 };
20075 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
20076 unsigned EdgeIdx) {
20077 const TreeEntry *Ptr1 = User1;
20078 const TreeEntry *Ptr2 = User2;
20079 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
20080 while (Ptr2) {
20081 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
20082 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
20083 Ptr2 = Ptr2->UserTreeIndex.UserTE;
20084 }
20085 while (Ptr1) {
20086 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
20087 Ptr1 = Ptr1->UserTreeIndex.UserTE;
20088 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
20089 return Idx < It->second;
20090 }
20091 return false;
20092 };
20093 // Cache `isUsedOutsideBlock(TEInsertPt)` - TEInsertPt is loop-invariant and
20094 // the function walks the instruction's user list.
20095 std::optional<bool> TEInsertPtUsedOutsideBlock;
20096 auto IsTEInsertPtUsedOutsideBlock = [&] {
20097 if (!TEInsertPtUsedOutsideBlock)
20098 TEInsertPtUsedOutsideBlock =
20099 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt));
20100 return *TEInsertPtUsedOutsideBlock;
20101 };
20102 // Cache the TEUseEI/TEInsertPt-only prefix of the per-call lambda predicate
20103 // below - all of these depend only on outer-scope state, not the lambda's
20104 // arguments.
20105 const bool TEUseEIInsertPtUsedOutside =
20106 TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
20107 !TEUseEI.UserTE->isCopyableElement(
20108 const_cast<Instruction *>(TEInsertPt)) &&
20109 IsTEInsertPtUsedOutsideBlock();
20110 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
20111 Instruction *InsertPt) {
20112 return TEUseEIInsertPtUsedOutside &&
20113 InsertPt->getNextNode() == TEInsertPt &&
20114 (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
20115 !isUsedOutsideBlock(InsertPt));
20116 };
20117 // Cache the TEUseEI.UserTE-dependent predicate - it is invariant across the
20118 // double loop below. all_of with isUsedOutsideBlock walks each scalar's
20119 // users and is the expensive component.
20120 const bool TEUserNeedsEmitFirst =
20121 TEUseEI.UserTE->State == TreeEntry::Vectorize &&
20122 TEUseEI.UserTE->hasState() &&
20123 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
20124 TEUseEI.UserTE->isAltShuffle()) &&
20125 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock);
20126 // Cache `all_of(UserTE->Scalars, isUsedOutsideBlock)` per UserTE - the
20127 // same UserTE may be encountered for many TEPtr values inside the loop.
20128 SmallDenseMap<const TreeEntry *, bool> ScalarsUsedOutsideBlockCache;
20129 auto AllScalarsUsedOutsideBlock = [&](const TreeEntry *UserTE) {
20130 auto [It, Inserted] = ScalarsUsedOutsideBlockCache.try_emplace(UserTE);
20131 if (!Inserted)
20132 return It->second;
20133 bool Res = all_of(UserTE->Scalars, isUsedOutsideBlock);
20134 It->second = Res;
20135 return Res;
20136 };
20137 for (Value *V : VL) {
20138 if (isConstant(V) || !VisitedValue.insert(V).second)
20139 continue;
20140 // Build a list of tree entries where V is used.
20141 SmallPtrSet<const TreeEntry *, 4> VToTEs;
20143 ValueToGatherNodes.lookup(V).takeVector());
20144 if (TransformedToGatherNodes.contains(TE)) {
20145 for (TreeEntry *E : getSplitTreeEntries(V)) {
20146 if (TE == E || !TransformedToGatherNodes.contains(E) ||
20147 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
20148 continue;
20149 GatherNodes.push_back(E);
20150 }
20151 for (TreeEntry *E : getTreeEntries(V)) {
20152 if (TE == E || !TransformedToGatherNodes.contains(E) ||
20153 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
20154 continue;
20155 GatherNodes.push_back(E);
20156 }
20157 }
20158 for (const TreeEntry *TEPtr : GatherNodes) {
20159 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
20160 continue;
20161 assert(any_of(TEPtr->Scalars,
20162 [&](Value *V) { return GatheredScalars.contains(V); }) &&
20163 "Must contain at least single gathered value.");
20164 assert(TEPtr->UserTreeIndex &&
20165 "Expected only single user of a gather node.");
20166 if (any_of(TEPtr->CombinedEntriesWithIndices,
20167 [&](const auto &P) { return P.first == TE->Idx; }))
20168 continue;
20169 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
20170
20171 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
20172 UseEI.UserTE->hasState())
20173 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
20174 : nullptr;
20175 Instruction *InsertPt =
20176 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
20177 : &getLastInstructionInBundle(UseEI.UserTE);
20178 if (TEInsertPt == InsertPt) {
20179 // Check nodes, which might be emitted first.
20180 if (TEUserNeedsEmitFirst) {
20181 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
20182 (UseEI.UserTE->hasState() &&
20183 UseEI.UserTE->getOpcode() == Instruction::PHI &&
20184 !UseEI.UserTE->isAltShuffle()) ||
20185 !AllScalarsUsedOutsideBlock(UseEI.UserTE))
20186 continue;
20187 }
20188
20189 // If the schedulable insertion point is used in multiple entries - just
20190 // exit, no known ordering at this point, available only after real
20191 // scheduling.
20192 if (!doesNotNeedToBeScheduled(InsertPt) &&
20193 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
20194 continue;
20195 // If the users are the PHI nodes with the same incoming blocks - skip.
20196 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
20197 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
20198 UseEI.UserTE->State == TreeEntry::Vectorize &&
20199 UseEI.UserTE->getOpcode() == Instruction::PHI &&
20200 TEUseEI.UserTE != UseEI.UserTE)
20201 continue;
20202 // If 2 gathers are operands of the same entry (regardless of whether
20203 // user is PHI or else), compare operands indices, use the earlier one
20204 // as the base.
20205 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
20206 continue;
20207 // If the user instruction is used for some reason in different
20208 // vectorized nodes - make it depend on index.
20209 if (TEUseEI.UserTE != UseEI.UserTE &&
20210 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
20211 HasGatherUser(TEUseEI.UserTE)))
20212 continue;
20213 // If the user node is the operand of the other user node - skip.
20214 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
20215 continue;
20216 }
20217
20218 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
20219 TEUseEI.UserTE->doesNotNeedToSchedule() !=
20220 UseEI.UserTE->doesNotNeedToSchedule() &&
20221 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
20222 continue;
20223 // Check if the user node of the TE comes after user node of TEPtr,
20224 // otherwise TEPtr depends on TE.
20225 if ((TEInsertBlock != InsertPt->getParent() ||
20226 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
20227 (!CheckOrdering(InsertPt) ||
20228 (UseEI.UserTE->hasCopyableElements() &&
20229 IsTEInsertPtUsedOutsideBlock() &&
20230 is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
20231 continue;
20232 // The node is reused - exit.
20233 if (CheckAndUseSameNode(TEPtr))
20234 break;
20235 // The parent node is copyable with last inst used outside? And the last
20236 // inst is the next inst for the lastinst of TEPtr? Exit, if yes, to
20237 // preserve def-use chain.
20238 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
20239 continue;
20240 VToTEs.insert(TEPtr);
20241 }
20242 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
20243 const auto *It = find_if(VTEs, [&](const TreeEntry *MTE) {
20244 return MTE != TE && MTE != TEUseEI.UserTE &&
20245 !DeletedNodes.contains(MTE) &&
20246 !TransformedToGatherNodes.contains(MTE);
20247 });
20248 if (It != VTEs.end()) {
20249 const TreeEntry *VTE = *It;
20250 if (none_of(TE->CombinedEntriesWithIndices,
20251 [&](const auto &P) { return P.first == VTE->Idx; })) {
20252 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
20253 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
20254 continue;
20255 }
20256 // The node is reused - exit.
20257 if (CheckAndUseSameNode(VTE))
20258 break;
20259 VToTEs.insert(VTE);
20260 }
20261 }
20262 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
20263 const auto *It = find_if(VTEs, [&, MainTE = TE](const TreeEntry *TE) {
20264 return TE != MainTE && !DeletedNodes.contains(TE) &&
20265 !TransformedToGatherNodes.contains(TE);
20266 });
20267 if (It != VTEs.end()) {
20268 const TreeEntry *VTE = *It;
20269 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
20270 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
20271 VTEs = VTEs.drop_front();
20272 // Iterate through all vectorized nodes.
20273 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
20274 return MTE->State == TreeEntry::Vectorize;
20275 });
20276 if (MIt == VTEs.end())
20277 continue;
20278 VTE = *MIt;
20279 }
20280 if (none_of(TE->CombinedEntriesWithIndices,
20281 [&](const auto &P) { return P.first == VTE->Idx; })) {
20282 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
20283 if (&LastBundleInst == TEInsertPt ||
20284 !CheckOrdering(&LastBundleInst) ||
20285 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
20286 continue;
20287 }
20288 // The node is reused - exit.
20289 if (CheckAndUseSameNode(VTE))
20290 break;
20291 VToTEs.insert(VTE);
20292 }
20293 }
20294 if (IsReusedNodeFound)
20295 break;
20296 if (VToTEs.empty())
20297 continue;
20298 if (UsedTEs.empty()) {
20299 // The first iteration, just insert the list of nodes to vector.
20300 UsedTEs.push_back(VToTEs);
20301 UsedValuesEntry.try_emplace(V, 0);
20302 } else {
20303 // Need to check if there are any previously used tree nodes which use V.
20304 // If there are no such nodes, consider that we have another one input
20305 // vector.
20306 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
20307 unsigned Idx = 0;
20308 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
20309 // Do we have a non-empty intersection of previously listed tree entries
20310 // and tree entries using current V?
20311 set_intersect(VToTEs, Set);
20312 if (!VToTEs.empty()) {
20313 // Yes, write the new subset and continue analysis for the next
20314 // scalar.
20315 Set.swap(VToTEs);
20316 break;
20317 }
20318 VToTEs = SavedVToTEs;
20319 ++Idx;
20320 }
20321 // No non-empty intersection found - need to add a second set of possible
20322 // source vectors.
20323 if (Idx == UsedTEs.size()) {
20324 // If the number of input vectors is greater than 2 - not a permutation,
20325 // fallback to the regular gather.
20326 // TODO: support multiple reshuffled nodes.
20327 if (UsedTEs.size() == 2)
20328 continue;
20329 UsedTEs.push_back(SavedVToTEs);
20330 Idx = UsedTEs.size() - 1;
20331 }
20332 UsedValuesEntry.try_emplace(V, Idx);
20333 }
20334 }
20335
20336 if (UsedTEs.empty()) {
20337 Entries.clear();
20338 return std::nullopt;
20339 }
20340
20341 unsigned VF = 0;
20342 if (UsedTEs.size() == 1) {
20343 // Keep the order to avoid non-determinism.
20344 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
20345 UsedTEs.front().end());
20346 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
20347 return TE1->Idx < TE2->Idx;
20348 });
20349 // Try to find the perfect match in another gather node at first.
20350 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
20351 return (EntryPtr->getVectorFactor() == TE->Scalars.size() &&
20352 EntryPtr->isSame(TE->Scalars)) ||
20353 EntryPtr->isSame(VL);
20354 });
20355 if (It != FirstEntries.end() &&
20356 (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size() ||
20357 ((*It)->getVectorFactor() == TE->Scalars.size() &&
20358 TE->ReuseShuffleIndices.size() == VL.size() &&
20359 (*It)->isSame(TE->Scalars)))) {
20360 Entries.push_back(*It);
20361 if (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size()) {
20362 std::iota(std::next(Mask.begin(), MaskBase),
20363 std::next(Mask.begin(), MaskBase + VL.size()), 0);
20364 } else {
20365 SmallVector<int> CommonMask = TE->getCommonMask();
20366 copy(CommonMask, Mask.begin());
20367 }
20368 // Clear undef scalars.
20369 for (unsigned I : seq<unsigned>(VL.size()))
20370 if (isa<PoisonValue>(VL[I]))
20371 Mask[MaskBase + I] = PoisonMaskElem;
20373 }
20374 // No perfect match, just shuffle, so choose the first tree node from the
20375 // tree.
20376 Entries.push_back(FirstEntries.front());
20377 // Update mapping between values and corresponding tree entries.
20378 for (auto &P : UsedValuesEntry)
20379 P.second = 0;
20380 VF = FirstEntries.front()->getVectorFactor();
20381 } else {
20382 // Try to find nodes with the same vector factor.
20383 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
20384 // Keep the order of tree nodes to avoid non-determinism.
20385 DenseMap<int, const TreeEntry *> VFToTE;
20386 for (const TreeEntry *TE : UsedTEs.front()) {
20387 unsigned VF = TE->getVectorFactor();
20388 auto It = VFToTE.find(VF);
20389 if (It != VFToTE.end()) {
20390 if (It->second->Idx > TE->Idx)
20391 It->getSecond() = TE;
20392 continue;
20393 }
20394 VFToTE.try_emplace(VF, TE);
20395 }
20396 // Same, keep the order to avoid non-determinism.
20397 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
20398 UsedTEs.back().end());
20399 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
20400 return TE1->Idx < TE2->Idx;
20401 });
20402 for (const TreeEntry *TE : SecondEntries) {
20403 auto It = VFToTE.find(TE->getVectorFactor());
20404 if (It != VFToTE.end()) {
20405 VF = It->first;
20406 Entries.push_back(It->second);
20407 Entries.push_back(TE);
20408 break;
20409 }
20410 }
20411 // No 2 source vectors with the same vector factor - just choose 2 with max
20412 // index.
20413 if (Entries.empty()) {
20415 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
20416 return TE1->Idx < TE2->Idx;
20417 }));
20418 Entries.push_back(SecondEntries.front());
20419 VF = std::max(Entries.front()->getVectorFactor(),
20420 Entries.back()->getVectorFactor());
20421 } else {
20422 VF = Entries.front()->getVectorFactor();
20423 }
20424 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
20425 for (const TreeEntry *E : Entries)
20426 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
20427 E->Scalars.end());
20428 // Update mapping between values and corresponding tree entries.
20429 for (auto &P : UsedValuesEntry) {
20430 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
20431 if (ValuesToEntries[Idx].contains(P.first)) {
20432 P.second = Idx;
20433 break;
20434 }
20435 }
20436 }
20437
20438 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
20439 // Checks if the 2 PHIs are compatible in terms of high possibility to be
20440 // vectorized.
20441 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
20442 auto *PHI = cast<PHINode>(V);
20443 auto *PHI1 = cast<PHINode>(V1);
20444 // Check that all incoming values are compatible/from same parent (if they
20445 // are instructions).
20446 // The incoming values are compatible if they all are constants, or
20447 // instruction with the same/alternate opcodes from the same basic block.
20448 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
20449 Value *In = PHI->getIncomingValue(I);
20450 Value *In1 = PHI1->getIncomingValue(I);
20451 if (isConstant(In) && isConstant(In1))
20452 continue;
20453 if (!getSameOpcode({In, In1}, *TLI))
20454 return false;
20455 if (cast<Instruction>(In)->getParent() !=
20457 return false;
20458 }
20459 return true;
20460 };
20461 // Check if the value can be ignored during analysis for shuffled gathers.
20462 // We suppose it is better to ignore instruction, which do not form splats,
20463 // are not vectorized/not extractelements (these instructions will be handled
20464 // by extractelements processing) or may form vector node in future.
20465 // Cache results - each V in VL is queried up to 3 times (direct +
20466 // NeighborMightBeIgnored from both neighbors), and areAllUsersVectorized
20467 // walks each instruction's user list.
20468 SmallDenseMap<Value *, bool> MightBeIgnoredCache;
20469 auto MightBeIgnored = [=, &MightBeIgnoredCache](Value *V) {
20470 auto [It, Inserted] = MightBeIgnoredCache.try_emplace(V);
20471 if (!Inserted)
20472 return It->second;
20473 auto *I = dyn_cast<Instruction>(V);
20474 bool Res = I && !IsSplatOrUndefs && !isVectorized(I) &&
20476 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
20477 It->second = Res;
20478 return Res;
20479 };
20480 // Check that the neighbor instruction may form a full vector node with the
20481 // current instruction V. It is possible, if they have same/alternate opcode
20482 // and same parent basic block.
20483 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
20484 Value *V1 = VL[Idx];
20485 bool UsedInSameVTE = false;
20486 auto It = UsedValuesEntry.find(V1);
20487 if (It != UsedValuesEntry.end())
20488 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
20489 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
20490 getSameOpcode({V, V1}, *TLI) &&
20491 cast<Instruction>(V)->getParent() ==
20492 cast<Instruction>(V1)->getParent() &&
20493 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
20494 };
20495 // Build a shuffle mask for better cost estimation and vector emission.
20496 SmallBitVector UsedIdxs(Entries.size());
20498 for (int I = 0, E = VL.size(); I < E; ++I) {
20499 Value *V = VL[I];
20500 auto It = UsedValuesEntry.find(V);
20501 if (It == UsedValuesEntry.end())
20502 continue;
20503 // Do not try to shuffle scalars, if they are constants, or instructions
20504 // that can be vectorized as a result of the following vector build
20505 // vectorization.
20506 if (isConstant(V) || (MightBeIgnored(V) &&
20507 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
20508 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
20509 continue;
20510 unsigned Idx = It->second;
20511 EntryLanes.emplace_back(Idx, I);
20512 UsedIdxs.set(Idx);
20513 }
20514 // Iterate through all shuffled scalars and select entries, which can be used
20515 // for final shuffle.
20517 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
20518 if (!UsedIdxs.test(I))
20519 continue;
20520 // Fix the entry number for the given scalar. If it is the first entry, set
20521 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
20522 // These indices are used when calculating final shuffle mask as the vector
20523 // offset.
20524 for (std::pair<unsigned, int> &Pair : EntryLanes)
20525 if (Pair.first == I)
20526 Pair.first = TempEntries.size();
20527 TempEntries.push_back(Entries[I]);
20528 }
20529 Entries.swap(TempEntries);
20530 if (EntryLanes.size() == Entries.size() &&
20531 !VL.equals(ArrayRef(TE->Scalars)
20532 .slice(MaskBase, getNumElems(TE->Scalars.size(), SliceSize,
20533 Part)))) {
20534 // We may have here 1 or 2 entries only. If the number of scalars is equal
20535 // to the number of entries, no need to do the analysis, it is not very
20536 // profitable. Since VL is not the same as TE->Scalars, it means we already
20537 // have some shuffles before. Cut off not profitable case.
20538 Entries.clear();
20539 return std::nullopt;
20540 }
20541 // Build the final mask, check for the identity shuffle, if possible.
20542 bool IsIdentity = Entries.size() == 1;
20543 // Pair.first is the offset to the vector, while Pair.second is the index of
20544 // scalar in the list.
20545 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
20546 unsigned Idx = MaskBase + Pair.second;
20547 Mask[Idx] =
20548 Pair.first * VF +
20549 (ForOrder ? std::distance(
20550 Entries[Pair.first]->Scalars.begin(),
20551 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
20552 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
20553 IsIdentity &= Mask[Idx] == Pair.second;
20554 }
20555 if (ForOrder || IsIdentity || Entries.empty()) {
20556 switch (Entries.size()) {
20557 case 1:
20558 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
20560 break;
20561 case 2:
20562 if (EntryLanes.size() > 2 || VL.size() <= 2)
20564 break;
20565 default:
20566 break;
20567 }
20568 } else if (!isa<VectorType>(VL.front()->getType()) &&
20569 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
20570 // Do the cost estimation to check if the shuffle is more beneficial than
      // a buildvector.
20571 SmallVector<int> SubMask(std::next(Mask.begin(), MaskBase),
20572 std::next(Mask.begin(), MaskBase + VL.size()));
20573 int MinElement = SubMask.front(), MaxElement = SubMask.front();
20574 for (int Idx : SubMask) {
20575 if (Idx == PoisonMaskElem)
20576 continue;
20577 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
20578 MinElement = Idx;
20579 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
20580 MaxElement = Idx;
20581 }
20582 assert(MaxElement >= 0 && MinElement >= 0 &&
20583 MaxElement % VF >= MinElement % VF &&
20584 "Expected at least single element.");
20585 // If the leading [0, MinIdx) range sits in its own register part(s),
20586 // skip those whole parts when sizing the destination - everything below
20587 // the register-aligned floor is unused and never indexed.
20588 unsigned Offset = 0;
20589 unsigned MinIdx = MinElement % VF;
20590 if (MinIdx > 1) {
20591 unsigned RegFloor = getFloorFullVectorNumberOfElements(
20592 *TTI, VL.front()->getType(), MinIdx);
20593 auto *RegFloorTy = getWidenedType(VL.front()->getType(), RegFloor);
20594 unsigned RegFloorParts =
20595 ::getNumberOfParts(*TTI, RegFloorTy, VL.front()->getType(), RegFloor);
20596 if (RegFloorParts > 1)
20597 Offset = RegFloor;
20598 }
20599 unsigned NewVF =
20600 std::max<unsigned>(VL.size(), (MaxElement % VF) - Offset + 1);
20601 if (NewVF < VF) {
20602 for (int &Idx : SubMask) {
20603 if (Idx == PoisonMaskElem)
20604 continue;
20605 Idx = (Idx % VF) - Offset + (Idx >= static_cast<int>(VF) ? NewVF : 0);
20606 }
20607 } else {
20608 NewVF = VF;
20609 }
20610
20612 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
20613 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
20614 auto GetShuffleCost = [&,
20615 &TTI = *TTI](ArrayRef<int> Mask,
20617 VectorType *VecTy) -> InstructionCost {
20618 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
20620 Mask, Entries.front()->getInterleaveFactor()))
20621 return TTI::TCC_Free;
20622 return ::getShuffleCost(TTI,
20623 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
20625 VecTy, Mask, CostKind);
20626 };
20627 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
20628 InstructionCost FirstShuffleCost = 0;
20629 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
20630 if (Entries.size() == 1 || !Entries[0]->isGather()) {
20631 FirstShuffleCost = ShuffleCost;
20632 } else {
20633 // Transform mask to include only first entry.
20634 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
20635 bool IsIdentity = true;
20636 for (auto [I, Idx] : enumerate(FirstMask)) {
20637 if (Idx >= static_cast<int>(NewVF)) {
20638 Idx = PoisonMaskElem;
20639 } else {
20640 DemandedElts.clearBit(I);
20641 if (Idx != PoisonMaskElem)
20642 IsIdentity &= static_cast<int>(I) == Idx;
20643 }
20644 }
20645 if (!IsIdentity)
20646 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
20647 FirstShuffleCost += getScalarizationOverhead(
20648 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
20649 /*Extract=*/false, CostKind);
20650 }
20651 InstructionCost SecondShuffleCost = 0;
20652 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
20653 if (Entries.size() == 1 || !Entries[1]->isGather()) {
20654 SecondShuffleCost = ShuffleCost;
20655 } else {
20656 // Transform mask to include only first entry.
20657 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
20658 bool IsIdentity = true;
20659 for (auto [I, Idx] : enumerate(SecondMask)) {
20660 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
20661 Idx = PoisonMaskElem;
20662 } else {
20663 DemandedElts.clearBit(I);
20664 if (Idx != PoisonMaskElem) {
20665 Idx -= NewVF;
20666 IsIdentity &= static_cast<int>(I) == Idx;
20667 }
20668 }
20669 }
20670 if (!IsIdentity)
20671 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
20672 SecondShuffleCost += getScalarizationOverhead(
20673 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
20674 /*Extract=*/false, CostKind);
20675 }
20676 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
20677 for (auto [I, Idx] : enumerate(SubMask))
20678 if (Idx == PoisonMaskElem)
20679 DemandedElts.clearBit(I);
20680 InstructionCost BuildVectorCost = getScalarizationOverhead(
20681 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
20682 /*Extract=*/false, CostKind);
20683 const TreeEntry *BestEntry = nullptr;
20684 auto MaskSlice = MutableArrayRef(Mask).slice(MaskBase, VL.size());
20685 if (FirstShuffleCost < ShuffleCost) {
20686 for (int &Idx : MaskSlice)
20687 if (Idx >= static_cast<int>(VF))
20688 Idx = PoisonMaskElem;
20689 BestEntry = Entries.front();
20690 ShuffleCost = FirstShuffleCost;
20691 }
20692 if (SecondShuffleCost < ShuffleCost) {
20693 for (int &Idx : MaskSlice) {
20694 if (Idx < static_cast<int>(VF))
20695 Idx = PoisonMaskElem;
20696 else
20697 Idx -= VF;
20698 }
20699 BestEntry = Entries[1];
20700 ShuffleCost = SecondShuffleCost;
20701 }
20702 if (BuildVectorCost >= ShuffleCost) {
20703 if (BestEntry) {
20704 Entries.clear();
20705 Entries.push_back(BestEntry);
20706 }
20707 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
20709 }
20710 }
20711 Entries.clear();
20712 // Clear the corresponding mask elements.
20713 std::fill(std::next(Mask.begin(), MaskBase),
20714 std::next(Mask.begin(), MaskBase + VL.size()), PoisonMaskElem);
20715 return std::nullopt;
20716}
20717
// Checks whether the gather node \p TE, whose scalars are \p VL, can be
// expressed through shuffles of tree entries. \p VL is processed in at most
// \p NumParts slices; each slice is handed to
// isGatherShuffledSingleRegisterEntry together with the shared \p Mask.
// On return \p Entries holds one list of source entries per processed slice
// and the result holds one optional shuffle kind per slice (std::nullopt for a
// slice that cannot be shuffled). An empty result means that no slice can be
// shuffled. \p ForOrder is only forwarded to the per-slice check.
// NOTE(review): listing line 20718 (presumably the return type of this
// definition) is absent from this extract.
20719BoUpSLP::isGatherShuffledEntry(
20720 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
20721 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
20722 bool ForOrder) {
20723 assert(NumParts > 0 && NumParts < VL.size() &&
20724 "Expected positive number of registers.");
20725 Entries.clear();
20726 // No need to check for the topmost gather node.
// The early exit is taken for the first tree entry when no gathered-loads
// marker is set, or when every other tree entry is a gather node.
20727 if (TE == VectorizableTree.front().get() &&
20728 (!GatheredLoadsEntriesFirst.has_value() ||
20729 none_of(ArrayRef(VectorizableTree).drop_front(),
20730 [](const std::unique_ptr<TreeEntry> &TE) {
20731 return !TE->isGather();
20732 })))
20733 return {};
// Start from an all-poison mask; the per-slice checks fill it in.
20734 Mask.assign(VL.size(), PoisonMaskElem);
20735 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
20736 "Expected only single user of the gather node.");
// The slice size is derived from the element count reported by
// getFullVectorNumberOfElements for VL.size() scalars, not from VL.size()
// directly.
20737 unsigned PWSz =
20738 getFullVectorNumberOfElements(*TTI, VL.front()->getType(), VL.size());
// Give up when the user node is itself a gather reached through the UINT_MAX
// edge index and this node is the first tree entry, an extractelement node, a
// splat, or has a tree entry with the same values.
20739 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
20740 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
20741 (TE->Idx == 0 ||
20742 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
20743 isSplat(TE->Scalars) ||
20744 (TE->hasState() &&
20745 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
20746 return {};
20747 unsigned SliceSize = getPartNumElems(PWSz, NumParts);
// NOTE(review): listing line 20748 is absent from this extract; `Res`, the
// per-slice result list used below, is presumably declared there.
20749 for (unsigned Part : seq<unsigned>(NumParts)) {
// Stop as soon as a slice would start past the end of VL.
20750 if (Part * SliceSize >= VL.size())
20751 break;
20752 ArrayRef<Value *> SubVL =
20753 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
20754 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
20755 std::optional<TTI::ShuffleKind> SubRes =
20756 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
20757 ForOrder, SliceSize);
// A slice that cannot be shuffled keeps its (now empty) slot in Entries, so
// Entries and Res stay index-aligned.
20758 if (!SubRes)
20759 SubEntries.clear();
20760 Res.push_back(SubRes);
// Whole-node match: a single source entry with the same vector factor as VL
// and the same scalars as TE/VL. SubEntries is empty whenever SubRes is
// std::nullopt, so *SubRes is only read when it holds a value.
20761 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
20762 SubEntries.front()->getVectorFactor() == VL.size() &&
20763 (SubEntries.front()->isSame(TE->Scalars) ||
20764 SubEntries.front()->isSame(VL))) {
// Drop all per-slice results and describe the node as one identity shuffle of
// that single entry.
20765 SmallVector<const TreeEntry *> LocalSubEntries;
20766 LocalSubEntries.swap(SubEntries);
20767 Entries.clear();
20768 Res.clear();
20769 std::iota(Mask.begin(), Mask.end(), 0);
20770 // Clear undef scalars.
20771 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
20772 if (isa<PoisonValue>(VL[I]))
// NOTE(review): listing lines 20773 and 20775 are absent from this extract.
// 20773 is presumably the body of the `if` above (so the emplace_back below is
// not actually governed by it), and 20775 presumably pushes the single result
// into `Res` before it is returned. Confirm against the full source.
20774 Entries.emplace_back(1, LocalSubEntries.front());
20776 return Res;
20777 }
20778 }
// No slice could be shuffled: report failure with empty outputs.
20779 if (all_of(Res,
20780 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
20781 Entries.clear();
20782 return {};
20783 }
20784 return Res;
20785}
20786
// Estimates the cost of building a vector of VL.size() elements of \p ScalarTy
// from the scalars in \p VL. When \p ForPoisonSrc is true, constants are
// skipped entirely; otherwise constant lanes are accounted for by one
// two-source shuffle and only the remaining lanes are charged as inserts.
// Undef values never contribute.
20787InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
20788 Type *ScalarTy) const {
20789 const unsigned VF = VL.size();
20790 auto *VecTy = getWidenedType(ScalarTy, VF);
20791 // Find the cost of inserting/extracting values from the vector.
20792 // Check if the same elements are inserted several times and count them as
20793 // shuffle candidates.
20794 APInt DemandedElements = APInt::getZero(VF);
// NOTE(review): listing lines 20795-20796 are absent from this extract; `Cost`
// and `CostKind`, both used below, are presumably declared there. Listing line
// 20801 (the trailing arguments of the getCastInstrCost call) is absent too.
// Marks lane I as needing an insert and, when the scalar's type differs from
// ScalarTy, adds the cost of a Trunc cast to ScalarTy.
20797 auto EstimateInsertCost = [&](unsigned I, Value *V) {
20798 DemandedElements.setBit(I);
20799 if (V->getType() != ScalarTy)
20800 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
20802 };
// Identity mask 0..VF-1; lanes holding constants are redirected to index
// I + VF below.
20803 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
20804 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
20805 for (auto [I, V] : enumerate(VL)) {
20806 // No need to shuffle duplicates for constants.
20807 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
20808 continue;
20809
20810 if (isConstant(V)) {
20811 ConstantShuffleMask[I] = I + VF;
20812 continue;
20813 }
20814 EstimateInsertCost(I, V);
20815 }
20816 // FIXME: add a cost for constant vector materialization.
20817 bool IsAnyNonUndefConst =
20818 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
20819 // 1. Shuffle input source vector and constant vector.
// NOTE(review): listing line 20821 is absent from this extract; it presumably
// starts the statement that adds a shuffle cost computed with
// ConstantShuffleMask.
20820 if (!ForPoisonSrc && IsAnyNonUndefConst) {
20822 ConstantShuffleMask);
20823 }
20824
20825 // 2. Insert unique non-constants.
20826 if (!DemandedElements.isZero())
20827 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
20828 /*Insert=*/true,
20829 /*Extract=*/false, CostKind,
20830 ForPoisonSrc && !IsAnyNonUndefConst, VL);
20831 return Cost;
20832}
20833
// Returns the instruction used as the anchor for tree entry \p E and caches it
// in EntryToLastInstruction, so repeated queries for the same entry return the
// cached instruction. Despite the name, some paths below pick the first
// instruction of the bundle (FindFirstInst) rather than the last one.
20834Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
20835 auto It = EntryToLastInstruction.find(E);
20836 if (It != EntryToLastInstruction.end())
20837 return *cast<Instruction>(It->second);
20838 Instruction *Res = nullptr;
20839 // Get the basic block this bundle is in. All instructions in the bundle
20840 // should be in this block (except for extractelement-like instructions with
20841 // constant indices or gathered loads or copyables).
// Entries without state fall back to the first instruction among the scalars
// and to that instruction's opcode.
20842 Instruction *Front;
20843 unsigned Opcode;
20844 if (E->hasState()) {
20845 Front = E->getMainOp();
20846 Opcode = E->getOpcode();
20847 } else {
20848 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
20849 Opcode = Front->getOpcode();
20850 }
20851 auto *BB = Front->getParent();
20852 assert(
20853 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
20854 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
20855 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
20856 all_of(E->Scalars,
20857 [=](Value *V) -> bool {
20858 if (Opcode == Instruction::GetElementPtr &&
20859 !isa<GetElementPtrInst>(V))
20860 return true;
20861 auto *I = dyn_cast<Instruction>(V);
20862 return !I || !E->getMatchingMainOpOrAltOp(I) ||
20863 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
20864 })) &&
20865 "Expected gathered loads or GEPs or instructions from same basic "
20866 "block.");
20867
// Picks the latest instruction among the non-copyable instruction scalars.
// Within one block the order comes from comesBefore(); across blocks the
// instruction whose block has the larger dominator-tree DFS-in number wins.
// An instruction in an unreachable block never replaces one in a reachable
// block. As a side effect BB is updated to the block of the result.
20868 auto FindLastInst = [&]() {
20869 Instruction *LastInst = Front;
20870 for (Value *V : E->Scalars) {
20871 auto *I = dyn_cast<Instruction>(V);
20872 if (!I)
20873 continue;
20874 if (E->isCopyableElement(I))
20875 continue;
20876 if (LastInst->getParent() == I->getParent()) {
20877 if (LastInst->comesBefore(I))
20878 LastInst = I;
20879 continue;
20880 }
// NOTE(review): listing lines 20882 and 20885 are absent from this extract, so
// the assert condition below is shown with two operands missing.
20881 assert(((Opcode == Instruction::GetElementPtr &&
20883 E->State == TreeEntry::SplitVectorize ||
20884 (isVectorLikeInstWithConstOps(LastInst) &&
20886 (GatheredLoadsEntriesFirst.has_value() &&
20887 Opcode == Instruction::Load && E->isGather() &&
20888 E->Idx < *GatheredLoadsEntriesFirst)) &&
20889 "Expected vector-like or non-GEP in GEP node insts only.");
20890 if (!DT->isReachableFromEntry(LastInst->getParent())) {
20891 LastInst = I;
20892 continue;
20893 }
20894 if (!DT->isReachableFromEntry(I->getParent()))
20895 continue;
20896 auto *NodeA = DT->getNode(LastInst->getParent());
20897 auto *NodeB = DT->getNode(I->getParent());
20898 assert(NodeA && "Should only process reachable instructions");
20899 assert(NodeB && "Should only process reachable instructions");
20900 assert((NodeA == NodeB) ==
20901 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
20902 "Different nodes should have different DFS numbers");
20903 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
20904 LastInst = I;
20905 }
20906 BB = LastInst->getParent();
20907 return LastInst;
20908 };
20909
// Mirror image of FindLastInst: picks the earliest instruction (smaller DFS-in
// number across blocks). Unlike FindLastInst it does not update BB.
20910 auto FindFirstInst = [&]() {
20911 Instruction *FirstInst = Front;
20912 for (Value *V : E->Scalars) {
20913 auto *I = dyn_cast<Instruction>(V);
20914 if (!I)
20915 continue;
20916 if (E->isCopyableElement(I))
20917 continue;
20918 if (FirstInst->getParent() == I->getParent()) {
20919 if (I->comesBefore(FirstInst))
20920 FirstInst = I;
20921 continue;
20922 }
// NOTE(review): listing lines 20924 and 20926 are absent from this extract, so
// the assert condition below is shown with two operands missing.
20923 assert(((Opcode == Instruction::GetElementPtr &&
20925 (isVectorLikeInstWithConstOps(FirstInst) &&
20927 "Expected vector-like or non-GEP in GEP node insts only.");
20928 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
20929 FirstInst = I;
20930 continue;
20931 }
20932 if (!DT->isReachableFromEntry(I->getParent()))
20933 continue;
20934 auto *NodeA = DT->getNode(FirstInst->getParent());
20935 auto *NodeB = DT->getNode(I->getParent());
20936 assert(NodeA && "Should only process reachable instructions");
20937 assert(NodeB && "Should only process reachable instructions");
20938 assert((NodeA == NodeB) ==
20939 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
20940 "Different nodes should have different DFS numbers");
20941 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
20942 FirstInst = I;
20943 }
20944 return FirstInst;
20945 };
20946
// Split-vectorize nodes: start from the last scalar instruction, then move the
// anchor further down if a tree entry returned by getTreeEntries(Res) has its
// vectorized value (or, failing that, its own anchor, computed recursively)
// later in the same block. Note that the loop variable E shadows the
// parameter inside the loop; the try_emplace below uses the parameter again.
20947 if (E->State == TreeEntry::SplitVectorize) {
20948 Res = FindLastInst();
20949 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
20950 for (auto *E : Entries) {
20951 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
20952 if (!I)
20953 I = &getLastInstructionInBundle(E);
20954 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
20955 Res = I;
20956 }
20957 }
20958 EntryToLastInstruction.try_emplace(E, Res);
20959 return *Res;
20960 }
20961
20962 // Set insertpoint for gathered loads to the very first load.
20963 if (GatheredLoadsEntriesFirst.has_value() &&
20964 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
20965 Opcode == Instruction::Load) {
20966 Res = FindFirstInst();
20967 EntryToLastInstruction.try_emplace(E, Res);
20968 return *Res;
20969 }
20970
20971 // Set the insert point to the beginning of the basic block if the entry
20972 // should not be scheduled.
// Looks up the schedule bundle that belongs to E in the schedule of block BB.
// Returns nullptr for gather entries, when BB has no schedule, or when none of
// the candidate scalars has a bundle whose tree entry is E. The inner `It`
// shadows the outer map iterator of the same name.
20973 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
20974 if (E->isGather())
20975 return nullptr;
20976 // Found previously that the instruction do not need to be scheduled.
20977 const auto *It = BlocksSchedules.find(BB);
20978 if (It == BlocksSchedules.end())
20979 return nullptr;
20980 for (Value *V : E->Scalars) {
20981 auto *I = dyn_cast<Instruction>(V);
20982 if (!I || isa<PHINode>(I) ||
20983 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
20984 continue;
20985 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
20986 if (Bundles.empty())
20987 continue;
20988 const auto *It = find_if(
20989 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
20990 if (It != Bundles.end())
20991 return *It;
20992 }
20993 return nullptr;
20994 };
20995 const ScheduleBundle *Bundle = FindScheduleBundle(E);
// Vectorized entry without a schedule bundle: choose between the last and the
// first scalar instruction. The last one is used for GEP nodes that contain
// non-GEP instructions, and for nodes whose scalars are all poison / root
// insertelements / copyables / used outside their block, provided one of the
// three extra conditions in the nested disjunction holds. Everything else
// uses the first instruction.
20996 if (!E->isGather() && !Bundle) {
20997 if ((Opcode == Instruction::GetElementPtr &&
20998 any_of(E->Scalars,
20999 [](Value *V) {
21000 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
21001 })) ||
21002 (all_of(E->Scalars,
21003 [&](Value *V) {
21004 return isa<PoisonValue>(V) ||
21005 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
21006 E->isCopyableElement(V) ||
21007 (!isVectorLikeInstWithConstOps(V) &&
21008 isUsedOutsideBlock(V));
21009 }) &&
21010 (!E->doesNotNeedToSchedule() ||
21011 any_of(E->Scalars,
21012 [&](Value *V) {
21013 if (!isa<Instruction>(V) ||
21014 (E->hasCopyableElements() && E->isCopyableElement(V)))
21015 return false;
21016 return !areAllOperandsNonInsts(V);
21017 }) ||
21018 none_of(E->Scalars, [&](Value *V) {
21019 if (!isa<Instruction>(V) ||
21020 (E->hasCopyableElements() && E->isCopyableElement(V)))
21021 return false;
21022 return MustGather.contains(V);
21023 }))))
21024 Res = FindLastInst();
21025 else
21026 Res = FindFirstInst();
21027 EntryToLastInstruction.try_emplace(E, Res);
21028 return *Res;
21029 }
21030
21031 // Find the last instruction. The common case should be that BB has been
21032 // scheduled, and the last instruction is VL.back(). So we start with
21033 // VL.back() and iterate over schedule data until we reach the end of the
21034 // bundle. The end of the bundle is marked by null ScheduleData.
// As written, the code takes the instruction of the last element of the
// bundle directly.
21035 if (Bundle) {
21036 assert(!E->isGather() && "Gathered instructions should not be scheduled");
21037 Res = Bundle->getBundle().back()->getInst();
21038 EntryToLastInstruction.try_emplace(E, Res);
21039 return *Res;
21040 }
21041
21042 // LastInst can still be null at this point if there's either not an entry
21043 // for BB in BlocksSchedules or there's no ScheduleData available for
21044 // VL.back(). This can be the case if buildTreeRec aborts for various
21045 // reasons (e.g., the maximum recursion depth is reached, the maximum region
21046 // size is reached, etc.). ScheduleData is initialized in the scheduling
21047 // "dry-run".
21048 //
21049 // If this happens, we can still find the last instruction by brute force. We
21050 // iterate forwards from Front (inclusive) until we either see all
21051 // instructions in the bundle or reach the end of the block. If Front is the
21052 // last instruction in program order, LastInst will be set to Front, and we
21053 // will visit all the remaining instructions in the block.
21054 //
21055 // One of the reasons we exit early from buildTreeRec is to place an upper
21056 // bound on compile-time. Thus, taking an additional compile-time hit here is
21057 // not ideal. However, this should be exceedingly rare since it requires that
21058 // we both exit early from buildTreeRec and that the bundle be out-of-order
21059 // (causing us to iterate all the way to the end of the block).
// Only gather entries reach this point (both non-gather cases returned
// above); Res is still null here, so the fallback always runs.
21060 if (!Res)
21061 Res = FindLastInst();
21062 assert(Res && "Failed to find last instruction in bundle");
21063 EntryToLastInstruction.try_emplace(E, Res);
21064 return *Res;
21065}
21066
21067void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
21068 auto *Front = E->getMainOp();
21069 Instruction *LastInst = &getLastInstructionInBundle(E);
21070 assert(LastInst && "Failed to find last instruction in bundle");
21071 BasicBlock::iterator LastInstIt = LastInst->getIterator();
21072 // If the instruction is PHI, set the insert point after all the PHIs.
21073 bool IsPHI = isa<PHINode>(LastInst);
21074 if (IsPHI) {
21075 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
21076 if (LastInstIt != LastInst->getParent()->end() &&
21077 LastInstIt->getParent()->isLandingPad())
21078 LastInstIt = std::next(LastInstIt);
21079 }
21080 if (IsPHI ||
21081 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
21082 (E->doesNotNeedToSchedule() ||
21083 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
21084 isUsedOutsideBlock(LastInst)))) ||
21085 (GatheredLoadsEntriesFirst.has_value() &&
21086 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
21087 E->getOpcode() == Instruction::Load)) {
21088 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
21089 } else {
21090 // Set the insertion point after the last instruction in the bundle. Set the
21091 // debug location to Front.
21092 Builder.SetInsertPoint(
21093 LastInst->getParent(),
21094 LastInst->getNextNode()->getIterator());
21095 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
21096 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
21097 } else {
21098 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
21099 PoisonValue::get(Builder.getPtrTy()),
21100 MaybeAlign());
21101 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
21102 eraseInstruction(Res);
21103 if (E->State != TreeEntry::SplitVectorize)
21104 LastInstructionToPos.try_emplace(LastInst, Res);
21105 }
21106 }
21107 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
21108}
21109
// Builds a vector of VL.size() elements of \p ScalarTy from the scalars in
// \p VL by emitting insert instructions, and returns the resulting value.
// When \p Root is given, the constant lanes are combined with it through
// \p CreateShuffle. Lanes are filled in three waves: non-poison constants,
// then other values, then "postponed" instructions (see below).
21110Value *BoUpSLP::gather(
21111 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
21112 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
21113 // List of instructions/lanes from current block and/or the blocks which are
21114 // part of the current loop. These instructions will be inserted at the end to
21115 // make it possible to optimize loops and hoist invariant instructions out of
21116 // the loops body with better chances for success.
// NOTE(review): listing line 21117 is absent from this extract;
// `PostponedInsts`, used below, is presumably declared there.
21118 SmallSet<int, 4> PostponedIndices;
21119 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
// Returns true if InstBB is reached from InsertBB by following
// single-predecessor links; Visited stops the walk on a cycle.
21120 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
21121 SmallPtrSet<BasicBlock *, 4> Visited;
21122 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
21123 InsertBB = InsertBB->getSinglePredecessor();
21124 return InsertBB && InsertBB == InstBB;
21125 };
// An instruction lane is postponed if its block is on the single-predecessor
// chain of the insertion block, if it is vectorized, or if it lives in the
// loop L of the insertion block while Root is absent or loop-invariant.
21126 for (int I = 0, E = VL.size(); I < E; ++I) {
21127 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
21128 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
21129 isVectorized(Inst) ||
21130 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
21131 PostponedIndices.insert(I).second)
21132 PostponedInsts.emplace_back(Inst, I);
21133 }
21134
// Inserts V (cast to Ty if needed) into Vec at position Pos and returns the
// new vector. Emitted instructions are recorded in GatherShuffleExtractSeq and
// CSEBlocks, and an external use is registered when V belongs to a live tree
// entry.
21135 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
21136 Type *Ty) {
21137 Value *Scalar = V;
21138 // Drop NUW from trunc to avoid incorrect codegen.
21139 Value *Trunced;
21140 if (match(Scalar, m_NUWTrunc(m_Value(Trunced))))
21141 cast<TruncInst>(Scalar)->setHasNoUnsignedWrap(/*B=*/false);
21142 if (Scalar->getType() != Ty) {
21143 assert(Scalar->getType()->isIntOrIntVectorTy() &&
21144 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
// This local V shadows the lambda parameter within this block only; the
// parameter V is what the code after the block refers to.
21145 Value *V = Scalar;
// NOTE(review): listing line 21147 is absent from this extract; it presumably
// holds the condition part of this if-with-initializer. When the branch is
// taken, the cast's operand is used as the cast source instead, unless that
// operand is a deleted or vectorized instruction.
21146 if (auto *CI = dyn_cast<CastInst>(Scalar);
21148 Value *Op = CI->getOperand(0);
21149 if (auto *IOp = dyn_cast<Instruction>(Op);
21150 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
21151 V = Op;
21152 }
// The cast is signed unless the original scalar is known non-negative.
21153 Scalar = Builder.CreateIntCast(
21154 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
21155 }
21156
// Vector-typed scalars (only expected with SLPReVec) are inserted as
// subvectors; everything else uses insertelement. If the builder did not
// produce an instruction, no bookkeeping is needed.
21157 Instruction *InsElt;
21158 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
21159 assert(SLPReVec && "FixedVectorType is not expected.");
21160 Vec =
21161 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
21162 auto *II = dyn_cast<Instruction>(Vec);
21163 if (!II)
21164 return Vec;
21165 InsElt = II;
21166 } else {
21167 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
21168 InsElt = dyn_cast<InsertElementInst>(Vec);
21169 if (!InsElt)
21170 return Vec;
21171 }
21172 GatherShuffleExtractSeq.insert(InsElt);
21173 CSEBlocks.insert(InsElt->getParent());
21174 // Add to our 'need-to-extract' list.
21175 if (isa<Instruction>(V)) {
// Only tree entries that were neither transformed to gather nodes nor deleted
// count.
21176 ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
21177 const auto *It = find_if(Entries, [&](const TreeEntry *E) {
21178 return !TransformedToGatherNodes.contains(E) &&
21179 !DeletedNodes.contains(E);
21180 });
21181 if (It != Entries.end()) {
21182 // Find which lane we need to extract.
// The recorded user is the cast (if one was emitted and is an instruction) or
// the instruction that directly consumes V.
21183 User *UserOp = nullptr;
21184 if (Scalar != V) {
21185 if (auto *SI = dyn_cast<Instruction>(Scalar))
21186 UserOp = SI;
21187 } else {
21188 if (V->getType()->isVectorTy()) {
21189 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
21190 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
21191 // Find shufflevector, caused by resize.
// InsElt is re-pointed to the operand shuffle that has V as a direct operand.
21192 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
21193 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
21194 if (SV->getOperand(0) == V)
21195 return SV;
21196 if (SV->getOperand(1) == V)
21197 return SV;
21198 }
21199 return nullptr;
21200 };
21201 InsElt = nullptr;
21202 if (Instruction *User = FindOperand(SV->getOperand(0), V))
21203 InsElt = User;
21204 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
21205 InsElt = User;
21206 assert(InsElt &&
21207 "Failed to find shufflevector, caused by resize.");
21208 } else if (SLPReVec && isa<ShuffleVectorInst>(InsElt)) {
21209 // ReVec gather used V directly as a shufflevector operand.
21210 // Register a nullptr-User external use so all remaining
21211 // in-IR uses of V get rewritten via replaceAllUsesWith,
21212 // and track V in ExternalUsesWithNonUsers to match the
21213 // bookkeeping done by buildExternalUses.
21214 unsigned FoundLane = (*It)->findLaneForValue(V);
21215 ExternalUses.emplace_back(V, nullptr, **It, FoundLane);
21216 ExternalUsesWithNonUsers.insert(V);
21217 }
21218 }
21219 UserOp = InsElt;
21220 }
21221 if (UserOp) {
21222 unsigned FoundLane = (*It)->findLaneForValue(V);
21223 ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
21224 }
21225 }
21226 }
21227 return Vec;
21228 };
21229 auto *VecTy = getWidenedType(ScalarTy, VL.size());
21230 Value *Vec = PoisonValue::get(VecTy);
21231 SmallVector<int> NonConsts;
21232 SmallVector<int> Mask(VL.size());
21233 std::iota(Mask.begin(), Mask.end(), 0);
// If Root is a single-source shuffle (second operand poison) of a vector of
// the result type, look through it: use its source as Root and its mask as
// the starting mask. OriginalRoot keeps the shuffle itself.
21234 Value *OriginalRoot = Root;
21235 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
21236 SV && isa<PoisonValue>(SV->getOperand(1)) &&
21237 SV->getOperand(0)->getType() == VecTy) {
21238 Root = SV->getOperand(0);
21239 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
21240 }
21241 // Insert constant values at first.
// Non-constant lanes are queued in NonConsts; poison constants are left out.
// Each inserted constant lane I is mapped to mask index I + E, i.e. to the
// second operand of the CreateShuffle call below.
21242 for (int I = 0, E = VL.size(); I < E; ++I) {
21243 if (PostponedIndices.contains(I))
21244 continue;
21245 if (!isConstant(VL[I])) {
21246 NonConsts.push_back(I);
21247 continue;
21248 }
21249 if (isa<PoisonValue>(VL[I]))
21250 continue;
21251 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
21252 Mask[I] = I + E;
21253 }
// With a root: if no constant was inserted, continue from the original root
// unchanged; otherwise blend root and constants, and erase the original root
// if it ended up unused and is not the vectorized value of any tree entry.
21254 if (Root) {
21255 if (isa<PoisonValue>(Vec)) {
21256 Vec = OriginalRoot;
21257 } else {
21258 Vec = CreateShuffle(Root, Vec, Mask);
21259 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
21260 OI && OI->use_empty() &&
21261 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
21262 return TE->VectorizedValue == OI;
21263 }))
21264 eraseInstruction(OI);
21265 }
21266 }
21267 // Insert non-constant values.
21268 for (int I : NonConsts)
21269 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
21270 // Append instructions, which are/may be part of the loop, in the end to make
21271 // it possible to hoist non-loop-based instructions.
21272 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
21273 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
21274
21275 return Vec;
21276}
21277
21278/// Merges shuffle masks and emits final shuffle instruction, if required. It
21279/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
21280/// when the actual shuffle instruction is generated only if this is actually
21281/// required. Otherwise, the shuffle instruction emission is delayed till the
21282/// end of the process, to reduce the number of emitted instructions and further
21283/// analysis/transformations.
21284/// The class also will look through the previously emitted shuffle instructions
21285/// and properly mark indices in mask as undef.
21286/// For example, given the code
21287/// \code
21288/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
21289/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
21290/// \endcode
21291/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
21292/// look through %s1 and %s2 and emit
21293/// \code
21294/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
21295/// \endcode
21296/// instead.
21297/// If the 2 operands are of different sizes, the smaller one is resized and
21298/// the mask is recalculated accordingly.
21299/// For example, given the code
21300/// \code
21301/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
21302/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
21303/// \endcode
21304/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
21305/// look through %s1 and %s2 and emit
21306/// \code
21307/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
21308/// \endcode
21309/// instead.
21310class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
/// Starts out false.
/// NOTE(review): presumably set once the final shuffle has been produced; the
/// code that reads and writes it is outside this extract.
21311 bool IsFinalized = false;
21312 /// Combined mask for all applied operands and masks. It is built during
21313 /// analysis and actual emission of shuffle vector instructions.
21314 SmallVector<int> CommonMask;
21315 /// List of operands for the shuffle vector instruction. It holds at most 2
21316 /// operands; if a 3rd is going to be added, the first 2 are combined into a
21317 /// shuffle with the \p CommonMask mask, the first operand is set to the
21318 /// resulting shuffle and the second operand is set to the newly added
21319 /// operand. The \p CommonMask is transformed in the proper way after that.
21320 SmallVector<Value *, 2> InVectors;
/// IR builder used by this class to emit casts and, through ShuffleIRBuilder,
/// shuffles.
21321 IRBuilderBase &Builder;
/// The vectorizer this builder works for; its GatherShuffleExtractSeq,
/// CSEBlocks and DL members are used by the methods below.
21322 BoUpSLP &R;
21323
/// Thin adapter that emits shufflevector instructions through an IRBuilder
/// and records every emitted instruction in GatherShuffleExtractSeq and its
/// block in CSEBlocks.
21324 class ShuffleIRBuilder {
21325 IRBuilderBase &Builder;
21326 /// Holds all of the instructions that we gathered.
21327 SetVector<Instruction *> &GatherShuffleExtractSeq;
21328 /// A list of blocks that we are going to CSE.
21329 DenseSet<BasicBlock *> &CSEBlocks;
21330 /// Data layout.
21331 const DataLayout &DL;
21332
21333 public:
21334 ShuffleIRBuilder(IRBuilderBase &Builder,
21335 SetVector<Instruction *> &GatherShuffleExtractSeq,
21336 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
21337 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
21338 CSEBlocks(CSEBlocks), DL(DL) {}
21339 ~ShuffleIRBuilder() = default;
21340 /// Creates shufflevector for the 2 operands with the given mask.
/// If the operand types differ, the operand with the narrower integer element
/// type is first cast to the type of the other one (signed cast unless the
/// value is known to be non-negative).
21341 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
21342 if (V1->getType() != V2->getType()) {
// NOTE(review): listing line 21343 is absent from this extract; it presumably
// opens the assert whose remaining operands follow. The nested type
// comparison below repeats the enclosing condition.
21344 V1->getType()->isIntOrIntVectorTy() &&
21345 "Expected integer vector types only.");
21346 if (V1->getType() != V2->getType()) {
21347 if (cast<VectorType>(V2->getType())
21348 ->getElementType()
21349 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
21350 ->getElementType()
21351 ->getIntegerBitWidth())
21352 V2 = Builder.CreateIntCast(
21353 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
21354 else
21355 V1 = Builder.CreateIntCast(
21356 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
21357 }
21358 }
21359 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
21360 if (auto *I = dyn_cast<Instruction>(Vec)) {
21361 GatherShuffleExtractSeq.insert(I);
21362 CSEBlocks.insert(I->getParent());
21363 }
21364 return Vec;
21365 }
21366 /// Creates permutation of the single vector operand with the given mask, if
21367 /// it is not identity mask.
/// An empty mask, or an identity mask of exactly V1's element count, returns
/// V1 unchanged.
21368 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
21369 if (Mask.empty())
21370 return V1;
21371 unsigned VF = Mask.size();
21372 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
21373 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
21374 return V1;
21375 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
21376 if (auto *I = dyn_cast<Instruction>(Vec)) {
21377 GatherShuffleExtractSeq.insert(I);
21378 CSEBlocks.insert(I->getParent());
21379 }
21380 return Vec;
21381 }
/// Returns \p V unchanged.
21382 Value *createIdentity(Value *V) { return V; }
/// Returns a poison vector of \p VF elements of type \p Ty.
21383 Value *createPoison(Type *Ty, unsigned VF) {
21384 return PoisonValue::get(getWidenedType(Ty, VF));
21385 }
21386 /// Resizes 2 input vector to match the sizes, if the they are not equal
21387 /// yet. The smallest vector is resized to the size of the larger vector.
/// The resize is a single-source shuffle whose mask is the identity for the
/// first MinVF lanes and PoisonMaskElem for the rest.
/// NOTE(review): Op is a reference to V1 or V2, so the trailing if/else
/// stores the value it already holds. Also, when the types differ but the
/// element counts are equal, V1 is shuffled with a full identity mask;
/// presumably callers never pass such operands.
21388 void resizeToMatch(Value *&V1, Value *&V2) {
21389 if (V1->getType() == V2->getType())
21390 return;
21391 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
21392 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
21393 int VF = std::max(V1VF, V2VF);
21394 int MinVF = std::min(V1VF, V2VF);
21395 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
21396 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
21397 0);
21398 Value *&Op = MinVF == V1VF ? V1 : V2;
21399 Op = Builder.CreateShuffleVector(Op, IdentityMask);
21400 if (auto *I = dyn_cast<Instruction>(Op)) {
21401 GatherShuffleExtractSeq.insert(I);
21402 CSEBlocks.insert(I->getParent());
21403 }
21404 if (MinVF == V1VF)
21405 V1 = Op;
21406 else
21407 V2 = Op;
21408 }
21409 };
21410
21411 /// Smart shuffle instruction emission, walks through shuffles trees and
21412 /// tries to find the best matching vector for the actual shuffle
21413 /// instruction.
21414 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
21415 assert(V1 && "Expected at least one vector value.");
21416 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
21417 R.CSEBlocks, *R.DL);
21418 return BaseShuffleAnalysis::createShuffle<Value *>(
21419 V1, V2, Mask, ShuffleBuilder, ScalarTy);
21420 }
21421
21422 /// Cast value \p V to the vector type with the same number of elements, but
21423 /// the base type \p ScalarTy.
21424 Value *castToScalarTyElem(Value *V,
21425 std::optional<bool> IsSigned = std::nullopt) {
21426 auto *VecTy = cast<VectorType>(V->getType());
21427 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
21428 if (VecTy->getElementType() == ScalarTy->getScalarType())
21429 return V;
21430 return Builder.CreateIntCast(
21431 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
21432 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
21433 }
21434
21435 Value *getVectorizedValue(const TreeEntry &E) {
21436 Value *Vec = E.VectorizedValue;
21437 if (!Vec->getType()->isIntOrIntVectorTy())
21438 return Vec;
21439 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
21440 return !isa<PoisonValue>(V) &&
21441 !isKnownNonNegative(
21442 V, SimplifyQuery(*R.DL));
21443 }));
21444 }
21445
21446public:
// NOTE(review): the line carrying this constructor's name and parameter list
// is missing from this listing; only the member-initializer list survives.
// It forwards ScalarTy to the BaseShuffleAnalysis base and stores Builder and
// R in the members of the same names.
21448 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
21449
21450 /// Adjusts extractelements after reusing them.
/// For every non-poison element of \p Mask the matching (reordered) scalar of
/// \p E is treated as an extractelement. Its source vector, or the vectorized
/// value of the first tree entry registered for that vector, is collected as
/// a base, and the extractelement is erased unless one of the checks below
/// keeps it alive.
/// \param Mask in/out; on the multi-part path it is overwritten with the mask
/// that applies to the returned vector.
/// \param ShuffleKinds not referenced in this function body.
/// \param NumParts number of slices the scalars are split into.
/// \param UseVecBaseAsInput set to true only when the multi-part path below
/// is taken.
/// \returns the last seen base cast to the scalar element type when
/// \p NumParts is 1 or only one unique base exists; otherwise the vector
/// assembled part by part (it stays null if every part is skipped).
21451 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
21452 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
21453 unsigned NumParts, bool &UseVecBaseAsInput) {
21454 UseVecBaseAsInput = false;
21455 SmallPtrSet<Value *, 4> UniqueBases;
21456 Value *VecBase = nullptr;
// Work on a private copy of the scalars, reordered if the entry has reorder
// indices.
21457 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
21458 if (!E->ReorderIndices.empty()) {
21459 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
21460 E->ReorderIndices.end());
21461 reorderScalars(VL, ReorderMask);
21462 }
// First pass: collect the unique source vectors and erase extractelements
// that pass all of the checks in the condition below.
21463 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
21464 int Idx = Mask[I];
21465 if (Idx == PoisonMaskElem)
21466 continue;
21467 auto *EI = cast<ExtractElementInst>(VL[I]);
21468 VecBase = EI->getVectorOperand();
// Prefer the already vectorized value when the source vector has tree entries.
21469 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
21470 VecBase = TEs.front()->VectorizedValue;
21471 assert(VecBase && "Expected vectorized value.");
21472 UniqueBases.insert(VecBase);
21473 // If the only one use is vectorized - can delete the extractelement
21474 // itself.
21475 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
21476 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
21477 !R.isVectorized(EI) &&
21478 count_if(E->Scalars, [&](Value *V) { return V == EI; }) !=
21479 count_if(E->UserTreeIndex.UserTE->Scalars,
21480 [&](Value *V) { return V == EI; })) ||
21481 (NumParts != 1 && count(VL, EI) > 1) ||
21482 any_of(EI->users(), [&](User *U) {
21483 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
21484 return UTEs.empty() || UTEs.size() > 1 ||
21485 any_of(UTEs,
21486 [&](const TreeEntry *TE) {
21487 return R.DeletedNodes.contains(TE) ||
21488 R.TransformedToGatherNodes.contains(TE);
21489 }) ||
// NOTE(review): one source line is missing from this listing at this point
// (the opening of the parenthesized term that is closed on the next line).
// Restore it from the upstream file before relying on this condition.
21491 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
21492 (!UTEs.empty() &&
21493 count_if(R.VectorizableTree,
21494 [&](const std::unique_ptr<TreeEntry> &TE) {
21495 return TE->UserTreeIndex.UserTE ==
21496 UTEs.front() &&
21497 is_contained(VL, EI);
21498 }) != 1);
21499 }))
21500 continue;
21501 R.eraseInstruction(EI);
21502 }
// Single part or single source vector: the last seen base is the result.
21503 if (NumParts == 1 || UniqueBases.size() == 1) {
21504 assert(VecBase && "Expected vectorized value.");
21505 return castToScalarTyElem(VecBase);
21506 }
21507 UseVecBaseAsInput = true;
// Helper applied to a mask after a shuffle has been emitted for it: non-poison
// positions are assigned their own index.
21508 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
21509 for (auto [I, Idx] : enumerate(Mask))
21510 if (Idx != PoisonMaskElem)
21511 Idx = I;
21512 };
21513 // Perform multi-register vector shuffle, joining them into a single virtual
21514 // long vector.
21515 // Need to shuffle each part independently and then insert all this parts
21516 // into a long virtual vector register, forming the original vector.
21517 Value *Vec = nullptr;
21518 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
21519 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
21520 for (unsigned Part : seq<unsigned>(NumParts)) {
21521 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
21522 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
21523 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
// Each part draws from at most two source vectors.
21524 constexpr int MaxBases = 2;
21525 SmallVector<Value *, MaxBases> Bases(MaxBases);
21526 auto VLMask = zip(SubVL, SubMask);
// VF is the largest element count among the source vectors used by this part.
21527 const unsigned VF =
21528 accumulate(VLMask, 0U, [&](unsigned S, const auto &D) {
21529 if (std::get<1>(D) == PoisonMaskElem)
21530 return S;
21531 Value *VecOp =
21532 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
21533 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
21534 !TEs.empty())
21535 VecOp = TEs.front()->VectorizedValue;
21536 assert(VecOp && "Expected vectorized value.");
21537 const unsigned Size =
21538 cast<FixedVectorType>(VecOp->getType())->getNumElements();
21539 return std::max(S, Size);
21540 });
// Assign each used source vector to slot 0 or 1 according to which VF-sized
// range its mask index falls in.
21541 for (const auto [V, I] : VLMask) {
21542 if (I == PoisonMaskElem)
21543 continue;
21544 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
21545 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
21546 VecOp = TEs.front()->VectorizedValue;
21547 assert(VecOp && "Expected vectorized value.");
21548 VecOp = castToScalarTyElem(VecOp);
21549 Bases[I / VF] = VecOp;
21550 }
// A part whose first slot is empty contributes nothing and is skipped.
21551 if (!Bases.front())
21552 continue;
21553 Value *SubVec;
21554 if (Bases.back()) {
21555 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
21556 TransformToIdentity(SubMask);
21557 } else {
21558 SubVec = Bases.front();
21559 }
21560 if (!Vec) {
// First contributing part: it becomes the accumulated vector as is.
21561 Vec = SubVec;
21562 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
21563 [&](unsigned P) {
21564 ArrayRef<int> SubMask =
21565 Mask.slice(P * SliceSize,
21566 getNumElems(Mask.size(),
21567 SliceSize, P));
21568 return all_of(SubMask, [](int Idx) {
21569 return Idx == PoisonMaskElem;
21570 });
21571 })) &&
21572 "Expected first part or all previous parts masked.");
21573 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
21574 } else {
// Later parts: offset their indices past the accumulated vector so that they
// address the second shuffle operand, then merge into Vec.
21575 unsigned NewVF =
21576 cast<FixedVectorType>(Vec->getType())->getNumElements();
21577 if (Vec->getType() != SubVec->getType()) {
21578 unsigned SubVecVF =
21579 cast<FixedVectorType>(SubVec->getType())->getNumElements();
21580 NewVF = std::max(NewVF, SubVecVF);
21581 }
21582 // Adjust SubMask.
21583 for (int &Idx : SubMask)
21584 if (Idx != PoisonMaskElem)
21585 Idx += NewVF;
21586 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
21587 Vec = createShuffle(Vec, SubVec, VecMask);
21588 TransformToIdentity(VecMask);
21589 }
21590 }
// Publish the final mask to the caller through the in/out Mask argument.
21591 copy(VecMask, Mask.begin());
21592 return Vec;
21593 }
21594 /// Checks if the specified entry \p E needs to be delayed because of its
21595 /// dependency nodes.
// Returns std::nullopt when every tree entry in Deps already has a
// VectorizedValue; otherwise returns a stand-in value of the widened result
// type so that emission of \p E can be postponed.
21596 std::optional<Value *>
21597 needToDelay(const TreeEntry *E,
// NOTE(review): the line declaring the remaining parameter(s) is missing from
// this listing; the body below reads a range named Deps whose elements are
// used as ArrayRef<const TreeEntry *>.
21599 // No need to delay emission if all deps are ready.
21600 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
21601 return all_of(
21602 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
21603 }))
21604 return std::nullopt;
21605 // Postpone gather emission, will be emitted after the end of the
21606 // process to keep correct order.
// The stand-in is an aligned load of the widened vector type from a poison
// pointer; presumably it is replaced once the postponed gather is emitted
// (the replacement is not visible in this block).
21607 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
21608 return Builder.CreateAlignedLoad(
21609 ResVecTy,
21610 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
21611 MaybeAlign());
21612 }
21613 /// Reset the builder to handle perfect diamond match.
// NOTE(review): the signature line of this member is missing from the listing
// (presumably resetForSameNode(), which processBuildVector calls for perfect
// diamond matches - confirm against upstream).
// Returns the builder to its initial state: not finalized, with no common
// mask and no input vectors.
21615 IsFinalized = false;
21616 CommonMask.clear();
21617 InVectors.clear();
21618 }
21619 /// Adds 2 input vectors (in form of tree entries) and the mask for their
21620 /// shuffling.
21621 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
21622 Value *V1 = getVectorizedValue(E1);
21623 Value *V2 = getVectorizedValue(E2);
21624 add(V1, V2, Mask);
21625 }
21626 /// Adds single input vector (in form of tree entry) and the mask for its
21627 /// shuffling.
21628 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
21629 Value *V1 = getVectorizedValue(E1);
21630 add(V1, Mask);
21631 }
21632 /// Adds 2 input vectors and the mask for their shuffling.
// Registers the pair \p V1 / \p V2 with \p Mask. If inputs were already
// registered, they are first folded into a single vector and the new pair is
// shuffled into a second one, so that at most two inputs are kept.
21633 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
21634 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
// NOTE(review): the opening line(s) of the next assertion are missing from
// this listing; only its message string survives.
21637 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
21638 V1 = castToScalarTyElem(V1);
21639 V2 = castToScalarTyElem(V2);
// First registration: just remember both vectors and the mask.
21640 if (InVectors.empty()) {
21641 InVectors.push_back(V1);
21642 InVectors.push_back(V2);
21643 CommonMask.assign(Mask.begin(), Mask.end());
21644 return;
21645 }
// Fold the existing input(s) into one vector. A lone input is reshuffled only
// when its element count differs from the size of the new mask.
21646 Value *Vec = InVectors.front();
21647 if (InVectors.size() == 2) {
21648 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
21649 transformMaskAfterShuffle(CommonMask, CommonMask);
21650 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
21651 Mask.size()) {
21652 Vec = createShuffle(Vec, nullptr, CommonMask);
21653 transformMaskAfterShuffle(CommonMask, CommonMask);
21654 }
// Shuffle the new pair into a single vector; lanes selected by Mask then refer
// to that vector as the second operand (index offset by VF).
21655 V1 = createShuffle(V1, V2, Mask);
21656 unsigned VF = std::max(getVF(V1), getVF(Vec));
21657 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
21658 if (Mask[Idx] != PoisonMaskElem)
21659 CommonMask[Idx] = Idx + VF;
21660 InVectors.front() = Vec;
21661 if (InVectors.size() == 2)
21662 InVectors.back() = V1;
21663 else
21664 InVectors.push_back(V1);
21665 }
21666 /// Adds another one input vector and the mask for the shuffling.
// Registers one more input vector \p V1 with \p Mask. The unnamed bool
// parameter is not used in this body.
21667 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
// NOTE(review): the opening line of the next assertion is missing from this
// listing; only its message string survives.
21669 "castToScalarTyElem expects V1 to be FixedVectorType");
21670 V1 = castToScalarTyElem(V1);
// First registration: remember the vector and the mask.
21671 if (InVectors.empty()) {
21672 InVectors.push_back(V1);
21673 CommonMask.assign(Mask.begin(), Mask.end());
21674 return;
21675 }
21676 const auto *It = find(InVectors, V1);
21677 if (It == InVectors.end()) {
// V1 is a new input. If two inputs are already held, or V1's type differs from
// the first input, fold what we have and store V1 as the second input.
21678 if (InVectors.size() == 2 ||
21679 InVectors.front()->getType() != V1->getType()) {
21680 Value *V = InVectors.front();
21681 if (InVectors.size() == 2) {
21682 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
21683 transformMaskAfterShuffle(CommonMask, CommonMask);
21684 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
21685 CommonMask.size()) {
21686 V = createShuffle(InVectors.front(), nullptr, CommonMask);
21687 transformMaskAfterShuffle(CommonMask, CommonMask);
21688 }
// Fill only the lanes that are still poison in CommonMask. When the types
// differ, V1 is pre-shuffled with Mask below, so the lane index itself is
// used; otherwise the original mask element is offset by V1's width.
21689 unsigned VF = std::max(CommonMask.size(), Mask.size());
21690 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
21691 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
21692 CommonMask[Idx] = V->getType() != V1->getType()
21693 ? Idx + VF
21694 : Mask[Idx] + getVF(V1);
21695 if (V->getType() != V1->getType())
21696 V1 = createShuffle(V1, nullptr, Mask);
21697 InVectors.front() = V;
21698 if (InVectors.size() == 2)
21699 InVectors.back() = V1;
21700 else
21701 InVectors.push_back(V1);
21702 return;
21703 }
21704 // Check if second vector is required if the used elements are already
21705 // used from the first one.
21706 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
21707 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
21708 InVectors.push_back(V1);
21709 break;
21710 }
21711 }
// Merge Mask into the lanes of CommonMask that are still poison. Elements are
// offset by the widest input unless V1 was found as the first input.
21712 unsigned VF = 0;
21713 for (Value *V : InVectors)
21714 VF = std::max(VF, getVF(V));
21715 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
21716 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
21717 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
21718 }
21719 /// Adds another one input vector and the mask for the shuffling.
// NOTE(review): the signature line of this member is missing from the listing;
// the body reads a vector V1 and an ordering named Order.
// Converts Order into a shuffle mask via inversePermutation and registers V1
// with that mask through add().
21721 SmallVector<int> NewMask;
21722 inversePermutation(Order, NewMask);
21723 add(V1, NewMask);
21724 }
21725 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
21726 Value *Root = nullptr) {
21727 return R.gather(VL, Root, ScalarTy,
21728 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
21729 return createShuffle(V1, V2, Mask);
21730 });
21731 }
21732 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
21733 /// Finalize emission of the shuffles.
21734 /// \param Action the action (if any) to be performed before final applying of
21735 /// the \p ExtMask mask.
// NOTE(review): the line that opens this declaration (return type and name;
// callers in this file invoke it as finalize(...)) is missing from the
// listing.
// Emits the final value in up to four steps: optional Action on the folded
// vector, optional subvector insertion, composition with ExtMask, and the
// final shuffle of the remaining input vector(s) with CommonMask.
21737 ArrayRef<int> ExtMask,
21738 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
21739 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
// NOTE(review): the line(s) spelling the type of the Action parameter are
// missing from the listing here.
21742 Action = {}) {
21743 IsFinalized = true;
// Step 1: fold the inputs into one vector, widen it to VF with poison lanes if
// it is narrower, and hand it to Action together with a shuffle callback.
21744 if (Action) {
21745 Value *Vec = InVectors.front();
21746 if (InVectors.size() == 2) {
21747 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
21748 InVectors.pop_back();
21749 } else {
21750 Vec = createShuffle(Vec, nullptr, CommonMask);
21751 }
21752 transformMaskAfterShuffle(CommonMask, CommonMask);
21753 assert(VF > 0 &&
21754 "Expected vector length for the final value before action.");
21755 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
21756 if (VecVF < VF) {
21757 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
21758 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
21759 Vec = createShuffle(Vec, nullptr, ResizeMask);
21760 }
21761 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
21762 return createShuffle(V1, V2, Mask);
21763 });
21764 InVectors.front() = Vec;
21765 }
// Step 2: fold the inputs again and insert the vectorized values of the
// entries listed in SubVectors at their indices.
21766 if (!SubVectors.empty()) {
21767 Value *Vec = InVectors.front();
21768 if (InVectors.size() == 2) {
21769 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
21770 InVectors.pop_back();
21771 } else {
21772 Vec = createShuffle(Vec, nullptr, CommonMask);
21773 }
21774 transformMaskAfterShuffle(CommonMask, CommonMask);
// Inserts every subvector into Vec and, when CommonMask is non-empty, marks
// the covered lanes as identity in CommonMask.
21775 auto CreateSubVectors = [&](Value *Vec,
21776 SmallVectorImpl<int> &CommonMask) {
21777 for (auto [E, Idx] : SubVectors) {
21778 Value *V = getVectorizedValue(*E);
21779 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
21780 // Use scalar version of the ScalarTy to correctly handle shuffles
21781 // for revectorization. The revectorization mode operates by the
21782 // vectors, but here we need to operate on the scalars, because the
21783 // masks were already transformed for the vector elements and we don't
21784 // need doing this transformation again.
21785 Type *OrigScalarTy = ScalarTy;
21786 ScalarTy = ScalarTy->getScalarType();
21787 Vec = createInsertVector(
21788 Builder, Vec, V, InsertionIndex,
21789 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
21790 _3));
// Restore the member that was temporarily narrowed above.
21791 ScalarTy = OrigScalarTy;
21792 if (!CommonMask.empty()) {
21793 std::iota(std::next(CommonMask.begin(), Idx),
21794 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
21795 Idx);
21796 }
21797 }
21798 return Vec;
21799 };
21800 if (SubVectorsMask.empty()) {
21801 Vec = CreateSubVectors(Vec, CommonMask);
21802 } else {
// With a subvectors mask, the subvectors are inserted into a poison vector
// first and then blended with Vec. Lanes already defined by CommonMask are
// taken from Vec (second shuffle operand, hence the size offset).
21803 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
21804 copy(SubVectorsMask, SVMask.begin());
21805 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
21806 if (I2 != PoisonMaskElem) {
21807 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
21808 I1 = I2 + CommonMask.size();
21809 }
21810 }
21811 Value *InsertVec =
21812 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
21813 Vec = createShuffle(InsertVec, Vec, SVMask);
21814 transformMaskAfterShuffle(CommonMask, SVMask);
21815 }
21816 InVectors.front() = Vec;
21817 }
21818
// Step 3: apply ExtMask. It either becomes the common mask or is composed
// with it by looking up CommonMask at each ExtMask element.
21819 if (!ExtMask.empty()) {
21820 if (CommonMask.empty()) {
21821 CommonMask.assign(ExtMask.begin(), ExtMask.end());
21822 } else {
21823 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
21824 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
21825 if (ExtMask[I] == PoisonMaskElem)
21826 continue;
21827 NewMask[I] = CommonMask[ExtMask[I]];
21828 }
21829 CommonMask.swap(NewMask);
21830 }
21831 }
// Step 4: without a mask the single input is the result; otherwise emit the
// final one- or two-source shuffle.
21832 if (CommonMask.empty()) {
21833 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
21834 return InVectors.front();
21835 }
21836 if (InVectors.size() == 2)
21837 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
21838 return createShuffle(InVectors.front(), nullptr, CommonMask);
21839 }
21840
// NOTE(review): the line declaring the member this body belongs to is missing
// from the listing (presumably the destructor - confirm against upstream).
// Checks that the builder was finalized, or that no common mask was ever
// accumulated.
21842 assert((IsFinalized || CommonMask.empty()) &&
21843 "Shuffle construction must be finalized.");
21844 }
21845};
21846
21847Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
21848 return vectorizeTree(getOperandEntry(E, NodeIdx));
21849}
21850
21851template <typename BVTy, typename ResTy, typename... Args>
21852ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
21853 Args &...Params) {
21854 assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
21855 "Expected gather node.");
21856 unsigned VF = E->getVectorFactor();
21857
21858 bool NeedFreeze = false;
21859 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
21860 // Do not process split vectorize node, marked to be gathers/buildvectors.
21862 E->CombinedEntriesWithIndices.size());
21863 if (E->State == TreeEntry::SplitVectorize &&
21864 TransformedToGatherNodes.contains(E)) {
21865 SubVectors.clear();
21866 } else {
21867 // Clear values, to be replaced by insertvector instructions.
21868 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
21869 for_each(MutableArrayRef(GatheredScalars)
21870 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
21871 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
21872 transform(
21873 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
21874 return std::make_pair(VectorizableTree[P.first].get(), P.second);
21875 });
21876 }
21877 // Build a mask out of the reorder indices and reorder scalars per this
21878 // mask.
21879 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
21880 E->ReorderIndices.end());
21881 if (!ReorderMask.empty())
21882 reorderScalars(GatheredScalars, ReorderMask);
21883 SmallVector<int> SubVectorsMask;
21884 inversePermutation(E->ReorderIndices, SubVectorsMask);
21885 // Transform non-clustered elements in the mask to poison (-1).
21886 // "Clustered" operations will be reordered using this mask later.
21887 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
21888 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
21889 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
21890 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
21891 } else {
21892 SubVectorsMask.clear();
21893 }
21894 SmallVector<Value *> StoredGS(GatheredScalars);
21895 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
21896 unsigned I, unsigned SliceSize,
21897 bool IsNotPoisonous) {
21898 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
21899 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
21900 }))
21901 return false;
21902 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
21903 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
21904 if (UserTE->getNumOperands() != 2)
21905 return false;
21906 if (!IsNotPoisonous) {
21907 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
21908 [=](const std::unique_ptr<TreeEntry> &TE) {
21909 return TE->UserTreeIndex.UserTE == UserTE &&
21910 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
21911 });
21912 if (It == VectorizableTree.end())
21913 return false;
21914 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
21915 if (!(*It)->ReorderIndices.empty()) {
21916 inversePermutation((*It)->ReorderIndices, ReorderMask);
21917 reorderScalars(GS, ReorderMask);
21918 }
21919 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
21920 Value *V0 = std::get<0>(P);
21921 Value *V1 = std::get<1>(P);
21922 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
21923 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
21924 is_contained(E->Scalars, V1));
21925 }))
21926 return false;
21927 }
21928 int Idx;
21929 if ((Mask.size() < InputVF &&
21930 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
21931 Idx == 0) ||
21932 (Mask.size() == InputVF &&
21933 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
21934 std::iota(
21935 std::next(Mask.begin(), I * SliceSize),
21936 std::next(Mask.begin(),
21937 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
21938 0);
21939 } else {
21940 unsigned IVal =
21941 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
21942 std::fill(
21943 std::next(Mask.begin(), I * SliceSize),
21944 std::next(Mask.begin(),
21945 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
21946 IVal);
21947 }
21948 return true;
21949 };
21950 BVTy ShuffleBuilder(ScalarTy, Params...);
21951 ResTy Res = ResTy();
21952 SmallVector<int> Mask;
21953 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
21955 Value *ExtractVecBase = nullptr;
21956 bool UseVecBaseAsInput = false;
21959 Type *OrigScalarTy = GatheredScalars.front()->getType();
21960 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
21961 unsigned NumParts =
21962 ::getNumberOfParts(*TTI, VecTy, ScalarTy, GatheredScalars.size());
21963 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
21964 // Check for gathered extracts.
21965 bool Resized = false;
21966 ExtractShuffles =
21967 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
21968 if (!ExtractShuffles.empty()) {
21969 SmallVector<const TreeEntry *> ExtractEntries;
21970 for (auto [Idx, I] : enumerate(ExtractMask)) {
21971 if (I == PoisonMaskElem)
21972 continue;
21973 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
21974 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
21975 !TEs.empty())
21976 ExtractEntries.append(TEs.begin(), TEs.end());
21977 }
21978 if (std::optional<ResTy> Delayed =
21979 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
21980 // Delay emission of gathers which are not ready yet.
21981 PostponedGathers.insert(E);
21982 // Postpone gather emission, will be emitted after the end of the
21983 // process to keep correct order.
21984 return *Delayed;
21985 }
21986 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
21987 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
21988 ExtractVecBase = VecBase;
21989 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
21990 if (VF == VecBaseTy->getNumElements() &&
21991 GatheredScalars.size() != VF) {
21992 Resized = true;
21993 GatheredScalars.append(VF - GatheredScalars.size(),
21994 PoisonValue::get(OrigScalarTy));
21995 NumParts = ::getNumberOfParts(
21996 *TTI, getWidenedType(OrigScalarTy, VF), OrigScalarTy, VF);
21997 }
21998 }
21999 }
22000 // Gather extracts after we check for full matched gathers only.
22001 if (!ExtractShuffles.empty() || !E->hasState() ||
22002 E->getOpcode() != Instruction::Load ||
22003 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
22004 any_of(E->Scalars, IsaPred<LoadInst>)) &&
22005 any_of(E->Scalars,
22006 [this](Value *V) {
22007 return isa<LoadInst>(V) && isVectorized(V);
22008 })) ||
22009 (E->hasState() && E->isAltShuffle()) ||
22010 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
22011 isSplat(E->Scalars) ||
22012 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
22013 GatherShuffles =
22014 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
22015 }
22016 if (!GatherShuffles.empty()) {
22017 if (std::optional<ResTy> Delayed =
22018 ShuffleBuilder.needToDelay(E, Entries)) {
22019 // Delay emission of gathers which are not ready yet.
22020 PostponedGathers.insert(E);
22021 // Postpone gather emission, will be emitted after the end of the
22022 // process to keep correct order.
22023 return *Delayed;
22024 }
22025 if (GatherShuffles.size() == 1 &&
22026 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
22027 (Entries.front().front()->isSame(E->Scalars) ||
22028 E->isSame(Entries.front().front()->Scalars))) {
22029 // Perfect match in the graph, will reuse the previously vectorized
22030 // node. Cost is 0.
22031 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
22032 << shortBundleName(E->Scalars, E->Idx) << ".\n");
22033 // Restore the mask for previous partially matched values.
22034 Mask.resize(E->Scalars.size());
22035 const TreeEntry *FrontTE = Entries.front().front();
22036 if (FrontTE->ReorderIndices.empty() && E->ReorderIndices.empty() &&
22037 ((FrontTE->ReuseShuffleIndices.empty() &&
22038 E->Scalars.size() == FrontTE->Scalars.size()) ||
22039 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
22040 std::iota(Mask.begin(), Mask.end(), 0);
22041 } else {
22042 for (auto [I, V] : enumerate(E->Scalars)) {
22043 if (isa<PoisonValue>(V)) {
22044 Mask[I] = PoisonMaskElem;
22045 continue;
22046 }
22047 Mask[I] = FrontTE->findLaneForValue(V);
22048 }
22049 }
22050 // Reset the builder(s) to correctly handle perfect diamond matched
22051 // nodes.
22052 ShuffleBuilder.resetForSameNode();
22053 // Full matched entry found, no need to insert subvectors.
22054 if ((E->isSame(FrontTE->Scalars) &&
22055 FrontTE->ReuseShuffleIndices.empty() &&
22056 FrontTE->ReorderIndices.empty() &&
22057 E->getVectorFactor() == FrontTE->getVectorFactor()) ||
22058 (equal(E->Scalars, FrontTE->Scalars) &&
22059 equal(E->ReorderIndices, FrontTE->ReorderIndices) &&
22060 equal(E->ReuseShuffleIndices, FrontTE->ReuseShuffleIndices))) {
22061 Mask.resize(FrontTE->getVectorFactor());
22062 std::iota(Mask.begin(), Mask.end(), 0);
22063 ShuffleBuilder.add(*FrontTE, Mask);
22064 Res = ShuffleBuilder.finalize({}, {}, {});
22065 } else {
22066 ShuffleBuilder.add(*FrontTE, Mask);
22067 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
22068 }
22069 return Res;
22070 }
22071 if (!Resized) {
22072 if (GatheredScalars.size() != VF &&
22073 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
22074 return any_of(TEs, [&](const TreeEntry *TE) {
22075 return TE->getVectorFactor() == VF;
22076 });
22077 }))
22078 GatheredScalars.append(VF - GatheredScalars.size(),
22079 PoisonValue::get(OrigScalarTy));
22080 }
22081 // Remove shuffled elements from list of gathers.
22082 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
22083 if (Mask[I] != PoisonMaskElem)
22084 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
22085 }
22086 }
22087 }
22088 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
22089 SmallVectorImpl<int> &ReuseMask,
22090 bool IsRootPoison) {
22091 // For splats with can emit broadcasts instead of gathers, so try to find
22092 // such sequences.
22093 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
22094 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
22095 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
22096 SmallVector<int> UndefPos;
22097 DenseMap<Value *, unsigned> UniquePositions;
22098 // Gather unique non-const values and all constant values.
22099 // For repeated values, just shuffle them.
22100 int NumNonConsts = 0;
22101 int SinglePos = 0;
22102 for (auto [I, V] : enumerate(Scalars)) {
22103 if (isa<UndefValue>(V)) {
22104 if (!isa<PoisonValue>(V)) {
22105 ReuseMask[I] = I;
22106 UndefPos.push_back(I);
22107 }
22108 continue;
22109 }
22110 if (isConstant(V)) {
22111 ReuseMask[I] = I;
22112 continue;
22113 }
22114 ++NumNonConsts;
22115 SinglePos = I;
22116 Value *OrigV = V;
22117 Scalars[I] = PoisonValue::get(OrigScalarTy);
22118 if (IsSplat) {
22119 Scalars.front() = OrigV;
22120 ReuseMask[I] = 0;
22121 } else {
22122 const auto Res = UniquePositions.try_emplace(OrigV, I);
22123 Scalars[Res.first->second] = OrigV;
22124 ReuseMask[I] = Res.first->second;
22125 }
22126 }
22127 if (NumNonConsts == 1) {
22128 // Restore single insert element.
22129 if (IsSplat) {
22130 ReuseMask.assign(VF, PoisonMaskElem);
22131 std::swap(Scalars.front(), Scalars[SinglePos]);
22132 if (!UndefPos.empty() && UndefPos.front() == 0)
22133 Scalars.front() = UndefValue::get(OrigScalarTy);
22134 }
22135 ReuseMask[SinglePos] = SinglePos;
22136 } else if (!UndefPos.empty() && IsSplat) {
22137 // For undef values, try to replace them with the simple broadcast.
22138 // We can do it if the broadcasted value is guaranteed to be
22139 // non-poisonous, or by freezing the incoming scalar value first.
22140 auto *It = find_if(Scalars, [this, E](Value *V) {
22141 return !isa<UndefValue>(V) &&
22143 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
22144 // Check if the value already used in the same operation in
22145 // one of the nodes already.
22146 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
22147 is_contained(E->UserTreeIndex.UserTE->Scalars,
22148 U.getUser());
22149 })));
22150 });
22151 if (It != Scalars.end()) {
22152 // Replace undefs by the non-poisoned scalars and emit broadcast.
22153 int Pos = std::distance(Scalars.begin(), It);
22154 for (int I : UndefPos) {
22155 // Set the undef position to the non-poisoned scalar.
22156 ReuseMask[I] = Pos;
22157 // Replace the undef by the poison, in the mask it is replaced by
22158 // non-poisoned scalar already.
22159 if (I != Pos)
22160 Scalars[I] = PoisonValue::get(OrigScalarTy);
22161 }
22162 } else {
22163 // Replace undefs by the poisons, emit broadcast and then emit
22164 // freeze.
22165 for (int I : UndefPos) {
22166 ReuseMask[I] = PoisonMaskElem;
22167 if (isa<UndefValue>(Scalars[I]))
22168 Scalars[I] = PoisonValue::get(OrigScalarTy);
22169 }
22170 NeedFreeze = true;
22171 }
22172 }
22173 };
22174 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
22175 bool IsNonPoisoned = true;
22176 bool IsUsedInExpr = true;
22177 Value *Vec1 = nullptr;
22178 if (!ExtractShuffles.empty()) {
22179 // Gather of extractelements can be represented as just a shuffle of
22180 // a single/two vectors the scalars are extracted from.
22181 // Find input vectors.
22182 Value *Vec2 = nullptr;
22183 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
22184 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
22185 ExtractMask[I] = PoisonMaskElem;
22186 }
22187 if (UseVecBaseAsInput) {
22188 Vec1 = ExtractVecBase;
22189 } else {
22190 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
22191 if (ExtractMask[I] == PoisonMaskElem)
22192 continue;
22193 if (isa<UndefValue>(StoredGS[I]))
22194 continue;
22195 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
22196 Value *VecOp = EI->getVectorOperand();
22197 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
22198 !TEs.empty() && TEs.front()->VectorizedValue)
22199 VecOp = TEs.front()->VectorizedValue;
22200 if (!Vec1) {
22201 Vec1 = VecOp;
22202 } else if (Vec1 != VecOp) {
22203 assert((!Vec2 || Vec2 == VecOp) &&
22204 "Expected only 1 or 2 vectors shuffle.");
22205 Vec2 = VecOp;
22206 }
22207 }
22208 }
22209 if (Vec2) {
22210 IsUsedInExpr = false;
22211 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
22212 isGuaranteedNotToBePoison(Vec2, AC);
22213 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
22214 } else if (Vec1) {
22215 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
22216 IsUsedInExpr &= FindReusedSplat(
22217 ExtractMask,
22218 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
22219 ExtractMask.size(), IsNotPoisonedVec);
22220 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
22221 IsNonPoisoned &= IsNotPoisonedVec;
22222 } else {
22223 IsUsedInExpr = false;
22224 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
22225 /*ForExtracts=*/true);
22226 }
22227 }
22228 if (!GatherShuffles.empty()) {
22229 unsigned SliceSize = getPartNumElems(VF, NumParts);
22230 if (Mask.size() == E->Scalars.size())
22231 SliceSize = getPartNumElems(
22232 E->Scalars.size(),
22233 ::getNumberOfParts(*TTI, VecTy, ScalarTy, E->Scalars.size()));
22234 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
22235 for (const auto [I, TEs] : enumerate(Entries)) {
22236 if (TEs.empty()) {
22237 assert(!GatherShuffles[I] &&
22238 "No shuffles with empty entries list expected.");
22239 continue;
22240 }
22241 assert((TEs.size() == 1 || TEs.size() == 2) &&
22242 "Expected shuffle of 1 or 2 entries.");
22243 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
22244 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
22245 VecMask.assign(VecMask.size(), PoisonMaskElem);
22246 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
22247 if (TEs.size() == 1) {
22248 bool IsNotPoisonedVec =
22249 TEs.front()->VectorizedValue
22250 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
22251 : true;
22252 IsUsedInExpr &=
22253 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
22254 SliceSize, IsNotPoisonedVec);
22255 ShuffleBuilder.add(*TEs.front(), VecMask);
22256 IsNonPoisoned &= IsNotPoisonedVec;
22257 } else {
22258 IsUsedInExpr = false;
22259 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
22260 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
22261 IsNonPoisoned &=
22262 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
22263 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
22264 }
22265 }
22266 }
22267 // Try to figure out best way to combine values: build a shuffle and insert
22268 // elements or just build several shuffles.
22269 // Insert non-constant scalars.
22270 SmallVector<Value *> NonConstants(GatheredScalars);
22271 int EMSz = ExtractMask.size();
22272 int MSz = Mask.size();
22273 // Try to build constant vector and shuffle with it only if currently we
22274 // have a single permutation and more than 1 scalar constants.
22275 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
22276 bool IsIdentityShuffle =
22277 ((UseVecBaseAsInput ||
22278 all_of(ExtractShuffles,
22279 [](const std::optional<TTI::ShuffleKind> &SK) {
22280 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
22282 })) &&
22283 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
22284 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
22285 (!GatherShuffles.empty() &&
22286 all_of(GatherShuffles,
22287 [](const std::optional<TTI::ShuffleKind> &SK) {
22288 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
22290 }) &&
22291 none_of(Mask, [&](int I) { return I >= MSz; }) &&
22293 bool EnoughConstsForShuffle =
22294 IsSingleShuffle &&
22295 (none_of(GatheredScalars,
22296 [](Value *V) {
22297 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
22298 }) ||
22299 any_of(GatheredScalars,
22300 [](Value *V) {
22301 return isa<Constant>(V) && !isa<UndefValue>(V);
22302 })) &&
22303 (!IsIdentityShuffle ||
22304 (GatheredScalars.size() == 2 &&
22305 any_of(GatheredScalars,
22306 [](Value *V) { return !isa<UndefValue>(V); })) ||
22307 count_if(GatheredScalars, [](Value *V) {
22308 return isa<Constant>(V) && !isa<PoisonValue>(V);
22309 }) > 1);
22310 // NonConstants array contains just non-constant values, GatheredScalars
22311 // contains only constant to build final vector and then shuffle.
22312 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
22313 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
22314 NonConstants[I] = PoisonValue::get(OrigScalarTy);
22315 else
22316 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
22317 }
22318 // Generate constants for final shuffle and build a mask for them.
22319 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
22320 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
22321 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
22322 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
22323 ShuffleBuilder.add(BV, BVMask);
22324 }
22325 if (all_of(NonConstants, [=](Value *V) {
22326 return isa<PoisonValue>(V) ||
22327 (IsSingleShuffle && ((IsIdentityShuffle &&
22328 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
22329 }))
22330 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
22331 SubVectorsMask);
22332 else
22333 Res = ShuffleBuilder.finalize(
22334 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
22335 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
22336 bool IsSplat = isSplat(NonConstants);
22337 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
22338 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
22339 auto CheckIfSplatIsProfitable = [&]() {
22340 // Estimate the cost of splatting + shuffle and compare with
22341 // insert + shuffle.
22342 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
22343 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
22344 if (isa<ExtractElementInst>(V) || isVectorized(V))
22345 return false;
22346 InstructionCost SplatCost = TTI->getVectorInstrCost(
22347 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
22348 PoisonValue::get(VecTy), V);
22349 SmallVector<int> NewMask(Mask.begin(), Mask.end());
22350 for (auto [Idx, I] : enumerate(BVMask))
22351 if (I != PoisonMaskElem)
22352 NewMask[Idx] = Mask.size();
22353 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
22354 NewMask, CostKind);
22355 InstructionCost BVCost = TTI->getVectorInstrCost(
22356 Instruction::InsertElement, VecTy, CostKind,
22357 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
22358 // Shuffle required?
22359 if (count(BVMask, PoisonMaskElem) <
22360 static_cast<int>(BVMask.size() - 1)) {
22361 SmallVector<int> NewMask(Mask.begin(), Mask.end());
22362 for (auto [Idx, I] : enumerate(BVMask))
22363 if (I != PoisonMaskElem)
22364 NewMask[Idx] = I;
22365 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
22366 VecTy, NewMask, CostKind);
22367 }
22368 return SplatCost <= BVCost;
22369 };
22370 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
22371 for (auto [Idx, I] : enumerate(BVMask))
22372 if (I != PoisonMaskElem)
22373 Mask[Idx] = I;
22374 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
22375 } else {
22376 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
22377 SmallVector<Value *> Values(NonConstants.size(),
22378 PoisonValue::get(ScalarTy));
22379 Values[0] = V;
22380 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
22381 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
22382 transform(BVMask, SplatMask.begin(), [](int I) {
22383 return I == PoisonMaskElem ? PoisonMaskElem : 0;
22384 });
22385 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
22386 BV = CreateShuffle(BV, nullptr, SplatMask);
22387 for (auto [Idx, I] : enumerate(BVMask))
22388 if (I != PoisonMaskElem)
22389 Mask[Idx] = BVMask.size() + Idx;
22390 Vec = CreateShuffle(Vec, BV, Mask);
22391 for (auto [Idx, I] : enumerate(Mask))
22392 if (I != PoisonMaskElem)
22393 Mask[Idx] = Idx;
22394 }
22395 });
22396 } else if (!allConstant(GatheredScalars)) {
22397 // Gather unique scalars and all constants.
22398 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
22399 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
22400 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
22401 ShuffleBuilder.add(BV, ReuseMask);
22402 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
22403 SubVectorsMask);
22404 } else {
22405 // Gather all constants.
22406 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
22407 for (auto [I, V] : enumerate(GatheredScalars)) {
22408 if (!isa<PoisonValue>(V))
22409 Mask[I] = I;
22410 }
22411 Value *BV = ShuffleBuilder.gather(GatheredScalars);
22412 ShuffleBuilder.add(BV, Mask);
22413 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
22414 SubVectorsMask);
22415 }
22416
22417 if (NeedFreeze)
22418 Res = ShuffleBuilder.createFreeze(Res);
22419 return Res;
22420}
22421
22422Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
22423 // Do not do this for split vectorize node, marked to be gathers/buildvectors.
22424 if (E->State != TreeEntry::SplitVectorize ||
22425 !TransformedToGatherNodes.contains(E)) {
22426 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
22427 (void)vectorizeTree(VectorizableTree[EIdx].get());
22428 }
22429 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
22430 Builder, *this);
22431}
22432
22433/// \returns \p I after propagating metadata from \p VL only for instructions in
22434/// \p VL.
22437 for (Value *V : VL)
22438 if (isa<Instruction>(V))
22439 Insts.push_back(V);
22440 return llvm::propagateMetadata(Inst, Insts);
22441}
22442
22444 if (DebugLoc DL = PN.getDebugLoc())
22445 return DL;
22446 return DebugLoc::getUnknown();
22447}
22448
22449Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
22450 IRBuilderBase::InsertPointGuard Guard(Builder);
22451
22452 Value *V = E->Scalars.front();
22453 Type *ScalarTy = getValueType(V);
22454 auto It = MinBWs.find(E);
22455 if (It != MinBWs.end()) {
22456 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
22457 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
22458 if (VecTy)
22459 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
22460 }
22461 if (E->VectorizedValue)
22462 return E->VectorizedValue;
22463 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
22464 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
22465 // Set insert point for non-reduction initial nodes.
22466 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
22467 setInsertPointAfterBundle(E);
22468 Value *Vec = createBuildVector(E, ScalarTy);
22469 E->VectorizedValue = Vec;
22470 return Vec;
22471 }
22472 if (E->State == TreeEntry::SplitVectorize) {
22473 assert(E->CombinedEntriesWithIndices.size() == 2 &&
22474 "Expected exactly 2 combined entries.");
22475 setInsertPointAfterBundle(E);
22476 TreeEntry &OpTE1 =
22477 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
22478 assert(OpTE1.isSame(
22479 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
22480 "Expected same first part of scalars.");
22481 Value *Op1 = vectorizeTree(&OpTE1);
22482 TreeEntry &OpTE2 =
22483 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
22484 assert(
22485 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
22486 "Expected same second part of scalars.");
22487 Value *Op2 = vectorizeTree(&OpTE2);
22488 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
22489 bool IsSigned = false;
22490 auto It = MinBWs.find(OpE);
22491 if (It != MinBWs.end())
22492 IsSigned = It->second.second;
22493 else
22494 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
22495 if (isa<PoisonValue>(V))
22496 return false;
22497 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22498 });
22499 return IsSigned;
22500 };
22501 if (cast<VectorType>(Op1->getType())->getElementType() !=
22502 ScalarTy->getScalarType()) {
22503 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
22504 Op1 = Builder.CreateIntCast(
22505 Op1,
22507 ScalarTy,
22508 cast<FixedVectorType>(Op1->getType())->getNumElements()),
22509 GetOperandSignedness(&OpTE1));
22510 }
22511 if (cast<VectorType>(Op2->getType())->getElementType() !=
22512 ScalarTy->getScalarType()) {
22513 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
22514 Op2 = Builder.CreateIntCast(
22515 Op2,
22517 ScalarTy,
22518 cast<FixedVectorType>(Op2->getType())->getNumElements()),
22519 GetOperandSignedness(&OpTE2));
22520 }
22521 if (E->ReorderIndices.empty()) {
22522 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
22523 std::iota(
22524 Mask.begin(),
22525 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
22526 0);
22527 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22528 if (ScalarTyNumElements != 1) {
22529 assert(SLPReVec && "Only supported by REVEC.");
22530 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
22531 }
22532 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
22533 Vec = createInsertVector(Builder, Vec, Op2,
22534 E->CombinedEntriesWithIndices.back().second *
22535 ScalarTyNumElements);
22536 E->VectorizedValue = Vec;
22537 return Vec;
22538 }
22539 unsigned CommonVF =
22540 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
22541 const unsigned Scale = getNumElements(ScalarTy);
22542 CommonVF *= Scale;
22543 if (getNumElements(Op1->getType()) != CommonVF) {
22544 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
22545 copy(createReplicatedMask(Scale, OpTE1.getVectorFactor() * Scale),
22546 Mask.begin());
22547 Op1 = Builder.CreateShuffleVector(Op1, Mask);
22548 }
22549 if (getNumElements(Op2->getType()) != CommonVF) {
22550 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
22551 copy(createReplicatedMask(Scale, OpTE2.getVectorFactor() * Scale),
22552 Mask.begin());
22553 Op2 = Builder.CreateShuffleVector(Op2, Mask);
22554 }
22555 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
22556 E->VectorizedValue = Vec;
22557 return Vec;
22558 }
22559
22560 bool IsReverseOrder =
22561 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
22562 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
22563 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
22564 if (E->getOpcode() == Instruction::Store &&
22565 E->State == TreeEntry::Vectorize) {
22566 ArrayRef<int> Mask =
22567 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
22568 E->ReorderIndices.size());
22569 ShuffleBuilder.add(V, Mask);
22570 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
22571 E->State == TreeEntry::CompressVectorize) {
22572 ShuffleBuilder.addOrdered(V, {});
22573 } else {
22574 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
22575 }
22577 E->CombinedEntriesWithIndices.size());
22578 transform(
22579 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
22580 return std::make_pair(VectorizableTree[P.first].get(), P.second);
22581 });
22582 assert(
22583 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
22584 "Expected either combined subnodes or reordering");
22585 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
22586 };
22587
22588 assert(!E->isGather() && "Unhandled state");
22589 unsigned ShuffleOrOp =
22590 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
22591 if (!E->isAltShuffle()) {
22592 switch (E->CombinedOp) {
22593 case TreeEntry::ReducedBitcast:
22594 case TreeEntry::ReducedBitcastBSwap:
22595 case TreeEntry::ReducedBitcastLoads:
22596 case TreeEntry::ReducedBitcastBSwapLoads:
22597 case TreeEntry::ReducedCmpBitcast:
22598 ShuffleOrOp = E->CombinedOp;
22599 break;
22600 default:
22601 break;
22602 }
22603 }
22604 Instruction *VL0 = E->getMainOp();
22605 auto GetOperandSignedness = [&](unsigned Idx) {
22606 const TreeEntry *OpE = getOperandEntry(E, Idx);
22607 bool IsSigned = false;
22608 auto It = MinBWs.find(OpE);
22609 if (It != MinBWs.end())
22610 IsSigned = It->second.second;
22611 else
22612 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
22613 if (isa<PoisonValue>(V))
22614 return false;
22615 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22616 });
22617 return IsSigned;
22618 };
22619 switch (ShuffleOrOp) {
22620 case Instruction::PHI: {
22621 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
22622 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
22623 "PHI reordering is free.");
22624 auto *PH = cast<PHINode>(VL0);
22625 Builder.SetInsertPoint(PH->getParent(),
22626 PH->getParent()->getFirstNonPHIIt());
22627 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
22628 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
22629 Value *V = NewPhi;
22630
22631 // Adjust insertion point once all PHI's have been generated.
22632 Builder.SetInsertPoint(PH->getParent(),
22633 PH->getParent()->getFirstInsertionPt());
22634 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
22635
22636 V = FinalShuffle(V, E);
22637
22638 E->VectorizedValue = V;
22639 // If phi node is fully emitted - exit.
22640 if (NewPhi->getNumIncomingValues() != 0)
22641 return NewPhi;
22642
22643 // PHINodes may have multiple entries from the same block. We want to
22644 // visit every block once.
22645 SmallDenseMap<BasicBlock *, unsigned, 4> VisitedBBs;
22646 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
22647 BasicBlock *IBB = PH->getIncomingBlock(I);
22648
22649 // Stop emission if all incoming values are generated.
22650 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
22651 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
22652 return NewPhi;
22653 }
22654
22655 auto Res = VisitedBBs.try_emplace(IBB, I);
22656 if (!Res.second) {
22657 TreeEntry *OpTE = getOperandEntry(E, I);
22658 if (OpTE->isGather() || DeletedNodes.contains(OpTE) ||
22659 TransformedToGatherNodes.contains(OpTE)) {
22660 Value *VecOp = NewPhi->getIncomingValue(Res.first->getSecond());
22661 NewPhi->addIncoming(VecOp, IBB);
22662 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
22663 OpTE->VectorizedValue = VecOp;
22664 continue;
22665 }
22666 }
22667
22668 Builder.SetInsertPoint(IBB->getTerminator());
22669 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
22670 Value *Vec = vectorizeOperand(E, I);
22671 if (VecTy != Vec->getType()) {
22672 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
22673 MinBWs.contains(getOperandEntry(E, I))) &&
22674 "Expected item in MinBWs.");
22675 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
22676 }
22677 NewPhi->addIncoming(Vec, IBB);
22678 }
22679
22680 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
22681 "Invalid number of incoming values");
22682 assert(E->VectorizedValue && "Expected vectorized value.");
22683 return E->VectorizedValue;
22684 }
22685
22686 case Instruction::ExtractElement: {
22687 Value *V = E->getSingleOperand(0);
22688 setInsertPointAfterBundle(E);
22689 V = FinalShuffle(V, E);
22690 E->VectorizedValue = V;
22691 return V;
22692 }
22693 case Instruction::ExtractValue: {
22694 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
22695 Builder.SetInsertPoint(LI);
22696 Value *Ptr = LI->getPointerOperand();
22697 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
22698 Value *NewV = ::propagateMetadata(V, E->Scalars);
22699 NewV = FinalShuffle(NewV, E);
22700 E->VectorizedValue = NewV;
22701 return NewV;
22702 }
22703 case Instruction::InsertElement: {
22704 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
22705 if (const TreeEntry *OpE = getOperandEntry(E, 1);
22706 OpE && !OpE->isGather() && OpE->hasState() &&
22707 !OpE->hasCopyableElements())
22708 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
22709 else
22710 setInsertPointAfterBundle(E);
22711 Value *V = vectorizeOperand(E, 1);
22712 ArrayRef<Value *> Op = E->getOperand(1);
22713 Type *ScalarTy = Op.front()->getType();
22714 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
22715 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
22716 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
22717 assert(Res.first > 0 && "Expected item in MinBWs.");
22718 V = Builder.CreateIntCast(
22719 V,
22721 ScalarTy,
22722 cast<FixedVectorType>(V->getType())->getNumElements()),
22723 Res.second);
22724 }
22725
22726 // Create InsertVector shuffle if necessary
22727 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
22728 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
22729 }));
22730 const unsigned NumElts =
22731 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
22732 const unsigned NumScalars = E->Scalars.size();
22733
22734 unsigned Offset = *getElementIndex(VL0);
22735 assert(Offset < NumElts && "Failed to find vector index offset");
22736
22737 // Create shuffle to resize vector
22738 SmallVector<int> Mask;
22739 if (!E->ReorderIndices.empty()) {
22740 inversePermutation(E->ReorderIndices, Mask);
22741 Mask.append(NumElts - NumScalars, PoisonMaskElem);
22742 } else {
22743 Mask.assign(NumElts, PoisonMaskElem);
22744 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
22745 }
22746 // Create InsertVector shuffle if necessary
22747 bool IsIdentity = true;
22748 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
22749 Mask.swap(PrevMask);
22750 for (unsigned I = 0; I < NumScalars; ++I) {
22751 Value *Scalar = E->Scalars[PrevMask[I]];
22752 unsigned InsertIdx = *getElementIndex(Scalar);
22753 IsIdentity &= InsertIdx - Offset == I;
22754 Mask[InsertIdx - Offset] = I;
22755 }
22756 if (!IsIdentity || NumElts != NumScalars) {
22757 Value *V2 = nullptr;
22758 bool IsVNonPoisonous =
22760 SmallVector<int> InsertMask(Mask);
22761 if (NumElts != NumScalars && Offset == 0) {
22762 // Follow all insert element instructions from the current buildvector
22763 // sequence.
22764 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
22765 do {
22766 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
22767 if (!InsertIdx)
22768 break;
22769 if (InsertMask[*InsertIdx] == PoisonMaskElem)
22770 InsertMask[*InsertIdx] = *InsertIdx;
22771 if (!Ins->hasOneUse())
22772 break;
22775 } while (Ins);
22776 SmallBitVector UseMask =
22777 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
22778 SmallBitVector IsFirstPoison =
22779 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
22780 SmallBitVector IsFirstUndef =
22781 isUndefVector(FirstInsert->getOperand(0), UseMask);
22782 if (!IsFirstPoison.all()) {
22783 unsigned Idx = 0;
22784 for (unsigned I = 0; I < NumElts; I++) {
22785 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
22786 IsFirstUndef.test(I)) {
22787 if (IsVNonPoisonous) {
22788 InsertMask[I] = I < NumScalars ? I : 0;
22789 continue;
22790 }
22791 if (!V2)
22792 V2 = UndefValue::get(V->getType());
22793 if (Idx >= NumScalars)
22794 Idx = NumScalars - 1;
22795 InsertMask[I] = NumScalars + Idx;
22796 ++Idx;
22797 } else if (InsertMask[I] != PoisonMaskElem &&
22798 Mask[I] == PoisonMaskElem) {
22799 InsertMask[I] = PoisonMaskElem;
22800 }
22801 }
22802 } else {
22803 InsertMask = Mask;
22804 }
22805 }
22806 if (!V2)
22807 V2 = PoisonValue::get(V->getType());
22808 V = Builder.CreateShuffleVector(V, V2, InsertMask);
22809 if (auto *I = dyn_cast<Instruction>(V)) {
22810 GatherShuffleExtractSeq.insert(I);
22811 CSEBlocks.insert(I->getParent());
22812 }
22813 }
22814
22815 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
22816 for (unsigned I = 0; I < NumElts; I++) {
22817 if (Mask[I] != PoisonMaskElem)
22818 InsertMask[Offset + I] = I;
22819 }
22820 SmallBitVector UseMask =
22821 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
22822 SmallBitVector IsFirstUndef =
22823 isUndefVector(FirstInsert->getOperand(0), UseMask);
22824 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
22825 NumElts != NumScalars) {
22826 if (IsFirstUndef.all()) {
22827 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
22828 SmallBitVector IsFirstPoison =
22829 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
22830 if (!IsFirstPoison.all()) {
22831 for (unsigned I = 0; I < NumElts; I++) {
22832 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
22833 InsertMask[I] = I + NumElts;
22834 }
22835 }
22836 V = Builder.CreateShuffleVector(
22837 V,
22838 IsFirstPoison.all() ? PoisonValue::get(V->getType())
22839 : FirstInsert->getOperand(0),
22840 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
22841 if (auto *I = dyn_cast<Instruction>(V)) {
22842 GatherShuffleExtractSeq.insert(I);
22843 CSEBlocks.insert(I->getParent());
22844 }
22845 }
22846 } else {
22847 SmallBitVector IsFirstPoison =
22848 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
22849 for (unsigned I = 0; I < NumElts; I++) {
22850 if (InsertMask[I] == PoisonMaskElem)
22851 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
22852 else
22853 InsertMask[I] += NumElts;
22854 }
22855 V = Builder.CreateShuffleVector(
22856 FirstInsert->getOperand(0), V, InsertMask,
22857 cast<Instruction>(E->Scalars.back())->getName());
22858 if (auto *I = dyn_cast<Instruction>(V)) {
22859 GatherShuffleExtractSeq.insert(I);
22860 CSEBlocks.insert(I->getParent());
22861 }
22862 }
22863 }
22864
22865 ++NumVectorInstructions;
22866 E->VectorizedValue = V;
22867 return V;
22868 }
22869 case Instruction::ZExt:
22870 case Instruction::SExt:
22871 case Instruction::FPToUI:
22872 case Instruction::FPToSI:
22873 case Instruction::FPExt:
22874 case Instruction::PtrToInt:
22875 case Instruction::IntToPtr:
22876 case Instruction::SIToFP:
22877 case Instruction::UIToFP:
22878 case Instruction::Trunc:
22879 case Instruction::FPTrunc:
22880 case Instruction::BitCast: {
22881 setInsertPointAfterBundle(E);
22882
22883 Value *InVec = vectorizeOperand(E, 0);
22884
22885 auto *CI = cast<CastInst>(VL0);
22886 Instruction::CastOps VecOpcode = CI->getOpcode();
22887 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
22888 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
22889 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
22890 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
22891 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
22892 // Check if the values are candidates to demote.
22893 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
22894 if (SrcIt != MinBWs.end())
22895 SrcBWSz = SrcIt->second.first;
22896 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
22897 if (BWSz == SrcBWSz) {
22898 VecOpcode = Instruction::BitCast;
22899 } else if (BWSz < SrcBWSz) {
22900 VecOpcode = Instruction::Trunc;
22901 } else if (It != MinBWs.end()) {
22902 assert(BWSz > SrcBWSz && "Invalid cast!");
22903 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
22904 } else if (SrcIt != MinBWs.end()) {
22905 assert(BWSz > SrcBWSz && "Invalid cast!");
22906 VecOpcode =
22907 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
22908 }
22909 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
22910 !SrcIt->second.second) {
22911 VecOpcode = Instruction::UIToFP;
22912 } else if (VecOpcode == Instruction::BitCast && SrcIt != MinBWs.end() &&
22913 ScalarTy->isFPOrFPVectorTy()) {
22914 Type *OrigSrcScalarTy = CI->getSrcTy();
22915 auto *OrigSrcVectorTy =
22916 getWidenedType(OrigSrcScalarTy, E->Scalars.size());
22917 InVec =
22918 Builder.CreateIntCast(InVec, OrigSrcVectorTy, SrcIt->second.second);
22919 }
22920 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
22921 ? InVec
22922 : Builder.CreateCast(VecOpcode, InVec, VecTy);
22923 V = FinalShuffle(V, E);
22924
22925 E->VectorizedValue = V;
22926 ++NumVectorInstructions;
22927 return V;
22928 }
22929 case Instruction::FCmp:
22930 case Instruction::ICmp: {
22931 setInsertPointAfterBundle(E);
22932
22933 Value *L = vectorizeOperand(E, 0);
22934 Value *R = vectorizeOperand(E, 1);
22935 if (L->getType() != R->getType()) {
22936 assert((getOperandEntry(E, 0)->isGather() ||
22937 getOperandEntry(E, 1)->isGather() ||
22938 MinBWs.contains(getOperandEntry(E, 0)) ||
22939 MinBWs.contains(getOperandEntry(E, 1))) &&
22940 "Expected item in MinBWs.");
22941 const unsigned LBW = cast<VectorType>(L->getType())
22942 ->getElementType()
22943 ->getIntegerBitWidth();
22944 const unsigned RBW = cast<VectorType>(R->getType())
22945 ->getElementType()
22946 ->getIntegerBitWidth();
22947 if ((LBW < RBW && (!allConstant(E->getOperand(1)) ||
22948 any_of(
22949 E->getOperand(1),
22950 [&](Value *V) {
22951 auto *CI = dyn_cast<ConstantInt>(V);
22952 return !CI ||
22953 CI->getValue().getActiveBits() > LBW;
22954 }))) ||
22955 (LBW > RBW && allConstant(E->getOperand(0)) &&
22956 all_of(E->getOperand(1), [&](Value *V) {
22957 auto *CI = dyn_cast<ConstantInt>(V);
22958 return CI && CI->getValue().getActiveBits() <= RBW;
22959 }))) {
22960 Type *CastTy = R->getType();
22961 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
22962 } else {
22963 Type *CastTy = L->getType();
22964 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
22965 }
22966 }
22967
22968 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
22969 Value *V = Builder.CreateCmp(P0, L, R);
22970 propagateIRFlags(V, E->Scalars, VL0);
22971 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
22972 ICmp->setSameSign(/*B=*/false);
22973 // Do not cast for cmps.
22974 VecTy = cast<FixedVectorType>(V->getType());
22975 V = FinalShuffle(V, E);
22976
22977 E->VectorizedValue = V;
22978 ++NumVectorInstructions;
22979 return V;
22980 }
22981 case Instruction::Select: {
22982 setInsertPointAfterBundle(E);
22983
22984 Value *Cond = vectorizeOperand(E, 0);
22985 Value *True = vectorizeOperand(E, 1);
22986 Value *False = vectorizeOperand(E, 2);
22987 if (True->getType() != VecTy || False->getType() != VecTy) {
22988 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
22989 getOperandEntry(E, 2)->isGather() ||
22990 MinBWs.contains(getOperandEntry(E, 1)) ||
22991 MinBWs.contains(getOperandEntry(E, 2))) &&
22992 "Expected item in MinBWs.");
22993 if (True->getType() != VecTy)
22994 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
22995 if (False->getType() != VecTy)
22996 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
22997 }
22998
22999 unsigned CondNumElements = getNumElements(Cond->getType());
23000 unsigned TrueNumElements = getNumElements(True->getType());
23001 assert(TrueNumElements >= CondNumElements &&
23002 TrueNumElements % CondNumElements == 0 &&
23003 "Cannot vectorize Instruction::Select");
23004 assert(TrueNumElements == getNumElements(False->getType()) &&
23005 "Cannot vectorize Instruction::Select");
23006 if (CondNumElements != TrueNumElements) {
23007 // When the return type is i1 but the source is fixed vector type, we
23008 // need to duplicate the condition value.
23009 Cond = Builder.CreateShuffleVector(
23010 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
23011 CondNumElements));
23012 }
23013 assert(getNumElements(Cond->getType()) == TrueNumElements &&
23014 "Cannot vectorize Instruction::Select");
23015 Value *V =
23016 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
23017 V = FinalShuffle(V, E);
23018
23019 E->VectorizedValue = V;
23020 ++NumVectorInstructions;
23021 return V;
23022 }
23023 case Instruction::FNeg: {
23024 setInsertPointAfterBundle(E);
23025
23026 Value *Op = vectorizeOperand(E, 0);
23027
23028 Value *V = Builder.CreateUnOp(
23029 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
23030 propagateIRFlags(V, E->Scalars, VL0);
23031 if (auto *I = dyn_cast<Instruction>(V))
23032 V = ::propagateMetadata(I, E->Scalars);
23033
23034 V = FinalShuffle(V, E);
23035
23036 E->VectorizedValue = V;
23037 ++NumVectorInstructions;
23038
23039 return V;
23040 }
23041 case Instruction::Freeze: {
23042 setInsertPointAfterBundle(E);
23043
23044 Value *Op = vectorizeOperand(E, 0);
23045
23046 if (Op->getType() != VecTy) {
23047 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
23048 MinBWs.contains(getOperandEntry(E, 0))) &&
23049 "Expected item in MinBWs.");
23050 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
23051 }
23052 Value *V = Builder.CreateFreeze(Op);
23053 V = FinalShuffle(V, E);
23054
23055 E->VectorizedValue = V;
23056 ++NumVectorInstructions;
23057
23058 return V;
23059 }
23060 case Instruction::Add:
23061 case Instruction::FAdd:
23062 case Instruction::Sub:
23063 case Instruction::FSub:
23064 case Instruction::Mul:
23065 case Instruction::FMul:
23066 case Instruction::UDiv:
23067 case Instruction::SDiv:
23068 case Instruction::FDiv:
23069 case Instruction::URem:
23070 case Instruction::SRem:
23071 case Instruction::FRem:
23072 case Instruction::Shl:
23073 case Instruction::LShr:
23074 case Instruction::AShr:
23075 case Instruction::And:
23076 case Instruction::Or:
23077 case Instruction::Xor: {
23078 setInsertPointAfterBundle(E);
23079
23080 Value *LHS = vectorizeOperand(E, 0);
23081 Value *RHS = vectorizeOperand(E, 1);
23082 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
23083 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
23084 ArrayRef<Value *> Ops = E->getOperand(I);
23085 if (all_of(Ops, [&](Value *Op) {
23086 auto *CI = dyn_cast<ConstantInt>(Op);
23087 return CI && CI->getValue().countr_one() >= It->second.first;
23088 })) {
23089 V = FinalShuffle(I == 0 ? RHS : LHS, E);
23090 E->VectorizedValue = V;
23091 ++NumVectorInstructions;
23092 return V;
23093 }
23094 }
23095 }
23096 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
23097 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
23098 getOperandEntry(E, 1)->isGather() ||
23099 MinBWs.contains(getOperandEntry(E, 0)) ||
23100 MinBWs.contains(getOperandEntry(E, 1))) &&
23101 "Expected item in MinBWs.");
23102 if (LHS->getType() != VecTy)
23103 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
23104 if (RHS->getType() != VecTy)
23105 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
23106 }
23107
23108 Value *V = Builder.CreateBinOp(
23109 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
23110 RHS);
23111 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
23112 if (auto *I = dyn_cast<Instruction>(V)) {
23113 V = ::propagateMetadata(I, E->Scalars);
23114 // Drop nuw flags for abs(sub(commutative), true).
23115 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
23116 any_of(E->Scalars, [E](Value *V) {
23117 return isa<PoisonValue>(V) ||
23118 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
23119 isCommutative(cast<Instruction>(V));
23120 }))
23121 I->setHasNoUnsignedWrap(/*b=*/false);
23122 }
23123
23124 V = FinalShuffle(V, E);
23125
23126 E->VectorizedValue = V;
23127 ++NumVectorInstructions;
23128
23129 return V;
23130 }
23131 case Instruction::Load: {
23132 // Loads are inserted at the head of the tree because we don't want to
23133 // sink them all the way down past store instructions.
23134 setInsertPointAfterBundle(E);
23135
23136 LoadInst *LI = cast<LoadInst>(VL0);
23137 Instruction *NewLI;
23138 FixedVectorType *StridedLoadTy = nullptr;
23139 Value *PO = LI->getPointerOperand();
23140 if (E->State == TreeEntry::Vectorize) {
23141 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
23142 } else if (E->State == TreeEntry::CompressVectorize) {
23143 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
23144 CompressEntryToData.at(E);
23145 Align CommonAlignment = LI->getAlign();
23146 if (IsMasked) {
23147 unsigned VF = getNumElements(LoadVecTy);
23148 SmallVector<Constant *> MaskValues(
23149 VF / getNumElements(LI->getType()),
23150 ConstantInt::getFalse(VecTy->getContext()));
23151 for (int I : CompressMask)
23152 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
23153 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
23154 assert(SLPReVec && "Only supported by REVEC.");
23155 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
23156 }
23157 Constant *MaskValue = ConstantVector::get(MaskValues);
23158 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
23159 MaskValue);
23160 } else {
23161 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
23162 }
23163 NewLI = ::propagateMetadata(NewLI, E->Scalars);
23164 // TODO: include this cost into CommonCost.
23165 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
23166 assert(SLPReVec && "FixedVectorType is not expected.");
23167 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
23168 CompressMask);
23169 }
23170 NewLI =
23171 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
23172 } else if (E->State == TreeEntry::StridedVectorize) {
23173 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
23174 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
23175 PO = IsReverseOrder ? PtrN : Ptr0;
23176 Type *StrideTy = DL->getIndexType(PO->getType());
23177 Value *StrideVal;
23178 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
23179 StridedLoadTy = SPtrInfo.Ty;
23180 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
23181 unsigned StridedLoadEC =
23182 StridedLoadTy->getElementCount().getKnownMinValue();
23183
23184 Value *Stride = SPtrInfo.StrideVal;
23185 if (!Stride) {
23186 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
23187 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
23188 SCEVExpander Expander(*SE, "strided-load-vec");
23189 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
23190 &*Builder.GetInsertPoint());
23191 }
23192 Value *NewStride =
23193 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
23194 StrideVal = Builder.CreateMul(
23195 NewStride, ConstantInt::getSigned(
23196 StrideTy, (IsReverseOrder ? -1 : 1) *
23197 static_cast<int>(
23198 DL->getTypeAllocSize(ScalarTy))));
23199 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
23200 auto *Inst = Builder.CreateIntrinsic(
23201 Intrinsic::experimental_vp_strided_load,
23202 {StridedLoadTy, PO->getType(), StrideTy},
23203 {PO, StrideVal,
23204 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
23205 Builder.getInt32(StridedLoadEC)});
23206 Inst->addParamAttr(
23207 /*ArgNo=*/0,
23208 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
23209 NewLI = Inst;
23210 } else {
23211 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
23212 Value *VecPtr = vectorizeOperand(E, 0);
23213 if (isa<FixedVectorType>(ScalarTy)) {
23214 assert(SLPReVec && "FixedVectorType is not expected.");
23215 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
23216 // to expand VecPtr if ScalarTy is a vector type.
23217 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
23218 unsigned VecTyNumElements = getNumElements(VecTy);
23219 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
23220 "Cannot expand getelementptr.");
23221 unsigned VF = VecTyNumElements / ScalarTyNumElements;
23222 SmallVector<Constant *> Indices(VecTyNumElements);
23223 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
23224 return Builder.getInt64(I % ScalarTyNumElements);
23225 });
23226 VecPtr = Builder.CreateGEP(
23227 VecTy->getElementType(),
23228 Builder.CreateShuffleVector(
23229 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
23230 ConstantVector::get(Indices));
23231 }
23232 // Use the minimum alignment of the gathered loads.
23233 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
23234 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
23235 }
23236 Value *V = E->State == TreeEntry::CompressVectorize
23237 ? NewLI
23238 : ::propagateMetadata(NewLI, E->Scalars);
23239
23240 if (StridedLoadTy != VecTy)
23241 V = Builder.CreateBitOrPointerCast(V, VecTy);
23242 V = FinalShuffle(V, E);
23243 E->VectorizedValue = V;
23244 ++NumVectorInstructions;
23245 return V;
23246 }
23247 case Instruction::Store: {
23248 auto *SI = cast<StoreInst>(VL0);
23249
23250 setInsertPointAfterBundle(E);
23251
23252 Value *VecValue = vectorizeOperand(E, 0);
23253 if (VecValue->getType() != VecTy)
23254 VecValue =
23255 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
23256 VecValue = FinalShuffle(VecValue, E);
23257
23258 Value *Ptr = SI->getPointerOperand();
23259 Instruction *ST;
23260 if (E->State == TreeEntry::Vectorize) {
23261 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
23262 } else {
23263 assert(E->State == TreeEntry::StridedVectorize &&
23264 "Expected either strided or consecutive stores.");
23265 if (!E->ReorderIndices.empty()) {
23266 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
23267 Ptr = SI->getPointerOperand();
23268 }
23269 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
23270 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
23271
23272 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
23273 Value *Stride = SPtrInfo.StrideVal;
23274 assert(Stride && "Missing StridedPointerInfo for tree entry.");
23275 Value *StrideVal =
23276 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
23277 // vp_strided_store::stride is defined in bytes
23278 StrideVal = Builder.CreateMul(
23279 StrideVal,
23281 StrideTy, static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
23282 auto *Inst = Builder.CreateIntrinsic(
23283 Intrinsic::experimental_vp_strided_store,
23284 {VecTy, Ptr->getType(), StrideTy},
23285 {VecValue, Ptr, StrideVal,
23286 Builder.getAllOnesMask(VecTy->getElementCount()),
23287 Builder.getInt32(E->Scalars.size())});
23288 Inst->addParamAttr(
23289 /*ArgNo=*/1,
23290 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
23291 ST = Inst;
23292 }
23293
23294 Value *V = ::propagateMetadata(ST, E->Scalars);
23295
23296 E->VectorizedValue = V;
23297 ++NumVectorInstructions;
23298 return V;
23299 }
23300 case Instruction::GetElementPtr: {
23301 auto *GEP0 = cast<GetElementPtrInst>(VL0);
23302 setInsertPointAfterBundle(E);
23303
23304 Value *Op0 = vectorizeOperand(E, 0);
23305
23306 SmallVector<Value *> OpVecs;
23307 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
23308 Value *OpVec = vectorizeOperand(E, J);
23309 OpVecs.push_back(OpVec);
23310 }
23311
23312 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
23313 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
23315 for (Value *V : E->Scalars) {
23317 GEPs.push_back(V);
23318 }
23319 V = ::propagateMetadata(I, GEPs);
23320 }
23321
23322 V = FinalShuffle(V, E);
23323
23324 E->VectorizedValue = V;
23325 ++NumVectorInstructions;
23326
23327 return V;
23328 }
23329 case Instruction::Call: {
23330 CallInst *CI = cast<CallInst>(VL0);
23331 setInsertPointAfterBundle(E);
23332
23334
23336 CI, ID, VecTy->getNumElements(),
23337 It != MinBWs.end() ? It->second.first : 0, TTI);
23338 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
23339 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
23340 VecCallCosts.first <= VecCallCosts.second;
23341
23342 Value *ScalarArg = nullptr;
23343 SmallVector<Value *> OpVecs;
23344 SmallVector<Type *, 2> TysForDecl;
23345 // Add return type if intrinsic is overloaded on it.
23346 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
23347 TysForDecl.push_back(VecTy);
23348 auto *CEI = cast<CallInst>(VL0);
23349 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
23350 // Some intrinsics have scalar arguments. This argument should not be
23351 // vectorized.
23352 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
23353 ScalarArg = CEI->getArgOperand(I);
23354 // If decided to reduce bitwidth of abs intrinsic, its second argument
23355 // must be set false (do not return poison, if value is signed min).
23356 if (ID == Intrinsic::abs && It != MinBWs.end() &&
23357 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
23358 ScalarArg = Builder.getFalse();
23359 OpVecs.push_back(ScalarArg);
23361 TysForDecl.push_back(ScalarArg->getType());
23362 continue;
23363 }
23364
23365 Value *OpVec = vectorizeOperand(E, I);
23366 ScalarArg = CEI->getArgOperand(I);
23367 if (cast<VectorType>(OpVec->getType())->getElementType() !=
23368 ScalarArg->getType()->getScalarType() &&
23369 It == MinBWs.end()) {
23370 auto *CastTy =
23371 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
23372 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
23373 } else if (It != MinBWs.end()) {
23374 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
23375 }
23376 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
23377 OpVecs.push_back(OpVec);
23378 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
23379 TysForDecl.push_back(OpVec->getType());
23380 }
23381
23382 Function *CF;
23383 if (!UseIntrinsic) {
23384 VFShape Shape =
23386 ElementCount::getFixed(VecTy->getNumElements()),
23387 false /*HasGlobalPred*/);
23388 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
23389 } else {
23390 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
23391 }
23392
23394 CI->getOperandBundlesAsDefs(OpBundles);
23395 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
23396
23397 propagateIRFlags(V, E->Scalars, VL0);
23398 cast<CallInst>(V)->setCallingConv(CF->getCallingConv());
23399 V = FinalShuffle(V, E);
23400
23401 E->VectorizedValue = V;
23402 ++NumVectorInstructions;
23403 return V;
23404 }
23405 case Instruction::ShuffleVector: {
23406 Value *V;
23407 if (SLPReVec && !E->isAltShuffle()) {
23408 setInsertPointAfterBundle(E);
23409 Value *Src = vectorizeOperand(E, 0);
23410 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
23411 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
23412 SmallVector<int> NewMask(ThisMask.size());
23413 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
23414 return SVSrc->getShuffleMask()[Mask];
23415 });
23416 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
23417 SVSrc->getOperand(1), NewMask);
23418 } else {
23419 V = Builder.CreateShuffleVector(Src, ThisMask);
23420 }
23421 propagateIRFlags(V, E->Scalars, VL0);
23422 if (auto *I = dyn_cast<Instruction>(V))
23423 V = ::propagateMetadata(I, E->Scalars);
23424 V = FinalShuffle(V, E);
23425 } else {
23426 assert(E->isAltShuffle() &&
23427 ((Instruction::isBinaryOp(E->getOpcode()) &&
23428 Instruction::isBinaryOp(E->getAltOpcode())) ||
23429 (Instruction::isCast(E->getOpcode()) &&
23430 Instruction::isCast(E->getAltOpcode())) ||
23431 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
23432 "Invalid Shuffle Vector Operand");
23433
23434 Value *LHS = nullptr, *RHS = nullptr;
23435 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
23436 setInsertPointAfterBundle(E);
23437 LHS = vectorizeOperand(E, 0);
23438 RHS = vectorizeOperand(E, 1);
23439 } else {
23440 setInsertPointAfterBundle(E);
23441 LHS = vectorizeOperand(E, 0);
23442 }
23443 if (LHS && RHS &&
23444 ((Instruction::isBinaryOp(E->getOpcode()) &&
23445 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
23446 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
23447 assert((It != MinBWs.end() ||
23448 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
23449 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
23450 MinBWs.contains(getOperandEntry(E, 0)) ||
23451 MinBWs.contains(getOperandEntry(E, 1))) &&
23452 "Expected item in MinBWs.");
23453 Type *CastTy = VecTy;
23454 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
23456 ->getElementType()
23457 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
23458 ->getElementType()
23459 ->getIntegerBitWidth())
23460 CastTy = RHS->getType();
23461 else
23462 CastTy = LHS->getType();
23463 }
23464 if (LHS->getType() != CastTy)
23465 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
23466 if (RHS->getType() != CastTy)
23467 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
23468 }
23469
23470 Value *V0, *V1;
23471 if (Instruction::isBinaryOp(E->getOpcode())) {
23472 V0 = Builder.CreateBinOp(
23473 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
23474 V1 = Builder.CreateBinOp(
23475 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
23476 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
23477 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
23478 auto *AltCI = cast<CmpInst>(E->getAltOp());
23479 CmpInst::Predicate AltPred = AltCI->getPredicate();
23480 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
23481 } else {
23482 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
23483 unsigned SrcBWSz = DL->getTypeSizeInBits(
23484 cast<VectorType>(LHS->getType())->getElementType());
23485 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
23486 if (BWSz <= SrcBWSz) {
23487 if (BWSz < SrcBWSz)
23488 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
23489 assert(LHS->getType() == VecTy &&
23490 "Expected same type as operand.");
23491 if (auto *I = dyn_cast<Instruction>(LHS))
23492 LHS = ::propagateMetadata(I, E->Scalars);
23493 LHS = FinalShuffle(LHS, E);
23494 E->VectorizedValue = LHS;
23495 ++NumVectorInstructions;
23496 return LHS;
23497 }
23498 }
23499 V0 = Builder.CreateCast(
23500 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
23501 V1 = Builder.CreateCast(
23502 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
23503 }
23504 // Add V0 and V1 to later analysis to try to find and remove matching
23505 // instruction, if any.
23506 for (Value *V : {V0, V1}) {
23507 if (auto *I = dyn_cast<Instruction>(V)) {
23508 GatherShuffleExtractSeq.insert(I);
23509 CSEBlocks.insert(I->getParent());
23510 }
23511 }
23512
23513 // Create shuffle to take alternate operations from the vector.
23514 // Also, gather up main and alt scalar ops to propagate IR flags to
23515 // each vector operation.
23516 ValueList OpScalars, AltScalars;
23517 SmallVector<int> Mask;
23518 E->buildAltOpShuffleMask(
23519 [E, this](Instruction *I) {
23520 assert(E->getMatchingMainOpOrAltOp(I) &&
23521 "Unexpected main/alternate opcode");
23522 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
23523 *TLI);
23524 },
23525 Mask, &OpScalars, &AltScalars);
23526
23527 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
23528 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
23529 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
23530 // Drop nuw flags for abs(sub(commutative), true).
23531 if (auto *I = dyn_cast<Instruction>(Vec);
23532 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
23533 any_of(E->Scalars, [E](Value *V) {
23534 if (isa<PoisonValue>(V))
23535 return false;
23536 if (E->hasCopyableElements() && E->isCopyableElement(V))
23537 return false;
23538 auto *IV = cast<Instruction>(V);
23539 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
23540 }))
23541 I->setHasNoUnsignedWrap(/*b=*/false);
23542 };
23543 DropNuwFlag(V0, E->getOpcode());
23544 DropNuwFlag(V1, E->getAltOpcode());
23545
23546 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
23547 assert(SLPReVec && "FixedVectorType is not expected.");
23548 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
23549 }
23550 V = Builder.CreateShuffleVector(V0, V1, Mask);
23551 if (auto *I = dyn_cast<Instruction>(V)) {
23552 V = ::propagateMetadata(I, E->Scalars);
23553 GatherShuffleExtractSeq.insert(I);
23554 CSEBlocks.insert(I->getParent());
23555 }
23556 }
23557
23558 E->VectorizedValue = V;
23559 ++NumVectorInstructions;
23560
23561 return V;
23562 }
23563 case TreeEntry::ReducedBitcast:
23564 case TreeEntry::ReducedBitcastBSwap: {
23565 assert(UserIgnoreList && "Expected reduction operations only.");
23566 setInsertPointAfterBundle(E);
23567 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
23568 ZExt->VectorizedValue = PoisonValue::get(getWidenedType(
23569 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
23570 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
23571 Const->VectorizedValue = PoisonValue::get(getWidenedType(
23572 Const->Scalars.front()->getType(), Const->getVectorFactor()));
23573 Value *Op = vectorizeOperand(ZExt, 0);
23574 auto *SrcType = IntegerType::get(
23575 Op->getContext(),
23576 DL->getTypeSizeInBits(cast<CastInst>(ZExt->getMainOp())->getSrcTy()) *
23577 E->getVectorFactor());
23578 auto *OrigScalarTy = ScalarTy;
23579 // Set the scalar type properly to avoid casting to the extending type.
23580 ScalarTy = cast<CastInst>(ZExt->getMainOp())->getSrcTy();
23581 Op = FinalShuffle(Op, E);
23582 auto *V = Builder.CreateBitCast(Op, SrcType);
23583 ++NumVectorInstructions;
23584 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
23585 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
23586 ++NumVectorInstructions;
23587 }
23588 if (SrcType != OrigScalarTy) {
23589 V = Builder.CreateIntCast(V, OrigScalarTy, /*isSigned=*/false);
23590 ++NumVectorInstructions;
23591 }
23592 E->VectorizedValue = V;
23593 return V;
23594 }
23595 case TreeEntry::ReducedBitcastLoads:
23596 case TreeEntry::ReducedBitcastBSwapLoads: {
23597 assert(UserIgnoreList && "Expected reduction operations only.");
23598 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
23599 TreeEntry *Load = getOperandEntry(ZExt, /*Idx=*/0);
23600 setInsertPointAfterBundle(Load);
23601 ZExt->VectorizedValue = PoisonValue::get(getWidenedType(
23602 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
23603 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
23604 Const->VectorizedValue = PoisonValue::get(getWidenedType(
23605 Const->Scalars.front()->getType(), Const->getVectorFactor()));
23606 Load->VectorizedValue = PoisonValue::get(getWidenedType(
23607 Load->getMainOp()->getType(), Load->getVectorFactor()));
23608 LoadInst *LI = cast<LoadInst>(Load->getMainOp());
23609 Value *PO = LI->getPointerOperand();
23610 auto *SrcTy = IntegerType::get(
23611 ScalarTy->getContext(),
23612 DL->getTypeSizeInBits(cast<CastInst>(ZExt->getMainOp())->getSrcTy()) *
23613 E->getVectorFactor());
23614 auto *OrigScalarTy = ScalarTy;
23615 ScalarTy = ZExt->getMainOp()->getType();
23616 Value *V = Builder.CreateAlignedLoad(SrcTy, PO, LI->getAlign());
23617 ++NumVectorInstructions;
23618 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
23619 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
23620 ++NumVectorInstructions;
23621 }
23622 if (SrcTy != OrigScalarTy) {
23623 V = Builder.CreateIntCast(V, OrigScalarTy, /*isSigned=*/false);
23624 ++NumVectorInstructions;
23625 }
23626 E->VectorizedValue = V;
23627 return V;
23628 }
23629 case TreeEntry::ReducedCmpBitcast: {
23630 assert(UserIgnoreList && "Expected reduction operations only.");
23631 setInsertPointAfterBundle(E);
23632 TreeEntry *Op1TE = getOperandEntry(E, /*Idx=*/1);
23633 TreeEntry *Op2TE = getOperandEntry(E, /*Idx=*/2);
23634 Op1TE->VectorizedValue =
23635 PoisonValue::get(getWidenedType(ScalarTy, Op1TE->getVectorFactor()));
23636 Op2TE->VectorizedValue =
23637 PoisonValue::get(getWidenedType(ScalarTy, Op2TE->getVectorFactor()));
23638 Value *Cmp = vectorizeOperand(E, /*NodeIdx=*/0);
23639 // Set the scalar type properly to avoid casting to the extending type.
23640 auto *DstTy =
23641 IntegerType::getIntNTy(ScalarTy->getContext(), E->getVectorFactor());
23642 auto *V = Builder.CreateBitCast(Cmp, DstTy);
23643 ++NumVectorInstructions;
23644 if (DstTy != ScalarTy) {
23645 V = Builder.CreateIntCast(V, ScalarTy, /*isSigned=*/false);
23646 ++NumVectorInstructions;
23647 }
23648 E->VectorizedValue = V;
23649 return V;
23650 }
23651 default:
23652 llvm_unreachable("unknown inst");
23653 }
23654 return nullptr;
23655}
23656
23658 ExtraValueToDebugLocsMap ExternallyUsedValues;
23659 return vectorizeTree(ExternallyUsedValues);
23660}
23661
23663 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
23664 Instruction *ReductionRoot,
23665 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
23666 VectorValuesAndScales) {
23667 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
23668 // need to rebuild it.
23669 EntryToLastInstruction.clear();
23670 // All blocks must be scheduled before any instructions are inserted.
23671 for (auto &BSIter : BlocksSchedules)
23672 scheduleBlock(*this, BSIter.second.get());
23673 // Cache last instructions for the nodes to avoid side effects, which may
23674 // appear during vectorization, like extra uses, etc.
23675 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
23676 // Need to generate insertion point for loads nodes of the bitcast/bswap
23677 // ops.
23678 if (TE->isGather() || DeletedNodes.contains(TE.get()) ||
23679 (TE->State == TreeEntry::CombinedVectorize &&
23680 (TE->CombinedOp == TreeEntry::ReducedBitcast ||
23681 TE->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
23682 ((TE->CombinedOp == TreeEntry::ReducedBitcastLoads ||
23683 TE->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
23684 TE->CombinedOp == TreeEntry::ReducedCmpBitcast) &&
23685 (!TE->hasState() || TE->getOpcode() != Instruction::Load)))))
23686 continue;
23687 (void)getLastInstructionInBundle(TE.get());
23688 }
23689
23690 if (ReductionRoot)
23691 Builder.SetInsertPoint(ReductionRoot->getParent(),
23692 ReductionRoot->getIterator());
23693 else
23694 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
23695
23696 // Vectorize gather operands of the nodes with the external uses only.
23698 // Multiple gather TEs may share the same UserTE - cache the per-UserTE
23699 // all_of-isUsedOutsideBlock result to avoid re-walking each scalar's
23700 // user list.
23701 SmallDenseMap<const TreeEntry *, bool> UserTEScalarsUsedOutsideBlockCache;
23702 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
23703 if (DeletedNodes.contains(TE.get()))
23704 continue;
23705 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
23706 TE->UserTreeIndex.UserTE->hasState() &&
23707 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
23708 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
23709 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
23710 !TE->UserTreeIndex.UserTE->hasCopyableElements()) {
23711 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
23712 auto [It, Inserted] =
23713 UserTEScalarsUsedOutsideBlockCache.try_emplace(UserTE);
23714 if (Inserted)
23715 It->second = all_of(UserTE->Scalars,
23716 [](Value *V) { return isUsedOutsideBlock(V); });
23717 if (!It->second)
23718 continue;
23719 Instruction &LastInst = getLastInstructionInBundle(UserTE);
23720 GatherEntries.emplace_back(TE.get(), &LastInst);
23721 }
23722 }
23723 for (auto &Entry : GatherEntries) {
23724 IRBuilderBase::InsertPointGuard Guard(Builder);
23725 Builder.SetInsertPoint(Entry.second);
23726 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
23727 (void)vectorizeTree(Entry.first);
23728 }
23729 // Emit gathered loads first to emit better code for the users of those
23730 // gathered loads.
23731 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
23732 if (DeletedNodes.contains(TE.get()))
23733 continue;
23734 if (GatheredLoadsEntriesFirst.has_value() &&
23735 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
23736 (!TE->isGather() || TE->UserTreeIndex)) {
23737 assert((TE->UserTreeIndex ||
23738 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
23739 "Expected gathered load node.");
23740 (void)vectorizeTree(TE.get());
23741 }
23742 }
23743 (void)vectorizeTree(VectorizableTree[0].get());
23744 // Run through the list of postponed gathers and emit them, replacing the temp
23745 // emitted allocas with actual vector instructions.
23746 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
23748 for (const TreeEntry *E : PostponedNodes) {
23749 auto *TE = const_cast<TreeEntry *>(E);
23750 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
23751 TE->VectorizedValue = nullptr;
23752 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
23753 // If user is a PHI node, its vector code has to be inserted right before
23754 // block terminator. Since the node was delayed, there were some unresolved
23755 // dependencies at the moment when stub instruction was emitted. In a case
23756 // when any of these dependencies turn out an operand of another PHI, coming
23757 // from this same block, position of a stub instruction will become invalid.
23758 // This is because source vector that was supposed to feed this gather node
23759 // was inserted at the end of the block [after stub instruction]. So we need
23760 // to adjust insertion point again to the end of block.
23761 if (isa<PHINode>(UserI) ||
23762 (TE->UserTreeIndex.UserTE->hasState() &&
23763 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
23764 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
23765 // Insert before all users.
23766 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
23767 for (User *U : PrevVec->users()) {
23768 if (U == UserI)
23769 continue;
23770 auto *UI = dyn_cast<Instruction>(U);
23771 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
23772 continue;
23773 if (UI->comesBefore(InsertPt))
23774 InsertPt = UI;
23775 }
23776 Builder.SetInsertPoint(InsertPt);
23777 } else {
23778 Builder.SetInsertPoint(PrevVec);
23779 }
23780 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
23781 Value *Vec = vectorizeTree(TE);
23782 if (auto *VecI = dyn_cast<Instruction>(Vec);
23783 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
23784 Builder.GetInsertPoint()->comesBefore(VecI))
23785 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
23786 Builder.GetInsertPoint());
23787 if (Vec->getType() != PrevVec->getType()) {
23788 assert(Vec->getType()->isIntOrIntVectorTy() &&
23789 PrevVec->getType()->isIntOrIntVectorTy() &&
23790 "Expected integer vector types only.");
23791 std::optional<bool> IsSigned;
23792 for (Value *V : TE->Scalars) {
23793 if (isVectorized(V)) {
23794 for (const TreeEntry *MNTE : getTreeEntries(V)) {
23795 auto It = MinBWs.find(MNTE);
23796 if (It != MinBWs.end()) {
23797 IsSigned = IsSigned.value_or(false) || It->second.second;
23798 if (*IsSigned)
23799 break;
23800 }
23801 }
23802 if (IsSigned.value_or(false))
23803 break;
23804 // Scan through gather nodes.
23805 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
23806 auto It = MinBWs.find(BVE);
23807 if (It != MinBWs.end()) {
23808 IsSigned = IsSigned.value_or(false) || It->second.second;
23809 if (*IsSigned)
23810 break;
23811 }
23812 }
23813 if (IsSigned.value_or(false))
23814 break;
23815 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
23816 IsSigned =
23817 IsSigned.value_or(false) ||
23818 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
23819 continue;
23820 }
23821 if (IsSigned.value_or(false))
23822 break;
23823 }
23824 }
23825 if (IsSigned.value_or(false)) {
23826 // Final attempt - check user node.
23827 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
23828 if (It != MinBWs.end())
23829 IsSigned = It->second.second;
23830 }
23831 assert(IsSigned &&
23832 "Expected user node or perfect diamond match in MinBWs.");
23833 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
23834 }
23835 PrevVec->replaceAllUsesWith(Vec);
23836 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
23837 // Replace the stub vector node, if it was used before for one of the
23838 // buildvector nodes already.
23839 auto It = PostponedValues.find(PrevVec);
23840 if (It != PostponedValues.end()) {
23841 for (TreeEntry *VTE : It->getSecond())
23842 VTE->VectorizedValue = Vec;
23843 }
23844 eraseInstruction(PrevVec);
23845 }
23846
23847 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
23848 << " values .\n");
23849
23851 // Maps vector instruction to original insertelement instruction
23852 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
23853 // Maps extract Scalar to the corresponding extractelement instruction in the
23854 // basic block. Only one extractelement per block should be emitted.
23856 ScalarToEEs;
23857 SmallDenseSet<Value *, 4> UsedInserts;
23859 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
23861 // Extract all of the elements with the external uses.
23862 for (const auto &ExternalUse : ExternalUses) {
23863 Value *Scalar = ExternalUse.Scalar;
23864 llvm::User *User = ExternalUse.User;
23865
23866 // Skip users that we already RAUW. This happens when one instruction
23867 // has multiple uses of the same value.
23868 if (User && !is_contained(Scalar->users(), User))
23869 continue;
23870 const TreeEntry *E = &ExternalUse.E;
23871 assert(E && "Invalid scalar");
23872 assert(!E->isGather() && "Extracting from a gather list");
23873 // Non-instruction pointers are not deleted, just skip them.
23874 if (E->getOpcode() == Instruction::GetElementPtr &&
23875 !isa<GetElementPtrInst>(Scalar))
23876 continue;
23877
23878 Value *Vec = E->VectorizedValue;
23879 assert(Vec && "Can't find vectorizable value");
23880
23881 Value *Lane = Builder.getInt32(ExternalUse.Lane);
// Produces the value that stands in for Scalar at a use outside the
// vectorized tree, given Vec, the vectorized value that holds Scalar's lane.
//  - If Scalar's type differs from Vec's type, the result is an extract of the
//    lane (or Scalar itself / a clone of it when it is listed in
//    ExternalUsesAsOriginalScalar), cast to Scalar's type when the extracted
//    type still differs. Results are cached in ScalarToEEs per
//    (Scalar, basic block) and reused on later requests for the same block.
//  - Otherwise Scalar must be an insertelement of fixed vector type; the
//    Vec -> insertelement pair is recorded in VectorToInsertElement and Vec
//    is returned unchanged.
// New instructions are created at the Builder's current insert point, so the
// caller positions the Builder before invoking this lambda.
23882 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
23883 if (Scalar->getType() != Vec->getType()) {
// Ex is the raw extracted value; ExV is what gets returned (Ex, or the cast
// of Ex to Scalar's type).
23884 Value *Ex = nullptr;
23885 Value *ExV = nullptr;
23886 auto *Inst = dyn_cast<Instruction>(Scalar);
// Set when Scalar is an instruction registered in
// ExternalUsesAsOriginalScalar; such a scalar is kept (or cloned) below
// instead of being extracted from Vec.
23887 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
23888 auto It = ScalarToEEs.find(Scalar);
23889 if (It != ScalarToEEs.end()) {
23890 // No need to emit many extracts, just move the only one in the
23891 // current block. The cache is keyed by block: Scalar's own block when
// it is kept as is, otherwise the block the Builder inserts into.
23892 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
23893 : Builder.GetInsertBlock());
23894 if (EEIt != It->second.end()) {
23895 Value *PrevV = EEIt->second.first;
// When the insert point comes before the cached extract, move the extract
// up to the insert point; its cast (if it is an instruction) is moved right
// after it.
23896 if (auto *I = dyn_cast<Instruction>(PrevV);
23897 I && !ReplaceInst &&
23898 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
23899 Builder.GetInsertPoint()->comesBefore(I)) {
23900 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
23901 Builder.GetInsertPoint());
23902 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
23903 CI->moveAfter(I);
23904 }
23905 Ex = PrevV;
23906 ExV = EEIt->second.second ? EEIt->second.second : Ex;
23907 }
23908 }
// Nothing cached for this block: materialize the value now.
23909 if (!Ex) {
23910 // "Reuse" the existing extract to improve final codegen.
23911 if (ReplaceInst) {
23912 // Leave the instruction as is, if it is cheaper than extracts and all
23913 // operands are scalar. An extractelement is reused directly (and added
// to IgnoredExtracts); any other instruction is cloned right before the
// original, taking over its name.
23914 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
23915 IgnoredExtracts.insert(EE);
23916 Ex = EE;
23917 } else {
23918 auto *CloneInst = Inst->clone();
23919 CloneInst->insertBefore(Inst->getIterator());
23920 if (Inst->hasName())
23921 CloneInst->takeName(Inst);
23922 Ex = CloneInst;
23923 }
23924 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
23925 ES && isa<Instruction>(Vec)) {
// Scalar is itself an extractelement. Extract from its source vector (or
// from that vector's vectorized value when it has a tree entry) using the
// original index, unless that source is an instruction in Vec's block that
// is neither Vec nor placed before Vec; in that case extract the lane from
// Vec instead.
23926 Value *V = ES->getVectorOperand();
23927 auto *IVec = cast<Instruction>(Vec);
23928 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
23929 V = ETEs.front()->VectorizedValue;
23930 if (auto *IV = dyn_cast<Instruction>(V);
23931 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
23932 IV->comesBefore(IVec))
23933 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
23934 else
23935 Ex = Builder.CreateExtractElement(Vec, Lane);
23936 } else if (auto *VecTy =
23937 dyn_cast<FixedVectorType>(Scalar->getType())) {
23938 assert(SLPReVec && "FixedVectorType is not expected.");
23939 unsigned VecTyNumElements = VecTy->getNumElements();
23940 // When REVEC is enabled, we need to extract a vector.
23941 // Note: The element size of Scalar may be different from the
23942 // element size of Vec.
23943 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
23944 ExternalUse.Lane * VecTyNumElements);
23945 } else {
23946 Ex = Builder.CreateExtractElement(Vec, Lane);
23947 }
23948 // Cast the extract to Scalar's type when the two differ; the cast is
23949 // signed unless Scalar is known to be non-negative.
23950 ExV = Ex;
23951 if (Scalar->getType() != Ex->getType())
23952 ExV = Builder.CreateIntCast(
23953 Ex, Scalar->getType(),
23954 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
// Cache the (extract, cast) pair under the extract's block, or under the
// function entry block when the extract is not an instruction.
23955 auto *I = dyn_cast<Instruction>(Ex);
23956 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
23957 : &F->getEntryBlock(),
23958 std::make_pair(Ex, ExV));
23959 }
23960 // The then branch of the previous if may produce constants, since 0
23961 // operand might be a constant.
// Only non-PHI instructions without non-def-use dependencies are recorded.
// NOTE(review): GatherShuffleExtractSeq / CSEBlocks are presumably consumed
// by a later CSE/cleanup step outside this view - confirm.
23962 if (auto *ExI = dyn_cast<Instruction>(Ex);
23963 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
23964 GatherShuffleExtractSeq.insert(ExI);
23965 CSEBlocks.insert(ExI->getParent());
23966 }
23967 return ExV;
23968 }
// Types match: Scalar is expected to be a vector-typed insertelement, and
// Vec is used as is.
23969 assert(isa<FixedVectorType>(Scalar->getType()) &&
23970 isa<InsertElementInst>(Scalar) &&
23971 "In-tree scalar of vector type is not insertelement?");
23972 auto *IE = cast<InsertElementInst>(Scalar);
23973 VectorToInsertElement.try_emplace(Vec, IE);
23974 return Vec;
23975 };
23976 // If User == nullptr, the Scalar remains as scalar in vectorized
23977 // instructions or is used as extra arg. Generate ExtractElement instruction
23978 // and update the record for this scalar in ExternallyUsedValues.
23979 if (!User) {
23980 if (!ScalarsWithNullptrUser.insert(Scalar).second)
23981 continue;
23982 assert(
23983 (ExternallyUsedValues.count(Scalar) ||
23984 ExternalUsesWithNonUsers.count(Scalar) ||
23985 ExternalUsesAsOriginalScalar.contains(Scalar) ||
23986 any_of(
23987 Scalar->users(),
23988 [&, TTI = TTI](llvm::User *U) {
23989 if (ExternalUsesAsOriginalScalar.contains(U))
23990 return true;
23991 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
23992 return !UseEntries.empty() &&
23993 (E->State == TreeEntry::Vectorize ||
23994 E->State == TreeEntry::StridedVectorize ||
23995 E->State == TreeEntry::CompressVectorize) &&
23996 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
23997 return (UseEntry->State == TreeEntry::Vectorize ||
23998 UseEntry->State ==
23999 TreeEntry::StridedVectorize ||
24000 UseEntry->State ==
24001 TreeEntry::CompressVectorize) &&
24002 doesInTreeUserNeedToExtract(
24003 Scalar, getRootEntryInstruction(*UseEntry),
24004 TLI, TTI);
24005 });
24006 })) &&
24007 "Scalar with nullptr User must be registered in "
24008 "ExternallyUsedValues map or remain as scalar in vectorized "
24009 "instructions");
24010 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
24011 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
24012 if (PHI->getParent()->isLandingPad())
24013 Builder.SetInsertPoint(
24014 PHI->getParent(),
24015 std::next(
24016 PHI->getParent()->getLandingPadInst()->getIterator()));
24017 else
24018 Builder.SetInsertPoint(PHI->getParent(),
24019 PHI->getParent()->getFirstNonPHIIt());
24020 } else {
24021 Builder.SetInsertPoint(VecI->getParent(),
24022 std::next(VecI->getIterator()));
24023 }
24024 } else {
24025 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
24026 }
24027 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
24028 // Required to update internally referenced instructions.
24029 if (Scalar != NewInst) {
24030 assert((!isa<ExtractElementInst>(Scalar) ||
24031 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
24032 "Extractelements should not be replaced.");
24033 Scalar->replaceAllUsesWith(NewInst);
24034 }
24035 continue;
24036 }
24037
24038 if (auto *VU = dyn_cast<InsertElementInst>(User);
24039 VU && VU->getOperand(1) == Scalar) {
24040 // Skip if the scalar is another vector op or Vec is not an instruction.
24041 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
24042 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
24043 if (!UsedInserts.insert(VU).second)
24044 continue;
24045 // Need to use original vector, if the root is truncated.
24046 auto BWIt = MinBWs.find(E);
24047 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
24048 auto *ScalarTy = FTy->getElementType();
24049 auto Key = std::make_pair(Vec, ScalarTy);
24050 auto VecIt = VectorCasts.find(Key);
24051 if (VecIt == VectorCasts.end()) {
24052 IRBuilderBase::InsertPointGuard Guard(Builder);
24053 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
24054 if (IVec->getParent()->isLandingPad())
24055 Builder.SetInsertPoint(IVec->getParent(),
24056 std::next(IVec->getParent()
24057 ->getLandingPadInst()
24058 ->getIterator()));
24059 else
24060 Builder.SetInsertPoint(
24061 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
24062 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
24063 Builder.SetInsertPoint(IVec->getNextNode());
24064 }
24065 Vec = Builder.CreateIntCast(
24066 Vec,
24068 ScalarTy,
24069 cast<FixedVectorType>(Vec->getType())->getNumElements()),
24070 BWIt->second.second);
24071 VectorCasts.try_emplace(Key, Vec);
24072 } else {
24073 Vec = VecIt->second;
24074 }
24075 }
24076
24077 std::optional<unsigned> InsertIdx = getElementIndex(VU);
24078 if (InsertIdx) {
24079 auto *It = find_if(
24080 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
24081 // Checks if 2 insertelements are from the same buildvector.
24082 InsertElementInst *VecInsert = Data.InsertElements.front();
24084 VU, VecInsert,
24085 [](InsertElementInst *II) { return II->getOperand(0); });
24086 });
24087 unsigned Idx = *InsertIdx;
24088 if (It == ShuffledInserts.end()) {
24089 (void)ShuffledInserts.emplace_back();
24090 It = std::next(ShuffledInserts.begin(),
24091 ShuffledInserts.size() - 1);
24092 }
24093 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
24094 if (Mask.empty())
24095 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
24096 Mask[Idx] = ExternalUse.Lane;
24097 It->InsertElements.push_back(cast<InsertElementInst>(User));
24098 continue;
24099 }
24100 }
24101 }
24102 }
24103
24104 // Generate extracts for out-of-tree users.
24105 // Find the insertion point for the extractelement lane.
24106 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
24107 if (PHINode *PH = dyn_cast<PHINode>(User)) {
24108 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
24109 if (PH->getIncomingValue(I) == Scalar) {
24110 Instruction *IncomingTerminator =
24111 PH->getIncomingBlock(I)->getTerminator();
24112 if (isa<CatchSwitchInst>(IncomingTerminator)) {
24113 Builder.SetInsertPoint(VecI->getParent(),
24114 std::next(VecI->getIterator()));
24115 } else {
24116 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
24117 }
24118 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
24119 PH->setOperand(I, NewInst);
24120 }
24121 }
24122 } else {
24123 Builder.SetInsertPoint(cast<Instruction>(User));
24124 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
24125 User->replaceUsesOfWith(Scalar, NewInst);
24126 }
24127 } else {
24128 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
24129 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
24130 User->replaceUsesOfWith(Scalar, NewInst);
24131 }
24132
24133 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
24134 }
24135
24136 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
24137 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
24138 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
24139 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
24140 for (int I = 0, E = Mask.size(); I < E; ++I) {
24141 if (Mask[I] < VF)
24142 CombinedMask1[I] = Mask[I];
24143 else
24144 CombinedMask2[I] = Mask[I] - VF;
24145 }
24146 ShuffleInstructionBuilder ShuffleBuilder(
24147 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
24148 ShuffleBuilder.add(V1, CombinedMask1);
24149 if (V2)
24150 ShuffleBuilder.add(V2, CombinedMask2);
24151 return ShuffleBuilder.finalize({}, {}, {});
24152 };
24153
24154 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
24155 bool ForSingleMask) {
24156 unsigned VF = Mask.size();
24157 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
24158 if (VF != VecVF) {
24159 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
24160 Vec = CreateShuffle(Vec, nullptr, Mask);
24161 return std::make_pair(Vec, true);
24162 }
24163 if (!ForSingleMask) {
24164 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
24165 for (unsigned I = 0; I < VF; ++I) {
24166 if (Mask[I] != PoisonMaskElem)
24167 ResizeMask[Mask[I]] = Mask[I];
24168 }
24169 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
24170 }
24171 }
24172
24173 return std::make_pair(Vec, false);
24174 };
24175 // Perform shuffling of the vectorize tree entries for better handling of
24176 // external extracts.
24177 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
24178 // Find the first and the last instruction in the list of insertelements.
24179 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
24180 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
24181 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
24182 Builder.SetInsertPoint(LastInsert);
24183 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
24185 MutableArrayRef(Vector.data(), Vector.size()),
24186 FirstInsert->getOperand(0),
24187 [](Value *Vec) {
24188 return cast<VectorType>(Vec->getType())
24189 ->getElementCount()
24190 .getKnownMinValue();
24191 },
24192 ResizeToVF,
24193 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
24194 ArrayRef<Value *> Vals) {
24195 assert((Vals.size() == 1 || Vals.size() == 2) &&
24196 "Expected exactly 1 or 2 input values.");
24197 if (Vals.size() == 1) {
24198 // Do not create shuffle if the mask is a simple identity
24199 // non-resizing mask.
24200 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
24201 ->getNumElements() ||
24202 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
24203 return CreateShuffle(Vals.front(), nullptr, Mask);
24204 return Vals.front();
24205 }
24206 return CreateShuffle(Vals.front() ? Vals.front()
24207 : FirstInsert->getOperand(0),
24208 Vals.back(), Mask);
24209 });
24210 auto It = ShuffledInserts[I].InsertElements.rbegin();
24211 // Rebuild buildvector chain.
24212 InsertElementInst *II = nullptr;
24213 if (It != ShuffledInserts[I].InsertElements.rend())
24214 II = *It;
24216 while (It != ShuffledInserts[I].InsertElements.rend()) {
24217 assert(II && "Must be an insertelement instruction.");
24218 if (*It == II)
24219 ++It;
24220 else
24221 Inserts.push_back(II);
24222 II = dyn_cast<InsertElementInst>(II->getOperand(0));
24223 }
24224 for (Instruction *II : reverse(Inserts)) {
24225 II->replaceUsesOfWith(II->getOperand(0), NewInst);
24226 if (auto *NewI = dyn_cast<Instruction>(NewInst))
24227 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
24228 II->moveAfter(NewI);
24229 NewInst = II;
24230 }
24231 LastInsert->replaceAllUsesWith(NewInst);
24232 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
24233 IE->replaceUsesOfWith(IE->getOperand(0),
24234 PoisonValue::get(IE->getOperand(0)->getType()));
24235 IE->replaceUsesOfWith(IE->getOperand(1),
24236 PoisonValue::get(IE->getOperand(1)->getType()));
24237 eraseInstruction(IE);
24238 }
24239 CSEBlocks.insert(LastInsert->getParent());
24240 }
24241
24242 SmallVector<Instruction *> RemovedInsts;
24243 // For each vectorized value:
24244 for (auto &TEPtr : VectorizableTree) {
24245 TreeEntry *Entry = TEPtr.get();
24246
24247 // No need to handle users of gathered values.
24248 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
24249 DeletedNodes.contains(Entry) ||
24250 TransformedToGatherNodes.contains(Entry))
24251 continue;
24252
24253 if (Entry->CombinedOp == TreeEntry::ReducedBitcast ||
24254 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
24255 Entry->CombinedOp == TreeEntry::ReducedBitcastLoads ||
24256 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
24257 Entry->CombinedOp == TreeEntry::ReducedCmpBitcast) {
24258 // Skip constant node
24259 if (!Entry->hasState()) {
24260 assert(allConstant(Entry->Scalars) && "Expected constants only.");
24261 continue;
24262 }
24263 for (Value *Scalar : Entry->Scalars) {
24264 auto *I = dyn_cast<Instruction>(Scalar);
24265
24266 if (!I || Entry->isCopyableElement(I))
24267 continue;
24268 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *I << ".\n");
24269 RemovedInsts.push_back(I);
24270 }
24271 continue;
24272 }
24273
24274 assert(Entry->VectorizedValue && "Can't find vectorizable value");
24275
24276 // For each lane:
24277 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
24278 Value *Scalar = Entry->Scalars[Lane];
24279
24280 if (Entry->getOpcode() == Instruction::GetElementPtr &&
24281 !isa<GetElementPtrInst>(Scalar))
24282 continue;
24283 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
24284 EE && IgnoredExtracts.contains(EE))
24285 continue;
24286 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
24287 continue;
24288#ifndef NDEBUG
24289 Type *Ty = Scalar->getType();
24290 if (!Ty->isVoidTy()) {
24291 for (User *U : Scalar->users()) {
24292 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
24293
24294 // It is legal to delete users in the ignorelist.
24295 assert((isVectorized(U) ||
24296 (UserIgnoreList && UserIgnoreList->contains(U)) ||
24299 "Deleting out-of-tree value");
24300 }
24301 }
24302#endif
24303 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
24304 auto *I = cast<Instruction>(Scalar);
24305 RemovedInsts.push_back(I);
24306 }
24307 }
24308
24309 // Collect tree-entry vector instructions that ended up without any non-dead
24310 // uses (can happen with the non-power-of-2 paths, where intermediate vectors
24311 // get replaced but the original node still holds a reference).
24312 // FIXME: track and clear those references at replacement time instead of
24313 // sweeping after the fact. Done BEFORE mergeDIAssignID so any DIAssignID
24314 // from intermediate vectors is folded into the root vector instead of being
24315 // dropped on erase.
24316 {
24317 Value *RootVec = VectorizableTree.front()->VectorizedValue;
24318 // Seed with the scalars already queued for removal - a vector whose only
24319 // user is a scalar-to-be-removed is also dead.
24320 SmallPtrSet<Value *, 16> DeadSet(RemovedInsts.begin(), RemovedInsts.end());
24321 auto AllUsesAreDead = [&](const Value *V) {
24322 return all_of(V->users(),
24323 [&](const User *U) { return DeadSet.contains(U); });
24324 };
24325 SmallPtrSet<const Value *, 16> Candidates;
24326 SmallVector<Instruction *, 16> Worklist;
24327 for (const auto &TEPtr : VectorizableTree) {
24328 Value *VV = TEPtr->VectorizedValue;
24329 if (!VV || VV == RootVec || VV->getType()->isVoidTy() ||
24330 DeadSet.contains(VV))
24331 continue;
24332 auto *I = dyn_cast<Instruction>(VV);
24333 if (!I)
24334 continue;
24335 if (Candidates.insert(I).second)
24336 Worklist.push_back(I);
24337 }
24338 // Worklist sweep: when a vector becomes dead, its operand candidates
24339 // may have lost their last live user, so re-enqueue them. Cascaded dead
24340 // chains converge in O(N + total uses) instead of O(depth * N * uses).
24341 while (!Worklist.empty()) {
24342 Instruction *I = Worklist.pop_back_val();
24343 if (DeadSet.contains(I))
24344 continue;
24345 if (!I->use_empty() && !AllUsesAreDead(I))
24346 continue;
24347 DeadSet.insert(I);
24348 RemovedInsts.push_back(I);
24349 for (Value *Op : I->operand_values())
24350 if (auto *OI = dyn_cast<Instruction>(Op))
24351 if (Candidates.contains(OI) && !DeadSet.contains(OI))
24352 Worklist.push_back(OI);
24353 }
24354 }
24355
24356 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
24357 // new vector instruction.
24358 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
24359 V->mergeDIAssignID(RemovedInsts);
24360
24361 // Clear up reduction references, if any.
24362 if (UserIgnoreList) {
24363 for (Instruction *I : RemovedInsts) {
24364 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
24365 // Dead intermediate vector instructions collected above have no tree
24366 // entry - nothing to unlink here.
24367 if (Entries.empty())
24368 continue;
24369 const TreeEntry *IE = Entries.front();
24370 if (ArrayRef<TreeEntry *> SplitEntries = getSplitTreeEntries(I);
24371 !SplitEntries.empty() && SplitEntries.front()->Idx < IE->Idx)
24372 IE = SplitEntries.front();
24373 if (IE->Idx != 0 &&
24374 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
24375 (ValueToGatherNodes.lookup(I).contains(
24376 VectorizableTree.front().get()) ||
24377 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
24378 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
24379 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
24380 IE->UserTreeIndex &&
24381 is_contained(VectorizableTree.front()->Scalars, I)) &&
24382 !(GatheredLoadsEntriesFirst.has_value() &&
24383 IE->Idx >= *GatheredLoadsEntriesFirst &&
24384 VectorizableTree.front()->isGather() &&
24385 is_contained(VectorizableTree.front()->Scalars, I)) &&
24386 !(!VectorizableTree.front()->isGather() &&
24387 VectorizableTree.front()->isCopyableElement(I)))
24388 continue;
24389 SmallVector<SelectInst *> LogicalOpSelects;
24390 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
24391 // Do not replace condition of the logical op in form select <cond>.
24392 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
24393 (match(U.getUser(), m_LogicalAnd()) ||
24394 match(U.getUser(), m_LogicalOr())) &&
24395 U.getOperandNo() == 0;
24396 if (IsPoisoningLogicalOp) {
24397 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
24398 return false;
24399 }
24400 return UserIgnoreList->contains(U.getUser());
24401 });
24402 // Replace conditions of the poisoning logical ops with the non-poison
24403 // constant value.
24404 for (SelectInst *SI : LogicalOpSelects)
24405 SI->setCondition(Constant::getNullValue(SI->