SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
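// Illustrative example (not part of the original source): the canonical SLP
// seed is a group of consecutive stores whose stored values share the same
// operation, e.g. in C:
//
//   void add4(int *a, const int *b, const int *c) {
//     a[0] = b[0] + c[0];
//     a[1] = b[1] + c[1];
//     a[2] = b[2] + c[2];
//     a[3] = b[3] + c[3];
//   }
//
// Starting from the four stores, the pass walks the use-def chains bottom-up,
// bundles the four adds and the corresponding loads into a tree, and, if the
// cost model says it is profitable, emits vector loads, a single <4 x i32>
// add and one vector store instead of the scalar code.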
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<int>
117 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
118 cl::desc("Only vectorize if you gain more than this "
119 "number "));
120
122 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
123 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
124 "heuristics and makes vectorization decision via cost modeling."));
125
126static cl::opt<bool>
127ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
128 cl::desc("Attempt to vectorize horizontal reductions"));
129
131 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
132 cl::desc(
133 "Attempt to vectorize horizontal reductions feeding into a store"));
134
135// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
136// even if we match a reduction but do not vectorize in the end.
138 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
139 cl::desc("Allow optimization of original scalar identity operations on "
140 "matched horizontal reductions."));
141
142static cl::opt<int>
144 cl::desc("Attempt to vectorize for this register size in bits"));
145
148 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
149
150/// Limits the size of scheduling regions in a block.
151/// It avoids long compile times for _very_ large blocks where vector
152/// instructions are spread over a wide range.
153/// This limit is way higher than needed by real-world functions.
154static cl::opt<int>
155ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
156 cl::desc("Limit the size of the SLP scheduling region per block"));
157
159 "slp-min-reg-size", cl::init(128), cl::Hidden,
160 cl::desc("Attempt to vectorize for this register size in bits"));
161
163 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
164 cl::desc("Limit the recursion depth when building a vectorizable tree"));
165
167 "slp-min-tree-size", cl::init(3), cl::Hidden,
168 cl::desc("Only vectorize small trees if they are fully vectorizable"));
169
170// The maximum depth that the look-ahead score heuristic will explore.
171// The higher this value, the higher the compilation time overhead.
173 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
174 cl::desc("The maximum look-ahead depth for operand reordering scores"));
175
176// The maximum depth that the look-ahead score heuristic will explore
177// when it is probing among candidates for vectorization tree roots.
178// The higher this value, the higher the compilation time overhead, but unlike
179// the similar limit for operand ordering this is used less frequently, so the
180// impact of a higher value is less noticeable.
181static cl::opt<int> RootLookAheadMaxDepth(
182 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
183 cl::desc("The maximum look-ahead depth for searching best rooting option"));
184
186 "slp-min-strided-loads", cl::init(2), cl::Hidden,
187 cl::desc("The minimum number of loads, which should be considered strided, "
188 "if the stride is > 1 or is runtime value"));
189
191 "slp-max-stride", cl::init(8), cl::Hidden,
192 cl::desc("The maximum stride, considered to be profitable."));
193
194static cl::opt<bool>
195 ViewSLPTree("view-slp-tree", cl::Hidden,
196 cl::desc("Display the SLP trees with Graphviz"));
197
199 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
200 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
201
202// Limit the number of alias checks. The limit is chosen so that
203// it has no negative effect on the llvm benchmarks.
204static const unsigned AliasedCheckLimit = 10;
205
206// Limit of the number of uses for potentially transformed instructions/values,
207// used in checks to avoid compile-time explosion.
208static constexpr int UsesLimit = 8;
209
210// Another limit for the alias checks: The maximum distance between load/store
211// instructions where alias checks are done.
212// This limit is useful for very large basic blocks.
213static const unsigned MaxMemDepDistance = 160;
214
215/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
216/// regions to be handled.
217static const int MinScheduleRegionSize = 16;
218
219/// Maximum allowed number of operands in the PHI nodes.
220static const unsigned MaxPHINumOperands = 128;
221
222/// Predicate for the element types that the SLP vectorizer supports.
223///
224/// The most important things to filter here are types which are invalid in LLVM
225/// vectors. We also filter target specific types which have absolutely no
226/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
227/// avoids spending time checking the cost model and realizing that they will
228/// be inevitably scalarized.
229static bool isValidElementType(Type *Ty) {
230 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
231 !Ty->isPPC_FP128Ty();
232}
233
234/// \returns True if the value is a constant (but not globals/constant
235/// expressions).
236static bool isConstant(Value *V) {
237 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
238}
239
240/// Checks if \p V is one of vector-like instructions, i.e. undef,
241/// insertelement/extractelement with constant indices for fixed vector type or
242/// extractvalue instruction.
243static bool isVectorLikeInstWithConstOps(Value *V) {
244 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
245 !isa<ExtractValueInst, UndefValue>(V))
246 return false;
247 auto *I = dyn_cast<Instruction>(V);
248 if (!I || isa<ExtractValueInst>(I))
249 return true;
250 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
251 return false;
252 if (isa<ExtractElementInst>(I))
253 return isConstant(I->getOperand(1));
254 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
255 return isConstant(I->getOperand(2));
256}
257
258/// Returns power-of-2 number of elements in a single register (part), given the
259/// total number of elements \p Size and number of registers (parts) \p
260/// NumParts.
261static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
262 return PowerOf2Ceil(divideCeil(Size, NumParts));
263}
264
265/// Returns correct remaining number of elements, considering total amount \p
266/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
267/// and current register (part) \p Part.
268static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
269 unsigned Part) {
270 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
271}
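// Illustrative sketch (not part of the original source): splitting Size = 6
// scalars across NumParts = 2 registers gives
//   getPartNumElems(6, 2) == PowerOf2Ceil(divideCeil(6, 2)) == 4
//   getNumElems(6, /*PartNumElems=*/4, /*Part=*/0) == 4
//   getNumElems(6, /*PartNumElems=*/4, /*Part=*/1) == 2
// i.e. the first register holds 4 elements and the second the remaining 2.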
272
273#if !defined(NDEBUG)
274/// Print a short descriptor of the instruction bundle suitable for debug output.
275static std::string shortBundleName(ArrayRef<Value *> VL) {
276 std::string Result;
277 raw_string_ostream OS(Result);
278 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
279 OS.flush();
280 return Result;
281}
282#endif
283
284/// \returns true if all of the instructions in \p VL are in the same block or
285/// false otherwise.
286static bool allSameBlock(ArrayRef<Value *> VL) {
287 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
288 if (!I0)
289 return false;
290 if (all_of(VL, isVectorLikeInstWithConstOps))
291 return true;
292
293 BasicBlock *BB = I0->getParent();
294 for (int I = 1, E = VL.size(); I < E; I++) {
295 auto *II = dyn_cast<Instruction>(VL[I]);
296 if (!II)
297 return false;
298
299 if (BB != II->getParent())
300 return false;
301 }
302 return true;
303}
304
305/// \returns True if all of the values in \p VL are constants (but not
306/// globals/constant expressions).
307static bool allConstant(ArrayRef<Value *> VL) {
308 // Constant expressions and globals can't be vectorized like normal integer/FP
309 // constants.
310 return all_of(VL, isConstant);
311}
312
313/// \returns True if all of the values in \p VL are identical or some of them
314/// are UndefValue.
315static bool isSplat(ArrayRef<Value *> VL) {
316 Value *FirstNonUndef = nullptr;
317 for (Value *V : VL) {
318 if (isa<UndefValue>(V))
319 continue;
320 if (!FirstNonUndef) {
321 FirstNonUndef = V;
322 continue;
323 }
324 if (V != FirstNonUndef)
325 return false;
326 }
327 return FirstNonUndef != nullptr;
328}
329
330/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
331static bool isCommutative(Instruction *I) {
332 if (auto *Cmp = dyn_cast<CmpInst>(I))
333 return Cmp->isCommutative();
334 if (auto *BO = dyn_cast<BinaryOperator>(I))
335 return BO->isCommutative() ||
336 (BO->getOpcode() == Instruction::Sub &&
337 !BO->hasNUsesOrMore(UsesLimit) &&
338 all_of(
339 BO->uses(),
340 [](const Use &U) {
341 // Commutative, if icmp eq/ne sub, 0
342 ICmpInst::Predicate Pred;
343 if (match(U.getUser(),
344 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
345 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
346 return true;
347 // Commutative, if abs(sub nsw, true) or abs(sub, false).
348 ConstantInt *Flag;
349 return match(U.getUser(),
350 m_Intrinsic<Intrinsic::abs>(
351 m_Specific(U.get()), m_ConstantInt(Flag))) &&
352 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
353 Flag->isOne());
354 })) ||
355 (BO->getOpcode() == Instruction::FSub &&
356 !BO->hasNUsesOrMore(UsesLimit) &&
357 all_of(BO->uses(), [](const Use &U) {
358 return match(U.getUser(),
359 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
360 }));
361 return I->isCommutative();
362}
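// Illustrative example (not part of the original source): a subtraction whose
// only use is an equality compare against zero is treated as commutative
// here, because in IR such as
//   %d = sub nsw i32 %a, %b
//   %c = icmp eq i32 %d, 0
// swapping %a and %b does not change the result of the compare, which gives
// the operand reordering logic extra freedom.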
363
364/// \returns inserting index of InsertElement or InsertValue instruction,
365/// using Offset as base offset for index.
366static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
367 unsigned Offset = 0) {
368 int Index = Offset;
369 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
370 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
371 if (!VT)
372 return std::nullopt;
373 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
374 if (!CI)
375 return std::nullopt;
376 if (CI->getValue().uge(VT->getNumElements()))
377 return std::nullopt;
378 Index *= VT->getNumElements();
379 Index += CI->getZExtValue();
380 return Index;
381 }
382
383 const auto *IV = cast<InsertValueInst>(InsertInst);
384 Type *CurrentType = IV->getType();
385 for (unsigned I : IV->indices()) {
386 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
387 Index *= ST->getNumElements();
388 CurrentType = ST->getElementType(I);
389 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
390 Index *= AT->getNumElements();
391 CurrentType = AT->getElementType();
392 } else {
393 return std::nullopt;
394 }
395 Index += I;
396 }
397 return Index;
398}
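// Illustrative sketch (not part of the original source): for an insertvalue
// into the aggregate type {[2 x i32], [2 x i32]} at indices {1, 0}, the loop
// above flattens the position as
//   Index = 0;  Index = Index * 2 + 1 = 1;  Index = Index * 2 + 0 = 2
// so the scalar lands in flat lane 2 of the 4 scalar slots.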
399
400namespace {
401/// Specifies the way the mask should be analyzed for undefs/poisonous elements
402/// in the shuffle mask.
403enum class UseMask {
404 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
405 ///< check for the mask elements for the first argument (mask
406 ///< indices are in range [0:VF)).
407 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
408 ///< for the mask elements for the second argument (mask indices
409 ///< are in range [VF:2*VF))
410 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
411 ///< future shuffle elements and mark them as ones as being used
412 ///< in future. Non-undef elements are considered as unused since
413 ///< they're already marked as used in the mask.
414};
415} // namespace
416
417/// Prepares a use bitset for the given mask either for the first argument or
418/// for the second.
419static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
420 UseMask MaskArg) {
421 SmallBitVector UseMask(VF, true);
422 for (auto [Idx, Value] : enumerate(Mask)) {
423 if (Value == PoisonMaskElem) {
424 if (MaskArg == UseMask::UndefsAsMask)
425 UseMask.reset(Idx);
426 continue;
427 }
428 if (MaskArg == UseMask::FirstArg && Value < VF)
429 UseMask.reset(Value);
430 else if (MaskArg == UseMask::SecondArg && Value >= VF)
431 UseMask.reset(Value - VF);
432 }
433 return UseMask;
434}
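// Illustrative sketch (not part of the original source): with VF = 4 and
// Mask = {0, 5, PoisonMaskElem, 2},
//   buildUseMask(4, Mask, UseMask::FirstArg)  clears bits 0 and 2,
//   buildUseMask(4, Mask, UseMask::SecondArg) clears bit 1 (element 5 - VF),
//   buildUseMask(4, Mask, UseMask::UndefsAsMask) clears only bit 2, i.e. the
//   position holding the poison placeholder.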
435
436/// Checks if the given value is actually an undefined constant vector.
437/// Also, if the \p UseMask is not empty, tries to check if the non-masked
438/// elements actually mask the insertelement buildvector, if any.
439template <bool IsPoisonOnly = false>
440static SmallBitVector isUndefVector(const Value *V,
441 const SmallBitVector &UseMask = {}) {
442 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
443 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
444 if (isa<T>(V))
445 return Res;
446 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
447 if (!VecTy)
448 return Res.reset();
449 auto *C = dyn_cast<Constant>(V);
450 if (!C) {
451 if (!UseMask.empty()) {
452 const Value *Base = V;
453 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
454 Base = II->getOperand(0);
455 if (isa<T>(II->getOperand(1)))
456 continue;
457 std::optional<unsigned> Idx = getInsertIndex(II);
458 if (!Idx) {
459 Res.reset();
460 return Res;
461 }
462 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
463 Res.reset(*Idx);
464 }
465 // TODO: Add analysis for shuffles here too.
466 if (V == Base) {
467 Res.reset();
468 } else {
469 SmallBitVector SubMask(UseMask.size(), false);
470 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
471 }
472 } else {
473 Res.reset();
474 }
475 return Res;
476 }
477 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
478 if (Constant *Elem = C->getAggregateElement(I))
479 if (!isa<T>(Elem) &&
480 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
481 Res.reset(I);
482 }
483 return Res;
484}
485
486/// Checks if the vector of instructions can be represented as a shuffle, like:
487/// %x0 = extractelement <4 x i8> %x, i32 0
488/// %x3 = extractelement <4 x i8> %x, i32 3
489/// %y1 = extractelement <4 x i8> %y, i32 1
490/// %y2 = extractelement <4 x i8> %y, i32 2
491/// %x0x0 = mul i8 %x0, %x0
492/// %x3x3 = mul i8 %x3, %x3
493/// %y1y1 = mul i8 %y1, %y1
494/// %y2y2 = mul i8 %y2, %y2
495/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
496/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
497/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
498/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
499/// ret <4 x i8> %ins4
500/// can be transformed into:
501/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
502/// i32 6>
503/// %2 = mul <4 x i8> %1, %1
504/// ret <4 x i8> %2
505/// Mask will return the Shuffle Mask equivalent to the extracted elements.
506/// TODO: Can we split off and reuse the shuffle mask detection from
507/// ShuffleVectorInst/getShuffleCost?
508static std::optional<TargetTransformInfo::ShuffleKind>
509isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
510 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
511 if (It == VL.end())
512 return std::nullopt;
513 auto *EI0 = cast<ExtractElementInst>(*It);
514 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
515 return std::nullopt;
516 unsigned Size =
517 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
518 Value *Vec1 = nullptr;
519 Value *Vec2 = nullptr;
520 bool HasNonUndefVec = any_of(VL, [](Value *V) {
521 auto *EE = dyn_cast<ExtractElementInst>(V);
522 if (!EE)
523 return false;
524 Value *Vec = EE->getVectorOperand();
525 if (isa<UndefValue>(Vec))
526 return false;
527 return isGuaranteedNotToBePoison(Vec);
528 });
529 enum ShuffleMode { Unknown, Select, Permute };
530 ShuffleMode CommonShuffleMode = Unknown;
531 Mask.assign(VL.size(), PoisonMaskElem);
532 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
533 // Undef can be represented as an undef element in a vector.
534 if (isa<UndefValue>(VL[I]))
535 continue;
536 auto *EI = cast<ExtractElementInst>(VL[I]);
537 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
538 return std::nullopt;
539 auto *Vec = EI->getVectorOperand();
540 // We can extractelement from undef or poison vector.
541 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
542 continue;
543 // All vector operands must have the same number of vector elements.
544 if (isa<UndefValue>(Vec)) {
545 Mask[I] = I;
546 } else {
547 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
548 return std::nullopt;
549 if (isa<UndefValue>(EI->getIndexOperand()))
550 continue;
551 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
552 if (!Idx)
553 return std::nullopt;
554 // Undefined behavior if Idx is negative or >= Size.
555 if (Idx->getValue().uge(Size))
556 continue;
557 unsigned IntIdx = Idx->getValue().getZExtValue();
558 Mask[I] = IntIdx;
559 }
560 if (isUndefVector(Vec).all() && HasNonUndefVec)
561 continue;
562 // For correct shuffling we have to have at most 2 different vector operands
563 // in all extractelement instructions.
564 if (!Vec1 || Vec1 == Vec) {
565 Vec1 = Vec;
566 } else if (!Vec2 || Vec2 == Vec) {
567 Vec2 = Vec;
568 Mask[I] += Size;
569 } else {
570 return std::nullopt;
571 }
572 if (CommonShuffleMode == Permute)
573 continue;
574 // If the extract index is not the same as the operation number, it is a
575 // permutation.
576 if (Mask[I] % Size != I) {
577 CommonShuffleMode = Permute;
578 continue;
579 }
580 CommonShuffleMode = Select;
581 }
582 // If we're not crossing lanes in different vectors, consider it as blending.
583 if (CommonShuffleMode == Select && Vec2)
585 // If Vec2 was never used, we have a permutation of a single vector, otherwise
586 // we have permutation of 2 vectors.
589}
590
591/// \returns True if Extract{Value,Element} instruction extracts element Idx.
592static std::optional<unsigned> getExtractIndex(Instruction *E) {
593 unsigned Opcode = E->getOpcode();
594 assert((Opcode == Instruction::ExtractElement ||
595 Opcode == Instruction::ExtractValue) &&
596 "Expected extractelement or extractvalue instruction.");
597 if (Opcode == Instruction::ExtractElement) {
598 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
599 if (!CI)
600 return std::nullopt;
601 return CI->getZExtValue();
602 }
603 auto *EI = cast<ExtractValueInst>(E);
604 if (EI->getNumIndices() != 1)
605 return std::nullopt;
606 return *EI->idx_begin();
607}
608
609namespace {
610
611/// Main data required for vectorization of instructions.
612struct InstructionsState {
613 /// The very first instruction in the list with the main opcode.
614 Value *OpValue = nullptr;
615
616 /// The main/alternate instruction.
617 Instruction *MainOp = nullptr;
618 Instruction *AltOp = nullptr;
619
620 /// The main/alternate opcodes for the list of instructions.
621 unsigned getOpcode() const {
622 return MainOp ? MainOp->getOpcode() : 0;
623 }
624
625 unsigned getAltOpcode() const {
626 return AltOp ? AltOp->getOpcode() : 0;
627 }
628
629 /// Some of the instructions in the list have alternate opcodes.
630 bool isAltShuffle() const { return AltOp != MainOp; }
631
632 bool isOpcodeOrAlt(Instruction *I) const {
633 unsigned CheckedOpcode = I->getOpcode();
634 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
635 }
636
637 InstructionsState() = delete;
638 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
639 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
640};
641
642} // end anonymous namespace
643
644/// Chooses the correct key for scheduling data. If \p Op has the same (or
645/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
646/// OpValue.
647static Value *isOneOf(const InstructionsState &S, Value *Op) {
648 auto *I = dyn_cast<Instruction>(Op);
649 if (I && S.isOpcodeOrAlt(I))
650 return Op;
651 return S.OpValue;
652}
653
654/// \returns true if \p Opcode is allowed as part of the main/alternate
655/// instruction for SLP vectorization.
656///
657/// Example of unsupported opcode is SDIV that can potentially cause UB if the
658/// "shuffled out" lane would result in division by zero.
659static bool isValidForAlternation(unsigned Opcode) {
660 if (Instruction::isIntDivRem(Opcode))
661 return false;
662
663 return true;
664}
665
666static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
667 const TargetLibraryInfo &TLI,
668 unsigned BaseIndex = 0);
669
670/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
671/// compatible instructions or constants, or just some other regular values.
672static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
673 Value *Op1, const TargetLibraryInfo &TLI) {
674 return (isConstant(BaseOp0) && isConstant(Op0)) ||
675 (isConstant(BaseOp1) && isConstant(Op1)) ||
676 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
677 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
678 BaseOp0 == Op0 || BaseOp1 == Op1 ||
679 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
680 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
681}
682
683/// \returns true if a compare instruction \p CI has similar "look" and
684/// same predicate as \p BaseCI, "as is" or with its operands and predicate
685/// swapped, false otherwise.
686static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
687 const TargetLibraryInfo &TLI) {
688 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
689 "Assessing comparisons of different types?");
690 CmpInst::Predicate BasePred = BaseCI->getPredicate();
691 CmpInst::Predicate Pred = CI->getPredicate();
692 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
693
694 Value *BaseOp0 = BaseCI->getOperand(0);
695 Value *BaseOp1 = BaseCI->getOperand(1);
696 Value *Op0 = CI->getOperand(0);
697 Value *Op1 = CI->getOperand(1);
698
699 return (BasePred == Pred &&
700 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
701 (BasePred == SwappedPred &&
702 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
703}
704
705/// \returns analysis of the Instructions in \p VL described in
706/// InstructionsState, the Opcode that we suppose the whole list
707/// could be vectorized even if its structure is diverse.
708static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
709 const TargetLibraryInfo &TLI,
710 unsigned BaseIndex) {
711 // Make sure these are all Instructions.
712 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
713 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
714
715 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
716 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
717 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
718 CmpInst::Predicate BasePred =
719 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
720 : CmpInst::BAD_ICMP_PREDICATE;
721 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
722 unsigned AltOpcode = Opcode;
723 unsigned AltIndex = BaseIndex;
724
725 bool SwappedPredsCompatible = [&]() {
726 if (!IsCmpOp)
727 return false;
728 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
729 UniquePreds.insert(BasePred);
730 UniqueNonSwappedPreds.insert(BasePred);
731 for (Value *V : VL) {
732 auto *I = dyn_cast<CmpInst>(V);
733 if (!I)
734 return false;
735 CmpInst::Predicate CurrentPred = I->getPredicate();
736 CmpInst::Predicate SwappedCurrentPred =
737 CmpInst::getSwappedPredicate(CurrentPred);
738 UniqueNonSwappedPreds.insert(CurrentPred);
739 if (!UniquePreds.contains(CurrentPred) &&
740 !UniquePreds.contains(SwappedCurrentPred))
741 UniquePreds.insert(CurrentPred);
742 }
743 // Total number of predicates > 2, but if consider swapped predicates
744 // compatible only 2, consider swappable predicates as compatible opcodes,
745 // not alternate.
746 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
747 }();
748 // Check for one alternate opcode from another BinaryOperator.
749 // TODO - generalize to support all operators (types, calls etc.).
750 auto *IBase = cast<Instruction>(VL[BaseIndex]);
751 Intrinsic::ID BaseID = 0;
752 SmallVector<VFInfo> BaseMappings;
753 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
755 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
756 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
757 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
758 }
759 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
760 auto *I = cast<Instruction>(VL[Cnt]);
761 unsigned InstOpcode = I->getOpcode();
762 if (IsBinOp && isa<BinaryOperator>(I)) {
763 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
764 continue;
765 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
766 isValidForAlternation(Opcode)) {
767 AltOpcode = InstOpcode;
768 AltIndex = Cnt;
769 continue;
770 }
771 } else if (IsCastOp && isa<CastInst>(I)) {
772 Value *Op0 = IBase->getOperand(0);
773 Type *Ty0 = Op0->getType();
774 Value *Op1 = I->getOperand(0);
775 Type *Ty1 = Op1->getType();
776 if (Ty0 == Ty1) {
777 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
778 continue;
779 if (Opcode == AltOpcode) {
780 assert(isValidForAlternation(Opcode) &&
781 isValidForAlternation(InstOpcode) &&
782 "Cast isn't safe for alternation, logic needs to be updated!");
783 AltOpcode = InstOpcode;
784 AltIndex = Cnt;
785 continue;
786 }
787 }
788 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
789 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
790 Type *Ty0 = BaseInst->getOperand(0)->getType();
791 Type *Ty1 = Inst->getOperand(0)->getType();
792 if (Ty0 == Ty1) {
793 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
794 // Check for compatible operands. If the corresponding operands are not
795 // compatible - need to perform alternate vectorization.
796 CmpInst::Predicate CurrentPred = Inst->getPredicate();
797 CmpInst::Predicate SwappedCurrentPred =
798 CmpInst::getSwappedPredicate(CurrentPred);
799
800 if ((E == 2 || SwappedPredsCompatible) &&
801 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
802 continue;
803
804 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
805 continue;
806 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
807 if (AltIndex != BaseIndex) {
808 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
809 continue;
810 } else if (BasePred != CurrentPred) {
811 assert(
812 isValidForAlternation(InstOpcode) &&
813 "CmpInst isn't safe for alternation, logic needs to be updated!");
814 AltIndex = Cnt;
815 continue;
816 }
817 CmpInst::Predicate AltPred = AltInst->getPredicate();
818 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
819 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
820 continue;
821 }
822 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
823 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
824 if (Gep->getNumOperands() != 2 ||
825 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
826 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
827 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
828 if (!isVectorLikeInstWithConstOps(EI))
829 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
830 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
831 auto *BaseLI = cast<LoadInst>(IBase);
832 if (!LI->isSimple() || !BaseLI->isSimple())
833 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
834 } else if (auto *Call = dyn_cast<CallInst>(I)) {
835 auto *CallBase = cast<CallInst>(IBase);
836 if (Call->getCalledFunction() != CallBase->getCalledFunction())
837 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
838 if (Call->hasOperandBundles() &&
839 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
840 Call->op_begin() + Call->getBundleOperandsEndIndex(),
841 CallBase->op_begin() +
842 CallBase->getBundleOperandsStartIndex()))
843 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
844 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
845 if (ID != BaseID)
846 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
847 if (!ID) {
848 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
849 if (Mappings.size() != BaseMappings.size() ||
850 Mappings.front().ISA != BaseMappings.front().ISA ||
851 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
852 Mappings.front().VectorName != BaseMappings.front().VectorName ||
853 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
854 Mappings.front().Shape.Parameters !=
855 BaseMappings.front().Shape.Parameters)
856 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
857 }
858 }
859 continue;
860 }
861 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
862 }
863
864 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
865 cast<Instruction>(VL[AltIndex]));
866}
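// Illustrative example (not part of the original source): for the bundle
// {add, sub, add, sub} of i32 BinaryOperators in one block, getSameOpcode
// returns an InstructionsState with MainOp = the first add and AltOp = the
// first sub, so isAltShuffle() is true and the bundle can be vectorized as a
// vector add and a vector sub blended together with a shufflevector.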
867
868/// \returns true if all of the values in \p VL have the same type or false
869/// otherwise.
870static bool allSameType(ArrayRef<Value *> VL) {
871 Type *Ty = VL.front()->getType();
872 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
873}
874
875/// \returns True if in-tree use also needs extract. This refers to
876/// possible scalar operand in vectorized instruction.
877static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
878 TargetLibraryInfo *TLI) {
879 unsigned Opcode = UserInst->getOpcode();
880 switch (Opcode) {
881 case Instruction::Load: {
882 LoadInst *LI = cast<LoadInst>(UserInst);
883 return (LI->getPointerOperand() == Scalar);
884 }
885 case Instruction::Store: {
886 StoreInst *SI = cast<StoreInst>(UserInst);
887 return (SI->getPointerOperand() == Scalar);
888 }
889 case Instruction::Call: {
890 CallInst *CI = cast<CallInst>(UserInst);
891 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
892 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
893 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
894 Arg.value().get() == Scalar;
895 });
896 }
897 default:
898 return false;
899 }
900}
901
902/// \returns the AA location that is being access by the instruction.
903static MemoryLocation getLocation(Instruction *I) {
904 if (StoreInst *SI = dyn_cast<StoreInst>(I))
905 return MemoryLocation::get(SI);
906 if (LoadInst *LI = dyn_cast<LoadInst>(I))
907 return MemoryLocation::get(LI);
908 return MemoryLocation();
909}
910
911/// \returns True if the instruction is not a volatile or atomic load/store.
912static bool isSimple(Instruction *I) {
913 if (LoadInst *LI = dyn_cast<LoadInst>(I))
914 return LI->isSimple();
915 if (StoreInst *SI = dyn_cast<StoreInst>(I))
916 return SI->isSimple();
917 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
918 return !MI->isVolatile();
919 return true;
920}
921
922/// Shuffles \p Mask in accordance with the given \p SubMask.
923/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
924/// one but two input vectors.
925static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
926 bool ExtendingManyInputs = false) {
927 if (SubMask.empty())
928 return;
929 assert(
930 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
931 // Check if input scalars were extended to match the size of other node.
932 (SubMask.size() == Mask.size() &&
933 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
934 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
935 "SubMask with many inputs support must be larger than the mask.");
936 if (Mask.empty()) {
937 Mask.append(SubMask.begin(), SubMask.end());
938 return;
939 }
940 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
941 int TermValue = std::min(Mask.size(), SubMask.size());
942 for (int I = 0, E = SubMask.size(); I < E; ++I) {
943 if (SubMask[I] == PoisonMaskElem ||
944 (!ExtendingManyInputs &&
945 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
946 continue;
947 NewMask[I] = Mask[SubMask[I]];
948 }
949 Mask.swap(NewMask);
950}
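// Illustrative sketch (not part of the original source): composing masks with
// addMask indexes the accumulated Mask by the new SubMask. For
//   Mask    = {1, 0, 3, 2}
//   SubMask = {2, 3, 0, 1}
// the result is {Mask[2], Mask[3], Mask[0], Mask[1]} = {3, 2, 1, 0}, i.e. the
// effect of applying SubMask after the existing Mask.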
951
952/// Order may have elements assigned special value (size) which is out of
953/// bounds. Such indices only appear in places which correspond to undef values
954/// (see canReuseExtract for details) and are used to avoid undef values having
955/// an effect on operand ordering.
956/// The first loop below simply finds all unused indices and then the next loop
957/// nest assigns these indices for undef values positions.
958/// As an example below Order has two undef positions and they have assigned
959/// values 3 and 7 respectively:
960/// before: 6 9 5 4 9 2 1 0
961/// after: 6 3 5 4 7 2 1 0
962static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
963 const unsigned Sz = Order.size();
964 SmallBitVector UnusedIndices(Sz, /*t=*/true);
965 SmallBitVector MaskedIndices(Sz);
966 for (unsigned I = 0; I < Sz; ++I) {
967 if (Order[I] < Sz)
968 UnusedIndices.reset(Order[I]);
969 else
970 MaskedIndices.set(I);
971 }
972 if (MaskedIndices.none())
973 return;
974 assert(UnusedIndices.count() == MaskedIndices.count() &&
975 "Non-synced masked/available indices.");
976 int Idx = UnusedIndices.find_first();
977 int MIdx = MaskedIndices.find_first();
978 while (MIdx >= 0) {
979 assert(Idx >= 0 && "Indices must be synced.");
980 Order[MIdx] = Idx;
981 Idx = UnusedIndices.find_next(Idx);
982 MIdx = MaskedIndices.find_next(MIdx);
983 }
984}
985
986namespace llvm {
987
988static void inversePermutation(ArrayRef<unsigned> Indices,
989 SmallVectorImpl<int> &Mask) {
990 Mask.clear();
991 const unsigned E = Indices.size();
992 Mask.resize(E, PoisonMaskElem);
993 for (unsigned I = 0; I < E; ++I)
994 Mask[Indices[I]] = I;
995}
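// Illustrative sketch (not part of the original source): for the order
// Indices = {2, 0, 1} the inverse permutation is Mask = {1, 2, 0}, since
// Mask[Indices[I]] = I places element I back into its original position.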
996
997/// Reorders the list of scalars in accordance with the given \p Mask.
998static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
999 ArrayRef<int> Mask) {
1000 assert(!Mask.empty() && "Expected non-empty mask.");
1001 SmallVector<Value *> Prev(Scalars.size(),
1002 UndefValue::get(Scalars.front()->getType()));
1003 Prev.swap(Scalars);
1004 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1005 if (Mask[I] != PoisonMaskElem)
1006 Scalars[Mask[I]] = Prev[I];
1007}
1008
1009/// Checks if the provided value does not require scheduling. It does not
1010/// require scheduling if this is not an instruction or it is an instruction
1011/// that does not read/write memory and all operands are either not instructions
1012/// or phi nodes or instructions from different blocks.
1013static bool areAllOperandsNonInsts(Value *V) {
1014 auto *I = dyn_cast<Instruction>(V);
1015 if (!I)
1016 return true;
1017 return !mayHaveNonDefUseDependency(*I) &&
1018 all_of(I->operands(), [I](Value *V) {
1019 auto *IO = dyn_cast<Instruction>(V);
1020 if (!IO)
1021 return true;
1022 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1023 });
1024}
1025
1026/// Checks if the provided value does not require scheduling. It does not
1027/// require scheduling if this is not an instruction or it is an instruction
1028/// that does not read/write memory and all users are phi nodes or instructions
1029/// from the different blocks.
1030static bool isUsedOutsideBlock(Value *V) {
1031 auto *I = dyn_cast<Instruction>(V);
1032 if (!I)
1033 return true;
1034 // Limits the number of uses to save compile time.
1035 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1036 all_of(I->users(), [I](User *U) {
1037 auto *IU = dyn_cast<Instruction>(U);
1038 if (!IU)
1039 return true;
1040 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1041 });
1042}
1043
1044/// Checks if the specified value does not require scheduling. It does not
1045/// require scheduling if all operands and all users do not need to be scheduled
1046/// in the current basic block.
1047static bool doesNotNeedToBeScheduled(Value *V) {
1048 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1049}
1050
1051/// Checks if the specified array of instructions does not require scheduling.
1052/// It is so if all either instructions have operands that do not require
1053/// scheduling or their users do not require scheduling since they are phis or
1054/// in other basic blocks.
1055static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1056 return !VL.empty() &&
1057 all_of(VL, doesNotNeedToBeScheduled);
1058}
1059
1060namespace slpvectorizer {
1061
1062/// Bottom Up SLP Vectorizer.
1063class BoUpSLP {
1064 struct TreeEntry;
1065 struct ScheduleData;
1068
1069public:
1070 /// Tracks the state we can represent the loads in the given sequence.
1071 enum class LoadsState {
1072 Gather,
1073 Vectorize,
1074 ScatterVectorize,
1075 StridedVectorize
1076 };
1077
1085
1086 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1087 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1088 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1089 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1090 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1091 AC(AC), DB(DB), DL(DL), ORE(ORE),
1092 Builder(Se->getContext(), TargetFolder(*DL)) {
1093 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1094 // Use the vector register size specified by the target unless overridden
1095 // by a command-line option.
1096 // TODO: It would be better to limit the vectorization factor based on
1097 // data type rather than just register size. For example, x86 AVX has
1098 // 256-bit registers, but it does not support integer operations
1099 // at that width (that requires AVX2).
1100 if (MaxVectorRegSizeOption.getNumOccurrences())
1101 MaxVecRegSize = MaxVectorRegSizeOption;
1102 else
1103 MaxVecRegSize =
1104 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1105 .getFixedValue();
1106
1107 if (MinVectorRegSizeOption.getNumOccurrences())
1108 MinVecRegSize = MinVectorRegSizeOption;
1109 else
1110 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1111 }
1112
1113 /// Vectorize the tree that starts with the elements in \p VL.
1114 /// Returns the vectorized root.
1115 Value *vectorizeTree();
1116
1117 /// Vectorize the tree but with the list of externally used values \p
1118 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1119 /// generated extractvalue instructions.
1120 /// \param ReplacedExternals contains the list of replaced external values
1121 /// {scalar, replace} after emitting extractelement for external uses.
1122 Value *
1123 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1124 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1125 Instruction *ReductionRoot = nullptr);
1126
1127 /// \returns the cost incurred by unwanted spills and fills, caused by
1128 /// holding live values over call sites.
1129 InstructionCost getSpillCost() const;
1130
1131 /// \returns the vectorization cost of the subtree that starts at \p VL.
1132 /// A negative number means that this is profitable.
1133 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1134
1135 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1136 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1137 void buildTree(ArrayRef<Value *> Roots,
1138 const SmallDenseSet<Value *> &UserIgnoreLst);
1139
1140 /// Construct a vectorizable tree that starts at \p Roots.
1141 void buildTree(ArrayRef<Value *> Roots);
1142
1143 /// Returns whether the root node has in-tree uses.
1144 bool doesRootHaveInTreeUses() const {
1145 return !VectorizableTree.empty() &&
1146 !VectorizableTree.front()->UserTreeIndices.empty();
1147 }
1148
1149 /// Return the scalars of the root node.
1150 ArrayRef<Value *> getRootNodeScalars() const {
1151 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1152 return VectorizableTree.front()->Scalars;
1153 }
1154
1155 /// Builds external uses of the vectorized scalars, i.e. the list of
1156 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1157 /// ExternallyUsedValues contains additional list of external uses to handle
1158 /// vectorization of reductions.
1159 void
1160 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1161
1162 /// Transforms graph nodes to target specific representations, if profitable.
1163 void transformNodes();
1164
1165 /// Clear the internal data structures that are created by 'buildTree'.
1166 void deleteTree() {
1167 VectorizableTree.clear();
1168 ScalarToTreeEntry.clear();
1169 MultiNodeScalars.clear();
1170 MustGather.clear();
1171 NonScheduledFirst.clear();
1172 EntryToLastInstruction.clear();
1173 ExternalUses.clear();
1174 ExternalUsesAsGEPs.clear();
1175 for (auto &Iter : BlocksSchedules) {
1176 BlockScheduling *BS = Iter.second.get();
1177 BS->clear();
1178 }
1179 MinBWs.clear();
1180 ReductionBitWidth = 0;
1181 CastMaxMinBWSizes.reset();
1182 ExtraBitWidthNodes.clear();
1183 InstrElementSize.clear();
1184 UserIgnoreList = nullptr;
1185 PostponedGathers.clear();
1186 ValueToGatherNodes.clear();
1187 }
1188
1189 unsigned getTreeSize() const { return VectorizableTree.size(); }
1190
1191 /// Perform LICM and CSE on the newly generated gather sequences.
1192 void optimizeGatherSequence();
1193
1194 /// Checks if the specified gather tree entry \p TE can be represented as a
1195 /// shuffled vector entry + (possibly) permutation with other gathers. It
1196 /// implements the checks only for possibly ordered scalars (Loads,
1197 /// ExtractElement, ExtractValue), which can be part of the graph.
1198 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1199
1200 /// Sort loads into increasing pointers offsets to allow greater clustering.
1201 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1202
1203 /// Gets reordering data for the given tree entry. If the entry is vectorized
1204 /// - just return ReorderIndices, otherwise check if the scalars can be
1205 /// reordered and return the most optimal order.
1206 /// \return std::nullopt if ordering is not important, empty order, if
1207 /// identity order is important, or the actual order.
1208 /// \param TopToBottom If true, include the order of vectorized stores and
1209 /// insertelement nodes, otherwise skip them.
1210 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1211 bool TopToBottom);
1212
1213 /// Reorders the current graph to the most profitable order starting from the
1214 /// root node to the leaf nodes. The best order is chosen only from the nodes
1215 /// of the same size (vectorization factor). Smaller nodes are considered
1216 /// parts of a subgraph with a smaller VF and they are reordered independently. We
1217 /// can do this because we still need to extend smaller nodes to the wider VF
1218 /// and we can merge reordering shuffles with the widening shuffles.
1219 void reorderTopToBottom();
1220
1221 /// Reorders the current graph to the most profitable order starting from
1222 /// leaves to the root. It allows rotating small subgraphs and reduces the
1223 /// number of reshuffles if the leaf nodes use the same order. In this case we
1224 /// can merge the orders and just shuffle the user node instead of shuffling its
1225 /// operands. Moreover, even if the leaf nodes have different orders, it allows
1226 /// sinking reordering in the graph closer to the root node and merging it later
1227 /// during analysis.
1228 void reorderBottomToTop(bool IgnoreReorder = false);
1229
1230 /// \return The vector element size in bits to use when vectorizing the
1231 /// expression tree ending at \p V. If V is a store, the size is the width of
1232 /// the stored value. Otherwise, the size is the width of the largest loaded
1233 /// value reaching V. This method is used by the vectorizer to calculate
1234 /// vectorization factors.
1235 unsigned getVectorElementSize(Value *V);
1236
1237 /// Compute the minimum type sizes required to represent the entries in a
1238 /// vectorizable tree.
1239 void computeMinimumValueSizes();
1240
1241 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1242 unsigned getMaxVecRegSize() const {
1243 return MaxVecRegSize;
1244 }
1245
1246 // \returns minimum vector register size as set by cl::opt.
1247 unsigned getMinVecRegSize() const {
1248 return MinVecRegSize;
1249 }
1250
1251 unsigned getMinVF(unsigned Sz) const {
1252 return std::max(2U, getMinVecRegSize() / Sz);
1253 }
1254
1255 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1256 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1257 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1258 return MaxVF ? MaxVF : UINT_MAX;
1259 }
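 // Illustrative sketch (not part of the original source): with the default
 // MinVecRegSize of 128 bits, getMinVF(32) returns max(2, 128/32) = 4 lanes
 // for i32 elements and getMinVF(64) returns 2 lanes for i64/double. If
 // -slp-max-vf is not set, getMaximumVF defers to TTI and treats a returned
 // 0 as "unlimited".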
1260
1261 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1262 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1263 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1264 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1265 ///
1266 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1267 unsigned canMapToVector(Type *T) const;
1268
1269 /// \returns True if the VectorizableTree is both tiny and not fully
1270 /// vectorizable. We do not vectorize such trees.
1271 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1272
1273 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1274 /// can be load combined in the backend. Load combining may not be allowed in
1275 /// the IR optimizer, so we do not want to alter the pattern. For example,
1276 /// partially transforming a scalar bswap() pattern into vector code is
1277 /// effectively impossible for the backend to undo.
1278 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1279 /// may not be necessary.
1280 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1281
1282 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1283 /// can be load combined in the backend. Load combining may not be allowed in
1284 /// the IR optimizer, so we do not want to alter the pattern. For example,
1285 /// partially transforming a scalar bswap() pattern into vector code is
1286 /// effectively impossible for the backend to undo.
1287 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1288 /// may not be necessary.
1289 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1290
1291 /// Checks if the given array of loads can be represented as a vectorized,
1292 /// scatter or just simple gather.
1293 /// \param VL list of loads.
1294 /// \param VL0 main load value.
1295 /// \param Order returned order of load instructions.
1296 /// \param PointerOps returned list of pointer operands.
1297 /// \param TryRecursiveCheck used to check if long masked gather can be
1298 /// represented as a series of loads/insert subvector, if profitable.
1299 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1300 SmallVectorImpl<unsigned> &Order,
1301 SmallVectorImpl<Value *> &PointerOps,
1302 bool TryRecursiveCheck = true) const;
1303
1305
1306 /// This structure holds any data we need about the edges being traversed
1307 /// during buildTree_rec(). We keep track of:
1308 /// (i) the user TreeEntry index, and
1309 /// (ii) the index of the edge.
1310 struct EdgeInfo {
1311 EdgeInfo() = default;
1312 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1313 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1314 /// The user TreeEntry.
1315 TreeEntry *UserTE = nullptr;
1316 /// The operand index of the use.
1317 unsigned EdgeIdx = UINT_MAX;
1318#ifndef NDEBUG
1319 friend inline raw_ostream &operator<<(raw_ostream &OS,
1320 const BoUpSLP::EdgeInfo &EI) {
1321 EI.dump(OS);
1322 return OS;
1323 }
1324 /// Debug print.
1325 void dump(raw_ostream &OS) const {
1326 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1327 << " EdgeIdx:" << EdgeIdx << "}";
1328 }
1329 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1330#endif
1331 bool operator == (const EdgeInfo &Other) const {
1332 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1333 }
1334 };
1335
1336 /// A helper class used for scoring candidates for two consecutive lanes.
1337 class LookAheadHeuristics {
1338 const TargetLibraryInfo &TLI;
1339 const DataLayout &DL;
1340 ScalarEvolution &SE;
1341 const BoUpSLP &R;
1342 int NumLanes; // Total number of lanes (aka vectorization factor).
1343 int MaxLevel; // The maximum recursion depth for accumulating score.
1344
1345 public:
1346 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1347 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1348 int MaxLevel)
1349 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1350 MaxLevel(MaxLevel) {}
1351
1352 // The hard-coded scores listed here are not very important, though it shall
1353 // be higher for better matches to improve the resulting cost. When
1354 // computing the scores of matching one sub-tree with another, we are
1355 // basically counting the number of values that are matching. So even if all
1356 // scores are set to 1, we would still get a decent matching result.
1357 // However, sometimes we have to break ties. For example we may have to
1358 // choose between matching loads vs matching opcodes. This is what these
1359 // scores are helping us with: they provide the order of preference. Also,
1360 // this is important if the scalar is externally used or used in another
1361 // tree entry node in the different lane.
1362
1363 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1364 static const int ScoreConsecutiveLoads = 4;
1365 /// The same load multiple times. This should have a better score than
1366 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1367 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1368 /// a vector load and 1.0 for a broadcast.
1369 static const int ScoreSplatLoads = 3;
1370 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1371 static const int ScoreReversedLoads = 3;
1372 /// A load candidate for masked gather.
1373 static const int ScoreMaskedGatherCandidate = 1;
1374 /// ExtractElementInst from same vector and consecutive indexes.
1375 static const int ScoreConsecutiveExtracts = 4;
1376 /// ExtractElementInst from same vector and reversed indices.
1377 static const int ScoreReversedExtracts = 3;
1378 /// Constants.
1379 static const int ScoreConstants = 2;
1380 /// Instructions with the same opcode.
1381 static const int ScoreSameOpcode = 2;
1382 /// Instructions with alt opcodes (e.g, add + sub).
1383 static const int ScoreAltOpcodes = 1;
1384 /// Identical instructions (a.k.a. splat or broadcast).
1385 static const int ScoreSplat = 1;
1386 /// Matching with an undef is preferable to failing.
1387 static const int ScoreUndef = 1;
1388 /// Score for failing to find a decent match.
1389 static const int ScoreFail = 0;
1390 /// Score if all users are vectorized.
1391 static const int ScoreAllUserVectorized = 1;
1392
1393 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1394 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1395 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1396 /// MainAltOps.
1397 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1398 ArrayRef<Value *> MainAltOps) const {
1399 if (!isValidElementType(V1->getType()) ||
1400 !isValidElementType(V2->getType()))
1401 return LookAheadHeuristics::ScoreFail;
1402
1403 if (V1 == V2) {
1404 if (isa<LoadInst>(V1)) {
1405 // Returns true if the users of V1 and V2 won't need to be extracted.
1406 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1407 // Bail out if we have too many uses to save compilation time.
1408 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1409 return false;
1410
1411 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1412 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1413 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1414 });
1415 };
1416 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1417 };
1418 // A broadcast of a load can be cheaper on some targets.
1419 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1420 ElementCount::getFixed(NumLanes)) &&
1421 ((int)V1->getNumUses() == NumLanes ||
1422 AllUsersAreInternal(V1, V2)))
1423 return LookAheadHeuristics::ScoreSplatLoads;
1424 }
1425 return LookAheadHeuristics::ScoreSplat;
1426 }
1427
1428 auto CheckSameEntryOrFail = [&]() {
1429 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1430 TE1 && TE1 == R.getTreeEntry(V2))
1431 return LookAheadHeuristics::ScoreSplatLoads;
1432 return LookAheadHeuristics::ScoreFail;
1433 };
1434
1435 auto *LI1 = dyn_cast<LoadInst>(V1);
1436 auto *LI2 = dyn_cast<LoadInst>(V2);
1437 if (LI1 && LI2) {
1438 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1439 !LI2->isSimple())
1440 return CheckSameEntryOrFail();
1441
1442 std::optional<int> Dist = getPointersDiff(
1443 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1444 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1445 if (!Dist || *Dist == 0) {
1446 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1447 getUnderlyingObject(LI2->getPointerOperand()) &&
1448 R.TTI->isLegalMaskedGather(
1449 FixedVectorType::get(LI1->getType(), NumLanes),
1450 LI1->getAlign()))
1451 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1452 return CheckSameEntryOrFail();
1453 }
1454 // The distance is too large - still may be profitable to use masked
1455 // loads/gathers.
1456 if (std::abs(*Dist) > NumLanes / 2)
1457 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1458 // This still will detect consecutive loads, but we might have "holes"
1459 // in some cases. It is ok for non-power-2 vectorization and may produce
1460 // better results. It should not affect current vectorization.
1461 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1462 : LookAheadHeuristics::ScoreReversedLoads;
1463 }
1464
1465 auto *C1 = dyn_cast<Constant>(V1);
1466 auto *C2 = dyn_cast<Constant>(V2);
1467 if (C1 && C2)
1468 return LookAheadHeuristics::ScoreConstants;
1469
1470 // Extracts from consecutive indexes of the same vector better score as
1471 // the extracts could be optimized away.
1472 Value *EV1;
1473 ConstantInt *Ex1Idx;
1474 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1475 // Undefs are always profitable for extractelements.
1476 // Compiler can easily combine poison and extractelement <non-poison> or
1477 // undef and extractelement <poison>. But combining undef +
1478 // extractelement <non-poison-but-may-produce-poison> requires some
1479 // extra operations.
1480 if (isa<UndefValue>(V2))
1481 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1482 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1483 : LookAheadHeuristics::ScoreSameOpcode;
1484 Value *EV2 = nullptr;
1485 ConstantInt *Ex2Idx = nullptr;
1486 if (match(V2,
1487 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1488 m_Undef())))) {
1489 // Undefs are always profitable for extractelements.
1490 if (!Ex2Idx)
1491 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1492 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1493 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1494 if (EV2 == EV1) {
1495 int Idx1 = Ex1Idx->getZExtValue();
1496 int Idx2 = Ex2Idx->getZExtValue();
1497 int Dist = Idx2 - Idx1;
1498 // The distance is too large - still may be profitable to use
1499 // shuffles.
1500 if (std::abs(Dist) == 0)
1501 return LookAheadHeuristics::ScoreSplat;
1502 if (std::abs(Dist) > NumLanes / 2)
1503 return LookAheadHeuristics::ScoreSameOpcode;
1504 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1505 : LookAheadHeuristics::ScoreReversedExtracts;
1506 }
1507 return LookAheadHeuristics::ScoreAltOpcodes;
1508 }
1509 return CheckSameEntryOrFail();
1510 }
1511
1512 auto *I1 = dyn_cast<Instruction>(V1);
1513 auto *I2 = dyn_cast<Instruction>(V2);
1514 if (I1 && I2) {
1515 if (I1->getParent() != I2->getParent())
1516 return CheckSameEntryOrFail();
1517 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1518 Ops.push_back(I1);
1519 Ops.push_back(I2);
1520 InstructionsState S = getSameOpcode(Ops, TLI);
1521 // Note: Only consider instructions with <= 2 operands to avoid
1522 // complexity explosion.
1523 if (S.getOpcode() &&
1524 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1525 !S.isAltShuffle()) &&
1526 all_of(Ops, [&S](Value *V) {
1527 return cast<Instruction>(V)->getNumOperands() ==
1528 S.MainOp->getNumOperands();
1529 }))
1530 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1531 : LookAheadHeuristics::ScoreSameOpcode;
1532 }
1533
1534 if (isa<UndefValue>(V2))
1535 return LookAheadHeuristics::ScoreUndef;
1536
1537 return CheckSameEntryOrFail();
1538 }
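 // Illustrative example (not part of the original source): for V1 = load of
 // A[i] and V2 = load of A[i+1] in the same block, getPointersDiff returns 1,
 // so getShallowScore yields ScoreConsecutiveLoads (4); two different
 // constants yield ScoreConstants (2); and unrelated values that do not share
 // a tree entry fall back to ScoreFail (0).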
1539
1540 /// Go through the operands of \p LHS and \p RHS recursively until
1541 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1542 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1543 /// of \p U1 and \p U2), except at the beginning of the recursion where
1544 /// these are set to nullptr.
1545 ///
1546 /// For example:
1547 /// \verbatim
1548 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1549 /// \ / \ / \ / \ /
1550 /// + + + +
1551 /// G1 G2 G3 G4
1552 /// \endverbatim
1553 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1554 /// each level recursively, accumulating the score. It starts from matching
1555 /// the additions at level 0, then moves on to the loads (level 1). The
1556 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1557 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1558 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1559 /// Please note that the order of the operands does not matter, as we
1560 /// evaluate the score of all profitable combinations of operands. In
1561 /// other words the score of G1 and G4 is the same as G1 and G2. This
1562 /// heuristic is based on ideas described in:
1563 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1564 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1565 /// Luís F. W. Góes
1566    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1567                           Instruction *U2, int CurrLevel,
1568 ArrayRef<Value *> MainAltOps) const {
1569
1570 // Get the shallow score of V1 and V2.
1571 int ShallowScoreAtThisLevel =
1572 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1573
1574 // If reached MaxLevel,
1575    // or if LHS and RHS are not instructions,
1576 // or if they are SPLAT,
1577 // or if they are not consecutive,
1578 // or if profitable to vectorize loads or extractelements, early return
1579 // the current cost.
1580 auto *I1 = dyn_cast<Instruction>(LHS);
1581 auto *I2 = dyn_cast<Instruction>(RHS);
1582 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1583 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1584 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1585 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1586 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1587 ShallowScoreAtThisLevel))
1588 return ShallowScoreAtThisLevel;
1589 assert(I1 && I2 && "Should have early exited.");
1590
1591 // Contains the I2 operand indexes that got matched with I1 operands.
1592 SmallSet<unsigned, 4> Op2Used;
1593
1594 // Recursion towards the operands of I1 and I2. We are trying all possible
1595 // operand pairs, and keeping track of the best score.
1596 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1597 OpIdx1 != NumOperands1; ++OpIdx1) {
1598      // Try to pair the operand at OpIdx1 of I1 with the best operand of I2.
1599 int MaxTmpScore = 0;
1600 unsigned MaxOpIdx2 = 0;
1601 bool FoundBest = false;
1602 // If I2 is commutative try all combinations.
1603 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1604 unsigned ToIdx = isCommutative(I2)
1605 ? I2->getNumOperands()
1606 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1607 assert(FromIdx <= ToIdx && "Bad index");
1608 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1609 // Skip operands already paired with OpIdx1.
1610 if (Op2Used.count(OpIdx2))
1611 continue;
1612 // Recursively calculate the cost at each level
1613 int TmpScore =
1614 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1615 I1, I2, CurrLevel + 1, std::nullopt);
1616 // Look for the best score.
1617 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1618 TmpScore > MaxTmpScore) {
1619 MaxTmpScore = TmpScore;
1620 MaxOpIdx2 = OpIdx2;
1621 FoundBest = true;
1622 }
1623 }
1624 if (FoundBest) {
1625 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1626 Op2Used.insert(MaxOpIdx2);
1627 ShallowScoreAtThisLevel += MaxTmpScore;
1628 }
1629 }
1630 return ShallowScoreAtThisLevel;
1631 }
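    // Usage sketch (illustration only; not part of the pass). The analyses
    // TLI/DL/SE/R and the candidate roots G1/G2 are assumed to be provided by
    // the caller, as in getLookAheadScore() and findBestRootPair() below:
    //
    //   LookAheadHeuristics LA(TLI, DL, SE, R, /*NumLanes=*/2, /*MaxLevel=*/2);
    //   int Score = LA.getScoreAtLevelRec(G1, G2, /*U1=*/nullptr,
    //                                     /*U2=*/nullptr, /*CurrLevel=*/1,
    //                                     std::nullopt);
    //
    // For the G1/G2 additions in the example above, this accumulates
    // ScoreSameOpcode at level 1 plus ScoreConsecutiveLoads for each matched
    // pair of load operands at level 2.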
1632 };
1633 /// A helper data structure to hold the operands of a vector of instructions.
1634 /// This supports a fixed vector length for all operand vectors.
1636 /// For each operand we need (i) the value, and (ii) the opcode that it
1637 /// would be attached to if the expression was in a left-linearized form.
1638 /// This is required to avoid illegal operand reordering.
1639 /// For example:
1640 /// \verbatim
1641 /// 0 Op1
1642 /// |/
1643 /// Op1 Op2 Linearized + Op2
1644 /// \ / ----------> |/
1645 /// - -
1646 ///
1647 /// Op1 - Op2 (0 + Op1) - Op2
1648 /// \endverbatim
1649 ///
1650 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1651 ///
1652 /// Another way to think of this is to track all the operations across the
1653 /// path from the operand all the way to the root of the tree and to
1654 /// calculate the operation that corresponds to this path. For example, the
1655 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1656 /// corresponding operation is a '-' (which matches the one in the
1657 /// linearized tree, as shown above).
1658 ///
1659 /// For lack of a better term, we refer to this operation as Accumulated
1660 /// Path Operation (APO).
1661 struct OperandData {
1662 OperandData() = default;
1663 OperandData(Value *V, bool APO, bool IsUsed)
1664 : V(V), APO(APO), IsUsed(IsUsed) {}
1665 /// The operand value.
1666 Value *V = nullptr;
1667 /// TreeEntries only allow a single opcode, or an alternate sequence of
1668    /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1669 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1670 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1671 /// (e.g., Add/Mul)
1672 bool APO = false;
1673 /// Helper data for the reordering function.
1674 bool IsUsed = false;
1675 };
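    // Worked APO example (illustration only): for the two-lane bundle
    // {A = X - Y, B = Z - W}, appendOperandsOfVL() below records the LHS
    // operands {X, Z} with APO == false (they sit on the non-inverted side of
    // the linearized form 0 + X - Y), while the RHS operands {Y, W} get
    // APO == true. During reordering, operands are only exchanged with
    // operands that carry the same APO, which preserves these semantics.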
1676
1677    /// During operand reordering, we are trying to select the operand in a lane
1678    /// that matches best with the operand in the neighboring lane. Our
1679 /// selection is based on the type of value we are looking for. For example,
1680 /// if the neighboring lane has a load, we need to look for a load that is
1681 /// accessing a consecutive address. These strategies are summarized in the
1682 /// 'ReorderingMode' enumerator.
1683 enum class ReorderingMode {
1684 Load, ///< Matching loads to consecutive memory addresses
1685 Opcode, ///< Matching instructions based on opcode (same or alternate)
1686 Constant, ///< Matching constants
1687 Splat, ///< Matching the same instruction multiple times (broadcast)
1688 Failed, ///< We failed to create a vectorizable group
1689 };
1690
1691    using OperandDataVec = SmallVector<OperandData, 2>;
1692
1693    /// A vector of operand vectors.
1694    SmallVector<OperandDataVec, 4> OpsVec;
1695
1696 const TargetLibraryInfo &TLI;
1697 const DataLayout &DL;
1698 ScalarEvolution &SE;
1699 const BoUpSLP &R;
1700 const Loop *L = nullptr;
1701
1702 /// \returns the operand data at \p OpIdx and \p Lane.
1703 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1704 return OpsVec[OpIdx][Lane];
1705 }
1706
1707 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1708 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1709 return OpsVec[OpIdx][Lane];
1710 }
1711
1712 /// Clears the used flag for all entries.
1713 void clearUsed() {
1714 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1715 OpIdx != NumOperands; ++OpIdx)
1716 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1717 ++Lane)
1718 OpsVec[OpIdx][Lane].IsUsed = false;
1719 }
1720
1721 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1722 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1723 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1724 }
1725
1726 /// \param Lane lane of the operands under analysis.
1727    /// \param OpIdx operand index in lane \p Lane for which we're looking for
1728    /// the best candidate.
1729    /// \param Idx operand index of the current candidate value.
1730    /// \returns The additional score due to possible broadcasting of the
1731    /// elements in the lane. It is more profitable to have a power-of-2 number
1732    /// of unique elements in the lane, since they will be vectorized with higher
1733    /// probability after removing duplicates. Currently the SLP vectorizer
1734    /// supports only vectorization of a power-of-2 number of unique scalars.
1735 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1736 Value *IdxLaneV = getData(Idx, Lane).V;
1737 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1738 return 0;
1739      SmallPtrSet<Value *, 4> Uniques;
1740      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1741 if (Ln == Lane)
1742 continue;
1743 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1744 if (!isa<Instruction>(OpIdxLnV))
1745 return 0;
1746 Uniques.insert(OpIdxLnV);
1747 }
1748 int UniquesCount = Uniques.size();
1749 int UniquesCntWithIdxLaneV =
1750 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1751 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1752 int UniquesCntWithOpIdxLaneV =
1753 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1754 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1755 return 0;
1756 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1757 UniquesCntWithOpIdxLaneV) -
1758 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1759 }
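    // Numeric illustration (hypothetical counts): if the other lanes already
    // contribute 3 unique instructions, a candidate IdxLaneV that is new to the
    // set raises the count to 4 (a power of two, slack 0), while keeping an
    // OpIdxLaneV that is already in the set leaves it at 3 (slack
    // PowerOf2Ceil(3) - 3 == 1). The returned score is then 1 - 0 == +1,
    // favouring the candidate that fills the lane up to a power-of-2 number of
    // unique scalars.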
1760
1761 /// \param Lane lane of the operands under analysis.
1762    /// \param OpIdx operand index in lane \p Lane for which we're looking for
1763    /// the best candidate.
1764 /// \param Idx operand index of the current candidate value.
1765 /// \returns The additional score for the scalar which users are all
1766 /// vectorized.
1767 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1768 Value *IdxLaneV = getData(Idx, Lane).V;
1769 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1770 // Do not care about number of uses for vector-like instructions
1771 // (extractelement/extractvalue with constant indices), they are extracts
1772 // themselves and already externally used. Vectorization of such
1773 // instructions does not add extra extractelement instruction, just may
1774 // remove it.
1775 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1776 isVectorLikeInstWithConstOps(OpIdxLaneV))
1777        return LookAheadHeuristics::ScoreAllUserVectorized;
1778      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1779 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1780 return 0;
1781 return R.areAllUsersVectorized(IdxLaneI)
1782                 ? LookAheadHeuristics::ScoreAllUserVectorized
1783                 : 0;
1784 }
1785
1786 /// Score scaling factor for fully compatible instructions but with
1787 /// different number of external uses. Allows better selection of the
1788 /// instructions with less external uses.
1789 static const int ScoreScaleFactor = 10;
1790
1791 /// \Returns the look-ahead score, which tells us how much the sub-trees
1792 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1793 /// score. This helps break ties in an informed way when we cannot decide on
1794 /// the order of the operands by just considering the immediate
1795 /// predecessors.
1796 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1797 int Lane, unsigned OpIdx, unsigned Idx,
1798 bool &IsUsed) {
1799      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1800                                    LookAheadMaxDepth);
1801 // Keep track of the instruction stack as we recurse into the operands
1802 // during the look-ahead score exploration.
1803 int Score =
1804 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1805 /*CurrLevel=*/1, MainAltOps);
1806 if (Score) {
1807 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1808 if (Score <= -SplatScore) {
1809 // Set the minimum score for splat-like sequence to avoid setting
1810 // failed state.
1811 Score = 1;
1812 } else {
1813 Score += SplatScore;
1814 // Scale score to see the difference between different operands
1815 // and similar operands but all vectorized/not all vectorized
1816 // uses. It does not affect actual selection of the best
1817 // compatible operand in general, just allows to select the
1818 // operand with all vectorized uses.
1819 Score *= ScoreScaleFactor;
1820 Score += getExternalUseScore(Lane, OpIdx, Idx);
1821 IsUsed = true;
1822 }
1823 }
1824 return Score;
1825 }
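    // Scaling illustration (hypothetical values): with a raw look-ahead score
    // of 3 and a SplatScore of 1, the combined score becomes (3 + 1) * 10 plus
    // the external-use bonus, so otherwise equally compatible operands are
    // still distinguished by how many of their users are vectorized.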
1826
1827 /// Best defined scores per lanes between the passes. Used to choose the
1828 /// best operand (with the highest score) between the passes.
1829 /// The key - {Operand Index, Lane}.
1830 /// The value - the best score between the passes for the lane and the
1831 /// operand.
1832    SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1833        BestScoresPerLanes;
1834
1835 // Search all operands in Ops[*][Lane] for the one that matches best
1836    // Ops[OpIdx][LastLane] and return its operand index.
1837 // If no good match can be found, return std::nullopt.
1838 std::optional<unsigned>
1839 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1840 ArrayRef<ReorderingMode> ReorderingModes,
1841 ArrayRef<Value *> MainAltOps) {
1842 unsigned NumOperands = getNumOperands();
1843
1844 // The operand of the previous lane at OpIdx.
1845 Value *OpLastLane = getData(OpIdx, LastLane).V;
1846
1847 // Our strategy mode for OpIdx.
1848 ReorderingMode RMode = ReorderingModes[OpIdx];
1849 if (RMode == ReorderingMode::Failed)
1850 return std::nullopt;
1851
1852 // The linearized opcode of the operand at OpIdx, Lane.
1853 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1854
1855 // The best operand index and its score.
1856 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1857 // are using the score to differentiate between the two.
1858 struct BestOpData {
1859 std::optional<unsigned> Idx;
1860 unsigned Score = 0;
1861 } BestOp;
1862 BestOp.Score =
1863 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1864 .first->second;
1865
1866      // Track if the operand must be marked as used. If the operand is
1867      // explicitly set to Score 1 (because of a non-power-of-2 number of unique
1868      // scalars), we may want to reestimate the operands on following iterations.
1869 bool IsUsed = RMode == ReorderingMode::Splat ||
1870 RMode == ReorderingMode::Constant ||
1871 RMode == ReorderingMode::Load;
1872 // Iterate through all unused operands and look for the best.
1873 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1874 // Get the operand at Idx and Lane.
1875 OperandData &OpData = getData(Idx, Lane);
1876 Value *Op = OpData.V;
1877 bool OpAPO = OpData.APO;
1878
1879 // Skip already selected operands.
1880 if (OpData.IsUsed)
1881 continue;
1882
1883 // Skip if we are trying to move the operand to a position with a
1884 // different opcode in the linearized tree form. This would break the
1885 // semantics.
1886 if (OpAPO != OpIdxAPO)
1887 continue;
1888
1889 // Look for an operand that matches the current mode.
1890 switch (RMode) {
1891 case ReorderingMode::Load:
1892 case ReorderingMode::Opcode: {
1893 bool LeftToRight = Lane > LastLane;
1894 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1895 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1896 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1897 OpIdx, Idx, IsUsed);
1898 if (Score > static_cast<int>(BestOp.Score) ||
1899 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
1900 Idx == OpIdx)) {
1901 BestOp.Idx = Idx;
1902 BestOp.Score = Score;
1903 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1904 }
1905 break;
1906 }
1907 case ReorderingMode::Constant:
1908 if (isa<Constant>(Op) ||
1909 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
1910 BestOp.Idx = Idx;
1911            if (isa<Constant>(Op)) {
1912              BestOp.Score = LookAheadHeuristics::ScoreConstants;
1913              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1914                  LookAheadHeuristics::ScoreConstants;
1915            }
1916 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
1917 IsUsed = false;
1918 }
1919 break;
1920 case ReorderingMode::Splat:
1921 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
1922 IsUsed = Op == OpLastLane;
1923 if (Op == OpLastLane) {
1924 BestOp.Score = LookAheadHeuristics::ScoreSplat;
1925              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1926                  LookAheadHeuristics::ScoreSplat;
1927 }
1928 BestOp.Idx = Idx;
1929 }
1930 break;
1931 case ReorderingMode::Failed:
1932 llvm_unreachable("Not expected Failed reordering mode.");
1933 }
1934 }
1935
1936 if (BestOp.Idx) {
1937 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1938 return BestOp.Idx;
1939 }
1940 // If we could not find a good match return std::nullopt.
1941 return std::nullopt;
1942 }
1943
1944    /// Helper for reorder().
1945    /// \returns the lane that we should start reordering from. This is the one
1946    /// which has the least number of operands that can freely move about, or is
1947    /// least profitable because it already has the best possible set of operands.
1948 unsigned getBestLaneToStartReordering() const {
1949 unsigned Min = UINT_MAX;
1950 unsigned SameOpNumber = 0;
1951 // std::pair<unsigned, unsigned> is used to implement a simple voting
1952 // algorithm and choose the lane with the least number of operands that
1953      // can freely move about, or is least profitable because it already has
1954      // the best possible set of operands. The first unsigned is a counter for
1955 // voting, the second unsigned is the counter of lanes with instructions
1956 // with same/alternate opcodes and same parent basic block.
1957      SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
1958      // Try to be closer to the original results, if we have multiple lanes
1959 // with same cost. If 2 lanes have the same cost, use the one with the
1960 // lowest index.
1961 for (int I = getNumLanes(); I > 0; --I) {
1962 unsigned Lane = I - 1;
1963 OperandsOrderData NumFreeOpsHash =
1964 getMaxNumOperandsThatCanBeReordered(Lane);
1965 // Compare the number of operands that can move and choose the one with
1966 // the least number.
1967 if (NumFreeOpsHash.NumOfAPOs < Min) {
1968 Min = NumFreeOpsHash.NumOfAPOs;
1969 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1970 HashMap.clear();
1971 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1972 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1973 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1974 // Select the most optimal lane in terms of number of operands that
1975 // should be moved around.
1976 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1977 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1978 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1979 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1980 auto *It = HashMap.find(NumFreeOpsHash.Hash);
1981 if (It == HashMap.end())
1982 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1983 else
1984 ++It->second.first;
1985 }
1986 }
1987 // Select the lane with the minimum counter.
1988 unsigned BestLane = 0;
1989 unsigned CntMin = UINT_MAX;
1990 for (const auto &Data : reverse(HashMap)) {
1991 if (Data.second.first < CntMin) {
1992 CntMin = Data.second.first;
1993 BestLane = Data.second.second;
1994 }
1995 }
1996 return BestLane;
1997 }
1998
1999 /// Data structure that helps to reorder operands.
2000 struct OperandsOrderData {
2001 /// The best number of operands with the same APOs, which can be
2002 /// reordered.
2003 unsigned NumOfAPOs = UINT_MAX;
2004 /// Number of operands with the same/alternate instruction opcode and
2005 /// parent.
2006 unsigned NumOpsWithSameOpcodeParent = 0;
2007 /// Hash for the actual operands ordering.
2008 /// Used to count operands, actually their position id and opcode
2009 /// value. It is used in the voting mechanism to find the lane with the
2010 /// least number of operands that can freely move about or less profitable
2011      /// least number of operands that can freely move about, or that is less
2012      /// profitable because it already has the best set of operands. Can be
2013 /// and requires less memory.
2014 unsigned Hash = 0;
2015 };
2016 /// \returns the maximum number of operands that are allowed to be reordered
2017    /// for \p Lane and the number of compatible instructions (with the same
2018 /// parent/opcode). This is used as a heuristic for selecting the first lane
2019 /// to start operand reordering.
2020 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2021 unsigned CntTrue = 0;
2022 unsigned NumOperands = getNumOperands();
2023 // Operands with the same APO can be reordered. We therefore need to count
2024 // how many of them we have for each APO, like this: Cnt[APO] = x.
2025 // Since we only have two APOs, namely true and false, we can avoid using
2026 // a map. Instead we can simply count the number of operands that
2027 // correspond to one of them (in this case the 'true' APO), and calculate
2028 // the other by subtracting it from the total number of operands.
2029 // Operands with the same instruction opcode and parent are more
2030 // profitable since we don't need to move them in many cases, with a high
2031 // probability such lane already can be vectorized effectively.
2032 bool AllUndefs = true;
2033 unsigned NumOpsWithSameOpcodeParent = 0;
2034 Instruction *OpcodeI = nullptr;
2035 BasicBlock *Parent = nullptr;
2036 unsigned Hash = 0;
2037 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2038 const OperandData &OpData = getData(OpIdx, Lane);
2039 if (OpData.APO)
2040 ++CntTrue;
2041 // Use Boyer-Moore majority voting for finding the majority opcode and
2042 // the number of times it occurs.
2043 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2044 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2045 I->getParent() != Parent) {
2046 if (NumOpsWithSameOpcodeParent == 0) {
2047 NumOpsWithSameOpcodeParent = 1;
2048 OpcodeI = I;
2049 Parent = I->getParent();
2050 } else {
2051 --NumOpsWithSameOpcodeParent;
2052 }
2053 } else {
2054 ++NumOpsWithSameOpcodeParent;
2055 }
2056 }
2057 Hash = hash_combine(
2058 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2059 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2060 }
2061 if (AllUndefs)
2062 return {};
2063 OperandsOrderData Data;
2064 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2065 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2066 Data.Hash = Hash;
2067 return Data;
2068 }
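    // Voting illustration (hypothetical lane): for operand instructions
    // {add, add, sub, add} the Boyer-Moore counter evolves as +1, +1, -1, +1
    // and ends at 2 with 'add' as the surviving majority candidate. The
    // heuristic only needs this rough measure of how dominant the majority
    // opcode/parent is, not an exact per-opcode histogram.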
2069
2070 /// Go through the instructions in VL and append their operands.
2071 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2072 assert(!VL.empty() && "Bad VL");
2073 assert((empty() || VL.size() == getNumLanes()) &&
2074 "Expected same number of lanes");
2075 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2076 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2077 constexpr unsigned IntrinsicNumOperands = 2;
2078 if (isa<IntrinsicInst>(VL[0]))
2079 NumOperands = IntrinsicNumOperands;
2080 OpsVec.resize(NumOperands);
2081 unsigned NumLanes = VL.size();
2082 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2083 OpsVec[OpIdx].resize(NumLanes);
2084 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2085 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2086 // Our tree has just 3 nodes: the root and two operands.
2087 // It is therefore trivial to get the APO. We only need to check the
2088 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2089 // RHS operand. The LHS operand of both add and sub is never attached
2090          // to an inverse operation in the linearized form, therefore its APO
2091 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2092
2093 // Since operand reordering is performed on groups of commutative
2094 // operations or alternating sequences (e.g., +, -), we can safely
2095 // tell the inverse operations by checking commutativity.
2096 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2097 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2098 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2099 APO, false};
2100 }
2101 }
2102 }
2103
2104 /// \returns the number of operands.
2105 unsigned getNumOperands() const { return OpsVec.size(); }
2106
2107 /// \returns the number of lanes.
2108 unsigned getNumLanes() const { return OpsVec[0].size(); }
2109
2110 /// \returns the operand value at \p OpIdx and \p Lane.
2111 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2112 return getData(OpIdx, Lane).V;
2113 }
2114
2115 /// \returns true if the data structure is empty.
2116 bool empty() const { return OpsVec.empty(); }
2117
2118 /// Clears the data.
2119 void clear() { OpsVec.clear(); }
2120
2121    /// \Returns true if there are enough operands identical to \p Op to fill
2122    /// the whole vector (possibly mixed with constants or loop-invariant values).
2123 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2124 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2125 bool OpAPO = getData(OpIdx, Lane).APO;
2126 bool IsInvariant = L && L->isLoopInvariant(Op);
2127 unsigned Cnt = 0;
2128 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2129 if (Ln == Lane)
2130 continue;
2131 // This is set to true if we found a candidate for broadcast at Lane.
2132 bool FoundCandidate = false;
2133 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2134 OperandData &Data = getData(OpI, Ln);
2135 if (Data.APO != OpAPO || Data.IsUsed)
2136 continue;
2137 Value *OpILane = getValue(OpI, Lane);
2138 bool IsConstantOp = isa<Constant>(OpILane);
2139 // Consider the broadcast candidate if:
2140 // 1. Same value is found in one of the operands.
2141 if (Data.V == Op ||
2142 // 2. The operand in the given lane is not constant but there is a
2143 // constant operand in another lane (which can be moved to the
2144 // given lane). In this case we can represent it as a simple
2145 // permutation of constant and broadcast.
2146 (!IsConstantOp &&
2147 ((Lns > 2 && isa<Constant>(Data.V)) ||
2148 // 2.1. If we have only 2 lanes, need to check that value in the
2149 // next lane does not build same opcode sequence.
2150 (Lns == 2 &&
2151 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2152 .getOpcode() &&
2153 isa<Constant>(Data.V)))) ||
2154 // 3. The operand in the current lane is loop invariant (can be
2155 // hoisted out) and another operand is also a loop invariant
2156 // (though not a constant). In this case the whole vector can be
2157 // hoisted out.
2158 // FIXME: need to teach the cost model about this case for better
2159 // estimation.
2160 (IsInvariant && !isa<Constant>(Data.V) &&
2161 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2162 L->isLoopInvariant(Data.V))) {
2163 FoundCandidate = true;
2164 Data.IsUsed = Data.V == Op;
2165 if (Data.V == Op)
2166 ++Cnt;
2167 break;
2168 }
2169 }
2170 if (!FoundCandidate)
2171 return false;
2172 }
2173 return getNumLanes() == 2 || Cnt > 1;
2174 }
2175
2176 public:
2177 /// Initialize with all the operands of the instruction vector \p RootVL.
2178    VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2179        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2180 L(R.LI->getLoopFor(
2181 (cast<Instruction>(RootVL.front())->getParent()))) {
2182 // Append all the operands of RootVL.
2183 appendOperandsOfVL(RootVL);
2184 }
2185
2186 /// \Returns a value vector with the operands across all lanes for the
2187    /// operand at \p OpIdx.
2188 ValueList getVL(unsigned OpIdx) const {
2189 ValueList OpVL(OpsVec[OpIdx].size());
2190 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2191 "Expected same num of lanes across all operands");
2192 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2193 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2194 return OpVL;
2195 }
2196
2197 // Performs operand reordering for 2 or more operands.
2198    // The operands are stored in OpsVec[OpIdx][Lane] and are reordered
2199    // in place.
2200 void reorder() {
2201 unsigned NumOperands = getNumOperands();
2202 unsigned NumLanes = getNumLanes();
2203 // Each operand has its own mode. We are using this mode to help us select
2204 // the instructions for each lane, so that they match best with the ones
2205 // we have selected so far.
2206 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2207
2208 // This is a greedy single-pass algorithm. We are going over each lane
2209 // once and deciding on the best order right away with no back-tracking.
2210 // However, in order to increase its effectiveness, we start with the lane
2211 // that has operands that can move the least. For example, given the
2212 // following lanes:
2213 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2214 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2215 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2216 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2217 // we will start at Lane 1, since the operands of the subtraction cannot
2218 // be reordered. Then we will visit the rest of the lanes in a circular
2219 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2220
2221 // Find the first lane that we will start our search from.
2222 unsigned FirstLane = getBestLaneToStartReordering();
2223
2224 // Initialize the modes.
2225 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2226 Value *OpLane0 = getValue(OpIdx, FirstLane);
2227 // Keep track if we have instructions with all the same opcode on one
2228 // side.
2229 if (isa<LoadInst>(OpLane0))
2230 ReorderingModes[OpIdx] = ReorderingMode::Load;
2231 else if (isa<Instruction>(OpLane0)) {
2232 // Check if OpLane0 should be broadcast.
2233 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2234 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2235 else
2236 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2237 }
2238 else if (isa<Constant>(OpLane0))
2239 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2240 else if (isa<Argument>(OpLane0))
2241 // Our best hope is a Splat. It may save some cost in some cases.
2242 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2243 else
2244 // NOTE: This should be unreachable.
2245 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2246 }
2247
2248      // Check that we don't have the same operands. There is no need to reorder
2249      // if the operands are just a perfect or shuffled diamond match. Skip this
2250      // only for possible broadcasts or a non-power-of-2 number of scalars (just
2251      // for now).
2252 auto &&SkipReordering = [this]() {
2253 SmallPtrSet<Value *, 4> UniqueValues;
2254 ArrayRef<OperandData> Op0 = OpsVec.front();
2255 for (const OperandData &Data : Op0)
2256 UniqueValues.insert(Data.V);
2257 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2258 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2259 return !UniqueValues.contains(Data.V);
2260 }))
2261 return false;
2262 }
2263 // TODO: Check if we can remove a check for non-power-2 number of
2264 // scalars after full support of non-power-2 vectorization.
2265 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2266 };
2267
2268 // If the initial strategy fails for any of the operand indexes, then we
2269 // perform reordering again in a second pass. This helps avoid assigning
2270 // high priority to the failed strategy, and should improve reordering for
2271 // the non-failed operand indexes.
2272 for (int Pass = 0; Pass != 2; ++Pass) {
2273        // Check if there is no need to reorder the operands, since they form a
2274        // perfect or shuffled diamond match.
2275 // Need to do it to avoid extra external use cost counting for
2276 // shuffled matches, which may cause regressions.
2277 if (SkipReordering())
2278 break;
2279 // Skip the second pass if the first pass did not fail.
2280 bool StrategyFailed = false;
2281 // Mark all operand data as free to use.
2282 clearUsed();
2283 // We keep the original operand order for the FirstLane, so reorder the
2284 // rest of the lanes. We are visiting the nodes in a circular fashion,
2285 // using FirstLane as the center point and increasing the radius
2286 // distance.
2287 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2288 for (unsigned I = 0; I < NumOperands; ++I)
2289 MainAltOps[I].push_back(getData(I, FirstLane).V);
2290
2291 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2292 // Visit the lane on the right and then the lane on the left.
2293 for (int Direction : {+1, -1}) {
2294 int Lane = FirstLane + Direction * Distance;
2295 if (Lane < 0 || Lane >= (int)NumLanes)
2296 continue;
2297 int LastLane = Lane - Direction;
2298 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2299 "Out of bounds");
2300 // Look for a good match for each operand.
2301 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2302 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2303 std::optional<unsigned> BestIdx = getBestOperand(
2304 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2305 // By not selecting a value, we allow the operands that follow to
2306 // select a better matching value. We will get a non-null value in
2307 // the next run of getBestOperand().
2308 if (BestIdx) {
2309 // Swap the current operand with the one returned by
2310 // getBestOperand().
2311 swap(OpIdx, *BestIdx, Lane);
2312 } else {
2313 // Enable the second pass.
2314 StrategyFailed = true;
2315 }
2316 // Try to get the alternate opcode and follow it during analysis.
2317 if (MainAltOps[OpIdx].size() != 2) {
2318 OperandData &AltOp = getData(OpIdx, Lane);
2319 InstructionsState OpS =
2320 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2321 if (OpS.getOpcode() && OpS.isAltShuffle())
2322 MainAltOps[OpIdx].push_back(AltOp.V);
2323 }
2324 }
2325 }
2326 }
2327 // Skip second pass if the strategy did not fail.
2328 if (!StrategyFailed)
2329 break;
2330 }
2331 }
2332
2333#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2334 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2335 switch (RMode) {
2336 case ReorderingMode::Load:
2337 return "Load";
2338 case ReorderingMode::Opcode:
2339 return "Opcode";
2340 case ReorderingMode::Constant:
2341 return "Constant";
2342 case ReorderingMode::Splat:
2343 return "Splat";
2344 case ReorderingMode::Failed:
2345 return "Failed";
2346 }
2347 llvm_unreachable("Unimplemented Reordering Type");
2348 }
2349
2350 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2351 raw_ostream &OS) {
2352 return OS << getModeStr(RMode);
2353 }
2354
2355 /// Debug print.
2356 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2357 printMode(RMode, dbgs());
2358 }
2359
2360 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2361 return printMode(RMode, OS);
2362 }
2363
2364    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2365      const unsigned Indent = 2;
2366 unsigned Cnt = 0;
2367 for (const OperandDataVec &OpDataVec : OpsVec) {
2368 OS << "Operand " << Cnt++ << "\n";
2369 for (const OperandData &OpData : OpDataVec) {
2370 OS.indent(Indent) << "{";
2371 if (Value *V = OpData.V)
2372 OS << *V;
2373 else
2374 OS << "null";
2375 OS << ", APO:" << OpData.APO << "}\n";
2376 }
2377 OS << "\n";
2378 }
2379 return OS;
2380 }
2381
2382 /// Debug print.
2383 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2384#endif
2385 };
2386
2387  /// Evaluate each pair in \p Candidates and return the index into \p
2388  /// Candidates of the pair with the highest score, deemed to have the best
2389  /// chance to form the root of a profitable tree to vectorize. Return
2390  /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2391  /// \param Limit Lower limit of the cost, considered to be a good enough score.
2392 std::optional<int>
2393 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2394 int Limit = LookAheadHeuristics::ScoreFail) const {
2395    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2396                                  RootLookAheadMaxDepth);
2397 int BestScore = Limit;
2398 std::optional<int> Index;
2399 for (int I : seq<int>(0, Candidates.size())) {
2400 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2401 Candidates[I].second,
2402 /*U1=*/nullptr, /*U2=*/nullptr,
2403 /*Level=*/1, std::nullopt);
2404 if (Score > BestScore) {
2405 BestScore = Score;
2406 Index = I;
2407 }
2408 }
2409 return Index;
2410 }
2411
2412 /// Checks if the instruction is marked for deletion.
2413 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2414
2415 /// Removes an instruction from its block and eventually deletes it.
2416 /// It's like Instruction::eraseFromParent() except that the actual deletion
2417 /// is delayed until BoUpSLP is destructed.
2418  void eraseInstruction(Instruction *I) {
2419    DeletedInstructions.insert(I);
2420 }
2421
2422 /// Checks if the instruction was already analyzed for being possible
2423 /// reduction root.
2424  bool isAnalyzedReductionRoot(Instruction *I) const {
2425    return AnalyzedReductionsRoots.count(I);
2426 }
2427 /// Register given instruction as already analyzed for being possible
2428 /// reduction root.
2429  void analyzedReductionRoot(Instruction *I) {
2430    AnalyzedReductionsRoots.insert(I);
2431 }
2432 /// Checks if the provided list of reduced values was checked already for
2433 /// vectorization.
2434  bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2435    return AnalyzedReductionVals.contains(hash_value(VL));
2436 }
2437 /// Adds the list of reduced values to list of already checked values for the
2438 /// vectorization.
2439  void analyzedReductionVals(ArrayRef<Value *> VL) {
2440    AnalyzedReductionVals.insert(hash_value(VL));
2441 }
2442 /// Clear the list of the analyzed reduction root instructions.
2443  void clearReductionData() {
2444    AnalyzedReductionsRoots.clear();
2445 AnalyzedReductionVals.clear();
2446 AnalyzedMinBWVals.clear();
2447 }
2448 /// Checks if the given value is gathered in one of the nodes.
2449 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2450 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2451 }
2452 /// Checks if the given value is gathered in one of the nodes.
2453 bool isGathered(const Value *V) const {
2454 return MustGather.contains(V);
2455 }
2456  /// Checks if the specified value was not scheduled.
2457 bool isNotScheduled(const Value *V) const {
2458 return NonScheduledFirst.contains(V);
2459 }
2460
2461 /// Check if the value is vectorized in the tree.
2462 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2463
2464 ~BoUpSLP();
2465
2466private:
2467  /// Determine if a node \p E can be demoted to a smaller type with a
2468 /// truncation. We collect the entries that will be demoted in ToDemote.
2469 /// \param E Node for analysis
2470 /// \param ToDemote indices of the nodes to be demoted.
2471 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2472 unsigned &BitWidth,
2473 SmallVectorImpl<unsigned> &ToDemote,
2474                             DenseSet<const TreeEntry *> &Visited,
2475                             unsigned &MaxDepthLevel,
2476 bool &IsProfitableToDemote,
2477 bool IsTruncRoot) const;
2478
2479  /// Check if the operands on the edges \p Edges of the \p UserTE allow
2480  /// reordering (i.e. the operands can be reordered because they have only one
2481  /// user and are reorderable).
2482 /// \param ReorderableGathers List of all gather nodes that require reordering
2483  /// (e.g., gather of extractelements or partially vectorizable loads).
2484 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2485 /// reordering, subset of \p NonVectorized.
2486 bool
2487 canReorderOperands(TreeEntry *UserTE,
2488 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2489 ArrayRef<TreeEntry *> ReorderableGathers,
2490 SmallVectorImpl<TreeEntry *> &GatherOps);
2491
2492 /// Checks if the given \p TE is a gather node with clustered reused scalars
2493 /// and reorders it per given \p Mask.
2494 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2495
2496 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2497 /// if any. If it is not vectorized (gather node), returns nullptr.
2498 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2499 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2500 TreeEntry *TE = nullptr;
2501 const auto *It = find_if(VL, [&](Value *V) {
2502 TE = getTreeEntry(V);
2503 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2504 return true;
2505 auto It = MultiNodeScalars.find(V);
2506 if (It != MultiNodeScalars.end()) {
2507 for (TreeEntry *E : It->second) {
2508 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2509 TE = E;
2510 return true;
2511 }
2512 }
2513 }
2514 return false;
2515 });
2516 if (It != VL.end()) {
2517 assert(TE->isSame(VL) && "Expected same scalars.");
2518 return TE;
2519 }
2520 return nullptr;
2521 }
2522
2523 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2524 /// if any. If it is not vectorized (gather node), returns nullptr.
2525 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2526 unsigned OpIdx) const {
2527 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2528 const_cast<TreeEntry *>(UserTE), OpIdx);
2529 }
2530
2531 /// Checks if all users of \p I are the part of the vectorization tree.
2532 bool areAllUsersVectorized(
2533 Instruction *I,
2534 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2535
2536 /// Return information about the vector formed for the specified index
2537 /// of a vector of (the same) instruction.
2538  TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2539
2540  /// \returns the graph entry for the \p Idx operand of the \p E entry.
2541 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2542
2543 /// \returns Cast context for the given graph node.
2544  TargetTransformInfo::CastContextHint
2545  getCastContextHint(const TreeEntry &TE) const;
2546
2547 /// \returns the cost of the vectorizable entry.
2548 InstructionCost getEntryCost(const TreeEntry *E,
2549 ArrayRef<Value *> VectorizedVals,
2550 SmallPtrSetImpl<Value *> &CheckedExtracts);
2551
2552 /// This is the recursive part of buildTree.
2553 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2554 const EdgeInfo &EI);
2555
2556 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2557 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2558 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2559 /// returns false, setting \p CurrentOrder to either an empty vector or a
2560  /// non-identity permutation that allows reusing extract instructions.
2561 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2562 /// extract order.
2563 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2564 SmallVectorImpl<unsigned> &CurrentOrder,
2565 bool ResizeAllowed = false) const;
2566
2567 /// Vectorize a single entry in the tree.
2568  /// \param PostponedPHIs true if emission of phi nodes needs to be postponed
2569  /// to avoid issues with def-use order.
2570 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2571
2572 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2573 /// \p E.
2574  /// \param PostponedPHIs true if emission of phi nodes needs to be postponed
2575  /// to avoid issues with def-use order.
2576 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2577
2578 /// Create a new vector from a list of scalar values. Produces a sequence
2579 /// which exploits values reused across lanes, and arranges the inserts
2580 /// for ease of later optimization.
2581 template <typename BVTy, typename ResTy, typename... Args>
2582 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2583
2584 /// Create a new vector from a list of scalar values. Produces a sequence
2585 /// which exploits values reused across lanes, and arranges the inserts
2586 /// for ease of later optimization.
2587 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2588
2589 /// Returns the instruction in the bundle, which can be used as a base point
2590 /// for scheduling. Usually it is the last instruction in the bundle, except
2591 /// for the case when all operands are external (in this case, it is the first
2592 /// instruction in the list).
2593 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2594
2595 /// Tries to find extractelement instructions with constant indices from fixed
2596  /// vector type and gather such instructions into a bunch, which is highly
2597  /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
2598 /// was successful, the matched scalars are replaced by poison values in \p VL
2599 /// for future analysis.
2600 std::optional<TargetTransformInfo::ShuffleKind>
2601 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2602 SmallVectorImpl<int> &Mask) const;
2603
2604 /// Tries to find extractelement instructions with constant indices from fixed
2605  /// vector type and gather such instructions into a bunch, which is highly
2606  /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
2607 /// was successful, the matched scalars are replaced by poison values in \p VL
2608 /// for future analysis.
2609  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2610  tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2611                             SmallVectorImpl<int> &Mask,
2612                             unsigned NumParts) const;
2613
2614 /// Checks if the gathered \p VL can be represented as a single register
2615 /// shuffle(s) of previous tree entries.
2616 /// \param TE Tree entry checked for permutation.
2617 /// \param VL List of scalars (a subset of the TE scalar), checked for
2618 /// permutations. Must form single-register vector.
2619 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2620 /// commands to build the mask using the original vector value, without
2621 /// relying on the potential reordering.
2622 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2623 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2624 std::optional<TargetTransformInfo::ShuffleKind>
2625 isGatherShuffledSingleRegisterEntry(
2626 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2627 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2628 bool ForOrder);
2629
2630 /// Checks if the gathered \p VL can be represented as multi-register
2631 /// shuffle(s) of previous tree entries.
2632 /// \param TE Tree entry checked for permutation.
2633 /// \param VL List of scalars (a subset of the TE scalar), checked for
2634 /// permutations.
2635 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2636 /// commands to build the mask using the original vector value, without
2637 /// relying on the potential reordering.
2638 /// \returns per-register series of ShuffleKind, if gathered values can be
2639 /// represented as shuffles of previous tree entries. \p Mask is filled with
2640 /// the shuffle mask (also on per-register base).
2641  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2642  isGatherShuffledEntry(
2643      const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2644      SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2645      unsigned NumParts, bool ForOrder = false);
2646
2647 /// \returns the scalarization cost for this list of values. Assuming that
2648 /// this subtree gets vectorized, we may need to extract the values from the
2649 /// roots. This method calculates the cost of extracting the values.
2650 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2651 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2652 Type *ScalarTy) const;
2653
2654 /// Set the Builder insert point to one after the last instruction in
2655 /// the bundle
2656 void setInsertPointAfterBundle(const TreeEntry *E);
2657
2658 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2659 /// specified, the starting vector value is poison.
2660 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2661
2662 /// \returns whether the VectorizableTree is fully vectorizable and will
2663  /// be beneficial even if the tree height is tiny.
2664 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2665
2666 /// Reorder commutative or alt operands to get better probability of
2667 /// generating vectorized code.
2668 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2669                                              SmallVectorImpl<Value *> &Left,
2670                                              SmallVectorImpl<Value *> &Right,
2671                                              const BoUpSLP &R);
2672
2673 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2674 /// users of \p TE and collects the stores. It returns the map from the store
2675 /// pointers to the collected stores.
2676  DenseMap<Value *, SmallVector<StoreInst *>>
2677  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2678
2679 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2680 /// stores in \p StoresVec can form a vector instruction. If so it returns
2681 /// true and populates \p ReorderIndices with the shuffle indices of the
2682 /// stores when compared to the sorted vector.
2683 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2684 OrdersType &ReorderIndices) const;
2685
2686 /// Iterates through the users of \p TE, looking for scalar stores that can be
2687 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2688 /// their order and builds an order index vector for each store bundle. It
2689 /// returns all these order vectors found.
2690 /// We run this after the tree has formed, otherwise we may come across user
2691 /// instructions that are not yet in the tree.
2692  SmallVector<OrdersType, 1>
2693  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2694
2695 struct TreeEntry {
2696 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2697 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2698
2699 /// \returns Common mask for reorder indices and reused scalars.
2700 SmallVector<int> getCommonMask() const {
2701      SmallVector<int> Mask;
2702      inversePermutation(ReorderIndices, Mask);
2703 ::addMask(Mask, ReuseShuffleIndices);
2704 return Mask;
2705 }
2706
2707 /// \returns true if the scalars in VL are equal to this entry.
2708 bool isSame(ArrayRef<Value *> VL) const {
2709 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2710 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2711 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2712 return VL.size() == Mask.size() &&
2713 std::equal(VL.begin(), VL.end(), Mask.begin(),
2714 [Scalars](Value *V, int Idx) {
2715 return (isa<UndefValue>(V) &&
2716 Idx == PoisonMaskElem) ||
2717 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2718 });
2719 };
2720 if (!ReorderIndices.empty()) {
2721 // TODO: implement matching if the nodes are just reordered, still can
2722 // treat the vector as the same if the list of scalars matches VL
2723 // directly, without reordering.
2724        SmallVector<int> Mask;
2725        inversePermutation(ReorderIndices, Mask);
2726 if (VL.size() == Scalars.size())
2727 return IsSame(Scalars, Mask);
2728 if (VL.size() == ReuseShuffleIndices.size()) {
2729 ::addMask(Mask, ReuseShuffleIndices);
2730 return IsSame(Scalars, Mask);
2731 }
2732 return false;
2733 }
2734 return IsSame(Scalars, ReuseShuffleIndices);
2735 }
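    // Example (illustration only): with Scalars = {x, y}, empty
    // ReorderIndices and ReuseShuffleIndices = {0, 0, 1, 1}, the list
    // VL = {x, x, y, y} compares equal, because every VL element matches
    // Scalars[Mask[i]] for the reuse mask.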
2736
2737 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2738 return State == TreeEntry::NeedToGather &&
2739 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2740 UserTreeIndices.front().UserTE == UserEI.UserTE;
2741 }
2742
2743 /// \returns true if current entry has same operands as \p TE.
2744 bool hasEqualOperands(const TreeEntry &TE) const {
2745 if (TE.getNumOperands() != getNumOperands())
2746 return false;
2747 SmallBitVector Used(getNumOperands());
2748 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2749 unsigned PrevCount = Used.count();
2750 for (unsigned K = 0; K < E; ++K) {
2751 if (Used.test(K))
2752 continue;
2753 if (getOperand(K) == TE.getOperand(I)) {
2754 Used.set(K);
2755 break;
2756 }
2757 }
2758 // Check if we actually found the matching operand.
2759 if (PrevCount == Used.count())
2760 return false;
2761 }
2762 return true;
2763 }
2764
2765 /// \return Final vectorization factor for the node. Defined by the total
2766    /// number of vectorized scalars, including those used several times in the
2767 /// entry and counted in the \a ReuseShuffleIndices, if any.
2768 unsigned getVectorFactor() const {
2769 if (!ReuseShuffleIndices.empty())
2770 return ReuseShuffleIndices.size();
2771 return Scalars.size();
2772 };
2773
2774 /// A vector of scalars.
2775 ValueList Scalars;
2776
2777 /// The Scalars are vectorized into this value. It is initialized to Null.
2778 WeakTrackingVH VectorizedValue = nullptr;
2779
2780 /// New vector phi instructions emitted for the vectorized phi nodes.
2781 PHINode *PHI = nullptr;
2782
2783 /// Do we need to gather this sequence or vectorize it
2784 /// (either with vector instruction or with scatter/gather
2785 /// intrinsics for store/load)?
2786 enum EntryState {
2787 Vectorize,
2788 ScatterVectorize,
2789 StridedVectorize,
2790 NeedToGather
2791 };
2792 EntryState State;
2793
2794 /// Does this sequence require some shuffling?
2795 SmallVector<int, 4> ReuseShuffleIndices;
2796
2797 /// Does this entry require reordering?
2798 SmallVector<unsigned, 4> ReorderIndices;
2799
2800 /// Points back to the VectorizableTree.
2801 ///
2802 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2803 /// to be a pointer and needs to be able to initialize the child iterator.
2804 /// Thus we need a reference back to the container to translate the indices
2805 /// to entries.
2806 VecTreeTy &Container;
2807
2808 /// The TreeEntry index containing the user of this entry. We can actually
2809 /// have multiple users so the data structure is not truly a tree.
2810 SmallVector<EdgeInfo, 1> UserTreeIndices;
2811
2812 /// The index of this treeEntry in VectorizableTree.
2813 int Idx = -1;
2814
2815 private:
2816 /// The operands of each instruction in each lane Operands[op_index][lane].
2817 /// Note: This helps avoid the replication of the code that performs the
2818 /// reordering of operands during buildTree_rec() and vectorizeTree().
2819    SmallVector<ValueList, 2> Operands;
2820
2821 /// The main/alternate instruction.
2822 Instruction *MainOp = nullptr;
2823 Instruction *AltOp = nullptr;
2824
2825 public:
2826 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2827 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2828 if (Operands.size() < OpIdx + 1)
2829 Operands.resize(OpIdx + 1);
2830 assert(Operands[OpIdx].empty() && "Already resized?");
2831 assert(OpVL.size() <= Scalars.size() &&
2832 "Number of operands is greater than the number of scalars.");
2833 Operands[OpIdx].resize(OpVL.size());
2834 copy(OpVL, Operands[OpIdx].begin());
2835 }
2836
2837 /// Set the operands of this bundle in their original order.
2838 void setOperandsInOrder() {
2839 assert(Operands.empty() && "Already initialized?");
2840 auto *I0 = cast<Instruction>(Scalars[0]);
2841 Operands.resize(I0->getNumOperands());
2842 unsigned NumLanes = Scalars.size();
2843 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2844 OpIdx != NumOperands; ++OpIdx) {
2845 Operands[OpIdx].resize(NumLanes);
2846 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2847 auto *I = cast<Instruction>(Scalars[Lane]);
2848 assert(I->getNumOperands() == NumOperands &&
2849 "Expected same number of operands");
2850 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2851 }
2852 }
2853 }
2854
2855 /// Reorders operands of the node to the given mask \p Mask.
2856 void reorderOperands(ArrayRef<int> Mask) {
2857 for (ValueList &Operand : Operands)
2858 reorderScalars(Operand, Mask);
2859 }
2860
2861 /// \returns the \p OpIdx operand of this TreeEntry.
2862 ValueList &getOperand(unsigned OpIdx) {
2863 assert(OpIdx < Operands.size() && "Off bounds");
2864 return Operands[OpIdx];
2865 }
2866
2867 /// \returns the \p OpIdx operand of this TreeEntry.
2868 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2869 assert(OpIdx < Operands.size() && "Off bounds");
2870 return Operands[OpIdx];
2871 }
2872
2873 /// \returns the number of operands.
2874 unsigned getNumOperands() const { return Operands.size(); }
2875
2876 /// \return the single \p OpIdx operand.
2877 Value *getSingleOperand(unsigned OpIdx) const {
2878 assert(OpIdx < Operands.size() && "Off bounds");
2879 assert(!Operands[OpIdx].empty() && "No operand available");
2880 return Operands[OpIdx][0];
2881 }
2882
2883 /// Some of the instructions in the list have alternate opcodes.
2884 bool isAltShuffle() const { return MainOp != AltOp; }
2885
2886 bool isOpcodeOrAlt(Instruction *I) const {
2887 unsigned CheckedOpcode = I->getOpcode();
2888 return (getOpcode() == CheckedOpcode ||
2889 getAltOpcode() == CheckedOpcode);
2890 }
2891
2892    /// Chooses the correct key for scheduling data. If \p Op has the same (or
2893    /// alternate) opcode as the main operation of this entry, the key is \p Op.
2894    /// Otherwise the key is the main operation.
2895 Value *isOneOf(Value *Op) const {
2896 auto *I = dyn_cast<Instruction>(Op);
2897 if (I && isOpcodeOrAlt(I))
2898 return Op;
2899 return MainOp;
2900 }
2901
2902 void setOperations(const InstructionsState &S) {
2903 MainOp = S.MainOp;
2904 AltOp = S.AltOp;
2905 }
2906
2907 Instruction *getMainOp() const {
2908 return MainOp;
2909 }
2910
2911 Instruction *getAltOp() const {
2912 return AltOp;
2913 }
2914
2915 /// The main/alternate opcodes for the list of instructions.
2916 unsigned getOpcode() const {
2917 return MainOp ? MainOp->getOpcode() : 0;
2918 }
2919
2920 unsigned getAltOpcode() const {
2921 return AltOp ? AltOp->getOpcode() : 0;
2922 }
2923
2924    /// When ReuseShuffleIndices is empty it just returns the position of \p
2925    /// V within the vector of Scalars. Otherwise, it tries to remap via the reuse index.
2926 int findLaneForValue(Value *V) const {
2927 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2928 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2929 if (!ReorderIndices.empty())
2930 FoundLane = ReorderIndices[FoundLane];
2931 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2932 if (!ReuseShuffleIndices.empty()) {
2933 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2934 find(ReuseShuffleIndices, FoundLane));
2935 }
2936 return FoundLane;
2937 }
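    // Example (illustration only): with Scalars = {a, b, c, d},
    // ReorderIndices = {2, 3, 0, 1} and empty ReuseShuffleIndices,
    // findLaneForValue(c) finds position 2 in Scalars and remaps it through
    // ReorderIndices to lane 0.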
2938
2939 /// Build a shuffle mask for graph entry which represents a merge of main
2940 /// and alternate operations.
2941 void
2942 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2943                          SmallVectorImpl<int> &Mask,
2944                          SmallVectorImpl<Value *> *OpScalars = nullptr,
2945 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2946
2947 /// Return true if this is a non-power-of-2 node.
2948 bool isNonPowOf2Vec() const {
2949 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
2950 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
2951 "Reshuffling not supported with non-power-of-2 vectors yet.");
2952 return IsNonPowerOf2;
2953 }
2954
2955#ifndef NDEBUG
2956 /// Debug printer.
2957 LLVM_DUMP_METHOD void dump() const {
2958 dbgs() << Idx << ".\n";
2959 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2960 dbgs() << "Operand " << OpI << ":\n";
2961 for (const Value *V : Operands[OpI])
2962 dbgs().indent(2) << *V << "\n";
2963 }
2964 dbgs() << "Scalars: \n";
2965 for (Value *V : Scalars)
2966 dbgs().indent(2) << *V << "\n";
2967 dbgs() << "State: ";
2968 switch (State) {
2969 case Vectorize:
2970 dbgs() << "Vectorize\n";
2971 break;
2972 case ScatterVectorize:
2973 dbgs() << "ScatterVectorize\n";
2974 break;
2975 case StridedVectorize:
2976 dbgs() << "StridedVectorize\n";
2977 break;
2978 case NeedToGather:
2979 dbgs() << "NeedToGather\n";
2980 break;
2981 }
2982 dbgs() << "MainOp: ";
2983 if (MainOp)
2984 dbgs() << *MainOp << "\n";
2985 else
2986 dbgs() << "NULL\n";
2987 dbgs() << "AltOp: ";
2988 if (AltOp)
2989 dbgs() << *AltOp << "\n";
2990 else
2991 dbgs() << "NULL\n";
2992 dbgs() << "VectorizedValue: ";
2993 if (VectorizedValue)
2994 dbgs() << *VectorizedValue << "\n";
2995 else
2996 dbgs() << "NULL\n";
2997 dbgs() << "ReuseShuffleIndices: ";
2998 if (ReuseShuffleIndices.empty())
2999 dbgs() << "Empty";
3000 else
3001 for (int ReuseIdx : ReuseShuffleIndices)
3002 dbgs() << ReuseIdx << ", ";
3003 dbgs() << "\n";
3004 dbgs() << "ReorderIndices: ";
3005 for (unsigned ReorderIdx : ReorderIndices)
3006 dbgs() << ReorderIdx << ", ";
3007 dbgs() << "\n";
3008 dbgs() << "UserTreeIndices: ";
3009 for (const auto &EInfo : UserTreeIndices)
3010 dbgs() << EInfo << ", ";
3011 dbgs() << "\n";
3012 }
3013#endif
3014 };
3015
3016#ifndef NDEBUG
3017 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3018 InstructionCost VecCost, InstructionCost ScalarCost,
3019 StringRef Banner) const {
3020 dbgs() << "SLP: " << Banner << ":\n";
3021 E->dump();
3022 dbgs() << "SLP: Costs:\n";
3023 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3024 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3025 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3026 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3027 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3028 }
3029#endif
3030
3031 /// Create a new VectorizableTree entry.
3032 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3033 std::optional<ScheduleData *> Bundle,
3034 const InstructionsState &S,
3035 const EdgeInfo &UserTreeIdx,
3036 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3037 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3038 TreeEntry::EntryState EntryState =
3039 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3040 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3041 ReuseShuffleIndices, ReorderIndices);
3042 }
3043
3044 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3045 TreeEntry::EntryState EntryState,
3046 std::optional<ScheduleData *> Bundle,
3047 const InstructionsState &S,
3048 const EdgeInfo &UserTreeIdx,
3049 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3050 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3051 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3052 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3053 "Need to vectorize gather entry?");
3054 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3055 TreeEntry *Last = VectorizableTree.back().get();
3056 Last->Idx = VectorizableTree.size() - 1;
3057 Last->State = EntryState;
3058 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3059 ReuseShuffleIndices.end());
3060 if (ReorderIndices.empty()) {
3061 Last->Scalars.assign(VL.begin(), VL.end());
3062 Last->setOperations(S);
3063 } else {
3064 // Reorder scalars and build final mask.
3065 Last->Scalars.assign(VL.size(), nullptr);
3066 transform(ReorderIndices, Last->Scalars.begin(),
3067 [VL](unsigned Idx) -> Value * {
3068 if (Idx >= VL.size())
3069 return UndefValue::get(VL.front()->getType());
3070 return VL[Idx];
3071 });
3072 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3073 Last->setOperations(S);
3074 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3075 }
3076 if (Last->State != TreeEntry::NeedToGather) {
3077 for (Value *V : VL) {
3078 const TreeEntry *TE = getTreeEntry(V);
3079 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3080 "Scalar already in tree!");
3081 if (TE) {
3082 if (TE != Last)
3083 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3084 continue;
3085 }
3086 ScalarToTreeEntry[V] = Last;
3087 }
3088 // Update the scheduler bundle to point to this TreeEntry.
3089 ScheduleData *BundleMember = *Bundle;
3090 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3091 isVectorLikeInstWithConstOps(S.MainOp) ||
3092 doesNotNeedToSchedule(VL)) &&
3093 "Bundle and VL out of sync");
3094 if (BundleMember) {
3095 for (Value *V : VL) {
3096 if (doesNotNeedToBeScheduled(V))
3097 continue;
3098 if (!BundleMember)
3099 continue;
3100 BundleMember->TE = Last;
3101 BundleMember = BundleMember->NextInBundle;
3102 }
3103 }
3104 assert(!BundleMember && "Bundle and VL out of sync");
3105 } else {
3106 // Build a map for gathered scalars to the nodes where they are used.
3107 bool AllConstsOrCasts = true;
3108 for (Value *V : VL)
3109 if (!isConstant(V)) {
3110 auto *I = dyn_cast<CastInst>(V);
3111 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3112 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3113 }
3114 if (AllConstsOrCasts)
3115 CastMaxMinBWSizes =
3116 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3117 MustGather.insert(VL.begin(), VL.end());
3118 }
3119
3120 if (UserTreeIdx.UserTE) {
3121 Last->UserTreeIndices.push_back(UserTreeIdx);
3122 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3123 "Reordering isn't implemented for non-power-of-2 nodes yet");
3124 }
3125 return Last;
3126 }
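  // Rough usage sketch for the overloads above (hypothetical arguments, not a
  // quote from the tree-building code): passing std::nullopt as Bundle yields
  // a NeedToGather entry, while an engaged optional yields a Vectorize entry:
  //   newTreeEntry(VL, std::nullopt, S, UserTreeIdx);            // gather node
  //   newTreeEntry(VL, BundlePtr, S, UserTreeIdx, ReuseMask);    // vectorized node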
3127
3128 /// -- Vectorization State --
3129 /// Holds all of the tree entries.
3130 TreeEntry::VecTreeTy VectorizableTree;
3131
3132#ifndef NDEBUG
3133 /// Debug printer.
3134 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3135 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3136 VectorizableTree[Id]->dump();
3137 dbgs() << "\n";
3138 }
3139 }
3140#endif
3141
3142 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3143
3144 const TreeEntry *getTreeEntry(Value *V) const {
3145 return ScalarToTreeEntry.lookup(V);
3146 }
3147
3148 /// Check that the operand nodes of an alternate node do not generate a
3149 /// buildvector sequence. If they do, it is probably not worth building an
3150 /// alternate shuffle when the number of buildvector operands plus the
3151 /// alternate instruction exceeds the number of buildvector instructions.
3152 /// \param S the instructions state of the analyzed values.
3153 /// \param VL list of the instructions with alternate opcodes.
3154 bool areAltOperandsProfitable(const InstructionsState &S,
3155 ArrayRef<Value *> VL) const;
3156
3157 /// Checks if the specified list of the instructions/values can be vectorized
3158 /// and fills required data before actual scheduling of the instructions.
3159 TreeEntry::EntryState getScalarsVectorizationState(
3160 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3161 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3162
3163 /// Maps a specific scalar to its tree entry.
3164 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3165
3166 /// List of scalars, used in several vectorize nodes, and the list of the
3167 /// nodes.
3168 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3169
3170 /// Maps a value to the proposed vectorizable size.
3171 SmallDenseMap<Value *, unsigned> InstrElementSize;
3172
3173 /// A list of scalars that we found that we need to keep as scalars.
3174 ValueSet MustGather;
3175
3176 /// A set of first non-schedulable values.
3177 ValueSet NonScheduledFirst;
3178
3179 /// A map between the vectorized entries and the last instructions in the
3180 /// bundles. The bundles are built in use order, not in the def order of the
3181 /// instructions, so we cannot rely directly on the last instruction in the
3182 /// bundle being the last instruction in program order during the
3183 /// vectorization process (the basic blocks are affected), and need to
3184 /// pre-gather these instructions beforehand.
3185 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3186
3187 /// List of gather nodes, depending on other gather/vector nodes, which should
3188 /// be emitted after the vector instruction emission process to correctly
3189 /// handle order of the vector instructions and shuffles.
3190 SetVector<const TreeEntry *> PostponedGathers;
3191
3192 using ValueToGatherNodesMap =
3193 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3194 ValueToGatherNodesMap ValueToGatherNodes;
3195
3196 /// This POD struct describes one external user in the vectorized tree.
3197 struct ExternalUser {
3198 ExternalUser(Value *S, llvm::User *U, int L)
3199 : Scalar(S), User(U), Lane(L) {}
3200
3201 // Which scalar in our function.
3202 Value *Scalar;
3203
3204 // Which user that uses the scalar.
3205 llvm::User *User;
3206
3207 // Which lane does the scalar belong to.
3208 int Lane;
3209 };
3210 using UserList = SmallVector<ExternalUser, 16>;
3211
3212 /// Checks if two instructions may access the same memory.
3213 ///
3214 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3215 /// is invariant in the calling loop.
3216 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3217 Instruction *Inst2) {
3218 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3219 return true;
3220 // First check if the result is already in the cache.
3221 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3222 auto It = AliasCache.find(Key);
3223 if (It != AliasCache.end())
3224 return It->second;
3225 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3226 // Store the result in the cache.
3227 AliasCache.try_emplace(Key, Aliased);
3228 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3229 return Aliased;
3230 }
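  // Note on the cache above: both the (Inst1, Inst2) and (Inst2, Inst1) keys
  // are populated with the same result, so a later query with the operands
  // swapped is answered without another BatchAA lookup. Sketch (hypothetical
  // instructions I1/I2):
  //   isAliased(Loc1, I1, I2); // queries BatchAA, fills both cache keys
  //   isAliased(Loc2, I2, I1); // cache hit, BatchAA is not consulted again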
3231
3232 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3233
3234 /// Cache for alias results.
3235 /// TODO: consider moving this to the AliasAnalysis itself.
3236 DenseMap<AliasCacheKey, bool> AliasCache;
3237
3238 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3239 // globally through SLP because we don't perform any action which
3240 // invalidates capture results.
3241 BatchAAResults BatchAA;
3242
3243 /// Temporary store for deleted instructions. Instructions will be deleted
3244 /// eventually when the BoUpSLP is destructed. The deferral is required to
3245 /// ensure that there are no incorrect collisions in the AliasCache, which
3246 /// can happen if a new instruction is allocated at the same address as a
3247 /// previously deleted instruction.
3248 DenseSet<Instruction *> DeletedInstructions;
3249
3250 /// Set of the instruction, being analyzed already for reductions.
3251 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3252
3253 /// Set of hashes for the list of reduction values already being analyzed.
3254 DenseSet<size_t> AnalyzedReductionVals;
3255
3256 /// Values already analyzed for minimal bitwidth and found to be
3257 /// non-profitable.
3258 DenseSet<Value *> AnalyzedMinBWVals;
3259
3260 /// A list of values that need to extracted out of the tree.
3261 /// This list holds pairs of (Internal Scalar : External User). External User
3262 /// can be nullptr, it means that this Internal Scalar will be used later,
3263 /// after vectorization.
3264 UserList ExternalUses;
3265
3266 /// A list of GEPs which can be replaced by scalar GEPs instead of
3267 /// extractelement instructions.
3268 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3269
3270 /// Values used only by @llvm.assume calls.
3271 SmallPtrSet<const Value *, 32> EphValues;
3272
3273 /// Holds all of the instructions that we gathered, shuffle instructions and
3274 /// extractelements.
3275 SetVector<Instruction *> GatherShuffleExtractSeq;
3276
3277 /// A list of blocks that we are going to CSE.
3278 DenseSet<BasicBlock *> CSEBlocks;
3279
3280 /// Contains all scheduling relevant data for an instruction.
3281 /// A ScheduleData either represents a single instruction or a member of an
3282 /// instruction bundle (= a group of instructions which is combined into a
3283 /// vector instruction).
3284 struct ScheduleData {
3285 // The initial value for the dependency counters. It means that the
3286 // dependencies are not calculated yet.
3287 enum { InvalidDeps = -1 };
3288
3289 ScheduleData() = default;
3290
3291 void init(int BlockSchedulingRegionID, Value *OpVal) {
3292 FirstInBundle = this;
3293 NextInBundle = nullptr;
3294 NextLoadStore = nullptr;
3295 IsScheduled = false;
3296 SchedulingRegionID = BlockSchedulingRegionID;
3297 clearDependencies();
3298 OpValue = OpVal;
3299 TE = nullptr;
3300 }
3301
3302 /// Verify basic self consistency properties
3303 void verify() {
3304 if (hasValidDependencies()) {
3305 assert(UnscheduledDeps <= Dependencies && "invariant");
3306 } else {
3307 assert(UnscheduledDeps == Dependencies && "invariant");
3308 }
3309
3310 if (IsScheduled) {
3311 assert(isSchedulingEntity() &&
3312 "unexpected scheduled state");
3313 for (const ScheduleData *BundleMember = this; BundleMember;
3314 BundleMember = BundleMember->NextInBundle) {
3315 assert(BundleMember->hasValidDependencies() &&
3316 BundleMember->UnscheduledDeps == 0 &&
3317 "unexpected scheduled state");
3318 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3319 "only bundle is marked scheduled");
3320 }
3321 }
3322
3323 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3324 "all bundle members must be in same basic block");
3325 }
3326
3327 /// Returns true if the dependency information has been calculated.
3328 /// Note that dependency validity can vary between instructions within
3329 /// a single bundle.
3330 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3331
3332 /// Returns true for single instructions and for bundle representatives
3333 /// (= the head of a bundle).
3334 bool isSchedulingEntity() const { return FirstInBundle == this; }
3335
3336 /// Returns true if it represents an instruction bundle and not only a
3337 /// single instruction.
3338 bool isPartOfBundle() const {
3339 return NextInBundle != nullptr || FirstInBundle != this || TE;
3340 }
3341
3342 /// Returns true if it is ready for scheduling, i.e. it has no more
3343 /// unscheduled dependent instructions/bundles.
3344 bool isReady() const {
3345 assert(isSchedulingEntity() &&
3346 "can't consider non-scheduling entity for ready list");
3347 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3348 }
3349
3350 /// Modifies the number of unscheduled dependencies for this instruction,
3351 /// and returns the number of remaining dependencies for the containing
3352 /// bundle.
3353 int incrementUnscheduledDeps(int Incr) {
3354 assert(hasValidDependencies() &&
3355 "increment of unscheduled deps would be meaningless");
3356 UnscheduledDeps += Incr;
3357 return FirstInBundle->unscheduledDepsInBundle();
3358 }
3359
3360 /// Sets the number of unscheduled dependencies to the number of
3361 /// dependencies.
3362 void resetUnscheduledDeps() {
3363 UnscheduledDeps = Dependencies;
3364 }
3365
3366 /// Clears all dependency information.
3367 void clearDependencies() {
3368 Dependencies = InvalidDeps;
3369 resetUnscheduledDeps();
3370 MemoryDependencies.clear();
3371 ControlDependencies.clear();
3372 }
3373
3374 int unscheduledDepsInBundle() const {
3375 assert(isSchedulingEntity() && "only meaningful on the bundle");
3376 int Sum = 0;
3377 for (const ScheduleData *BundleMember = this; BundleMember;
3378 BundleMember = BundleMember->NextInBundle) {
3379 if (BundleMember->UnscheduledDeps == InvalidDeps)
3380 return InvalidDeps;
3381 Sum += BundleMember->UnscheduledDeps;
3382 }
3383 return Sum;
3384 }
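    // Worked example of the counters above (hypothetical numbers): for a
    // two-instruction bundle whose members have Dependencies = 2 and 1 and
    // nothing scheduled yet, resetUnscheduledDeps() leaves UnscheduledDeps at
    // 2 and 1 and unscheduledDepsInBundle() returns 3; after three calls to
    // incrementUnscheduledDeps(-1) across the members the bundle's sum drops
    // to 0, so isReady() becomes true (as long as IsScheduled is still false).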
3385
3386 void dump(raw_ostream &os) const {
3387 if (!isSchedulingEntity()) {
3388 os << "/ " << *Inst;
3389 } else if (NextInBundle) {
3390 os << '[' << *Inst;
3391 ScheduleData *SD = NextInBundle;
3392 while (SD) {
3393 os << ';' << *SD->Inst;
3394 SD = SD->NextInBundle;
3395 }
3396 os << ']';
3397 } else {
3398 os << *Inst;
3399 }
3400 }
3401
3402 Instruction *Inst = nullptr;
3403
3404 /// Opcode of the current instruction in the schedule data.
3405 Value *OpValue = nullptr;
3406
3407 /// The TreeEntry that this instruction corresponds to.
3408 TreeEntry *TE = nullptr;
3409
3410 /// Points to the head in an instruction bundle (and always to this for
3411 /// single instructions).
3412 ScheduleData *FirstInBundle = nullptr;
3413
3414 /// Single linked list of all instructions in a bundle. Null if it is a
3415 /// single instruction.
3416 ScheduleData *NextInBundle = nullptr;
3417
3418 /// Single linked list of all memory instructions (e.g. load, store, call)
3419 /// in the block - until the end of the scheduling region.
3420 ScheduleData *NextLoadStore = nullptr;
3421
3422 /// The dependent memory instructions.
3423 /// This list is derived on demand in calculateDependencies().
3424 SmallVector<ScheduleData *, 4> MemoryDependencies;
3425
3426 /// List of instructions which this instruction could be control dependent
3427 /// on. Allowing such nodes to be scheduled below this one could introduce
3428 /// a runtime fault which didn't exist in the original program.
3429 /// ex: this is a load or udiv following a readonly call which inf loops
3430 SmallVector<ScheduleData *, 4> ControlDependencies;
3431
3432 /// This ScheduleData is in the current scheduling region if this matches
3433 /// the current SchedulingRegionID of BlockScheduling.
3434 int SchedulingRegionID = 0;
3435
3436 /// Used for getting a "good" final ordering of instructions.
3437 int SchedulingPriority = 0;
3438
3439 /// The number of dependencies. It consists of the number of users of the
3440 /// instruction plus the number of dependent memory instructions (if any).
3441 /// This value is calculated on demand.
3442 /// If InvalidDeps, the number of dependencies is not calculated yet.
3443 int Dependencies = InvalidDeps;
3444
3445 /// The number of dependencies minus the number of dependencies of scheduled
3446 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3447 /// for scheduling.
3448 /// Note that this is negative as long as Dependencies is not calculated.
3449 int UnscheduledDeps = InvalidDeps;
3450
3451 /// True if this instruction is scheduled (or considered as scheduled in the
3452 /// dry-run).
3453 bool IsScheduled = false;
3454 };
3455
3456#ifndef NDEBUG
3457 friend inline raw_ostream &operator<<(raw_ostream &os,
3458 const BoUpSLP::ScheduleData &SD) {
3459 SD.dump(os);
3460 return os;
3461 }
3462#endif
3463
3464 friend struct GraphTraits<BoUpSLP *>;
3465 friend struct DOTGraphTraits<BoUpSLP *>;
3466
3467 /// Contains all scheduling data for a basic block.
3468 /// It does not schedule instructions that are not memory read/write
3469 /// instructions and whose operands are either constants, arguments, phis, or
3470 /// instructions from other blocks, or whose users are phis or live in other
3471 /// blocks. The resulting vector instructions can be placed at the
3472 /// beginning of the basic block without scheduling (if their operands do not
3473 /// need to be scheduled) or at the end of the block (if their users are
3474 /// outside of the block). This saves some compile time and memory used by
3475 /// the compiler.
3476 /// ScheduleData is assigned to each instruction between the boundaries of
3477 /// the tree entry, even to those which are not part of the graph. It is
3478 /// required to correctly follow the dependencies between the instructions
3479 /// and their correct scheduling. The ScheduleData is not allocated for
3480 /// instructions which do not require scheduling, like phis, nodes with
3481 /// extractelements/insertelements only, or nodes whose instructions have
3482 /// uses/operands outside of the block.
3483 struct BlockScheduling {
3484 BlockScheduling(BasicBlock *BB)
3485 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3486
3487 void clear() {
3488 ReadyInsts.clear();
3489 ScheduleStart = nullptr;
3490 ScheduleEnd = nullptr;
3491 FirstLoadStoreInRegion = nullptr;
3492 LastLoadStoreInRegion = nullptr;
3493 RegionHasStackSave = false;
3494
3495 // Reduce the maximum schedule region size by the size of the
3496 // previous scheduling run.
3497 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3498 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3499 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3500 ScheduleRegionSize = 0;
3501
3502 // Make a new scheduling region, i.e. all existing ScheduleData is not
3503 // in the new region yet.
3504 ++SchedulingRegionID;
3505 }
3506
3507 ScheduleData *getScheduleData(Instruction *I) {
3508 if (BB != I->getParent())
3509 // Avoid lookup if can't possibly be in map.
3510 return nullptr;
3511 ScheduleData *SD = ScheduleDataMap.lookup(I);
3512 if (SD && isInSchedulingRegion(SD))
3513 return SD;
3514 return nullptr;
3515 }
3516
3517 ScheduleData *getScheduleData(Value *V) {
3518 if (auto *I = dyn_cast<Instruction>(V))
3519 return getScheduleData(I);
3520 return nullptr;
3521 }
3522
3523 ScheduleData *getScheduleData(Value *V, Value *Key) {
3524 if (V == Key)
3525 return getScheduleData(V);
3526 auto I = ExtraScheduleDataMap.find(V);
3527 if (I != ExtraScheduleDataMap.end()) {
3528 ScheduleData *SD = I->second.lookup(Key);
3529 if (SD && isInSchedulingRegion(SD))
3530 return SD;
3531 }
3532 return nullptr;
3533 }
3534
3535 bool isInSchedulingRegion(ScheduleData *SD) const {
3536 return SD->SchedulingRegionID == SchedulingRegionID;
3537 }
3538
3539 /// Marks an instruction as scheduled and puts all dependent ready
3540 /// instructions into the ready-list.
3541 template <typename ReadyListType>
3542 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3543 SD->IsScheduled = true;
3544 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3545
3546 for (ScheduleData *BundleMember = SD; BundleMember;
3547 BundleMember = BundleMember->NextInBundle) {
3548 if (BundleMember->Inst != BundleMember->OpValue)
3549 continue;
3550
3551 // Handle the def-use chain dependencies.
3552
3553 // Decrement the unscheduled counter and insert to ready list if ready.
3554 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3555 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3556 if (OpDef && OpDef->hasValidDependencies() &&
3557 OpDef->incrementUnscheduledDeps(-1) == 0) {
3558 // There are no more unscheduled dependencies after
3559 // decrementing, so we can put the dependent instruction
3560 // into the ready list.
3561 ScheduleData *DepBundle = OpDef->FirstInBundle;
3562 assert(!DepBundle->IsScheduled &&
3563 "already scheduled bundle gets ready");
3564 ReadyList.insert(DepBundle);
3565 LLVM_DEBUG(dbgs()
3566 << "SLP: gets ready (def): " << *DepBundle << "\n");
3567 }
3568 });
3569 };
3570
3571 // If BundleMember is a vector bundle, its operands may have been
3572 // reordered during buildTree(). We therefore need to get its operands
3573 // through the TreeEntry.
3574 if (TreeEntry *TE = BundleMember->TE) {
3575 // Need to search for the lane since the tree entry can be reordered.
3576 int Lane = std::distance(TE->Scalars.begin(),
3577 find(TE->Scalars, BundleMember->Inst));
3578 assert(Lane >= 0 && "Lane not set");
3579
3580 // Since vectorization tree is being built recursively this assertion
3581 // ensures that the tree entry has all operands set before reaching
3582 // this code. Couple of exceptions known at the moment are extracts
3583 // where their second (immediate) operand is not added. Since
3584 // immediates do not affect scheduler behavior this is considered
3585 // okay.
3586 auto *In = BundleMember->Inst;
3587 assert(
3588 In &&
3589 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3590 In->getNumOperands() == TE->getNumOperands()) &&
3591 "Missed TreeEntry operands?");
3592 (void)In; // fake use to avoid build failure when assertions disabled
3593
3594 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3595 OpIdx != NumOperands; ++OpIdx)
3596 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3597 DecrUnsched(I);
3598 } else {
3599 // If BundleMember is a stand-alone instruction, no operand reordering
3600 // has taken place, so we directly access its operands.
3601 for (Use &U : BundleMember->Inst->operands())
3602 if (auto *I = dyn_cast<Instruction>(U.get()))
3603 DecrUnsched(I);
3604 }
3605 // Handle the memory dependencies.
3606 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3607 if (MemoryDepSD->hasValidDependencies() &&
3608 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3609 // There are no more unscheduled dependencies after decrementing,
3610 // so we can put the dependent instruction into the ready list.
3611 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3612 assert(!DepBundle->IsScheduled &&
3613 "already scheduled bundle gets ready");
3614 ReadyList.insert(DepBundle);
3615 LLVM_DEBUG(dbgs()
3616 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3617 }
3618 }
3619 // Handle the control dependencies.
3620 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3621 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3622 // There are no more unscheduled dependencies after decrementing,
3623 // so we can put the dependent instruction into the ready list.
3624 ScheduleData *DepBundle = DepSD->FirstInBundle;
3625 assert(!DepBundle->IsScheduled &&
3626 "already scheduled bundle gets ready");
3627 ReadyList.insert(DepBundle);
3628 LLVM_DEBUG(dbgs()
3629 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3630 }
3631 }
3632 }
3633 }
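    // Roughly how this fits together during scheduling (a sketch, not a quote
    // from scheduleBlock): the ready list is seeded via initialFillReadyList,
    // then ready bundles are repeatedly popped and passed to schedule(), which
    // decrements the dependency counters of def-use, memory and control
    // dependent nodes and re-inserts any bundle whose counters reach zero into
    // the ready list, until no ready bundles remain.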
3634
3635 /// Verify basic self consistency properties of the data structure.
3636 void verify() {
3637 if (!ScheduleStart)
3638 return;
3639
3640 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3641 ScheduleStart->comesBefore(ScheduleEnd) &&
3642 "Not a valid scheduling region?");
3643
3644 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3645 auto *SD = getScheduleData(I);
3646 if (!SD)
3647 continue;
3648 assert(isInSchedulingRegion(SD) &&
3649 "primary schedule data not in window?");
3650 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3651 "entire bundle in window!");
3652 (void)SD;
3653 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3654 }
3655
3656 for (auto *SD : ReadyInsts) {
3657 assert(SD->isSchedulingEntity() && SD->isReady() &&
3658 "item in ready list not ready?");
3659 (void)SD;
3660 }
3661 }
3662
3663 void doForAllOpcodes(Value *V,
3664 function_ref<void(ScheduleData *SD)> Action) {
3665 if (ScheduleData *SD = getScheduleData(V))
3666 Action(SD);
3667 auto I = ExtraScheduleDataMap.find(V);
3668 if (I != ExtraScheduleDataMap.end())
3669 for (auto &P : I->second)
3670 if (isInSchedulingRegion(P.second))
3671 Action(P.second);
3672 }
3673
3674 /// Put all instructions into the ReadyList which are ready for scheduling.
3675 template <typename ReadyListType>
3676 void initialFillReadyList(ReadyListType &ReadyList) {
3677 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3678 doForAllOpcodes(I, [&](ScheduleData *SD) {
3679 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3680 SD->isReady()) {
3681 ReadyList.insert(SD);
3682 LLVM_DEBUG(dbgs()
3683 << "SLP: initially in ready list: " << *SD << "\n");
3684 }
3685 });
3686 }
3687 }
3688
3689 /// Build a bundle from the ScheduleData nodes corresponding to the
3690 /// scalar instruction for each lane.
3691 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3692
3693 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3694 /// cyclic dependencies. This is only a dry-run, no instructions are
3695 /// actually moved at this stage.
3696 /// \returns the scheduling bundle. The returned Optional value is not
3697 /// std::nullopt if \p VL is allowed to be scheduled.
3698 std::optional<ScheduleData *>
3699 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3700 const InstructionsState &S);
3701
3702 /// Un-bundles a group of instructions.
3703 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3704
3705 /// Allocates schedule data chunk.
3706 ScheduleData *allocateScheduleDataChunks();
3707
3708 /// Extends the scheduling region so that V is inside the region.
3709 /// \returns true if the region size is within the limit.
3710 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3711
3712 /// Initialize the ScheduleData structures for new instructions in the
3713 /// scheduling region.
3714 void initScheduleData(Instruction *FromI, Instruction *ToI,
3715 ScheduleData *PrevLoadStore,
3716 ScheduleData *NextLoadStore);
3717
3718 /// Updates the dependency information of a bundle and of all instructions/
3719 /// bundles which depend on the original bundle.
3720 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3721 BoUpSLP *SLP);
3722
3723 /// Sets all instructions in the scheduling region to un-scheduled.
3724 void resetSchedule();
3725
3726 BasicBlock *BB;
3727
3728 /// Simple memory allocation for ScheduleData.
3729 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3730
3731 /// The size of a ScheduleData array in ScheduleDataChunks.
3732 int ChunkSize;
3733
3734 /// The allocator position in the current chunk, which is the last entry
3735 /// of ScheduleDataChunks.
3736 int ChunkPos;
3737
3738 /// Attaches ScheduleData to Instruction.
3739 /// Note that the mapping survives during all vectorization iterations, i.e.
3740 /// ScheduleData structures are recycled.
3741 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3742
3743 /// Attaches ScheduleData to Instruction with the leading key.
3744 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3745 ExtraScheduleDataMap;
3746
3747 /// The ready-list for scheduling (only used for the dry-run).
3748 SetVector<ScheduleData *> ReadyInsts;
3749
3750 /// The first instruction of the scheduling region.
3751 Instruction *ScheduleStart = nullptr;
3752
3753 /// The first instruction _after_ the scheduling region.
3754 Instruction *ScheduleEnd = nullptr;
3755
3756 /// The first memory accessing instruction in the scheduling region
3757 /// (can be null).
3758 ScheduleData *FirstLoadStoreInRegion = nullptr;
3759
3760 /// The last memory accessing instruction in the scheduling region
3761 /// (can be null).
3762 ScheduleData *LastLoadStoreInRegion = nullptr;
3763
3764 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3765 /// region? Used to optimize the dependence calculation for the
3766 /// common case where there isn't.
3767 bool RegionHasStackSave = false;
3768
3769 /// The current size of the scheduling region.
3770 int ScheduleRegionSize = 0;
3771
3772 /// The maximum size allowed for the scheduling region.
3773 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3774
3775 /// The ID of the scheduling region. For a new vectorization iteration this
3776 /// is incremented which "removes" all ScheduleData from the region.
3777 /// Make sure that the initial SchedulingRegionID is greater than the
3778 /// initial SchedulingRegionID in ScheduleData (which is 0).
3779 int SchedulingRegionID = 1;
3780 };
3781
3782 /// Attaches the BlockScheduling structures to basic blocks.
3783 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3784
3785 /// Performs the "real" scheduling. Done before vectorization is actually
3786 /// performed in a basic block.
3787 void scheduleBlock(BlockScheduling *BS);
3788
3789 /// List of users to ignore during scheduling and that don't need extracting.
3790 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3791
3792 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3793 /// sorted SmallVectors of unsigned.
3794 struct OrdersTypeDenseMapInfo {
3795 static OrdersType getEmptyKey() {
3796 OrdersType V;
3797 V.push_back(~1U);
3798 return V;
3799 }
3800
3801 static OrdersType getTombstoneKey() {
3802 OrdersType V;
3803 V.push_back(~2U);
3804 return V;
3805 }
3806
3807 static unsigned getHashValue(const OrdersType &V) {
3808 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3809 }
3810
3811 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3812 return LHS == RHS;
3813 }
3814 };
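  // Minimal usage sketch for the traits above (illustrative only): the
  // sentinel keys use ~1U/~2U so they can never collide with a real order,
  // whose elements are small lane indices.
  //   DenseSet<OrdersType, OrdersTypeDenseMapInfo> SeenOrders;
  //   SeenOrders.insert(OrdersType{1, 0, 3, 2});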
3815
3816 // Analysis and block reference.
3817 Function *F;
3818 ScalarEvolution *SE;
3819 TargetTransformInfo *TTI;
3820 TargetLibraryInfo *TLI;
3821 LoopInfo *LI;
3822 DominatorTree *DT;
3823 AssumptionCache *AC;
3824 DemandedBits *DB;
3825 const DataLayout *DL;
3826 OptimizationRemarkEmitter *ORE;
3827
3828 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3829 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3830
3831 /// Instruction builder to construct the vectorized tree.
3832 IRBuilder<TargetFolder> Builder;
3833
3834 /// A map of scalar integer values to the smallest bit width with which they
3835 /// can legally be represented. The values map to (width, signed) pairs,
3836 /// where "width" indicates the minimum bit width and "signed" is True if the
3837 /// value must be signed-extended, rather than zero-extended, back to its
3838 /// original width.
3839 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
3840
3841 /// Final size of the reduced vector, if the current graph represents the
3842 /// input for the reduction and it was possible to narrow the size of the
3843 /// reduction.
3844 unsigned ReductionBitWidth = 0;
3845
3846 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
3847 /// type sizes, used in the tree.
3848 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3849
3850 /// Indices of the vectorized nodes, which are supposed to be the roots of a new
3851 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
3852 DenseSet<unsigned> ExtraBitWidthNodes;
3853};
3854
3855} // end namespace slpvectorizer
3856
3857template <> struct GraphTraits<BoUpSLP *> {
3858 using TreeEntry = BoUpSLP::TreeEntry;
3859
3860 /// NodeRef has to be a pointer per the GraphWriter.
3861 using NodeRef = TreeEntry *;
3862
3863 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3864
3865 /// Add the VectorizableTree to the index iterator to be able to return
3866 /// TreeEntry pointers.
3867 struct ChildIteratorType
3868 : public iterator_adaptor_base<
3869 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3870 ContainerTy &VectorizableTree;
3871
3872 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3873 ContainerTy &VT)
3874 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3875
3876 NodeRef operator*() { return I->UserTE; }
3877 };
3878
3879 static NodeRef getEntryNode(BoUpSLP &R) {
3880 return R.VectorizableTree[0].get();
3881 }
3882
3883 static ChildIteratorType child_begin(NodeRef N) {
3884 return {N->UserTreeIndices.begin(), N->Container};
3885 }
3886
3887 static ChildIteratorType child_end(NodeRef N) {
3888 return {N->UserTreeIndices.end(), N->Container};
3889 }
3890
3891 /// For the node iterator we just need to turn the TreeEntry iterator into a
3892 /// TreeEntry* iterator so that it dereferences to NodeRef.
3893 class nodes_iterator {
3894 using ItTy = ContainerTy::iterator;
3895 ItTy It;
3896
3897 public:
3898 nodes_iterator(const ItTy &It2) : It(It2) {}
3899 NodeRef operator*() { return It->get(); }
3900 nodes_iterator operator++() {
3901 ++It;
3902 return *this;
3903 }
3904 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3905 };
3906
3907 static nodes_iterator nodes_begin(BoUpSLP *R) {
3908 return nodes_iterator(R->VectorizableTree.begin());
3909 }
3910
3911 static nodes_iterator nodes_end(BoUpSLP *R) {
3912 return nodes_iterator(R->VectorizableTree.end());
3913 }
3914
3915 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3916};
3917
3918template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3919 using TreeEntry = BoUpSLP::TreeEntry;
3920
3921 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3922
3923 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3924 std::string Str;
3925 raw_string_ostream OS(Str);
3926 OS << Entry->Idx << ".\n";
3927 if (isSplat(Entry->Scalars))
3928 OS << "<splat> ";
3929 for (auto *V : Entry->Scalars) {
3930 OS << *V;
3931 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3932 return EU.Scalar == V;
3933 }))
3934 OS << " <extract>";
3935 OS << "\n";
3936 }
3937 return Str;
3938 }
3939
3940 static std::string getNodeAttributes(const TreeEntry *Entry,
3941 const BoUpSLP *) {
3942 if (Entry->State == TreeEntry::NeedToGather)
3943 return "color=red";
3944 if (Entry->State == TreeEntry::ScatterVectorize ||
3945 Entry->State == TreeEntry::StridedVectorize)
3946 return "color=blue";
3947 return "";
3948 }
3949};
3950
3951} // end namespace llvm
3952
3953BoUpSLP::~BoUpSLP() {
3954 SmallVector<WeakTrackingVH> DeadInsts;
3955 for (auto *I : DeletedInstructions) {
3956 for (Use &U : I->operands()) {
3957 auto *Op = dyn_cast<Instruction>(U.get());
3958 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3959 wouldInstructionBeTriviallyDead(Op, TLI))
3960 DeadInsts.emplace_back(Op);
3961 }
3962 I->dropAllReferences();
3963 }
3964 for (auto *I : DeletedInstructions) {
3965 assert(I->use_empty() &&
3966 "trying to erase instruction with users.");
3967 I->eraseFromParent();
3968 }
3969
3970 // Cleanup any dead scalar code feeding the vectorized instructions
3971 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3972
3973#ifdef EXPENSIVE_CHECKS
3974 // If we could guarantee that this call is not extremely slow, we could
3975 // remove the ifdef limitation (see PR47712).
3976 assert(!verifyFunction(*F, &dbgs()));
3977#endif
3978}
3979
3980/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3981/// contains original mask for the scalars reused in the node. The procedure
3982/// transforms this mask in accordance with the given \p Mask.
3983static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3984 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3985 "Expected non-empty mask.");
3986 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3987 Prev.swap(Reuses);
3988 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3989 if (Mask[I] != PoisonMaskElem)
3990 Reuses[Mask[I]] = Prev[I];
3991}
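// Worked example for reorderReuses (hypothetical values): with
// Reuses = {0, 1, 1, 2} and Mask = {2, 0, 1, 3}, old element I is written to
// position Mask[I], so Reuses becomes {1, 1, 0, 2}.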
3992
3993/// Reorders the given \p Order according to the given \p Mask. \p Order is
3994/// the original order of the scalars. Procedure transforms the provided order
3995/// in accordance with the given \p Mask. If the resulting \p Order is just an
3996/// identity order, \p Order is cleared.
3997static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
3998 bool BottomOrder = false) {
3999 assert(!Mask.empty() && "Expected non-empty mask.");
4000 unsigned Sz = Mask.size();
4001 if (BottomOrder) {
4002 SmallVector<unsigned> PrevOrder;
4003 if (Order.empty()) {
4004 PrevOrder.resize(Sz);
4005 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4006 } else {
4007 PrevOrder.swap(Order);
4008 }
4009 Order.assign(Sz, Sz);
4010 for (unsigned I = 0; I < Sz; ++I)
4011 if (Mask[I] != PoisonMaskElem)
4012 Order[I] = PrevOrder[Mask[I]];
4013 if (all_of(enumerate(Order), [&](const auto &Data) {
4014 return Data.value() == Sz || Data.index() == Data.value();
4015 })) {
4016 Order.clear();
4017 return;
4018 }
4019 fixupOrderingIndices(Order);
4020 return;
4021 }
4022 SmallVector<int> MaskOrder;
4023 if (Order.empty()) {
4024 MaskOrder.resize(Sz);
4025 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4026 } else {
4027 inversePermutation(Order, MaskOrder);
4028 }
4029 reorderReuses(MaskOrder, Mask);
4030 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4031 Order.clear();
4032 return;
4033 }
4034 Order.assign(Sz, Sz);
4035 for (unsigned I = 0; I < Sz; ++I)
4036 if (MaskOrder[I] != PoisonMaskElem)
4037 Order[MaskOrder[I]] = I;
4038 fixupOrderingIndices(Order);
4039}
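// Worked example for reorderOrder (hypothetical values, BottomOrder = false):
// starting from an empty Order (identity) and Mask = {1, 0, 3, 2}, the remapped
// mask is not an identity, so the result is Order = {1, 0, 3, 2}; had the
// remapped mask been an identity, Order would simply have been cleared.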
4040
4041std::optional<BoUpSLP::OrdersType>
4042BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4043 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4044 // Try to find subvector extract/insert patterns and reorder only such
4045 // patterns.
4046 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4047 Type *ScalarTy = GatheredScalars.front()->getType();
4048 int NumScalars = GatheredScalars.size();
4049 if (!isValidElementType(ScalarTy))
4050 return std::nullopt;
4051 auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
4052 int NumParts = TTI->getNumberOfParts(VecTy);
4053 if (NumParts == 0 || NumParts >= NumScalars)
4054 NumParts = 1;
4055 SmallVector<int> ExtractMask;
4056 SmallVector<int> Mask;
4057 SmallVector<SmallVector<const TreeEntry *>> Entries;
4058 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
4059 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4060 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
4061 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4062 /*ForOrder=*/true);
4063 // No shuffled operands - ignore.
4064 if (GatherShuffles.empty() && ExtractShuffles.empty())
4065 return std::nullopt;
4066 OrdersType CurrentOrder(NumScalars, NumScalars);
4067 if (GatherShuffles.size() == 1 &&
4068 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4069 Entries.front().front()->isSame(TE.Scalars)) {
4070 // Perfect match in the graph, will reuse the previously vectorized
4071 // node. Cost is 0.
4072 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4073 return CurrentOrder;
4074 }
4075 auto IsSplatMask = [](ArrayRef<int> Mask) {
4076 int SingleElt = PoisonMaskElem;
4077 return all_of(Mask, [&](int I) {
4078 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4079 SingleElt = I;
4080 return I == PoisonMaskElem || I == SingleElt;
4081 });
4082 };
4083 // Exclusive broadcast mask - ignore.
4084 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4085 (Entries.size() != 1 ||
4086 Entries.front().front()->ReorderIndices.empty())) ||
4087 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4088 return std::nullopt;
4089 SmallBitVector ShuffledSubMasks(NumParts);
4090 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4091 ArrayRef<int> Mask, int PartSz, int NumParts,
4092 function_ref<unsigned(unsigned)> GetVF) {
4093 for (int I : seq<int>(0, NumParts)) {
4094 if (ShuffledSubMasks.test(I))
4095 continue;
4096 const int VF = GetVF(I);
4097 if (VF == 0)
4098 continue;
4099 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4100 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4101 // Shuffle of at least 2 vectors - ignore.
4102 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4103 std::fill(Slice.begin(), Slice.end(), NumScalars);
4104 ShuffledSubMasks.set(I);
4105 continue;
4106 }
4107 // Try to include as many elements from the mask as possible.
4108 int FirstMin = INT_MAX;
4109 bool SecondVecFound = false;
4110 for (int K : seq<int>(Limit)) {
4111 int Idx = Mask[I * PartSz + K];
4112 if (Idx == PoisonMaskElem) {
4113 Value *V = GatheredScalars[I * PartSz + K];
4114 if (isConstant(V) && !isa<PoisonValue>(V)) {
4115 SecondVecFound = true;
4116 break;
4117 }
4118 continue;
4119 }
4120 if (Idx < VF) {
4121 if (FirstMin > Idx)
4122 FirstMin = Idx;
4123 } else {
4124 SecondVecFound = true;
4125 break;
4126 }
4127 }
4128 FirstMin = (FirstMin / PartSz) * PartSz;
4129 // Shuffle of at least 2 vectors - ignore.
4130 if (SecondVecFound) {
4131 std::fill(Slice.begin(), Slice.end(), NumScalars);
4132 ShuffledSubMasks.set(I);
4133 continue;
4134 }
4135 for (int K : seq<int>(Limit)) {
4136 int Idx = Mask[I * PartSz + K];
4137 if (Idx == PoisonMaskElem)
4138 continue;
4139 Idx -= FirstMin;
4140 if (Idx >= PartSz) {
4141 SecondVecFound = true;
4142 break;
4143 }
4144 if (CurrentOrder[I * PartSz + Idx] >
4145 static_cast<unsigned>(I * PartSz + K) &&
4146 CurrentOrder[I * PartSz + Idx] !=
4147 static_cast<unsigned>(I * PartSz + Idx))
4148 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4149 }
4150 // Shuffle of at least 2 vectors - ignore.
4151 if (SecondVecFound) {
4152 std::fill(Slice.begin(), Slice.end(), NumScalars);
4153 ShuffledSubMasks.set(I);
4154 continue;
4155 }
4156 }
4157 };
4158 int PartSz = getPartNumElems(NumScalars, NumParts);
4159 if (!ExtractShuffles.empty())
4160 TransformMaskToOrder(
4161 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4162 if (!ExtractShuffles[I])
4163 return 0U;
4164 unsigned VF = 0;
4165 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4166 for (unsigned Idx : seq<unsigned>(Sz)) {
4167 int K = I * PartSz + Idx;
4168 if (ExtractMask[K] == PoisonMaskElem)
4169 continue;
4170 if (!TE.ReuseShuffleIndices.empty())
4171 K = TE.ReuseShuffleIndices[K];
4172 if (!TE.ReorderIndices.empty())
4173 K = std::distance(TE.ReorderIndices.begin(),
4174 find(TE.ReorderIndices, K));
4175 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4176 if (!EI)
4177 continue;
4178 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4179 ->getElementCount()
4180 .getKnownMinValue());
4181 }
4182 return VF;
4183 });
4184 // Check special corner case - single shuffle of the same entry.
4185 if (GatherShuffles.size() == 1 && NumParts != 1) {
4186 if (ShuffledSubMasks.any())
4187 return std::nullopt;
4188 PartSz = NumScalars;
4189 NumParts = 1;
4190 }
4191 if (!Entries.empty())
4192 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4193 if (!GatherShuffles[I])
4194 return 0U;
4195 return std::max(Entries[I].front()->getVectorFactor(),
4196 Entries[I].back()->getVectorFactor());
4197 });
4198 int NumUndefs =
4199 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4200 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4201 return std::nullopt;
4202 return std::move(CurrentOrder);
4203}
4204
4205static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4206 const TargetLibraryInfo &TLI,
4207 bool CompareOpcodes = true) {
4208 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4209 return false;
4210 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4211 if (!GEP1)
4212 return false;
4213 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4214 if (!GEP2)
4215 return false;
4216 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4217 ((isConstant(GEP1->getOperand(1)) &&
4218 isConstant(GEP2->getOperand(1))) ||
4219 !CompareOpcodes ||
4220 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4221 .getOpcode());
4222}
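// For illustration (hypothetical IR): two pointers such as
//   %p1 = getelementptr i32, ptr %base, i64 1
//   %p2 = getelementptr i32, ptr %base, i64 5
// are compatible: they share the underlying object %base and are both
// two-operand GEPs with constant indices. Pointers rooted at different
// underlying objects are rejected immediately.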
4223
4224/// Calculates minimal alignment as a common alignment.
4225template <typename T>
4226static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4227 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4228 for (Value *V : VL.drop_front())
4229 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4230 return CommonAlignment;
4231}
4232
4233/// Check if \p Order represents reverse order.
4234static bool isReverseOrder(ArrayRef<unsigned> Order) {
4235 unsigned Sz = Order.size();
4236 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4237 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4238 });
4239}
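// Example (hypothetical values): for Sz = 4, Order = {3, 2, 1, 0} is a reverse
// order; entries equal to Sz stand for unused lanes and are accepted as well,
// so {4, 2, 1, 0} is also treated as reversed, while {0, 1, 2, 3} is not.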
4240
4241/// Checks if the provided list of pointers \p Pointers represents the strided
4242/// pointers for type ElemTy. If they are not, std::nullopt is returned.
4243/// Otherwise, if \p Inst is not specified, an initialized optional value is
4244/// returned to show that the pointers represent strided pointers. If \p Inst
4245/// is specified, the runtime stride is materialized before the given \p Inst.
4246/// \returns std::nullopt if the pointers do not form a runtime strided access;
4247/// otherwise returns either nullptr or the actual stride value.
4248static std::optional<Value *>
4249calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4250 const DataLayout &DL, ScalarEvolution &SE,
4251 SmallVectorImpl<unsigned> &SortedIndices,
4252 Instruction *Inst = nullptr) {
4253 SmallVector<const SCEV *> SCEVs;
4254 const SCEV *PtrSCEVLowest = nullptr;
4255 const SCEV *PtrSCEVHighest = nullptr;
4256 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4257 // addresses).
4258 for (Value *Ptr : PointerOps) {
4259 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4260 if (!PtrSCEV)
4261 return std::nullopt;
4262 SCEVs.push_back(PtrSCEV);
4263 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4264 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4265 continue;
4266 }
4267 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4268 if (isa<SCEVCouldNotCompute>(Diff))
4269 return std::nullopt;
4270 if (Diff->isNonConstantNegative()) {
4271 PtrSCEVLowest = PtrSCEV;
4272 continue;
4273 }
4274 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4275 if (isa<SCEVCouldNotCompute>(Diff1))
4276 return std::nullopt;
4277 if (Diff1->isNonConstantNegative()) {
4278 PtrSCEVHighest = PtrSCEV;
4279 continue;
4280 }
4281 }
4282 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4283 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4284 if (isa<SCEVCouldNotCompute>(Dist))
4285 return std::nullopt;
4286 int Size = DL.getTypeStoreSize(ElemTy);
4287 auto TryGetStride = [&](const SCEV *Dist,
4288 const SCEV *Multiplier) -> const SCEV * {
4289 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4290 if (M->getOperand(0) == Multiplier)
4291 return M->getOperand(1);
4292 if (M->getOperand(1) == Multiplier)
4293 return M->getOperand(0);
4294 return nullptr;
4295 }
4296 if (Multiplier == Dist)
4297 return SE.getConstant(Dist->getType(), 1);
4298 return SE.getUDivExactExpr(Dist, Multiplier);
4299 };
4300 // Stride_in_elements = Dist / element_size * (num_elems - 1).
4301 const SCEV *Stride = nullptr;
4302 if (Size != 1 || SCEVs.size() > 2) {
4303 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4304 Stride = TryGetStride(Dist, Sz);
4305 if (!Stride)
4306 return std::nullopt;
4307 }
4308 if (!Stride || isa<SCEVConstant>(Stride))
4309 return std::nullopt;
4310 // Iterate through all pointers and check if all distances are
4311 // unique multiple of Stride.
4312 using DistOrdPair = std::pair<int64_t, int>;
4313 auto Compare = llvm::less_first();
4314 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4315 int Cnt = 0;
4316 bool IsConsecutive = true;
4317 for (const SCEV *PtrSCEV : SCEVs) {
4318 unsigned Dist = 0;
4319 if (PtrSCEV != PtrSCEVLowest) {
4320 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4321 const SCEV *Coeff = TryGetStride(Diff, Stride);
4322 if (!Coeff)
4323 return std::nullopt;
4324 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4325 if (!SC || isa<SCEVCouldNotCompute>(SC))
4326 return std::nullopt;
4327 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4328 SE.getMulExpr(Stride, SC)))
4329 ->isZero())
4330 return std::nullopt;
4331 Dist = SC->getAPInt().getZExtValue();
4332 }
4333 // If the strides are not the same or repeated, we can't vectorize.
4334 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4335 return std::nullopt;
4336 auto Res = Offsets.emplace(Dist, Cnt);
4337 if (!Res.second)
4338 return std::nullopt;
4339 // Consecutive order if the inserted element is the last one.
4340 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4341 ++Cnt;
4342 }
4343 if (Offsets.size() != SCEVs.size())
4344 return std::nullopt;
4345 SortedIndices.clear();
4346 if (!IsConsecutive) {
4347 // Fill SortedIndices array only if it is non-consecutive.
4348 SortedIndices.resize(PointerOps.size());
4349 Cnt = 0;
4350 for (const std::pair<int64_t, int> &Pair : Offsets) {
4351 SortedIndices[Cnt] = Pair.second;
4352 ++Cnt;
4353 }
4354 }
4355 if (!Inst)
4356 return nullptr;
4357 SCEVExpander Expander(SE, DL, "strided-load-vec");
4358 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4359}
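// Worked example (hypothetical pointers, element size 1): for PointerOps
// %p, %p + 2*%s, %p + %s, %p + 3*%s the overall distance is 3*%s, the
// candidate stride is %s, and every offset is a distinct multiple of it, so
// the stride is accepted; SortedIndices becomes {0, 2, 1, 3} because the
// inputs are not already in offset order, and when \p Inst is provided the
// runtime value of %s is expanded right before it.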
4360
4361static std::pair<InstructionCost, InstructionCost>
4362getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4363 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4364 Type *ScalarTy, VectorType *VecTy);
4365
4366BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4367 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4368 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4369 // Check that a vectorized load would load the same memory as a scalar
4370 // load. For example, we don't want to vectorize loads that are smaller
4371 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4372 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4373 // from such a struct, we read/write packed bits disagreeing with the
4374 // unvectorized version.
4375 Type *ScalarTy = VL0->getType();
4376
4377 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4378 return LoadsState::Gather;
4379
4380 // Make sure all loads in the bundle are simple - we can't vectorize
4381 // atomic or volatile loads.
4382 PointerOps.clear();
4383 const unsigned Sz = VL.size();
4384 PointerOps.resize(Sz);
4385 auto *POIter = PointerOps.begin();
4386 for (Value *V : VL) {
4387 auto *L = cast<LoadInst>(V);
4388 if (!L->isSimple())
4389 return LoadsState::Gather;
4390 *POIter = L->getPointerOperand();
4391 ++POIter;
4392 }
4393
4394 Order.clear();
4395 auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
4396 // Check the order of pointer operands or that all pointers are the same.
4397 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4398 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4399 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4400 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4401 "supported with VectorizeNonPowerOf2");
4402 return LoadsState::Gather;
4403 }
4404
4405 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4406 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4407 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4408 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4409 return LoadsState::StridedVectorize;
4410 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4411 return arePointersCompatible(P, PointerOps.front(), *TLI);
4412 })) {
4413 if (IsSorted) {
4414 Value *Ptr0;
4415 Value *PtrN;
4416 if (Order.empty()) {
4417 Ptr0 = PointerOps.front();
4418 PtrN = PointerOps.back();
4419 } else {
4420 Ptr0 = PointerOps[Order.front()];
4421 PtrN = PointerOps[Order.back()];
4422 }
4423 std::optional<int> Diff =
4424 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4425 // Check that the sorted loads are consecutive.
4426 if (static_cast<unsigned>(*Diff) == Sz - 1)
4427 return LoadsState::Vectorize;
4428 // Simple check if not a strided access - clear order.
4429 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4430 // Try to generate strided load node if:
4431 // 1. Target with strided load support is detected.
4432 // 2. The number of loads is greater than MinProfitableStridedLoads,
4433 // or the potential stride <= MaxProfitableLoadStride and the
4434 // potential stride is power-of-2 (to avoid perf regressions for the very
4435 // small number of loads) and max distance > number of loads, or potential
4436 // stride is -1.
4437 // 3. The loads are ordered, or number of unordered loads <=
4438 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4439 // (this check is to avoid extra costs for very expensive shuffles).
4440 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4441 (static_cast<unsigned>(std::abs(*Diff)) <=
4442 MaxProfitableLoadStride * Sz &&
4443 isPowerOf2_32(std::abs(*Diff)))) &&
4444 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4445 *Diff == -(static_cast<int>(Sz) - 1))) {
4446 int Stride = *Diff / static_cast<int>(Sz - 1);
4447 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4448 Align Alignment =
4449 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4450 ->getAlign();
4451 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4452 // Iterate through all pointers and check if all distances are
4453 // unique multiple of Dist.
4454 SmallSet<int, 4> Dists;
4455 for (Value *Ptr : PointerOps) {
4456 int Dist = 0;
4457 if (Ptr == PtrN)
4458 Dist = *Diff;
4459 else if (Ptr != Ptr0)
4460 Dist =
4461 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4462 // If the strides are not the same or repeated, we can't
4463 // vectorize.
4464 if (((Dist / Stride) * Stride) != Dist ||
4465 !Dists.insert(Dist).second)
4466 break;
4467 }
4468 if (Dists.size() == Sz)
4469 return LoadsState::StridedVectorize;
4470 }
4471 }
4472 }
4473 }
4474 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4475 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4476 unsigned MinVF = getMinVF(Sz);
4477 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4478 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4479 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4480 unsigned VectorizedCnt = 0;
4481 SmallVector<LoadsState> States;
4482 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4483 Cnt += VF, ++VectorizedCnt) {
4484 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4485 SmallVector<unsigned> Order;
4486 SmallVector<Value *> PointerOps;
4487 LoadsState LS =
4488 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4489 /*TryRecursiveCheck=*/false);
4490 // Check that the sorted loads are consecutive.
4491 if (LS == LoadsState::Gather)
4492 break;
4493 // If need the reorder - consider as high-cost masked gather for now.
4494 if ((LS == LoadsState::Vectorize ||
4495 LS == LoadsState::StridedVectorize) &&
4496 !Order.empty() && !isReverseOrder(Order))
4497 LS = LoadsState::Gather;
4498 States.push_back(LS);
4499 }
4500 // Can be vectorized later as a series of loads/insertelements.
4501 if (VectorizedCnt == VL.size() / VF) {
4502 // Compare masked gather cost and loads + insertsubvector costs.
4503 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4504 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4505 TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
4506 CostKind, ScalarTy, VecTy);
4507 InstructionCost MaskedGatherCost =
4508 TTI.getGatherScatterOpCost(
4509 Instruction::Load, VecTy,
4510 cast<LoadInst>(VL0)->getPointerOperand(),
4511 /*VariableMask=*/false, CommonAlignment, CostKind) +
4512 VectorGEPCost - ScalarGEPCost;
4513 InstructionCost VecLdCost = 0;
4514 auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
4515 for (auto [I, LS] : enumerate(States)) {
4516 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4517 switch (LS) {
4518 case LoadsState::Vectorize: {
4519 auto [ScalarGEPCost, VectorGEPCost] =
4520 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4521 LI0->getPointerOperand(), Instruction::Load,
4522 CostKind, ScalarTy, SubVecTy);
4523 VecLdCost += TTI.getMemoryOpCost(
4524 Instruction::Load, SubVecTy, LI0->getAlign(),
4525 LI0->getPointerAddressSpace(), CostKind,
4526                             TTI::OperandValueInfo()) +
4527                     VectorGEPCost - ScalarGEPCost;
4528 break;
4529 }
4530        case LoadsState::StridedVectorize: {
4531          auto [ScalarGEPCost, VectorGEPCost] =
4532 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4533 LI0->getPointerOperand(), Instruction::Load,
4534 CostKind, ScalarTy, SubVecTy);
4535 VecLdCost +=
4536              TTI.getStridedMemoryOpCost(
4537                  Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4538 /*VariableMask=*/false, CommonAlignment, CostKind) +
4539 VectorGEPCost - ScalarGEPCost;
4540 break;
4541 }
4542        case LoadsState::ScatterVectorize: {
4543          auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4544 TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4545 LI0->getPointerOperand(), Instruction::GetElementPtr,
4546 CostKind, ScalarTy, SubVecTy);
4547 VecLdCost +=
4548              TTI.getGatherScatterOpCost(
4549                  Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4550 /*VariableMask=*/false, CommonAlignment, CostKind) +
4551 VectorGEPCost - ScalarGEPCost;
4552 break;
4553 }
4554 case LoadsState::Gather:
4555          llvm_unreachable(
4556              "Expected only consecutive, strided or masked gather loads.");
4557 }
4558 SmallVector<int> ShuffleMask(VL.size());
4559 for (int Idx : seq<int>(0, VL.size()))
4560 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
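          // E.g., for VL.size() == 8, VF == 4 and I == 1 the mask is
          // {0, 1, 2, 3, 8, 9, 10, 11}, i.e. it inserts the newly loaded
          // subvector at offset I * VF into the accumulated vector.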
4561 VecLdCost +=
4562 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
4563 CostKind, I * VF, SubVecTy);
4564 }
4565        // If the masked gather cost is higher, it is better to vectorize as
4566        // loads + shuffles, so consider the bundle as a gather node. It will
4567        // be re-estimated later.
4568 if (MaskedGatherCost >= VecLdCost)
4569 return true;
4570 }
4571 }
4572 return false;
4573 };
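  // CheckForShuffledLoads answers whether a single masked gather is at least as
  // expensive as splitting VL into VF-wide sub-loads plus insert_subvector
  // shuffles for some VF; if so, the caller keeps the node as a gather so it
  // can be re-vectorized in smaller pieces later.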
4574 // TODO: need to improve analysis of the pointers, if not all of them are
4575 // GEPs or have > 2 operands, we end up with a gather node, which just
4576 // increases the cost.
4577 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4578 bool ProfitableGatherPointers =
4579 L && Sz > 2 &&
4580 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4581 return L->isLoopInvariant(V);
4582 })) <= Sz / 2;
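  // I.e., gathering the pointers is only considered profitable when there are
  // more than two loads inside a loop and at most half of the addresses are
  // loop-invariant, so most of the address computation cannot be hoisted
  // anyway.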
4583 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4584 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4585 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4586 (GEP && GEP->getNumOperands() == 2 &&
4587 isa<Constant, Instruction>(GEP->getOperand(1)));
4588 })) {
4589 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4590 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4591 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4592        // Check if the potential masked gather can be represented as a
4593        // series of loads + insertsubvectors.
4594 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4595            // If the masked gather cost is higher, it is better to vectorize
4596            // as loads + shuffles, so consider the bundle as a gather node.
4597            // It will be re-estimated later.
4598 return LoadsState::Gather;
4599 }
4600          return LoadsState::ScatterVectorize;
4601        }
4602 }
4603 }
4604
4605 return LoadsState::Gather;
4606}
4607
4608static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4609                                   const DataLayout &DL, ScalarEvolution &SE,
4610 SmallVectorImpl<unsigned> &SortedIndices) {
4611  assert(llvm::all_of(
4612             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4613 "Expected list of pointer operands.");
4614 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4615 // Ptr into, sort and return the sorted indices with values next to one
4616 // another.
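  // For example, for VL == {A, B, A + 1, B + 1} (offsets in elements) the map
  // becomes {A: [(A, 0, 0), (A + 1, 1, 2)], B: [(B, 0, 1), (B + 1, 1, 3)]} and
  // the resulting SortedIndices is {0, 2, 1, 3}, placing the pointers that
  // share a base next to one another.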
4617  MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4618  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4619
4620 unsigned Cnt = 1;
4621 for (Value *Ptr : VL.drop_front()) {
4622 bool Found = any_of(Bases, [&](auto &Base) {
4623 std::optional<int> Diff =
4624 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4625 /*StrictCheck=*/true);
4626 if (!Diff)
4627 return false;
4628
4629 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4630 return true;
4631 });
4632
4633 if (!Found) {
4634 // If we haven't found enough to usefully cluster, return early.
4635 if (Bases.size() > VL.size() / 2 - 1)
4636 return false;
4637
4638 // Not found already - add a new Base
4639 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4640 }
4641 }
4642
4643  // For each of the bases, sort the pointers by Offset and check if any of
4644  // the bases become consecutively allocated.
4645 bool AnyConsecutive = false;
4646 for (auto &Base : Bases) {
4647 auto &Vec = Base.second;
4648 if (Vec.size() > 1) {
4649 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4650 const std::tuple<Value *, int, unsigned> &Y) {
4651 return std::get<1>(X) < std::get<1>(Y);
4652 });
4653 int InitialOffset = std::get<1>(Vec[0]);
4654 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4655 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4656 });
4657 }
4658 }
4659
4660  // Fill the SortedIndices array only if it looks worthwhile to sort the ptrs.
4661 SortedIndices.clear();
4662 if (!AnyConsecutive)
4663 return false;
4664
4665 for (auto &Base : Bases) {
4666 for (auto &T : Base.second)
4667 SortedIndices.push_back(std::get<2>(T));
4668 }
4669
4670 assert(SortedIndices.size() == VL.size() &&
4671 "Expected SortedIndices to be the size of VL");
4672 return true;
4673}
4674
4675std::optional<BoUpSLP::OrdersType>
4676BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4677 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4678 Type *ScalarTy = TE.Scalars[0]->getType();
4679
4680  SmallVector<Value *> Ptrs;
4681  Ptrs.reserve(TE.Scalars.size());
4682 for (Value *V : TE.Scalars) {
4683 auto *L = dyn_cast<LoadInst>(V);
4684 if (!L || !L->isSimple())
4685 return std::nullopt;
4686 Ptrs.push_back(L->getPointerOperand());
4687 }
4688
4689 BoUpSLP::OrdersType Order;
4690 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4691 return std::move(Order);
4692 return std::nullopt;
4693}
4694
4695/// Check if two insertelement instructions are from the same buildvector.
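/// E.g., given the chain
///   %b0 = insertelement <2 x i32> poison, i32 %x, i32 0
///   %b1 = insertelement <2 x i32> %b0, i32 %y, i32 1
/// the pair (%b1, %b0), with GetBaseOperand returning the vector operand, is
/// recognized as part of the same buildvector, while chains that reuse the
/// same lane are rejected.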
4696static bool areTwoInsertFromSameBuildVector(
4697    InsertElementInst *VU, InsertElementInst *V,
4698    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4699 // Instructions must be from the same basic blocks.
4700 if (VU->getParent() != V->getParent())
4701 return false;
4702 // Checks if 2 insertelements are from the same buildvector.
4703 if (VU->getType() != V->getType())
4704 return false;
4705  // Inserts with multiple uses are separate nodes.
4706 if (!VU->hasOneUse() && !V->hasOneUse())
4707 return false;
4708 auto *IE1 = VU;
4709 auto *IE2 = V;
4710 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4711 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4712 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4713 return false;
4714 // Go through the vector operand of insertelement instructions trying to find
4715 // either VU as the original vector for IE2 or V as the original vector for
4716 // IE1.
4717 SmallBitVector ReusedIdx(
4718 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4719 bool IsReusedIdx = false;
4720 do {
4721 if (IE2 == VU && !IE1)
4722 return VU->hasOneUse();
4723 if (IE1 == V && !IE2)
4724 return V->hasOneUse();
4725 if (IE1 && IE1 != V) {
4726 unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4727 IsReusedIdx |= ReusedIdx.test(Idx1);
4728 ReusedIdx.set(Idx1);
4729 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4730 IE1 = nullptr;
4731 else
4732 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4733 }
4734 if (IE2 && IE2 != VU) {
4735 unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4736 IsReusedIdx |= ReusedIdx.test(Idx2);
4737 ReusedIdx.set(Idx2);
4738 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4739 IE2 = nullptr;
4740 else
4741 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4742 }
4743 } while (!IsReusedIdx && (IE1 || IE2));
4744 return false;
4745}
4746
4747std::optional<BoUpSLP::OrdersType>
4748BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4749 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4750 if (TE.isNonPowOf2Vec())
4751 return std::nullopt;
4752
4753  // No need to reorder if we need to shuffle reuses - the node still has to
4754  // be shuffled anyway.
4755 if (!TE.ReuseShuffleIndices.empty()) {
4756 if (isSplat(TE.Scalars))
4757 return std::nullopt;
4758 // Check if reuse shuffle indices can be improved by reordering.
4759    // For this, check that the reuse mask is "clustered", i.e. each scalar
4760    // value is used once in each submask of size <number_of_scalars>.
4761 // Example: 4 scalar values.
4762 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4763 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4764 // element 3 is used twice in the second submask.
4765 unsigned Sz = TE.Scalars.size();
4766 if (TE.State == TreeEntry::NeedToGather) {
4767 if (std::optional<OrdersType> CurrentOrder =
4768            findReusedOrderedScalars(TE)) {
4769      SmallVector<int> Mask;
4770 fixupOrderingIndices(*CurrentOrder);
4771 inversePermutation(*CurrentOrder, Mask);
4772 ::addMask(Mask, TE.ReuseShuffleIndices);
4773 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4774 unsigned Sz = TE.Scalars.size();
4775 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4776 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4777 if (Idx != PoisonMaskElem)
4778 Res[Idx + K * Sz] = I + K * Sz;
4779 }
4780 return std::move(Res);
4781 }
4782 }
4783 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4784        TTI->getNumberOfParts(FixedVectorType::get(
4785            TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4786 return std::nullopt;
4787 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4788 Sz)) {
4789 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4790 if (TE.ReorderIndices.empty())
4791 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4792 else
4793 inversePermutation(TE.ReorderIndices, ReorderMask);
4794 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4795 unsigned VF = ReorderMask.size();
4796 OrdersType ResOrder(VF, VF);
4797 unsigned NumParts = divideCeil(VF, Sz);
4798 SmallBitVector UsedVals(NumParts);
4799 for (unsigned I = 0; I < VF; I += Sz) {
4800 int Val = PoisonMaskElem;
4801 unsigned UndefCnt = 0;
4802 unsigned Limit = std::min(Sz, VF - I);
4803 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
4804 [&](int Idx) {
4805 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4806 Val = Idx;
4807 if (Idx == PoisonMaskElem)
4808 ++UndefCnt;
4809 return Idx != PoisonMaskElem && Idx != Val;
4810 }) ||
4811 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4812 UndefCnt > Sz / 2)
4813 return std::nullopt;
4814 UsedVals.set(Val);
4815 for (unsigned K = 0; K < NumParts; ++K)
4816 ResOrder[Val + Sz * K] = I + K;
4817 }
4818 return std::move(ResOrder);
4819 }
4820 unsigned VF = TE.getVectorFactor();
4821    // Try to build the correct order for extractelement instructions.
4822 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4823 TE.ReuseShuffleIndices.end());
4824 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4825 all_of(TE.Scalars, [Sz](Value *V) {
4826 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4827 return Idx && *Idx < Sz;
4828 })) {
4829 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4830 if (TE.ReorderIndices.empty())
4831 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4832 else
4833 inversePermutation(TE.ReorderIndices, ReorderMask);
4834 for (unsigned I = 0; I < VF; ++I) {
4835 int &Idx = ReusedMask[I];
4836 if (Idx == PoisonMaskElem)
4837 continue;
4838 Value *V = TE.Scalars[ReorderMask[Idx]];
4839 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4840 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4841 }
4842 }
4843    // Build an order of VF size; the reuse shuffles need to be reordered, and
4844    // they are always of VF size.
4845 OrdersType ResOrder(VF);
4846 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4847 auto *It = ResOrder.begin();
4848 for (unsigned K = 0; K < VF; K += Sz) {
4849 OrdersType CurrentOrder(TE.ReorderIndices);
4850 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4851 if (SubMask.front() == PoisonMaskElem)
4852 std::iota(SubMask.begin(), SubMask.end(), 0);
4853 reorderOrder(CurrentOrder, SubMask);
4854 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4855 std::advance(It, Sz);
4856 }
4857 if (TE.State == TreeEntry::NeedToGather &&
4858 all_of(enumerate(ResOrder),
4859 [](const auto &Data) { return Data.index() == Data.value(); }))
4860 return std::nullopt; // No need to reorder.
4861 return std::move(ResOrder);
4862 }
4863 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4864 any_of(TE.UserTreeIndices,
4865 [](const EdgeInfo &EI) {
4866 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4867 }) &&
4868 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
4869 return std::nullopt;
4870 if ((TE.State == TreeEntry::Vectorize ||
4871 TE.State == TreeEntry::StridedVectorize) &&
4872 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4873 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4874 !TE.isAltShuffle())
4875 return TE.ReorderIndices;
4876 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4877 auto PHICompare = [&](unsigned I1, unsigned I2) {
4878 Value *V1 = TE.Scalars[I1];
4879 Value *V2 = TE.Scalars[I2];
4880 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4881 return false;
4882 if (V1->getNumUses() < V2->getNumUses())
4883 return true;
4884 if (V1->getNumUses() > V2->getNumUses())
4885 return false;
4886 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4887 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4888 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4889 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4890          if (!areTwoInsertFromSameBuildVector(
4891                  IE1, IE2,
4892 [](InsertElementInst *II) { return II->getOperand(0); }))
4893 return I1 < I2;
4894 return getInsertIndex(IE1) < getInsertIndex(IE2);
4895 }
4896 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4897 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4898 if (EE1->getOperand(0) != EE2->getOperand(0))
4899 return I1 < I2;
4900 return getInsertIndex(EE1) < getInsertIndex(EE2);
4901 }
4902 return I1 < I2;
4903 };
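    // PHICompare sorts the phi scalars by ascending number of uses; ties are
    // broken by looking at the first user, preferring the lane order of the
    // buildvector or extract sequence it feeds and falling back to the
    // original order otherwise.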
4904 auto IsIdentityOrder = [](const OrdersType &Order) {
4905 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4906 if (Idx != Order[Idx])
4907 return false;
4908 return true;
4909 };
4910 if (!TE.ReorderIndices.empty())
4911 return TE.ReorderIndices;
4912    DenseMap<unsigned, unsigned> PhiToId;
4913    SmallVector<unsigned> Phis(TE.Scalars.size());
4914 std::iota(Phis.begin(), Phis.end(), 0);
4915 OrdersType ResOrder(TE.Scalars.size());
4916 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4917 PhiToId[Id] = Id;
4918 stable_sort(Phis, PHICompare);
4919 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4920 ResOrder[Id] = PhiToId[Phis[Id]];
4921 if (IsIdentityOrder(ResOrder))
4922 return std::nullopt; // No need to reorder.
4923 return std::move(ResOrder);
4924 }
4925 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4926 allSameType(TE.Scalars)) {
4927 // TODO: add analysis of other gather nodes with extractelement
4928 // instructions and other values/instructions, not only undefs.
4929 if ((TE.getOpcode() == Instruction::ExtractElement ||
4930 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4931 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4932 all_of(TE.Scalars, [](Value *V) {
4933 auto *EE = dyn_cast<ExtractElementInst>(V);
4934 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4935 })) {
4936 // Check that gather of extractelements can be represented as
4937 // just a shuffle of a single vector.
4938 OrdersType CurrentOrder;
4939 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4940 /*ResizeAllowed=*/true);
4941 if (Reuse || !CurrentOrder.empty())
4942 return std::move(CurrentOrder);
4943 }
4944 // If the gather node is <undef, v, .., poison> and
4945 // insertelement poison, v, 0 [+ permute]
4946 // is cheaper than
4947 // insertelement poison, v, n - try to reorder.
4948    // If rotating the whole graph, exclude the permute cost since the whole
4949    // graph might be transformed.
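    // E.g., for TE.Scalars == {undef, undef, v, undef} the non-undef value sits
    // at index 2; the code below moves it to lane 0 only when the cost of
    // "insertelement poison, v, 0" plus the (possibly excluded) permute is
    // lower than the cost of "insertelement poison, v, 2".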
4950 int Sz = TE.Scalars.size();
4951 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4952 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4953 const auto *It =
4954 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4955 if (It == TE.Scalars.begin())
4956 return OrdersType();
4957 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4958 if (It != TE.Scalars.end()) {
4959 OrdersType Order(Sz, Sz);
4960 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4961 Order[Idx] = 0;
4962 fixupOrderingIndices(Order);
4963 SmallVector<int> Mask;
4964 inversePermutation(Order, Mask);
4965 InstructionCost PermuteCost =
4966 TopToBottom
4967 ? 0
4968                : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
4969        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4970 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4971 PoisonValue::get(Ty), *It);
4972 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4973 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4974 PoisonValue::get(Ty), *It);
4975 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4976 OrdersType Order(Sz, Sz);
4977 Order[Idx] = 0;