LLVM 18.0.0git
SLPVectorizer.cpp
Go to the documentation of this file.
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
25#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/Statistic.h"
31#include "llvm/ADT/iterator.h"
50#include "llvm/IR/Attributes.h"
51#include "llvm/IR/BasicBlock.h"
52#include "llvm/IR/Constant.h"
53#include "llvm/IR/Constants.h"
54#include "llvm/IR/DataLayout.h"
56#include "llvm/IR/Dominators.h"
57#include "llvm/IR/Function.h"
58#include "llvm/IR/IRBuilder.h"
59#include "llvm/IR/InstrTypes.h"
60#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Intrinsics.h"
64#include "llvm/IR/Module.h"
65#include "llvm/IR/Operator.h"
67#include "llvm/IR/Type.h"
68#include "llvm/IR/Use.h"
69#include "llvm/IR/User.h"
70#include "llvm/IR/Value.h"
71#include "llvm/IR/ValueHandle.h"
72#ifdef EXPENSIVE_CHECKS
73#include "llvm/IR/Verifier.h"
74#endif
75#include "llvm/Pass.h"
80#include "llvm/Support/Debug.h"
90#include <algorithm>
91#include <cassert>
92#include <cstdint>
93#include <iterator>
94#include <memory>
95#include <optional>
96#include <set>
97#include <string>
98#include <tuple>
99#include <utility>
100
101using namespace llvm;
102using namespace llvm::PatternMatch;
103using namespace slpvectorizer;
104
105#define SV_NAME "slp-vectorizer"
106#define DEBUG_TYPE "SLP"
107
108STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
109
110static cl::opt<bool>
111 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
112 cl::desc("Run the SLP vectorization passes"));
113
114static cl::opt<int>
116 cl::desc("Only vectorize if you gain more than this "
117 "number "));
118
119static cl::opt<bool>
120ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
121 cl::desc("Attempt to vectorize horizontal reductions"));
122
124 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
125 cl::desc(
126 "Attempt to vectorize horizontal reductions feeding into a store"));
127
128// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
129// even if we match a reduction but do not vectorize in the end.
131 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
132 cl::desc("Allow optimization of original scalar identity operations on "
133 "matched horizontal reductions."));
134
135static cl::opt<int>
137 cl::desc("Attempt to vectorize for this register size in bits"));
138
141 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
142
143/// Limits the size of scheduling regions in a block.
144/// It avoid long compile times for _very_ large blocks where vector
145/// instructions are spread over a wide range.
146/// This limit is way higher than needed by real-world functions.
147static cl::opt<int>
148ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
149 cl::desc("Limit the size of the SLP scheduling region per block"));
150
152 "slp-min-reg-size", cl::init(128), cl::Hidden,
153 cl::desc("Attempt to vectorize for this register size in bits"));
154
156 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
157 cl::desc("Limit the recursion depth when building a vectorizable tree"));
158
160 "slp-min-tree-size", cl::init(3), cl::Hidden,
161 cl::desc("Only vectorize small trees if they are fully vectorizable"));
162
163// The maximum depth that the look-ahead score heuristic will explore.
164// The higher this value, the higher the compilation time overhead.
166 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
167 cl::desc("The maximum look-ahead depth for operand reordering scores"));
168
169// The maximum depth that the look-ahead score heuristic will explore
170// when it probing among candidates for vectorization tree roots.
171// The higher this value, the higher the compilation time overhead but unlike
172// similar limit for operands ordering this is less frequently used, hence
173// impact of higher value is less noticeable.
175 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
176 cl::desc("The maximum look-ahead depth for searching best rooting option"));
177
178static cl::opt<bool>
179 ViewSLPTree("view-slp-tree", cl::Hidden,
180 cl::desc("Display the SLP trees with Graphviz"));
181
182// Limit the number of alias checks. The limit is chosen so that
183// it has no negative effect on the llvm benchmarks.
184static const unsigned AliasedCheckLimit = 10;
185
186// Another limit for the alias checks: The maximum distance between load/store
187// instructions where alias checks are done.
188// This limit is useful for very large basic blocks.
189static const unsigned MaxMemDepDistance = 160;
190
191/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
192/// regions to be handled.
193static const int MinScheduleRegionSize = 16;
194
195/// Predicate for the element types that the SLP vectorizer supports.
196///
197/// The most important thing to filter here are types which are invalid in LLVM
198/// vectors. We also filter target specific types which have absolutely no
199/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
200/// avoids spending time checking the cost model and realizing that they will
201/// be inevitably scalarized.
202static bool isValidElementType(Type *Ty) {
203 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
204 !Ty->isPPC_FP128Ty();
205}
206
207/// \returns True if the value is a constant (but not globals/constant
208/// expressions).
209static bool isConstant(Value *V) {
210 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
211}
212
213/// Checks if \p V is one of vector-like instructions, i.e. undef,
214/// insertelement/extractelement with constant indices for fixed vector type or
215/// extractvalue instruction.
217 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
218 !isa<ExtractValueInst, UndefValue>(V))
219 return false;
220 auto *I = dyn_cast<Instruction>(V);
221 if (!I || isa<ExtractValueInst>(I))
222 return true;
223 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
224 return false;
225 if (isa<ExtractElementInst>(I))
226 return isConstant(I->getOperand(1));
227 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
228 return isConstant(I->getOperand(2));
229}
230
231#if !defined(NDEBUG)
232/// Print a short descriptor of the instruction bundle suitable for debug output.
233static std::string shortBundleName(ArrayRef<Value *> VL) {
234 std::string Result;
235 raw_string_ostream OS(Result);
236 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
237 OS.flush();
238 return Result;
239}
240#endif
241
242/// \returns true if all of the instructions in \p VL are in the same block or
243/// false otherwise.
245 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
246 if (!I0)
247 return false;
249 return true;
250
251 BasicBlock *BB = I0->getParent();
252 for (int I = 1, E = VL.size(); I < E; I++) {
253 auto *II = dyn_cast<Instruction>(VL[I]);
254 if (!II)
255 return false;
256
257 if (BB != II->getParent())
258 return false;
259 }
260 return true;
261}
262
263/// \returns True if all of the values in \p VL are constants (but not
264/// globals/constant expressions).
266 // Constant expressions and globals can't be vectorized like normal integer/FP
267 // constants.
268 return all_of(VL, isConstant);
269}
270
271/// \returns True if all of the values in \p VL are identical or some of them
272/// are UndefValue.
273static bool isSplat(ArrayRef<Value *> VL) {
274 Value *FirstNonUndef = nullptr;
275 for (Value *V : VL) {
276 if (isa<UndefValue>(V))
277 continue;
278 if (!FirstNonUndef) {
279 FirstNonUndef = V;
280 continue;
281 }
282 if (V != FirstNonUndef)
283 return false;
284 }
285 return FirstNonUndef != nullptr;
286}
287
288/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
290 if (auto *Cmp = dyn_cast<CmpInst>(I))
291 return Cmp->isCommutative();
292 if (auto *BO = dyn_cast<BinaryOperator>(I))
293 return BO->isCommutative();
294 // TODO: This should check for generic Instruction::isCommutative(), but
295 // we need to confirm that the caller code correctly handles Intrinsics
296 // for example (does not have 2 operands).
297 return false;
298}
299
300/// \returns inserting index of InsertElement or InsertValue instruction,
301/// using Offset as base offset for index.
302static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
303 unsigned Offset = 0) {
304 int Index = Offset;
305 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
306 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
307 if (!VT)
308 return std::nullopt;
309 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
310 if (!CI)
311 return std::nullopt;
312 if (CI->getValue().uge(VT->getNumElements()))
313 return std::nullopt;
314 Index *= VT->getNumElements();
315 Index += CI->getZExtValue();
316 return Index;
317 }
318
319 const auto *IV = cast<InsertValueInst>(InsertInst);
320 Type *CurrentType = IV->getType();
321 for (unsigned I : IV->indices()) {
322 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
323 Index *= ST->getNumElements();
324 CurrentType = ST->getElementType(I);
325 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
326 Index *= AT->getNumElements();
327 CurrentType = AT->getElementType();
328 } else {
329 return std::nullopt;
330 }
331 Index += I;
332 }
333 return Index;
334}
335
336namespace {
337/// Specifies the way the mask should be analyzed for undefs/poisonous elements
338/// in the shuffle mask.
339enum class UseMask {
340 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
341 ///< check for the mask elements for the first argument (mask
342 ///< indices are in range [0:VF)).
343 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
344 ///< for the mask elements for the second argument (mask indices
345 ///< are in range [VF:2*VF))
346 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
347 ///< future shuffle elements and mark them as ones as being used
348 ///< in future. Non-undef elements are considered as unused since
349 ///< they're already marked as used in the mask.
350};
351} // namespace
352
353/// Prepares a use bitset for the given mask either for the first argument or
354/// for the second.
356 UseMask MaskArg) {
357 SmallBitVector UseMask(VF, true);
358 for (auto [Idx, Value] : enumerate(Mask)) {
359 if (Value == PoisonMaskElem) {
360 if (MaskArg == UseMask::UndefsAsMask)
361 UseMask.reset(Idx);
362 continue;
363 }
364 if (MaskArg == UseMask::FirstArg && Value < VF)
365 UseMask.reset(Value);
366 else if (MaskArg == UseMask::SecondArg && Value >= VF)
367 UseMask.reset(Value - VF);
368 }
369 return UseMask;
370}
371
372/// Checks if the given value is actually an undefined constant vector.
373/// Also, if the \p UseMask is not empty, tries to check if the non-masked
374/// elements actually mask the insertelement buildvector, if any.
375template <bool IsPoisonOnly = false>
377 const SmallBitVector &UseMask = {}) {
378 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
379 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
380 if (isa<T>(V))
381 return Res;
382 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
383 if (!VecTy)
384 return Res.reset();
385 auto *C = dyn_cast<Constant>(V);
386 if (!C) {
387 if (!UseMask.empty()) {
388 const Value *Base = V;
389 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
390 Base = II->getOperand(0);
391 if (isa<T>(II->getOperand(1)))
392 continue;
393 std::optional<unsigned> Idx = getInsertIndex(II);
394 if (!Idx)
395 continue;
396 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
397 Res.reset(*Idx);
398 }
399 // TODO: Add analysis for shuffles here too.
400 if (V == Base) {
401 Res.reset();
402 } else {
403 SmallBitVector SubMask(UseMask.size(), false);
404 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
405 }
406 } else {
407 Res.reset();
408 }
409 return Res;
410 }
411 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
412 if (Constant *Elem = C->getAggregateElement(I))
413 if (!isa<T>(Elem) &&
414 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
415 Res.reset(I);
416 }
417 return Res;
418}
419
420/// Checks if the vector of instructions can be represented as a shuffle, like:
421/// %x0 = extractelement <4 x i8> %x, i32 0
422/// %x3 = extractelement <4 x i8> %x, i32 3
423/// %y1 = extractelement <4 x i8> %y, i32 1
424/// %y2 = extractelement <4 x i8> %y, i32 2
425/// %x0x0 = mul i8 %x0, %x0
426/// %x3x3 = mul i8 %x3, %x3
427/// %y1y1 = mul i8 %y1, %y1
428/// %y2y2 = mul i8 %y2, %y2
429/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
430/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
431/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
432/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
433/// ret <4 x i8> %ins4
434/// can be transformed into:
435/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
436/// i32 6>
437/// %2 = mul <4 x i8> %1, %1
438/// ret <4 x i8> %2
439/// Mask will return the Shuffle Mask equivalent to the extracted elements.
440/// TODO: Can we split off and reuse the shuffle mask detection from
441/// ShuffleVectorInst/getShuffleCost?
442static std::optional<TargetTransformInfo::ShuffleKind>
444 const auto *It =
445 find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
446 if (It == VL.end())
447 return std::nullopt;
448 auto *EI0 = cast<ExtractElementInst>(*It);
449 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
450 return std::nullopt;
451 unsigned Size =
452 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
453 Value *Vec1 = nullptr;
454 Value *Vec2 = nullptr;
455 enum ShuffleMode { Unknown, Select, Permute };
456 ShuffleMode CommonShuffleMode = Unknown;
457 Mask.assign(VL.size(), PoisonMaskElem);
458 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
459 // Undef can be represented as an undef element in a vector.
460 if (isa<UndefValue>(VL[I]))
461 continue;
462 auto *EI = cast<ExtractElementInst>(VL[I]);
463 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
464 return std::nullopt;
465 auto *Vec = EI->getVectorOperand();
466 // We can extractelement from undef or poison vector.
467 if (isUndefVector(Vec).all())
468 continue;
469 // All vector operands must have the same number of vector elements.
470 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
471 return std::nullopt;
472 if (isa<UndefValue>(EI->getIndexOperand()))
473 continue;
474 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
475 if (!Idx)
476 return std::nullopt;
477 // Undefined behavior if Idx is negative or >= Size.
478 if (Idx->getValue().uge(Size))
479 continue;
480 unsigned IntIdx = Idx->getValue().getZExtValue();
481 Mask[I] = IntIdx;
482 // For correct shuffling we have to have at most 2 different vector operands
483 // in all extractelement instructions.
484 if (!Vec1 || Vec1 == Vec) {
485 Vec1 = Vec;
486 } else if (!Vec2 || Vec2 == Vec) {
487 Vec2 = Vec;
488 Mask[I] += Size;
489 } else {
490 return std::nullopt;
491 }
492 if (CommonShuffleMode == Permute)
493 continue;
494 // If the extract index is not the same as the operation number, it is a
495 // permutation.
496 if (IntIdx != I) {
497 CommonShuffleMode = Permute;
498 continue;
499 }
500 CommonShuffleMode = Select;
501 }
502 // If we're not crossing lanes in different vectors, consider it as blending.
503 if (CommonShuffleMode == Select && Vec2)
505 // If Vec2 was never used, we have a permutation of a single vector, otherwise
506 // we have permutation of 2 vectors.
509}
510
511/// \returns True if Extract{Value,Element} instruction extracts element Idx.
512static std::optional<unsigned> getExtractIndex(Instruction *E) {
513 unsigned Opcode = E->getOpcode();
514 assert((Opcode == Instruction::ExtractElement ||
515 Opcode == Instruction::ExtractValue) &&
516 "Expected extractelement or extractvalue instruction.");
517 if (Opcode == Instruction::ExtractElement) {
518 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
519 if (!CI)
520 return std::nullopt;
521 return CI->getZExtValue();
522 }
523 auto *EI = cast<ExtractValueInst>(E);
524 if (EI->getNumIndices() != 1)
525 return std::nullopt;
526 return *EI->idx_begin();
527}
528
529namespace {
530
531/// Main data required for vectorization of instructions.
532struct InstructionsState {
533 /// The very first instruction in the list with the main opcode.
534 Value *OpValue = nullptr;
535
536 /// The main/alternate instruction.
537 Instruction *MainOp = nullptr;
538 Instruction *AltOp = nullptr;
539
540 /// The main/alternate opcodes for the list of instructions.
541 unsigned getOpcode() const {
542 return MainOp ? MainOp->getOpcode() : 0;
543 }
544
545 unsigned getAltOpcode() const {
546 return AltOp ? AltOp->getOpcode() : 0;
547 }
548
549 /// Some of the instructions in the list have alternate opcodes.
550 bool isAltShuffle() const { return AltOp != MainOp; }
551
552 bool isOpcodeOrAlt(Instruction *I) const {
553 unsigned CheckedOpcode = I->getOpcode();
554 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
555 }
556
557 InstructionsState() = delete;
558 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
559 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
560};
561
562} // end anonymous namespace
563
564/// Chooses the correct key for scheduling data. If \p Op has the same (or
565/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
566/// OpValue.
567static Value *isOneOf(const InstructionsState &S, Value *Op) {
568 auto *I = dyn_cast<Instruction>(Op);
569 if (I && S.isOpcodeOrAlt(I))
570 return Op;
571 return S.OpValue;
572}
573
574/// \returns true if \p Opcode is allowed as part of the main/alternate
575/// instruction for SLP vectorization.
576///
577/// Example of unsupported opcode is SDIV that can potentially cause UB if the
578/// "shuffled out" lane would result in division by zero.
579static bool isValidForAlternation(unsigned Opcode) {
581 return false;
582
583 return true;
584}
585
586static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
587 const TargetLibraryInfo &TLI,
588 unsigned BaseIndex = 0);
589
590/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
591/// compatible instructions or constants, or just some other regular values.
592static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
593 Value *Op1, const TargetLibraryInfo &TLI) {
594 return (isConstant(BaseOp0) && isConstant(Op0)) ||
595 (isConstant(BaseOp1) && isConstant(Op1)) ||
596 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
597 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
598 BaseOp0 == Op0 || BaseOp1 == Op1 ||
599 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
600 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
601}
602
603/// \returns true if a compare instruction \p CI has similar "look" and
604/// same predicate as \p BaseCI, "as is" or with its operands and predicate
605/// swapped, false otherwise.
606static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
607 const TargetLibraryInfo &TLI) {
608 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
609 "Assessing comparisons of different types?");
610 CmpInst::Predicate BasePred = BaseCI->getPredicate();
611 CmpInst::Predicate Pred = CI->getPredicate();
613
614 Value *BaseOp0 = BaseCI->getOperand(0);
615 Value *BaseOp1 = BaseCI->getOperand(1);
616 Value *Op0 = CI->getOperand(0);
617 Value *Op1 = CI->getOperand(1);
618
619 return (BasePred == Pred &&
620 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
621 (BasePred == SwappedPred &&
622 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
623}
624
625/// \returns analysis of the Instructions in \p VL described in
626/// InstructionsState, the Opcode that we suppose the whole list
627/// could be vectorized even if its structure is diverse.
628static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
629 const TargetLibraryInfo &TLI,
630 unsigned BaseIndex) {
631 // Make sure these are all Instructions.
632 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
633 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
634
635 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
636 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
637 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
638 CmpInst::Predicate BasePred =
639 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
641 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
642 unsigned AltOpcode = Opcode;
643 unsigned AltIndex = BaseIndex;
644
645 // Check for one alternate opcode from another BinaryOperator.
646 // TODO - generalize to support all operators (types, calls etc.).
647 auto *IBase = cast<Instruction>(VL[BaseIndex]);
648 Intrinsic::ID BaseID = 0;
649 SmallVector<VFInfo> BaseMappings;
650 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
652 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
653 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
654 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
655 }
656 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
657 auto *I = cast<Instruction>(VL[Cnt]);
658 unsigned InstOpcode = I->getOpcode();
659 if (IsBinOp && isa<BinaryOperator>(I)) {
660 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
661 continue;
662 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
664 AltOpcode = InstOpcode;
665 AltIndex = Cnt;
666 continue;
667 }
668 } else if (IsCastOp && isa<CastInst>(I)) {
669 Value *Op0 = IBase->getOperand(0);
670 Type *Ty0 = Op0->getType();
671 Value *Op1 = I->getOperand(0);
672 Type *Ty1 = Op1->getType();
673 if (Ty0 == Ty1) {
674 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
675 continue;
676 if (Opcode == AltOpcode) {
678 isValidForAlternation(InstOpcode) &&
679 "Cast isn't safe for alternation, logic needs to be updated!");
680 AltOpcode = InstOpcode;
681 AltIndex = Cnt;
682 continue;
683 }
684 }
685 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
686 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
687 Type *Ty0 = BaseInst->getOperand(0)->getType();
688 Type *Ty1 = Inst->getOperand(0)->getType();
689 if (Ty0 == Ty1) {
690 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
691 // Check for compatible operands. If the corresponding operands are not
692 // compatible - need to perform alternate vectorization.
693 CmpInst::Predicate CurrentPred = Inst->getPredicate();
694 CmpInst::Predicate SwappedCurrentPred =
695 CmpInst::getSwappedPredicate(CurrentPred);
696
697 if (E == 2 &&
698 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
699 continue;
700
701 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
702 continue;
703 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
704 if (AltIndex != BaseIndex) {
705 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
706 continue;
707 } else if (BasePred != CurrentPred) {
708 assert(
709 isValidForAlternation(InstOpcode) &&
710 "CmpInst isn't safe for alternation, logic needs to be updated!");
711 AltIndex = Cnt;
712 continue;
713 }
714 CmpInst::Predicate AltPred = AltInst->getPredicate();
715 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
716 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
717 continue;
718 }
719 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
720 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
721 if (Gep->getNumOperands() != 2 ||
722 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
723 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
724 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
726 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
727 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
728 auto *BaseLI = cast<LoadInst>(IBase);
729 if (!LI->isSimple() || !BaseLI->isSimple())
730 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
731 } else if (auto *Call = dyn_cast<CallInst>(I)) {
732 auto *CallBase = cast<CallInst>(IBase);
733 if (Call->getCalledFunction() != CallBase->getCalledFunction())
734 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
735 if (Call->hasOperandBundles() &&
736 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
737 Call->op_begin() + Call->getBundleOperandsEndIndex(),
738 CallBase->op_begin() +
740 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
742 if (ID != BaseID)
743 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
744 if (!ID) {
745 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
746 if (Mappings.size() != BaseMappings.size() ||
747 Mappings.front().ISA != BaseMappings.front().ISA ||
748 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
749 Mappings.front().VectorName != BaseMappings.front().VectorName ||
750 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
751 Mappings.front().Shape.Parameters !=
752 BaseMappings.front().Shape.Parameters)
753 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
754 }
755 }
756 continue;
757 }
758 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
759 }
760
761 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
762 cast<Instruction>(VL[AltIndex]));
763}
764
765/// \returns true if all of the values in \p VL have the same type or false
766/// otherwise.
768 Type *Ty = VL.front()->getType();
769 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
770}
771
772/// \returns True if in-tree use also needs extract. This refers to
773/// possible scalar operand in vectorized instruction.
774static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
775 TargetLibraryInfo *TLI) {
776 unsigned Opcode = UserInst->getOpcode();
777 switch (Opcode) {
778 case Instruction::Load: {
779 LoadInst *LI = cast<LoadInst>(UserInst);
780 return (LI->getPointerOperand() == Scalar);
781 }
782 case Instruction::Store: {
783 StoreInst *SI = cast<StoreInst>(UserInst);
784 return (SI->getPointerOperand() == Scalar);
785 }
786 case Instruction::Call: {
787 CallInst *CI = cast<CallInst>(UserInst);
789 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
790 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
791 Arg.value().get() == Scalar;
792 });
793 }
794 default:
795 return false;
796 }
797}
798
799/// \returns the AA location that is being access by the instruction.
801 if (StoreInst *SI = dyn_cast<StoreInst>(I))
802 return MemoryLocation::get(SI);
803 if (LoadInst *LI = dyn_cast<LoadInst>(I))
804 return MemoryLocation::get(LI);
805 return MemoryLocation();
806}
807
808/// \returns True if the instruction is not a volatile or atomic load/store.
809static bool isSimple(Instruction *I) {
810 if (LoadInst *LI = dyn_cast<LoadInst>(I))
811 return LI->isSimple();
812 if (StoreInst *SI = dyn_cast<StoreInst>(I))
813 return SI->isSimple();
814 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
815 return !MI->isVolatile();
816 return true;
817}
818
819/// Shuffles \p Mask in accordance with the given \p SubMask.
820/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
821/// one but two input vectors.
822static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
823 bool ExtendingManyInputs = false) {
824 if (SubMask.empty())
825 return;
826 assert(
827 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
828 // Check if input scalars were extended to match the size of other node.
829 (SubMask.size() == Mask.size() &&
830 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
831 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
832 "SubMask with many inputs support must be larger than the mask.");
833 if (Mask.empty()) {
834 Mask.append(SubMask.begin(), SubMask.end());
835 return;
836 }
837 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
838 int TermValue = std::min(Mask.size(), SubMask.size());
839 for (int I = 0, E = SubMask.size(); I < E; ++I) {
840 if (SubMask[I] == PoisonMaskElem ||
841 (!ExtendingManyInputs &&
842 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
843 continue;
844 NewMask[I] = Mask[SubMask[I]];
845 }
846 Mask.swap(NewMask);
847}
848
849/// Order may have elements assigned special value (size) which is out of
850/// bounds. Such indices only appear on places which correspond to undef values
851/// (see canReuseExtract for details) and used in order to avoid undef values
852/// have effect on operands ordering.
853/// The first loop below simply finds all unused indices and then the next loop
854/// nest assigns these indices for undef values positions.
855/// As an example below Order has two undef positions and they have assigned
856/// values 3 and 7 respectively:
857/// before: 6 9 5 4 9 2 1 0
858/// after: 6 3 5 4 7 2 1 0
860 const unsigned Sz = Order.size();
861 SmallBitVector UnusedIndices(Sz, /*t=*/true);
862 SmallBitVector MaskedIndices(Sz);
863 for (unsigned I = 0; I < Sz; ++I) {
864 if (Order[I] < Sz)
865 UnusedIndices.reset(Order[I]);
866 else
867 MaskedIndices.set(I);
868 }
869 if (MaskedIndices.none())
870 return;
871 assert(UnusedIndices.count() == MaskedIndices.count() &&
872 "Non-synced masked/available indices.");
873 int Idx = UnusedIndices.find_first();
874 int MIdx = MaskedIndices.find_first();
875 while (MIdx >= 0) {
876 assert(Idx >= 0 && "Indices must be synced.");
877 Order[MIdx] = Idx;
878 Idx = UnusedIndices.find_next(Idx);
879 MIdx = MaskedIndices.find_next(MIdx);
880 }
881}
882
883namespace llvm {
884
886 SmallVectorImpl<int> &Mask) {
887 Mask.clear();
888 const unsigned E = Indices.size();
889 Mask.resize(E, PoisonMaskElem);
890 for (unsigned I = 0; I < E; ++I)
891 Mask[Indices[I]] = I;
892}
893
894/// Reorders the list of scalars in accordance with the given \p Mask.
896 ArrayRef<int> Mask) {
897 assert(!Mask.empty() && "Expected non-empty mask.");
898 SmallVector<Value *> Prev(Scalars.size(),
899 UndefValue::get(Scalars.front()->getType()));
900 Prev.swap(Scalars);
901 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
902 if (Mask[I] != PoisonMaskElem)
903 Scalars[Mask[I]] = Prev[I];
904}
905
906/// Checks if the provided value does not require scheduling. It does not
907/// require scheduling if this is not an instruction or it is an instruction
908/// that does not read/write memory and all operands are either not instructions
909/// or phi nodes or instructions from different blocks.
911 auto *I = dyn_cast<Instruction>(V);
912 if (!I)
913 return true;
914 return !mayHaveNonDefUseDependency(*I) &&
915 all_of(I->operands(), [I](Value *V) {
916 auto *IO = dyn_cast<Instruction>(V);
917 if (!IO)
918 return true;
919 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
920 });
921}
922
923/// Checks if the provided value does not require scheduling. It does not
924/// require scheduling if this is not an instruction or it is an instruction
925/// that does not read/write memory and all users are phi nodes or instructions
926/// from the different blocks.
927static bool isUsedOutsideBlock(Value *V) {
928 auto *I = dyn_cast<Instruction>(V);
929 if (!I)
930 return true;
931 // Limits the number of uses to save compile time.
932 constexpr int UsesLimit = 8;
933 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
934 all_of(I->users(), [I](User *U) {
935 auto *IU = dyn_cast<Instruction>(U);
936 if (!IU)
937 return true;
938 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
939 });
940}
941
942/// Checks if the specified value does not require scheduling. It does not
943/// require scheduling if all operands and all users do not need to be scheduled
944/// in the current basic block.
947}
948
949/// Checks if the specified array of instructions does not require scheduling.
950/// It is so if all either instructions have operands that do not require
951/// scheduling or their users do not require scheduling since they are phis or
952/// in other basic blocks.
954 return !VL.empty() &&
956}
957
958namespace slpvectorizer {
959
960/// Bottom Up SLP Vectorizer.
961class BoUpSLP {
962 struct TreeEntry;
963 struct ScheduleData;
966
967public:
975
977 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
980 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
981 DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
983 // Use the vector register size specified by the target unless overridden
984 // by a command-line option.
985 // TODO: It would be better to limit the vectorization factor based on
986 // data type rather than just register size. For example, x86 AVX has
987 // 256-bit registers, but it does not support integer operations
988 // at that width (that requires AVX2).
989 if (MaxVectorRegSizeOption.getNumOccurrences())
990 MaxVecRegSize = MaxVectorRegSizeOption;
991 else
992 MaxVecRegSize =
994 .getFixedValue();
995
996 if (MinVectorRegSizeOption.getNumOccurrences())
997 MinVecRegSize = MinVectorRegSizeOption;
998 else
999 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1000 }
1001
1002 /// Vectorize the tree that starts with the elements in \p VL.
1003 /// Returns the vectorized root.
1005
1006 /// Vectorize the tree but with the list of externally used values \p
1007 /// ExternallyUsedValues. Values in this MapVector can be replaced but the
1008 /// generated extractvalue instructions.
1009 /// \param ReplacedExternals containd list of replaced external values
1010 /// {scalar, replace} after emitting extractelement for external uses.
1011 Value *
1012 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1013 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1014 Instruction *ReductionRoot = nullptr);
1015
1016 /// \returns the cost incurred by unwanted spills and fills, caused by
1017 /// holding live values over call sites.
1019
1020 /// \returns the vectorization cost of the subtree that starts at \p VL.
1021 /// A negative number means that this is profitable.
1022 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1023
1024 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1025 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1026 void buildTree(ArrayRef<Value *> Roots,
1027 const SmallDenseSet<Value *> &UserIgnoreLst);
1028
1029 /// Construct a vectorizable tree that starts at \p Roots.
1030 void buildTree(ArrayRef<Value *> Roots);
1031
1032 /// Returns whether the root node has in-tree uses.
1034 return !VectorizableTree.empty() &&
1035 !VectorizableTree.front()->UserTreeIndices.empty();
1036 }
1037
1038 /// Return the scalars of the root node.
1040 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1041 return VectorizableTree.front()->Scalars;
1042 }
1043
1044 /// Builds external uses of the vectorized scalars, i.e. the list of
1045 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1046 /// ExternallyUsedValues contains additional list of external uses to handle
1047 /// vectorization of reductions.
1048 void
1049 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1050
1051 /// Clear the internal data structures that are created by 'buildTree'.
1052 void deleteTree() {
1053 VectorizableTree.clear();
1054 ScalarToTreeEntry.clear();
1055 MultiNodeScalars.clear();
1056 MustGather.clear();
1057 EntryToLastInstruction.clear();
1058 ExternalUses.clear();
1059 for (auto &Iter : BlocksSchedules) {
1060 BlockScheduling *BS = Iter.second.get();
1061 BS->clear();
1062 }
1063 MinBWs.clear();
1064 InstrElementSize.clear();
1065 UserIgnoreList = nullptr;
1066 PostponedGathers.clear();
1067 ValueToGatherNodes.clear();
1068 }
1069
1070 unsigned getTreeSize() const { return VectorizableTree.size(); }
1071
1072 /// Perform LICM and CSE on the newly generated gather sequences.
1074
1075 /// Checks if the specified gather tree entry \p TE can be represented as a
1076 /// shuffled vector entry + (possibly) permutation with other gathers. It
1077 /// implements the checks only for possibly ordered scalars (Loads,
1078 /// ExtractElement, ExtractValue), which can be part of the graph.
1079 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1080
1081 /// Sort loads into increasing pointers offsets to allow greater clustering.
1082 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1083
1084 /// Gets reordering data for the given tree entry. If the entry is vectorized
1085 /// - just return ReorderIndices, otherwise check if the scalars can be
1086 /// reordered and return the most optimal order.
1087 /// \return std::nullopt if ordering is not important, empty order, if
1088 /// identity order is important, or the actual order.
1089 /// \param TopToBottom If true, include the order of vectorized stores and
1090 /// insertelement nodes, otherwise skip them.
1091 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1092 bool TopToBottom);
1093
1094 /// Reorders the current graph to the most profitable order starting from the
1095 /// root node to the leaf nodes. The best order is chosen only from the nodes
1096 /// of the same size (vectorization factor). Smaller nodes are considered
1097 /// parts of subgraph with smaller VF and they are reordered independently. We
1098 /// can make it because we still need to extend smaller nodes to the wider VF
1099 /// and we can merge reordering shuffles with the widening shuffles.
1100 void reorderTopToBottom();
1101
1102 /// Reorders the current graph to the most profitable order starting from
1103 /// leaves to the root. It allows to rotate small subgraphs and reduce the
1104 /// number of reshuffles if the leaf nodes use the same order. In this case we
1105 /// can merge the orders and just shuffle user node instead of shuffling its
1106 /// operands. Plus, even the leaf nodes have different orders, it allows to
1107 /// sink reordering in the graph closer to the root node and merge it later
1108 /// during analysis.
1109 void reorderBottomToTop(bool IgnoreReorder = false);
1110
1111 /// \return The vector element size in bits to use when vectorizing the
1112 /// expression tree ending at \p V. If V is a store, the size is the width of
1113 /// the stored value. Otherwise, the size is the width of the largest loaded
1114 /// value reaching V. This method is used by the vectorizer to calculate
1115 /// vectorization factors.
1116 unsigned getVectorElementSize(Value *V);
1117
1118 /// Compute the minimum type sizes required to represent the entries in a
1119 /// vectorizable tree.
1121
1122 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1123 unsigned getMaxVecRegSize() const {
1124 return MaxVecRegSize;
1125 }
1126
1127 // \returns minimum vector register size as set by cl::opt.
1128 unsigned getMinVecRegSize() const {
1129 return MinVecRegSize;
1130 }
1131
1132 unsigned getMinVF(unsigned Sz) const {
1133 return std::max(2U, getMinVecRegSize() / Sz);
1134 }
1135
1136 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1137 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1138 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1139 return MaxVF ? MaxVF : UINT_MAX;
1140 }
1141
1142 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1143 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1144 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1145 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1146 ///
1147 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1148 unsigned canMapToVector(Type *T) const;
1149
1150 /// \returns True if the VectorizableTree is both tiny and not fully
1151 /// vectorizable. We do not vectorize such trees.
1152 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1153
1154 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1155 /// can be load combined in the backend. Load combining may not be allowed in
1156 /// the IR optimizer, so we do not want to alter the pattern. For example,
1157 /// partially transforming a scalar bswap() pattern into vector code is
1158 /// effectively impossible for the backend to undo.
1159 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1160 /// may not be necessary.
1161 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1162
1163 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1164 /// can be load combined in the backend. Load combining may not be allowed in
1165 /// the IR optimizer, so we do not want to alter the pattern. For example,
1166 /// partially transforming a scalar bswap() pattern into vector code is
1167 /// effectively impossible for the backend to undo.
1168 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1169 /// may not be necessary.
1170 bool isLoadCombineCandidate() const;
1171
1173
1174 /// This structure holds any data we need about the edges being traversed
1175 /// during buildTree_rec(). We keep track of:
1176 /// (i) the user TreeEntry index, and
1177 /// (ii) the index of the edge.
1178 struct EdgeInfo {
1179 EdgeInfo() = default;
1180 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1182 /// The user TreeEntry.
1183 TreeEntry *UserTE = nullptr;
1184 /// The operand index of the use.
1185 unsigned EdgeIdx = UINT_MAX;
1186#ifndef NDEBUG
1188 const BoUpSLP::EdgeInfo &EI) {
1189 EI.dump(OS);
1190 return OS;
1191 }
1192 /// Debug print.
1193 void dump(raw_ostream &OS) const {
1194 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1195 << " EdgeIdx:" << EdgeIdx << "}";
1196 }
1197 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1198#endif
1199 bool operator == (const EdgeInfo &Other) const {
1200 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1201 }
1202 };
1203
1204 /// A helper class used for scoring candidates for two consecutive lanes.
1206 const TargetLibraryInfo &TLI;
1207 const DataLayout &DL;
1208 ScalarEvolution &SE;
1209 const BoUpSLP &R;
1210 int NumLanes; // Total number of lanes (aka vectorization factor).
1211 int MaxLevel; // The maximum recursion depth for accumulating score.
1212
1213 public:
1215 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1216 int MaxLevel)
1217 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1218 MaxLevel(MaxLevel) {}
1219
1220 // The hard-coded scores listed here are not very important, though it shall
1221 // be higher for better matches to improve the resulting cost. When
1222 // computing the scores of matching one sub-tree with another, we are
1223 // basically counting the number of values that are matching. So even if all
1224 // scores are set to 1, we would still get a decent matching result.
1225 // However, sometimes we have to break ties. For example we may have to
1226 // choose between matching loads vs matching opcodes. This is what these
1227 // scores are helping us with: they provide the order of preference. Also,
1228 // this is important if the scalar is externally used or used in another
1229 // tree entry node in the different lane.
1230
1231 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1232 static const int ScoreConsecutiveLoads = 4;
1233 /// The same load multiple times. This should have a better score than
1234 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it
1235 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1236 /// a vector load and 1.0 for a broadcast.
1237 static const int ScoreSplatLoads = 3;
1238 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1239 static const int ScoreReversedLoads = 3;
1240 /// A load candidate for masked gather.
1241 static const int ScoreMaskedGatherCandidate = 1;
1242 /// ExtractElementInst from same vector and consecutive indexes.
1243 static const int ScoreConsecutiveExtracts = 4;
1244 /// ExtractElementInst from same vector and reversed indices.
1245 static const int ScoreReversedExtracts = 3;
1246 /// Constants.
1247 static const int ScoreConstants = 2;
1248 /// Instructions with the same opcode.
1249 static const int ScoreSameOpcode = 2;
1250 /// Instructions with alt opcodes (e.g, add + sub).
1251 static const int ScoreAltOpcodes = 1;
1252 /// Identical instructions (a.k.a. splat or broadcast).
1253 static const int ScoreSplat = 1;
1254 /// Matching with an undef is preferable to failing.
1255 static const int ScoreUndef = 1;
1256 /// Score for failing to find a decent match.
1257 static const int ScoreFail = 0;
1258 /// Score if all users are vectorized.
1259 static const int ScoreAllUserVectorized = 1;
1260
1261 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1262 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1263 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1264 /// MainAltOps.
1266 ArrayRef<Value *> MainAltOps) const {
1267 if (!isValidElementType(V1->getType()) ||
1268 !isValidElementType(V2->getType()))
1270
1271 if (V1 == V2) {
1272 if (isa<LoadInst>(V1)) {
1273 // Retruns true if the users of V1 and V2 won't need to be extracted.
1274 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1275 // Bail out if we have too many uses to save compilation time.
1276 static constexpr unsigned Limit = 8;
1277 if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
1278 return false;
1279
1280 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1281 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1282 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1283 });
1284 };
1285 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1286 };
1287 // A broadcast of a load can be cheaper on some targets.
1288 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1289 ElementCount::getFixed(NumLanes)) &&
1290 ((int)V1->getNumUses() == NumLanes ||
1291 AllUsersAreInternal(V1, V2)))
1293 }
1295 }
1296
1297 auto *LI1 = dyn_cast<LoadInst>(V1);
1298 auto *LI2 = dyn_cast<LoadInst>(V2);
1299 if (LI1 && LI2) {
1300 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1301 !LI2->isSimple())
1303
1304 std::optional<int> Dist = getPointersDiff(
1305 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1306 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1307 if (!Dist || *Dist == 0) {
1308 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1309 getUnderlyingObject(LI2->getPointerOperand()) &&
1310 R.TTI->isLegalMaskedGather(
1311 FixedVectorType::get(LI1->getType(), NumLanes),
1312 LI1->getAlign()))
1315 }
1316 // The distance is too large - still may be profitable to use masked
1317 // loads/gathers.
1318 if (std::abs(*Dist) > NumLanes / 2)
1320 // This still will detect consecutive loads, but we might have "holes"
1321 // in some cases. It is ok for non-power-2 vectorization and may produce
1322 // better results. It should not affect current vectorization.
1325 }
1326
1327 auto *C1 = dyn_cast<Constant>(V1);
1328 auto *C2 = dyn_cast<Constant>(V2);
1329 if (C1 && C2)
1331
1332 // Extracts from consecutive indexes of the same vector better score as
1333 // the extracts could be optimized away.
1334 Value *EV1;
1335 ConstantInt *Ex1Idx;
1336 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1337 // Undefs are always profitable for extractelements.
1338 // Compiler can easily combine poison and extractelement <non-poison> or
1339 // undef and extractelement <poison>. But combining undef +
1340 // extractelement <non-poison-but-may-produce-poison> requires some
1341 // extra operations.
1342 if (isa<UndefValue>(V2))
1343 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1346 Value *EV2 = nullptr;
1347 ConstantInt *Ex2Idx = nullptr;
1348 if (match(V2,
1350 m_Undef())))) {
1351 // Undefs are always profitable for extractelements.
1352 if (!Ex2Idx)
1354 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1356 if (EV2 == EV1) {
1357 int Idx1 = Ex1Idx->getZExtValue();
1358 int Idx2 = Ex2Idx->getZExtValue();
1359 int Dist = Idx2 - Idx1;
1360 // The distance is too large - still may be profitable to use
1361 // shuffles.
1362 if (std::abs(Dist) == 0)
1364 if (std::abs(Dist) > NumLanes / 2)
1368 }
1370 }
1372 }
1373
1374 auto *I1 = dyn_cast<Instruction>(V1);
1375 auto *I2 = dyn_cast<Instruction>(V2);
1376 if (I1 && I2) {
1377 if (I1->getParent() != I2->getParent())
1379 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1380 Ops.push_back(I1);
1381 Ops.push_back(I2);
1382 InstructionsState S = getSameOpcode(Ops, TLI);
1383 // Note: Only consider instructions with <= 2 operands to avoid
1384 // complexity explosion.
1385 if (S.getOpcode() &&
1386 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1387 !S.isAltShuffle()) &&
1388 all_of(Ops, [&S](Value *V) {
1389 return cast<Instruction>(V)->getNumOperands() ==
1390 S.MainOp->getNumOperands();
1391 }))
1392 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1394 }
1395
1396 if (isa<UndefValue>(V2))
1398
1400 }
1401
1402 /// Go through the operands of \p LHS and \p RHS recursively until
1403 /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are
1404 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1405 /// of \p U1 and \p U2), except at the beginning of the recursion where
1406 /// these are set to nullptr.
1407 ///
1408 /// For example:
1409 /// \verbatim
1410 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1411 /// \ / \ / \ / \ /
1412 /// + + + +
1413 /// G1 G2 G3 G4
1414 /// \endverbatim
1415 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1416 /// each level recursively, accumulating the score. It starts from matching
1417 /// the additions at level 0, then moves on to the loads (level 1). The
1418 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1419 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1420 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1421 /// Please note that the order of the operands does not matter, as we
1422 /// evaluate the score of all profitable combinations of operands. In
1423 /// other words the score of G1 and G4 is the same as G1 and G2. This
1424 /// heuristic is based on ideas described in:
1425 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1426 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1427 /// Luís F. W. Góes
1429 Instruction *U2, int CurrLevel,
1430 ArrayRef<Value *> MainAltOps) const {
1431
1432 // Get the shallow score of V1 and V2.
1433 int ShallowScoreAtThisLevel =
1434 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1435
1436 // If reached MaxLevel,
1437 // or if V1 and V2 are not instructions,
1438 // or if they are SPLAT,
1439 // or if they are not consecutive,
1440 // or if profitable to vectorize loads or extractelements, early return
1441 // the current cost.
1442 auto *I1 = dyn_cast<Instruction>(LHS);
1443 auto *I2 = dyn_cast<Instruction>(RHS);
1444 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1445 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1446 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1447 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1448 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1449 ShallowScoreAtThisLevel))
1450 return ShallowScoreAtThisLevel;
1451 assert(I1 && I2 && "Should have early exited.");
1452
1453 // Contains the I2 operand indexes that got matched with I1 operands.
1454 SmallSet<unsigned, 4> Op2Used;
1455
1456 // Recursion towards the operands of I1 and I2. We are trying all possible
1457 // operand pairs, and keeping track of the best score.
1458 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1459 OpIdx1 != NumOperands1; ++OpIdx1) {
1460 // Try to pair op1I with the best operand of I2.
1461 int MaxTmpScore = 0;
1462 unsigned MaxOpIdx2 = 0;
1463 bool FoundBest = false;
1464 // If I2 is commutative try all combinations.
1465 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1466 unsigned ToIdx = isCommutative(I2)
1467 ? I2->getNumOperands()
1468 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1469 assert(FromIdx <= ToIdx && "Bad index");
1470 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1471 // Skip operands already paired with OpIdx1.
1472 if (Op2Used.count(OpIdx2))
1473 continue;
1474 // Recursively calculate the cost at each level
1475 int TmpScore =
1476 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1477 I1, I2, CurrLevel + 1, std::nullopt);
1478 // Look for the best score.
1479 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1480 TmpScore > MaxTmpScore) {
1481 MaxTmpScore = TmpScore;
1482 MaxOpIdx2 = OpIdx2;
1483 FoundBest = true;
1484 }
1485 }
1486 if (FoundBest) {
1487 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1488 Op2Used.insert(MaxOpIdx2);
1489 ShallowScoreAtThisLevel += MaxTmpScore;
1490 }
1491 }
1492 return ShallowScoreAtThisLevel;
1493 }
1494 };
1495 /// A helper data structure to hold the operands of a vector of instructions.
1496 /// This supports a fixed vector length for all operand vectors.
1498 /// For each operand we need (i) the value, and (ii) the opcode that it
1499 /// would be attached to if the expression was in a left-linearized form.
1500 /// This is required to avoid illegal operand reordering.
1501 /// For example:
1502 /// \verbatim
1503 /// 0 Op1
1504 /// |/
1505 /// Op1 Op2 Linearized + Op2
1506 /// \ / ----------> |/
1507 /// - -
1508 ///
1509 /// Op1 - Op2 (0 + Op1) - Op2
1510 /// \endverbatim
1511 ///
1512 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1513 ///
1514 /// Another way to think of this is to track all the operations across the
1515 /// path from the operand all the way to the root of the tree and to
1516 /// calculate the operation that corresponds to this path. For example, the
1517 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1518 /// corresponding operation is a '-' (which matches the one in the
1519 /// linearized tree, as shown above).
1520 ///
1521 /// For lack of a better term, we refer to this operation as Accumulated
1522 /// Path Operation (APO).
1523 struct OperandData {
1524 OperandData() = default;
1525 OperandData(Value *V, bool APO, bool IsUsed)
1526 : V(V), APO(APO), IsUsed(IsUsed) {}
1527 /// The operand value.
1528 Value *V = nullptr;
1529 /// TreeEntries only allow a single opcode, or an alternate sequence of
1530 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
1531 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1532 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1533 /// (e.g., Add/Mul)
1534 bool APO = false;
1535 /// Helper data for the reordering function.
1536 bool IsUsed = false;
1537 };
1538
1539 /// During operand reordering, we are trying to select the operand at lane
1540 /// that matches best with the operand at the neighboring lane. Our
1541 /// selection is based on the type of value we are looking for. For example,
1542 /// if the neighboring lane has a load, we need to look for a load that is
1543 /// accessing a consecutive address. These strategies are summarized in the
1544 /// 'ReorderingMode' enumerator.
1545 enum class ReorderingMode {
1546 Load, ///< Matching loads to consecutive memory addresses
1547 Opcode, ///< Matching instructions based on opcode (same or alternate)
1548 Constant, ///< Matching constants
1549 Splat, ///< Matching the same instruction multiple times (broadcast)
1550 Failed, ///< We failed to create a vectorizable group
1551 };
1552
1554
1555 /// A vector of operand vectors.
1557
1558 const TargetLibraryInfo &TLI;
1559 const DataLayout &DL;
1560 ScalarEvolution &SE;
1561 const BoUpSLP &R;
1562
1563 /// \returns the operand data at \p OpIdx and \p Lane.
1564 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1565 return OpsVec[OpIdx][Lane];
1566 }
1567
1568 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1569 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1570 return OpsVec[OpIdx][Lane];
1571 }
1572
1573 /// Clears the used flag for all entries.
1574 void clearUsed() {
1575 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1576 OpIdx != NumOperands; ++OpIdx)
1577 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1578 ++Lane)
1579 OpsVec[OpIdx][Lane].IsUsed = false;
1580 }
1581
1582 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1583 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1584 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1585 }
1586
1587 /// \param Lane lane of the operands under analysis.
1588 /// \param OpIdx operand index in \p Lane lane we're looking the best
1589 /// candidate for.
1590 /// \param Idx operand index of the current candidate value.
1591 /// \returns The additional score due to possible broadcasting of the
1592 /// elements in the lane. It is more profitable to have power-of-2 unique
1593 /// elements in the lane, it will be vectorized with higher probability
1594 /// after removing duplicates. Currently the SLP vectorizer supports only
1595 /// vectorization of the power-of-2 number of unique scalars.
1596 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1597 Value *IdxLaneV = getData(Idx, Lane).V;
1598 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1599 return 0;
1601 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1602 if (Ln == Lane)
1603 continue;
1604 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1605 if (!isa<Instruction>(OpIdxLnV))
1606 return 0;
1607 Uniques.insert(OpIdxLnV);
1608 }
1609 int UniquesCount = Uniques.size();
1610 int UniquesCntWithIdxLaneV =
1611 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1612 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1613 int UniquesCntWithOpIdxLaneV =
1614 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1615 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1616 return 0;
1617 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1618 UniquesCntWithOpIdxLaneV) -
1619 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1620 }
1621
1622 /// \param Lane lane of the operands under analysis.
1623 /// \param OpIdx operand index in \p Lane lane we're looking the best
1624 /// candidate for.
1625 /// \param Idx operand index of the current candidate value.
1626 /// \returns The additional score for the scalar which users are all
1627 /// vectorized.
1628 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1629 Value *IdxLaneV = getData(Idx, Lane).V;
1630 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1631 // Do not care about number of uses for vector-like instructions
1632 // (extractelement/extractvalue with constant indices), they are extracts
1633 // themselves and already externally used. Vectorization of such
1634 // instructions does not add extra extractelement instruction, just may
1635 // remove it.
1636 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1637 isVectorLikeInstWithConstOps(OpIdxLaneV))
1639 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1640 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1641 return 0;
1642 return R.areAllUsersVectorized(IdxLaneI)
1644 : 0;
1645 }
1646
1647 /// Score scaling factor for fully compatible instructions but with
1648 /// different number of external uses. Allows better selection of the
1649 /// instructions with less external uses.
1650 static const int ScoreScaleFactor = 10;
1651
1652 /// \Returns the look-ahead score, which tells us how much the sub-trees
1653 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1654 /// score. This helps break ties in an informed way when we cannot decide on
1655 /// the order of the operands by just considering the immediate
1656 /// predecessors.
1657 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1658 int Lane, unsigned OpIdx, unsigned Idx,
1659 bool &IsUsed) {
1660 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1662 // Keep track of the instruction stack as we recurse into the operands
1663 // during the look-ahead score exploration.
1664 int Score =
1665 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1666 /*CurrLevel=*/1, MainAltOps);
1667 if (Score) {
1668 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1669 if (Score <= -SplatScore) {
1670 // Set the minimum score for splat-like sequence to avoid setting
1671 // failed state.
1672 Score = 1;
1673 } else {
1674 Score += SplatScore;
1675 // Scale score to see the difference between different operands
1676 // and similar operands but all vectorized/not all vectorized
1677 // uses. It does not affect actual selection of the best
1678 // compatible operand in general, just allows to select the
1679 // operand with all vectorized uses.
1680 Score *= ScoreScaleFactor;
1681 Score += getExternalUseScore(Lane, OpIdx, Idx);
1682 IsUsed = true;
1683 }
1684 }
1685 return Score;
1686 }
1687
1688 /// Best defined scores per lanes between the passes. Used to choose the
1689 /// best operand (with the highest score) between the passes.
1690 /// The key - {Operand Index, Lane}.
1691 /// The value - the best score between the passes for the lane and the
1692 /// operand.
1694 BestScoresPerLanes;
1695
1696 // Search all operands in Ops[*][Lane] for the one that matches best
1697 // Ops[OpIdx][LastLane] and return its opreand index.
1698 // If no good match can be found, return std::nullopt.
1699 std::optional<unsigned>
1700 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1701 ArrayRef<ReorderingMode> ReorderingModes,
1702 ArrayRef<Value *> MainAltOps) {
1703 unsigned NumOperands = getNumOperands();
1704
1705 // The operand of the previous lane at OpIdx.
1706 Value *OpLastLane = getData(OpIdx, LastLane).V;
1707
1708 // Our strategy mode for OpIdx.
1709 ReorderingMode RMode = ReorderingModes[OpIdx];
1710 if (RMode == ReorderingMode::Failed)
1711 return std::nullopt;
1712
1713 // The linearized opcode of the operand at OpIdx, Lane.
1714 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1715
1716 // The best operand index and its score.
1717 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1718 // are using the score to differentiate between the two.
1719 struct BestOpData {
1720 std::optional<unsigned> Idx;
1721 unsigned Score = 0;
1722 } BestOp;
1723 BestOp.Score =
1724 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1725 .first->second;
1726
1727 // Track if the operand must be marked as used. If the operand is set to
1728 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
1729 // want to reestimate the operands again on the following iterations).
1730 bool IsUsed =
1731 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1732 // Iterate through all unused operands and look for the best.
1733 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1734 // Get the operand at Idx and Lane.
1735 OperandData &OpData = getData(Idx, Lane);
1736 Value *Op = OpData.V;
1737 bool OpAPO = OpData.APO;
1738
1739 // Skip already selected operands.
1740 if (OpData.IsUsed)
1741 continue;
1742
1743 // Skip if we are trying to move the operand to a position with a
1744 // different opcode in the linearized tree form. This would break the
1745 // semantics.
1746 if (OpAPO != OpIdxAPO)
1747 continue;
1748
1749 // Look for an operand that matches the current mode.
1750 switch (RMode) {
1751 case ReorderingMode::Load:
1752 case ReorderingMode::Constant:
1753 case ReorderingMode::Opcode: {
1754 bool LeftToRight = Lane > LastLane;
1755 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1756 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1757 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1758 OpIdx, Idx, IsUsed);
1759 if (Score > static_cast<int>(BestOp.Score)) {
1760 BestOp.Idx = Idx;
1761 BestOp.Score = Score;
1762 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1763 }
1764 break;
1765 }
1766 case ReorderingMode::Splat:
1767 if (Op == OpLastLane)
1768 BestOp.Idx = Idx;
1769 break;
1770 case ReorderingMode::Failed:
1771 llvm_unreachable("Not expected Failed reordering mode.");
1772 }
1773 }
1774
1775 if (BestOp.Idx) {
1776 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1777 return BestOp.Idx;
1778 }
1779 // If we could not find a good match return std::nullopt.
1780 return std::nullopt;
1781 }
1782
1783 /// Helper for reorderOperandVecs.
1784 /// \returns the lane that we should start reordering from. This is the one
1785 /// which has the least number of operands that can freely move about or
1786 /// less profitable because it already has the most optimal set of operands.
1787 unsigned getBestLaneToStartReordering() const {
1788 unsigned Min = UINT_MAX;
1789 unsigned SameOpNumber = 0;
1790 // std::pair<unsigned, unsigned> is used to implement a simple voting
1791 // algorithm and choose the lane with the least number of operands that
1792 // can freely move about or less profitable because it already has the
1793 // most optimal set of operands. The first unsigned is a counter for
1794 // voting, the second unsigned is the counter of lanes with instructions
1795 // with same/alternate opcodes and same parent basic block.
1797 // Try to be closer to the original results, if we have multiple lanes
1798 // with same cost. If 2 lanes have the same cost, use the one with the
1799 // lowest index.
1800 for (int I = getNumLanes(); I > 0; --I) {
1801 unsigned Lane = I - 1;
1802 OperandsOrderData NumFreeOpsHash =
1803 getMaxNumOperandsThatCanBeReordered(Lane);
1804 // Compare the number of operands that can move and choose the one with
1805 // the least number.
1806 if (NumFreeOpsHash.NumOfAPOs < Min) {
1807 Min = NumFreeOpsHash.NumOfAPOs;
1808 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1809 HashMap.clear();
1810 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1811 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1812 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1813 // Select the most optimal lane in terms of number of operands that
1814 // should be moved around.
1815 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1816 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1817 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1818 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1819 auto *It = HashMap.find(NumFreeOpsHash.Hash);
1820 if (It == HashMap.end())
1821 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1822 else
1823 ++It->second.first;
1824 }
1825 }
1826 // Select the lane with the minimum counter.
1827 unsigned BestLane = 0;
1828 unsigned CntMin = UINT_MAX;
1829 for (const auto &Data : reverse(HashMap)) {
1830 if (Data.second.first < CntMin) {
1831 CntMin = Data.second.first;
1832 BestLane = Data.second.second;
1833 }
1834 }
1835 return BestLane;
1836 }
1837
1838 /// Data structure that helps to reorder operands.
1839 struct OperandsOrderData {
1840 /// The best number of operands with the same APOs, which can be
1841 /// reordered.
1842 unsigned NumOfAPOs = UINT_MAX;
1843 /// Number of operands with the same/alternate instruction opcode and
1844 /// parent.
1845 unsigned NumOpsWithSameOpcodeParent = 0;
1846 /// Hash for the actual operands ordering.
1847 /// Used to count operands, actually their position id and opcode
1848 /// value. It is used in the voting mechanism to find the lane with the
1849 /// least number of operands that can freely move about or less profitable
1850 /// because it already has the most optimal set of operands. Can be
1851 /// replaced with SmallVector<unsigned> instead but hash code is faster
1852 /// and requires less memory.
1853 unsigned Hash = 0;
1854 };
1855 /// \returns the maximum number of operands that are allowed to be reordered
1856 /// for \p Lane and the number of compatible instructions(with the same
1857 /// parent/opcode). This is used as a heuristic for selecting the first lane
1858 /// to start operand reordering.
1859 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1860 unsigned CntTrue = 0;
1861 unsigned NumOperands = getNumOperands();
1862 // Operands with the same APO can be reordered. We therefore need to count
1863 // how many of them we have for each APO, like this: Cnt[APO] = x.
1864 // Since we only have two APOs, namely true and false, we can avoid using
1865 // a map. Instead we can simply count the number of operands that
1866 // correspond to one of them (in this case the 'true' APO), and calculate
1867 // the other by subtracting it from the total number of operands.
1868 // Operands with the same instruction opcode and parent are more
1869 // profitable since we don't need to move them in many cases, with a high
1870 // probability such lane already can be vectorized effectively.
1871 bool AllUndefs = true;
1872 unsigned NumOpsWithSameOpcodeParent = 0;
1873 Instruction *OpcodeI = nullptr;
1874 BasicBlock *Parent = nullptr;
1875 unsigned Hash = 0;
1876 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1877 const OperandData &OpData = getData(OpIdx, Lane);
1878 if (OpData.APO)
1879 ++CntTrue;
1880 // Use Boyer-Moore majority voting for finding the majority opcode and
1881 // the number of times it occurs.
1882 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1883 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
1884 I->getParent() != Parent) {
1885 if (NumOpsWithSameOpcodeParent == 0) {
1886 NumOpsWithSameOpcodeParent = 1;
1887 OpcodeI = I;
1888 Parent = I->getParent();
1889 } else {
1890 --NumOpsWithSameOpcodeParent;
1891 }
1892 } else {
1893 ++NumOpsWithSameOpcodeParent;
1894 }
1895 }
1896 Hash = hash_combine(
1897 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1898 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1899 }
1900 if (AllUndefs)
1901 return {};
1902 OperandsOrderData Data;
1903 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
1904 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
1905 Data.Hash = Hash;
1906 return Data;
1907 }
1908
1909 /// Go through the instructions in VL and append their operands.
1910 void appendOperandsOfVL(ArrayRef<Value *> VL) {
1911 assert(!VL.empty() && "Bad VL");
1912 assert((empty() || VL.size() == getNumLanes()) &&
1913 "Expected same number of lanes");
1914 assert(isa<Instruction>(VL[0]) && "Expected instruction");
1915 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1916 OpsVec.resize(NumOperands);
1917 unsigned NumLanes = VL.size();
1918 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1919 OpsVec[OpIdx].resize(NumLanes);
1920 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1921 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
1922 // Our tree has just 3 nodes: the root and two operands.
1923 // It is therefore trivial to get the APO. We only need to check the
1924 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
1925 // RHS operand. The LHS operand of both add and sub is never attached
1926 // to an inversese operation in the linearized form, therefore its APO
1927 // is false. The RHS is true only if VL[Lane] is an inverse operation.
1928
1929 // Since operand reordering is performed on groups of commutative
1930 // operations or alternating sequences (e.g., +, -), we can safely
1931 // tell the inverse operations by checking commutativity.
1932 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
1933 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
1934 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
1935 APO, false};
1936 }
1937 }
1938 }
1939
1940 /// \returns the number of operands.
1941 unsigned getNumOperands() const { return OpsVec.size(); }
1942
1943 /// \returns the number of lanes.
1944 unsigned getNumLanes() const { return OpsVec[0].size(); }
1945
1946 /// \returns the operand value at \p OpIdx and \p Lane.
1947 Value *getValue(unsigned OpIdx, unsigned Lane) const {
1948 return getData(OpIdx, Lane).V;
1949 }
1950
1951 /// \returns true if the data structure is empty.
1952 bool empty() const { return OpsVec.empty(); }
1953
1954 /// Clears the data.
1955 void clear() { OpsVec.clear(); }
1956
1957 /// \Returns true if there are enough operands identical to \p Op to fill
1958 /// the whole vector.
1959 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
1960 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
1961 bool OpAPO = getData(OpIdx, Lane).APO;
1962 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
1963 if (Ln == Lane)
1964 continue;
1965 // This is set to true if we found a candidate for broadcast at Lane.
1966 bool FoundCandidate = false;
1967 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
1968 OperandData &Data = getData(OpI, Ln);
1969 if (Data.APO != OpAPO || Data.IsUsed)
1970 continue;
1971 if (Data.V == Op) {
1972 FoundCandidate = true;
1973 Data.IsUsed = true;
1974 break;
1975 }
1976 }
1977 if (!FoundCandidate)
1978 return false;
1979 }
1980 return true;
1981 }
1982
1983 public:
1984 /// Initialize with all the operands of the instruction vector \p RootVL.
1986 const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
1987 : TLI(TLI), DL(DL), SE(SE), R(R) {
1988 // Append all the operands of RootVL.
1989 appendOperandsOfVL(RootVL);
1990 }
1991
1992 /// \Returns a value vector with the operands across all lanes for the
1993 /// opearnd at \p OpIdx.
1994 ValueList getVL(unsigned OpIdx) const {
1995 ValueList OpVL(OpsVec[OpIdx].size());
1996 assert(OpsVec[OpIdx].size() == getNumLanes() &&
1997 "Expected same num of lanes across all operands");
1998 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
1999 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2000 return OpVL;
2001 }
2002
2003 // Performs operand reordering for 2 or more operands.
2004 // The original operands are in OrigOps[OpIdx][Lane].
2005 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2006 void reorder() {
2007 unsigned NumOperands = getNumOperands();
2008 unsigned NumLanes = getNumLanes();
2009 // Each operand has its own mode. We are using this mode to help us select
2010 // the instructions for each lane, so that they match best with the ones
2011 // we have selected so far.
2012 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2013
2014 // This is a greedy single-pass algorithm. We are going over each lane
2015 // once and deciding on the best order right away with no back-tracking.
2016 // However, in order to increase its effectiveness, we start with the lane
2017 // that has operands that can move the least. For example, given the
2018 // following lanes:
2019 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2020 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2021 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2022 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2023 // we will start at Lane 1, since the operands of the subtraction cannot
2024 // be reordered. Then we will visit the rest of the lanes in a circular
2025 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2026
2027 // Find the first lane that we will start our search from.
2028 unsigned FirstLane = getBestLaneToStartReordering();
2029
2030 // Initialize the modes.
2031 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2032 Value *OpLane0 = getValue(OpIdx, FirstLane);
2033 // Keep track if we have instructions with all the same opcode on one
2034 // side.
2035 if (isa<LoadInst>(OpLane0))
2036 ReorderingModes[OpIdx] = ReorderingMode::Load;
2037 else if (isa<Instruction>(OpLane0)) {
2038 // Check if OpLane0 should be broadcast.
2039 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2040 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2041 else
2042 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2043 }
2044 else if (isa<Constant>(OpLane0))
2045 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2046 else if (isa<Argument>(OpLane0))
2047 // Our best hope is a Splat. It may save some cost in some cases.
2048 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2049 else
2050 // NOTE: This should be unreachable.
2051 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2052 }
2053
2054 // Check that we don't have same operands. No need to reorder if operands
2055 // are just perfect diamond or shuffled diamond match. Do not do it only
2056 // for possible broadcasts or non-power of 2 number of scalars (just for
2057 // now).
2058 auto &&SkipReordering = [this]() {
2059 SmallPtrSet<Value *, 4> UniqueValues;
2060 ArrayRef<OperandData> Op0 = OpsVec.front();
2061 for (const OperandData &Data : Op0)
2062 UniqueValues.insert(Data.V);
2063 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2064 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2065 return !UniqueValues.contains(Data.V);
2066 }))
2067 return false;
2068 }
2069 // TODO: Check if we can remove a check for non-power-2 number of
2070 // scalars after full support of non-power-2 vectorization.
2071 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2072 };
2073
2074 // If the initial strategy fails for any of the operand indexes, then we
2075 // perform reordering again in a second pass. This helps avoid assigning
2076 // high priority to the failed strategy, and should improve reordering for
2077 // the non-failed operand indexes.
2078 for (int Pass = 0; Pass != 2; ++Pass) {
2079 // Check if no need to reorder operands since they're are perfect or
2080 // shuffled diamond match.
2081 // Need to do it to avoid extra external use cost counting for
2082 // shuffled matches, which may cause regressions.
2083 if (SkipReordering())
2084 break;
2085 // Skip the second pass if the first pass did not fail.
2086 bool StrategyFailed = false;
2087 // Mark all operand data as free to use.
2088 clearUsed();
2089 // We keep the original operand order for the FirstLane, so reorder the
2090 // rest of the lanes. We are visiting the nodes in a circular fashion,
2091 // using FirstLane as the center point and increasing the radius
2092 // distance.
2093 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2094 for (unsigned I = 0; I < NumOperands; ++I)
2095 MainAltOps[I].push_back(getData(I, FirstLane).V);
2096
2097 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2098 // Visit the lane on the right and then the lane on the left.
2099 for (int Direction : {+1, -1}) {
2100 int Lane = FirstLane + Direction * Distance;
2101 if (Lane < 0 || Lane >= (int)NumLanes)
2102 continue;
2103 int LastLane = Lane - Direction;
2104 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2105 "Out of bounds");
2106 // Look for a good match for each operand.
2107 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2108 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2109 std::optional<unsigned> BestIdx = getBestOperand(
2110 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2111 // By not selecting a value, we allow the operands that follow to
2112 // select a better matching value. We will get a non-null value in
2113 // the next run of getBestOperand().
2114 if (BestIdx) {
2115 // Swap the current operand with the one returned by
2116 // getBestOperand().
2117 swap(OpIdx, *BestIdx, Lane);
2118 } else {
2119 // We failed to find a best operand, set mode to 'Failed'.
2120 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2121 // Enable the second pass.
2122 StrategyFailed = true;
2123 }
2124 // Try to get the alternate opcode and follow it during analysis.
2125 if (MainAltOps[OpIdx].size() != 2) {
2126 OperandData &AltOp = getData(OpIdx, Lane);
2127 InstructionsState OpS =
2128 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2129 if (OpS.getOpcode() && OpS.isAltShuffle())
2130 MainAltOps[OpIdx].push_back(AltOp.V);
2131 }
2132 }
2133 }
2134 }
2135 // Skip second pass if the strategy did not fail.
2136 if (!StrategyFailed)
2137 break;
2138 }
2139 }
2140
2141#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2142 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2143 switch (RMode) {
2144 case ReorderingMode::Load:
2145 return "Load";
2146 case ReorderingMode::Opcode:
2147 return "Opcode";
2148 case ReorderingMode::Constant:
2149 return "Constant";
2150 case ReorderingMode::Splat:
2151 return "Splat";
2152 case ReorderingMode::Failed:
2153 return "Failed";
2154 }
2155 llvm_unreachable("Unimplemented Reordering Type");
2156 }
2157
2158 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2159 raw_ostream &OS) {
2160 return OS << getModeStr(RMode);
2161 }
2162
2163 /// Debug print.
2164 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2165 printMode(RMode, dbgs());
2166 }
2167
2168 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2169 return printMode(RMode, OS);
2170 }
2171
2173 const unsigned Indent = 2;
2174 unsigned Cnt = 0;
2175 for (const OperandDataVec &OpDataVec : OpsVec) {
2176 OS << "Operand " << Cnt++ << "\n";
2177 for (const OperandData &OpData : OpDataVec) {
2178 OS.indent(Indent) << "{";
2179 if (Value *V = OpData.V)
2180 OS << *V;
2181 else
2182 OS << "null";
2183 OS << ", APO:" << OpData.APO << "}\n";
2184 }
2185 OS << "\n";
2186 }
2187 return OS;
2188 }
2189
2190 /// Debug print.
2191 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2192#endif
2193 };
2194
2195 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2196 /// for a pair which have highest score deemed to have best chance to form
2197 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
2198 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
2199 /// of the cost, considered to be good enough score.
2200 std::optional<int>
2201 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2202 int Limit = LookAheadHeuristics::ScoreFail) {
2203 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2205 int BestScore = Limit;
2206 std::optional<int> Index;
2207 for (int I : seq<int>(0, Candidates.size())) {
2208 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2209 Candidates[I].second,
2210 /*U1=*/nullptr, /*U2=*/nullptr,
2211 /*Level=*/1, std::nullopt);
2212 if (Score > BestScore) {
2213 BestScore = Score;
2214 Index = I;
2215 }
2216 }
2217 return Index;
2218 }
2219
2220 /// Checks if the instruction is marked for deletion.
2221 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2222
2223 /// Removes an instruction from its block and eventually deletes it.
2224 /// It's like Instruction::eraseFromParent() except that the actual deletion
2225 /// is delayed until BoUpSLP is destructed.
2227 DeletedInstructions.insert(I);
2228 }
2229
2230 /// Checks if the instruction was already analyzed for being possible
2231 /// reduction root.
2233 return AnalyzedReductionsRoots.count(I);
2234 }
2235 /// Register given instruction as already analyzed for being possible
2236 /// reduction root.
2238 AnalyzedReductionsRoots.insert(I);
2239 }
2240 /// Checks if the provided list of reduced values was checked already for
2241 /// vectorization.
2243 return AnalyzedReductionVals.contains(hash_value(VL));
2244 }
2245 /// Adds the list of reduced values to list of already checked values for the
2246 /// vectorization.
2248 AnalyzedReductionVals.insert(hash_value(VL));
2249 }
2250 /// Clear the list of the analyzed reduction root instructions.
2252 AnalyzedReductionsRoots.clear();
2253 AnalyzedReductionVals.clear();
2254 }
2255 /// Checks if the given value is gathered in one of the nodes.
2256 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2257 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2258 }
2259
2260 /// Check if the value is vectorized in the tree.
2261 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2262
2263 ~BoUpSLP();
2264
2265private:
2266 /// Determine if a vectorized value \p V in can be demoted to
2267 /// a smaller type with a truncation. We collect the values that will be
2268 /// demoted in ToDemote and additional roots that require investigating in
2269 /// Roots.
2270 /// \param DemotedConsts list of Instruction/OperandIndex pairs that are
2271 /// constant and to be demoted. Required to correctly identify constant nodes
2272 /// to be demoted.
2273 bool collectValuesToDemote(
2274 Value *V, SmallVectorImpl<Value *> &ToDemote,
2275 DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
2276 SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const;
2277
2278 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2279 /// reordering (i.e. the operands can be reordered because they have only one
2280 /// user and reordarable).
2281 /// \param ReorderableGathers List of all gather nodes that require reordering
2282 /// (e.g., gather of extractlements or partially vectorizable loads).
2283 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2284 /// reordering, subset of \p NonVectorized.
2285 bool
2286 canReorderOperands(TreeEntry *UserTE,
2287 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2288 ArrayRef<TreeEntry *> ReorderableGathers,
2289 SmallVectorImpl<TreeEntry *> &GatherOps);
2290
2291 /// Checks if the given \p TE is a gather node with clustered reused scalars
2292 /// and reorders it per given \p Mask.
2293 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2294
2295 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2296 /// if any. If it is not vectorized (gather node), returns nullptr.
2297 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2298 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2299 TreeEntry *TE = nullptr;
2300 const auto *It = find_if(VL, [&](Value *V) {
2301 TE = getTreeEntry(V);
2302 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2303 return true;
2304 auto It = MultiNodeScalars.find(V);
2305 if (It != MultiNodeScalars.end()) {
2306 for (TreeEntry *E : It->second) {
2307 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2308 TE = E;
2309 return true;
2310 }
2311 }
2312 }
2313 return false;
2314 });
2315 if (It != VL.end()) {
2316 assert(TE->isSame(VL) && "Expected same scalars.");
2317 return TE;
2318 }
2319 return nullptr;
2320 }
2321
2322 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2323 /// if any. If it is not vectorized (gather node), returns nullptr.
2324 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2325 unsigned OpIdx) const {
2326 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2327 const_cast<TreeEntry *>(UserTE), OpIdx);
2328 }
2329
2330 /// Checks if all users of \p I are the part of the vectorization tree.
2331 bool areAllUsersVectorized(
2332 Instruction *I,
2333 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2334
2335 /// Return information about the vector formed for the specified index
2336 /// of a vector of (the same) instruction.
2338
2339 /// \ returns the graph entry for the \p Idx operand of the \p E entry.
2340 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2341
2342 /// \returns the cost of the vectorizable entry.
2343 InstructionCost getEntryCost(const TreeEntry *E,
2344 ArrayRef<Value *> VectorizedVals,
2345 SmallPtrSetImpl<Value *> &CheckedExtracts);
2346
2347 /// This is the recursive part of buildTree.
2348 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2349 const EdgeInfo &EI);
2350
2351 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2352 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2353 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2354 /// returns false, setting \p CurrentOrder to either an empty vector or a
2355 /// non-identity permutation that allows to reuse extract instructions.
2356 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2357 /// extract order.
2358 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2359 SmallVectorImpl<unsigned> &CurrentOrder,
2360 bool ResizeAllowed = false) const;
2361
2362 /// Vectorize a single entry in the tree.
2363 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2364 /// avoid issues with def-use order.
2365 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2366
2367 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2368 /// \p E.
2369 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2370 /// avoid issues with def-use order.
2371 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2372
2373 /// Create a new vector from a list of scalar values. Produces a sequence
2374 /// which exploits values reused across lanes, and arranges the inserts
2375 /// for ease of later optimization.
2376 template <typename BVTy, typename ResTy, typename... Args>
2377 ResTy processBuildVector(const TreeEntry *E, Args &...Params);
2378
2379 /// Create a new vector from a list of scalar values. Produces a sequence
2380 /// which exploits values reused across lanes, and arranges the inserts
2381 /// for ease of later optimization.
2382 Value *createBuildVector(const TreeEntry *E);
2383
2384 /// Returns the instruction in the bundle, which can be used as a base point
2385 /// for scheduling. Usually it is the last instruction in the bundle, except
2386 /// for the case when all operands are external (in this case, it is the first
2387 /// instruction in the list).
2388 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2389
2390 /// Tries to find extractelement instructions with constant indices from fixed
2391 /// vector type and gather such instructions into a bunch, which highly likely
2392 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2393 /// was successful, the matched scalars are replaced by poison values in \p VL
2394 /// for future analysis.
2395 std::optional<TargetTransformInfo::ShuffleKind>
2396 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2397 SmallVectorImpl<int> &Mask) const;
2398
2399 /// Tries to find extractelement instructions with constant indices from fixed
2400 /// vector type and gather such instructions into a bunch, which highly likely
2401 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2402 /// was successful, the matched scalars are replaced by poison values in \p VL
2403 /// for future analysis.
2405 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2407 unsigned NumParts) const;
2408
2409 /// Checks if the gathered \p VL can be represented as a single register
2410 /// shuffle(s) of previous tree entries.
2411 /// \param TE Tree entry checked for permutation.
2412 /// \param VL List of scalars (a subset of the TE scalar), checked for
2413 /// permutations. Must form single-register vector.
2414 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2415 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2416 std::optional<TargetTransformInfo::ShuffleKind>
2417 isGatherShuffledSingleRegisterEntry(
2418 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2419 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part);
2420
2421 /// Checks if the gathered \p VL can be represented as multi-register
2422 /// shuffle(s) of previous tree entries.
2423 /// \param TE Tree entry checked for permutation.
2424 /// \param VL List of scalars (a subset of the TE scalar), checked for
2425 /// permutations.
2426 /// \returns per-register series of ShuffleKind, if gathered values can be
2427 /// represented as shuffles of previous tree entries. \p Mask is filled with
2428 /// the shuffle mask (also on per-register base).
2430 isGatherShuffledEntry(
2431 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2433 unsigned NumParts);
2434
2435 /// \returns the scalarization cost for this list of values. Assuming that
2436 /// this subtree gets vectorized, we may need to extract the values from the
2437 /// roots. This method calculates the cost of extracting the values.
2438 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2439 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
2440
2441 /// Set the Builder insert point to one after the last instruction in
2442 /// the bundle
2443 void setInsertPointAfterBundle(const TreeEntry *E);
2444
2445 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2446 /// specified, the starting vector value is poison.
2447 Value *gather(ArrayRef<Value *> VL, Value *Root);
2448
2449 /// \returns whether the VectorizableTree is fully vectorizable and will
2450 /// be beneficial even the tree height is tiny.
2451 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2452
2453 /// Reorder commutative or alt operands to get better probability of
2454 /// generating vectorized code.
2455 static void reorderInputsAccordingToOpcode(
2458 const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R);
2459
2460 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2461 /// users of \p TE and collects the stores. It returns the map from the store
2462 /// pointers to the collected stores.
2464 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2465
2466 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2467 /// stores in \p StoresVec can form a vector instruction. If so it returns
2468 /// true and populates \p ReorderIndices with the shuffle indices of the
2469 /// stores when compared to the sorted vector.
2470 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2471 OrdersType &ReorderIndices) const;
2472
2473 /// Iterates through the users of \p TE, looking for scalar stores that can be
2474 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2475 /// their order and builds an order index vector for each store bundle. It
2476 /// returns all these order vectors found.
2477 /// We run this after the tree has formed, otherwise we may come across user
2478 /// instructions that are not yet in the tree.
2480 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2481
2482 struct TreeEntry {
2483 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2484 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2485
2486 /// \returns Common mask for reorder indices and reused scalars.
2487 SmallVector<int> getCommonMask() const {
2489 inversePermutation(ReorderIndices, Mask);
2490 ::addMask(Mask, ReuseShuffleIndices);
2491 return Mask;
2492 }
2493
2494 /// \returns true if the scalars in VL are equal to this entry.
2495 bool isSame(ArrayRef<Value *> VL) const {
2496 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2497 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2498 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2499 return VL.size() == Mask.size() &&
2500 std::equal(VL.begin(), VL.end(), Mask.begin(),
2501 [Scalars](Value *V, int Idx) {
2502 return (isa<UndefValue>(V) &&
2503 Idx == PoisonMaskElem) ||
2504 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2505 });
2506 };
2507 if (!ReorderIndices.empty()) {
2508 // TODO: implement matching if the nodes are just reordered, still can
2509 // treat the vector as the same if the list of scalars matches VL
2510 // directly, without reordering.
2512 inversePermutation(ReorderIndices, Mask);
2513 if (VL.size() == Scalars.size())
2514 return IsSame(Scalars, Mask);
2515 if (VL.size() == ReuseShuffleIndices.size()) {
2516 ::addMask(Mask, ReuseShuffleIndices);
2517 return IsSame(Scalars, Mask);
2518 }
2519 return false;
2520 }
2521 return IsSame(Scalars, ReuseShuffleIndices);
2522 }
2523
2524 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2525 return State == TreeEntry::NeedToGather &&
2526 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2527 UserTreeIndices.front().UserTE == UserEI.UserTE;
2528 }
2529
2530 /// \returns true if current entry has same operands as \p TE.
2531 bool hasEqualOperands(const TreeEntry &TE) const {
2532 if (TE.getNumOperands() != getNumOperands())
2533 return false;
2534 SmallBitVector Used(getNumOperands());
2535 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2536 unsigned PrevCount = Used.count();
2537 for (unsigned K = 0; K < E; ++K) {
2538 if (Used.test(K))
2539 continue;
2540 if (getOperand(K) == TE.getOperand(I)) {
2541 Used.set(K);
2542 break;
2543 }
2544 }
2545 // Check if we actually found the matching operand.
2546 if (PrevCount == Used.count())
2547 return false;
2548 }
2549 return true;
2550 }
2551
2552 /// \return Final vectorization factor for the node. Defined by the total
2553 /// number of vectorized scalars, including those, used several times in the
2554 /// entry and counted in the \a ReuseShuffleIndices, if any.
2555 unsigned getVectorFactor() const {
2556 if (!ReuseShuffleIndices.empty())
2557 return ReuseShuffleIndices.size();
2558 return Scalars.size();
2559 };
2560
2561 /// A vector of scalars.
2562 ValueList Scalars;
2563
2564 /// The Scalars are vectorized into this value. It is initialized to Null.
2565 WeakTrackingVH VectorizedValue = nullptr;
2566
2567 /// New vector phi instructions emitted for the vectorized phi nodes.
2568 PHINode *PHI = nullptr;
2569
2570 /// Do we need to gather this sequence or vectorize it
2571 /// (either with vector instruction or with scatter/gather
2572 /// intrinsics for store/load)?
2573 enum EntryState {
2574 Vectorize,
2575 ScatterVectorize,
2576 PossibleStridedVectorize,
2577 NeedToGather
2578 };
2579 EntryState State;
2580
2581 /// Does this sequence require some shuffling?
2582 SmallVector<int, 4> ReuseShuffleIndices;
2583
2584 /// Does this entry require reordering?
2585 SmallVector<unsigned, 4> ReorderIndices;
2586
2587 /// Points back to the VectorizableTree.
2588 ///
2589 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2590 /// to be a pointer and needs to be able to initialize the child iterator.
2591 /// Thus we need a reference back to the container to translate the indices
2592 /// to entries.
2593 VecTreeTy &Container;
2594
2595 /// The TreeEntry index containing the user of this entry. We can actually
2596 /// have multiple users so the data structure is not truly a tree.
2597 SmallVector<EdgeInfo, 1> UserTreeIndices;
2598
2599 /// The index of this treeEntry in VectorizableTree.
2600 int Idx = -1;
2601
2602 private:
2603 /// The operands of each instruction in each lane Operands[op_index][lane].
2604 /// Note: This helps avoid the replication of the code that performs the
2605 /// reordering of operands during buildTree_rec() and vectorizeTree().
2607
2608 /// The main/alternate instruction.
2609 Instruction *MainOp = nullptr;
2610 Instruction *AltOp = nullptr;
2611
2612 public:
2613 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2614 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2615 if (Operands.size() < OpIdx + 1)
2616 Operands.resize(OpIdx + 1);
2617 assert(Operands[OpIdx].empty() && "Already resized?");
2618 assert(OpVL.size() <= Scalars.size() &&
2619 "Number of operands is greater than the number of scalars.");
2620 Operands[OpIdx].resize(OpVL.size());
2621 copy(OpVL, Operands[OpIdx].begin());
2622 }
2623
2624 /// Set the operands of this bundle in their original order.
2625 void setOperandsInOrder() {
2626 assert(Operands.empty() && "Already initialized?");
2627 auto *I0 = cast<Instruction>(Scalars[0]);
2628 Operands.resize(I0->getNumOperands());
2629 unsigned NumLanes = Scalars.size();
2630 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2631 OpIdx != NumOperands; ++OpIdx) {
2632 Operands[OpIdx].resize(NumLanes);
2633 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2634 auto *I = cast<Instruction>(Scalars[Lane]);
2635 assert(I->getNumOperands() == NumOperands &&
2636 "Expected same number of operands");
2637 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2638 }
2639 }
2640 }
2641
2642 /// Reorders operands of the node to the given mask \p Mask.
2643 void reorderOperands(ArrayRef<int> Mask) {
2644 for (ValueList &Operand : Operands)
2645 reorderScalars(Operand, Mask);
2646 }
2647
2648 /// \returns the \p OpIdx operand of this TreeEntry.
2649 ValueList &getOperand(unsigned OpIdx) {
2650 assert(OpIdx < Operands.size() && "Off bounds");
2651 return Operands[OpIdx];
2652 }
2653
2654 /// \returns the \p OpIdx operand of this TreeEntry.
2655 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2656 assert(OpIdx < Operands.size() && "Off bounds");
2657 return Operands[OpIdx];
2658 }
2659
2660 /// \returns the number of operands.
2661 unsigned getNumOperands() const { return Operands.size(); }
2662
2663 /// \return the single \p OpIdx operand.
2664 Value *getSingleOperand(unsigned OpIdx) const {
2665 assert(OpIdx < Operands.size() && "Off bounds");
2666 assert(!Operands[OpIdx].empty() && "No operand available");
2667 return Operands[OpIdx][0];
2668 }
2669
2670 /// Some of the instructions in the list have alternate opcodes.
2671 bool isAltShuffle() const { return MainOp != AltOp; }
2672
2673 bool isOpcodeOrAlt(Instruction *I) const {
2674 unsigned CheckedOpcode = I->getOpcode();
2675 return (getOpcode() == CheckedOpcode ||
2676 getAltOpcode() == CheckedOpcode);
2677 }
2678
2679 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2680 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2681 /// \p OpValue.
2682 Value *isOneOf(Value *Op) const {
2683 auto *I = dyn_cast<Instruction>(Op);
2684 if (I && isOpcodeOrAlt(I))
2685 return Op;
2686 return MainOp;
2687 }
2688
2689 void setOperations(const InstructionsState &S) {
2690 MainOp = S.MainOp;
2691 AltOp = S.AltOp;
2692 }
2693
2694 Instruction *getMainOp() const {
2695 return MainOp;
2696 }
2697
2698 Instruction *getAltOp() const {
2699 return AltOp;
2700 }
2701
2702 /// The main/alternate opcodes for the list of instructions.
2703 unsigned getOpcode() const {
2704 return MainOp ? MainOp->getOpcode() : 0;
2705 }
2706
2707 unsigned getAltOpcode() const {
2708 return AltOp ? AltOp->getOpcode() : 0;
2709 }
2710
2711 /// When ReuseReorderShuffleIndices is empty it just returns position of \p
2712 /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
2713 int findLaneForValue(Value *V) const {
2714 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2715 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2716 if (!ReorderIndices.empty())
2717 FoundLane = ReorderIndices[FoundLane];
2718 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2719 if (!ReuseShuffleIndices.empty()) {
2720 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2721 find(ReuseShuffleIndices, FoundLane));
2722 }
2723 return FoundLane;
2724 }
2725
2726 /// Build a shuffle mask for graph entry which represents a merge of main
2727 /// and alternate operations.
2728 void
2729 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2731 SmallVectorImpl<Value *> *OpScalars = nullptr,
2732 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2733
2734#ifndef NDEBUG
2735 /// Debug printer.
2736 LLVM_DUMP_METHOD void dump() const {
2737 dbgs() << Idx << ".\n";
2738 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2739 dbgs() << "Operand " << OpI << ":\n";
2740 for (const Value *V : Operands[OpI])
2741 dbgs().indent(2) << *V << "\n";
2742 }
2743 dbgs() << "Scalars: \n";
2744 for (Value *V : Scalars)
2745 dbgs().indent(2) << *V << "\n";
2746 dbgs() << "State: ";
2747 switch (State) {
2748 case Vectorize:
2749 dbgs() << "Vectorize\n";
2750 break;
2751 case ScatterVectorize:
2752 dbgs() << "ScatterVectorize\n";
2753 break;
2754 case PossibleStridedVectorize:
2755 dbgs() << "PossibleStridedVectorize\n";
2756 break;
2757 case NeedToGather:
2758 dbgs() << "NeedToGather\n";
2759 break;
2760 }
2761 dbgs() << "MainOp: ";
2762 if (MainOp)
2763 dbgs() << *MainOp << "\n";
2764 else
2765 dbgs() << "NULL\n";
2766 dbgs() << "AltOp: ";
2767 if (AltOp)
2768 dbgs() << *AltOp << "\n";
2769 else
2770 dbgs() << "NULL\n";
2771 dbgs() << "VectorizedValue: ";
2772 if (VectorizedValue)
2773 dbgs() << *VectorizedValue << "\n";
2774 else
2775 dbgs() << "NULL\n";
2776 dbgs() << "ReuseShuffleIndices: ";
2777 if (ReuseShuffleIndices.empty())
2778 dbgs() << "Empty";
2779 else
2780 for (int ReuseIdx : ReuseShuffleIndices)
2781 dbgs() << ReuseIdx << ", ";
2782 dbgs() << "\n";
2783 dbgs() << "ReorderIndices: ";
2784 for (unsigned ReorderIdx : ReorderIndices)
2785 dbgs() << ReorderIdx << ", ";
2786 dbgs() << "\n";
2787 dbgs() << "UserTreeIndices: ";
2788 for (const auto &EInfo : UserTreeIndices)
2789 dbgs() << EInfo << ", ";
2790 dbgs() << "\n";
2791 }
2792#endif
2793 };
2794
2795#ifndef NDEBUG
2796 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2797 InstructionCost VecCost, InstructionCost ScalarCost,
2798 StringRef Banner) const {
2799 dbgs() << "SLP: " << Banner << ":\n";
2800 E->dump();
2801 dbgs() << "SLP: Costs:\n";
2802 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2803 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2804 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2805 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2806 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2807 }
2808#endif
2809
2810 /// Create a new VectorizableTree entry.
2811 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2812 std::optional<ScheduleData *> Bundle,
2813 const InstructionsState &S,
2814 const EdgeInfo &UserTreeIdx,
2815 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2816 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2817 TreeEntry::EntryState EntryState =
2818 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2819 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2820 ReuseShuffleIndices, ReorderIndices);
2821 }
2822
2823 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2824 TreeEntry::EntryState EntryState,
2825 std::optional<ScheduleData *> Bundle,
2826 const InstructionsState &S,
2827 const EdgeInfo &UserTreeIdx,
2828 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2829 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2830 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2831 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2832 "Need to vectorize gather entry?");
2833 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2834 TreeEntry *Last = VectorizableTree.back().get();
2835 Last->Idx = VectorizableTree.size() - 1;
2836 Last->State = EntryState;
2837 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2838 ReuseShuffleIndices.end());
2839 if (ReorderIndices.empty()) {
2840 Last->Scalars.assign(VL.begin(), VL.end());
2841 Last->setOperations(S);
2842 } else {
2843 // Reorder scalars and build final mask.
2844 Last->Scalars.assign(VL.size(), nullptr);
2845 transform(ReorderIndices, Last->Scalars.begin(),
2846 [VL](unsigned Idx) -> Value * {
2847 if (Idx >= VL.size())
2848 return UndefValue::get(VL.front()->getType());
2849 return VL[Idx];
2850 });
2851 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
2852 Last->setOperations(S);
2853 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2854 }
2855 if (Last->State != TreeEntry::NeedToGather) {
2856 for (Value *V : VL) {
2857 const TreeEntry *TE = getTreeEntry(V);
2858 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
2859 "Scalar already in tree!");
2860 if (TE) {
2861 if (TE != Last)
2862 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
2863 continue;
2864 }
2865 ScalarToTreeEntry[V] = Last;
2866 }
2867 // Update the scheduler bundle to point to this TreeEntry.
2868 ScheduleData *BundleMember = *Bundle;
2869 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2870 isVectorLikeInstWithConstOps(S.MainOp) ||
2871 doesNotNeedToSchedule(VL)) &&
2872 "Bundle and VL out of sync");
2873 if (BundleMember) {
2874 for (Value *V : VL) {
2876 continue;
2877 if (!BundleMember)
2878 continue;
2879 BundleMember->TE = Last;
2880 BundleMember = BundleMember->NextInBundle;
2881 }
2882 }
2883 assert(!BundleMember && "Bundle and VL out of sync");
2884 } else {
2885 MustGather.insert(VL.begin(), VL.end());
2886 }
2887
2888 if (UserTreeIdx.UserTE)
2889 Last->UserTreeIndices.push_back(UserTreeIdx);
2890
2891 return Last;
2892 }
2893
2894 /// -- Vectorization State --
2895 /// Holds all of the tree entries.
2896 TreeEntry::VecTreeTy VectorizableTree;
2897
2898#ifndef NDEBUG
2899 /// Debug printer.
2900 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
2901 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
2902 VectorizableTree[Id]->dump();
2903 dbgs() << "\n";
2904 }
2905 }
2906#endif
2907
2908 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
2909
2910 const TreeEntry *getTreeEntry(Value *V) const {
2911 return ScalarToTreeEntry.lookup(V);
2912 }
2913
2914 /// Checks if the specified list of the instructions/values can be vectorized
2915 /// and fills required data before actual scheduling of the instructions.
2916 TreeEntry::EntryState getScalarsVectorizationState(
2917 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
2918 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
2919
2920 /// Maps a specific scalar to its tree entry.
2921 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
2922
2923 /// List of scalars, used in several vectorize nodes, and the list of the
2924 /// nodes.
2926
2927 /// Maps a value to the proposed vectorizable size.
2928 SmallDenseMap<Value *, unsigned> InstrElementSize;
2929
2930 /// A list of scalars that we found that we need to keep as scalars.
2931 ValueSet MustGather;
2932
2933 /// A map between the vectorized entries and the last instructions in the
2934 /// bundles. The bundles are built in use order, not in the def order of the
2935 /// instructions. So, we cannot rely directly on the last instruction in the
2936 /// bundle being the last instruction in the program order during
2937 /// vectorization process since the basic blocks are affected, need to
2938 /// pre-gather them before.
2939 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
2940
2941 /// List of gather nodes, depending on other gather/vector nodes, which should
2942 /// be emitted after the vector instruction emission process to correctly
2943 /// handle order of the vector instructions and shuffles.
2944 SetVector<const TreeEntry *> PostponedGathers;
2945
2946 using ValueToGatherNodesMap =
2948 ValueToGatherNodesMap ValueToGatherNodes;
2949
2950 /// This POD struct describes one external user in the vectorized tree.
2951 struct ExternalUser {
2952 ExternalUser(Value *S, llvm::User *U, int L)
2953 : Scalar(S), User(U), Lane(L) {}
2954
2955 // Which scalar in our function.
2956 Value *Scalar;
2957
2958 // Which user that uses the scalar.
2960
2961 // Which lane does the scalar belong to.
2962 int Lane;
2963 };
2964 using UserList = SmallVector<ExternalUser, 16>;
2965
2966 /// Checks if two instructions may access the same memory.
2967 ///
2968 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
2969 /// is invariant in the calling loop.
2970 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
2971 Instruction *Inst2) {
2972 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
2973 return true;
2974 // First check if the result is already in the cache.
2975 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
2976 auto It = AliasCache.find(Key);
2977 if (It != AliasCache.end())
2978 return It->second;
2979 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
2980 // Store the result in the cache.
2981 AliasCache.try_emplace(Key, Aliased);
2982 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
2983 return Aliased;
2984 }
2985
2986 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
2987
2988 /// Cache for alias results.
2989 /// TODO: consider moving this to the AliasAnalysis itself.
2991
2992 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
2993 // globally through SLP because we don't perform any action which
2994 // invalidates capture results.
2995 BatchAAResults BatchAA;
2996
2997 /// Temporary store for deleted instructions. Instructions will be deleted
2998 /// eventually when the BoUpSLP is destructed. The deferral is required to
2999 /// ensure that there are no incorrect collisions in the AliasCache, which
3000 /// can happen if a new instruction is allocated at the same address as a
3001 /// previously deleted instruction.
3002 DenseSet<Instruction *> DeletedInstructions;
3003
3004 /// Set of the instruction, being analyzed already for reductions.
3005 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3006
3007 /// Set of hashes for the list of reduction values already being analyzed.
3008 DenseSet<size_t> AnalyzedReductionVals;
3009
3010 /// A list of values that need to extracted out of the tree.
3011 /// This list holds pairs of (Internal Scalar : External User). External User
3012 /// can be nullptr, it means that this Internal Scalar will be used later,
3013 /// after vectorization.
3014 UserList ExternalUses;
3015
3016 /// Values used only by @llvm.assume calls.
3018
3019 /// Holds all of the instructions that we gathered, shuffle instructions and
3020 /// extractelements.
3021 SetVector<Instruction *> GatherShuffleExtractSeq;
3022
3023 /// A list of blocks that we are going to CSE.
3024 DenseSet<BasicBlock *> CSEBlocks;
3025
3026 /// Contains all scheduling relevant data for an instruction.
3027 /// A ScheduleData either represents a single instruction or a member of an
3028 /// instruction bundle (= a group of instructions which is combined into a
3029 /// vector instruction).
3030 struct ScheduleData {
3031 // The initial value for the dependency counters. It means that the
3032 // dependencies are not calculated yet.
3033 enum { InvalidDeps = -1 };
3034
3035 ScheduleData() = default;
3036
3037 void init(int BlockSchedulingRegionID, Value *OpVal) {
3038 FirstInBundle = this;
3039 NextInBundle = nullptr;
3040 NextLoadStore = nullptr;
3041 IsScheduled = false;
3042 SchedulingRegionID = BlockSchedulingRegionID;
3043 clearDependencies();
3044 OpValue = OpVal;
3045 TE = nullptr;
3046 }
3047
3048 /// Verify basic self consistency properties
3049 void verify() {
3050 if (hasValidDependencies()) {
3051 assert(UnscheduledDeps <= Dependencies && "invariant");
3052 } else {
3053 assert(UnscheduledDeps == Dependencies && "invariant");
3054 }
3055
3056 if (IsScheduled) {
3057 assert(isSchedulingEntity() &&
3058 "unexpected scheduled state");
3059 for (const ScheduleData *BundleMember = this; BundleMember;
3060 BundleMember = BundleMember->NextInBundle) {
3061 assert(BundleMember->hasValidDependencies() &&
3062 BundleMember->UnscheduledDeps == 0 &&
3063 "unexpected scheduled state");
3064 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3065 "only bundle is marked scheduled");
3066 }
3067 }
3068
3069 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3070 "all bundle members must be in same basic block");
3071 }
3072
3073 /// Returns true if the dependency information has been calculated.
3074 /// Note that depenendency validity can vary between instructions within
3075 /// a single bundle.
3076 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3077
3078 /// Returns true for single instructions and for bundle representatives
3079 /// (= the head of a bundle).
3080 bool isSchedulingEntity() const { return FirstInBundle == this; }
3081
3082 /// Returns true if it represents an instruction bundle and not only a
3083 /// single instruction.
3084 bool isPartOfBundle() const {
3085 return NextInBundle != nullptr || FirstInBundle != this || TE;
3086 }
3087
3088 /// Returns true if it is ready for scheduling, i.e. it has no more
3089 /// unscheduled depending instructions/bundles.
3090 bool isReady() const {
3091 assert(isSchedulingEntity() &&
3092 "can't consider non-scheduling entity for ready list");
3093 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3094 }
3095
3096 /// Modifies the number of unscheduled dependencies for this instruction,
3097 /// and returns the number of remaining dependencies for the containing
3098 /// bundle.
3099 int incrementUnscheduledDeps(int Incr) {
3100 assert(hasValidDependencies() &&
3101 "increment of unscheduled deps would be meaningless");
3102 UnscheduledDeps += Incr;
3103 return FirstInBundle->unscheduledDepsInBundle();
3104 }
3105
3106 /// Sets the number of unscheduled dependencies to the number of
3107 /// dependencies.
3108 void resetUnscheduledDeps() {
3109 UnscheduledDeps = Dependencies;
3110 }
3111
3112 /// Clears all dependency information.
3113 void clearDependencies() {
3114 Dependencies = InvalidDeps;
3115 resetUnscheduledDeps();
3116 MemoryDependencies.clear();
3117 ControlDependencies.clear();
3118 }
3119
3120 int unscheduledDepsInBundle() const {
3121 assert(isSchedulingEntity() && "only meaningful on the bundle");
3122 int Sum = 0;
3123 for (const ScheduleData *BundleMember = this; BundleMember;
3124 BundleMember = BundleMember->NextInBundle) {
3125 if (BundleMember->UnscheduledDeps == InvalidDeps)
3126 return InvalidDeps;
3127 Sum += BundleMember->UnscheduledDeps;
3128 }
3129 return Sum;
3130 }
3131
3132 void dump(raw_ostream &os) const {
3133 if (!isSchedulingEntity()) {
3134 os << "/ " << *Inst;
3135 } else if (NextInBundle) {
3136 os << '[' << *Inst;
3137 ScheduleData *SD = NextInBundle;
3138 while (SD) {
3139 os << ';' << *SD->Inst;
3140 SD = SD->NextInBundle;
3141 }
3142 os << ']';
3143 } else {
3144 os << *Inst;
3145 }
3146 }
3147
3148 Instruction *Inst = nullptr;
3149
3150 /// Opcode of the current instruction in the schedule data.
3151 Value *OpValue = nullptr;
3152
3153 /// The TreeEntry that this instruction corresponds to.
3154 TreeEntry *TE = nullptr;
3155
3156 /// Points to the head in an instruction bundle (and always to this for
3157 /// single instructions).
3158 ScheduleData *FirstInBundle = nullptr;
3159
3160 /// Single linked list of all instructions in a bundle. Null if it is a
3161 /// single instruction.
3162 ScheduleData *NextInBundle = nullptr;
3163
3164 /// Single linked list of all memory instructions (e.g. load, store, call)
3165 /// in the block - until the end of the scheduling region.
3166 ScheduleData *NextLoadStore = nullptr;
3167
3168 /// The dependent memory instructions.
3169 /// This list is derived on demand in calculateDependencies().
3170 SmallVector<ScheduleData *, 4> MemoryDependencies;
3171
3172 /// List of instructions which this instruction could be control dependent
3173 /// on. Allowing such nodes to be scheduled below this one could introduce
3174 /// a runtime fault which didn't exist in the original program.
3175 /// ex: this is a load or udiv following a readonly call which inf loops
3176 SmallVector<ScheduleData *, 4> ControlDependencies;
3177
3178 /// This ScheduleData is in the current scheduling region if this matches
3179 /// the current SchedulingRegionID of BlockScheduling.
3180 int SchedulingRegionID = 0;
3181
3182 /// Used for getting a "good" final ordering of instructions.
3183 int SchedulingPriority = 0;
3184
3185 /// The number of dependencies. Constitutes of the number of users of the
3186 /// instruction plus the number of dependent memory instructions (if any).
3187 /// This value is calculated on demand.
3188 /// If InvalidDeps, the number of dependencies is not calculated yet.
3189 int Dependencies = InvalidDeps;
3190
3191 /// The number of dependencies minus the number of dependencies of scheduled
3192 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3193 /// for scheduling.
3194 /// Note that this is negative as long as Dependencies is not calculated.
3195 int UnscheduledDeps = InvalidDeps;
3196
3197 /// True if this instruction is scheduled (or considered as scheduled in the
3198 /// dry-run).
3199 bool IsScheduled = false;
3200 };
3201
3202#ifndef NDEBUG
3204 const BoUpSLP::ScheduleData &SD) {
3205 SD.dump(os);
3206 return os;
3207 }
3208#endif
3209
3210 friend struct GraphTraits<BoUpSLP *>;
3211 friend struct DOTGraphTraits<BoUpSLP *>;
3212
3213 /// Contains all scheduling data for a basic block.
3214 /// It does not schedules instructions, which are not memory read/write
3215 /// instructions and their operands are either constants, or arguments, or
3216 /// phis, or instructions from others blocks, or their users are phis or from
3217 /// the other blocks. The resulting vector instructions can be placed at the
3218 /// beginning of the basic block without scheduling (if operands does not need
3219 /// to be scheduled) or at the end of the block (if users are outside of the
3220 /// block). It allows to save some compile time and memory used by the
3221 /// compiler.
3222 /// ScheduleData is assigned for each instruction in between the boundaries of
3223 /// the tree entry, even for those, which are not part of the graph. It is
3224 /// required to correctly follow the dependencies between the instructions and
3225 /// their correct scheduling. The ScheduleData is not allocated for the
3226 /// instructions, which do not require scheduling, like phis, nodes with
3227 /// extractelements/insertelements only or nodes with instructions, with
3228 /// uses/operands outside of the block.
3229 struct BlockScheduling {
3230 BlockScheduling(BasicBlock *BB)
3231 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3232
3233 void clear() {
3234 ReadyInsts.clear();
3235 ScheduleStart = nullptr;
3236 ScheduleEnd = nullptr;
3237 FirstLoadStoreInRegion = nullptr;
3238 LastLoadStoreInRegion = nullptr;
3239 RegionHasStackSave = false;
3240
3241 // Reduce the maximum schedule region size by the size of the
3242 // previous scheduling run.
3243 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3244 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3245 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3246 ScheduleRegionSize = 0;
3247
3248 // Make a new scheduling region, i.e. all existing ScheduleData is not
3249 // in the new region yet.
3250 ++SchedulingRegionID;
3251 }
3252
3253 ScheduleData *getScheduleData(Instruction *I) {
3254 if (BB != I->getParent())
3255 // Avoid lookup if can't possibly be in map.
3256 return nullptr;
3257 ScheduleData *SD = ScheduleDataMap.lookup(I);
3258 if (SD && isInSchedulingRegion(SD))
3259 return SD;
3260 return nullptr;
3261 }
3262
3263 ScheduleData *getScheduleData(Value *V) {
3264 if (auto *I = dyn_cast<Instruction>(V))
3265 return getScheduleData(I);
3266 return nullptr;
3267 }
3268
3269 ScheduleData *getScheduleData(Value *V, Value *Key) {
3270 if (V == Key)
3271 return getScheduleData(V);
3272 auto I = ExtraScheduleDataMap.find(V);
3273 if (I != ExtraScheduleDataMap.end()) {
3274 ScheduleData *SD = I->second.lookup(Key);
3275 if (SD && isInSchedulingRegion(SD))
3276 return SD;
3277 }
3278 return nullptr;
3279 }
3280
3281 bool isInSchedulingRegion(ScheduleData *SD) const {
3282 return SD->SchedulingRegionID == SchedulingRegionID;
3283 }
3284
3285 /// Marks an instruction as scheduled and puts all dependent ready
3286 /// instructions into the ready-list.
3287 template <typename ReadyListType>
3288 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3289 SD->IsScheduled = true;
3290 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3291
3292 for (ScheduleData *BundleMember = SD; BundleMember;
3293 BundleMember = BundleMember->NextInBundle) {
3294 if (BundleMember->Inst != BundleMember->OpValue)
3295 continue;
3296
3297 // Handle the def-use chain dependencies.
3298
3299 // Decrement the unscheduled counter and insert to ready list if ready.
3300 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3301 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3302 if (OpDef && OpDef->hasValidDependencies() &&
3303 OpDef->incrementUnscheduledDeps(-1) == 0) {
3304 // There are no more unscheduled dependencies after
3305 // decrementing, so we can put the dependent instruction
3306 // into the ready list.
3307 ScheduleData *DepBundle = OpDef->FirstInBundle;
3308 assert(!DepBundle->IsScheduled &&
3309 "already scheduled bundle gets ready");
3310 ReadyList.insert(DepBundle);
3311 LLVM_DEBUG(dbgs()
3312 << "SLP: gets ready (def): " << *DepBundle << "\n");
3313 }
3314 });
3315 };
3316
3317 // If BundleMember is a vector bundle, its operands may have been
3318 // reordered during buildTree(). We therefore need to get its operands
3319 // through the TreeEntry.
3320 if (TreeEntry *TE = BundleMember->TE) {
3321 // Need to search for the lane since the tree entry can be reordered.
3322 int Lane = std::distance(TE->Scalars.begin(),
3323 find(TE->Scalars, BundleMember->Inst));
3324 assert(Lane >= 0 && "Lane not set");
3325
3326 // Since vectorization tree is being built recursively this assertion
3327 // ensures that the tree entry has all operands set before reaching
3328 // this code. Couple of exceptions known at the moment are extracts
3329 // where their second (immediate) operand is not added. Since
3330 // immediates do not affect scheduler behavior this is considered
3331 // okay.
3332 auto *In = BundleMember->Inst;
3333 assert(In &&
3334 (isa<ExtractValueInst, ExtractElementInst>(In) ||
3335 In->getNumOperands() == TE->getNumOperands()) &&
3336 "Missed TreeEntry operands?");
3337 (void)In; // fake use to avoid build failure when assertions disabled
3338
3339 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3340 OpIdx != NumOperands; ++OpIdx)
3341 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3342 DecrUnsched(I);
3343 } else {
3344 // If BundleMember is a stand-alone instruction, no operand reordering
3345 // has taken place, so we directly access its operands.
3346 for (Use &U : BundleMember->Inst->operands())
3347 if (auto *I = dyn_cast<Instruction>(U.get()))
3348 DecrUnsched(I);
3349 }
3350 // Handle the memory dependencies.
3351 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3352 if (MemoryDepSD->hasValidDependencies() &&
3353 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3354 // There are no more unscheduled dependencies after decrementing,
3355 // so we can put the dependent instruction into the ready list.
3356 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3357 assert(!DepBundle->IsScheduled &&
3358 "already scheduled bundle gets ready");
3359 ReadyList.insert(DepBundle);
3361 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3362 }
3363 }
3364 // Handle the control dependencies.
3365 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3366 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3367 // There are no more unscheduled dependencies after decrementing,
3368 // so we can put the dependent instruction into the ready list.
3369 ScheduleData *DepBundle = DepSD->FirstInBundle;
3370 assert(!DepBundle->IsScheduled &&
3371 "already scheduled bundle gets ready");
3372 ReadyList.insert(DepBundle);
3374 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3375 }
3376 }
3377 }
3378 }
3379
3380 /// Verify basic self consistency properties of the data structure.
3381 void verify() {
3382 if (!ScheduleStart)
3383 return;
3384
3385 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3386 ScheduleStart->comesBefore(ScheduleEnd) &&
3387 "Not a valid scheduling region?");
3388
3389 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3390 auto *SD = getScheduleData(I);
3391 if (!SD)
3392 continue;
3393 assert(isInSchedulingRegion(SD) &&
3394 "primary schedule data not in window?");
3395 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3396 "entire bundle in window!");
3397 (void)SD;
3398 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3399 }
3400
3401 for (auto *SD : ReadyInsts) {
3402 assert(SD->isSchedulingEntity() && SD->isReady() &&
3403 "item in ready list not ready?");
3404 (void)SD;
3405 }
3406 }
3407
3408 void doForAllOpcodes(Value *V,
3409 function_ref<void(ScheduleData *SD)> Action) {
3410 if (ScheduleData *SD = getScheduleData(V))
3411 Action(SD);
3412 auto I = ExtraScheduleDataMap.find(V);
3413 if (I != ExtraScheduleDataMap.end())
3414 for (auto &P : I->second)
3415 if (isInSchedulingRegion(P.second))
3416 Action(P.second);
3417 }
3418
3419 /// Put all instructions into the ReadyList which are ready for scheduling.
3420 template <typename ReadyListType>
3421 void initialFillReadyList(ReadyListType &ReadyList) {
3422 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3423 doForAllOpcodes(I, [&](ScheduleData *SD) {
3424 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3425 SD->isReady()) {
3426 ReadyList.insert(SD);
3427 LLVM_DEBUG(dbgs()
3428 << "SLP: initially in ready list: " << *SD << "\n");
3429 }
3430 });
3431 }
3432 }
3433
3434 /// Build a bundle from the ScheduleData nodes corresponding to the
3435 /// scalar instruction for each lane.
3436 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3437
3438 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3439 /// cyclic dependencies. This is only a dry-run, no instructions are
3440 /// actually moved at this stage.
3441 /// \returns the scheduling bundle. The returned Optional value is not
3442 /// std::nullopt if \p VL is allowed to be scheduled.
3443 std::optional<ScheduleData *>
3444 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3445 const InstructionsState &S);
3446
3447 /// Un-bundles a group of instructions.
3448 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3449
3450 /// Allocates schedule data chunk.
3451 ScheduleData *allocateScheduleDataChunks();
3452
3453 /// Extends the scheduling region so that V is inside the region.
3454 /// \returns true if the region size is within the limit.
3455 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3456
3457 /// Initialize the ScheduleData structures for new instructions in the
3458 /// scheduling region.
3459 void initScheduleData(Instruction *FromI, Instruction *ToI,
3460 ScheduleData *PrevLoadStore,
3461 ScheduleData *NextLoadStore);
3462
3463 /// Updates the dependency information of a bundle and of all instructions/
3464 /// bundles which depend on the original bundle.
3465 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3466 BoUpSLP *SLP);
3467
3468 /// Sets all instruction in the scheduling region to un-scheduled.
3469 void resetSchedule();
3470
3471 BasicBlock *BB;
3472
3473 /// Simple memory allocation for ScheduleData.
3475
3476 /// The size of a ScheduleData array in ScheduleDataChunks.
3477 int ChunkSize;
3478
3479 /// The allocator position in the current chunk, which is the last entry
3480 /// of ScheduleDataChunks.
3481 int ChunkPos;
3482
3483 /// Attaches ScheduleData to Instruction.
3484 /// Note that the mapping survives during all vectorization iterations, i.e.
3485 /// ScheduleData structures are recycled.
3487
3488 /// Attaches ScheduleData to Instruction with the leading key.
3490 ExtraScheduleDataMap;
3491
3492 /// The ready-list for scheduling (only used for the dry-run).
3493 SetVector<ScheduleData *> ReadyInsts;
3494
3495 /// The first instruction of the scheduling region.
3496 Instruction *ScheduleStart = nullptr;
3497
3498 /// The first instruction _after_ the scheduling region.
3499 Instruction *ScheduleEnd = nullptr;
3500
3501 /// The first memory accessing instruction in the scheduling region
3502 /// (can be null).
3503 ScheduleData *FirstLoadStoreInRegion = nullptr;
3504
3505 /// The last memory accessing instruction in the scheduling region
3506 /// (can be null).
3507 ScheduleData *LastLoadStoreInRegion = nullptr;
3508
3509 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3510 /// region? Used to optimize the dependence calculation for the
3511 /// common case where there isn't.
3512 bool RegionHasStackSave = false;
3513
3514 /// The current size of the scheduling region.
3515 int ScheduleRegionSize = 0;
3516
3517 /// The maximum size allowed for the scheduling region.
3518 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3519
3520 /// The ID of the scheduling region. For a new vectorization iteration this
3521 /// is incremented which "removes" all ScheduleData from the region.
3522 /// Make sure that the initial SchedulingRegionID is greater than the
3523 /// initial SchedulingRegionID in ScheduleData (which is 0).
3524 int SchedulingRegionID = 1;
3525 };
3526
3527 /// Attaches the BlockScheduling structures to basic blocks.
3529
3530 /// Performs the "real" scheduling. Done before vectorization is actually
3531 /// performed in a basic block.
3532 void scheduleBlock(BlockScheduling *BS);
3533
3534 /// List of users to ignore during scheduling and that don't need extracting.
3535 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3536
3537 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3538 /// sorted SmallVectors of unsigned.
3539 struct OrdersTypeDenseMapInfo {
3540 static OrdersType getEmptyKey() {
3541 OrdersType V;
3542 V.push_back(~1U);
3543 return V;
3544 }
3545
3546 static OrdersType getTombstoneKey() {
3547 OrdersType V;
3548 V.push_back(~2U);
3549 return V;
3550 }
3551
3552 static unsigned getHashValue(const OrdersType &V) {
3553 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3554 }
3555
3556 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3557 return LHS == RHS;
3558 }
3559 };
3560
3561 // Analysis and block reference.
3562 Function *F;
3563 ScalarEvolution *SE;
3565 TargetLibraryInfo *TLI;
3566 LoopInfo *LI;
3567 DominatorTree *DT;
3568 AssumptionCache *AC;
3569 DemandedBits *DB;
3570 const DataLayout *DL;
3572
3573 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3574 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3575
3576 /// Instruction builder to construct the vectorized tree.
3577 IRBuilder<> Builder;
3578
3579 /// A map of scalar integer values to the smallest bit width with which they
3580 /// can legally be represented. The values map to (width, signed) pairs,
3581 /// where "width" indicates the minimum bit width and "signed" is True if the
3582 /// value must be signed-extended, rather than zero-extended, back to its
3583 /// original width.
3585};
3586
3587} // end namespace slpvectorizer
3588
3589template <> struct GraphTraits<BoUpSLP *> {
3590 using TreeEntry = BoUpSLP::TreeEntry;
3591
3592 /// NodeRef has to be a pointer per the GraphWriter.
3594
3596
3597 /// Add the VectorizableTree to the index iterator to be able to return
3598 /// TreeEntry pointers.
3599 struct ChildIteratorType
3600 : public iterator_adaptor_base<
3601 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3603
3605 ContainerTy &VT)
3606 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3607
3608 NodeRef operator*() { return I->UserTE; }
3609 };
3610
3612 return R.VectorizableTree[0].get();
3613 }
3614
3615 static ChildIteratorType child_begin(NodeRef N) {
3616 return {N->UserTreeIndices.begin(), N->Container};
3617 }
3618
3619 static ChildIteratorType child_end(NodeRef N) {
3620 return {N->UserTreeIndices.end(), N->Container};
3621 }
3622
3623 /// For the node iterator we just need to turn the TreeEntry iterator into a
3624 /// TreeEntry* iterator so that it dereferences to NodeRef.
3625 class nodes_iterator {
3627 ItTy It;
3628
3629 public:
3630 nodes_iterator(const ItTy &It2) : It(It2) {}
3631 NodeRef operator*() { return It->get(); }
3632 nodes_iterator operator++() {
3633 ++It;
3634 return *this;
3635 }
3636 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3637 };
3638
3639 static nodes_iterator nodes_begin(BoUpSLP *R) {
3640 return nodes_iterator(R->VectorizableTree.begin());
3641 }
3642
3643 static nodes_iterator nodes_end(BoUpSLP *R) {
3644 return nodes_iterator(R->VectorizableTree.end());
3645 }
3646
3647 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3648};
3649
3650template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3651 using TreeEntry = BoUpSLP::TreeEntry;
3652
3653 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3654
3655 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3656 std::string Str;
3658 OS << Entry->Idx << ".\n";
3659 if (isSplat(Entry->Scalars))
3660 OS << "<splat> ";
3661 for (auto *V : Entry->Scalars) {
3662 OS << *V;
3663 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3664 return EU.Scalar == V;
3665 }))
3666 OS << " <extract>";
3667 OS << "\n";
3668 }
3669 return Str;
3670 }
3671
3672 static std::string getNodeAttributes(const TreeEntry *Entry,
3673 const BoUpSLP *) {
3674 if (Entry->State == TreeEntry::NeedToGather)
3675 return "color=red";
3676 if (Entry->State == TreeEntry::ScatterVectorize ||
3677 Entry->State == TreeEntry::PossibleStridedVectorize)
3678 return "color=blue";
3679 return "";
3680 }
3681};
3682
3683} // end namespace llvm
3684
3687 for (auto *I : DeletedInstructions) {
3688 for (Use &U : I->operands()) {
3689 auto *Op = dyn_cast<Instruction>(U.get());
3690 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3692 DeadInsts.emplace_back(Op);
3693 }
3694 I->dropAllReferences();
3695 }
3696 for (auto *I : DeletedInstructions) {
3697 assert(I->use_empty() &&
3698 "trying to erase instruction with users.");
3699 I->eraseFromParent();
3700 }
3701
3702 // Cleanup any dead scalar code feeding the vectorized instructions
3704
3705#ifdef EXPENSIVE_CHECKS
3706 // If we could guarantee that this call is not extremely slow, we could
3707 // remove the ifdef limitation (see PR47712).
3708 assert(!verifyFunction(*F, &dbgs()));
3709#endif
3710}
3711
3712/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3713/// contains original mask for the scalars reused in the node. Procedure
3714/// transform this mask in accordance with the given \p Mask.
3716 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3717 "Expected non-empty mask.");
3718 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3719 Prev.swap(Reuses);
3720 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3721 if (Mask[I] != PoisonMaskElem)
3722 Reuses[Mask[I]] = Prev[I];
3723}
3724
3725/// Reorders the given \p Order according to the given \p Mask. \p Order - is
3726/// the original order of the scalars. Procedure transforms the provided order
3727/// in accordance with the given \p Mask. If the resulting \p Order is just an
3728/// identity order, \p Order is cleared.
3730 assert(!Mask.empty() && "Expected non-empty mask.");
3731 SmallVector<int> MaskOrder;
3732 if (Order.empty()) {
3733 MaskOrder.resize(Mask.size());
3734 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3735 } else {
3736 inversePermutation(Order, MaskOrder);
3737 }
3738 reorderReuses(MaskOrder, Mask);
3739 if (ShuffleVectorInst::isIdentityMask(MaskOrder, MaskOrder.size())) {
3740 Order.clear();
3741 return;
3742 }
3743 Order.assign(Mask.size(), Mask.size());
3744 for (unsigned I = 0, E = Mask.size(); I < E; ++I)
3745 if (MaskOrder[I] != PoisonMaskElem)
3746 Order[MaskOrder[I]] = I;
3747 fixupOrderingIndices(Order);
3748}
3749
3750std::optional<BoUpSLP::OrdersType>
3751BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3752 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3753 unsigned NumScalars = TE.Scalars.size();
3754 OrdersType CurrentOrder(NumScalars, NumScalars);
3755 SmallVector<int> Positions;
3756 SmallBitVector UsedPositions(NumScalars);
3757 const TreeEntry *STE = nullptr;
3758 // Try to find all gathered scalars that are gets vectorized in other
3759 // vectorize node. Here we can have only one single tree vector node to
3760 // correctly identify order of the gathered scalars.
3761 for (unsigned I = 0; I < NumScalars; ++I) {
3762 Value *V = TE.Scalars[I];
3763 if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
3764 continue;
3765 if (const auto *LocalSTE = getTreeEntry(V)) {
3766 if (!STE)
3767 STE = LocalSTE;
3768 else if (STE != LocalSTE)
3769 // Take the order only from the single vector node.
3770 return std::nullopt;
3771 unsigned Lane =
3772 std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
3773 if (Lane >= NumScalars)
3774 return std::nullopt;
3775 if (CurrentOrder[Lane] != NumScalars) {
3776 if (Lane != I)
3777 continue;
3778 UsedPositions.reset(CurrentOrder[Lane]);
3779 }
3780 // The partial identity (where only some elements of the gather node are
3781 // in the identity order) is good.
3782 CurrentOrder[Lane] = I;
3783 UsedPositions.set(I);
3784 }
3785 }
3786 // Need to keep the order if we have a vector entry and at least 2 scalars or
3787 // the vectorized entry has just 2 scalars.
3788 if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
3789 auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
3790 for (unsigned I = 0; I < NumScalars; ++I)
3791 if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
3792 return false;
3793 return true;
3794 };
3795 if (IsIdentityOrder(CurrentOrder))
3796 return OrdersType();
3797 auto *It = CurrentOrder.begin();
3798 for (unsigned I = 0; I < NumScalars;) {
3799 if (UsedPositions.test(I)) {
3800 ++I;
3801 continue;
3802 }
3803 if (*It == NumScalars) {
3804 *It = I;
3805 ++I;
3806 }
3807 ++It;
3808 }
3809 return std::move(CurrentOrder);
3810 }
3811 return std::nullopt;
3812}
3813
3814namespace {
3815/// Tracks the state we can represent the loads in the given sequence.
3816enum class LoadsState {
3817 Gather,
3818 Vectorize,
3819 ScatterVectorize,
3820 PossibleStridedVectorize
3821};
3822} // anonymous namespace
3823
3824static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
3825 const TargetLibraryInfo &TLI,
3826 bool CompareOpcodes = true) {
3827 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
3828 return false;
3829 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
3830 if (!GEP1)
3831 return false;
3832 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
3833 if (!GEP2)
3834 return false;
3835 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
3836 ((isConstant(GEP1->getOperand(1)) &&
3837 isConstant(GEP2->getOperand(1))) ||
3838 !CompareOpcodes ||
3839 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
3840 .getOpcode());
3841}
3842
3843/// Checks if the given array of loads can be represented as a vectorized,
3844/// scatter or just simple gather.
3845static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
3846 const TargetTransformInfo &TTI,
3847 const DataLayout &DL, ScalarEvolution &SE,
3848 LoopInfo &LI, const TargetLibraryInfo &TLI,
3850 SmallVectorImpl<Value *> &PointerOps) {
3851 // Check that a vectorized load would load the same memory as a scalar
3852 // load. For example, we don't want to vectorize loads that are smaller
3853 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
3854 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
3855 // from such a struct, we read/write packed bits disagreeing with the
3856 // unvectorized version.
3857 Type *ScalarTy = VL0->getType();
3858
3859 if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
3860 return LoadsState::Gather;
3861
3862 // Make sure all loads in the bundle are simple - we can't vectorize
3863 // atomic or volatile loads.
3864 PointerOps.clear();
3865 PointerOps.resize(VL.size());
3866 auto *POIter = PointerOps.begin();
3867 for (Value *V : VL) {
3868 auto *L = cast<LoadInst>(V);
3869 if (!L->isSimple())
3870 return LoadsState::Gather;
3871 *POIter = L->getPointerOperand();
3872 ++POIter;
3873 }
3874
3875 Order.clear();
3876 // Check the order of pointer operands or that all pointers are the same.
3877 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
3878 if (IsSorted || all_of(PointerOps, [&](Value *P) {
3879 return arePointersCompatible(P, PointerOps.front(), TLI);
3880 })) {
3881 bool IsPossibleStrided = false;
3882 if (IsSorted) {
3883 Value *Ptr0;
3884 Value *PtrN;
3885 if (Order.empty()) {
3886 Ptr0 = PointerOps.front();
3887 PtrN = PointerOps.back();
3888 } else {
3889 Ptr0 = PointerOps[Order.front()];
3890 PtrN = PointerOps[Order.back()];
3891 }
3892 std::optional<int> Diff =
3893 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
3894 // Check that the sorted loads are consecutive.
3895 if (static_cast<unsigned>(*Diff) == VL.size() - 1)
3896 return LoadsState::Vectorize;
3897 // Simple check if not a strided access - clear order.
3898 IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
3899 }
3900 // TODO: need to improve analysis of the pointers, if not all of them are
3901 // GEPs or have > 2 operands, we end up with a gather node, which just
3902 // increases the cost.
3903 Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
3904 bool ProfitableGatherPointers =
3905 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
3906 return L && L->isLoopInvariant(V);
3907 })) <= VL.size() / 2 && VL.size() > 2;
3908 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
3909 auto *GEP = dyn_cast<GetElementPtrInst>(P);
3910 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
3911 (GEP && GEP->getNumOperands() == 2);
3912 })) {
3913 Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
3914 for (Value *V : VL)
3915 CommonAlignment =
3916 std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
3917 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
3918 if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
3919 !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
3920 return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
3921 : LoadsState::ScatterVectorize;
3922 }
3923 }
3924
3925 return LoadsState::Gather;
3926}
3927
3929 const DataLayout &DL, ScalarEvolution &SE,
3930 SmallVectorImpl<unsigned> &SortedIndices) {
3932 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
3933 "Expected list of pointer operands.");
3934 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
3935 // Ptr into, sort and return the sorted indices with values next to one
3936 // another.
3938 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
3939
3940 unsigned Cnt = 1;
3941 for (Value *Ptr : VL.drop_front()) {
3942 bool Found = any_of(Bases, [&](auto &Base) {
3943 std::optional<int> Diff =
3944 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
3945 /*StrictCheck=*/true);
3946 if (!Diff)
3947 return false;
3948
3949 Base.second.emplace_back(Ptr, *Diff, Cnt++);
3950 return true;
3951 });
3952
3953 if (!Found) {
3954 // If we haven't found enough to usefully cluster, return early.
3955 if (Bases.size() > VL.size() / 2 - 1)
3956 return false;
3957
3958 // Not found already - add a new Base
3959 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
3960 }
3961 }
3962
3963 // For each of the bases sort the pointers by Offset and check if any of the
3964 // base become consecutively allocated.
3965 bool AnyConsecutive = false;
3966 for (auto &Base : Bases) {
3967 auto &Vec = Base.second;
3968 if (Vec.size() > 1) {
3969 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
3970 const std::tuple<Value *, int, unsigned> &Y) {
3971 return std::get<1>(X) < std::get<1>(Y);
3972 });
3973 int InitialOffset = std::get<1>(Vec[0]);
3974 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
3975 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
3976 });
3977 }
3978 }
3979
3980 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
3981 SortedIndices.clear();
3982 if (!AnyConsecutive)
3983 return false;
3984
3985 for (auto &Base : Bases) {
3986 for (auto &T : Base.second)
3987 SortedIndices.push_back(std::get<2>(T));
3988 }
3989
3990 assert(SortedIndices.size() == VL.size() &&
3991 "Expected SortedIndices to be the size of VL");
3992 return true;
3993}
3994
3995std::optional<BoUpSLP::OrdersType>
3996BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
3997 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3998 Type *ScalarTy = TE.Scalars[0]->getType();
3999
4001 Ptrs.reserve(TE.Scalars.size());
4002 for (Value *V : TE.Scalars) {
4003 auto *L = dyn_cast<LoadInst>(V);
4004 if (!L || !L->isSimple())
4005 return std::nullopt;
4006 Ptrs.push_back(L->getPointerOperand());
4007 }
4008
4009 BoUpSLP::OrdersType Order;
4010 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4011 return std::move(Order);
4012 return std::nullopt;
4013}
4014
4015/// Check if two insertelement instructions are from the same buildvector.
4018 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4019 // Instructions must be from the same basic blocks.
4020 if (VU->getParent() != V->getParent())
4021 return false;
4022 // Checks if 2 insertelements are from the same buildvector.
4023 if (VU->getType() != V->getType())
4024 return false;
4025 // Multiple used inserts are separate nodes.
4026 if (!VU->hasOneUse() && !V->hasOneUse())
4027 return false;
4028 auto *IE1 = VU;
4029 auto *IE2 = V;
4030 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4031 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4032 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4033 return false;
4034 // Go through the vector operand of insertelement instructions trying to find
4035 // either VU as the original vector for IE2 or V as the original vector for
4036 // IE1.
4037 SmallBitVector ReusedIdx(
4038 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4039 bool IsReusedIdx = false;
4040 do {
4041 if (IE2 == VU && !IE1)
4042 return VU->hasOneUse();
4043 if (IE1 == V && !IE2)
4044 return V->hasOneUse();
4045 if (IE1 && IE1 != V) {
4046 unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4047 IsReusedIdx |= ReusedIdx.test(Idx1);
4048 ReusedIdx.set(Idx1);
4049 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4050 IE1 = nullptr;
4051 else
4052 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4053 }
4054 if (IE2 && IE2 != VU) {
4055 unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4056 IsReusedIdx |= ReusedIdx.test(Idx2);
4057 ReusedIdx.set(Idx2);
4058 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4059 IE2 = nullptr;
4060 else
4061 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4062 }
4063 } while (!IsReusedIdx && (IE1 || IE2));
4064 return false;
4065}
4066
4067std::optional<BoUpSLP::OrdersType>
4068BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4069 // No need to reorder if need to shuffle reuses, still need to shuffle the
4070 // node.
4071 if (!TE.ReuseShuffleIndices.empty()) {
4072 // Check if reuse shuffle indices can be improved by reordering.
4073 // For this, check that reuse mask is "clustered", i.e. each scalar values
4074 // is used once in each submask of size <number_of_scalars>.
4075 // Example: 4 scalar values.
4076 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4077 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4078 // element 3 is used twice in the second submask.
4079 unsigned Sz = TE.Scalars.size();
4080 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4081 Sz))
4082 return std::nullopt;
4083 unsigned VF = TE.getVectorFactor();
4084 // Try build correct order for extractelement instructions.
4085 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4086 TE.ReuseShuffleIndices.end());
4087 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4088 all_of(TE.Scalars, [Sz](Value *V) {
4089 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4090 return Idx && *Idx < Sz;
4091 })) {
4092 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4093 if (TE.ReorderIndices.empty())
4094 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4095 else
4096 inversePermutation(TE.ReorderIndices, ReorderMask);
4097 for (unsigned I = 0; I < VF; ++I) {
4098 int &Idx = ReusedMask[I];
4099 if (Idx == PoisonMaskElem)
4100 continue;
4101 Value *V = TE.Scalars[ReorderMask[Idx]];
4102 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4103 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4104 }
4105 }
4106 // Build the order of the VF size, need to reorder reuses shuffles, they are
4107 // always of VF size.
4108 OrdersType ResOrder(VF);
4109 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4110 auto *It = ResOrder.begin();
4111 for (unsigned K = 0; K < VF; K += Sz) {
4112 OrdersType CurrentOrder(TE.ReorderIndices);
4113 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4114 if (SubMask.front() == PoisonMaskElem)
4115 std::iota(SubMask.begin(), SubMask.end(), 0);
4116 reorderOrder(CurrentOrder, SubMask);
4117 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4118 std::advance(It, Sz);
4119 }
4120 if (all_of(enumerate(ResOrder),
4121 [](const auto &Data) { return Data.index() == Data.value(); }))
4122 return std::nullopt; // No need to reorder.
4123 return std::move(ResOrder);
4124 }
4125 if ((TE.State == TreeEntry::Vectorize ||
4126 TE.State == TreeEntry::PossibleStridedVectorize) &&
4127 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4128 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4129 !TE.isAltShuffle())
4130 return TE.ReorderIndices;
4131 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4132 auto PHICompare = [&](unsigned I1, unsigned I2) {
4133 Value *V1 = TE.Scalars[I1];
4134 Value *V2 = TE.Scalars[I2];
4135 if (V1 == V2)
4136 return false;
4137 if (!V1->hasOneUse() || !V2->hasOneUse())
4138 return false;
4139 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4140 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4141 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4142 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4144 IE1, IE2,
4145 [](InsertElementInst *II) { return II->getOperand(0); }))
4146 return false;
4147 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4148 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4149 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4150 return false;
4151 return *Idx1 < *Idx2;
4152 }
4153 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4154 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4155 if (EE1->getOperand(0) != EE2->getOperand(0))
4156 return false;
4157 std::optional<unsigned> Idx1 = getExtractIndex(EE1);
4158 std::optional<unsigned> Idx2 = getExtractIndex(EE2);
4159 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4160 return false;
4161 return *Idx1 < *Idx2;
4162 }
4163 return false;
4164 };
4165 auto IsIdentityOrder = [](const OrdersType &Order) {
4166 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4167 if (Idx != Order[Idx])
4168 return false;
4169 return true;
4170 };
4171 if (!TE.ReorderIndices.empty())
4172 return TE.ReorderIndices;
4174 SmallVector<unsigned> Phis(TE.Scalars.size());
4175 std::iota(Phis.begin(), Phis.end(), 0);
4176 OrdersType ResOrder(TE.Scalars.size());
4177 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4178 PhiToId[Id] = Id;
4179 stable_sort(Phis, PHICompare);
4180 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4181 ResOrder[Id] = PhiToId[Phis[Id]];
4182 if (IsIdentityOrder(ResOrder))
4183 return std::nullopt; // No need to reorder.
4184 return std::move(ResOrder);
4185 }
4186 if (TE.State == TreeEntry::NeedToGather) {
4187 // TODO: add analysis of other gather nodes with extractelement
4188 // instructions and other values/instructions, not only undefs.
4189 if (((TE.getOpcode() == Instruction::ExtractElement &&
4190 !TE.isAltShuffle()) ||
4191 (all_of(TE.Scalars,
4192 [](Value *V) {
4193 return isa<UndefValue, ExtractElementInst>(V);
4194 }) &&
4195 any_of(TE.Scalars,
4196 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
4197 all_of(TE.Scalars,
4198 [](Value *V) {
4199 auto *EE = dyn_cast<ExtractElementInst>(V);
4200 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4201 }) &&
4202 allSameType(TE.Scalars)) {
4203 // Check that gather of extractelements can be represented as
4204 // just a shuffle of a single vector.
4205 OrdersType CurrentOrder;
4206 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4207 /*ResizeAllowed=*/true);
4208 if (Reuse || !CurrentOrder.empty()) {
4209 if (!CurrentOrder.empty())
4210 fixupOrderingIndices(CurrentOrder);
4211 return std::move(CurrentOrder);
4212 }
4213 }
4214 // If the gather node is <undef, v, .., poison> and
4215 // insertelement poison, v, 0 [+ permute]
4216 // is cheaper than
4217 // insertelement poison, v, n - try to reorder.
4218 // If rotating the whole graph, exclude the permute cost, the whole graph
4219 // might be transformed.
4220 int Sz = TE.Scalars.size();
4221 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4222 count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
4223 const auto *It =
4224 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4225 if (It == TE.Scalars.begin())
4226 return OrdersType();
4227 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4228 if (It != TE.Scalars.end()) {
4229 OrdersType Order(Sz, Sz);
4230 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4231 Order[Idx] = 0;
4232 fixupOrderingIndices(Order);
4233 SmallVector<int> Mask;
4234 inversePermutation(Order, Mask);
4235 InstructionCost PermuteCost =
4236 TopToBottom
4237 ? 0
4239 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4240 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4241 PoisonValue::get(Ty), *It);
4242 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4243 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4244 PoisonValue::get(Ty), *It);
4245 if (InsertFirstCost + PermuteCost < InsertIdxCost)
4246 return std::move(Order);
4247 }
4248 }
4249 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4250 return CurrentOrder;
4251 if (TE.Scalars.size() >= 4)
4252 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4253 return Order;
4254 }
4255 return std::nullopt;
4256}
4257
4258/// Checks if the given mask is a "clustered" mask with the same clusters of
4259/// size \p Sz, which are not identity submasks.
4261 unsigned Sz) {
4262 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4263 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
4264 return false;
4265 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4266 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4267 if (Cluster != FirstCluster)
4268 return false;
4269 }
4270 return true;
4271}
4272
4273void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4274 // Reorder reuses mask.
4275 reorderReuses(TE.ReuseShuffleIndices, Mask);
4276 const unsigned Sz = TE.Scalars.size();
4277 // For vectorized and non-clustered reused no need to do anything else.
4278 if (TE.State != TreeEntry::NeedToGather ||
4280 Sz) ||
4281 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4282 return;
4283 SmallVector<int> NewMask;
4284 inversePermutation(TE.ReorderIndices, NewMask);
4285 addMask(NewMask, TE.ReuseShuffleIndices);
4286 // Clear reorder since it is going to be applied to the new mask.
4287 TE.ReorderIndices.clear();
4288 // Try to improve gathered nodes with clustered reuses, if possible.
4289 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4290 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4291 inversePermutation(NewOrder, NewMask);
4292 reorderScalars(TE.Scalars, NewMask);
4293 // Fill the reuses mask with the identity submasks.
4294 for (auto *It = TE.ReuseShuffleIndices.begin(),
4295 *End = TE.ReuseShuffleIndices.end();
4296 It != End; std::advance(It, Sz))
4297 std::iota(It, std::next(It, Sz), 0);
4298}
4299
4301 // Maps VF to the graph nodes.
4303 // ExtractElement gather nodes which can be vectorized and need to handle
4304 // their ordering.
4306
4307 // Phi nodes can have preferred ordering based on their result users
4309
4310 // AltShuffles can also have a preferred ordering that leads to fewer
4311 // instructions, e.g., the addsub instruction in x86.
4312 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4313
4314 // Maps a TreeEntry to the reorder indices of external users.
4316 ExternalUserReorderMap;
4317 // FIXME: Workaround for syntax error reported by MSVC buildbots.
4318 TargetTransformInfo &TTIRef = *TTI;
4319 // Find all reorderable nodes with the given VF.
4320 // Currently the are vectorized stores,loads,extracts + some gathering of
4321 // extracts.
4322 for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
4323 &GathersToOrders, &ExternalUserReorderMap,
4324 &AltShufflesToOrders, &PhisToOrders](
4325 const std::unique_ptr<TreeEntry> &TE) {
4326 // Look for external users that will probably be vectorized.
4327 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4328 findExternalStoreUsersReorderIndices(TE.get());
4329 if (!ExternalUserReorderIndices.empty()) {
4330 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4331 ExternalUserReorderMap.try_emplace(TE.get(),
4332 std::move(ExternalUserReorderIndices));
4333 }
4334
4335 // Patterns like [fadd,fsub] can be combined into a single instruction in
4336 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4337 // to take into account their order when looking for the most used order.
4338 if (TE->isAltShuffle()) {
4339 VectorType *VecTy =
4340 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
4341 unsigned Opcode0 = TE->getOpcode();
4342 unsigned Opcode1 = TE->getAltOpcode();
4343 // The opcode mask selects between the two opcodes.
4344 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4345 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4346 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4347 OpcodeMask.set(Lane);
4348 // If this pattern is supported by the target then we consider the order.
4349 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4350 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4351 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
4352 }
4353 // TODO: Check the reverse order too.
4354 }
4355
4356 if (std::optional<OrdersType> CurrentOrder =
4357 getReorderingData(*TE, /*TopToBottom=*/true)) {
4358 // Do not include ordering for nodes used in the alt opcode vectorization,
4359 // better to reorder them during bottom-to-top stage. If follow the order
4360 // here, it causes reordering of the whole graph though actually it is
4361 // profitable just to reorder the subgraph that starts from the alternate
4362 // opcode vectorization node. Such nodes already end-up with the shuffle
4363 // instruction and it is just enough to change this shuffle rather than
4364 // rotate the scalars for the whole graph.
4365 unsigned Cnt = 0;
4366 const TreeEntry *UserTE = TE.get();
4367 while (UserTE && Cnt < RecursionMaxDepth) {
4368 if (UserTE->UserTreeIndices.size() != 1)
4369 break;
4370 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
4371 return EI.UserTE->State == TreeEntry::Vectorize &&
4372 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4373 }))
4374 return;
4375 UserTE = UserTE->UserTreeIndices.back().UserTE;
4376 ++Cnt;
4377 }
4378 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4379 if (!(TE->State == TreeEntry::Vectorize ||
4380 TE->State == TreeEntry::PossibleStridedVectorize) ||
4381 !TE->ReuseShuffleIndices.empty())
4382 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4383 if (TE->State == TreeEntry::Vectorize &&
4384 TE->getOpcode() == Instruction::PHI)
4385 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
4386 }
4387 });
4388
4389 // Reorder the graph nodes according to their vectorization factor.
4390 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
4391 VF /= 2) {
4392 auto It = VFToOrderedEntries.find(VF);
4393 if (It == VFToOrderedEntries.end())
4394 continue;
4395 // Try to find the most profitable order. We just are looking for the most
4396 // used order and reorder scalar elements in the nodes according to this
4397 // mostly used order.
4398 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
4399 // All operands are reordered and used only in this node - propagate the
4400 // most used order to the user node.
4403 OrdersUses;
4404 // Last chance orders - scatter vectorize. Try to use their orders if no
4405 // other orders or the order is counted already.
4406 SmallVector<OrdersType> StridedVectorizeOrders;
4408 for (const TreeEntry *OpTE : OrderedEntries) {
4409 // No need to reorder this nodes, still need to extend and to use shuffle,
4410 // just need to merge reordering shuffle and the reuse shuffle.
4411 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
4412 continue;
4413 // Count number of orders uses.
4414 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
4415 &PhisToOrders]() -> const OrdersType & {
4416 if (OpTE->State == TreeEntry::NeedToGather ||
4417 !OpTE->ReuseShuffleIndices.empty()) {
4418 auto It = GathersToOrders.find(OpTE);
4419 if (It != GathersToOrders.end())
4420 return It->second;
4421 }
4422 if (OpTE->isAltShuffle()) {
4423 auto It = AltShufflesToOrders.find(OpTE);
4424 if (It != AltShufflesToOrders.end())
4425 return It->second;
4426 }
4427 if (OpTE->State == TreeEntry::Vectorize &&
4428 OpTE->getOpcode() == Instruction::PHI) {
4429 auto It = PhisToOrders.find(OpTE);
4430 if (It != PhisToOrders.end())
4431 return It->second;
4432 }
4433 return OpTE->ReorderIndices;
4434 }();
4435 // First consider the order of the external scalar users.
4436 auto It = ExternalUserReorderMap.find(OpTE);
4437 if (It != ExternalUserReorderMap.end()) {
4438 const auto &ExternalUserReorderIndices = It->second;
4439 // If the OpTE vector factor != number of scalars - use natural order,
4440 // it is an attempt to reorder node with reused scalars but with
4441 // external uses.
4442 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
4443 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
4444 ExternalUserReorderIndices.size();
4445 } else {
4446 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
4447 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
4448 }
4449 // No other useful reorder data in this entry.
4450 if (Order.empty())
4451 continue;
4452 }
4453 // Postpone scatter orders.
4454 if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
4455 StridedVectorizeOrders.push_back(Order);
4456 continue;
4457 }
4458 // Stores actually store the mask, not the order, need to invert.
4459 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4460 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4461 SmallVector<int> Mask;
4462 inversePermutation(Order, Mask);
4463 unsigned E = Order.size();
4464 OrdersType CurrentOrder(E, E);
4465 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
4466 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
4467 });
4468 fixupOrderingIndices(CurrentOrder);
4469 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
4470 } else {
4471 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4472 }
4473 }
4474 // Set order of the user node.
4475 if (OrdersUses.empty()) {
4476 if (StridedVectorizeOrders.empty())
4477 continue;
4478 // Add (potentially!) strided vectorize orders.
4479 for (OrdersType &Order : StridedVectorizeOrders)
4480 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4481 } else {
4482 // Account (potentially!) strided vectorize orders only if it was used
4483 // already.
4484 for (OrdersType &Order : StridedVectorizeOrders) {
4485 auto *It = OrdersUses.find(Order);
4486 if (It != OrdersUses.end())
4487 ++It->second;
4488 }
4489 }
4490 // Choose the most used order.
4491 ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
4492 unsigned Cnt = OrdersUses.front().second;
4493 for (const auto &Pair : drop_begin(OrdersUses)) {
4494 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
4495 BestOrder = Pair.first;
4496 Cnt = Pair.second;
4497 }
4498 }
4499 // Set order of the user node.
4500 if (BestOrder.empty())
4501 continue;
4502 SmallVector<int> Mask;
4503 inversePermutation(BestOrder, Mask);
4504 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
4505 unsigned E = BestOrder.size();
4506 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
4507 return I < E ? static_cast<int>(I) : PoisonMaskElem;
4508 });
4509 // Do an actual reordering, if profitable.
4510 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
4511 // Just do the reordering for the nodes with the given VF.
4512 if (TE->Scalars.size() != VF) {
4513 if (TE->ReuseShuffleIndices.size() == VF) {
4514 // Need to reorder the reuses masks of the operands with smaller VF to
4515 // be able to find the match between the graph nodes and scalar
4516 // operands of the given node during vectorization/cost estimation.
4517 assert(all_of(TE->UserTreeIndices,
4518 [VF, &TE](const EdgeInfo &EI) {
4519 return EI.UserTE->Scalars.size() == VF ||
4520 EI.UserTE->Scalars.size() ==
4521 TE->Scalars.size();
4522 }) &&
4523 "All users must be of VF size.");
4524 // Update ordering of the operands with the smaller VF than the given
4525 // one.
4526 reorderNodeWithReuses(*TE, Mask);
4527 }
4528 continue;
4529 }
4530 if ((TE->State == TreeEntry::Vectorize ||
4531 TE->State == TreeEntry::PossibleStridedVectorize) &&
4533 InsertElementInst>(TE->getMainOp()) &&
4534 !TE->isAltShuffle()) {
4535 // Build correct orders for extract{element,value}, loads and
4536 // stores.
4537 reorderOrder(TE->ReorderIndices, Mask);
4538 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
4539 TE->reorderOperands(Mask);
4540 } else {
4541 // Reorder the node and its operands.
4542 TE->reorderOperands(Mask);
4543 assert(TE->ReorderIndices.empty() &&
4544 "Expected empty reorder sequence.");
4545 reorderScalars(TE->Scalars, Mask);
4546 }
4547 if (!TE->ReuseShuffleIndices.empty()) {
4548 // Apply reversed order to keep the original ordering of the reused
4549 // elements to avoid extra reorder indices shuffling.
4550 OrdersType CurrentOrder;
4551 reorderOrder(CurrentOrder, MaskOrder);
4552 SmallVector<int> NewReuses;
4553 inversePermutation(CurrentOrder, NewReuses);
4554 addMask(NewReuses, TE->ReuseShuffleIndices);
4555 TE->ReuseShuffleIndices.swap(NewReuses);
4556 }
4557 }
4558 }
4559}
4560
4561bool BoUpSLP::canReorderOperands(
4562 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
4563 ArrayRef<TreeEntry *> ReorderableGathers,
4564 SmallVectorImpl<TreeEntry *> &GatherOps) {
4565 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
4566 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
4567 return OpData.first == I &&
4568 OpData.second->State == TreeEntry::Vectorize;
4569 }))
4570 continue;
4571 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
4572 // FIXME: Do not reorder (possible!) strided vectorized nodes, they
4573 // require reordering of the operands, which is not implemented yet.
4574 if (TE->State == TreeEntry::PossibleStridedVectorize)
4575 return false;
4576 // Do not reorder if operand node is used by many user nodes.
4577 if (any_of(TE->UserTreeIndices,
4578 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
4579 return false;
4580 // Add the node to the list of the ordered nodes with the identity
4581 // order.
4582 Edges.emplace_back(I, TE);
4583 // Add ScatterVectorize nodes to the list of operands, where just
4584 // reordering of the scalars is required. Similar to the gathers, so
4585 // simply add to the list of gathered ops.
4586 // If there are reused scalars, process this node as a regular vectorize
4587 // node, just reorder reuses mask.
4588 if (TE->State != TreeEntry::Vectorize &&
4589 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
4590 GatherOps.push_back(TE);
4591 continue;
4592 }
4593 TreeEntry *Gather = nullptr;
4594 if (count_if(ReorderableGathers,
4595 [&Gather, UserTE, I](TreeEntry *TE) {
4596 assert(TE->State != TreeEntry::Vectorize &&
4597 "Only non-vectorized nodes are expected.");
4598 if (any_of(TE->UserTreeIndices,
4599 [UserTE, I](const EdgeInfo &EI) {
4600 return EI.UserTE == UserTE && EI.EdgeIdx == I;
4601 })) {
4602 assert(TE->isSame(UserTE->getOperand(I)) &&
4603 "Operand entry does not match operands.");
4604 Gather = TE;
4605 return true;
4606 }
4607 return false;
4608 }) > 1 &&
4609 !allConstant(UserTE->getOperand(I)))
4610 return false;
4611 if (Gather)
4612 GatherOps.push_back(Gather);
4613 }
4614 return true;
4615}
4616
4617void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
4618 SetVector<TreeEntry *> OrderedEntries;
4620 // Find all reorderable leaf nodes with the given VF.
4621 // Currently the are vectorized loads,extracts without alternate operands +
4622 // some gathering of extracts.
4623 SmallVector<TreeEntry *> NonVectorized;
4624 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
4625 if (TE->State != TreeEntry::Vectorize &&
4626 TE->State != TreeEntry::PossibleStridedVectorize)
4627 NonVectorized.push_back(TE.get());
4628 if (std::optional<OrdersType> CurrentOrder =
4629 getReorderingData(*TE, /*TopToBottom=*/false)) {
4630 OrderedEntries.insert(TE.get());
4631 if (!(TE->State == TreeEntry::Vectorize ||
4632 TE->State == TreeEntry::PossibleStridedVectorize) ||
4633 !TE->ReuseShuffleIndices.empty())
4634 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4635 }
4636 }
4637
4638 // 1. Propagate order to the graph nodes, which use only reordered nodes.
4639 // I.e., if the node has operands, that are reordered, try to make at least
4640 // one operand order in the natural order and reorder others + reorder the
4641 // user node itself.
4643 while (!OrderedEntries.empty()) {
4644 // 1. Filter out only reordered nodes.
4645 // 2. If the entry has multiple uses - skip it and jump to the next node.
4647 SmallVector<TreeEntry *> Filtered;
4648 for (TreeEntry *TE : OrderedEntries) {
4649 if (!(TE->State == TreeEntry::Vectorize ||
4650 TE->State == TreeEntry::PossibleStridedVectorize ||
4651 (TE->State == TreeEntry::NeedToGather &&
4652 GathersToOrders.count(TE))) ||
4653 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4654 !all_of(drop_begin(TE->UserTreeIndices),
4655 [TE](const EdgeInfo &EI) {
4656 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
4657 }) ||
4658 !Visited.insert(TE).second) {
4659 Filtered.push_back(TE);
4660 continue;
4661 }
4662 // Build a map between user nodes and their operands order to speedup
4663 // search. The graph currently does not provide this dependency directly.
4664 for (EdgeInfo &EI : TE->UserTreeIndices) {
4665 TreeEntry *UserTE = EI.UserTE;
4666 auto It = Users.find(UserTE);
4667 if (It == Users.end())
4668 It = Users.insert({UserTE, {}}).first;
4669 It->second.emplace_back(EI.EdgeIdx, TE);
4670 }
4671 }
4672 // Erase filtered entries.
4673 for (TreeEntry *TE : Filtered)
4674 OrderedEntries.remove(TE);
4676 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
4677 UsersVec(Users.begin(), Users.end());
4678 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
4679 return Data1.first->Idx > Data2.first->Idx;
4680 });
4681 for (auto &Data : UsersVec) {
4682 // Check that operands are used only in the User node.
4683 SmallVector<TreeEntry *> GatherOps;
4684 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
4685 GatherOps)) {
4686 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
4687 OrderedEntries.remove(Op.second);
4688 continue;
4689 }
4690 // All operands are reordered and used only in this node - propagate the
4691 // most used order to the user node.
4694 OrdersUses;
4695 // Last chance orders - scatter vectorize. Try to use their orders if no
4696 // other orders or the order is counted already.
4697 SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
4698 // Do the analysis for each tree entry only once, otherwise the order of
4699 // the same node my be considered several times, though might be not
4700 // profitable.
4703 for (const auto &Op : Data.second) {
4704 TreeEntry *OpTE = Op.second;
4705 if (!VisitedOps.insert(OpTE).second)
4706 continue;
4707 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
4708 continue;
4709 const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
4710 if (OpTE->State == TreeEntry::NeedToGather ||
4711 !OpTE->ReuseShuffleIndices.empty())
4712 return GathersToOrders.find(OpTE)->second;
4713 return OpTE->ReorderIndices;
4714 }();
4715 unsigned NumOps = count_if(
4716 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
4717 return P.second == OpTE;
4718 });
4719 // Postpone scatter orders.
4720 if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
4721 StridedVectorizeOrders.emplace_back(Order, NumOps);
4722 continue;
4723 }
4724 // Stores actually store the mask, not the order, need to invert.
4725 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4726 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4727 SmallVector<int> Mask;
4728 inversePermutation(Order, Mask);
4729 unsigned E = Order.size();
4730 OrdersType CurrentOrder(E, E);
4731 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
4732 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
4733 });
4734 fixupOrderingIndices(CurrentOrder);
4735 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
4736 NumOps;
4737 } else {
4738 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
4739 }
4740 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
4741 const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
4742 const TreeEntry *TE) {
4743 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4744 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
4745 (IgnoreReorder && TE->Idx == 0))
4746 return true;
4747 if (TE->State == TreeEntry::NeedToGather) {
4748 auto It = GathersToOrders.find(TE);
4749 if (It != GathersToOrders.end())
4750 return !It->second.empty();
4751 return true;
4752 }
4753 return false;
4754 };
4755 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
4756 TreeEntry *UserTE = EI.UserTE;
4757 if (!VisitedUsers.insert(UserTE).second)
4758 continue;
4759 // May reorder user node if it requires reordering, has reused
4760 // scalars, is an alternate op vectorize node or its op nodes require
4761 // reordering.
4762 if (AllowsReordering(UserTE))
4763 continue;
4764 // Check if users allow reordering.
4765 // Currently look up just 1 level of operands to avoid increase of
4766 // the compile time.
4767 // Profitable to reorder if definitely more operands allow
4768 // reordering rather than those with natural order.
4770 if (static_cast<unsigned>(count_if(
4771 Ops, [UserTE, &AllowsReordering](
4772 const std::pair<unsigned, TreeEntry *> &Op) {
4773 return AllowsReordering(Op.second) &&
4774 all_of(Op.second->UserTreeIndices,
4775 [UserTE](const EdgeInfo &EI) {
4776 return EI.UserTE == UserTE;
4777 });
4778 })) <= Ops.size() / 2)
4779 ++Res.first->second;
4780 }
4781 }
4782 // If no orders - skip current nodes and jump to the next one, if any.
4783 if (OrdersUses.empty()) {
4784 if (StridedVectorizeOrders.empty() ||
4785 (Data.first->ReorderIndices.empty() &&
4786 Data.first->ReuseShuffleIndices.empty() &&
4787 !(IgnoreReorder &&
4788 Data.first == VectorizableTree.front().get()))) {
4789 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
4790 OrderedEntries.remove(Op.second);
4791 continue;
4792 }
4793 // Add (potentially!) strided vectorize orders.
4794 for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
4795 OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
4796 Pair.second;
4797 } else {
4798 // Account (potentially!) strided vectorize orders only if it was used
4799 // already.
4800 for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
4801 auto *It = OrdersUses.find(Pair.first);
4802 if (It != OrdersUses.end())
4803 It->second += Pair.second;
4804 }
4805 }
4806 // Choose the best order.
4807 ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
4808 unsigned Cnt = OrdersUses.front().second;
4809 for (const auto &Pair : drop_begin(OrdersUses)) {
4810 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
4811 BestOrder = Pair.first;
4812 Cnt = Pair.second;
4813 }
4814 }
4815 // Set order of the user node (reordering of operands and user nodes).
4816 if (BestOrder.empty()) {
4817 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
4818 OrderedEntries.remove(Op.second);
4819 continue;
4820 }
4821 // Erase operands from OrderedEntries list and adjust their orders.
4822 VisitedOps.clear();
4823 SmallVector<int> Mask;
4824 inversePermutation(BestOrder, Mask);
4825 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
4826 unsigned E = BestOrder.size();
4827 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
4828 return I < E ? static_cast<int>(I) : PoisonMaskElem;
4829 });
4830 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
4831 TreeEntry *TE = Op.second;
4832 OrderedEntries.remove(TE);
4833 if (!VisitedOps.insert(TE).second)
4834 continue;
4835 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
4836 reorderNodeWithReuses(*TE, Mask);
4837 continue;
4838 }
4839 // Gathers are processed separately.
4840 if (TE->State != TreeEntry::Vectorize &&
4841 TE->State != TreeEntry::PossibleStridedVectorize &&
4842 (TE->State != TreeEntry::ScatterVectorize ||
4843 TE->ReorderIndices.empty()))
4844 continue;
4845 assert((BestOrder.size() == TE->ReorderIndices.size() ||
4846 TE->ReorderIndices.empty()) &&
4847 "Non-matching sizes of user/operand entries.");
4848 reorderOrder(TE->ReorderIndices, Mask);
4849 if (IgnoreReorder && TE == VectorizableTree.front().get())
4850 IgnoreReorder = false;
4851 }
4852 // For gathers just need to reorder its scalars.
4853 for (TreeEntry *Gather : GatherOps) {
4854 assert(Gather->ReorderIndices.empty() &&
4855 "Unexpected reordering of gathers.");
4856 if (!Gather->ReuseShuffleIndices.empty()) {
4857 // Just reorder reuses indices.
4858 reorderReuses(Gather->ReuseShuffleIndices, Mask);
4859 continue;
4860 }
4861 reorderScalars(Gather->Scalars, Mask);
4862 OrderedEntries.remove(Gather);
4863 }
4864 // Reorder operands of the user node and set the ordering for the user
4865 // node itself.
4866 if (Data.first->State != TreeEntry::Vectorize ||
4867 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
4868 Data.first->getMainOp()) ||
4869 Data.first->isAltShuffle())
4870 Data.first->reorderOperands(Mask);
4871 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
4872 Data.first->isAltShuffle() ||
4873 Data.first->State == TreeEntry::PossibleStridedVectorize) {
4874 reorderScalars(Data.first->Scalars, Mask);
4875 reorderOrder(Data.first->ReorderIndices, MaskOrder);
4876 if (Data.first->ReuseShuffleIndices.empty() &&
4877 !Data.first->ReorderIndices.empty() &&
4878 !Data.first->isAltShuffle()) {
4879 // Insert user node to the list to try to sink reordering deeper in
4880 // the graph.
4881 OrderedEntries.insert(Data.first);
4882 }
4883 } else {
4884 reorderOrder(Data.first->ReorderIndices, Mask);
4885 }
4886 }
4887 }
4888 // If the reordering is unnecessary, just remove the reorder.
4889 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
4890 VectorizableTree.front()->ReuseShuffleIndices.empty())
4891 VectorizableTree.front()->ReorderIndices.clear();
4892}
4893
4895 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
4896 // Collect the values that we need to extract from the tree.
4897 for (auto &TEPtr : VectorizableTree) {
4898 TreeEntry *Entry = TEPtr.get();
4899
4900 // No need to handle users of gathered values.
4901 if (Entry->State == TreeEntry::NeedToGather)
4902 continue;
4903
4904 // For each lane:
4905 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
4906 Value *Scalar = Entry->Scalars[Lane];
4907 if (!isa<Instruction>(Scalar))
4908 continue;
4909 int FoundLane = Entry->findLaneForValue(Scalar);
4910
4911 // Check if the scalar is externally used as an extra arg.
4912 const auto *ExtI = ExternallyUsedValues.find(Scalar);
4913 if (ExtI != ExternallyUsedValues.end()) {
4914 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
4915 << Lane << " from " << *Scalar << ".\n");
4916 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
4917 }
4918 for (User *U : Scalar->users()) {
4919 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
4920
4921 Instruction *UserInst = dyn_cast<Instruction>(U);
4922 if (!UserInst)
4923 continue;
4924
4925 if (isDeleted(UserInst))
4926 continue;
4927
4928 // Skip in-tree scalars that become vectors
4929 if (TreeEntry *UseEntry = getTreeEntry(U)) {
4930 Value *UseScalar = UseEntry->Scalars[0];
4931 // Some in-tree scalars will remain as scalar in vectorized
4932 // instructions. If that is the case, the one in Lane 0 will
4933 // be used.
4934 if (UseScalar != U ||
4935 UseEntry->State == TreeEntry::ScatterVectorize ||
4936 UseEntry->State == TreeEntry::PossibleStridedVectorize ||
4937 !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
4938 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
4939 << ".\n");
4940 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
4941 continue;
4942 }
4943 }
4944
4945 // Ignore users in the user ignore list.
4946 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
4947 continue;
4948
4949 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
4950 << Lane << " from " << *Scalar << ".\n");
4951 ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
4952 }
4953 }
4954 }
4955}
4956
4958BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
4960 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
4961 Value *V = TE->Scalars[Lane];
4962 // To save compilation time we don't visit if we have too many users.
4963 static constexpr unsigned UsersLimit = 4;
4964 if (V->hasNUsesOrMore(UsersLimit))
4965 break;
4966
4967 // Collect stores per pointer object.
4968 for (User *U : V->users()) {
4969 auto *SI = dyn_cast<StoreInst>(U);
4970 if (SI == nullptr || !SI->isSimple() ||
4971 !isValidElementType(SI->getValueOperand()->getType()))
4972 continue;
4973 // Skip entry if already
4974 if (getTreeEntry(U))
4975 continue;
4976
4977 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
4978 auto &StoresVec = PtrToStoresMap[Ptr];
4979 // For now just keep one store per pointer object per lane.
4980 // TODO: Extend this to support multiple stores per pointer per lane
4981 if (StoresVec.size() > Lane)
4982 continue;
4983 // Skip if in different BBs.
4984 if (!StoresVec.empty() &&
4985 SI->getParent() != StoresVec.back()->getParent())
4986 continue;
4987 // Make sure that the stores are of the same type.
4988 if (!StoresVec.empty() &&
4989 SI->getValueOperand()->getType() !=
4990 StoresVec.back()->getValueOperand()->getType())
4991 continue;
4992 StoresVec.push_back(SI);
4993 }
4994 }
4995 return PtrToStoresMap;
4996}
4997
4998bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
4999 OrdersType &ReorderIndices) const {
5000 // We check whether the stores in StoreVec can form a vector by sorting them
5001 // and checking whether they are consecutive.
5002
5003 // To avoid calling getPointersDiff() while sorting we create a vector of
5004 // pairs {store, offset from first} and sort this instead.
5005 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5006 StoreInst *S0 = StoresVec[0];
5007 StoreOffsetVec[0] = {S0, 0};
5008 Type *S0Ty = S0->getValueOperand()->getType();
5009 Value *S0Ptr = S0->getPointerOperand();
5010 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5011 StoreInst *SI = StoresVec[Idx];
5012 std::optional<int> Diff =
5013 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5014 SI->getPointerOperand(), *DL, *SE,
5015 /*StrictCheck=*/true);
5016 // We failed to compare the pointers so just abandon this StoresVec.
5017 if (!Diff)
5018 return false;
5019 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5020 }
5021
5022 // Sort the vector based on the pointers. We create a copy because we may
5023 // need the original later for calculating the reorder (shuffle) indices.
5024 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5025 const std::pair<StoreInst *, int> &Pair2) {
5026 int Offset1 = Pair1.second;
5027 int Offset2 = Pair2.second;
5028 return Offset1 < Offset2;
5029 });
5030
5031 // Check if the stores are consecutive by checking if their difference is 1.
5032 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5033 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5034 return false;
5035
5036 // Calculate the shuffle indices according to their offset against the sorted
5037 // StoreOffsetVec.
5038 ReorderIndices.reserve(StoresVec.size());
5039 for (StoreInst *SI : StoresVec) {
5040 unsigned Idx = find_if(StoreOffsetVec,
5041 [SI](const std::pair<StoreInst *, int> &Pair) {
5042 return Pair.first == SI;
5043 }) -
5044 StoreOffsetVec.begin();
5045 ReorderIndices.push_back(Idx);
5046 }
5047 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
5048 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
5049 // same convention here.
5050 auto IsIdentityOrder = [](const OrdersType &Order) {
5051