1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<int>
117 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
118 cl::desc("Only vectorize if you gain more than this "
119 "number "));
120
121static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
122 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
123 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
124 "heuristics and makes vectorization decision via cost modeling."));
125
126static cl::opt<bool>
127ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
128 cl::desc("Attempt to vectorize horizontal reductions"));
129
130static cl::opt<bool> ShouldStartVectorizeHorAtStore(
131 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
132 cl::desc(
133 "Attempt to vectorize horizontal reductions feeding into a store"));
134
135// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
136// even if we match a reduction but do not vectorize in the end.
137static cl::opt<bool> AllowHorRdxIdenityOptimization(
138 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
139 cl::desc("Allow optimization of original scalar identity operations on "
140 "matched horizontal reductions."));
141
142static cl::opt<int>
143 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
144 cl::desc("Attempt to vectorize for this register size in bits"));
145
146static cl::opt<unsigned>
147 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
148 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
149
150/// Limits the size of scheduling regions in a block.
151/// It avoids long compile times for _very_ large blocks where vector
152/// instructions are spread over a wide range.
153/// This limit is way higher than needed by real-world functions.
154static cl::opt<int>
155ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
156 cl::desc("Limit the size of the SLP scheduling region per block"));
157
158static cl::opt<unsigned> MinVectorRegSizeOption(
159 "slp-min-reg-size", cl::init(128), cl::Hidden,
160 cl::desc("Attempt to vectorize for this register size in bits"));
161
162static cl::opt<unsigned> RecursionMaxDepth(
163 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
164 cl::desc("Limit the recursion depth when building a vectorizable tree"));
165
166static cl::opt<unsigned> MinTreeSize(
167 "slp-min-tree-size", cl::init(3), cl::Hidden,
168 cl::desc("Only vectorize small trees if they are fully vectorizable"));
169
170// The maximum depth that the look-ahead score heuristic will explore.
171// The higher this value, the higher the compilation time overhead.
172static cl::opt<int> LookAheadMaxDepth(
173 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
174 cl::desc("The maximum look-ahead depth for operand reordering scores"));
175
176// The maximum depth that the look-ahead score heuristic will explore
177// when it is probing among candidates for vectorization tree roots.
178// The higher this value, the higher the compilation time overhead, but unlike
179// the similar limit for operand reordering this is used less frequently, so
180// the impact of a higher value is less noticeable.
181static cl::opt<int> RootLookAheadMaxDepth(
182 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
183 cl::desc("The maximum look-ahead depth for searching best rooting option"));
184
185static cl::opt<unsigned> MinProfitableStridedLoads(
186 "slp-min-strided-loads", cl::init(2), cl::Hidden,
187 cl::desc("The minimum number of loads, which should be considered strided, "
188 "if the stride is > 1 or is a runtime value"));
189
190static cl::opt<unsigned> MaxProfitableLoadStride(
191 "slp-max-stride", cl::init(8), cl::Hidden,
192 cl::desc("The maximum stride, considered to be profitable."));
193
194static cl::opt<bool>
195 ViewSLPTree("view-slp-tree", cl::Hidden,
196 cl::desc("Display the SLP trees with Graphviz"));
197
198static cl::opt<bool> VectorizeNonPowerOf2(
199 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
200 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
201
202// Limit the number of alias checks. The limit is chosen so that
203// it has no negative effect on the llvm benchmarks.
204static const unsigned AliasedCheckLimit = 10;
205
206// Limit of the number of uses for potentially transformed instructions/values,
207// used in checks to avoid compile-time explosion.
208static constexpr int UsesLimit = 8;
209
210// Another limit for the alias checks: The maximum distance between load/store
211// instructions where alias checks are done.
212// This limit is useful for very large basic blocks.
213static const unsigned MaxMemDepDistance = 160;
214
215/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
216/// regions to be handled.
217static const int MinScheduleRegionSize = 16;
218
219/// Predicate for the element types that the SLP vectorizer supports.
220///
221/// The most important thing to filter here are types which are invalid in LLVM
222/// vectors. We also filter target specific types which have absolutely no
223/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
224/// avoids spending time checking the cost model and realizing that they will
225/// be inevitably scalarized.
226static bool isValidElementType(Type *Ty) {
227 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
228 !Ty->isPPC_FP128Ty();
229}
230
231/// \returns True if the value is a constant (but not globals/constant
232/// expressions).
233static bool isConstant(Value *V) {
234 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
235}
236
237/// Checks if \p V is one of the vector-like instructions, i.e. undef, an
238/// insertelement/extractelement with constant indices for a fixed vector type,
239/// or an extractvalue instruction.
240static bool isVectorLikeInstWithConstOps(Value *V) {
241 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
242 !isa<ExtractValueInst, UndefValue>(V))
243 return false;
244 auto *I = dyn_cast<Instruction>(V);
245 if (!I || isa<ExtractValueInst>(I))
246 return true;
247 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
248 return false;
249 if (isa<ExtractElementInst>(I))
250 return isConstant(I->getOperand(1));
251 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
252 return isConstant(I->getOperand(2));
253}
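// For example, `insertelement <4 x i32> %v, i32 %x, i32 1` qualifies (its
// index operand is a constant), while `extractelement <4 x i32> %v, i32 %n`
// with a non-constant index %n does not. Plain undef values and any
// extractvalue instruction qualify unconditionally.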
254
255#if !defined(NDEBUG)
256/// Print a short descriptor of the instruction bundle suitable for debug output.
257static std::string shortBundleName(ArrayRef<Value *> VL) {
258 std::string Result;
259 raw_string_ostream OS(Result);
260 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
261 OS.flush();
262 return Result;
263}
264#endif
265
266/// \returns true if all of the instructions in \p VL are in the same block or
267/// false otherwise.
268static bool allSameBlock(ArrayRef<Value *> VL) {
269 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
270 if (!I0)
271 return false;
272 if (all_of(VL, isVectorLikeInstWithConstOps))
273 return true;
274
275 BasicBlock *BB = I0->getParent();
276 for (int I = 1, E = VL.size(); I < E; I++) {
277 auto *II = dyn_cast<Instruction>(VL[I]);
278 if (!II)
279 return false;
280
281 if (BB != II->getParent())
282 return false;
283 }
284 return true;
285}
286
287/// \returns True if all of the values in \p VL are constants (but not
288/// globals/constant expressions).
289static bool allConstant(ArrayRef<Value *> VL) {
290 // Constant expressions and globals can't be vectorized like normal integer/FP
291 // constants.
292 return all_of(VL, isConstant);
293}
294
295/// \returns True if all of the values in \p VL are identical or some of them
296/// are UndefValue.
297static bool isSplat(ArrayRef<Value *> VL) {
298 Value *FirstNonUndef = nullptr;
299 for (Value *V : VL) {
300 if (isa<UndefValue>(V))
301 continue;
302 if (!FirstNonUndef) {
303 FirstNonUndef = V;
304 continue;
305 }
306 if (V != FirstNonUndef)
307 return false;
308 }
309 return FirstNonUndef != nullptr;
310}
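// For example, {%a, undef, %a, %a} is treated as a splat of %a, while a list
// containing only undefs is not (the function requires at least one non-undef
// value so that there is something to broadcast).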
311
312/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
313static bool isCommutative(Instruction *I) {
314 if (auto *Cmp = dyn_cast<CmpInst>(I))
315 return Cmp->isCommutative();
316 if (auto *BO = dyn_cast<BinaryOperator>(I))
317 return BO->isCommutative() ||
318 (BO->getOpcode() == Instruction::Sub &&
319 !BO->hasNUsesOrMore(UsesLimit) &&
320 all_of(
321 BO->uses(),
322 [](const Use &U) {
323 // Commutative, if icmp eq/ne sub, 0
324 ICmpInst::Predicate Pred;
325 if (match(U.getUser(),
326 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
327 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
328 return true;
329 // Commutative, if abs(sub nsw, true) or abs(sub, false).
330 ConstantInt *Flag;
331 return match(U.getUser(),
332 m_Intrinsic<Intrinsic::abs>(
333 m_Specific(U.get()), m_ConstantInt(Flag))) &&
334 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
335 Flag->isOne());
336 })) ||
337 (BO->getOpcode() == Instruction::FSub &&
338 !BO->hasNUsesOrMore(UsesLimit) &&
339 all_of(BO->uses(), [](const Use &U) {
340 return match(U.getUser(),
341 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
342 }));
343 return I->isCommutative();
344}
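// For example, `%d = sub i32 %a, %b` is reported as commutative when its only
// users are comparisons against zero such as `icmp eq i32 %d, 0`, because
// (a - b) == 0 is equivalent to (b - a) == 0. The same applies to a sub whose
// only users are llvm.abs calls (subject to the nsw / int-min-poison checks
// above) and to an fsub used only by llvm.fabs.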
345
346/// \returns inserting index of InsertElement or InsertValue instruction,
347/// using Offset as base offset for index.
348static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
349 unsigned Offset = 0) {
350 int Index = Offset;
351 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
352 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
353 if (!VT)
354 return std::nullopt;
355 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
356 if (!CI)
357 return std::nullopt;
358 if (CI->getValue().uge(VT->getNumElements()))
359 return std::nullopt;
360 Index *= VT->getNumElements();
361 Index += CI->getZExtValue();
362 return Index;
363 }
364
365 const auto *IV = cast<InsertValueInst>(InsertInst);
366 Type *CurrentType = IV->getType();
367 for (unsigned I : IV->indices()) {
368 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
369 Index *= ST->getNumElements();
370 CurrentType = ST->getElementType(I);
371 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
372 Index *= AT->getNumElements();
373 CurrentType = AT->getElementType();
374 } else {
375 return std::nullopt;
376 }
377 Index += I;
378 }
379 return Index;
380}
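// Worked example: for `insertvalue {[2 x i32], [2 x i32]} %agg, i32 %x, 1, 0`
// the loop above computes Index = (0 * 2 + 1) for the outer struct index and
// then (1 * 2 + 0) for the inner array index, i.e. a flattened insertion
// position of 2 out of the 4 scalar slots.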
381
382namespace {
383/// Specifies the way the mask should be analyzed for undefs/poisonous elements
384/// in the shuffle mask.
385enum class UseMask {
386 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
387 ///< check for the mask elements for the first argument (mask
388 ///< indices are in range [0:VF)).
389 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
390 ///< for the mask elements for the second argument (mask indices
391 ///< are in range [VF:2*VF))
392 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
393 ///< future shuffle elements and mark them as ones as being used
394 ///< in future. Non-undef elements are considered as unused since
395 ///< they're already marked as used in the mask.
396};
397} // namespace
398
399/// Prepares a use bitset for the given mask either for the first argument or
400/// for the second.
401static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
402 UseMask MaskArg) {
403 SmallBitVector UseMask(VF, true);
404 for (auto [Idx, Value] : enumerate(Mask)) {
405 if (Value == PoisonMaskElem) {
406 if (MaskArg == UseMask::UndefsAsMask)
407 UseMask.reset(Idx);
408 continue;
409 }
410 if (MaskArg == UseMask::FirstArg && Value < VF)
411 UseMask.reset(Value);
412 else if (MaskArg == UseMask::SecondArg && Value >= VF)
413 UseMask.reset(Value - VF);
414 }
415 return UseMask;
416}
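// Worked example: with VF = 4, Mask = {0, 5, 1, poison} and UseMask::FirstArg,
// the returned bitset starts as all-ones and gets bits 0 and 1 cleared (lanes
// of the first argument referenced by the mask); index 5 belongs to the second
// argument and the poison element is skipped, so bits 2 and 3 stay set.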
417
418/// Checks if the given value is actually an undefined constant vector.
419/// Also, if the \p UseMask is not empty, tries to check if the non-masked
420/// elements actually mask the insertelement buildvector, if any.
421template <bool IsPoisonOnly = false>
422static SmallBitVector isUndefVector(const Value *V,
423 const SmallBitVector &UseMask = {}) {
424 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
425 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
426 if (isa<T>(V))
427 return Res;
428 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
429 if (!VecTy)
430 return Res.reset();
431 auto *C = dyn_cast<Constant>(V);
432 if (!C) {
433 if (!UseMask.empty()) {
434 const Value *Base = V;
435 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
436 Base = II->getOperand(0);
437 if (isa<T>(II->getOperand(1)))
438 continue;
439 std::optional<unsigned> Idx = getInsertIndex(II);
440 if (!Idx) {
441 Res.reset();
442 return Res;
443 }
444 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
445 Res.reset(*Idx);
446 }
447 // TODO: Add analysis for shuffles here too.
448 if (V == Base) {
449 Res.reset();
450 } else {
451 SmallBitVector SubMask(UseMask.size(), false);
452 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
453 }
454 } else {
455 Res.reset();
456 }
457 return Res;
458 }
459 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
460 if (Constant *Elem = C->getAggregateElement(I))
461 if (!isa<T>(Elem) &&
462 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
463 Res.reset(I);
464 }
465 return Res;
466}
467
468/// Checks if the vector of instructions can be represented as a shuffle, like:
469/// %x0 = extractelement <4 x i8> %x, i32 0
470/// %x3 = extractelement <4 x i8> %x, i32 3
471/// %y1 = extractelement <4 x i8> %y, i32 1
472/// %y2 = extractelement <4 x i8> %y, i32 2
473/// %x0x0 = mul i8 %x0, %x0
474/// %x3x3 = mul i8 %x3, %x3
475/// %y1y1 = mul i8 %y1, %y1
476/// %y2y2 = mul i8 %y2, %y2
477/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
478/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
479/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
480/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
481/// ret <4 x i8> %ins4
482/// can be transformed into:
483/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
484/// i32 6>
485/// %2 = mul <4 x i8> %1, %1
486/// ret <4 x i8> %2
487/// Mask will return the Shuffle Mask equivalent to the extracted elements.
488/// TODO: Can we split off and reuse the shuffle mask detection from
489/// ShuffleVectorInst/getShuffleCost?
490static std::optional<TargetTransformInfo::ShuffleKind>
491isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
492 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
493 if (It == VL.end())
494 return std::nullopt;
495 auto *EI0 = cast<ExtractElementInst>(*It);
496 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
497 return std::nullopt;
498 unsigned Size =
499 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
500 Value *Vec1 = nullptr;
501 Value *Vec2 = nullptr;
502 enum ShuffleMode { Unknown, Select, Permute };
503 ShuffleMode CommonShuffleMode = Unknown;
504 Mask.assign(VL.size(), PoisonMaskElem);
505 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
506 // Undef can be represented as an undef element in a vector.
507 if (isa<UndefValue>(VL[I]))
508 continue;
509 auto *EI = cast<ExtractElementInst>(VL[I]);
510 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
511 return std::nullopt;
512 auto *Vec = EI->getVectorOperand();
513 // We can extractelement from undef or poison vector.
514 if (isUndefVector(Vec).all())
515 continue;
516 // All vector operands must have the same number of vector elements.
517 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
518 return std::nullopt;
519 if (isa<UndefValue>(EI->getIndexOperand()))
520 continue;
521 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
522 if (!Idx)
523 return std::nullopt;
524 // Undefined behavior if Idx is negative or >= Size.
525 if (Idx->getValue().uge(Size))
526 continue;
527 unsigned IntIdx = Idx->getValue().getZExtValue();
528 Mask[I] = IntIdx;
529 // For correct shuffling we have to have at most 2 different vector operands
530 // in all extractelement instructions.
531 if (!Vec1 || Vec1 == Vec) {
532 Vec1 = Vec;
533 } else if (!Vec2 || Vec2 == Vec) {
534 Vec2 = Vec;
535 Mask[I] += Size;
536 } else {
537 return std::nullopt;
538 }
539 if (CommonShuffleMode == Permute)
540 continue;
541 // If the extract index is not the same as the operation number, it is a
542 // permutation.
543 if (IntIdx != I) {
544 CommonShuffleMode = Permute;
545 continue;
546 }
547 CommonShuffleMode = Select;
548 }
549 // If we're not crossing lanes in different vectors, consider it as blending.
550 if (CommonShuffleMode == Select && Vec2)
551 return TargetTransformInfo::SK_Select;
552 // If Vec2 was never used, we have a permutation of a single vector; otherwise
553 // we have a permutation of 2 vectors.
554 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
555 : TargetTransformInfo::SK_PermuteSingleSrc;
556}
557
558/// \returns True if Extract{Value,Element} instruction extracts element Idx.
559static std::optional<unsigned> getExtractIndex(Instruction *E) {
560 unsigned Opcode = E->getOpcode();
561 assert((Opcode == Instruction::ExtractElement ||
562 Opcode == Instruction::ExtractValue) &&
563 "Expected extractelement or extractvalue instruction.");
564 if (Opcode == Instruction::ExtractElement) {
565 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
566 if (!CI)
567 return std::nullopt;
568 return CI->getZExtValue();
569 }
570 auto *EI = cast<ExtractValueInst>(E);
571 if (EI->getNumIndices() != 1)
572 return std::nullopt;
573 return *EI->idx_begin();
574}
575
576namespace {
577
578/// Main data required for vectorization of instructions.
579struct InstructionsState {
580 /// The very first instruction in the list with the main opcode.
581 Value *OpValue = nullptr;
582
583 /// The main/alternate instruction.
584 Instruction *MainOp = nullptr;
585 Instruction *AltOp = nullptr;
586
587 /// The main/alternate opcodes for the list of instructions.
588 unsigned getOpcode() const {
589 return MainOp ? MainOp->getOpcode() : 0;
590 }
591
592 unsigned getAltOpcode() const {
593 return AltOp ? AltOp->getOpcode() : 0;
594 }
595
596 /// Some of the instructions in the list have alternate opcodes.
597 bool isAltShuffle() const { return AltOp != MainOp; }
598
599 bool isOpcodeOrAlt(Instruction *I) const {
600 unsigned CheckedOpcode = I->getOpcode();
601 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
602 }
603
604 InstructionsState() = delete;
605 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
606 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
607};
608
609} // end anonymous namespace
610
611/// Chooses the correct key for scheduling data. If \p Op has the same (or
612/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
613/// OpValue.
614static Value *isOneOf(const InstructionsState &S, Value *Op) {
615 auto *I = dyn_cast<Instruction>(Op);
616 if (I && S.isOpcodeOrAlt(I))
617 return Op;
618 return S.OpValue;
619}
620
621/// \returns true if \p Opcode is allowed as part of the main/alternate
622/// instruction for SLP vectorization.
623///
624/// Example of unsupported opcode is SDIV that can potentially cause UB if the
625/// "shuffled out" lane would result in division by zero.
626static bool isValidForAlternation(unsigned Opcode) {
627 if (Instruction::isIntDivRem(Opcode))
628 return false;
629
630 return true;
631}
632
633static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
634 const TargetLibraryInfo &TLI,
635 unsigned BaseIndex = 0);
636
637/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
638/// compatible instructions or constants, or just some other regular values.
639static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
640 Value *Op1, const TargetLibraryInfo &TLI) {
641 return (isConstant(BaseOp0) && isConstant(Op0)) ||
642 (isConstant(BaseOp1) && isConstant(Op1)) ||
643 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
644 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
645 BaseOp0 == Op0 || BaseOp1 == Op1 ||
646 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
647 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
648}
649
650/// \returns true if a compare instruction \p CI has similar "look" and
651/// same predicate as \p BaseCI, "as is" or with its operands and predicate
652/// swapped, false otherwise.
653static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
654 const TargetLibraryInfo &TLI) {
655 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
656 "Assessing comparisons of different types?");
657 CmpInst::Predicate BasePred = BaseCI->getPredicate();
658 CmpInst::Predicate Pred = CI->getPredicate();
659 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
660
661 Value *BaseOp0 = BaseCI->getOperand(0);
662 Value *BaseOp1 = BaseCI->getOperand(1);
663 Value *Op0 = CI->getOperand(0);
664 Value *Op1 = CI->getOperand(1);
665
666 return (BasePred == Pred &&
667 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
668 (BasePred == SwappedPred &&
669 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
670}
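// For example, `icmp slt i32 %a, %b` and `icmp sgt i32 %b, %a` are recognized
// as the same comparison: the second one matches the swapped predicate of the
// first with its operands swapped as well.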
671
672/// \returns analysis of the Instructions in \p VL described in
673/// InstructionsState, the Opcode that we suppose the whole list
674/// could be vectorized even if its structure is diverse.
675static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
676 const TargetLibraryInfo &TLI,
677 unsigned BaseIndex) {
678 // Make sure these are all Instructions.
679 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
680 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
681
682 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
683 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
684 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
685 CmpInst::Predicate BasePred =
686 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
687 : CmpInst::BAD_ICMP_PREDICATE;
688 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
689 unsigned AltOpcode = Opcode;
690 unsigned AltIndex = BaseIndex;
691
692 bool SwappedPredsCompatible = [&]() {
693 if (!IsCmpOp)
694 return false;
695 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
696 UniquePreds.insert(BasePred);
697 UniqueNonSwappedPreds.insert(BasePred);
698 for (Value *V : VL) {
699 auto *I = dyn_cast<CmpInst>(V);
700 if (!I)
701 return false;
702 CmpInst::Predicate CurrentPred = I->getPredicate();
703 CmpInst::Predicate SwappedCurrentPred =
704 CmpInst::getSwappedPredicate(CurrentPred);
705 UniqueNonSwappedPreds.insert(CurrentPred);
706 if (!UniquePreds.contains(CurrentPred) &&
707 !UniquePreds.contains(SwappedCurrentPred))
708 UniquePreds.insert(CurrentPred);
709 }
710 // Total number of predicates > 2, but if consider swapped predicates
711 // compatible only 2, consider swappable predicates as compatible opcodes,
712 // not alternate.
713 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
714 }();
715 // Check for one alternate opcode from another BinaryOperator.
716 // TODO - generalize to support all operators (types, calls etc.).
717 auto *IBase = cast<Instruction>(VL[BaseIndex]);
718 Intrinsic::ID BaseID = 0;
719 SmallVector<VFInfo> BaseMappings;
720 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
721 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
722 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
723 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
724 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
725 }
726 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
727 auto *I = cast<Instruction>(VL[Cnt]);
728 unsigned InstOpcode = I->getOpcode();
729 if (IsBinOp && isa<BinaryOperator>(I)) {
730 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
731 continue;
732 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
733 isValidForAlternation(Opcode)) {
734 AltOpcode = InstOpcode;
735 AltIndex = Cnt;
736 continue;
737 }
738 } else if (IsCastOp && isa<CastInst>(I)) {
739 Value *Op0 = IBase->getOperand(0);
740 Type *Ty0 = Op0->getType();
741 Value *Op1 = I->getOperand(0);
742 Type *Ty1 = Op1->getType();
743 if (Ty0 == Ty1) {
744 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
745 continue;
746 if (Opcode == AltOpcode) {
747 assert(isValidForAlternation(Opcode) &&
748 isValidForAlternation(InstOpcode) &&
749 "Cast isn't safe for alternation, logic needs to be updated!");
750 AltOpcode = InstOpcode;
751 AltIndex = Cnt;
752 continue;
753 }
754 }
755 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
756 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
757 Type *Ty0 = BaseInst->getOperand(0)->getType();
758 Type *Ty1 = Inst->getOperand(0)->getType();
759 if (Ty0 == Ty1) {
760 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
761 // Check for compatible operands. If the corresponding operands are not
762 // compatible - need to perform alternate vectorization.
763 CmpInst::Predicate CurrentPred = Inst->getPredicate();
764 CmpInst::Predicate SwappedCurrentPred =
765 CmpInst::getSwappedPredicate(CurrentPred);
766
767 if ((E == 2 || SwappedPredsCompatible) &&
768 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
769 continue;
770
771 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
772 continue;
773 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
774 if (AltIndex != BaseIndex) {
775 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
776 continue;
777 } else if (BasePred != CurrentPred) {
778 assert(
779 isValidForAlternation(InstOpcode) &&
780 "CmpInst isn't safe for alternation, logic needs to be updated!");
781 AltIndex = Cnt;
782 continue;
783 }
784 CmpInst::Predicate AltPred = AltInst->getPredicate();
785 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
786 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
787 continue;
788 }
789 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
790 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
791 if (Gep->getNumOperands() != 2 ||
792 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
793 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
794 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
795 if (!isVectorLikeInstWithConstOps(EI))
796 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
797 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
798 auto *BaseLI = cast<LoadInst>(IBase);
799 if (!LI->isSimple() || !BaseLI->isSimple())
800 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
801 } else if (auto *Call = dyn_cast<CallInst>(I)) {
802 auto *CallBase = cast<CallInst>(IBase);
803 if (Call->getCalledFunction() != CallBase->getCalledFunction())
804 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
805 if (Call->hasOperandBundles() &&
806 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
807 Call->op_begin() + Call->getBundleOperandsEndIndex(),
808 CallBase->op_begin() +
809 CallBase->getBundleOperandsStartIndex()))
810 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
811 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
812 if (ID != BaseID)
813 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
814 if (!ID) {
815 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
816 if (Mappings.size() != BaseMappings.size() ||
817 Mappings.front().ISA != BaseMappings.front().ISA ||
818 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
819 Mappings.front().VectorName != BaseMappings.front().VectorName ||
820 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
821 Mappings.front().Shape.Parameters !=
822 BaseMappings.front().Shape.Parameters)
823 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
824 }
825 }
826 continue;
827 }
828 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
829 }
830
831 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
832 cast<Instruction>(VL[AltIndex]));
833}
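// For example, for VL = {add, sub, add, sub} over the same type this returns
// MainOp = the first add and AltOp = the first sub (an alternate-opcode bundle
// that can later be emitted as add/sub plus a blending shuffle), while for
// VL = {add, load} it returns a state with a null MainOp, meaning the list
// cannot be vectorized as a single (or alternate) opcode.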
834
835/// \returns true if all of the values in \p VL have the same type or false
836/// otherwise.
837static bool allSameType(ArrayRef<Value *> VL) {
838 Type *Ty = VL.front()->getType();
839 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
840}
841
842/// \returns True if in-tree use also needs extract. This refers to
843/// possible scalar operand in vectorized instruction.
844static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
845 TargetLibraryInfo *TLI) {
846 unsigned Opcode = UserInst->getOpcode();
847 switch (Opcode) {
848 case Instruction::Load: {
849 LoadInst *LI = cast<LoadInst>(UserInst);
850 return (LI->getPointerOperand() == Scalar);
851 }
852 case Instruction::Store: {
853 StoreInst *SI = cast<StoreInst>(UserInst);
854 return (SI->getPointerOperand() == Scalar);
855 }
856 case Instruction::Call: {
857 CallInst *CI = cast<CallInst>(UserInst);
858 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
859 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
860 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
861 Arg.value().get() == Scalar;
862 });
863 }
864 default:
865 return false;
866 }
867}
868
869/// \returns the AA location that is being accessed by the instruction.
870static MemoryLocation getLocation(Instruction *I) {
871 if (StoreInst *SI = dyn_cast<StoreInst>(I))
872 return MemoryLocation::get(SI);
873 if (LoadInst *LI = dyn_cast<LoadInst>(I))
874 return MemoryLocation::get(LI);
875 return MemoryLocation();
876}
877
878/// \returns True if the instruction is not a volatile or atomic load/store.
879static bool isSimple(Instruction *I) {
880 if (LoadInst *LI = dyn_cast<LoadInst>(I))
881 return LI->isSimple();
882 if (StoreInst *SI = dyn_cast<StoreInst>(I))
883 return SI->isSimple();
884 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
885 return !MI->isVolatile();
886 return true;
887}
888
889/// Shuffles \p Mask in accordance with the given \p SubMask.
890/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
891/// one but two input vectors.
892static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
893 bool ExtendingManyInputs = false) {
894 if (SubMask.empty())
895 return;
896 assert(
897 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
898 // Check if input scalars were extended to match the size of other node.
899 (SubMask.size() == Mask.size() &&
900 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
901 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
902 "SubMask with many inputs support must be larger than the mask.");
903 if (Mask.empty()) {
904 Mask.append(SubMask.begin(), SubMask.end());
905 return;
906 }
907 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
908 int TermValue = std::min(Mask.size(), SubMask.size());
909 for (int I = 0, E = SubMask.size(); I < E; ++I) {
910 if (SubMask[I] == PoisonMaskElem ||
911 (!ExtendingManyInputs &&
912 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
913 continue;
914 NewMask[I] = Mask[SubMask[I]];
915 }
916 Mask.swap(NewMask);
917}
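// Worked example: Mask = {1, 0, 3, 2} followed by SubMask = {2, 2, 0, 1}
// composes to {Mask[2], Mask[2], Mask[0], Mask[1]} = {3, 3, 1, 0}, i.e. the
// SubMask selects elements of the already-shuffled result, so the combined
// mask indexes the original vector directly.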
918
919/// Order may have elements assigned the special value (size) which is out of
920/// bounds. Such indices only appear at places which correspond to undef values
921/// (see canReuseExtract for details) and are used in order to keep undef values
922/// from affecting the operands ordering.
923/// The first loop below simply finds all unused indices and then the next loop
924/// nest assigns these indices for undef values positions.
925/// As an example, below Order has two undef positions and they are assigned
926/// values 3 and 7 respectively:
927/// before: 6 9 5 4 9 2 1 0
928/// after: 6 3 5 4 7 2 1 0
929static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
930 const unsigned Sz = Order.size();
931 SmallBitVector UnusedIndices(Sz, /*t=*/true);
932 SmallBitVector MaskedIndices(Sz);
933 for (unsigned I = 0; I < Sz; ++I) {
934 if (Order[I] < Sz)
935 UnusedIndices.reset(Order[I]);
936 else
937 MaskedIndices.set(I);
938 }
939 if (MaskedIndices.none())
940 return;
941 assert(UnusedIndices.count() == MaskedIndices.count() &&
942 "Non-synced masked/available indices.");
943 int Idx = UnusedIndices.find_first();
944 int MIdx = MaskedIndices.find_first();
945 while (MIdx >= 0) {
946 assert(Idx >= 0 && "Indices must be synced.");
947 Order[MIdx] = Idx;
948 Idx = UnusedIndices.find_next(Idx);
949 MIdx = MaskedIndices.find_next(MIdx);
950 }
951}
952
953namespace llvm {
954
955static void inversePermutation(ArrayRef<unsigned> Indices,
956 SmallVectorImpl<int> &Mask) {
957 Mask.clear();
958 const unsigned E = Indices.size();
959 Mask.resize(E, PoisonMaskElem);
960 for (unsigned I = 0; I < E; ++I)
961 Mask[Indices[I]] = I;
962}
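// Worked example: Indices = {2, 0, 1} yields Mask = {1, 2, 0}: the loop sets
// Mask[Indices[I]] = I, i.e. Mask[2] = 0, Mask[0] = 1 and Mask[1] = 2, which
// is the inverse of the permutation described by Indices.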
963
964/// Reorders the list of scalars in accordance with the given \p Mask.
965static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
966 ArrayRef<int> Mask) {
967 assert(!Mask.empty() && "Expected non-empty mask.");
968 SmallVector<Value *> Prev(Scalars.size(),
969 UndefValue::get(Scalars.front()->getType()));
970 Prev.swap(Scalars);
971 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
972 if (Mask[I] != PoisonMaskElem)
973 Scalars[Mask[I]] = Prev[I];
974}
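// Worked example: Scalars = {a, b, c} with Mask = {2, 0, 1} becomes {b, c, a}:
// each original element Prev[I] is moved to position Mask[I], and positions
// whose mask element is poison keep the freshly created undef placeholder.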
975
976/// Checks if the provided value does not require scheduling. It does not
977/// require scheduling if this is not an instruction or it is an instruction
978/// that does not read/write memory and all of its operands are either not
979/// instructions, or phi nodes, or instructions from different blocks.
980static bool areAllOperandsNonInsts(Value *V) {
981 auto *I = dyn_cast<Instruction>(V);
982 if (!I)
983 return true;
984 return !mayHaveNonDefUseDependency(*I) &&
985 all_of(I->operands(), [I](Value *V) {
986 auto *IO = dyn_cast<Instruction>(V);
987 if (!IO)
988 return true;
989 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
990 });
991}
992
993/// Checks if the provided value does not require scheduling. It does not
994/// require scheduling if this is not an instruction or it is an instruction
995/// that does not read/write memory and all users are phi nodes or instructions
996/// from different blocks.
997static bool isUsedOutsideBlock(Value *V) {
998 auto *I = dyn_cast<Instruction>(V);
999 if (!I)
1000 return true;
1001 // Limits the number of uses to save compile time.
1002 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1003 all_of(I->users(), [I](User *U) {
1004 auto *IU = dyn_cast<Instruction>(U);
1005 if (!IU)
1006 return true;
1007 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1008 });
1009}
1010
1011/// Checks if the specified value does not require scheduling. It does not
1012/// require scheduling if all operands and all users do not need to be scheduled
1013/// in the current basic block.
1014static bool doesNotNeedToBeScheduled(Value *V) {
1015 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1016}
1017
1018/// Checks if the specified array of instructions does not require scheduling.
1019/// It is so if either all instructions have operands that do not require
1020/// scheduling, or their users do not require scheduling since they are phis or
1021/// in other basic blocks.
1022static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1023 return !VL.empty() &&
1024 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1025}
1026
1027namespace slpvectorizer {
1028
1029/// Bottom Up SLP Vectorizer.
1030class BoUpSLP {
1031 struct TreeEntry;
1032 struct ScheduleData;
1035
1036public:
1037 /// Tracks the state we can represent the loads in the given sequence.
1038 enum class LoadsState {
1039 Gather,
1040 Vectorize,
1041 ScatterVectorize,
1042 StridedVectorize
1043 };
1044
1052
1053 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1054 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1055 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1056 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1057 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1058 AC(AC), DB(DB), DL(DL), ORE(ORE),
1059 Builder(Se->getContext(), TargetFolder(*DL)) {
1060 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1061 // Use the vector register size specified by the target unless overridden
1062 // by a command-line option.
1063 // TODO: It would be better to limit the vectorization factor based on
1064 // data type rather than just register size. For example, x86 AVX has
1065 // 256-bit registers, but it does not support integer operations
1066 // at that width (that requires AVX2).
1067 if (MaxVectorRegSizeOption.getNumOccurrences())
1068 MaxVecRegSize = MaxVectorRegSizeOption;
1069 else
1070 MaxVecRegSize =
1071 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1072 .getFixedValue();
1073
1074 if (MinVectorRegSizeOption.getNumOccurrences())
1075 MinVecRegSize = MinVectorRegSizeOption;
1076 else
1077 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1078 }
1079
1080 /// Vectorize the tree that starts with the elements in \p VL.
1081 /// Returns the vectorized root.
1082 Value *vectorizeTree();
1083
1084 /// Vectorize the tree but with the list of externally used values \p
1085 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1086 /// generated extractvalue instructions.
1087 /// \param ReplacedExternals contains the list of replaced external values
1088 /// {scalar, replace} after emitting extractelement for external uses.
1089 Value *
1090 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1091 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1092 Instruction *ReductionRoot = nullptr);
1093
1094 /// \returns the cost incurred by unwanted spills and fills, caused by
1095 /// holding live values over call sites.
1096 InstructionCost getSpillCost() const;
1097
1098 /// \returns the vectorization cost of the subtree that starts at \p VL.
1099 /// A negative number means that this is profitable.
1100 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1101
1102 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1103 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1104 void buildTree(ArrayRef<Value *> Roots,
1105 const SmallDenseSet<Value *> &UserIgnoreLst);
1106
1107 /// Construct a vectorizable tree that starts at \p Roots.
1108 void buildTree(ArrayRef<Value *> Roots);
1109
1110 /// Returns whether the root node has in-tree uses.
1111 bool doesRootHaveInTreeUses() const {
1112 return !VectorizableTree.empty() &&
1113 !VectorizableTree.front()->UserTreeIndices.empty();
1114 }
1115
1116 /// Return the scalars of the root node.
1117 ArrayRef<Value *> getRootNodeScalars() const {
1118 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1119 return VectorizableTree.front()->Scalars;
1120 }
1121
1122 /// Builds external uses of the vectorized scalars, i.e. the list of
1123 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1124 /// ExternallyUsedValues contains additional list of external uses to handle
1125 /// vectorization of reductions.
1126 void
1127 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1128
1129 /// Clear the internal data structures that are created by 'buildTree'.
1130 void deleteTree() {
1131 VectorizableTree.clear();
1132 ScalarToTreeEntry.clear();
1133 MultiNodeScalars.clear();
1134 MustGather.clear();
1135 EntryToLastInstruction.clear();
1136 ExternalUses.clear();
1137 ExternalUsesAsGEPs.clear();
1138 for (auto &Iter : BlocksSchedules) {
1139 BlockScheduling *BS = Iter.second.get();
1140 BS->clear();
1141 }
1142 MinBWs.clear();
1143 ReductionBitWidth = 0;
1144 CastMaxMinBWSizes.reset();
1145 ExtraBitWidthNodes.clear();
1146 InstrElementSize.clear();
1147 UserIgnoreList = nullptr;
1148 PostponedGathers.clear();
1149 ValueToGatherNodes.clear();
1150 }
1151
1152 unsigned getTreeSize() const { return VectorizableTree.size(); }
1153
1154 /// Perform LICM and CSE on the newly generated gather sequences.
1155 void optimizeGatherSequence();
1156
1157 /// Checks if the specified gather tree entry \p TE can be represented as a
1158 /// shuffled vector entry + (possibly) permutation with other gathers. It
1159 /// implements the checks only for possibly ordered scalars (Loads,
1160 /// ExtractElement, ExtractValue), which can be part of the graph.
1161 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1162
1163 /// Sort loads into increasing pointers offsets to allow greater clustering.
1164 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1165
1166 /// Gets reordering data for the given tree entry. If the entry is vectorized
1167 /// - just return ReorderIndices, otherwise check if the scalars can be
1168 /// reordered and return the most optimal order.
1169 /// \return std::nullopt if ordering is not important, empty order, if
1170 /// identity order is important, or the actual order.
1171 /// \param TopToBottom If true, include the order of vectorized stores and
1172 /// insertelement nodes, otherwise skip them.
1173 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1174 bool TopToBottom);
1175
1176 /// Reorders the current graph to the most profitable order starting from the
1177 /// root node to the leaf nodes. The best order is chosen only from the nodes
1178 /// of the same size (vectorization factor). Smaller nodes are considered
1179 /// parts of a subgraph with a smaller VF and they are reordered independently.
1180 /// We can do this because we still need to extend smaller nodes to the wider VF
1181 /// and we can merge the reordering shuffles with the widening shuffles.
1182 void reorderTopToBottom();
1183
1184 /// Reorders the current graph to the most profitable order starting from
1185 /// leaves to the root. It allows rotating small subgraphs and reduces the
1186 /// number of reshuffles if the leaf nodes use the same order. In this case we
1187 /// can merge the orders and just shuffle the user node instead of shuffling its
1188 /// operands. Plus, even if the leaf nodes have different orders, it allows
1189 /// sinking the reordering in the graph closer to the root node and merging it
1190 /// later during analysis.
1191 void reorderBottomToTop(bool IgnoreReorder = false);
1192
1193 /// \return The vector element size in bits to use when vectorizing the
1194 /// expression tree ending at \p V. If V is a store, the size is the width of
1195 /// the stored value. Otherwise, the size is the width of the largest loaded
1196 /// value reaching V. This method is used by the vectorizer to calculate
1197 /// vectorization factors.
1198 unsigned getVectorElementSize(Value *V);
1199
1200 /// Compute the minimum type sizes required to represent the entries in a
1201 /// vectorizable tree.
1202 void computeMinimumValueSizes();
1203
1204 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1205 unsigned getMaxVecRegSize() const {
1206 return MaxVecRegSize;
1207 }
1208
1209 // \returns minimum vector register size as set by cl::opt.
1210 unsigned getMinVecRegSize() const {
1211 return MinVecRegSize;
1212 }
1213
1214 unsigned getMinVF(unsigned Sz) const {
1215 return std::max(2U, getMinVecRegSize() / Sz);
1216 }
1217
1218 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1219 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1220 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1221 return MaxVF ? MaxVF : UINT_MAX;
1222 }
1223
1224 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1225 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1226 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1227 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1228 ///
1229 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1230 unsigned canMapToVector(Type *T) const;
1231
1232 /// \returns True if the VectorizableTree is both tiny and not fully
1233 /// vectorizable. We do not vectorize such trees.
1234 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1235
1236 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1237 /// can be load combined in the backend. Load combining may not be allowed in
1238 /// the IR optimizer, so we do not want to alter the pattern. For example,
1239 /// partially transforming a scalar bswap() pattern into vector code is
1240 /// effectively impossible for the backend to undo.
1241 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1242 /// may not be necessary.
1243 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1244
1245 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1246 /// can be load combined in the backend. Load combining may not be allowed in
1247 /// the IR optimizer, so we do not want to alter the pattern. For example,
1248 /// partially transforming a scalar bswap() pattern into vector code is
1249 /// effectively impossible for the backend to undo.
1250 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1251 /// may not be necessary.
1252 bool isLoadCombineCandidate() const;
1253
1254 /// Checks if the given array of loads can be represented as a vectorized,
1255 /// scatter or just simple gather.
1256 /// \param VL list of loads.
1257 /// \param VL0 main load value.
1258 /// \param Order returned order of load instructions.
1259 /// \param PointerOps returned list of pointer operands.
1260 /// \param TryRecursiveCheck used to check if long masked gather can be
1261 /// represented as a series of loads/insert subvector, if profitable.
1262 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1263 SmallVectorImpl<unsigned> &Order,
1264 SmallVectorImpl<Value *> &PointerOps,
1265 bool TryRecursiveCheck = true) const;
1266
1268
1269 /// This structure holds any data we need about the edges being traversed
1270 /// during buildTree_rec(). We keep track of:
1271 /// (i) the user TreeEntry index, and
1272 /// (ii) the index of the edge.
1273 struct EdgeInfo {
1274 EdgeInfo() = default;
1275 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1276 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1277 /// The user TreeEntry.
1278 TreeEntry *UserTE = nullptr;
1279 /// The operand index of the use.
1280 unsigned EdgeIdx = UINT_MAX;
1281#ifndef NDEBUG
1282 friend raw_ostream &operator<<(raw_ostream &OS,
1283 const BoUpSLP::EdgeInfo &EI) {
1284 EI.dump(OS);
1285 return OS;
1286 }
1287 /// Debug print.
1288 void dump(raw_ostream &OS) const {
1289 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1290 << " EdgeIdx:" << EdgeIdx << "}";
1291 }
1292 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1293#endif
1294 bool operator == (const EdgeInfo &Other) const {
1295 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1296 }
1297 };
1298
1299 /// A helper class used for scoring candidates for two consecutive lanes.
1300 class LookAheadHeuristics {
1301 const TargetLibraryInfo &TLI;
1302 const DataLayout &DL;
1303 ScalarEvolution &SE;
1304 const BoUpSLP &R;
1305 int NumLanes; // Total number of lanes (aka vectorization factor).
1306 int MaxLevel; // The maximum recursion depth for accumulating score.
1307
1308 public:
1309 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1310 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1311 int MaxLevel)
1312 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1313 MaxLevel(MaxLevel) {}
1314
1315 // The hard-coded scores listed here are not very important, though it shall
1316 // be higher for better matches to improve the resulting cost. When
1317 // computing the scores of matching one sub-tree with another, we are
1318 // basically counting the number of values that are matching. So even if all
1319 // scores are set to 1, we would still get a decent matching result.
1320 // However, sometimes we have to break ties. For example we may have to
1321 // choose between matching loads vs matching opcodes. This is what these
1322 // scores are helping us with: they provide the order of preference. Also,
1323 // this is important if the scalar is externally used or used in another
1324 // tree entry node in the different lane.
1325
1326 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1327 static const int ScoreConsecutiveLoads = 4;
1328 /// The same load multiple times. This should have a better score than
1329 /// `ScoreSplat` because in x86, for a 2-lane vector, we can represent it
1330 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1331 /// a vector load and 1.0 for a broadcast.
1332 static const int ScoreSplatLoads = 3;
1333 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1334 static const int ScoreReversedLoads = 3;
1335 /// A load candidate for masked gather.
1336 static const int ScoreMaskedGatherCandidate = 1;
1337 /// ExtractElementInst from same vector and consecutive indexes.
1338 static const int ScoreConsecutiveExtracts = 4;
1339 /// ExtractElementInst from same vector and reversed indices.
1340 static const int ScoreReversedExtracts = 3;
1341 /// Constants.
1342 static const int ScoreConstants = 2;
1343 /// Instructions with the same opcode.
1344 static const int ScoreSameOpcode = 2;
1345 /// Instructions with alt opcodes (e.g, add + sub).
1346 static const int ScoreAltOpcodes = 1;
1347 /// Identical instructions (a.k.a. splat or broadcast).
1348 static const int ScoreSplat = 1;
1349 /// Matching with an undef is preferable to failing.
1350 static const int ScoreUndef = 1;
1351 /// Score for failing to find a decent match.
1352 static const int ScoreFail = 0;
1353 /// Score if all users are vectorized.
1354 static const int ScoreAllUserVectorized = 1;
1355
1356 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1357 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1358 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1359 /// MainAltOps.
1360 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1361 ArrayRef<Value *> MainAltOps) const {
1362 if (!isValidElementType(V1->getType()) ||
1363 !isValidElementType(V2->getType()))
1364 return LookAheadHeuristics::ScoreFail;
1365
1366 if (V1 == V2) {
1367 if (isa<LoadInst>(V1)) {
1368 // Returns true if the users of V1 and V2 won't need to be extracted.
1369 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1370 // Bail out if we have too many uses to save compilation time.
1371 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1372 return false;
1373
1374 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1375 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1376 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1377 });
1378 };
1379 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1380 };
1381 // A broadcast of a load can be cheaper on some targets.
1382 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1383 ElementCount::getFixed(NumLanes)) &&
1384 ((int)V1->getNumUses() == NumLanes ||
1385 AllUsersAreInternal(V1, V2)))
1386 return LookAheadHeuristics::ScoreSplatLoads;
1387 }
1388 return LookAheadHeuristics::ScoreSplat;
1389 }
1390
1391 auto *LI1 = dyn_cast<LoadInst>(V1);
1392 auto *LI2 = dyn_cast<LoadInst>(V2);
1393 if (LI1 && LI2) {
1394 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1395 !LI2->isSimple())
1397
1398 std::optional<int> Dist = getPointersDiff(
1399 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1400 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1401 if (!Dist || *Dist == 0) {
1402 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1403 getUnderlyingObject(LI2->getPointerOperand()) &&
1404 R.TTI->isLegalMaskedGather(
1405 FixedVectorType::get(LI1->getType(), NumLanes),
1406 LI1->getAlign()))
1409 }
1410 // The distance is too large - still may be profitable to use masked
1411 // loads/gathers.
1412 if (std::abs(*Dist) > NumLanes / 2)
1414 // This still will detect consecutive loads, but we might have "holes"
1415 // in some cases. It is ok for non-power-2 vectorization and may produce
1416 // better results. It should not affect current vectorization.
1419 }
1420
1421 auto *C1 = dyn_cast<Constant>(V1);
1422 auto *C2 = dyn_cast<Constant>(V2);
1423 if (C1 && C2)
1425
1426 // Extracts from consecutive indexes of the same vector better score as
1427 // the extracts could be optimized away.
1428 Value *EV1;
1429 ConstantInt *Ex1Idx;
1430 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1431 // Undefs are always profitable for extractelements.
1432 // Compiler can easily combine poison and extractelement <non-poison> or
1433 // undef and extractelement <poison>. But combining undef +
1434 // extractelement <non-poison-but-may-produce-poison> requires some
1435 // extra operations.
1436 if (isa<UndefValue>(V2))
1437 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1440 Value *EV2 = nullptr;
1441 ConstantInt *Ex2Idx = nullptr;
1442 if (match(V2,
1444 m_Undef())))) {
1445 // Undefs are always profitable for extractelements.
1446 if (!Ex2Idx)
1448 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1450 if (EV2 == EV1) {
1451 int Idx1 = Ex1Idx->getZExtValue();
1452 int Idx2 = Ex2Idx->getZExtValue();
1453 int Dist = Idx2 - Idx1;
1454 // The distance is too large - still may be profitable to use
1455 // shuffles.
1456 if (std::abs(Dist) == 0)
1458 if (std::abs(Dist) > NumLanes / 2)
1462 }
1464 }
1466 }
1467
1468 auto *I1 = dyn_cast<Instruction>(V1);
1469 auto *I2 = dyn_cast<Instruction>(V2);
1470 if (I1 && I2) {
1471 if (I1->getParent() != I2->getParent())
1473 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1474 Ops.push_back(I1);
1475 Ops.push_back(I2);
1476 InstructionsState S = getSameOpcode(Ops, TLI);
1477 // Note: Only consider instructions with <= 2 operands to avoid
1478 // complexity explosion.
1479 if (S.getOpcode() &&
1480 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1481 !S.isAltShuffle()) &&
1482 all_of(Ops, [&S](Value *V) {
1483 return cast<Instruction>(V)->getNumOperands() ==
1484 S.MainOp->getNumOperands();
1485 }))
1486 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1488 }
1489
1490 if (isa<UndefValue>(V2))
1491 return LookAheadHeuristics::ScoreUndef;
1492
1493 return LookAheadHeuristics::ScoreFail;
1494 }
1495
1496 /// Go through the operands of \p LHS and \p RHS recursively until
1497 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1498 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1499 /// of \p U1 and \p U2), except at the beginning of the recursion where
1500 /// these are set to nullptr.
1501 ///
1502 /// For example:
1503 /// \verbatim
1504 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1505 /// \ / \ / \ / \ /
1506 /// + + + +
1507 /// G1 G2 G3 G4
1508 /// \endverbatim
1509 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1510 /// each level recursively, accumulating the score. It starts from matching
1511 /// the additions at level 0, then moves on to the loads (level 1). The
1512 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1513 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1514 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1515 /// Please note that the order of the operands does not matter, as we
1516 /// evaluate the score of all profitable combinations of operands. In
1517 /// other words the score of G1 and G4 is the same as G1 and G2. This
1518 /// heuristic is based on ideas described in:
1519 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1520 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1521 /// Luís F. W. Góes
1523 Instruction *U2, int CurrLevel,
1524 ArrayRef<Value *> MainAltOps) const {
1525
1526 // Get the shallow score of V1 and V2.
1527 int ShallowScoreAtThisLevel =
1528 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1529
1530 // If reached MaxLevel,
1531 // or if V1 and V2 are not instructions,
1532 // or if they are SPLAT,
1533 // or if they are not consecutive,
1534 // or if profitable to vectorize loads or extractelements, early return
1535 // the current cost.
1536 auto *I1 = dyn_cast<Instruction>(LHS);
1537 auto *I2 = dyn_cast<Instruction>(RHS);
1538 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1539 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1540 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1541 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1542 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1543 ShallowScoreAtThisLevel))
1544 return ShallowScoreAtThisLevel;
1545 assert(I1 && I2 && "Should have early exited.");
1546
1547 // Contains the I2 operand indexes that got matched with I1 operands.
1548 SmallSet<unsigned, 4> Op2Used;
1549
1550 // Recursion towards the operands of I1 and I2. We are trying all possible
1551 // operand pairs, and keeping track of the best score.
1552 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1553 OpIdx1 != NumOperands1; ++OpIdx1) {
1554 // Try to pair op1I with the best operand of I2.
1555 int MaxTmpScore = 0;
1556 unsigned MaxOpIdx2 = 0;
1557 bool FoundBest = false;
1558 // If I2 is commutative try all combinations.
1559 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1560 unsigned ToIdx = isCommutative(I2)
1561 ? I2->getNumOperands()
1562 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1563 assert(FromIdx <= ToIdx && "Bad index");
1564 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1565 // Skip operands already paired with OpIdx1.
1566 if (Op2Used.count(OpIdx2))
1567 continue;
1568 // Recursively calculate the cost at each level
1569 int TmpScore =
1570 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1571 I1, I2, CurrLevel + 1, std::nullopt);
1572 // Look for the best score.
1573 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1574 TmpScore > MaxTmpScore) {
1575 MaxTmpScore = TmpScore;
1576 MaxOpIdx2 = OpIdx2;
1577 FoundBest = true;
1578 }
1579 }
1580 if (FoundBest) {
1581 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1582 Op2Used.insert(MaxOpIdx2);
1583 ShallowScoreAtThisLevel += MaxTmpScore;
1584 }
1585 }
1586 return ShallowScoreAtThisLevel;
1587 }
1588 };
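// The pairwise operand matching performed by getScoreAtLevelRec above can be
// illustrated in isolation. A minimal standalone sketch (the helper names and
// the integer "operands" are purely illustrative, not part of this file): for
// each operand of the first node it greedily picks the highest-scoring,
// not-yet-used operand of the second node and accumulates that score.
#include <functional>
#include <set>
#include <vector>
static int toyAccumulatedScore(const std::vector<int> &OpsA,
                               const std::vector<int> &OpsB,
                               const std::function<int(int, int)> &ShallowScore) {
  int Score = 0;
  std::set<unsigned> Used; // Operands of B already paired with an operand of A.
  for (unsigned IdxA = 0; IdxA != OpsA.size(); ++IdxA) {
    int Best = 0;
    unsigned BestIdxB = 0;
    bool Found = false;
    for (unsigned IdxB = 0; IdxB != OpsB.size(); ++IdxB) {
      if (Used.count(IdxB))
        continue; // Skip operands already paired.
      int Tmp = ShallowScore(OpsA[IdxA], OpsB[IdxB]);
      if (Tmp > Best) { // Keep track of the best pairing for IdxA.
        Best = Tmp;
        BestIdxB = IdxB;
        Found = true;
      }
    }
    if (Found) {
      Used.insert(BestIdxB); // Never revisit the chosen pair.
      Score += Best;
    }
  }
  return Score;
}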
1589 /// A helper data structure to hold the operands of a vector of instructions.
1590 /// This supports a fixed vector length for all operand vectors.
1591 class VLOperands {
1592 /// For each operand we need (i) the value, and (ii) the opcode that it
1593 /// would be attached to if the expression was in a left-linearized form.
1594 /// This is required to avoid illegal operand reordering.
1595 /// For example:
1596 /// \verbatim
1597 /// 0 Op1
1598 /// |/
1599 /// Op1 Op2 Linearized + Op2
1600 /// \ / ----------> |/
1601 /// - -
1602 ///
1603 /// Op1 - Op2 (0 + Op1) - Op2
1604 /// \endverbatim
1605 ///
1606 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1607 ///
1608 /// Another way to think of this is to track all the operations across the
1609 /// path from the operand all the way to the root of the tree and to
1610 /// calculate the operation that corresponds to this path. For example, the
1611 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1612 /// corresponding operation is a '-' (which matches the one in the
1613 /// linearized tree, as shown above).
1614 ///
1615 /// For lack of a better term, we refer to this operation as Accumulated
1616 /// Path Operation (APO).
1617 struct OperandData {
1618 OperandData() = default;
1619 OperandData(Value *V, bool APO, bool IsUsed)
1620 : V(V), APO(APO), IsUsed(IsUsed) {}
1621 /// The operand value.
1622 Value *V = nullptr;
1623 /// TreeEntries only allow a single opcode, or an alternate sequence of
1624 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1625 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1626 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1627 /// (e.g., Add/Mul)
1628 bool APO = false;
1629 /// Helper data for the reordering function.
1630 bool IsUsed = false;
1631 };
1632
1633 /// During operand reordering, we are trying to select the operand at lane
1634 /// that matches best with the operand at the neighboring lane. Our
1635 /// selection is based on the type of value we are looking for. For example,
1636 /// if the neighboring lane has a load, we need to look for a load that is
1637 /// accessing a consecutive address. These strategies are summarized in the
1638 /// 'ReorderingMode' enumerator.
1639 enum class ReorderingMode {
1640 Load, ///< Matching loads to consecutive memory addresses
1641 Opcode, ///< Matching instructions based on opcode (same or alternate)
1642 Constant, ///< Matching constants
1643 Splat, ///< Matching the same instruction multiple times (broadcast)
1644 Failed, ///< We failed to create a vectorizable group
1645 };
1646
1647 using OperandDataVec = SmallVector<OperandData, 2>;
1648
1649 /// A vector of operand vectors.
1650 SmallVector<OperandDataVec, 4> OpsVec;
1651
1652 const TargetLibraryInfo &TLI;
1653 const DataLayout &DL;
1654 ScalarEvolution &SE;
1655 const BoUpSLP &R;
1656
1657 /// \returns the operand data at \p OpIdx and \p Lane.
1658 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1659 return OpsVec[OpIdx][Lane];
1660 }
1661
1662 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1663 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1664 return OpsVec[OpIdx][Lane];
1665 }
1666
1667 /// Clears the used flag for all entries.
1668 void clearUsed() {
1669 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1670 OpIdx != NumOperands; ++OpIdx)
1671 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1672 ++Lane)
1673 OpsVec[OpIdx][Lane].IsUsed = false;
1674 }
1675
1676 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1677 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1678 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1679 }
1680
1681 /// \param Lane lane of the operands under analysis.
1682 /// \param OpIdx operand index in \p Lane lane for which we're looking for
1683 /// the best candidate.
1684 /// \param Idx operand index of the current candidate value.
1685 /// \returns The additional score due to possible broadcasting of the
1686 /// elements in the lane. It is more profitable to have a power-of-2 number
1687 /// of unique elements in the lane, since such a lane is more likely to be
1688 /// vectorized after removing duplicates. Currently the SLP vectorizer
1689 /// supports only vectorization of a power-of-2 number of unique scalars.
1690 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1691 Value *IdxLaneV = getData(Idx, Lane).V;
1692 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1693 return 0;
1694 SmallPtrSet<Value *, 4> Uniques;
1695 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1696 if (Ln == Lane)
1697 continue;
1698 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1699 if (!isa<Instruction>(OpIdxLnV))
1700 return 0;
1701 Uniques.insert(OpIdxLnV);
1702 }
1703 int UniquesCount = Uniques.size();
1704 int UniquesCntWithIdxLaneV =
1705 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1706 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1707 int UniquesCntWithOpIdxLaneV =
1708 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1709 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1710 return 0;
1711 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1712 UniquesCntWithOpIdxLaneV) -
1713 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1714 }
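// A minimal standalone sketch of the "distance to the next power of two"
// arithmetic used by getSplatScore above (std::bit_ceil stands in for
// llvm::PowerOf2Ceil; the helper name is illustrative). A positive result
// means the candidate operand leaves the lane closer to a power-of-2 number
// of unique scalars than the currently selected operand does.
#include <bit>
static int toySplatScore(unsigned UniquesWithCandidate,
                         unsigned UniquesWithCurrent) {
  auto DistanceToPow2 = [](unsigned N) {
    return static_cast<int>(std::bit_ceil(N)) - static_cast<int>(N);
  };
  return DistanceToPow2(UniquesWithCurrent) -
         DistanceToPow2(UniquesWithCandidate);
}
// E.g. toySplatScore(/*UniquesWithCandidate=*/5, /*UniquesWithCurrent=*/4)
// is 0 - 3 == -3: the candidate would grow the lane to 5 unique values,
// which is further from a power of two, so it is penalized.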
1715
1716 /// \param Lane lane of the operands under analysis.
1717 /// \param OpIdx operand index in \p Lane lane for which we're looking for
1718 /// the best candidate.
1719 /// \param Idx operand index of the current candidate value.
1720 /// \returns The additional score for a scalar whose users are all
1721 /// vectorized.
1722 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1723 Value *IdxLaneV = getData(Idx, Lane).V;
1724 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1725 // Do not care about number of uses for vector-like instructions
1726 // (extractelement/extractvalue with constant indices), they are extracts
1727 // themselves and already externally used. Vectorization of such
1728 // instructions does not add extra extractelement instruction, just may
1729 // remove it.
1730 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1731 isVectorLikeInstWithConstOps(OpIdxLaneV))
1732 return LookAheadHeuristics::ScoreAllUserVectorized;
1733 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1734 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1735 return 0;
1736 return R.areAllUsersVectorized(IdxLaneI)
1737 ? LookAheadHeuristics::ScoreAllUserVectorized
1738 : 0;
1739 }
1740
1741 /// Score scaling factor for fully compatible instructions but with
1742 /// different number of external uses. Allows better selection of the
1743 /// instructions with less external uses.
1744 static const int ScoreScaleFactor = 10;
1745
1746 /// \Returns the look-ahead score, which tells us how much the sub-trees
1747 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1748 /// score. This helps break ties in an informed way when we cannot decide on
1749 /// the order of the operands by just considering the immediate
1750 /// predecessors.
1751 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1752 int Lane, unsigned OpIdx, unsigned Idx,
1753 bool &IsUsed) {
1754 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1755 LookAheadMaxDepth);
1756 // Keep track of the instruction stack as we recurse into the operands
1757 // during the look-ahead score exploration.
1758 int Score =
1759 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1760 /*CurrLevel=*/1, MainAltOps);
1761 if (Score) {
1762 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1763 if (Score <= -SplatScore) {
1764 // Set the minimum score for splat-like sequence to avoid setting
1765 // failed state.
1766 Score = 1;
1767 } else {
1768 Score += SplatScore;
1769 // Scale score to see the difference between different operands
1770 // and similar operands but all vectorized/not all vectorized
1771 // uses. It does not affect actual selection of the best
1772 // compatible operand in general, just allows to select the
1773 // operand with all vectorized uses.
1774 Score *= ScoreScaleFactor;
1775 Score += getExternalUseScore(Lane, OpIdx, Idx);
1776 IsUsed = true;
1777 }
1778 }
1779 return Score;
1780 }
1781
1782 /// Best defined scores per lanes between the passes. Used to choose the
1783 /// best operand (with the highest score) between the passes.
1784 /// The key - {Operand Index, Lane}.
1785 /// The value - the best score between the passes for the lane and the
1786 /// operand.
1787 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1788 BestScoresPerLanes;
1789
1790 // Search all operands in Ops[*][Lane] for the one that matches best
1791 // Ops[OpIdx][LastLane] and return its operand index.
1792 // If no good match can be found, return std::nullopt.
1793 std::optional<unsigned>
1794 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1795 ArrayRef<ReorderingMode> ReorderingModes,
1796 ArrayRef<Value *> MainAltOps) {
1797 unsigned NumOperands = getNumOperands();
1798
1799 // The operand of the previous lane at OpIdx.
1800 Value *OpLastLane = getData(OpIdx, LastLane).V;
1801
1802 // Our strategy mode for OpIdx.
1803 ReorderingMode RMode = ReorderingModes[OpIdx];
1804 if (RMode == ReorderingMode::Failed)
1805 return std::nullopt;
1806
1807 // The linearized opcode of the operand at OpIdx, Lane.
1808 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1809
1810 // The best operand index and its score.
1811 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1812 // are using the score to differentiate between the two.
1813 struct BestOpData {
1814 std::optional<unsigned> Idx;
1815 unsigned Score = 0;
1816 } BestOp;
1817 BestOp.Score =
1818 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1819 .first->second;
1820
1821 // Track if the operand must be marked as used. If the operand is set to
1822 // Score 1 explicitly (because of a non-power-of-2 number of unique
1823 // scalars), we may want to re-estimate the operands on later iterations.
1824 bool IsUsed =
1825 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1826 // Iterate through all unused operands and look for the best.
1827 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1828 // Get the operand at Idx and Lane.
1829 OperandData &OpData = getData(Idx, Lane);
1830 Value *Op = OpData.V;
1831 bool OpAPO = OpData.APO;
1832
1833 // Skip already selected operands.
1834 if (OpData.IsUsed)
1835 continue;
1836
1837 // Skip if we are trying to move the operand to a position with a
1838 // different opcode in the linearized tree form. This would break the
1839 // semantics.
1840 if (OpAPO != OpIdxAPO)
1841 continue;
1842
1843 // Look for an operand that matches the current mode.
1844 switch (RMode) {
1845 case ReorderingMode::Load:
1846 case ReorderingMode::Constant:
1847 case ReorderingMode::Opcode: {
1848 bool LeftToRight = Lane > LastLane;
1849 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1850 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1851 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1852 OpIdx, Idx, IsUsed);
1853 if (Score > static_cast<int>(BestOp.Score)) {
1854 BestOp.Idx = Idx;
1855 BestOp.Score = Score;
1856 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1857 }
1858 break;
1859 }
1860 case ReorderingMode::Splat:
1861 if (Op == OpLastLane)
1862 BestOp.Idx = Idx;
1863 break;
1864 case ReorderingMode::Failed:
1865 llvm_unreachable("Not expected Failed reordering mode.");
1866 }
1867 }
1868
1869 if (BestOp.Idx) {
1870 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1871 return BestOp.Idx;
1872 }
1873 // If we could not find a good match return std::nullopt.
1874 return std::nullopt;
1875 }
1876
1877 /// Helper for reorderOperandVecs.
1878 /// \returns the lane that we should start reordering from. This is the one
1879 /// with the fewest operands that can freely move about, or the one least
1880 /// profitable to reorder because it already has the best set of operands.
1881 unsigned getBestLaneToStartReordering() const {
1882 unsigned Min = UINT_MAX;
1883 unsigned SameOpNumber = 0;
1884 // std::pair<unsigned, unsigned> is used to implement a simple voting
1885 // algorithm and choose the lane with the fewest operands that can freely
1886 // move about, or that is least profitable because it already has the best
1887 // set of operands. The first unsigned is a counter for voting, the second
1888 // unsigned counts lanes whose instructions have the same/alternate opcode
1889 // and the same parent basic block.
1890 SmallDenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
1891 // Try to be closer to the original results, if we have multiple lanes
1892 // with the same cost. If two lanes have the same cost, use the one with
1893 // the lowest index.
1894 for (int I = getNumLanes(); I > 0; --I) {
1895 unsigned Lane = I - 1;
1896 OperandsOrderData NumFreeOpsHash =
1897 getMaxNumOperandsThatCanBeReordered(Lane);
1898 // Compare the number of operands that can move and choose the one with
1899 // the least number.
1900 if (NumFreeOpsHash.NumOfAPOs < Min) {
1901 Min = NumFreeOpsHash.NumOfAPOs;
1902 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1903 HashMap.clear();
1904 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1905 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1906 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1907 // Select the most optimal lane in terms of number of operands that
1908 // should be moved around.
1909 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1910 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1911 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1912 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1913 auto *It = HashMap.find(NumFreeOpsHash.Hash);
1914 if (It == HashMap.end())
1915 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1916 else
1917 ++It->second.first;
1918 }
1919 }
1920 // Select the lane with the minimum counter.
1921 unsigned BestLane = 0;
1922 unsigned CntMin = UINT_MAX;
1923 for (const auto &Data : reverse(HashMap)) {
1924 if (Data.second.first < CntMin) {
1925 CntMin = Data.second.first;
1926 BestLane = Data.second.second;
1927 }
1928 }
1929 return BestLane;
1930 }
1931
1932 /// Data structure that helps to reorder operands.
1933 struct OperandsOrderData {
1934 /// The best number of operands with the same APOs, which can be
1935 /// reordered.
1936 unsigned NumOfAPOs = UINT_MAX;
1937 /// Number of operands with the same/alternate instruction opcode and
1938 /// parent.
1939 unsigned NumOpsWithSameOpcodeParent = 0;
1940 /// Hash for the actual operands ordering.
1941 /// Used to count operands, actually their position ids and opcode
1942 /// values. It is used in the voting mechanism to find the lane with the
1943 /// fewest operands that can freely move about, or the one least profitable
1944 /// because it already has the best set of operands. Could be replaced with
1945 /// a SmallVector<unsigned> instead, but the hash code is faster and
1946 /// requires less memory.
1947 unsigned Hash = 0;
1948 };
1949 /// \returns the maximum number of operands that are allowed to be reordered
1950 /// for \p Lane and the number of compatible instructions (with the same
1951 /// parent/opcode). This is used as a heuristic for selecting the first lane
1952 /// to start operand reordering.
1953 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1954 unsigned CntTrue = 0;
1955 unsigned NumOperands = getNumOperands();
1956 // Operands with the same APO can be reordered. We therefore need to count
1957 // how many of them we have for each APO, like this: Cnt[APO] = x.
1958 // Since we only have two APOs, namely true and false, we can avoid using
1959 // a map. Instead we can simply count the number of operands that
1960 // correspond to one of them (in this case the 'true' APO), and calculate
1961 // the other by subtracting it from the total number of operands.
1962 // Operands with the same instruction opcode and parent are more
1963 // profitable since we don't need to move them in many cases; such a lane
1964 // can most likely already be vectorized effectively.
1965 bool AllUndefs = true;
1966 unsigned NumOpsWithSameOpcodeParent = 0;
1967 Instruction *OpcodeI = nullptr;
1968 BasicBlock *Parent = nullptr;
1969 unsigned Hash = 0;
1970 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1971 const OperandData &OpData = getData(OpIdx, Lane);
1972 if (OpData.APO)
1973 ++CntTrue;
1974 // Use Boyer-Moore majority voting for finding the majority opcode and
1975 // the number of times it occurs.
1976 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1977 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
1978 I->getParent() != Parent) {
1979 if (NumOpsWithSameOpcodeParent == 0) {
1980 NumOpsWithSameOpcodeParent = 1;
1981 OpcodeI = I;
1982 Parent = I->getParent();
1983 } else {
1984 --NumOpsWithSameOpcodeParent;
1985 }
1986 } else {
1987 ++NumOpsWithSameOpcodeParent;
1988 }
1989 }
1990 Hash = hash_combine(
1991 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1992 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1993 }
1994 if (AllUndefs)
1995 return {};
1996 OperandsOrderData Data;
1997 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
1998 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
1999 Data.Hash = Hash;
2000 return Data;
2001 }
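// The opcode/parent counting in getMaxNumOperandsThatCanBeReordered above is
// an instance of Boyer-Moore majority voting. A minimal standalone sketch of
// the same counting scheme over plain integers (the helper name is
// illustrative): the counter reflects how dominant the most common value is
// without storing per-value counts.
#include <vector>
static unsigned toyMajorityVoteCount(const std::vector<int> &Values) {
  unsigned Count = 0;
  int Candidate = 0;
  bool HasCandidate = false;
  for (int V : Values) {
    if (!HasCandidate || V != Candidate) {
      if (Count == 0) {
        // Adopt a new candidate once the previous one is voted out.
        Candidate = V;
        HasCandidate = true;
        Count = 1;
      } else {
        --Count; // A mismatching value cancels one vote.
      }
    } else {
      ++Count; // Another occurrence of the current candidate.
    }
  }
  return Count;
}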
2002
2003 /// Go through the instructions in VL and append their operands.
2004 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2005 assert(!VL.empty() && "Bad VL");
2006 assert((empty() || VL.size() == getNumLanes()) &&
2007 "Expected same number of lanes");
2008 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2009 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2010 constexpr unsigned IntrinsicNumOperands = 2;
2011 if (isa<IntrinsicInst>(VL[0]))
2012 NumOperands = IntrinsicNumOperands;
2013 OpsVec.resize(NumOperands);
2014 unsigned NumLanes = VL.size();
2015 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2016 OpsVec[OpIdx].resize(NumLanes);
2017 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2018 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2019 // Our tree has just 3 nodes: the root and two operands.
2020 // It is therefore trivial to get the APO. We only need to check the
2021 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2022 // RHS operand. The LHS operand of both add and sub is never attached
2023 // to an inverse operation in the linearized form, therefore its APO
2024 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2025
2026 // Since operand reordering is performed on groups of commutative
2027 // operations or alternating sequences (e.g., +, -), we can safely
2028 // tell the inverse operations by checking commutativity.
2029 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2030 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2031 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2032 APO, false};
2033 }
2034 }
2035 }
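// A minimal standalone sketch of how the APO flag computed in
// appendOperandsOfVL above behaves for a two-operand add/sub bundle (the enum
// and helpers are illustrative, not part of this file): the LHS is never
// attached to an inverse operation in the linearized form, while the RHS is
// inverse-attached exactly when the instruction itself is non-commutative.
enum class ToyOpc { Add, Sub };
static bool toyIsCommutative(ToyOpc Opc) { return Opc == ToyOpc::Add; }
static bool toyComputeAPO(ToyOpc Opc, unsigned OpIdx) {
  bool IsInverseOperation = !toyIsCommutative(Opc);
  return OpIdx == 0 ? false : IsInverseOperation;
}
// E.g. for "A - B": toyComputeAPO(ToyOpc::Sub, 0) is false for A and
// toyComputeAPO(ToyOpc::Sub, 1) is true for B, matching the (0 + A) - B
// linearization described in the OperandData comment.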
2036
2037 /// \returns the number of operands.
2038 unsigned getNumOperands() const { return OpsVec.size(); }
2039
2040 /// \returns the number of lanes.
2041 unsigned getNumLanes() const { return OpsVec[0].size(); }
2042
2043 /// \returns the operand value at \p OpIdx and \p Lane.
2044 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2045 return getData(OpIdx, Lane).V;
2046 }
2047
2048 /// \returns true if the data structure is empty.
2049 bool empty() const { return OpsVec.empty(); }
2050
2051 /// Clears the data.
2052 void clear() { OpsVec.clear(); }
2053
2054 /// \Returns true if there are enough operands identical to \p Op to fill
2055 /// the whole vector.
2056 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2057 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2058 bool OpAPO = getData(OpIdx, Lane).APO;
2059 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2060 if (Ln == Lane)
2061 continue;
2062 // This is set to true if we found a candidate for broadcast at Lane.
2063 bool FoundCandidate = false;
2064 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2065 OperandData &Data = getData(OpI, Ln);
2066 if (Data.APO != OpAPO || Data.IsUsed)
2067 continue;
2068 if (Data.V == Op) {
2069 FoundCandidate = true;
2070 Data.IsUsed = true;
2071 break;
2072 }
2073 }
2074 if (!FoundCandidate)
2075 return false;
2076 }
2077 return true;
2078 }
2079
2080 public:
2081 /// Initialize with all the operands of the instruction vector \p RootVL.
2082 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2083 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
2084 // Append all the operands of RootVL.
2085 appendOperandsOfVL(RootVL);
2086 }
2087
2088 /// \Returns a value vector with the operands across all lanes for the
2089 /// operand at \p OpIdx.
2090 ValueList getVL(unsigned OpIdx) const {
2091 ValueList OpVL(OpsVec[OpIdx].size());
2092 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2093 "Expected same num of lanes across all operands");
2094 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2095 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2096 return OpVL;
2097 }
2098
2099 // Performs operand reordering for 2 or more operands.
2100 // The original operands are in OpsVec[OpIdx][Lane]; they are reordered
2101 // in place, lane by lane.
2102 void reorder() {
2103 unsigned NumOperands = getNumOperands();
2104 unsigned NumLanes = getNumLanes();
2105 // Each operand has its own mode. We are using this mode to help us select
2106 // the instructions for each lane, so that they match best with the ones
2107 // we have selected so far.
2108 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2109
2110 // This is a greedy single-pass algorithm. We are going over each lane
2111 // once and deciding on the best order right away with no back-tracking.
2112 // However, in order to increase its effectiveness, we start with the lane
2113 // that has operands that can move the least. For example, given the
2114 // following lanes:
2115 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2116 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2117 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2118 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2119 // we will start at Lane 1, since the operands of the subtraction cannot
2120 // be reordered. Then we will visit the rest of the lanes in a circular
2121 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2122
2123 // Find the first lane that we will start our search from.
2124 unsigned FirstLane = getBestLaneToStartReordering();
2125
2126 // Initialize the modes.
2127 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2128 Value *OpLane0 = getValue(OpIdx, FirstLane);
2129 // Keep track if we have instructions with all the same opcode on one
2130 // side.
2131 if (isa<LoadInst>(OpLane0))
2132 ReorderingModes[OpIdx] = ReorderingMode::Load;
2133 else if (isa<Instruction>(OpLane0)) {
2134 // Check if OpLane0 should be broadcast.
2135 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2136 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2137 else
2138 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2139 }
2140 else if (isa<Constant>(OpLane0))
2141 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2142 else if (isa<Argument>(OpLane0))
2143 // Our best hope is a Splat. It may save some cost in some cases.
2144 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2145 else
2146 // NOTE: This should be unreachable.
2147 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2148 }
2149
2150 // Check that we don't have the same operands. There is no need to reorder
2151 // if the operands are just a perfect diamond or shuffled diamond match.
2152 // For now, skip this only for possible broadcasts or a non-power-of-2
2153 // number of scalars.
2154 auto &&SkipReordering = [this]() {
2155 SmallPtrSet<Value *, 4> UniqueValues;
2156 ArrayRef<OperandData> Op0 = OpsVec.front();
2157 for (const OperandData &Data : Op0)
2158 UniqueValues.insert(Data.V);
2159 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2160 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2161 return !UniqueValues.contains(Data.V);
2162 }))
2163 return false;
2164 }
2165 // TODO: Check if we can remove a check for non-power-2 number of
2166 // scalars after full support of non-power-2 vectorization.
2167 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2168 };
2169
2170 // If the initial strategy fails for any of the operand indexes, then we
2171 // perform reordering again in a second pass. This helps avoid assigning
2172 // high priority to the failed strategy, and should improve reordering for
2173 // the non-failed operand indexes.
2174 for (int Pass = 0; Pass != 2; ++Pass) {
2175 // Check if there is no need to reorder the operands because they are
2176 // already a perfect or shuffled diamond match.
2177 // Need to do it to avoid extra external use cost counting for
2178 // shuffled matches, which may cause regressions.
2179 if (SkipReordering())
2180 break;
2181 // Skip the second pass if the first pass did not fail.
2182 bool StrategyFailed = false;
2183 // Mark all operand data as free to use.
2184 clearUsed();
2185 // We keep the original operand order for the FirstLane, so reorder the
2186 // rest of the lanes. We are visiting the nodes in a circular fashion,
2187 // using FirstLane as the center point and increasing the radius
2188 // distance.
2189 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2190 for (unsigned I = 0; I < NumOperands; ++I)
2191 MainAltOps[I].push_back(getData(I, FirstLane).V);
2192
2193 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2194 // Visit the lane on the right and then the lane on the left.
2195 for (int Direction : {+1, -1}) {
2196 int Lane = FirstLane + Direction * Distance;
2197 if (Lane < 0 || Lane >= (int)NumLanes)
2198 continue;
2199 int LastLane = Lane - Direction;
2200 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2201 "Out of bounds");
2202 // Look for a good match for each operand.
2203 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2204 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2205 std::optional<unsigned> BestIdx = getBestOperand(
2206 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2207 // By not selecting a value, we allow the operands that follow to
2208 // select a better matching value. We will get a non-null value in
2209 // the next run of getBestOperand().
2210 if (BestIdx) {
2211 // Swap the current operand with the one returned by
2212 // getBestOperand().
2213 swap(OpIdx, *BestIdx, Lane);
2214 } else {
2215 // We failed to find a best operand, set mode to 'Failed'.
2216 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2217 // Enable the second pass.
2218 StrategyFailed = true;
2219 }
2220 // Try to get the alternate opcode and follow it during analysis.
2221 if (MainAltOps[OpIdx].size() != 2) {
2222 OperandData &AltOp = getData(OpIdx, Lane);
2223 InstructionsState OpS =
2224 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2225 if (OpS.getOpcode() && OpS.isAltShuffle())
2226 MainAltOps[OpIdx].push_back(AltOp.V);
2227 }
2228 }
2229 }
2230 }
2231 // Skip second pass if the strategy did not fail.
2232 if (!StrategyFailed)
2233 break;
2234 }
2235 }
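// A minimal standalone sketch of the circular lane-visiting order used by
// reorder() above (the helper name is illustrative): starting from FirstLane,
// lanes are visited at increasing distance, right side before left side,
// skipping out-of-range indices.
#include <vector>
static std::vector<int> toyLaneVisitOrder(int FirstLane, int NumLanes) {
  std::vector<int> Order;
  for (int Distance = 1; Distance != NumLanes; ++Distance) {
    for (int Direction : {+1, -1}) {
      int Lane = FirstLane + Direction * Distance;
      if (Lane < 0 || Lane >= NumLanes)
        continue; // Only one side is available near the vector boundary.
      Order.push_back(Lane);
    }
  }
  return Order;
}
// E.g. toyLaneVisitOrder(/*FirstLane=*/1, /*NumLanes=*/4) yields {2, 0, 3},
// matching the "Lane 2, then Lane 0, and finally Lane 3" order described in
// the comment inside reorder().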
2236
2237#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2238 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2239 switch (RMode) {
2240 case ReorderingMode::Load:
2241 return "Load";
2242 case ReorderingMode::Opcode:
2243 return "Opcode";
2244 case ReorderingMode::Constant:
2245 return "Constant";
2246 case ReorderingMode::Splat:
2247 return "Splat";
2248 case ReorderingMode::Failed:
2249 return "Failed";
2250 }
2251 llvm_unreachable("Unimplemented Reordering Type");
2252 }
2253
2254 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2255 raw_ostream &OS) {
2256 return OS << getModeStr(RMode);
2257 }
2258
2259 /// Debug print.
2260 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2261 printMode(RMode, dbgs());
2262 }
2263
2264 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2265 return printMode(RMode, OS);
2266 }
2267
2268 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2269 const unsigned Indent = 2;
2270 unsigned Cnt = 0;
2271 for (const OperandDataVec &OpDataVec : OpsVec) {
2272 OS << "Operand " << Cnt++ << "\n";
2273 for (const OperandData &OpData : OpDataVec) {
2274 OS.indent(Indent) << "{";
2275 if (Value *V = OpData.V)
2276 OS << *V;
2277 else
2278 OS << "null";
2279 OS << ", APO:" << OpData.APO << "}\n";
2280 }
2281 OS << "\n";
2282 }
2283 return OS;
2284 }
2285
2286 /// Debug print.
2287 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2288#endif
2289 };
2290
2291 /// Evaluate each pair in \p Candidates and return the index into
2292 /// \p Candidates of the pair with the highest score, deemed to have the
2293 /// best chance to form the root of a profitable tree to vectorize. Return
2294 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2295 /// \param Limit Lower limit of the cost, considered to be a good enough score.
2296 std::optional<int>
2297 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2298 int Limit = LookAheadHeuristics::ScoreFail) const {
2299 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2300 RootLookAheadMaxDepth);
2301 int BestScore = Limit;
2302 std::optional<int> Index;
2303 for (int I : seq<int>(0, Candidates.size())) {
2304 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2305 Candidates[I].second,
2306 /*U1=*/nullptr, /*U2=*/nullptr,
2307 /*Level=*/1, std::nullopt);
2308 if (Score > BestScore) {
2309 BestScore = Score;
2310 Index = I;
2311 }
2312 }
2313 return Index;
2314 }
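// A minimal standalone sketch of the threshold-guarded argmax performed by
// findBestRootPair above (the helper name is illustrative): an index is only
// reported when some score strictly exceeds the given limit.
#include <optional>
#include <vector>
static std::optional<int> toyBestIndexAbove(const std::vector<int> &Scores,
                                            int Limit) {
  int Best = Limit;
  std::optional<int> Index;
  for (int I = 0, E = static_cast<int>(Scores.size()); I != E; ++I) {
    if (Scores[I] > Best) { // Strictly better than the limit / previous best.
      Best = Scores[I];
      Index = I;
    }
  }
  return Index;
}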
2315
2316 /// Checks if the instruction is marked for deletion.
2317 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2318
2319 /// Removes an instruction from its block and eventually deletes it.
2320 /// It's like Instruction::eraseFromParent() except that the actual deletion
2321 /// is delayed until BoUpSLP is destructed.
2322 void eraseInstruction(Instruction *I) {
2323 DeletedInstructions.insert(I);
2324 }
2325
2326 /// Checks if the instruction was already analyzed for being possible
2327 /// reduction root.
2328 bool isAnalyzedReductionRoot(Instruction *I) const {
2329 return AnalyzedReductionsRoots.count(I);
2330 }
2331 /// Register given instruction as already analyzed for being possible
2332 /// reduction root.
2333 void analyzedReductionRoot(Instruction *I) {
2334 AnalyzedReductionsRoots.insert(I);
2335 }
2336 /// Checks if the provided list of reduced values was checked already for
2337 /// vectorization.
2338 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2339 return AnalyzedReductionVals.contains(hash_value(VL));
2340 }
2341 /// Adds the list of reduced values to list of already checked values for the
2342 /// vectorization.
2343 void analyzedReductionVals(ArrayRef<Value *> VL) {
2344 AnalyzedReductionVals.insert(hash_value(VL));
2345 }
2346 /// Clear the list of the analyzed reduction root instructions.
2347 void clearReductionData() {
2348 AnalyzedReductionsRoots.clear();
2349 AnalyzedReductionVals.clear();
2350 AnalyzedMinBWVals.clear();
2351 }
2352 /// Checks if the given value is gathered in one of the nodes.
2353 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2354 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2355 }
2356
2357 /// Check if the value is vectorized in the tree.
2358 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2359
2360 ~BoUpSLP();
2361
2362private:
2363 /// Determine if a node \p E can be demoted to a smaller type with a
2364 /// truncation. We collect the entries that will be demoted in ToDemote.
2365 /// \param E Node for analysis
2366 /// \param ToDemote indices of the nodes to be demoted.
2367 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2368 unsigned &BitWidth,
2369 SmallVectorImpl<unsigned> &ToDemote,
2370 DenseSet<const TreeEntry *> &Visited,
2371 unsigned &MaxDepthLevel,
2372 bool &IsProfitableToDemote,
2373 bool IsTruncRoot) const;
2374
2375 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2376 /// reordering (i.e. the operands can be reordered because they have only one
2377 /// user and are reorderable).
2378 /// \param ReorderableGathers List of all gather nodes that require reordering
2379 /// (e.g., gathers of extractelements or partially vectorizable loads).
2380 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2381 /// reordering, subset of \p NonVectorized.
2382 bool
2383 canReorderOperands(TreeEntry *UserTE,
2384 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2385 ArrayRef<TreeEntry *> ReorderableGathers,
2386 SmallVectorImpl<TreeEntry *> &GatherOps);
2387
2388 /// Checks if the given \p TE is a gather node with clustered reused scalars
2389 /// and reorders it per given \p Mask.
2390 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2391
2392 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2393 /// if any. If it is not vectorized (gather node), returns nullptr.
2394 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2395 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2396 TreeEntry *TE = nullptr;
2397 const auto *It = find_if(VL, [&](Value *V) {
2398 TE = getTreeEntry(V);
2399 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2400 return true;
2401 auto It = MultiNodeScalars.find(V);
2402 if (It != MultiNodeScalars.end()) {
2403 for (TreeEntry *E : It->second) {
2404 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2405 TE = E;
2406 return true;
2407 }
2408 }
2409 }
2410 return false;
2411 });
2412 if (It != VL.end()) {
2413 assert(TE->isSame(VL) && "Expected same scalars.");
2414 return TE;
2415 }
2416 return nullptr;
2417 }
2418
2419 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2420 /// if any. If it is not vectorized (gather node), returns nullptr.
2421 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2422 unsigned OpIdx) const {
2423 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2424 const_cast<TreeEntry *>(UserTE), OpIdx);
2425 }
2426
2427 /// Checks if all users of \p I are the part of the vectorization tree.
2428 bool areAllUsersVectorized(
2429 Instruction *I,
2430 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2431
2432 /// Return information about the vector formed for the specified index
2433 /// of a vector of (the same) instruction.
2434 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2435
2436 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2437 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2438
2439 /// \returns Cast context for the given graph node.
2440 TargetTransformInfo::CastContextHint
2441 getCastContextHint(const TreeEntry &TE) const;
2442
2443 /// \returns the cost of the vectorizable entry.
2444 InstructionCost getEntryCost(const TreeEntry *E,
2445 ArrayRef<Value *> VectorizedVals,
2446 SmallPtrSetImpl<Value *> &CheckedExtracts);
2447
2448 /// This is the recursive part of buildTree.
2449 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2450 const EdgeInfo &EI);
2451
2452 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2453 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2454 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2455 /// returns false, setting \p CurrentOrder to either an empty vector or a
2456 /// non-identity permutation that allows reusing extract instructions.
2457 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2458 /// extract order.
2459 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2460 SmallVectorImpl<unsigned> &CurrentOrder,
2461 bool ResizeAllowed = false) const;
2462
2463 /// Vectorize a single entry in the tree.
2464 /// \param PostponedPHIs true if the emission of phi nodes needs to be
2465 /// postponed to avoid issues with def-use order.
2466 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2467
2468 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2469 /// \p E.
2470 /// \param PostponedPHIs true if the emission of phi nodes needs to be
2471 /// postponed to avoid issues with def-use order.
2472 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2473
2474 /// Create a new vector from a list of scalar values. Produces a sequence
2475 /// which exploits values reused across lanes, and arranges the inserts
2476 /// for ease of later optimization.
2477 template <typename BVTy, typename ResTy, typename... Args>
2478 ResTy processBuildVector(const TreeEntry *E, Args &...Params);
2479
2480 /// Create a new vector from a list of scalar values. Produces a sequence
2481 /// which exploits values reused across lanes, and arranges the inserts
2482 /// for ease of later optimization.
2483 Value *createBuildVector(const TreeEntry *E);
2484
2485 /// Returns the instruction in the bundle, which can be used as a base point
2486 /// for scheduling. Usually it is the last instruction in the bundle, except
2487 /// for the case when all operands are external (in this case, it is the first
2488 /// instruction in the list).
2489 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2490
2491 /// Tries to find extractelement instructions with constant indices from fixed
2492 /// vector type and gather such instructions into a bunch, which highly likely
2493 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2494 /// was successful, the matched scalars are replaced by poison values in \p VL
2495 /// for future analysis.
2496 std::optional<TargetTransformInfo::ShuffleKind>
2497 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2498 SmallVectorImpl<int> &Mask) const;
2499
2500 /// Tries to find extractelement instructions with constant indices from fixed
2501 /// vector type and gather such instructions into a bunch, which highly likely
2502 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2503 /// was successful, the matched scalars are replaced by poison values in \p VL
2504 /// for future analysis.
2505 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2506 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2507 SmallVectorImpl<int> &Mask,
2508 unsigned NumParts) const;
2509
2510 /// Checks if the gathered \p VL can be represented as a single register
2511 /// shuffle(s) of previous tree entries.
2512 /// \param TE Tree entry checked for permutation.
2513 /// \param VL List of scalars (a subset of the TE scalars), checked for
2514 /// permutations. Must form a single-register vector.
2515 /// \param ForOrder Tries to fetch the best candidates for ordering info. It
2516 /// also forces the mask to be built using the original vector values,
2517 /// without relying on the potential reordering.
2518 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2519 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2520 std::optional<TargetTransformInfo::ShuffleKind>
2521 isGatherShuffledSingleRegisterEntry(
2522 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2523 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2524 bool ForOrder);
2525
2526 /// Checks if the gathered \p VL can be represented as multi-register
2527 /// shuffle(s) of previous tree entries.
2528 /// \param TE Tree entry checked for permutation.
2529 /// \param VL List of scalars (a subset of the TE scalars), checked for
2530 /// permutations.
2531 /// \param ForOrder Tries to fetch the best candidates for ordering info. It
2532 /// also forces the mask to be built using the original vector values,
2533 /// without relying on the potential reordering.
2534 /// \returns per-register series of ShuffleKind, if gathered values can be
2535 /// represented as shuffles of previous tree entries. \p Mask is filled with
2536 /// the shuffle mask (also on a per-register basis).
2537 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2538 isGatherShuffledEntry(
2539 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2540 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2541 unsigned NumParts, bool ForOrder = false);
2542
2543 /// \returns the scalarization cost for this list of values. Assuming that
2544 /// this subtree gets vectorized, we may need to extract the values from the
2545 /// roots. This method calculates the cost of extracting the values.
2546 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2547 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
2548
2549 /// Set the Builder insert point to one after the last instruction in
2550 /// the bundle
2551 void setInsertPointAfterBundle(const TreeEntry *E);
2552
2553 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2554 /// specified, the starting vector value is poison.
2555 Value *gather(ArrayRef<Value *> VL, Value *Root);
2556
2557 /// \returns whether the VectorizableTree is fully vectorizable and will
2558 /// be beneficial even if the tree height is tiny.
2559 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2560
2561 /// Reorder commutative or alt operands to get better probability of
2562 /// generating vectorized code.
2563 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2564 SmallVectorImpl<Value *> &Left,
2565 SmallVectorImpl<Value *> &Right,
2566 const BoUpSLP &R);
2567
2568 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2569 /// users of \p TE and collects the stores. It returns the map from the store
2570 /// pointers to the collected stores.
2571 DenseMap<Value *, SmallVector<StoreInst *>>
2572 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2573
2574 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2575 /// stores in \p StoresVec can form a vector instruction. If so it returns
2576 /// true and populates \p ReorderIndices with the shuffle indices of the
2577 /// stores when compared to the sorted vector.
2578 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2579 OrdersType &ReorderIndices) const;
2580
2581 /// Iterates through the users of \p TE, looking for scalar stores that can be
2582 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2583 /// their order and builds an order index vector for each store bundle. It
2584 /// returns all these order vectors found.
2585 /// We run this after the tree has formed, otherwise we may come across user
2586 /// instructions that are not yet in the tree.
2587 SmallVector<OrdersType, 1>
2588 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2589
2590 struct TreeEntry {
2591 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2592 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2593
2594 /// \returns Common mask for reorder indices and reused scalars.
2595 SmallVector<int> getCommonMask() const {
2596 SmallVector<int> Mask;
2597 inversePermutation(ReorderIndices, Mask);
2598 ::addMask(Mask, ReuseShuffleIndices);
2599 return Mask;
2600 }
2601
2602 /// \returns true if the scalars in VL are equal to this entry.
2603 bool isSame(ArrayRef<Value *> VL) const {
2604 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2605 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2606 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2607 return VL.size() == Mask.size() &&
2608 std::equal(VL.begin(), VL.end(), Mask.begin(),
2609 [Scalars](Value *V, int Idx) {
2610 return (isa<UndefValue>(V) &&
2611 Idx == PoisonMaskElem) ||
2612 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2613 });
2614 };
2615 if (!ReorderIndices.empty()) {
2616 // TODO: implement matching if the nodes are just reordered, still can
2617 // treat the vector as the same if the list of scalars matches VL
2618 // directly, without reordering.
2619 SmallVector<int> Mask;
2620 inversePermutation(ReorderIndices, Mask);
2621 if (VL.size() == Scalars.size())
2622 return IsSame(Scalars, Mask);
2623 if (VL.size() == ReuseShuffleIndices.size()) {
2624 ::addMask(Mask, ReuseShuffleIndices);
2625 return IsSame(Scalars, Mask);
2626 }
2627 return false;
2628 }
2629 return IsSame(Scalars, ReuseShuffleIndices);
2630 }
2631
2632 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2633 return State == TreeEntry::NeedToGather &&
2634 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2635 UserTreeIndices.front().UserTE == UserEI.UserTE;
2636 }
2637
2638 /// \returns true if current entry has same operands as \p TE.
2639 bool hasEqualOperands(const TreeEntry &TE) const {
2640 if (TE.getNumOperands() != getNumOperands())
2641 return false;
2642 SmallBitVector Used(getNumOperands());
2643 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2644 unsigned PrevCount = Used.count();
2645 for (unsigned K = 0; K < E; ++K) {
2646 if (Used.test(K))
2647 continue;
2648 if (getOperand(K) == TE.getOperand(I)) {
2649 Used.set(K);
2650 break;
2651 }
2652 }
2653 // Check if we actually found the matching operand.
2654 if (PrevCount == Used.count())
2655 return false;
2656 }
2657 return true;
2658 }
2659
2660 /// \return Final vectorization factor for the node. Defined by the total
2661 /// number of vectorized scalars, including those, used several times in the
2662 /// entry and counted in the \a ReuseShuffleIndices, if any.
2663 unsigned getVectorFactor() const {
2664 if (!ReuseShuffleIndices.empty())
2665 return ReuseShuffleIndices.size();
2666 return Scalars.size();
2667 };
2668
2669 /// A vector of scalars.
2670 ValueList Scalars;
2671
2672 /// The Scalars are vectorized into this value. It is initialized to Null.
2673 WeakTrackingVH VectorizedValue = nullptr;
2674
2675 /// New vector phi instructions emitted for the vectorized phi nodes.
2676 PHINode *PHI = nullptr;
2677
2678 /// Do we need to gather this sequence or vectorize it
2679 /// (either with vector instruction or with scatter/gather
2680 /// intrinsics for store/load)?
2681 enum EntryState {
2682 Vectorize,
2683 ScatterVectorize,
2684 StridedVectorize,
2685 NeedToGather
2686 };
2687 EntryState State;
2688
2689 /// Does this sequence require some shuffling?
2690 SmallVector<int, 4> ReuseShuffleIndices;
2691
2692 /// Does this entry require reordering?
2693 SmallVector<unsigned, 4> ReorderIndices;
2694
2695 /// Points back to the VectorizableTree.
2696 ///
2697 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2698 /// to be a pointer and needs to be able to initialize the child iterator.
2699 /// Thus we need a reference back to the container to translate the indices
2700 /// to entries.
2701 VecTreeTy &Container;
2702
2703 /// The TreeEntry index containing the user of this entry. We can actually
2704 /// have multiple users so the data structure is not truly a tree.
2705 SmallVector<EdgeInfo, 1> UserTreeIndices;
2706
2707 /// The index of this treeEntry in VectorizableTree.
2708 int Idx = -1;
2709
2710 private:
2711 /// The operands of each instruction in each lane Operands[op_index][lane].
2712 /// Note: This helps avoid the replication of the code that performs the
2713 /// reordering of operands during buildTree_rec() and vectorizeTree().
2714 SmallVector<ValueList, 2> Operands;
2715
2716 /// The main/alternate instruction.
2717 Instruction *MainOp = nullptr;
2718 Instruction *AltOp = nullptr;
2719
2720 public:
2721 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2722 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2723 if (Operands.size() < OpIdx + 1)
2724 Operands.resize(OpIdx + 1);
2725 assert(Operands[OpIdx].empty() && "Already resized?");
2726 assert(OpVL.size() <= Scalars.size() &&
2727 "Number of operands is greater than the number of scalars.");
2728 Operands[OpIdx].resize(OpVL.size());
2729 copy(OpVL, Operands[OpIdx].begin());
2730 }
2731
2732 /// Set the operands of this bundle in their original order.
2733 void setOperandsInOrder() {
2734 assert(Operands.empty() && "Already initialized?");
2735 auto *I0 = cast<Instruction>(Scalars[0]);
2736 Operands.resize(I0->getNumOperands());
2737 unsigned NumLanes = Scalars.size();
2738 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2739 OpIdx != NumOperands; ++OpIdx) {
2740 Operands[OpIdx].resize(NumLanes);
2741 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2742 auto *I = cast<Instruction>(Scalars[Lane]);
2743 assert(I->getNumOperands() == NumOperands &&
2744 "Expected same number of operands");
2745 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2746 }
2747 }
2748 }
2749
2750 /// Reorders operands of the node to the given mask \p Mask.
2751 void reorderOperands(ArrayRef<int> Mask) {
2752 for (ValueList &Operand : Operands)
2753 reorderScalars(Operand, Mask);
2754 }
2755
2756 /// \returns the \p OpIdx operand of this TreeEntry.
2757 ValueList &getOperand(unsigned OpIdx) {
2758 assert(OpIdx < Operands.size() && "Off bounds");
2759 return Operands[OpIdx];
2760 }
2761
2762 /// \returns the \p OpIdx operand of this TreeEntry.
2763 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2764 assert(OpIdx < Operands.size() && "Off bounds");
2765 return Operands[OpIdx];
2766 }
2767
2768 /// \returns the number of operands.
2769 unsigned getNumOperands() const { return Operands.size(); }
2770
2771 /// \return the single \p OpIdx operand.
2772 Value *getSingleOperand(unsigned OpIdx) const {
2773 assert(OpIdx < Operands.size() && "Off bounds");
2774 assert(!Operands[OpIdx].empty() && "No operand available");
2775 return Operands[OpIdx][0];
2776 }
2777
2778 /// Some of the instructions in the list have alternate opcodes.
2779 bool isAltShuffle() const { return MainOp != AltOp; }
2780
2781 bool isOpcodeOrAlt(Instruction *I) const {
2782 unsigned CheckedOpcode = I->getOpcode();
2783 return (getOpcode() == CheckedOpcode ||
2784 getAltOpcode() == CheckedOpcode);
2785 }
2786
2787 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2788 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2789 /// \p OpValue.
2790 Value *isOneOf(Value *Op) const {
2791 auto *I = dyn_cast<Instruction>(Op);
2792 if (I && isOpcodeOrAlt(I))
2793 return Op;
2794 return MainOp;
2795 }
2796
2797 void setOperations(const InstructionsState &S) {
2798 MainOp = S.MainOp;
2799 AltOp = S.AltOp;
2800 }
2801
2802 Instruction *getMainOp() const {
2803 return MainOp;
2804 }
2805
2806 Instruction *getAltOp() const {
2807 return AltOp;
2808 }
2809
2810 /// The main/alternate opcodes for the list of instructions.
2811 unsigned getOpcode() const {
2812 return MainOp ? MainOp->getOpcode() : 0;
2813 }
2814
2815 unsigned getAltOpcode() const {
2816 return AltOp ? AltOp->getOpcode() : 0;
2817 }
2818
2819 /// When ReorderIndices and ReuseShuffleIndices are empty it just returns the
2820 /// position of \p V within Scalars. Otherwise, remaps through those indices.
2821 int findLaneForValue(Value *V) const {
2822 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2823 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2824 if (!ReorderIndices.empty())
2825 FoundLane = ReorderIndices[FoundLane];
2826 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2827 if (!ReuseShuffleIndices.empty()) {
2828 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2829 find(ReuseShuffleIndices, FoundLane));
2830 }
2831 return FoundLane;
2832 }
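// A minimal standalone sketch of the two-step lane remapping performed by
// findLaneForValue above, on plain vectors (the helper name is illustrative):
// the position in the scalar list is first mapped through the reorder
// indices and then located inside the reuse mask.
#include <algorithm>
#include <vector>
static unsigned toyFindLane(unsigned FoundLane,
                            const std::vector<unsigned> &ReorderIndices,
                            const std::vector<int> &ReuseShuffleIndices) {
  if (!ReorderIndices.empty())
    FoundLane = ReorderIndices[FoundLane];
  if (!ReuseShuffleIndices.empty())
    FoundLane = static_cast<unsigned>(std::distance(
        ReuseShuffleIndices.begin(),
        std::find(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end(),
                  static_cast<int>(FoundLane))));
  return FoundLane;
}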
2833
2834 /// Build a shuffle mask for graph entry which represents a merge of main
2835 /// and alternate operations.
2836 void
2837 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2838 SmallVectorImpl<int> &Mask,
2839 SmallVectorImpl<Value *> *OpScalars = nullptr,
2840 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2841
2842 /// Return true if this is a non-power-of-2 node.
2843 bool isNonPowOf2Vec() const {
2844 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
2845 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
2846 "Reshuffling not supported with non-power-of-2 vectors yet.");
2847 return IsNonPowerOf2;
2848 }
2849
2850#ifndef NDEBUG
2851 /// Debug printer.
2852 LLVM_DUMP_METHOD void dump() const {
2853 dbgs() << Idx << ".\n";
2854 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2855 dbgs() << "Operand " << OpI << ":\n";
2856 for (const Value *V : Operands[OpI])
2857 dbgs().indent(2) << *V << "\n";
2858 }
2859 dbgs() << "Scalars: \n";
2860 for (Value *V : Scalars)
2861 dbgs().indent(2) << *V << "\n";
2862 dbgs() << "State: ";
2863 switch (State) {
2864 case Vectorize:
2865 dbgs() << "Vectorize\n";
2866 break;
2867 case ScatterVectorize:
2868 dbgs() << "ScatterVectorize\n";
2869 break;
2870 case StridedVectorize:
2871 dbgs() << "StridedVectorize\n";
2872 break;
2873 case NeedToGather:
2874 dbgs() << "NeedToGather\n";
2875 break;
2876 }
2877 dbgs() << "MainOp: ";
2878 if (MainOp)
2879 dbgs() << *MainOp << "\n";
2880 else
2881 dbgs() << "NULL\n";
2882 dbgs() << "AltOp: ";
2883 if (AltOp)
2884 dbgs() << *AltOp << "\n";
2885 else
2886 dbgs() << "NULL\n";
2887 dbgs() << "VectorizedValue: ";
2888 if (VectorizedValue)
2889 dbgs() << *VectorizedValue << "\n";
2890 else
2891 dbgs() << "NULL\n";
2892 dbgs() << "ReuseShuffleIndices: ";
2893 if (ReuseShuffleIndices.empty())
2894 dbgs() << "Empty";
2895 else
2896 for (int ReuseIdx : ReuseShuffleIndices)
2897 dbgs() << ReuseIdx << ", ";
2898 dbgs() << "\n";
2899 dbgs() << "ReorderIndices: ";
2900 for (unsigned ReorderIdx : ReorderIndices)
2901 dbgs() << ReorderIdx << ", ";
2902 dbgs() << "\n";
2903 dbgs() << "UserTreeIndices: ";
2904 for (const auto &EInfo : UserTreeIndices)
2905 dbgs() << EInfo << ", ";
2906 dbgs() << "\n";
2907 }
2908#endif
2909 };
2910
2911#ifndef NDEBUG
2912 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2913 InstructionCost VecCost, InstructionCost ScalarCost,
2914 StringRef Banner) const {
2915 dbgs() << "SLP: " << Banner << ":\n";
2916 E->dump();
2917 dbgs() << "SLP: Costs:\n";
2918 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2919 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2920 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2921 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2922 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2923 }
2924#endif
2925
2926 /// Create a new VectorizableTree entry.
2927 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2928 std::optional<ScheduleData *> Bundle,
2929 const InstructionsState &S,
2930 const EdgeInfo &UserTreeIdx,
2931 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2932 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2933 TreeEntry::EntryState EntryState =
2934 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2935 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2936 ReuseShuffleIndices, ReorderIndices);
2937 }
2938
2939 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2940 TreeEntry::EntryState EntryState,
2941 std::optional<ScheduleData *> Bundle,
2942 const InstructionsState &S,
2943 const EdgeInfo &UserTreeIdx,
2944 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2945 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2946 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2947 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2948 "Need to vectorize gather entry?");
2949 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2950 TreeEntry *Last = VectorizableTree.back().get();
2951 Last->Idx = VectorizableTree.size() - 1;
2952 Last->State = EntryState;
2953 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2954 ReuseShuffleIndices.end());
2955 if (ReorderIndices.empty()) {
2956 Last->Scalars.assign(VL.begin(), VL.end());
2957 Last->setOperations(S);
2958 } else {
2959 // Reorder scalars and build final mask.
2960 Last->Scalars.assign(VL.size(), nullptr);
2961 transform(ReorderIndices, Last->Scalars.begin(),
2962 [VL](unsigned Idx) -> Value * {
2963 if (Idx >= VL.size())
2964 return UndefValue::get(VL.front()->getType());
2965 return VL[Idx];
2966 });
2967 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
2968 Last->setOperations(S);
2969 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2970 }
2971 if (Last->State != TreeEntry::NeedToGather) {
2972 for (Value *V : VL) {
2973 const TreeEntry *TE = getTreeEntry(V);
2974 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
2975 "Scalar already in tree!");
2976 if (TE) {
2977 if (TE != Last)
2978 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
2979 continue;
2980 }
2981 ScalarToTreeEntry[V] = Last;
2982 }
2983 // Update the scheduler bundle to point to this TreeEntry.
2984 ScheduleData *BundleMember = *Bundle;
2985 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2986 isVectorLikeInstWithConstOps(S.MainOp) ||
2987 doesNotNeedToSchedule(VL)) &&
2988 "Bundle and VL out of sync");
2989 if (BundleMember) {
2990 for (Value *V : VL) {
2991 if (doesNotNeedToBeScheduled(V))
2992 continue;
2993 if (!BundleMember)
2994 continue;
2995 BundleMember->TE = Last;
2996 BundleMember = BundleMember->NextInBundle;
2997 }
2998 }
2999 assert(!BundleMember && "Bundle and VL out of sync");
3000 } else {
3001 // Build a map for gathered scalars to the nodes where they are used.
3002 bool AllConstsOrCasts = true;
3003 for (Value *V : VL)
3004 if (!isConstant(V)) {
3005 auto *I = dyn_cast<CastInst>(V);
3006 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3007 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3008 }
3009 if (AllConstsOrCasts)
3010 CastMaxMinBWSizes =
3011 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3012 MustGather.insert(VL.begin(), VL.end());
3013 }
3014
3015 if (UserTreeIdx.UserTE) {
3016 Last->UserTreeIndices.push_back(UserTreeIdx);
3017 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3018 "Reordering isn't implemented for non-power-of-2 nodes yet");
3019 }
3020 return Last;
3021 }
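// Illustrative note on the two newTreeEntry() overloads above (a summary,
// not additional upstream code): passing an engaged Bundle produces a
// TreeEntry in the Vectorize state, while std::nullopt forces the
// NeedToGather state; in both cases the scalars are recorded (optionally
// permuted through ReorderIndices), and vectorized entries are registered
// in ScalarToTreeEntry so later getTreeEntry() lookups can find them.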
3022
3023 /// -- Vectorization State --
3024 /// Holds all of the tree entries.
3025 TreeEntry::VecTreeTy VectorizableTree;
3026
3027#ifndef NDEBUG
3028 /// Debug printer.
3029 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3030 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3031 VectorizableTree[Id]->dump();
3032 dbgs() << "\n";
3033 }
3034 }
3035#endif
3036
3037 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3038
3039 const TreeEntry *getTreeEntry(Value *V) const {
3040 return ScalarToTreeEntry.lookup(V);
3041 }
3042
3043 /// Checks that the operand node of an alternate node does not generate a
3044 /// buildvector sequence. If it does, building the alternate shuffle is
3045 /// probably not worth it when the number of buildvector operands plus the
3046 /// alternate instruction exceeds the number of buildvector instructions.
3047 /// \param S the instructions state of the analyzed values.
3048 /// \param VL list of the instructions with alternate opcodes.
3049 bool areAltOperandsProfitable(const InstructionsState &S,
3050 ArrayRef<Value *> VL) const;
3051
3052 /// Checks if the specified list of the instructions/values can be vectorized
3053 /// and fills required data before actual scheduling of the instructions.
3054 TreeEntry::EntryState getScalarsVectorizationState(
3055 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3056 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3057
3058 /// Maps a specific scalar to its tree entry.
3059 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3060
3061 /// List of scalars, used in several vectorize nodes, and the list of the
3062 /// nodes.
3063 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3064
3065 /// Maps a value to the proposed vectorizable size.
3066 SmallDenseMap<Value *, unsigned> InstrElementSize;
3067
3068 /// A list of scalars that we found that we need to keep as scalars.
3069 ValueSet MustGather;
3070
3071 /// A map between the vectorized entries and the last instructions in the
3072 /// bundles. The bundles are built in use order, not in the def order of the
3073 /// instructions, so we cannot rely on the last instruction in a bundle also
3074 /// being the last instruction in program order during vectorization (the
3075 /// basic blocks are being modified); these instructions have to be
3076 /// pre-gathered before the transformation.
3077 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3078
3079 /// List of gather nodes that depend on other gather/vector nodes and should
3080 /// be emitted after the vector instruction emission process to correctly
3081 /// handle the order of the vector instructions and shuffles.
3082 SetVector<const TreeEntry *> PostponedGathers;
3083
3084 using ValueToGatherNodesMap =
3085 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3086 ValueToGatherNodesMap ValueToGatherNodes;
3087
3088 /// This POD struct describes one external user in the vectorized tree.
3089 struct ExternalUser {
3090 ExternalUser(Value *S, llvm::User *U, int L)
3091 : Scalar(S), User(U), Lane(L) {}
3092
3093 // Which scalar in our function.
3094 Value *Scalar;
3095
3096 // Which user that uses the scalar.
3097 llvm::User *User;
3098
3099 // Which lane does the scalar belong to.
3100 int Lane;
3101 };
3102 using UserList = SmallVector<ExternalUser, 16>;
3103
3104 /// Checks if two instructions may access the same memory.
3105 ///
3106 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3107 /// is invariant in the calling loop.
3108 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3109 Instruction *Inst2) {
3110 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3111 return true;
3112 // First check if the result is already in the cache.
3113 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3114 auto It = AliasCache.find(Key);
3115 if (It != AliasCache.end())
3116 return It->second;
3117 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3118 // Store the result in the cache.
3119 AliasCache.try_emplace(Key, Aliased);
3120 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3121 return Aliased;
3122 }
3123
3124 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3125
3126 /// Cache for alias results.
3127 /// TODO: consider moving this to the AliasAnalysis itself.
3128 DenseMap<AliasCacheKey, bool> AliasCache;
3129
3130 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3131 // globally through SLP because we don't perform any action which
3132 // invalidates capture results.
3133 BatchAAResults BatchAA;
3134
3135 /// Temporary store for deleted instructions. Instructions will be deleted
3136 /// eventually when the BoUpSLP is destructed. The deferral is required to
3137 /// ensure that there are no incorrect collisions in the AliasCache, which
3138 /// can happen if a new instruction is allocated at the same address as a
3139 /// previously deleted instruction.
3140 DenseSet<Instruction *> DeletedInstructions;
3141
3142 /// Set of the instructions already analyzed for reductions.
3143 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3144
3145 /// Set of hashes for the list of reduction values already being analyzed.
3146 DenseSet<size_t> AnalyzedReductionVals;
3147
3148 /// Values already analyzed for minimal bitwidth and found to be
3149 /// non-profitable.
3150 DenseSet<Value *> AnalyzedMinBWVals;
3151
3152 /// A list of values that need to extracted out of the tree.
3153 /// This list holds pairs of (Internal Scalar : External User). External User
3154 /// can be nullptr, it means that this Internal Scalar will be used later,
3155 /// after vectorization.
3156 UserList ExternalUses;
3157
3158 /// A list of GEPs which can be replaced by scalar GEPs instead of
3159 /// extractelement instructions.
3160 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3161
3162 /// Values used only by @llvm.assume calls.
3163 SmallPtrSet<const Value *, 32> EphValues;
3164
3165 /// Holds all of the instructions that we gathered, shuffle instructions and
3166 /// extractelements.
3167 SetVector<Instruction *> GatherShuffleExtractSeq;
3168
3169 /// A list of blocks that we are going to CSE.
3170 DenseSet<BasicBlock *> CSEBlocks;
3171
3172 /// Contains all scheduling relevant data for an instruction.
3173 /// A ScheduleData either represents a single instruction or a member of an
3174 /// instruction bundle (= a group of instructions which is combined into a
3175 /// vector instruction).
3176 struct ScheduleData {
3177 // The initial value for the dependency counters. It means that the
3178 // dependencies are not calculated yet.
3179 enum { InvalidDeps = -1 };
3180
3181 ScheduleData() = default;
3182
3183 void init(int BlockSchedulingRegionID, Value *OpVal) {
3184 FirstInBundle = this;
3185 NextInBundle = nullptr;
3186 NextLoadStore = nullptr;
3187 IsScheduled = false;
3188 SchedulingRegionID = BlockSchedulingRegionID;
3189 clearDependencies();
3190 OpValue = OpVal;
3191 TE = nullptr;
3192 }
3193
3194 /// Verify basic self consistency properties
3195 void verify() {
3196 if (hasValidDependencies()) {
3197 assert(UnscheduledDeps <= Dependencies && "invariant");
3198 } else {
3199 assert(UnscheduledDeps == Dependencies && "invariant");
3200 }
3201
3202 if (IsScheduled) {
3203 assert(isSchedulingEntity() &&
3204 "unexpected scheduled state");
3205 for (const ScheduleData *BundleMember = this; BundleMember;
3206 BundleMember = BundleMember->NextInBundle) {
3207 assert(BundleMember->hasValidDependencies() &&
3208 BundleMember->UnscheduledDeps == 0 &&
3209 "unexpected scheduled state");
3210 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3211 "only bundle is marked scheduled");
3212 }
3213 }
3214
3215 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3216 "all bundle members must be in same basic block");
3217 }
3218
3219 /// Returns true if the dependency information has been calculated.
3220 /// Note that dependency validity can vary between instructions within
3221 /// a single bundle.
3222 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3223
3224 /// Returns true for single instructions and for bundle representatives
3225 /// (= the head of a bundle).
3226 bool isSchedulingEntity() const { return FirstInBundle == this; }
3227
3228 /// Returns true if it represents an instruction bundle and not only a
3229 /// single instruction.
3230 bool isPartOfBundle() const {
3231 return NextInBundle != nullptr || FirstInBundle != this || TE;
3232 }
3233
3234 /// Returns true if it is ready for scheduling, i.e. it has no more
3235 /// unscheduled dependent instructions/bundles.
3236 bool isReady() const {
3237 assert(isSchedulingEntity() &&
3238 "can't consider non-scheduling entity for ready list");
3239 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3240 }
3241
3242 /// Modifies the number of unscheduled dependencies for this instruction,
3243 /// and returns the number of remaining dependencies for the containing
3244 /// bundle.
3245 int incrementUnscheduledDeps(int Incr) {
3246 assert(hasValidDependencies() &&
3247 "increment of unscheduled deps would be meaningless");
3248 UnscheduledDeps += Incr;
3249 return FirstInBundle->unscheduledDepsInBundle();
3250 }
3251
3252 /// Sets the number of unscheduled dependencies to the number of
3253 /// dependencies.
3254 void resetUnscheduledDeps() {
3255 UnscheduledDeps = Dependencies;
3256 }
3257
3258 /// Clears all dependency information.
3259 void clearDependencies() {
3260 Dependencies = InvalidDeps;
3261 resetUnscheduledDeps();
3262 MemoryDependencies.clear();
3263 ControlDependencies.clear();
3264 }
3265
3266 int unscheduledDepsInBundle() const {
3267 assert(isSchedulingEntity() && "only meaningful on the bundle");
3268 int Sum = 0;
3269 for (const ScheduleData *BundleMember = this; BundleMember;
3270 BundleMember = BundleMember->NextInBundle) {
3271 if (BundleMember->UnscheduledDeps == InvalidDeps)
3272 return InvalidDeps;
3273 Sum += BundleMember->UnscheduledDeps;
3274 }
3275 return Sum;
3276 }
3277
3278 void dump(raw_ostream &os) const {
3279 if (!isSchedulingEntity()) {
3280 os << "/ " << *Inst;
3281 } else if (NextInBundle) {
3282 os << '[' << *Inst;
3283 ScheduleData *SD = NextInBundle;
3284 while (SD) {
3285 os << ';' << *SD->Inst;
3286 SD = SD->NextInBundle;
3287 }
3288 os << ']';
3289 } else {
3290 os << *Inst;
3291 }
3292 }
3293
3294 Instruction *Inst = nullptr;
3295
3296 /// Opcode of the current instruction in the schedule data.
3297 Value *OpValue = nullptr;
3298
3299 /// The TreeEntry that this instruction corresponds to.
3300 TreeEntry *TE = nullptr;
3301
3302 /// Points to the head in an instruction bundle (and always to this for
3303 /// single instructions).
3304 ScheduleData *FirstInBundle = nullptr;
3305
3306 /// Singly linked list of all instructions in a bundle. Null if it is a
3307 /// single instruction.
3308 ScheduleData *NextInBundle = nullptr;
3309
3310 /// Singly linked list of all memory instructions (e.g. load, store, call)
3311 /// in the block - until the end of the scheduling region.
3312 ScheduleData *NextLoadStore = nullptr;
3313
3314 /// The dependent memory instructions.
3315 /// This list is derived on demand in calculateDependencies().
3316 SmallVector<ScheduleData *, 4> MemoryDependencies;
3317
3318 /// List of instructions which this instruction could be control dependent
3319 /// on. Allowing such nodes to be scheduled below this one could introduce
3320 /// a runtime fault which didn't exist in the original program.
3321 /// ex: this is a load or udiv following a readonly call which inf loops
3322 SmallVector<ScheduleData *, 4> ControlDependencies;
3323
3324 /// This ScheduleData is in the current scheduling region if this matches
3325 /// the current SchedulingRegionID of BlockScheduling.
3326 int SchedulingRegionID = 0;
3327
3328 /// Used for getting a "good" final ordering of instructions.
3329 int SchedulingPriority = 0;
3330
3331 /// The number of dependencies. This is the number of users of the
3332 /// instruction plus the number of dependent memory instructions (if any).
3333 /// This value is calculated on demand.
3334 /// If InvalidDeps, the number of dependencies is not calculated yet.
3335 int Dependencies = InvalidDeps;
3336
3337 /// The number of dependencies minus the number of dependencies of scheduled
3338 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3339 /// for scheduling.
3340 /// Note that this is negative as long as Dependencies is not calculated.
3341 int UnscheduledDeps = InvalidDeps;
3342
3343 /// True if this instruction is scheduled (or considered as scheduled in the
3344 /// dry-run).
3345 bool IsScheduled = false;
3346 };
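// Illustrative sketch of how the counters above interact (hypothetical
// values): a bundle member whose Dependencies is computed as 3 starts with
// UnscheduledDeps == 3; every time an instruction it depends on gets
// scheduled, incrementUnscheduledDeps(-1) lowers the counter, and once
// unscheduledDepsInBundle() on the bundle head reaches 0 the bundle
// satisfies isReady() and may be inserted into the ready list.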
3347
3348#ifndef NDEBUG
3349 friend inline raw_ostream &operator<<(raw_ostream &os,
3350 const BoUpSLP::ScheduleData &SD) {
3351 SD.dump(os);
3352 return os;
3353 }
3354#endif
3355
3356 friend struct GraphTraits<BoUpSLP *>;
3357 friend struct DOTGraphTraits<BoUpSLP *>;
3358
3359 /// Contains all scheduling data for a basic block.
3360 /// It does not schedule instructions that are not memory reads/writes and
3361 /// whose operands are constants, arguments, phis, or instructions from other
3362 /// blocks, or whose users are phis or live in other blocks. The resulting
3363 /// vector instructions can be placed at the beginning of the basic block
3364 /// without scheduling (if the operands do not need to be scheduled) or at
3365 /// the end of the block (if the users are outside of the block). This saves
3366 /// some compile time and memory used by the compiler.
3367 /// ScheduleData is assigned to each instruction between the boundaries of a
3368 /// tree entry, even to instructions that are not part of the graph; this is
3369 /// required to correctly follow the dependencies between the instructions
3370 /// and schedule them properly. ScheduleData is not allocated for
3371 /// instructions that do not require scheduling, such as phis, nodes
3372 /// containing only extractelements/insertelements, or nodes whose
3373 /// instructions have uses/operands outside of the block.
3375 struct BlockScheduling {
3376 BlockScheduling(BasicBlock *BB)
3377 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3378
3379 void clear() {
3380 ReadyInsts.clear();
3381 ScheduleStart = nullptr;
3382 ScheduleEnd = nullptr;
3383 FirstLoadStoreInRegion = nullptr;
3384 LastLoadStoreInRegion = nullptr;
3385 RegionHasStackSave = false;
3386
3387 // Reduce the maximum schedule region size by the size of the
3388 // previous scheduling run.
3389 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3390 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3391 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3392 ScheduleRegionSize = 0;
3393
3394 // Make a new scheduling region, i.e. all existing ScheduleData is not
3395 // in the new region yet.
3396 ++SchedulingRegionID;
3397 }
3398
3399 ScheduleData *getScheduleData(Instruction *I) {
3400 if (BB != I->getParent())
3401 // Avoid lookup if can't possibly be in map.
3402 return nullptr;
3403 ScheduleData *SD = ScheduleDataMap.lookup(I);
3404 if (SD && isInSchedulingRegion(SD))
3405 return SD;
3406 return nullptr;
3407 }
3408
3409 ScheduleData *getScheduleData(Value *V) {
3410 if (auto *I = dyn_cast<Instruction>(V))
3411 return getScheduleData(I);
3412 return nullptr;
3413 }
3414
3415 ScheduleData *getScheduleData(Value *V, Value *Key) {
3416 if (V == Key)
3417 return getScheduleData(V);
3418 auto I = ExtraScheduleDataMap.find(V);
3419 if (I != ExtraScheduleDataMap.end()) {
3420 ScheduleData *SD = I->second.lookup(Key);
3421 if (SD && isInSchedulingRegion(SD))
3422 return SD;
3423 }
3424 return nullptr;
3425 }
3426
3427 bool isInSchedulingRegion(ScheduleData *SD) const {
3428 return SD->SchedulingRegionID == SchedulingRegionID;
3429 }
3430
3431 /// Marks an instruction as scheduled and puts all dependent ready
3432 /// instructions into the ready-list.
3433 template <typename ReadyListType>
3434 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3435 SD->IsScheduled = true;
3436 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3437
3438 for (ScheduleData *BundleMember = SD; BundleMember;
3439 BundleMember = BundleMember->NextInBundle) {
3440 if (BundleMember->Inst != BundleMember->OpValue)
3441 continue;
3442
3443 // Handle the def-use chain dependencies.
3444
3445 // Decrement the unscheduled counter and insert to ready list if ready.
3446 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3447 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3448 if (OpDef && OpDef->hasValidDependencies() &&
3449 OpDef->incrementUnscheduledDeps(-1) == 0) {
3450 // There are no more unscheduled dependencies after
3451 // decrementing, so we can put the dependent instruction
3452 // into the ready list.
3453 ScheduleData *DepBundle = OpDef->FirstInBundle;
3454 assert(!DepBundle->IsScheduled &&
3455 "already scheduled bundle gets ready");
3456 ReadyList.insert(DepBundle);
3457 LLVM_DEBUG(dbgs()
3458 << "SLP: gets ready (def): " << *DepBundle << "\n");
3459 }
3460 });
3461 };
3462
3463 // If BundleMember is a vector bundle, its operands may have been
3464 // reordered during buildTree(). We therefore need to get its operands
3465 // through the TreeEntry.
3466 if (TreeEntry *TE = BundleMember->TE) {
3467 // Need to search for the lane since the tree entry can be reordered.
3468 int Lane = std::distance(TE->Scalars.begin(),
3469 find(TE->Scalars, BundleMember->Inst));
3470 assert(Lane >= 0 && "Lane not set");
3471
3472 // Since vectorization tree is being built recursively this assertion
3473 // ensures that the tree entry has all operands set before reaching
3474 // this code. Couple of exceptions known at the moment are extracts
3475 // where their second (immediate) operand is not added. Since
3476 // immediates do not affect scheduler behavior this is considered
3477 // okay.
3478 auto *In = BundleMember->Inst;
3479 assert(
3480 In &&
3481 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3482 In->getNumOperands() == TE->getNumOperands()) &&
3483 "Missed TreeEntry operands?");
3484 (void)In; // fake use to avoid build failure when assertions disabled
3485
3486 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3487 OpIdx != NumOperands; ++OpIdx)
3488 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3489 DecrUnsched(I);
3490 } else {
3491 // If BundleMember is a stand-alone instruction, no operand reordering
3492 // has taken place, so we directly access its operands.
3493 for (Use &U : BundleMember->Inst->operands())
3494 if (auto *I = dyn_cast<Instruction>(U.get()))
3495 DecrUnsched(I);
3496 }
3497 // Handle the memory dependencies.
3498 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3499 if (MemoryDepSD->hasValidDependencies() &&
3500 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3501 // There are no more unscheduled dependencies after decrementing,
3502 // so we can put the dependent instruction into the ready list.
3503 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3504 assert(!DepBundle->IsScheduled &&
3505 "already scheduled bundle gets ready");
3506 ReadyList.insert(DepBundle);
3507 LLVM_DEBUG(dbgs()
3508 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3509 }
3510 }
3511 // Handle the control dependencies.
3512 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3513 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3514 // There are no more unscheduled dependencies after decrementing,
3515 // so we can put the dependent instruction into the ready list.
3516 ScheduleData *DepBundle = DepSD->FirstInBundle;
3517 assert(!DepBundle->IsScheduled &&
3518 "already scheduled bundle gets ready");
3519 ReadyList.insert(DepBundle);
3520 LLVM_DEBUG(dbgs()
3521 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3522 }
3523 }
3524 }
3525 }
3526
3527 /// Verify basic self consistency properties of the data structure.
3528 void verify() {
3529 if (!ScheduleStart)
3530 return;
3531
3532 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3533 ScheduleStart->comesBefore(ScheduleEnd) &&
3534 "Not a valid scheduling region?");
3535
3536 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3537 auto *SD = getScheduleData(I);
3538 if (!SD)
3539 continue;
3540 assert(isInSchedulingRegion(SD) &&
3541 "primary schedule data not in window?");
3542 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3543 "entire bundle in window!");
3544 (void)SD;
3545 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3546 }
3547
3548 for (auto *SD : ReadyInsts) {
3549 assert(SD->isSchedulingEntity() && SD->isReady() &&
3550 "item in ready list not ready?");
3551 (void)SD;
3552 }
3553 }
3554
3555 void doForAllOpcodes(Value *V,
3556 function_ref<void(ScheduleData *SD)> Action) {
3557 if (ScheduleData *SD = getScheduleData(V))
3558 Action(SD);
3559 auto I = ExtraScheduleDataMap.find(V);
3560 if (I != ExtraScheduleDataMap.end())
3561 for (auto &P : I->second)
3562 if (isInSchedulingRegion(P.second))
3563 Action(P.second);
3564 }
3565
3566 /// Put all instructions into the ReadyList which are ready for scheduling.
3567 template <typename ReadyListType>
3568 void initialFillReadyList(ReadyListType &ReadyList) {
3569 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3570 doForAllOpcodes(I, [&](ScheduleData *SD) {
3571 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3572 SD->isReady()) {
3573 ReadyList.insert(SD);
3574 LLVM_DEBUG(dbgs()
3575 << "SLP: initially in ready list: " << *SD << "\n");
3576 }
3577 });
3578 }
3579 }
3580
3581 /// Build a bundle from the ScheduleData nodes corresponding to the
3582 /// scalar instruction for each lane.
3583 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3584
3585 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3586 /// cyclic dependencies. This is only a dry-run, no instructions are
3587 /// actually moved at this stage.
3588 /// \returns the scheduling bundle. The returned Optional value is not
3589 /// std::nullopt if \p VL is allowed to be scheduled.
3590 std::optional<ScheduleData *>
3591 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3592 const InstructionsState &S);
3593
3594 /// Un-bundles a group of instructions.
3595 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3596
3597 /// Allocates schedule data chunk.
3598 ScheduleData *allocateScheduleDataChunks();
3599
3600 /// Extends the scheduling region so that V is inside the region.
3601 /// \returns true if the region size is within the limit.
3602 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3603
3604 /// Initialize the ScheduleData structures for new instructions in the
3605 /// scheduling region.
3606 void initScheduleData(Instruction *FromI, Instruction *ToI,
3607 ScheduleData *PrevLoadStore,
3608 ScheduleData *NextLoadStore);
3609
3610 /// Updates the dependency information of a bundle and of all instructions/
3611 /// bundles which depend on the original bundle.
3612 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3613 BoUpSLP *SLP);
3614
3615 /// Sets all instructions in the scheduling region to un-scheduled.
3616 void resetSchedule();
3617
3618 BasicBlock *BB;
3619
3620 /// Simple memory allocation for ScheduleData.
3621 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3622
3623 /// The size of a ScheduleData array in ScheduleDataChunks.
3624 int ChunkSize;
3625
3626 /// The allocator position in the current chunk, which is the last entry
3627 /// of ScheduleDataChunks.
3628 int ChunkPos;
3629
3630 /// Attaches ScheduleData to Instruction.
3631 /// Note that the mapping survives during all vectorization iterations, i.e.
3632 /// ScheduleData structures are recycled.
3633 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3634
3635 /// Attaches ScheduleData to Instruction with the leading key.
3636 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3637 ExtraScheduleDataMap;
3638
3639 /// The ready-list for scheduling (only used for the dry-run).
3640 SetVector<ScheduleData *> ReadyInsts;
3641
3642 /// The first instruction of the scheduling region.
3643 Instruction *ScheduleStart = nullptr;
3644
3645 /// The first instruction _after_ the scheduling region.
3646 Instruction *ScheduleEnd = nullptr;
3647
3648 /// The first memory accessing instruction in the scheduling region
3649 /// (can be null).
3650 ScheduleData *FirstLoadStoreInRegion = nullptr;
3651
3652 /// The last memory accessing instruction in the scheduling region
3653 /// (can be null).
3654 ScheduleData *LastLoadStoreInRegion = nullptr;
3655
3656 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3657 /// region? Used to optimize the dependence calculation for the
3658 /// common case where there isn't.
3659 bool RegionHasStackSave = false;
3660
3661 /// The current size of the scheduling region.
3662 int ScheduleRegionSize = 0;
3663
3664 /// The maximum size allowed for the scheduling region.
3665 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3666
3667 /// The ID of the scheduling region. For a new vectorization iteration this
3668 /// is incremented which "removes" all ScheduleData from the region.
3669 /// Make sure that the initial SchedulingRegionID is greater than the
3670 /// initial SchedulingRegionID in ScheduleData (which is 0).
3671 int SchedulingRegionID = 1;
3672 };
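// Illustrative picture of a scheduling region (hypothetical block contents):
// if ScheduleStart points at %l0 and ScheduleEnd at the first instruction
// after %s3, every instruction in [%l0, %s3] carries a ScheduleData entry
// tagged with the current SchedulingRegionID; clear() only bumps that ID,
// which logically empties the region while the ScheduleData objects stay
// allocated for reuse in the next vectorization attempt.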
3673
3674 /// Attaches the BlockScheduling structures to basic blocks.
3675 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3676
3677 /// Performs the "real" scheduling. Done before vectorization is actually
3678 /// performed in a basic block.
3679 void scheduleBlock(BlockScheduling *BS);
3680
3681 /// List of users to ignore during scheduling and that don't need extracting.
3682 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3683
3684 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3685 /// sorted SmallVectors of unsigned.
3686 struct OrdersTypeDenseMapInfo {
3687 static OrdersType getEmptyKey() {
3688 OrdersType V;
3689 V.push_back(~1U);
3690 return V;
3691 }
3692
3693 static OrdersType getTombstoneKey() {
3694 OrdersType V;
3695 V.push_back(~2U);
3696 return V;
3697 }
3698
3699 static unsigned getHashValue(const OrdersType &V) {
3700 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3701 }
3702
3703 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3704 return LHS == RHS;
3705 }
3706 };
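// Example of the convention above (hypothetical keys): the one-element
// orders {~1U} and {~2U} are reserved as the empty and tombstone keys, so a
// real ordering must never consist of exactly one of those sentinels;
// ordinary orders such as {2, 0, 1, 3} are hashed with hash_combine_range
// and compared element-wise.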
3707
3708 // Analysis and block reference.
3709 Function *F;
3710 ScalarEvolution *SE;
3711 TargetTransformInfo *TTI;
3712 TargetLibraryInfo *TLI;
3713 LoopInfo *LI;
3714 DominatorTree *DT;
3715 AssumptionCache *AC;
3716 DemandedBits *DB;
3717 const DataLayout *DL;
3718 OptimizationRemarkEmitter *ORE;
3719
3720 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3721 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3722
3723 /// Instruction builder to construct the vectorized tree.
3724 IRBuilder<TargetFolder> Builder;
3725
3726 /// A map of scalar integer values to the smallest bit width with which they
3727 /// can legally be represented. The values map to (width, signed) pairs,
3728 /// where "width" indicates the minimum bit width and "signed" is True if the
3729 /// value must be signed-extended, rather than zero-extended, back to its
3730 /// original width.
3731 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
3732
3733 /// Final size of the reduced vector, if the current graph represents the
3734 /// input for the reduction and it was possible to narrow the size of the
3735 /// reduction.
3736 unsigned ReductionBitWidth = 0;
3737
3738 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
3739 /// type sizes, used in the tree.
3740 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3741
3742 /// Indices of the vectorized nodes, which are supposed to be the roots of a
3743 /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
3744 DenseSet<unsigned> ExtraBitWidthNodes;
3745};
3746
3747} // end namespace slpvectorizer
3748
3749template <> struct GraphTraits<BoUpSLP *> {
3750 using TreeEntry = BoUpSLP::TreeEntry;
3751
3752 /// NodeRef has to be a pointer per the GraphWriter.
3753 using NodeRef = TreeEntry *;
3754
3755 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3756
3757 /// Add the VectorizableTree to the index iterator to be able to return
3758 /// TreeEntry pointers.
3759 struct ChildIteratorType
3760 : public iterator_adaptor_base<
3761 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3762 ContainerTy &VectorizableTree;
3763
3764 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3765 ContainerTy &VT)
3766 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3767
3768 NodeRef operator*() { return I->UserTE; }
3769 };
3770
3771 static NodeRef getEntryNode(BoUpSLP &R) {
3772 return R.VectorizableTree[0].get();
3773 }
3774
3775 static ChildIteratorType child_begin(NodeRef N) {
3776 return {N->UserTreeIndices.begin(), N->Container};
3777 }
3778
3779 static ChildIteratorType child_end(NodeRef N) {
3780 return {N->UserTreeIndices.end(), N->Container};
3781 }
3782
3783 /// For the node iterator we just need to turn the TreeEntry iterator into a
3784 /// TreeEntry* iterator so that it dereferences to NodeRef.
3785 class nodes_iterator {
3786 using ItTy = ContainerTy::iterator;
3787 ItTy It;
3788
3789 public:
3790 nodes_iterator(const ItTy &It2) : It(It2) {}
3791 NodeRef operator*() { return It->get(); }
3792 nodes_iterator operator++() {
3793 ++It;
3794 return *this;
3795 }
3796 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3797 };
3798
3799 static nodes_iterator nodes_begin(BoUpSLP *R) {
3800 return nodes_iterator(R->VectorizableTree.begin());
3801 }
3802
3803 static nodes_iterator nodes_end(BoUpSLP *R) {
3804 return nodes_iterator(R->VectorizableTree.end());
3805 }
3806
3807 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3808};
3809
3810template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3811 using TreeEntry = BoUpSLP::TreeEntry;
3812
3813 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3814
3815 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3816 std::string Str;
3817 raw_string_ostream OS(Str);
3818 OS << Entry->Idx << ".\n";
3819 if (isSplat(Entry->Scalars))
3820 OS << "<splat> ";
3821 for (auto *V : Entry->Scalars) {
3822 OS << *V;
3823 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3824 return EU.Scalar == V;
3825 }))
3826 OS << " <extract>";
3827 OS << "\n";
3828 }
3829 return Str;
3830 }
3831
3832 static std::string getNodeAttributes(const TreeEntry *Entry,
3833 const BoUpSLP *) {
3834 if (Entry->State == TreeEntry::NeedToGather)
3835 return "color=red";
3836 if (Entry->State == TreeEntry::ScatterVectorize ||
3837 Entry->State == TreeEntry::StridedVectorize)
3838 return "color=blue";
3839 return "";
3840 }
3841};
3842
3843} // end namespace llvm
3844
3845BoUpSLP::~BoUpSLP() {
3846 SmallVector<WeakTrackingVH> DeadInsts;
3847 for (auto *I : DeletedInstructions) {
3848 for (Use &U : I->operands()) {
3849 auto *Op = dyn_cast<Instruction>(U.get());
3850 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3851 wouldInstructionBeTriviallyDead(Op, TLI))
3852 DeadInsts.emplace_back(Op);
3853 }
3854 I->dropAllReferences();
3855 }
3856 for (auto *I : DeletedInstructions) {
3857 assert(I->use_empty() &&
3858 "trying to erase instruction with users.");
3859 I->eraseFromParent();
3860 }
3861
3862 // Cleanup any dead scalar code feeding the vectorized instructions
3863 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3864
3865#ifdef EXPENSIVE_CHECKS
3866 // If we could guarantee that this call is not extremely slow, we could
3867 // remove the ifdef limitation (see PR47712).
3868 assert(!verifyFunction(*F, &dbgs()));
3869#endif
3870}
3871
3872/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3873/// contains the original mask for the scalars reused in the node. The
3874/// procedure transforms this mask in accordance with the given \p Mask.
3875static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3876 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3877 "Expected non-empty mask.");
3878 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3879 Prev.swap(Reuses);
3880 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3881 if (Mask[I] != PoisonMaskElem)
3882 Reuses[Mask[I]] = Prev[I];
3883}
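// Worked example for reorderReuses() above (hypothetical values): with
// Reuses = {0, 1, 2, 3} and Mask = {3, 2, 1, 0}, each Prev[I] is written to
// Reuses[Mask[I]], giving Reuses = {3, 2, 1, 0}; a PoisonMaskElem entry in
// Mask simply means the corresponding Prev value is not propagated.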
3884
3885/// Reorders the given \p Order according to the given \p Mask. \p Order is
3886/// the original order of the scalars. The procedure transforms the provided
3887/// order in accordance with the given \p Mask. If the resulting \p Order is
3888/// just an identity order, \p Order is cleared.
3889static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
3890 bool BottomOrder = false) {
3891 assert(!Mask.empty() && "Expected non-empty mask.");
3892 unsigned Sz = Mask.size();
3893 if (BottomOrder) {
3894 SmallVector<unsigned> PrevOrder;
3895 if (Order.empty()) {
3896 PrevOrder.resize(Sz);
3897 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
3898 } else {
3899 PrevOrder.swap(Order);
3900 }
3901 Order.assign(Sz, Sz);
3902 for (unsigned I = 0; I < Sz; ++I)
3903 if (Mask[I] != PoisonMaskElem)
3904 Order[I] = PrevOrder[Mask[I]];
3905 if (all_of(enumerate(Order), [&](const auto &Data) {
3906 return Data.value() == Sz || Data.index() == Data.value();
3907 })) {
3908 Order.clear();
3909 return;
3910 }
3911 fixupOrderingIndices(Order);
3912 return;
3913 }
3914 SmallVector<int> MaskOrder;
3915 if (Order.empty()) {
3916 MaskOrder.resize(Sz);
3917 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3918 } else {
3919 inversePermutation(Order, MaskOrder);
3920 }
3921 reorderReuses(MaskOrder, Mask);
3922 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
3923 Order.clear();
3924 return;
3925 }
3926 Order.assign(Sz, Sz);
3927 for (unsigned I = 0; I < Sz; ++I)
3928 if (MaskOrder[I] != PoisonMaskElem)
3929 Order[MaskOrder[I]] = I;
3930 fixupOrderingIndices(Order);
3931}
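// Worked example for reorderOrder() with BottomOrder == false (hypothetical
// values): starting from an empty Order and Mask = {1, 0, 3, 2}, the
// identity MaskOrder {0, 1, 2, 3} is permuted by reorderReuses() into
// {1, 0, 3, 2}; since that is not an identity mask, the final result is
// Order = {1, 0, 3, 2}. An identity result would have cleared Order instead.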
3932
3933std::optional<BoUpSLP::OrdersType>
3934BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3935 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3936 // Try to find subvector extract/insert patterns and reorder only such
3937 // patterns.
3938 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
3939 Type *ScalarTy = GatheredScalars.front()->getType();
3940 int NumScalars = GatheredScalars.size();
3941 if (!isValidElementType(ScalarTy))
3942 return std::nullopt;
3943 auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
3944 int NumParts = TTI->getNumberOfParts(VecTy);
3945 if (NumParts == 0 || NumParts >= NumScalars)
3946 NumParts = 1;
3947 SmallVector<int> ExtractMask;
3948 SmallVector<int> Mask;
3949 SmallVector<SmallVector<const TreeEntry *>> Entries;
3950 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
3951 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
3952 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
3953 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
3954 /*ForOrder=*/true);
3955 // No shuffled operands - ignore.
3956 if (GatherShuffles.empty() && ExtractShuffles.empty())
3957 return std::nullopt;
3958 OrdersType CurrentOrder(NumScalars, NumScalars);
3959 if (GatherShuffles.size() == 1 &&
3960 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
3961 Entries.front().front()->isSame(TE.Scalars)) {
3962 // Perfect match in the graph, will reuse the previously vectorized
3963 // node. Cost is 0.
3964 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
3965 return CurrentOrder;
3966 }
3967 auto IsSplatMask = [](ArrayRef<int> Mask) {
3968 int SingleElt = PoisonMaskElem;
3969 return all_of(Mask, [&](int I) {
3970 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
3971 SingleElt = I;
3972 return I == PoisonMaskElem || I == SingleElt;
3973 });
3974 };
3975 // Exclusive broadcast mask - ignore.
3976 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
3977 (Entries.size() != 1 ||
3978 Entries.front().front()->ReorderIndices.empty())) ||
3979 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
3980 return std::nullopt;
3981 SmallBitVector ShuffledSubMasks(NumParts);
3982 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
3983 ArrayRef<int> Mask, int PartSz, int NumParts,
3984 function_ref<unsigned(unsigned)> GetVF) {
3985 for (int I : seq<int>(0, NumParts)) {
3986 if (ShuffledSubMasks.test(I))
3987 continue;
3988 const int VF = GetVF(I);
3989 if (VF == 0)
3990 continue;
3991 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
3992 // Shuffle of at least 2 vectors - ignore.
3993 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
3994 std::fill(Slice.begin(), Slice.end(), NumScalars);
3995 ShuffledSubMasks.set(I);
3996 continue;
3997 }
3998 // Try to include as many elements from the mask as possible.
3999 int FirstMin = INT_MAX;
4000 int SecondVecFound = false;
4001 for (int K : seq<int>(0, PartSz)) {
4002 int Idx = Mask[I * PartSz + K];
4003 if (Idx == PoisonMaskElem) {
4004 Value *V = GatheredScalars[I * PartSz + K];
4005 if (isConstant(V) && !isa<PoisonValue>(V)) {
4006 SecondVecFound = true;
4007 break;
4008 }
4009 continue;
4010 }
4011 if (Idx < VF) {
4012 if (FirstMin > Idx)
4013 FirstMin = Idx;
4014 } else {
4015 SecondVecFound = true;
4016 break;
4017 }
4018 }
4019 FirstMin = (FirstMin / PartSz) * PartSz;
4020 // Shuffle of at least 2 vectors - ignore.
4021 if (SecondVecFound) {
4022 std::fill(Slice.begin(), Slice.end(), NumScalars);
4023 ShuffledSubMasks.set(I);
4024 continue;
4025 }
4026 for (int K : seq<int>(0, PartSz)) {
4027 int Idx = Mask[I * PartSz + K];
4028 if (Idx == PoisonMaskElem)
4029 continue;
4030 Idx -= FirstMin;
4031 if (Idx >= PartSz) {
4032 SecondVecFound = true;
4033 break;
4034 }
4035 if (CurrentOrder[I * PartSz + Idx] >
4036 static_cast<unsigned>(I * PartSz + K) &&
4037 CurrentOrder[I * PartSz + Idx] !=
4038 static_cast<unsigned>(I * PartSz + Idx))
4039 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4040 }
4041 // Shuffle of at least 2 vectors - ignore.
4042 if (SecondVecFound) {
4043 std::fill(Slice.begin(), Slice.end(), NumScalars);
4044 ShuffledSubMasks.set(I);
4045 continue;
4046 }
4047 }
4048 };
4049 int PartSz = NumScalars / NumParts;
4050 if (!ExtractShuffles.empty())
4051 TransformMaskToOrder(
4052 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4053 if (!ExtractShuffles[I])
4054 return 0U;
4055 unsigned VF = 0;
4056 for (unsigned Idx : seq<unsigned>(0, PartSz)) {
4057 int K = I * PartSz + Idx;
4058 if (ExtractMask[K] == PoisonMaskElem)
4059 continue;
4060 if (!TE.ReuseShuffleIndices.empty())
4061 K = TE.ReuseShuffleIndices[K];
4062 if (!TE.ReorderIndices.empty())
4063 K = std::distance(TE.ReorderIndices.begin(),
4064 find(TE.ReorderIndices, K));
4065 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4066 if (!EI)
4067 continue;
4068 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4069 ->getElementCount()
4070 .getKnownMinValue());
4071 }
4072 return VF;
4073 });
4074 // Check special corner case - single shuffle of the same entry.
4075 if (GatherShuffles.size() == 1 && NumParts != 1) {
4076 if (ShuffledSubMasks.any())
4077 return std::nullopt;
4078 PartSz = NumScalars;
4079 NumParts = 1;
4080 }
4081 if (!Entries.empty())
4082 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4083 if (!GatherShuffles[I])
4084 return 0U;
4085 return std::max(Entries[I].front()->getVectorFactor(),
4086 Entries[I].back()->getVectorFactor());
4087 });
4088 int NumUndefs =
4089 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4090 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4091 return std::nullopt;
4092 return std::move(CurrentOrder);
4093}
4094
4095static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4096 const TargetLibraryInfo &TLI,
4097 bool CompareOpcodes = true) {
4098 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4099 return false;
4100 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4101 if (!GEP1)
4102 return false;
4103 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4104 if (!GEP2)
4105 return false;
4106 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4107 ((isConstant(GEP1->getOperand(1)) &&
4108 isConstant(GEP2->getOperand(1))) ||
4109 !CompareOpcodes ||
4110 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4111 .getOpcode());
4112}
4113
4114/// Calculates minimal alignment as a common alignment.
4115template <typename T>
4116static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4117 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4118 for (Value *V : VL.drop_front())
4119 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4120 return CommonAlignment;
4121}
4122
4123/// Check if \p Order represents reverse order.
4124static bool isReverseOrder(ArrayRef<unsigned> Order) {
4125 unsigned Sz = Order.size();
4126 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4127 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4128 });
4129}
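// Example for isReverseOrder() above (hypothetical values): for Sz == 4,
// Order = {3, 2, 1, 0} is a reverse order, and so is {3, 4, 1, 0} because
// the value Sz (here 4) marks an unspecified slot; {0, 1, 2, 3} is not.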
4130
4131/// Checks if the provided list of pointers \p Pointers represents strided
4132/// pointers for type ElemTy. If they do not, std::nullopt is returned.
4133/// Otherwise, if \p Inst is not specified, an engaged optional value is
4134/// returned just to show that the pointers are strided. If \p Inst is
4135/// specified, the runtime stride is materialized before the given \p Inst.
4136/// \returns std::nullopt if the pointers do not have a runtime stride;
4137/// otherwise nullptr or the actual stride value.
4138static std::optional<Value *>
4139calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4140 const DataLayout &DL, ScalarEvolution &SE,
4141 SmallVectorImpl<unsigned> &SortedIndices,
4142 Instruction *Inst = nullptr) {
4143 SmallVector<const SCEV *> SCEVs;
4144 const SCEV *PtrSCEVLowest = nullptr;
4145 const SCEV *PtrSCEVHighest = nullptr;
4146 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4147 // addresses).
4148 for (Value *Ptr : PointerOps) {
4149 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4150 if (!PtrSCEV)
4151 return std::nullopt;
4152 SCEVs.push_back(PtrSCEV);
4153 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4154 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4155 continue;
4156 }
4157 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4158 if (isa<SCEVCouldNotCompute>(Diff))
4159 return std::nullopt;
4160 if (Diff->isNonConstantNegative()) {
4161 PtrSCEVLowest = PtrSCEV;
4162 continue;
4163 }
4164 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4165 if (isa<SCEVCouldNotCompute>(Diff1))
4166 return std::nullopt;
4167 if (Diff1->isNonConstantNegative()) {
4168 PtrSCEVHighest = PtrSCEV;
4169 continue;
4170 }
4171 }
4172 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4173 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4174 if (isa<SCEVCouldNotCompute>(Dist))
4175 return std::nullopt;
4176 int Size = DL.getTypeStoreSize(ElemTy);
4177 auto TryGetStride = [&](const SCEV *Dist,
4178 const SCEV *Multiplier) -> const SCEV * {
4179 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4180 if (M->getOperand(0) == Multiplier)
4181 return M->getOperand(1);
4182 if (M->getOperand(1) == Multiplier)
4183 return M->getOperand(0);
4184 return nullptr;
4185 }
4186 if (Multiplier == Dist)
4187 return SE.getConstant(Dist->getType(), 1);
4188 return SE.getUDivExactExpr(Dist, Multiplier);
4189 };
4190 // Stride_in_elements = Dist / element_size * (num_elems - 1).
4191 const SCEV *Stride = nullptr;
4192 if (Size != 1 || SCEVs.size() > 2) {
4193 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4194 Stride = TryGetStride(Dist, Sz);
4195 if (!Stride)
4196 return std::nullopt;
4197 }
4198 if (!Stride || isa<SCEVConstant>(Stride))
4199 return std::nullopt;
4200 // Iterate through all pointers and check if all distances are
4201 // unique multiple of Stride.
4202 using DistOrdPair = std::pair<int64_t, int>;
4203 auto Compare = llvm::less_first();
4204 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4205 int Cnt = 0;
4206 bool IsConsecutive = true;
4207 for (const SCEV *PtrSCEV : SCEVs) {
4208 unsigned Dist = 0;
4209 if (PtrSCEV != PtrSCEVLowest) {
4210 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4211 const SCEV *Coeff = TryGetStride(Diff, Stride);
4212 if (!Coeff)
4213 return std::nullopt;
4214 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4215 if (!SC || isa<SCEVCouldNotCompute>(SC))
4216 return std::nullopt;
4217 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4218 SE.getMulExpr(Stride, SC)))
4219 ->isZero())
4220 return std::nullopt;
4221 Dist = SC->getAPInt().getZExtValue();
4222 }
4223 // If the strides are not the same or repeated, we can't vectorize.
4224 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4225 return std::nullopt;
4226 auto Res = Offsets.emplace(Dist, Cnt);
4227 if (!Res.second)
4228 return std::nullopt;
4229 // Consecutive order if the inserted element is the last one.
4230 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4231 ++Cnt;
4232 }
4233 if (Offsets.size() != SCEVs.size())
4234 return std::nullopt;
4235 SortedIndices.clear();
4236 if (!IsConsecutive) {
4237 // Fill SortedIndices array only if it is non-consecutive.
4238 SortedIndices.resize(PointerOps.size());
4239 Cnt = 0;
4240 for (const std::pair<int64_t, int> &Pair : Offsets) {
4241 SortedIndices[Cnt] = Pair.second;
4242 ++Cnt;
4243 }
4244 }
4245 if (!Inst)
4246 return nullptr;
4247 SCEVExpander Expander(SE, DL, "strided-load-vec");
4248 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4249}
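// Illustrative use of calculateRtStride() above (hypothetical pointers): for
// four i32 loads from p, p + s, p + 2 * s and p + 3 * s, where s is only
// known at run time, the pointer SCEVs differ by consecutive multiples of a
// single stride, so the routine reports a strided access (and, when \p Inst
// is given, materializes the stride value right before \p Inst);
// SortedIndices is left empty because the pointers are already consecutive.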
4250
4251BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4252 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4253 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4254 // Check that a vectorized load would load the same memory as a scalar
4255 // load. For example, we don't want to vectorize loads that are smaller
4256 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4257 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4258 // from such a struct, we read/write packed bits disagreeing with the
4259 // unvectorized version.
4260 Type *ScalarTy = VL0->getType();
4261
4262 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4263 return LoadsState::Gather;
4264
4265 // Make sure all loads in the bundle are simple - we can't vectorize
4266 // atomic or volatile loads.
4267 PointerOps.clear();
4268 const unsigned Sz = VL.size();
4269 PointerOps.resize(Sz);
4270 auto *POIter = PointerOps.begin();
4271 for (Value *V : VL) {
4272 auto *L = cast<LoadInst>(V);
4273 if (!L->isSimple())
4274 return LoadsState::Gather;
4275 *POIter = L->getPointerOperand();
4276 ++POIter;
4277 }
4278
4279 Order.clear();
4280 auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
4281 // Check the order of pointer operands or that all pointers are the same.
4282 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4283 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4284 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4285 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4286 "supported with VectorizeNonPowerOf2");
4287 return LoadsState::Gather;
4288 }
4289
4290 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4291 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4292 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4293 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4294 return LoadsState::StridedVectorize;
4295 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4296 return arePointersCompatible(P, PointerOps.front(), *TLI);
4297 })) {
4298 if (IsSorted) {
4299 Value *Ptr0;
4300 Value *PtrN;
4301 if (Order.empty()) {
4302 Ptr0 = PointerOps.front();
4303 PtrN = PointerOps.back();
4304 } else {
4305 Ptr0 = PointerOps[Order.front()];
4306 PtrN = PointerOps[Order.back()];
4307 }
4308 std::optional<int> Diff =
4309 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4310 // Check that the sorted loads are consecutive.
4311 if (static_cast<unsigned>(*Diff) == Sz - 1)
4312 return LoadsState::Vectorize;
4313 // Simple check if not a strided access - clear order.
4314 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4315 // Try to generate strided load node if:
4316 // 1. Target with strided load support is detected.
4317 // 2. The number of loads is greater than MinProfitableStridedLoads,
4318 // or the potential stride <= MaxProfitableLoadStride and the
4319 // potential stride is power-of-2 (to avoid perf regressions for the very
4320 // small number of loads) and max distance > number of loads, or potential
4321 // stride is -1.
4322 // 3. The loads are ordered, or number of unordered loads <=
4323 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4324 // (this check is to avoid extra costs for very expensive shuffles).
4325 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4326 (static_cast<unsigned>(std::abs(*Diff)) <=
4327 MaxProfitableLoadStride * Sz &&
4328 isPowerOf2_32(std::abs(*Diff)))) &&
4329 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4330 *Diff == -(static_cast<int>(Sz) - 1))) {
4331 int Stride = *Diff / static_cast<int>(Sz - 1);
4332 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4333 Align Alignment =
4334 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4335 ->getAlign();
4336 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4337 // Iterate through all pointers and check if all distances are
4338 // unique multiple of Dist.
4339 SmallSet<int, 4> Dists;
4340 for (Value *Ptr : PointerOps) {
4341 int Dist = 0;
4342 if (Ptr == PtrN)
4343 Dist = *Diff;
4344 else if (Ptr != Ptr0)
4345 Dist =
4346 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4347 // If the strides are not the same or repeated, we can't
4348 // vectorize.
4349 if (((Dist / Stride) * Stride) != Dist ||
4350 !Dists.insert(Dist).second)
4351 break;
4352 }
4353 if (Dists.size() == Sz)
4354 return LoadsState::StridedVectorize;
4355 }
4356 }
4357 }
4358 }
4359 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4360 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4361 unsigned MinVF = getMinVF(Sz);
4362 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4363 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4364 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4365 unsigned VectorizedCnt = 0;
4366 SmallVector<LoadsState> States;
4367 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4368 Cnt += VF, ++VectorizedCnt) {
4369 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4370 SmallVector<unsigned> Order;
4371 SmallVector<Value *> PointerOps;
4372 LoadsState LS =
4373 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4374 /*TryRecursiveCheck=*/false);
4375 // Check that the sorted loads are consecutive.
4376 if (LS == LoadsState::Gather)
4377 break;
4378 // If need the reorder - consider as high-cost masked gather for now.
4379 if ((LS == LoadsState::Vectorize ||
4380 LS == LoadsState::StridedVectorize) &&
4381 !Order.empty() && !isReverseOrder(Order))
4382 LS = LoadsState::ScatterVectorize;
4383 States.push_back(LS);
4384 }
4385 // Can be vectorized later as a series of loads/insertelements.
4386 if (VectorizedCnt == VL.size() / VF) {
4387 // Compare masked gather cost and loads + insertsubvector costs.
4388 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4389 InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
4390 Instruction::Load, VecTy,
4391 cast<LoadInst>(VL0)->getPointerOperand(),
4392 /*VariableMask=*/false, CommonAlignment, CostKind);
4393 InstructionCost VecLdCost = 0;
4394 auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
4395 for (auto [I, LS] : enumerate(States)) {
4396 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4397 switch (LS) {
4398 case LoadsState::Vectorize:
4399 VecLdCost += TTI.getMemoryOpCost(
4400 Instruction::Load, SubVecTy, LI0->getAlign(),
4401 LI0->getPointerAddressSpace(), CostKind,
4402 TTI::OperandValueInfo());
4403 break;
4404 case LoadsState::StridedVectorize:
4405 VecLdCost += TTI.getStridedMemoryOpCost(
4406 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4407 /*VariableMask=*/false, CommonAlignment, CostKind);
4408 break;
4409 case LoadsState::ScatterVectorize:
4410 VecLdCost += TTI.getGatherScatterOpCost(
4411 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4412 /*VariableMask=*/false, CommonAlignment, CostKind);
4413 break;
4414 case LoadsState::Gather:
4416 "Expected only consecutive, strided or masked gather loads.");
4417 }
4418 SmallVector<int> ShuffleMask(VL.size());
4419 for (int Idx : seq<int>(0, VL.size()))
4420 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4421 VecLdCost +=
4422 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
4423 ShuffleMask, CostKind, I * VF, SubVecTy);
4424 }
4425 // If masked gather cost is higher - better to vectorize, so
4426 // consider it as a gather node. It will be better estimated
4427 // later.
4428 if (MaskedGatherCost > VecLdCost)
4429 return true;
4430 }
4431 }
4432 return false;
4433 };
4434 // TODO: need to improve analysis of the pointers, if not all of them are
4435 // GEPs or have > 2 operands, we end up with a gather node, which just
4436 // increases the cost.
4437 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4438 bool ProfitableGatherPointers =
4439 L && Sz > 2 &&
4440 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4441 return L->isLoopInvariant(V);
4442 })) <= Sz / 2;
4443 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4444 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4445 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4446 (GEP && GEP->getNumOperands() == 2 &&
4447 isa<Constant, Instruction>(GEP->getOperand(1)));
4448 })) {
4449 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4450 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4451 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4452 // Check if potential masked gather can be represented as series
4453 // of loads + insertsubvectors.
4454 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4455 // If masked gather cost is higher - better to vectorize, so
4456 // consider it as a gather node. It will be better estimated
4457 // later.
4458 return LoadsState::Gather;
4459 }
4460 return LoadsState::ScatterVectorize;
4461 }
4462 }
4463 }
4464
4465 return LoadsState::Gather;
4466}
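// Illustrative outcomes of canVectorizeLoads() above (hypothetical load
// bundles): four loads of consecutive elements give LoadsState::Vectorize;
// loads whose element offsets form a constant stride such as 0, 4, 8, 12
// may give LoadsState::StridedVectorize on targets with strided-load
// support; pointers that only share a loop-invariant base may end up as
// LoadsState::ScatterVectorize (a masked gather); everything else falls
// back to LoadsState::Gather.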
4467
4468static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4469 const DataLayout &DL, ScalarEvolution &SE,
4470 SmallVectorImpl<unsigned> &SortedIndices) {
4471 assert(llvm::all_of(
4472 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4473 "Expected list of pointer operands.");
4474 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4475 // Ptr into, sort and return the sorted indices with values next to one
4476 // another.
4477 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4478 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4479
4480 unsigned Cnt = 1;
4481 for (Value *Ptr : VL.drop_front()) {
4482 bool Found = any_of(Bases, [&](auto &Base) {
4483 std::optional<int> Diff =
4484 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4485 /*StrictCheck=*/true);
4486 if (!Diff)
4487 return false;
4488
4489 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4490 return true;
4491 });
4492
4493 if (!Found) {
4494 // If we haven't found enough to usefully cluster, return early.
4495 if (Bases.size() > VL.size() / 2 - 1)
4496 return false;
4497
4498 // Not found already - add a new Base
4499 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4500 }
4501 }
4502
4503 // For each of the bases, sort the pointers by Offset and check if any of
4504 // the bases become consecutively allocated.
4505 bool AnyConsecutive = false;
4506 for (auto &Base : Bases) {
4507 auto &Vec = Base.second;
4508 if (Vec.size() > 1) {
4509 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4510 const std::tuple<Value *, int, unsigned> &Y) {
4511 return std::get<1>(X) < std::get<1>(Y);
4512 });
4513 int InitialOffset = std::get<1>(Vec[0]);
4514 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4515 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4516 });
4517 }
4518 }
4519
4520 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4521 SortedIndices.clear();
4522 if (!AnyConsecutive)
4523 return false;
4524
4525 for (auto &Base : Bases) {
4526 for (auto &T : Base.second)
4527 SortedIndices.push_back(std::get<2>(T));
4528 }
4529
4530 assert(SortedIndices.size() == VL.size() &&
4531 "Expected SortedIndices to be the size of VL");
4532 return true;
4533}
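// Worked example for clusterSortPtrAccesses() above (hypothetical pointers):
// for VL = {A + 1, B, A, B + 1} with two distinct bases A and B, the
// pointers are grouped per base and sorted by offset, so A and A + 1 become
// adjacent and B and B + 1 become adjacent in the returned order (for
// example SortedIndices = {2, 0, 1, 3}, depending on which base group is
// emitted first).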
4534
4535std::optional<BoUpSLP::OrdersType>
4536BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4537 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4538 Type *ScalarTy = TE.Scalars[0]->getType();
4539
4541 Ptrs.reserve(TE.Scalars.size());
4542 for (Value *V : TE.Scalars) {
4543 auto *L = dyn_cast<LoadInst>(V);
4544 if (!L || !L->isSimple())
4545 return std::nullopt;
4546 Ptrs.push_back(L->getPointerOperand());
4547 }
4548
4549 BoUpSLP::OrdersType Order;
4550 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4551 return std::move(Order);
4552 return std::nullopt;
4553}
4554
4555/// Check if two insertelement instructions are from the same buildvector.
4556static bool areTwoInsertFromSameBuildVector(
4557 InsertElementInst *VU, InsertElementInst *V,
4558 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4559 // Instructions must be from the same basic blocks.
4560 if (VU->getParent() != V->getParent())
4561 return false;
4562 // Checks if 2 insertelements are from the same buildvector.
4563 if (VU->getType() != V->getType())
4564 return false;
4565 // Multiple used inserts are separate nodes.
4566 if (!VU->hasOneUse() && !V->hasOneUse())
4567 return false;
4568 auto *IE1 = VU;
4569 auto *IE2 = V;
4570 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4571 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4572 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4573 return false;
4574 // Go through the vector operand of insertelement instructions trying to find
4575 // either VU as the original vector for IE2 or V as the original vector for
4576 // IE1.
4577 SmallBitVector ReusedIdx(
4578 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4579 bool IsReusedIdx = false;
4580 do {
4581 if (IE2 == VU && !IE1)
4582 return VU->hasOneUse();
4583 if (IE1 == V && !IE2)
4584 return V->hasOneUse();
4585 if (IE1 && IE1 != V) {
4586 unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4587 IsReusedIdx |= ReusedIdx.test(Idx1);
4588 ReusedIdx.set(Idx1);
4589 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4590 IE1 = nullptr;
4591 else
4592 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4593 }
4594 if (IE2 && IE2 != VU) {
4595 unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4596 IsReusedIdx |= ReusedIdx.test(Idx2);
4597 ReusedIdx.set(Idx2);
4598 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4599 IE2 = nullptr;
4600 else
4601 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4602 }
4603 } while (!IsReusedIdx && (IE1 || IE2));
4604 return false;
4605}
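// Editorial note - a sketch of the chain this walks (hypothetical IR):
//   %v0 = insertelement <4 x i32> poison, i32 %a, i32 0
//   %v1 = insertelement <4 x i32> %v0,    i32 %b, i32 1
//   %v2 = insertelement <4 x i32> %v1,    i32 %c, i32 2
// Called on (%v2, %v1), the loop follows the vector operand of %v2 down to
// %v1, so the two inserts are recognized as parts of the same buildvector,
// provided the single-use and no-reused-index checks hold along the way.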
4606
4607std::optional<BoUpSLP::OrdersType>
4608BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4609 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4610 if (TE.isNonPowOf2Vec())
4611 return std::nullopt;
4612
4613 // No need to reorder if need to shuffle reuses, still need to shuffle the
4614 // node.
4615 if (!TE.ReuseShuffleIndices.empty()) {
4616 if (isSplat(TE.Scalars))
4617 return std::nullopt;
4618 // Check if reuse shuffle indices can be improved by reordering.
4619 // For this, check that the reuse mask is "clustered", i.e. each scalar value
4620 // is used once in each submask of size <number_of_scalars>.
4621 // Example: 4 scalar values.
4622 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4623 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4624 // element 3 is used twice in the second submask.
4625 unsigned Sz = TE.Scalars.size();
4626 if (TE.State == TreeEntry::NeedToGather) {
4627 if (std::optional<OrdersType> CurrentOrder =
4628 findReusedOrderedScalars(TE)) {
4629 SmallVector<int> Mask;
4630 fixupOrderingIndices(*CurrentOrder);
4631 inversePermutation(*CurrentOrder, Mask);
4632 ::addMask(Mask, TE.ReuseShuffleIndices);
4633 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4634 unsigned Sz = TE.Scalars.size();
4635 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4636 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4637 if (Idx != PoisonMaskElem)
4638 Res[Idx + K * Sz] = I + K * Sz;
4639 }
4640 return std::move(Res);
4641 }
4642 }
4643 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4644 TTI->getNumberOfParts(FixedVectorType::get(
4645 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4646 return std::nullopt;
4647 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4648 Sz)) {
4649 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4650 if (TE.ReorderIndices.empty())
4651 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4652 else
4653 inversePermutation(TE.ReorderIndices, ReorderMask);
4654 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4655 unsigned VF = ReorderMask.size();
4656 OrdersType ResOrder(VF, VF);
4657 unsigned NumParts = VF / Sz;
4658 SmallBitVector UsedVals(NumParts);
4659 for (unsigned I = 0; I < VF; I += Sz) {
4660 int Val = PoisonMaskElem;
4661 unsigned UndefCnt = 0;
4662 if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
4663 [&](int Idx) {
4664 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4665 Val = Idx;
4666 if (Idx == PoisonMaskElem)
4667 ++UndefCnt;
4668 return Idx != PoisonMaskElem && Idx != Val;
4669 }) ||
4670 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4671 UndefCnt > Sz / 2)
4672 return std::nullopt;
4673 UsedVals.set(Val);
4674 for (unsigned K = 0; K < NumParts; ++K)
4675 ResOrder[Val + Sz * K] = I + K;
4676 }
4677 return std::move(ResOrder);
4678 }
4679 unsigned VF = TE.getVectorFactor();
4680 // Try to build the correct order for extractelement instructions.
4681 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4682 TE.ReuseShuffleIndices.end());
4683 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4684 all_of(TE.Scalars, [Sz](Value *V) {
4685 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4686 return Idx && *Idx < Sz;
4687 })) {
4688 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4689 if (TE.ReorderIndices.empty())
4690 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4691 else
4692 inversePermutation(TE.ReorderIndices, ReorderMask);
4693 for (unsigned I = 0; I < VF; ++I) {
4694 int &Idx = ReusedMask[I];
4695 if (Idx == PoisonMaskElem)
4696 continue;
4697 Value *V = TE.Scalars[ReorderMask[Idx]];
4698 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4699 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4700 }
4701 }
4702 // Build the order of the VF size, need to reorder reuses shuffles, they are
4703 // always of VF size.
4704 OrdersType ResOrder(VF);
4705 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4706 auto *It = ResOrder.begin();
4707 for (unsigned K = 0; K < VF; K += Sz) {
4708 OrdersType CurrentOrder(TE.ReorderIndices);
4709 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4710 if (SubMask.front() == PoisonMaskElem)
4711 std::iota(SubMask.begin(), SubMask.end(), 0);
4712 reorderOrder(CurrentOrder, SubMask);
4713 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4714 std::advance(It, Sz);
4715 }
4716 if (TE.State == TreeEntry::NeedToGather &&
4717 all_of(enumerate(ResOrder),
4718 [](const auto &Data) { return Data.index() == Data.value(); }))
4719 return std::nullopt; // No need to reorder.
4720 return std::move(ResOrder);
4721 }
4722 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4723 any_of(TE.UserTreeIndices,
4724 [](const EdgeInfo &EI) {
4725 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4726 }) &&
4727 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
4728 return std::nullopt;
4729 if ((TE.State == TreeEntry::Vectorize ||
4730 TE.State == TreeEntry::StridedVectorize) &&
4731 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4732 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4733 !TE.isAltShuffle())
4734 return TE.ReorderIndices;
4735 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4736 auto PHICompare = [&](unsigned I1, unsigned I2) {
4737 Value *V1 = TE.Scalars[I1];
4738 Value *V2 = TE.Scalars[I2];
4739 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4740 return false;
4741 if (V1->getNumUses() < V2->getNumUses())
4742 return true;
4743 if (V1->getNumUses() > V2->getNumUses())
4744 return false;
4745 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4746 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4747 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4748 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4749 if (!areTwoInsertFromSameBuildVector(
4750 IE1, IE2,
4751 [](InsertElementInst *II) { return II->getOperand(0); }))
4752 return I1 < I2;
4753 return getInsertIndex(IE1) < getInsertIndex(IE2);
4754 }
4755 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4756 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4757 if (EE1->getOperand(0) != EE2->getOperand(0))
4758 return I1 < I2;
4759 return getInsertIndex(EE1) < getInsertIndex(EE2);
4760 }
4761 return I1 < I2;
4762 };
4763 auto IsIdentityOrder = [](const OrdersType &Order) {
4764 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4765 if (Idx != Order[Idx])
4766 return false;
4767 return true;
4768 };
4769 if (!TE.ReorderIndices.empty())
4770 return TE.ReorderIndices;
4771 DenseMap<unsigned, unsigned> PhiToId;
4772 SmallVector<unsigned> Phis(TE.Scalars.size());
4773 std::iota(Phis.begin(), Phis.end(), 0);
4774 OrdersType ResOrder(TE.Scalars.size());
4775 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4776 PhiToId[Id] = Id;
4777 stable_sort(Phis, PHICompare);
4778 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4779 ResOrder[Id] = PhiToId[Phis[Id]];
4780 if (IsIdentityOrder(ResOrder))
4781 return std::nullopt; // No need to reorder.
4782 return std::move(ResOrder);
4783 }
4784 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4785 allSameType(TE.Scalars)) {
4786 // TODO: add analysis of other gather nodes with extractelement
4787 // instructions and other values/instructions, not only undefs.
4788 if ((TE.getOpcode() == Instruction::ExtractElement ||
4789 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4790 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4791 all_of(TE.Scalars, [](Value *V) {
4792 auto *EE = dyn_cast<ExtractElementInst>(V);
4793 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4794 })) {
4795 // Check that gather of extractelements can be represented as
4796 // just a shuffle of a single vector.
4797 OrdersType CurrentOrder;
4798 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4799 /*ResizeAllowed=*/true);
4800 if (Reuse || !CurrentOrder.empty())
4801 return std::move(CurrentOrder);
4802 }
4803 // If the gather node is <undef, v, .., poison> and
4804 // insertelement poison, v, 0 [+ permute]
4805 // is cheaper than
4806 // insertelement poison, v, n - try to reorder.
4807 // If rotating the whole graph, exclude the permute cost, the whole graph
4808 // might be transformed.
4809 int Sz = TE.Scalars.size();
4810 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4811 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4812 const auto *It =
4813 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4814 if (It == TE.Scalars.begin())
4815 return OrdersType();
4816 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4817 if (It != TE.Scalars.end()) {
4818 OrdersType Order(Sz, Sz);
4819 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4820 Order[Idx] = 0;
4821 fixupOrderingIndices(Order);
4822 SmallVector<int> Mask;
4823 inversePermutation(Order, Mask);
4824 InstructionCost PermuteCost =
4825 TopToBottom
4826 ? 0
4827 : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
4828 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4829 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4830 PoisonValue::get(Ty), *It);
4831 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4832 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4833 PoisonValue::get(Ty), *It);
4834 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4835 OrdersType Order(Sz, Sz);
4836 Order[Idx] = 0;
4837 return std::move(Order);
4838 }
4839 }
4840 }
4841 if (isSplat(TE.Scalars))
4842 return std::nullopt;
4843 if (TE.Scalars.size() >= 4)
4844 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4845 return Order;
4846 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4847 return CurrentOrder;
4848 }
4849 return std::nullopt;
4850}
4851
4852/// Checks if the given mask is a "clustered" mask with the same clusters of
4853/// size \p Sz, which are not identity submasks.
4854static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4855 unsigned Sz) {
4856 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4857 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
4858 return false;
4859 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4860 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4861 if (Cluster != FirstCluster)
4862 return false;
4863 }
4864 return true;
4865}
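// Editorial note - examples (illustrative): with Sz == 2 the mask {1, 0, 1, 0}
// repeats the non-identity cluster {1, 0} and returns true, {0, 1, 0, 1} is
// rejected because the first cluster is an identity submask, and {1, 0, 0, 1}
// is rejected because its clusters differ.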
4866
4867void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4868 // Reorder reuses mask.
4869 reorderReuses(TE.ReuseShuffleIndices, Mask);
4870 const unsigned Sz = TE.Scalars.size();
4871 // For vectorized nodes and non-clustered reuses, no need to do anything else.
4872 if (TE.State != TreeEntry::NeedToGather ||
4873 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4874 Sz) ||
4875 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4876 return;
4877 SmallVector<int> NewMask;
4878 inversePermutation(TE.ReorderIndices, NewMask);
4879 addMask(NewMask, TE.ReuseShuffleIndices);
4880 // Clear reorder since it is going to be applied to the new mask.
4881 TE.ReorderIndices.clear();
4882 // Try to improve gathered nodes with clustered reuses, if possible.
4883 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4884 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4885 inversePermutation(NewOrder, NewMask);
4886 reorderScalars(TE.Scalars, NewMask);
4887 // Fill the reuses mask with the identity submasks.
4888 for (auto *It = TE.ReuseShuffleIndices.begin(),
4889 *End = TE.ReuseShuffleIndices.end();
4890 It != End; std::advance(It, Sz))
4891 std::iota(It, std::next(It, Sz), 0);
4892}
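// Editorial note: the net effect of the path above is that a gathered node
// whose (reordered) reuse mask is a repeated non-identity cluster gets its
// scalars permuted according to the first cluster, while the reuse mask is
// rewritten to repeated identity submasks 0..Sz-1, so the clustering no
// longer needs a separate shuffle.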
4893
4894static void combineOrders(MutableArrayRef<unsigned> Order,
4895 ArrayRef<unsigned> SecondaryOrder) {
4896 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
4897 "Expected same size of orders");
4898 unsigned Sz = Order.size();
4899 SmallBitVector UsedIndices(Sz);
4900 for (unsigned Idx : seq<unsigned>(0, Sz)) {
4901 if (Order[Idx] != Sz)
4902 UsedIndices.set(Order[Idx]);
4903 }
4904 if (SecondaryOrder.empty()) {
4905 for (unsigned Idx : seq<unsigned>(0, Sz))
4906 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
4907 Order[Idx] = Idx;
4908 } else {
4909 for (unsigned Idx : seq<unsigned>(0, Sz))
4910 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
4911 !UsedIndices.test(SecondaryOrder[Idx]))
4912 Order[Idx] = SecondaryOrder[Idx];
4913 }
4914}
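// Editorial note - a small example (illustrative): with Sz == 4 and
// Order = {2, 4, 0, 4}, where 4 marks an unset slot, an empty SecondaryOrder
// fills slot 1 with 1 and slot 3 with 3 (both indices are still unused),
// giving {2, 1, 0, 3}; with a SecondaryOrder, the unset slots are instead
// taken from it when the suggested index is still free.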
4915
4916void BoUpSLP::reorderTopToBottom() {
4917 // Maps VF to the graph nodes.
4918 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
4919 // ExtractElement gather nodes which can be vectorized and need to handle
4920 // their ordering.
4921 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4922
4923 // Phi nodes can have preferred ordering based on their result users
4924 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
4925
4926 // AltShuffles can also have a preferred ordering that leads to fewer
4927 // instructions, e.g., the addsub instruction in x86.
4928 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4929
4930 // Maps a TreeEntry to the reorder indices of external users.
4931 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
4932 ExternalUserReorderMap;
4933 // Find all reorderable nodes with the given VF.
4934 // Currently these are vectorized stores, loads, extracts + some gathering
4935 // of extracts.
4936 for_each(VectorizableTree, [&, &TTIRef = *TTI](
4937 const std::unique_ptr<TreeEntry> &TE) {
4938 // Look for external users that will probably be vectorized.
4939 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4940 findExternalStoreUsersReorderIndices(TE.get());
4941 if (!ExternalUserReorderIndices.empty()) {
4942 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4943 ExternalUserReorderMap.try_emplace(TE.get(),
4944 std::move(ExternalUserReorderIndices));
4945 }
4946
4947 // Patterns like [fadd,fsub] can be combined into a single instruction in
4948 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4949 // to take into account their order when looking for the most used order.
4950 if (TE->isAltShuffle()) {
4951 VectorType *VecTy =
4952 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
4953 unsigned Opcode0 = TE->getOpcode();
4954 unsigned Opcode1 = TE->getAltOpcode();
4955 // The opcode mask selects between the two opcodes.
4956 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4957 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4958 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4959 OpcodeMask.set(Lane);
4960 // If this pattern is supported by the target then we consider the order.
4961 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4962 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4963 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
4964 }
4965 // TODO: Check the reverse order too.
4966 }
4967
4968 if (std::optional<OrdersType> CurrentOrder =
4969 getReorderingData(*TE, /*TopToBottom=*/true)) {
4970 // Do not include ordering for nodes used in the alt opcode vectorization,
4971 // better to reorder them during bottom-to-top stage. If we follow the order
4972 // here, it causes reordering of the whole graph, though actually it is
4973 // profitable just to reorder the subgraph that starts from the alternate
4974 // opcode vectorization node. Such nodes already end-up with the shuffle
4975 // instruction and it is just enough to change this shuffle rather than
4976 // rotate the scalars for the whole graph.
4977 unsigned Cnt = 0;
4978 const TreeEntry *UserTE = TE.get();
4979 while (UserTE && Cnt < RecursionMaxDepth) {
4980 if (UserTE->UserTreeIndices.size() != 1)
4981 break;
4982 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
4983 return EI.UserTE->State == TreeEntry::Vectorize &&
4984 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4985 }))
4986 return;
4987 UserTE = UserTE->UserTreeIndices.back().UserTE;
4988 ++Cnt;
4989 }
4990 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4991 if (!(TE->State == TreeEntry::Vectorize ||
4992 TE->State == TreeEntry::StridedVectorize) ||
4993 !TE->ReuseShuffleIndices.empty())
4994 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4995 if (TE->State == TreeEntry::Vectorize &&
4996 TE->getOpcode() == Instruction::PHI)
4997 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
4998 }
4999 });
5000
5001 // Reorder the graph nodes according to their vectorization factor.
5002 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5003 VF /= 2) {
5004 auto It = VFToOrderedEntries.find(VF);
5005 if (It == VFToOrderedEntries.end())
5006 continue;
5007 // Try to find the most profitable order. We are just looking for the most
5008 // used order and reorder scalar elements in the nodes according to this
5009 // mostly used order.
5010 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5011 // All operands are reordered and used only in this node - propagate the
5012 // most used order to the user node.
5013 MapVector<OrdersType, unsigned,
5014 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5015 OrdersUses;
5016 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5017 for (const TreeEntry *OpTE : OrderedEntries) {
5018 // No need to reorder these nodes, still need to extend and to use shuffle,
5019 // just need to merge reordering shuffle and the reuse shuffle.
5020 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5021 continue;
5022 // Count number of orders uses.
5023 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5024 &PhisToOrders]() -> const OrdersType & {
5025 if (OpTE->State == TreeEntry::NeedToGather ||
5026 !OpTE->ReuseShuffleIndices.empty()) {
5027 auto It = GathersToOrders.find(OpTE);
5028 if (It != GathersToOrders.end())
5029 return It->second;
5030 }
5031 if (OpTE->isAltShuffle()) {
5032 auto It = AltShufflesToOrders.find(OpTE);
5033 if (It != AltShufflesToOrders.end())
5034 return It->second;
5035 }
5036 if (OpTE->State == TreeEntry::Vectorize &&
5037 OpTE->getOpcode() == Instruction::PHI) {
5038 auto It = PhisToOrders.find(OpTE);
5039 if (It != PhisToOrders.end())
5040 return It->second;
5041 }
5042 return OpTE->ReorderIndices;
5043 }();
5044 // First consider the order of the external scalar users.
5045 auto It = ExternalUserReorderMap.find(OpTE);
5046 if (It != ExternalUserReorderMap.end()) {
5047 const auto &ExternalUserReorderIndices = It->second;
5048 // If the OpTE vector factor != number of scalars - use natural order,
5049 // it is an attempt to reorder node with reused scalars but with
5050 // external uses.
5051 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5052 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5053 ExternalUserReorderIndices.size();
5054 } else {
5055 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5056 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5057 }
5058 // No other useful reorder data in this entry.
5059 if (Order.empty())
5060 continue;
5061 }
5062 // Stores actually store the mask, not the order, need to invert.
5063 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5064 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5065 SmallVector<int> Mask;
5066 inversePermutation(Order, Mask);
5067 unsigned E = Order.size();
5068 OrdersType CurrentOrder(E, E);
5069 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5070 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5071 });
5072 fixupOrderingIndices(CurrentOrder);
5073 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5074 } else {
5075 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5076 }
5077 }
5078 if (OrdersUses.empty())
5079 continue;
5080 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5081 const unsigned Sz = Order.size();
5082 for (unsigned Idx : seq<unsigned>(0, Sz))
5083 if (Idx != Order[Idx] && Order[Idx] != Sz)
5084 return false;
5085 return true;
5086 };
5087 // Choose the most used order.
5088 unsigned IdentityCnt = 0;
5089 unsigned FilledIdentityCnt = 0;
5090 OrdersType IdentityOrder(VF, VF);
5091 for (auto &Pair : OrdersUses) {
5092 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5093 if (!Pair.first.empty())
5094 FilledIdentityCnt += Pair.second;
5095 IdentityCnt += Pair.second;
5096 combineOrders(IdentityOrder, Pair.first);
5097 }
5098 }
5099 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5100 unsigned Cnt = IdentityCnt;
5101 for (auto &Pair : OrdersUses) {
5102 // Prefer the identity order. But if a filled (non-empty) identity order was
5103 // found with the same number of uses as the new candidate order, we can choose
5104 // this candidate order.
5105 if (Cnt < Pair.second ||
5106 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5107 Cnt == Pair.second && !BestOrder.empty() &&
5108 IsIdentityOrder(BestOrder))) {
5109 combineOrders(Pair.first, BestOrder);
5110 BestOrder = Pair.first;
5111 Cnt = Pair.second;
5112 } else {
5113 combineOrders(BestOrder, Pair.first);
5114 }
5115 }
5116 // Set order of the user node.
5117 if (IsIdentityOrder(BestOrder))
5118 continue;
5119 fixupOrderingIndices(BestOrder);
5120 SmallVector<int> Mask;
5121 inversePermutation(BestOrder, Mask);
5122 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5123 unsigned E = BestOrder.size();
5124 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5125 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5126 });
5127 // Do an actual reordering, if profitable.
5128 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5129 // Just do the reordering for the nodes with the given VF.
5130 if (TE->Scalars.size() != VF) {
5131 if (TE->ReuseShuffleIndices.size() == VF) {
5132 // Need to reorder the reuses masks of the operands with smaller VF to
5133 // be able to find the match between the graph nodes and scalar
5134 // operands of the given node during vectorization/cost estimation.
5135 assert(all_of(TE->UserTreeIndices,
5136 [VF, &TE](const EdgeInfo &EI) {
5137 return EI.UserTE->Scalars.size() == VF ||
5138 EI.UserTE->Scalars.size() ==
5139 TE->Scalars.size();
5140 }) &&
5141 "All users must be of VF size.");
5142 // Update ordering of the operands with the smaller VF than the given
5143 // one.
5144 reorderNodeWithReuses(*TE, Mask);
5145 }
5146 continue;
5147 }
5148 if ((TE->State == TreeEntry::Vectorize ||
5149 TE->State == TreeEntry::StridedVectorize) &&
5150 isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
5151 InsertElementInst>(TE->getMainOp()) &&
5152 !TE->isAltShuffle()) {
5153 // Build correct orders for extract{element,value}, loads and
5154 // stores.
5155 reorderOrder(TE->ReorderIndices, Mask);
5156 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5157 TE->reorderOperands(Mask);
5158 } else {
5159 // Reorder the node and its operands.
5160 TE->reorderOperands(Mask);
5161 assert(TE->ReorderIndices.empty() &&
5162 "Expected empty reorder sequence.");
5163 reorderScalars(TE->Scalars, Mask);
5164 }
5165 if (!TE->ReuseShuffleIndices.empty()) {
5166 // Apply reversed order to keep the original ordering of the reused
5167 // elements to avoid extra reorder indices shuffling.
5168 OrdersType CurrentOrder;
5169 reorderOrder(CurrentOrder, MaskOrder);
5170 SmallVector<int> NewReuses;
5171 inversePermutation(CurrentOrder, NewReuses);
5172 addMask(NewReuses, TE->ReuseShuffleIndices);
5173 TE->ReuseShuffleIndices.swap(NewReuses);
5174 }
5175 }
5176 }
5177}
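// Editorial note - how the voting above plays out (illustrative): if three
// nodes of a given VF report the order {1, 0, 3, 2} and one reports the
// identity order, OrdersUses counts {1, 0, 3, 2} three times and identity
// once, so {1, 0, 3, 2} becomes BestOrder and every node of that VF is either
// reordered or has its reorder/reuse masks adjusted to match it.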
5178
5179bool BoUpSLP::canReorderOperands(
5180 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5181 ArrayRef<TreeEntry *> ReorderableGathers,
5182 SmallVectorImpl<TreeEntry *> &GatherOps) {
5183 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5184 if (UserTE->isNonPowOf2Vec())
5185 return false;
5186
5187 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5188 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5189 return OpData.first == I &&
5190 (OpData.second->State == TreeEntry::Vectorize ||
5191 OpData.second->State == TreeEntry::StridedVectorize);
5192 }))
5193 continue;
5194 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5195 // Do not reorder if operand node is used by many user nodes.
5196 if (any_of(TE->UserTreeIndices,
5197 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5198 return false;
5199 // Add the node to the list of the ordered nodes with the identity
5200 // order.
5201 Edges.emplace_back(I, TE);
5202 // Add ScatterVectorize nodes to the list of operands, where just
5203 // reordering of the scalars is required. Similar to the gathers, so
5204 // simply add to the list of gathered ops.
5205 // If there are reused scalars, process this node as a regular vectorize
5206 // node, just reorder reuses mask.
5207 if (TE->State != TreeEntry::Vectorize &&
5208 TE->State != TreeEntry::StridedVectorize &&
5209 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5210 GatherOps.push_back(TE);
5211 continue;
5212 }
5213 TreeEntry *Gather = nullptr;
5214 if (count_if(ReorderableGathers,
5215 [&Gather, UserTE, I](TreeEntry *TE) {
5216 assert(TE->State != TreeEntry::Vectorize &&
5217 TE->State != TreeEntry::StridedVectorize &&
5218 "Only non-vectorized nodes are expected.");
5219 if (any_of(TE->UserTreeIndices,
5220 [UserTE, I](const EdgeInfo &EI) {
5221 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5222 })) {
5223 assert(TE->isSame(UserTE->getOperand(I)) &&
5224 "Operand entry does not match operands.");
5225 Gather = TE;
5226 return true;
5227 }
5228 return false;
5229 }) > 1 &&
5230 !allConstant(UserTE->getOperand(I)))
5231 return false;
5232 if (Gather)
5233 GatherOps.push_back(Gather);
5234 }
5235 return true;
5236}
5237
5238void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5239 SetVector<TreeEntry *> OrderedEntries;
5240 DenseSet<const TreeEntry *> GathersToOrders;
5241 // Find all reorderable leaf nodes with the given VF.
5242 // Currently these are vectorized loads, extracts without alternate operands +
5243 // some gathering of extracts.
5244 SmallVector<TreeEntry *> NonVectorized;
5245 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5246 if (TE->State != TreeEntry::Vectorize &&
5247 TE->State != TreeEntry::StridedVectorize)
5248 NonVectorized.push_back(TE.get());
5249 if (std::optional<OrdersType> CurrentOrder =
5250 getReorderingData(*TE, /*TopToBottom=*/false)) {
5251 OrderedEntries.insert(TE.get());
5252 if (!(TE->State == TreeEntry::Vectorize ||
5253 TE->State == TreeEntry::StridedVectorize) ||
5254 !TE->ReuseShuffleIndices.empty())
5255 GathersToOrders.insert(TE.get());
5256 }
5257 }
5258
5259 // 1. Propagate order to the graph nodes, which use only reordered nodes.
5260 // I.e., if the node has operands that are reordered, try to make at least
5261 // one operand order in the natural order and reorder others + reorder the
5262 // user node itself.
5263 SmallPtrSet<const TreeEntry *, 4> Visited;
5264 while (!OrderedEntries.empty()) {
5265 // 1. Filter out only reordered nodes.
5266 // 2. If the entry has multiple uses - skip it and jump to the next node.
5267 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5268 SmallVector<TreeEntry *> Filtered;
5269 for (TreeEntry *TE : OrderedEntries) {
5270 if (!(TE->State == TreeEntry::Vectorize ||
5271 TE->State == TreeEntry::StridedVectorize ||
5272 (TE->State == TreeEntry::NeedToGather &&
5273 GathersToOrders.contains(TE))) ||
5274 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5275 !all_of(drop_begin(TE->UserTreeIndices),
5276 [TE](const EdgeInfo &EI) {
5277 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5278 }) ||
5279 !Visited.insert(TE).second) {
5280 Filtered.push_back(TE);
5281 continue;
5282 }
5283 // Build a map between user nodes and their operands order to speedup
5284 // search. The graph currently does not provide this dependency directly.
5285 for (EdgeInfo &EI : TE->UserTreeIndices) {
5286 TreeEntry *UserTE = EI.UserTE;
5287 auto It = Users.find(UserTE);
5288 if (It == Users.end())
5289 It = Users.insert({UserTE, {}}).first;
5290 It->second.emplace_back(EI.EdgeIdx, TE);
5291 }
5292 }
5293 // Erase filtered entries.
5294 for (TreeEntry *TE : Filtered)
5295 OrderedEntries.remove(TE);
5296 SmallVector<
5297 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5298 UsersVec(Users.begin(), Users.end());
5299 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5300 return Data1.first->Idx > Data2.first->Idx;
5301 });
5302 for (auto &Data : UsersVec) {
5303 // Check that operands are used only in the User node.
5304 SmallVector<TreeEntry *> GatherOps;
5305 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5306 GatherOps)) {
5307 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5308 OrderedEntries.remove(Op.second);
5309 continue;
5310 }
5311 // All operands are reordered and used only in this node - propagate the
5312 // most used order to the user node.
5313 MapVector<OrdersType, unsigned,
5314 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5315 OrdersUses;
5316 // Do the analysis for each tree entry only once, otherwise the order of
5317 // the same node may be considered several times, though it might not be
5318 // profitable.
5319 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5320 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
5321 for (const auto &Op : Data.second) {
5322 TreeEntry *OpTE = Op.second;
5323 if (!VisitedOps.insert(OpTE).second)
5324 continue;
5325 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5326 continue;
5327 const auto Order = [&]() -> const OrdersType {
5328 if (OpTE->State == TreeEntry::NeedToGather ||
5329 !OpTE->ReuseShuffleIndices.empty())
5330 return getReorderingData(*OpTE, /*TopToBottom=*/false)
5331 .value_or(OrdersType(1));
5332 return OpTE->ReorderIndices;
5333 }();
5334 // The order is partially ordered, skip it in favor of fully non-ordered
5335 // orders.
5336 if (Order.size() == 1)
5337 continue;
5338 unsigned NumOps = count_if(
5339 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5340 return P.second == OpTE;
5341 });
5342 // Stores actually store the mask, not the order, need to invert.
5343 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5344 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5345 SmallVector<int> Mask;
5346 inversePermutation(Order, Mask);
5347 unsigned E = Order.size();
5348 OrdersType CurrentOrder(E, E);
5349 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5350 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5351 });
5352 fixupOrderingIndices(CurrentOrder);
5353 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5354 NumOps;
5355 } else {
5356 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5357 }
5358 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5359 const auto AllowsReordering = [&](const TreeEntry *TE) {
5360 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5361 if (TE->isNonPowOf2Vec())
5362 return false;
5363 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5364 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5365 (IgnoreReorder && TE->Idx == 0))
5366 return true;
5367 if (TE->State == TreeEntry::NeedToGather) {
5368 if (GathersToOrders.contains(TE))
5369 return !getReorderingData(*TE, /*TopToBottom=*/false)
5370 .value_or(OrdersType(1))
5371 .empty();
5372 return true;
5373 }
5374 return false;
5375 };
5376 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5377 TreeEntry *UserTE = EI.UserTE;
5378 if (!VisitedUsers.insert(UserTE).second)
5379 continue;
5380 // May reorder user node if it requires reordering, has reused
5381 // scalars, is an alternate op vectorize node or its op nodes require
5382 // reordering.
5383 if (AllowsReordering(UserTE))
5384 continue;
5385 // Check if users allow reordering.
5386 // Currently look up just 1 level of operands to avoid increase of
5387 // the compile time.
5388 // Profitable to reorder if definitely more operands allow
5389 // reordering rather than those with natural order.
5390 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
5391 if (static_cast<unsigned>(count_if(
5392 Ops, [UserTE, &AllowsReordering](
5393 const std::pair<unsigned, TreeEntry *> &Op) {
5394 return AllowsReordering(Op.second) &&
5395 all_of(Op.second->UserTreeIndices,
5396 [UserTE](const EdgeInfo &EI) {
5397 return EI.UserTE == UserTE;
5398 });
5399 })) <= Ops.size() / 2)
5400 ++Res.first->second;
5401 }
5402 }
5403 if (OrdersUses.empty()) {
5404 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5405 OrderedEntries.remove(Op.second);
5406 continue;
5407 }
5408 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5409 const unsigned Sz = Order.size();
5410 for (unsigned Idx : seq<unsigned>(0, Sz))
5411 if (Idx != Order[Idx] && Order[Idx] != Sz)
5412 return false;
5413 return true;
5414 };
5415 // Choose the most used order.
5416 unsigned IdentityCnt = 0;
5417 unsigned VF = Data.second.front().second->getVectorFactor();
5418 OrdersType IdentityOrder(VF, VF);
5419 for (auto &Pair : OrdersUses) {
5420 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5421 IdentityCnt += Pair.second;
5422 combineOrders(IdentityOrder, Pair.first);
5423 }
5424 }
5425 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5426 unsigned Cnt = IdentityCnt;
5427 for (auto &Pair : OrdersUses) {
5428 // Prefer the identity order. But if a filled (non-empty) identity order
5429 // was found with the same number of uses as the new candidate order, we can
5430 // choose this candidate order.
5431 if (Cnt < Pair.second) {
5432 combineOrders(Pair.first, BestOrder);
5433 BestOrder = Pair.first;
5434 Cnt = Pair.second;
5435 } else {
5436 combineOrders(BestOrder, Pair.first);
5437 }
5438 }
5439 // Set order of the user node.
5440 if (IsIdentityOrder(BestOrder)) {
5441 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5442 OrderedEntries.remove(Op.second);
5443 continue;
5444 }
5445 fixupOrderingIndices(BestOrder);
5446 // Erase operands from OrderedEntries list and adjust their orders.
5447 VisitedOps.clear();
5448 SmallVector<int> Mask;
5449 inversePermutation(BestOrder, Mask);
5450 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5451 unsigned E = BestOrder.size();
5452 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5453 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5454 });
5455 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5456 TreeEntry *TE = Op.second;
5457 OrderedEntries.remove(TE);
5458 if (!VisitedOps.insert(TE).second)
5459 continue;
5460 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5461 reorderNodeWithReuses(*TE, Mask);
5462 continue;
5463 }
5464 // Gathers are processed separately.
5465 if (TE->State != TreeEntry::Vectorize &&
5466 TE->State != TreeEntry::StridedVectorize &&
5467 (TE->State != TreeEntry::ScatterVectorize ||
5468 TE->ReorderIndices.empty()))
5469 continue;
5470 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5471 TE->ReorderIndices.empty()) &&
5472 "Non-matching sizes of user/operand entries.");
5473 reorderOrder(TE->ReorderIndices, Mask);
5474 if (IgnoreReorder && TE == VectorizableTree.front().get())
5475 IgnoreReorder = false;
5476 }
5477 // For gathers just need to reorder its scalars.
5478 for (TreeEntry *Gather : GatherOps) {
5479 assert(Gather->ReorderIndices.empty() &&
5480 "Unexpected reordering of gathers.");
5481 if (!Gather->ReuseShuffleIndices.empty()) {
5482 // Just reorder reuses indices.
5483 reorderReuses(Gather->ReuseShuffleIndices, Mask);
5484 continue;
5485 }
5486 reorderScalars(Gather->Scalars, Mask);
5487 OrderedEntries.remove(Gather);
5488 }
5489 // Reorder operands of the user node and set the ordering for the user
5490 // node itself.
5491 if (Data.first->State != TreeEntry::Vectorize ||
5492 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5493 Data.first->getMainOp()) ||
5494 Data.first->isAltShuffle())
5495 Data.first->reorderOperands(Mask);
5496 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5497 Data.first->isAltShuffle() ||
5498 Data.first->State == TreeEntry::StridedVectorize) {
5499 reorderScalars(Data.first->Scalars, Mask);
5500 reorderOrder(Data.first->ReorderIndices, MaskOrder,
5501 /*BottomOrder=*/true);
5502 if (Data.first->ReuseShuffleIndices.empty() &&
5503 !Data.first->ReorderIndices.empty() &&
5504 !Data.first->isAltShuffle()) {
5505 // Insert user node to the list to try to sink reordering deeper in
5506 // the graph.
5507 OrderedEntries.insert(Data.first);
5508 }
5509 } else {
5510 reorderOrder(Data.first->ReorderIndices, Mask);
5511 }
5512 }
5513 }
5514 // If the reordering is unnecessary, just remove the reorder.
5515 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5516 VectorizableTree.front()->ReuseShuffleIndices.empty())
5517 VectorizableTree.front()->ReorderIndices.clear();
5518}
5519
5520void BoUpSLP::buildExternalUses(
5521 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5522 DenseMap<Value *, unsigned> ScalarToExtUses;
5523 // Collect the values that we need to extract from the tree.
5524 for (auto &TEPtr : VectorizableTree) {
5525 TreeEntry *Entry = TEPtr.get();
5526
5527 // No need to handle users of gathered values.
5528 if (Entry->State == TreeEntry::NeedToGather)
5529 continue;
5530
5531 // For each lane:
5532 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5533 Value *Scalar = Entry->Scalars[Lane];
5534 if (!isa<Instruction>(Scalar))
5535 continue;
5536 // All uses must be replaced already? No need to do it again.
5537 auto It = ScalarToExtUses.find(Scalar);
5538 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5539 continue;
5540
5541 // Check if the scalar is externally used as an extra arg.
5542 const auto *ExtI = ExternallyUsedValues.find(Scalar);
5543 if (ExtI != ExternallyUsedValues.end()) {
5544 int FoundLane = Entry->findLaneForValue(Scalar);
5545 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5546 << FoundLane << " from " << *Scalar << ".\n");
5547 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
5548 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5549 continue;
5550 }
5551 for (User *U : Scalar->users()) {
5552 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5553
5554 Instruction *UserInst = dyn_cast<Instruction>(U);
5555 if (!UserInst || isDeleted(UserInst))
5556 continue;
5557
5558 // Ignore users in the user ignore list.
5559 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5560 continue;
5561
5562 // Skip in-tree scalars that become vectors
5563 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5564 // Some in-tree scalars will remain as scalar in vectorized
5565 // instructions. If that is the case, the one in FoundLane will
5566 // be used.
5567 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5568 !doesInTreeUserNeedToExtract(
5569 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5570 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5571 << ".\n");
5572 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
5573 continue;
5574 }
5575 U = nullptr;
5576 if (It != ScalarToExtUses.end()) {
5577 ExternalUses[It->second].User = nullptr;
5578 break;
5579 }
5580 }
5581
5582 int FoundLane = Entry->findLaneForValue(Scalar);
5583 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5584 << " from lane " << FoundLane << " from " << *Scalar
5585 << ".\n");
5586 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
5587 ExternalUses.emplace_back(Scalar, U, FoundLane);
5588 if (!U)
5589 break;
5590 }
5591 }
5592 }
5593}
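// Editorial note - example outcome (illustrative): if lane 2 of a vectorized
// entry holds a scalar %x that is also read by a non-vectorized instruction
// %u, the loop above records the ExternalUses entry (%x, %u, 2) so that
// codegen later extracts lane 2 of the vector for %u; a scalar listed in
// ExternallyUsedValues gets an entry with a null user instead.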
5594
5595DenseMap<Value *, SmallVector<StoreInst *>>
5596BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5597 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5598 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5599 Value *V = TE->Scalars[Lane];
5600 // To save compilation time we don't visit if we have too many users.
5601 if (V->hasNUsesOrMore(UsesLimit))
5602 break;
5603
5604 // Collect stores per pointer object.
5605 for (User *U : V->users()) {
5606 auto *SI = dyn_cast<StoreInst>(U);
5607 if (SI == nullptr || !SI->isSimple() ||
5608 !isValidElementType(SI->getValueOperand()->getType()))
5609 continue;
5610 // Skip entry if the store is already part of a vectorization tree entry.
5611 if (getTreeEntry(U))
5612 continue;
5613
5614 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5615 auto &StoresVec = PtrToStoresMap[Ptr];
5616 // For now just keep one store per pointer object per lane.
5617 // TODO: Extend this to support multiple stores per pointer per lane
5618 if (StoresVec.size() > Lane)
5619 continue;
5620 // Skip if in different BBs.
5621 if (!StoresVec.empty() &&
5622 SI->getParent() != StoresVec.back()->getParent())
5623 continue;
5624 // Make sure that the stores are of the same type.
5625 if (!StoresVec.empty() &&
5626 SI->getValueOperand()->getType() !=
5627 StoresVec.back()->getValueOperand()->getType())
5628 continue;
5629 StoresVec.push_back(SI);
5630 }
5631 }
5632 return PtrToStoresMap;
5633}
5634
5635bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5636 OrdersType &ReorderIndices) const {
5637 // We check whether the stores in StoreVec can form a vector by sorting them
5638 // and checking whether they are consecutive.
5639
5640 // To avoid calling getPointersDiff() while sorting we create a vector of
5641 // pairs {store, offset from first} and sort this instead.
5642 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5643 StoreInst *S0 = StoresVec[0];
5644 StoreOffsetVec[0] = {S0, 0};
5645 Type *S0Ty = S0->getValueOperand()->getType();
5646 Value *S0Ptr = S0->getPointerOperand();
5647 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5648 StoreInst *SI = StoresVec[Idx];
5649 std::optional<int> Diff =
5650 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5651 SI->getPointerOperand(), *DL, *SE,
5652 /*StrictCheck=*/true);
5653 // We failed to compare the pointers so just abandon this StoresVec.
5654 if (!Diff)
5655 return false;
5656 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5657 }
5658
5659 // Sort the vector based on the pointers. We create a copy because we may
5660 // need the original later for calculating the reorder (shuffle) indices.
5661 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5662 const std::pair<StoreInst *, int> &Pair2) {
5663 int Offset1 = Pair1.second;
5664 int Offset2 = Pair2.second;
5665 return Offset1 < Offset2;
5666 });
5667
5668 // Check if the stores are consecutive by checking if their difference is 1.
5669 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5670 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5671 return false;
5672
5673 // Calculate the shuffle indices according to their offset against the sorted
5674 // StoreOffsetVec.
5675 ReorderIndices.reserve(StoresVec.size());
5676 for (StoreInst *SI : StoresVec) {
5677 unsigned Idx = find_if(StoreOffsetVec,
5678 [SI](const std::pair<StoreInst *, int> &Pair) {
5679 return Pair.first == SI;
5680 }) -
5681 StoreOffsetVec.begin();
5682 ReorderIndices.push_back(Idx);
5683 }
5684 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
5685 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
5686 // same convention here.
5687 auto IsIdentityOrder = [](const OrdersType &Order) {
5688 for (unsigned Idx : seq<unsigned>(0, Order.size()))
5689 if (Idx != Order[Idx])
5690 return false;
5691 return true;
5692 };
5693 if (IsIdentityOrder(ReorderIndices))
5694 ReorderIndices.clear();
5695
5696 return true;
5697}
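// Editorial note - worked example (illustrative): for stores whose offsets
// from the first store are {0, 2, 1, 3}, sorting by offset yields the
// consecutive sequence {0, 1, 2, 3}, and the reorder indices recorded for the
// original stores are {0, 2, 1, 3}; an already-consecutive sequence produces
// the identity order, which is represented by clearing ReorderIndices.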
5698
5699#ifndef NDEBUG
5700LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
5701 for (unsigned Idx : Order)
5702 dbgs() << Idx << ", ";
5703 dbgs() << "\n";
5704}
5705#endif
5706
5707SmallVector<BoUpSLP::OrdersType, 1>
5708BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
5709 unsigned NumLanes = TE->Scalars.size();
5710
5711 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
5712 collectUserStores(TE);
5713
5714 // Holds the reorder indices for each candidate store vector that is a user of
5715 // the current TreeEntry.
5716 SmallVector<OrdersType, 1> ExternalReorderIndices;
5717
5718 // Now inspect the stores collected per pointer and look for vectorization
5719 // candidates. For each candidate calculate the reorder index vector and push
5720 // it into `ExternalReorderIndices`
5721 for (const auto &Pair : PtrToStoresMap) {
5722 auto &StoresVec = Pair.second;
5723 // If we have fewer than NumLanes stores, then we can't form a vector.
5724 if (StoresVec.size() != NumLanes)
5725 continue;
5726
5727 // If the stores are not consecutive then abandon this StoresVec.
5728 OrdersType ReorderIndices;
5729 if (!canFormVector(StoresVec, ReorderIndices))
5730 continue;
5731
5732 // We now know that the scalars in StoresVec can form a vector instruction,
5733 // so set the reorder indices.
5734 ExternalReorderIndices.push_back(ReorderIndices);
5735 }
5736 return ExternalReorderIndices;
5737}
5738
5739void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
5740 const SmallDenseSet<Value *> &UserIgnoreLst) {
5741 deleteTree();
5742 UserIgnoreList = &UserIgnoreLst;
5743 if (!allSameType(Roots))
5744 return;
5745 buildTree_rec(Roots, 0, EdgeInfo());
5746}
5747
5748void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
5749 deleteTree();
5750 if (!allSameType(Roots))
5751 return;
5752 buildTree_rec(Roots, 0, EdgeInfo());
5753}
5754
5755/// \return true if the specified list of values has only one instruction that
5756/// requires scheduling, false otherwise.
5757#ifndef NDEBUG
5758static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
5759 Value *NeedsScheduling = nullptr;
5760 for (Value *V : VL) {
5761 if (doesNotNeedToBeScheduled(V))
5762 continue;
5763 if (!NeedsScheduling) {
5764 NeedsScheduling = V;
5765 continue;
5766 }
5767 return false;
5768 }
5769 return NeedsScheduling;
5770}
5771#endif
5772
5773/// Generates key/subkey pair for the given value to provide effective sorting
5774/// of the values and better detection of the vectorizable values sequences. The
5775/// keys/subkeys can be used for better sorting of the values themselves (keys)
5776/// and in values subgroups (subkeys).
5777static std::pair<size_t, size_t> generateKeySubkey(
5778 Value *V, const TargetLibraryInfo *TLI,
5779 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
5780 bool AllowAlternate) {
5781 hash_code Key = hash_value(V->getValueID() + 2);
5782 hash_code SubKey = hash_value(0);
5783 // Sort the loads by the distance between the pointers.
5784 if (auto *LI = dyn_cast<LoadInst>(V)) {
5785 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
5786 if (LI->isSimple())
5787 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
5788 else
5789 Key = SubKey = hash_value(LI);
5790 } else if (isVectorLikeInstWithConstOps(V)) {
5791 // Sort extracts by the vector operands.
5792 if (isa<ExtractElementInst, UndefValue>(V))
5793 Key = hash_value(Value::UndefValueVal + 1);
5794 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
5795 if (!isUndefVector(EI->getVectorOperand()).all() &&
5796 !isa<UndefValue>(EI->getIndexOperand()))
5797 SubKey = hash_value(EI->getVectorOperand());
5798 }
5799 } else if (auto *I = dyn_cast<Instruction>(V)) {
5800 // Sort other instructions just by the opcodes except for CMPInst.
5801 // For CMP also sort by the predicate kind.
5802 if ((isa<BinaryOperator, CastInst>(I)) &&
5803 isValidForAlternation(I->getOpcode())) {
5804 if (AllowAlternate)
5805 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
5806 else
5807 Key = hash_combine(hash_value(I->getOpcode()), Key);
5808 SubKey = hash_combine(
5809 hash_value(I->getOpcode()), hash_value(I->getType()),
5810 hash_value(isa<BinaryOperator>(I)
5811 ? I->getType()
5812 : cast<CastInst>(I)->getOperand(0)->getType()));
5813 // For casts, look through the only operand to improve compile time.
5814 if (isa<CastInst>(I)) {
5815 std::pair<size_t, size_t> OpVals =
5816 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
5817 /*AllowAlternate=*/true);
5818 Key = hash_combine(OpVals.first, Key);
5819 SubKey = hash_combine(OpVals.first, SubKey);
5820 }
5821 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
5822 CmpInst::Predicate Pred = CI->getPredicate();
5823 if (CI->isCommutative())
5824 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
5825 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
5826 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
5827 hash_value(SwapPred),
5828 hash_value(CI->getOperand(0)->getType()));
5829 } else if (auto *Call = dyn_cast<CallInst>(I)) {
5830 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
5831 if (isTriviallyVectorizable(ID)) {
5832 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
5833 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
5834 SubKey = hash_combine(hash_value(I->getOpcode()),
5835 hash_value(Call->getCalledFunction()));
5836 } else {
5837 Key = hash_combine(hash_value(Call), Key);
5838 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
5839 }
5840 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
5841 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
5842 hash_value(Op.Tag), SubKey);
5843 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
5844 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5845 SubKey = hash_value(Gep->getPointerOperand());
5846 else
5847 SubKey = hash_value(Gep);
5848 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
5849 !isa<ConstantInt>(I->getOperand(1))) {
5850 // Do not try to vectorize instructions with potentially high cost.
5851 SubKey = hash_value(I);
5852 } else {
5853 SubKey = hash_value(I->getOpcode());
5854 }
5855 Key = hash_combine(hash_value(I->getParent()), Key);
5856 }
5857 return std::make_pair(Key, SubKey);
5858}
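// Editorial note - usage sketch (illustrative): two simple loads hash to the
// same Key (load opcode + type) and get their SubKey from the caller-provided
// LoadsSubkeyGenerator, so they end up sorted next to each other, while an
// integer division by a non-constant divisor gets a SubKey unique to that
// instruction, keeping such potentially expensive operations apart.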
5859
5860/// Checks if the specified instruction \p I is an alternate operation for
5861/// the given \p MainOp and \p AltOp instructions.
5862static bool isAlternateInstruction(const Instruction *I,
5863 const Instruction *MainOp,
5864 const Instruction *AltOp,
5865 const TargetLibraryInfo &TLI);
5866
5867bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
5868 ArrayRef<Value *> VL) const {
5869 unsigned Opcode0 = S.getOpcode();
5870 unsigned Opcode1 = S.getAltOpcode();
5871 // The opcode mask selects between the two opcodes.
5872 SmallBitVector OpcodeMask(VL.size(), false);
5873 for (unsigned Lane : seq<unsigned>(0, VL.size()))
5874 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
5875 OpcodeMask.set(Lane);
5876 // If this pattern is supported by the target then consider it profitable.
5877 if (TTI->isLegalAltInstr(FixedVectorType::get(S.MainOp->getType(), VL.size()),
5878 Opcode0, Opcode1, OpcodeMask))
5879 return true;
5880 SmallVector<ValueList> Operands;
5881 for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
5882 Operands.emplace_back();
5883 // Prepare the operand vector.
5884 for (Value *V : VL)
5885 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
5886 }
5887 if (Operands.size() == 2) {
5888 // Try to find the best operand candidates.
5889 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
5890 SmallVector<std::pair<Value *, Value *>> Candidates(3);
5891 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
5892 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
5893 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
5894 std::optional<int> Res = findBestRootPair(Candidates);
5895 switch (Res.value_or(0)) {
5896 case 0:
5897 break;
5898 case 1:
5899 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
5900 break;
5901 case 2:
5902 std::swap(Operands[0][I], Operands[1][I]);
5903 break;
5904 default:
5905 llvm_unreachable("Unexpected index.");
5906 }
5907 }
5908 }
5909 DenseSet<unsigned> UniqueOpcodes;
5910 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
5911 unsigned NonInstCnt = 0;
5912 // Estimate number of instructions, required for the vectorized node and for
5913 // the buildvector node.
5914 unsigned UndefCnt = 0;
5915 // Count the number of extra shuffles, required for vector nodes.
5916 unsigned ExtraShuffleInsts = 0;
5917 // Check that operands do not contain same values and create either perfect
5918 // diamond match or shuffled match.
5919 if (Operands.size() == 2) {
5920 // Do not count same operands twice.
5921 if (Operands.front() == Operands.back()) {
5922 Operands.erase(Operands.begin());
5923 } else if (!allConstant(Operands.front()) &&
5924 all_of(Operands.front(), [&](Value *V) {
5925 return is_contained(Operands.back(), V);
5926 })) {
5927 Operands.erase(Operands.begin());
5928 ++ExtraShuffleInsts;
5929 }
5930 }
5931 const Loop *L = LI->getLoopFor(S.MainOp->getParent());
5932 // Vectorize node, if:
5933 // 1. At least a single operand is constant or splat.
5934 // 2. Operands have many loop invariants (the instructions are not loop
5935 // invariants).
5936 // 3. At least a single unique operand is supposed to be vectorized.
5937 return none_of(Operands,
5938 [&](ArrayRef<Value *> Op) {
5939 if (allConstant(Op) ||
5940 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
5941 getSameOpcode(Op, *TLI).MainOp))
5942 return false;
5943 DenseMap<Value *, unsigned> Uniques;
5944 for (Value *V : Op) {
5945 if (isa<Constant, ExtractElementInst>(V) ||
5946 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
5947 if (isa<UndefValue>(V))
5948 ++UndefCnt;
5949 continue;
5950 }
5951 auto Res = Uniques.try_emplace(V, 0);
5952 // Found first duplicate - need to add shuffle.
5953 if (!Res.second && Res.first->second == 1)
5954 ++ExtraShuffleInsts;
5955 ++Res.first->getSecond();
5956 if (auto *I = dyn_cast<Instruction>(V))
5957 UniqueOpcodes.insert(I->getOpcode());
5958 else if (Res.second)
5959 ++NonInstCnt;
5960 }
5961 return none_of(Uniques, [&](const auto &P) {
5962 return P.first->hasNUsesOrMore(P.second + 1) &&
5963 none_of(P.first->users(), [&](User *U) {
5964 return getTreeEntry(U) || Uniques.contains(U);
5965 });
5966 });
5967 }) ||
5968 // Do not vectorize node, if estimated number of vector instructions is
5969 // more than estimated number of buildvector instructions. Number of
5970 // vector operands is number of vector instructions + number of vector
5971 // instructions for operands (buildvectors). Number of buildvector
5972 // instructions is just number_of_operands * number_of_scalars.
5973 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
5974 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
5975 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
5976}
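// Editorial note (illustrative, approximate): the second half of the return
// above accepts the node when the estimated vector form, roughly
// UniqueOpcodes + NonInstCnt + ExtraShuffleInsts + 3 (main + alt + shuffle),
// stays below number_of_operands * number_of_scalars, i.e. the cost of
// emitting plain buildvectors for the operands, and the operand values are
// not almost entirely undef.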
5977
5978BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
5979 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
5980 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
5981 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
5982
5983 unsigned ShuffleOrOp =
5984 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
5985 auto *VL0 = cast<Instruction>(S.OpValue);
5986 switch (ShuffleOrOp) {
5987 case Instruction::PHI: {
5988 // Check for terminator values (e.g. invoke).
5989 for (Value *V : VL)
5990 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
5991 Instruction *Term = dyn_cast<Instruction>(Incoming);
5992 if (Term && Term->isTerminator()) {
5994 << "SLP: Need to swizzle PHINodes (terminator use).\n");
5995 return TreeEntry::NeedToGather;
5996 }
5997 }
5998
5999 return TreeEntry::Vectorize;
6000 }
6001 case Instruction::ExtractValue:
6002 case Instruction::ExtractElement: {
6003 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6004 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6005 if (!isPowerOf2_32(VL.size()))
6006 return TreeEntry::NeedToGather;
6007 if (Reuse || !CurrentOrder.empty())
6008 return TreeEntry::Vectorize;
6009 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6010 return TreeEntry::NeedToGather;
6011 }
6012 case Instruction::InsertElement: {
6013 // Check that we have a buildvector and not a shuffle of 2 or more
6014 // different vectors.
6015 ValueSet SourceVectors;
6016 for (Value *V : VL) {
6017 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
6018 assert(getInsertIndex(V) != std::nullopt &&
6019 "Non-constant or undef index?");
6020 }
6021
6022 if (count_if(VL, [&SourceVectors](Value *V) {
6023 return !SourceVectors.contains(V);
6024 }) >= 2) {
6025 // Found 2nd source vector - cancel.
6026 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6027 "different source vectors.\n");
6028 return TreeEntry::NeedToGather;
6029 }
6030
6031 return TreeEntry::Vectorize;
6032 }
6033 case Instruction::Load: {
6034 // Check that a vectorized load would load the same memory as a scalar
6035 // load. For example, we don't want to vectorize loads that are smaller
6036 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6037 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6038 // from such a struct, we read/write packed bits disagreeing with the
6039 // unvectorized version.
6040 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
6041 case LoadsState::Vectorize:
6042 return TreeEntry::Vectorize;
6043 case LoadsState::ScatterVectorize:
6044 return TreeEntry::ScatterVectorize;
6045 case LoadsState::StridedVectorize:
6046 return TreeEntry::StridedVectorize;
6047 case LoadsState::Gather:
6048#ifndef NDEBUG
6049 Type *ScalarTy = VL0->getType();
6050 if (DL->getTypeSizeInBits(ScalarTy) !=
6051 DL->getTypeAllocSizeInBits(ScalarTy))
6052 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6053 else if (any_of(VL,
6054 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6055 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6056 else
6057 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6058#endif // NDEBUG
6059 return TreeEntry::NeedToGather;
6060 }
6061 llvm_unreachable("Unexpected state of loads");
6062 }
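// A hypothetical illustration of the non-packed-type case above: for the
// scalar type i2, DL->getTypeSizeInBits returns 2 while
// DL->getTypeAllocSizeInBits returns 8, so four scalar i2 loads read four
// separate bytes whereas a <4 x i2> load would read a single byte; such
// bundles end up in the Gather state.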
6063 case Instruction::ZExt:
6064 case Instruction::SExt:
6065 case Instruction::FPToUI:
6066 case Instruction::FPToSI:
6067 case Instruction::FPExt:
6068 case Instruction::PtrToInt:
6069 case Instruction::IntToPtr:
6070 case Instruction::SIToFP:
6071 case Instruction::UIToFP:
6072 case Instruction::Trunc:
6073 case Instruction::FPTrunc:
6074 case Instruction::BitCast: {
6075 Type *SrcTy = VL0->getOperand(0)->getType();
6076 for (Value *V : VL) {
6077 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6078 if (Ty != SrcTy || !isValidElementType(Ty)) {
6079 LLVM_DEBUG(
6080 dbgs() << "SLP: Gathering casts with different src types.\n");
6081 return TreeEntry::NeedToGather;
6082 }
6083 }
6084 return TreeEntry::Vectorize;
6085 }
6086 case Instruction::ICmp:
6087 case Instruction::FCmp: {
6088 // Check that all of the compares have the same predicate.
6089 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6090 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
6091 Type *ComparedTy = VL0->getOperand(0)->getType();
6092 for (Value *V : VL) {
6093 CmpInst *Cmp = cast<CmpInst>(V);
6094 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6095 Cmp->getOperand(0)->getType() != ComparedTy) {
6096 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6097 return TreeEntry::NeedToGather;
6098 }
6099 }
6100 return TreeEntry::Vectorize;
6101 }
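// A hypothetical illustration of the predicate check above: a bundle
// \code
//   %c0 = icmp slt i32 %a, %b
//   %c1 = icmp sgt i32 %c, %d
// \endcode
// is still accepted because sgt is the swapped form of the first predicate
// (slt); the actual operand commuting happens later when the operands are
// collected in buildTree_rec.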
6102 case Instruction::Select:
6103 case Instruction::FNeg:
6104 case Instruction::Add:
6105 case Instruction::FAdd:
6106 case Instruction::Sub:
6107 case Instruction::FSub:
6108 case Instruction::Mul:
6109 case Instruction::FMul:
6110 case Instruction::UDiv:
6111 case Instruction::SDiv:
6112 case Instruction::FDiv:
6113 case Instruction::URem:
6114 case Instruction::SRem:
6115 case Instruction::FRem:
6116 case Instruction::Shl:
6117 case Instruction::LShr:
6118 case Instruction::AShr:
6119 case Instruction::And:
6120 case Instruction::Or:
6121 case Instruction::Xor:
6122 return TreeEntry::Vectorize;
6123 case Instruction::GetElementPtr: {
6124 // We don't combine GEPs with complicated (nested) indexing.
6125 for (Value *V : VL) {
6126 auto *I = dyn_cast<GetElementPtrInst>(V);
6127 if (!I)
6128 continue;
6129 if (I->getNumOperands() != 2) {
6130 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6131 return TreeEntry::NeedToGather;
6132 }
6133 }
6134
6135 // We can't combine several GEPs into one vector if they operate on
6136 // different types.
6137 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6138 for (Value *V : VL) {
6139 auto *GEP = dyn_cast<GEPOperator>(V);
6140 if (!GEP)
6141 continue;
6142 Type *CurTy = GEP->getSourceElementType();
6143 if (Ty0 != CurTy) {
6144 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6145 return TreeEntry::NeedToGather;
6146 }
6147 }
6148
6149 // We don't combine GEPs with non-constant indexes.
6150 Type *Ty1 = VL0->getOperand(1)->getType();
6151 for (Value *V : VL) {
6152 auto *I = dyn_cast<GetElementPtrInst>(V);
6153 if (!I)
6154 continue;
6155 auto *Op = I->getOperand(1);
6156 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6157 (Op->getType() != Ty1 &&
6158 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6159 Op->getType()->getScalarSizeInBits() >
6160 DL->getIndexSizeInBits(
6161 V->getType()->getPointerAddressSpace())))) {
6162 LLVM_DEBUG(
6163 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6164 return TreeEntry::NeedToGather;
6165 }
6166 }
6167
6168 return TreeEntry::Vectorize;
6169 }
6170 case Instruction::Store: {
6171 // Check if the stores are consecutive or if we need to swizzle them.
6172 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6173 // Avoid types that are padded when being allocated as scalars, while
6174 // being packed together in a vector (such as i1).
6175 if (DL->getTypeSizeInBits(ScalarTy) !=
6176 DL->getTypeAllocSizeInBits(ScalarTy)) {
6177 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6178 return TreeEntry::NeedToGather;
6179 }
6180 // Make sure all stores in the bundle are simple - we can't vectorize
6181 // atomic or volatile stores.
6182 for (Value *V : VL) {
6183 auto *SI = cast<StoreInst>(V);
6184 if (!SI->isSimple()) {
6185 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6186 return TreeEntry::NeedToGather;
6187 }
6188 PointerOps.push_back(SI->getPointerOperand());
6189 }
6190
6191 // Check the order of pointer operands.
6192 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
6193 Value *Ptr0;
6194 Value *PtrN;
6195 if (CurrentOrder.empty()) {
6196 Ptr0 = PointerOps.front();
6197 PtrN = PointerOps.back();
6198 } else {
6199 Ptr0 = PointerOps[CurrentOrder.front()];
6200 PtrN = PointerOps[CurrentOrder.back()];
6201 }
6202 std::optional<int> Dist =
6203 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6204 // Check that the sorted pointer operands are consecutive.
6205 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6206 return TreeEntry::Vectorize;
6207 }
6208
6209 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6210 return TreeEntry::NeedToGather;
6211 }
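// A hypothetical illustration of the consecutive-store check above: for four
// i32 stores whose pointer operands are %p, %p+1, %p+2 and %p+3 (in i32
// elements), sortPtrAccesses leaves CurrentOrder empty and getPointersDiff(...)
// between the first and the last pointer returns 3 == VL.size() - 1, so the
// bundle is classified as Vectorize.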
6212 case Instruction::Call: {
6213 // Check if the calls are all to the same vectorizable intrinsic or
6214 // library function.
6215 CallInst *CI = cast<CallInst>(VL0);
6216 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6217
6218 VFShape Shape = VFShape::get(
6219 CI->getFunctionType(),
6220 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
6221 false /*HasGlobalPred*/);
6222 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6223
6224 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6225 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6226 return TreeEntry::NeedToGather;
6227 }
6228 Function *F = CI->getCalledFunction();
6229 unsigned NumArgs = CI->arg_size();
6230 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6231 for (unsigned J = 0; J != NumArgs; ++J)
6232 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
6233 ScalarArgs[J] = CI->getArgOperand(J);
6234 for (Value *V : VL) {
6235 CallInst *CI2 = dyn_cast<CallInst>(V);
6236 if (!CI2 || CI2->getCalledFunction() != F ||
6237 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
6238 (VecFunc &&
6239 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6240 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
6241 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6242 << "\n");
6243 return TreeEntry::NeedToGather;
6244 }
6245 // Some intrinsics have scalar arguments and should be same in order for
6246 // them to be vectorized.
6247 for (unsigned J = 0; J != NumArgs; ++J) {
6248 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
6249 Value *A1J = CI2->getArgOperand(J);
6250 if (ScalarArgs[J] != A1J) {
6251 LLVM_DEBUG(dbgs()
6252 << "SLP: mismatched arguments in call:" << *CI
6253 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6254 return TreeEntry::NeedToGather;
6255 }
6256 }
6257 }
6258 // Verify that the bundle operands are identical between the two calls.
6259 if (CI->hasOperandBundles() &&
6260 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6261 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6262 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6263 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6264 << "!=" << *V << '\n');
6265 return TreeEntry::NeedToGather;
6266 }
6267 }
6268
6269 return TreeEntry::Vectorize;
6270 }
6271 case Instruction::ShuffleVector: {
6272 // If this is not an alternate sequence of opcode like add-sub
6273 // then do not vectorize this instruction.
6274 if (!S.isAltShuffle()) {
6275 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6276 return TreeEntry::NeedToGather;
6277 }
6278 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6279 LLVM_DEBUG(
6280 dbgs()
6281 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6282 "the whole alt sequence is not profitable.\n");
6283 return TreeEntry::NeedToGather;
6284 }
6285
6286 return TreeEntry::Vectorize;
6287 }
6288 default:
6289 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6290 return TreeEntry::NeedToGather;
6291 }
6292}
6293
6294void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6295 const EdgeInfo &UserTreeIdx) {
6296 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6297
6298 SmallVector<int> ReuseShuffleIndicies;
6299 SmallVector<Value *> UniqueValues;
6300 SmallVector<Value *> NonUniqueValueVL;
6301 auto TryToFindDuplicates = [&](const InstructionsState &S,
6302 bool DoNotFail = false) {
6303 // Check that every instruction appears once in this bundle.
6304 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6305 for (Value *V : VL) {
6306 if (isConstant(V)) {
6307 ReuseShuffleIndicies.emplace_back(
6308 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6309 UniqueValues.emplace_back(V);
6310 continue;
6311 }
6312 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6313 ReuseShuffleIndicies.emplace_back(Res.first->second);
6314 if (Res.second)
6315 UniqueValues.emplace_back(V);
6316 }
6317 size_t NumUniqueScalarValues = UniqueValues.size();
6318 if (NumUniqueScalarValues == VL.size()) {
6319 ReuseShuffleIndicies.clear();
6320 } else {
6321 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6322 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6323 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6324 "for nodes with padding.\n");
6325 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6326 return false;
6327 }
6328 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6329 if (NumUniqueScalarValues <= 1 ||
6330 (UniquePositions.size() == 1 && all_of(UniqueValues,
6331 [](Value *V) {
6332 return isa<UndefValue>(V) ||
6333 !isConstant(V);
6334 })) ||
6335 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6336 if (DoNotFail && UniquePositions.size() > 1 &&
6337 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6338 all_of(UniqueValues, [=](Value *V) {
6339 return isa<ExtractElementInst>(V) ||
6340 areAllUsersVectorized(cast<Instruction>(V),
6341 UserIgnoreList);
6342 })) {
6343 unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6344 if (PWSz == VL.size()) {
6345 ReuseShuffleIndicies.clear();
6346 } else {
6347 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6348 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6349 UniqueValues.back());
6350 VL = NonUniqueValueVL;
6351 }
6352 return true;
6353 }
6354 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6355 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6356 return false;
6357 }
6358 VL = UniqueValues;
6359 }
6360 return true;
6361 };
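// A hypothetical illustration of the lambda above: for VL = { %a, %b, %a, %b }
// it produces UniqueValues = { %a, %b } and ReuseShuffleIndicies =
// { 0, 1, 0, 1 }; since the number of unique scalars (2) is a power of two,
// VL is narrowed to the unique scalars and the duplicates are later
// reconstructed with a reuse shuffle.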
6362
6363 InstructionsState S = getSameOpcode(VL, *TLI);
6364
6365 // Don't vectorize ephemeral values.
6366 if (!EphValues.empty()) {
6367 for (Value *V : VL) {
6368 if (EphValues.count(V)) {
6369 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6370 << ") is ephemeral.\n");
6371 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6372 return;
6373 }
6374 }
6375 }
6376
6377 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6378 // a load), in which case peek through to include it in the tree, without
6379 // ballooning over-budget.
6380 if (Depth >= RecursionMaxDepth &&
6381 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6382 VL.size() >= 4 &&
6383 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6384 return match(I,
6385 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
6386 cast<Instruction>(I)->getOpcode() ==
6387 cast<Instruction>(S.MainOp)->getOpcode();
6388 })))) {
6389 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6390 if (TryToFindDuplicates(S))
6391 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6392 ReuseShuffleIndicies);
6393 return;
6394 }
6395
6396 // Don't handle scalable vectors
6397 if (S.getOpcode() == Instruction::ExtractElement &&
6398 isa<ScalableVectorType>(
6399 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6400 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6401 if (TryToFindDuplicates(S))
6402 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6403 ReuseShuffleIndicies);
6404 return;
6405 }
6406
6407 // Don't handle vectors.
6408 if (S.OpValue->getType()->isVectorTy() &&
6409 !isa<InsertElementInst>(S.OpValue)) {
6410 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6411 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6412 return;
6413 }
6414
6415 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6416 if (SI->getValueOperand()->getType()->isVectorTy()) {
6417 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6418 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6419 return;
6420 }
6421
6422 // If all of the operands are identical or constant we have a simple solution.
6423 // If we deal with insert/extract instructions, they all must have constant
6424 // indices, otherwise we should gather them, not try to vectorize.
6425 // If this is an alternate op node with 2 elements and gathered operands, do
6426 // not vectorize it.
6427 auto &&NotProfitableForVectorization = [&S, this,
6428 Depth](ArrayRef<Value *> VL) {
6429 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6430 return false;
6431 if (VectorizableTree.size() < MinTreeSize)
6432 return false;
6433 if (Depth >= RecursionMaxDepth - 1)
6434 return true;
6435 // Check if all operands are extracts, part of vector node or can build a
6436 // regular vectorize node.
6437 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6438 for (Value *V : VL) {
6439 auto *I = cast<Instruction>(V);
6440 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6441 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6442 }));
6443 }
6444 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6445 if ((IsCommutative &&
6446 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6447 (!IsCommutative &&
6448 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6449 return true;
6450 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6451 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6452 auto *I1 = cast<Instruction>(VL.front());
6453 auto *I2 = cast<Instruction>(VL.back());
6454 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6455 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6456 I2->getOperand(Op));
6457 if (static_cast<unsigned>(count_if(
6458 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6459 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6460 })) >= S.MainOp->getNumOperands() / 2)
6461 return false;
6462 if (S.MainOp->getNumOperands() > 2)
6463 return true;
6464 if (IsCommutative) {
6465 // Check permuted operands.
6466 Candidates.clear();
6467 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6468 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6469 I2->getOperand((Op + 1) % E));
6470 if (any_of(
6471 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6472 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6473 }))
6474 return false;
6475 }
6476 return true;
6477 };
6478 SmallVector<unsigned> SortedIndices;
6479 BasicBlock *BB = nullptr;
6480 bool IsScatterVectorizeUserTE =
6481 UserTreeIdx.UserTE &&
6482 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6483 bool AreAllSameInsts =
6484 (S.getOpcode() && allSameBlock(VL)) ||
6485 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6486 VL.size() > 2 &&
6487 all_of(VL,
6488 [&BB](Value *V) {
6489 auto *I = dyn_cast<GetElementPtrInst>(V);
6490 if (!I)
6491 return doesNotNeedToBeScheduled(V);
6492 if (!BB)
6493 BB = I->getParent();
6494 return BB == I->getParent() && I->getNumOperands() == 2;
6495 }) &&
6496 BB &&
6497 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6498 SortedIndices));
6499 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6500 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6501 S.OpValue) &&
6502 !all_of(VL, isVectorLikeInstWithConstOps)) ||
6503 NotProfitableForVectorization(VL)) {
6504 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6505 if (TryToFindDuplicates(S))
6506 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6507 ReuseShuffleIndicies);
6508 return;
6509 }
6510
6511 // We now know that this is a vector of instructions of the same type from
6512 // the same block.
6513
6514 // Check if this is a duplicate of another entry.
6515 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6516 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6517 if (!E->isSame(VL)) {
6518 auto It = MultiNodeScalars.find(S.OpValue);
6519 if (It != MultiNodeScalars.end()) {
6520 auto *TEIt = find_if(It->getSecond(),
6521 [&](TreeEntry *ME) { return ME->isSame(VL); });
6522 if (TEIt != It->getSecond().end())
6523 E = *TEIt;
6524 else
6525 E = nullptr;
6526 } else {
6527 E = nullptr;
6528 }
6529 }
6530 if (!E) {
6531 if (!doesNotNeedToBeScheduled(S.OpValue)) {
6532 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6533 if (TryToFindDuplicates(S))
6534 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6535 ReuseShuffleIndicies);
6536 return;
6537 }
6538 } else {
6539 // Record the reuse of the tree node. FIXME, currently this is only used
6540 // to properly draw the graph rather than for the actual vectorization.
6541 E->UserTreeIndices.push_back(UserTreeIdx);
6542 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6543 << ".\n");
6544 return;
6545 }
6546 }
6547
6548 // Check that none of the instructions in the bundle are already in the tree.
6549 for (Value *V : VL) {
6550 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6551 doesNotNeedToBeScheduled(V))
6552 continue;
6553 if (getTreeEntry(V)) {
6554 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6555 << ") is already in tree.\n");
6556 if (TryToFindDuplicates(S))
6557 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6558 ReuseShuffleIndicies);
6559 return;
6560 }
6561 }
6562
6563 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6564 if (UserIgnoreList && !UserIgnoreList->empty()) {
6565 for (Value *V : VL) {
6566 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6567 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6568 if (TryToFindDuplicates(S))
6569 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6570 ReuseShuffleIndicies);
6571 return;
6572 }
6573 }
6574 }
6575
6576 // Special processing for sorted pointers for ScatterVectorize node with
6577 // constant indices only.
6578 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6579 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6580 !(S.getOpcode() && allSameBlock(VL))) {
6581 assert(S.OpValue->getType()->isPointerTy() &&
6582 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6583 "Expected pointers only.");
6584 // Reset S to make it GetElementPtr kind of node.
6585 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
6586 assert(It != VL.end() && "Expected at least one GEP.");
6587 S = getSameOpcode(*It, *TLI);
6588 }
6589
6590 // Check that all of the users of the scalars that we want to vectorize are
6591 // schedulable.
6592 auto *VL0 = cast<Instruction>(S.OpValue);
6593 BB = VL0->getParent();
6594
6595 if (!DT->isReachableFromEntry(BB)) {
6596 // Don't go into unreachable blocks. They may contain instructions with
6597 // dependency cycles which confuse the final scheduling.
6598 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6599 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6600 return;
6601 }
6602
6603 // Don't go into catchswitch blocks, which can happen with PHIs.
6604 // Such blocks can only have PHIs and the catchswitch. There is no
6605 // place to insert a shuffle if we need to, so just avoid that issue.
6606 if (isa<CatchSwitchInst>(BB->getTerminator())) {
6607 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
6608 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6609 return;
6610 }
6611
6612 // Check that every instruction appears once in this bundle.
6613 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
6614 return;
6615
6616 // Perform specific checks for each particular instruction kind.
6617 OrdersType CurrentOrder;
6618 SmallVector<Value *> PointerOps;
6619 TreeEntry::EntryState State = getScalarsVectorizationState(
6620 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6621 if (State == TreeEntry::NeedToGather) {
6622 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6623 ReuseShuffleIndicies);
6624 return;
6625 }
6626
6627 auto &BSRef = BlocksSchedules[BB];
6628 if (!BSRef)
6629 BSRef = std::make_unique<BlockScheduling>(BB);
6630
6631 BlockScheduling &BS = *BSRef;
6632
6633 std::optional<ScheduleData *> Bundle =
6634 BS.tryScheduleBundle(UniqueValues, this, S);
6635#ifdef EXPENSIVE_CHECKS
6636 // Make sure we didn't break any internal invariants
6637 BS.verify();
6638#endif
6639 if (!Bundle) {
6640 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
6641 assert((!BS.getScheduleData(VL0) ||
6642 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6643 "tryScheduleBundle should cancelScheduling on failure");
6644 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6645 ReuseShuffleIndicies);
6646 return;
6647 }
6648 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
6649
6650 unsigned ShuffleOrOp = S.isAltShuffle() ?
6651 (unsigned) Instruction::ShuffleVector : S.getOpcode();
6652 switch (ShuffleOrOp) {
6653 case Instruction::PHI: {
6654 auto *PH = cast<PHINode>(VL0);
6655
6656 TreeEntry *TE =
6657 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
6658 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
6659
6660 // Keeps the reordered operands to avoid code duplication.
6661 SmallVector<ValueList, 2> OperandsVec;
6662 for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
6663 if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
6664 ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));
6665 TE->setOperand(I, Operands);
6666 OperandsVec.push_back(Operands);
6667 continue;
6668 }
6669 ValueList Operands;
6670 // Prepare the operand vector.
6671 for (Value *V : VL)
6672 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
6673 PH->getIncomingBlock(I)));
6674 TE->setOperand(I, Operands);
6675 OperandsVec.push_back(Operands);
6676 }
6677 for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
6678 buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
6679 return;
6680 }
6681 case Instruction::ExtractValue:
6682 case Instruction::ExtractElement: {
6683 if (CurrentOrder.empty()) {
6684 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
6685 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6686 ReuseShuffleIndicies);
6687 // This is a special case, as it does not gather, but at the same time
6688 // we are not extending buildTree_rec() towards the operands.
6689 ValueList Op0;
6690 Op0.assign(VL.size(), VL0->getOperand(0));
6691 VectorizableTree.back()->setOperand(0, Op0);
6692 return;
6693 }
6694 LLVM_DEBUG({
6695 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
6696 "with order";
6697 for (unsigned Idx : CurrentOrder)
6698 dbgs() << " " << Idx;
6699 dbgs() << "\n";
6700 });
6701 fixupOrderingIndices(CurrentOrder);
6702 // Insert new order with initial value 0, if it does not exist,
6703 // otherwise return the iterator to the existing one.
6704 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6705 ReuseShuffleIndicies, CurrentOrder);
6706 // This is a special case, as it does not gather, but at the same time
6707 // we are not extending buildTree_rec() towards the operands.
6708 ValueList Op0;
6709 Op0.assign(VL.size(), VL0->getOperand(0));
6710 VectorizableTree.back()->setOperand(0, Op0);
6711 return;
6712 }
6713 case Instruction::InsertElement: {
6714 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
6715
6716 auto OrdCompare = [](const std::pair<int, int> &P1,
6717 const std::pair<int, int> &P2) {
6718 return P1.first > P2.first;
6719 };
6720 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
6721 decltype(OrdCompare)>
6722 Indices(OrdCompare);
6723 for (int I = 0, E = VL.size(); I < E; ++I) {
6724 unsigned Idx = *getInsertIndex(VL[I]);
6725 Indices.emplace(Idx, I);
6726 }
6727 OrdersType CurrentOrder(VL.size(), VL.size());
6728 bool IsIdentity = true;
6729 for (int I = 0, E = VL.size(); I < E; ++I) {
6730 CurrentOrder[Indices.top().second] = I;
6731 IsIdentity &= Indices.top().second == I;
6732 Indices.pop();
6733 }
6734 if (IsIdentity)
6735 CurrentOrder.clear();
6736 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6737 std::nullopt, CurrentOrder);
6738 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
6739
6740 constexpr int NumOps = 2;
6741 ValueList VectorOperands[NumOps];
6742 for (int I = 0; I < NumOps; ++I) {
6743 for (Value *V : VL)
6744 VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
6745
6746 TE->setOperand(I, VectorOperands[I]);
6747 }
6748 buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
6749 return;
6750 }
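// A hypothetical illustration of the ordering above: if the inserts in VL
// write lanes 2, 0, 3, 1 (in VL order), the priority queue visits them in
// lane order and CurrentOrder ends up recording each element's lane,
// { 2, 0, 3, 1 }, which is kept as the reorder indices; lanes 0, 1, 2, 3 in
// order would form an identity and CurrentOrder would be cleared instead.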
6751 case Instruction::Load: {
6752 // Check that a vectorized load would load the same memory as a scalar
6753 // load. For example, we don't want to vectorize loads that are smaller
6754 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6755 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6756 // from such a struct, we read/write packed bits disagreeing with the
6757 // unvectorized version.
6758 TreeEntry *TE = nullptr;
6759 fixupOrderingIndices(CurrentOrder);
6760 switch (State) {
6761 case TreeEntry::Vectorize:
6762 if (CurrentOrder.empty()) {
6763 // Original loads are consecutive and do not require reordering.
6764 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6765 ReuseShuffleIndicies);
6766 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
6767 } else {
6768 // Need to reorder.
6769 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6770 ReuseShuffleIndicies, CurrentOrder);
6771 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
6772 }
6773 TE->setOperandsInOrder();
6774 break;
6775 case TreeEntry::StridedVectorize:
6776 // Vectorizing non-consecutive loads with a constant stride.
6777 if (CurrentOrder.empty()) {
6778 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6779 UserTreeIdx, ReuseShuffleIndicies);
6780 } else {
6781 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6782 UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
6783 }
6784 TE->setOperandsInOrder();
6785 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
6786 break;
6787 case TreeEntry::ScatterVectorize:
6788 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
6789 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6790 UserTreeIdx, ReuseShuffleIndicies);
6791 TE->setOperandsInOrder();
6792 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
6793 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
6794 break;
6795 case TreeEntry::NeedToGather:
6796 llvm_unreachable("Unexpected loads state.");
6797 }
6798 return;
6799 }
6800 case Instruction::ZExt:
6801 case Instruction::SExt:
6802 case Instruction::FPToUI:
6803 case Instruction::FPToSI:
6804 case Instruction::FPExt:
6805 case Instruction::PtrToInt:
6806 case Instruction::IntToPtr:
6807 case Instruction::SIToFP:
6808 case Instruction::UIToFP:
6809 case Instruction::Trunc:
6810 case Instruction::FPTrunc:
6811 case Instruction::BitCast: {
6812 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
6813 std::make_pair(std::numeric_limits<unsigned>::min(),
6814 std::numeric_limits<unsigned>::max()));
6815 if (ShuffleOrOp == Instruction::ZExt ||
6816 ShuffleOrOp == Instruction::SExt) {
6817 CastMaxMinBWSizes = std::make_pair(
6818 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
6819 PrevMaxBW),
6820 std::min<unsigned>(
6821 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
6822 PrevMinBW));
6823 } else if (ShuffleOrOp == Instruction::Trunc) {
6824 CastMaxMinBWSizes = std::make_pair(
6825 std::max<unsigned>(
6826 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
6827 PrevMaxBW),
6828 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
6829 PrevMinBW));
6830 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
6831 } else if (ShuffleOrOp == Instruction::SIToFP ||
6832 ShuffleOrOp == Instruction::UIToFP) {
6833 unsigned NumSignBits =
6834 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
6835 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
6836 APInt Mask = DB->getDemandedBits(OpI);
6837 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
6838 }
6839 if (NumSignBits * 2 >=
6840 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6841 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
6842 }
6843 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6844 ReuseShuffleIndicies);
6845 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
6846
6847 TE->setOperandsInOrder();
6848 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6849 ValueList Operands;
6850 // Prepare the operand vector.
6851 for (Value *V : VL)
6852 Operands.push_back(cast<Instruction>(V)->getOperand(I));
6853
6854 buildTree_rec(Operands, Depth + 1, {TE, I});
6855 }
6856 return;
6857 }
6858 case Instruction::ICmp:
6859 case Instruction::FCmp: {
6860 // Check that all of the compares have the same predicate.
6861 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6862 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6863 ReuseShuffleIndicies);
6864 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
6865
6866 ValueList Left, Right;
6867 if (cast<CmpInst>(VL0)->isCommutative()) {
6868 // Commutative predicate - collect + sort operands of the instructions
6869 // so that each side is more likely to have the same opcode.
6871 "Commutative Predicate mismatch");
6872 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
6873 } else {
6874 // Collect operands - commute if it uses the swapped predicate.
6875 for (Value *V : VL) {
6876 auto *Cmp = cast<CmpInst>(V);
6877 Value *LHS = Cmp->getOperand(0);
6878 Value *RHS = Cmp->getOperand(1);
6879 if (Cmp->getPredicate() != P0)
6880 std::swap(LHS, RHS);
6881 Left.push_back(LHS);
6882 Right.push_back(RHS);
6883 }
6884 }
6885 TE->setOperand(0, Left);
6886 TE->setOperand(1, Right);
6887 buildTree_rec(Left, Depth + 1, {TE, 0});
6888 buildTree_rec(Right, Depth + 1, {TE, 1});
6889 if (ShuffleOrOp == Instruction::ICmp) {
6890 unsigned NumSignBits0 =
6891 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
6892 if (NumSignBits0 * 2 >=
6893 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6894 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
6895 unsigned NumSignBits1 =
6896 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
6897 if (NumSignBits1 * 2 >=
6898 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
6899 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
6900 }
6901 return;
6902 }
6903 case Instruction::Select:
6904 case Instruction::FNeg:
6905 case Instruction::Add:
6906 case Instruction::FAdd:
6907 case Instruction::Sub:
6908 case Instruction::FSub:
6909 case Instruction::Mul:
6910 case Instruction::FMul:
6911 case Instruction::UDiv:
6912 case Instruction::SDiv:
6913 case Instruction::FDiv:
6914 case Instruction::URem:
6915 case Instruction::SRem:
6916 case Instruction::FRem:
6917 case Instruction::Shl:
6918 case Instruction::LShr:
6919 case Instruction::AShr:
6920 case Instruction::And:
6921 case Instruction::Or:
6922 case Instruction::Xor: {
6923 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6924 ReuseShuffleIndicies);
6925 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
6926
6927 // Sort operands of the instructions so that each side is more likely to
6928 // have the same opcode.
6929 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
6930 ValueList Left, Right;
6931 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
6932 TE->setOperand(0, Left);
6933 TE->setOperand(1, Right);
6934 buildTree_rec(Left, Depth + 1, {TE, 0});
6935 buildTree_rec(Right, Depth + 1, {TE, 1});
6936 return;
6937 }
6938
6939 TE->setOperandsInOrder();
6940 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6941 ValueList Operands;
6942 // Prepare the operand vector.
6943 for (Value *V : VL)
6944 Operands.push_back(cast<Instruction>(V)->getOperand(I));
6945
6946 buildTree_rec(Operands, Depth + 1, {TE, I});
6947 }
6948 return;
6949 }
6950 case Instruction::GetElementPtr: {
6951 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6952 ReuseShuffleIndicies);
6953 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
6954 SmallVector<ValueList, 2> Operands(2);
6955 // Prepare the operand vector for pointer operands.
6956 for (Value *V : VL) {
6957 auto *GEP = dyn_cast<GetElementPtrInst>(V);
6958 if (!GEP) {
6959 Operands.front().push_back(V);
6960 continue;
6961 }
6962 Operands.front().push_back(GEP->getPointerOperand());
6963 }
6964 TE->setOperand(0, Operands.front());
6965 // Need to cast all indices to the same type before vectorization to
6966 // avoid crash.
6967 // Required to be able to find correct matches between different gather
6968 // nodes and reuse the vectorized values rather than trying to gather them
6969 // again.
6970 int IndexIdx = 1;
6971 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
6972 Type *Ty = all_of(VL,
6973 [VL0Ty, IndexIdx](Value *V) {
6974 auto *GEP = dyn_cast<GetElementPtrInst>(V);
6975 if (!GEP)
6976 return true;
6977 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
6978 })
6979 ? VL0Ty
6980 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
6981 ->getPointerOperandType()
6982 ->getScalarType());
6983 // Prepare the operand vector.
6984 for (Value *V : VL) {
6985 auto *I = dyn_cast<GetElementPtrInst>(V);
6986 if (!I) {
6987 Operands.back().push_back(
6988 ConstantInt::get(Ty, 0, /*isSigned=*/false));
6989 continue;
6990 }
6991 auto *Op = I->getOperand(IndexIdx);
6992 auto *CI = dyn_cast<ConstantInt>(Op);
6993 if (!CI)
6994 Operands.back().push_back(Op);
6995 else
6996 Operands.back().push_back(ConstantFoldIntegerCast(
6997 CI, Ty, CI->getValue().isSignBitSet(), *DL));
6998 }
6999 TE->setOperand(IndexIdx, Operands.back());
7000
7001 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7002 buildTree_rec(Operands[I], Depth + 1, {TE, I});
7003 return;
7004 }
7005 case Instruction::Store: {
7006 // Check if the stores are consecutive or if we need to swizzle them.
7007 ValueList Operands(VL.size());
7008 auto *OIter = Operands.begin();
7009 for (Value *V : VL) {
7010 auto *SI = cast<StoreInst>(V);
7011 *OIter = SI->getValueOperand();
7012 ++OIter;
7013 }
7014 // Check that the sorted pointer operands are consecutive.
7015 if (CurrentOrder.empty()) {
7016 // Original stores are consecutive and do not require reordering.
7017 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7018 ReuseShuffleIndicies);
7019 TE->setOperandsInOrder();
7020 buildTree_rec(Operands, Depth + 1, {TE, 0});
7021 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7022 } else {
7023 fixupOrderingIndices(CurrentOrder);
7024 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7025 ReuseShuffleIndicies, CurrentOrder);
7026 TE->setOperandsInOrder();
7027 buildTree_rec(Operands, Depth + 1, {TE, 0});
7028 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7029 }
7030 return;
7031 }
7032 case Instruction::Call: {
7033 // Check if the calls are all to the same vectorizable intrinsic or
7034 // library function.
7035 CallInst *CI = cast<CallInst>(VL0);
7036 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7037
7038 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7039 ReuseShuffleIndicies);
7040 // Sort operands of the instructions so that each side is more likely to
7041 // have the same opcode.
7042 if (isCommutative(VL0)) {
7043 ValueList Left, Right;
7044 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7045 TE->setOperand(0, Left);
7046 TE->setOperand(1, Right);
7047 SmallVector<ValueList> Operands;
7048 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7049 Operands.emplace_back();
7050 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7051 continue;
7052 for (Value *V : VL) {
7053 auto *CI2 = cast<CallInst>(V);
7054 Operands.back().push_back(CI2->getArgOperand(I));
7055 }
7056 TE->setOperand(I, Operands.back());
7057 }
7058 buildTree_rec(Left, Depth + 1, {TE, 0});
7059 buildTree_rec(Right, Depth + 1, {TE, 1});
7060 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7061 if (Operands[I - 2].empty())
7062 continue;
7063 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7064 }
7065 return;
7066 }
7067 TE->setOperandsInOrder();
7068 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7069 // For scalar operands there is no need to create an entry since we do
7070 // not vectorize them.
7071 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7072 continue;
7073 ValueList Operands;
7074 // Prepare the operand vector.
7075 for (Value *V : VL) {
7076 auto *CI2 = cast<CallInst>(V);
7077 Operands.push_back(CI2->getArgOperand(I));
7078 }
7079 buildTree_rec(Operands, Depth + 1, {TE, I});
7080 }
7081 return;
7082 }
7083 case Instruction::ShuffleVector: {
7084 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7085 ReuseShuffleIndicies);
7086 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7087
7088 // Reorder operands if reordering would enable vectorization.
7089 auto *CI = dyn_cast<CmpInst>(VL0);
7090 if (isa<BinaryOperator>(VL0) || CI) {
7091 ValueList Left, Right;
7092 if (!CI || all_of(VL, [](Value *V) {
7093 return cast<CmpInst>(V)->isCommutative();
7094 })) {
7095 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7096 } else {
7097 auto *MainCI = cast<CmpInst>(S.MainOp);
7098 auto *AltCI = cast<CmpInst>(S.AltOp);
7099 CmpInst::Predicate MainP = MainCI->getPredicate();
7100 CmpInst::Predicate AltP = AltCI->getPredicate();
7101 assert(MainP != AltP &&
7102 "Expected different main/alternate predicates.");
7103 // Collect operands - commute if it uses the swapped predicate or
7104 // alternate operation.
7105 for (Value *V : VL) {
7106 auto *Cmp = cast<CmpInst>(V);
7107 Value *LHS = Cmp->getOperand(0);
7108 Value *RHS = Cmp->getOperand(1);
7109
7110 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7111 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7112 std::swap(LHS, RHS);
7113 } else {
7114 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7115 std::swap(LHS, RHS);
7116 }
7117 Left.push_back(LHS);
7118 Right.push_back(RHS);
7119 }
7120 }
7121 TE->setOperand(0, Left);
7122 TE->setOperand(1, Right);
7123 buildTree_rec(Left, Depth + 1, {TE, 0});
7124 buildTree_rec(Right, Depth + 1, {TE, 1});
7125 return;
7126 }
7127
7128 TE->setOperandsInOrder();
7129 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7130 ValueList Operands;
7131 // Prepare the operand vector.
7132 for (Value *V : VL)
7133 Operands.push_back(cast<Instruction>(V)->getOperand(I));
7134
7135 buildTree_rec(Operands, Depth + 1, {TE, I});
7136 }
7137 return;
7138 }
7139 default:
7140 break;
7141 }
7142 llvm_unreachable("Unexpected vectorization of the instructions.");
7143}
7144
7145 unsigned BoUpSLP::canMapToVector(Type *T) const {
7146 unsigned N = 1;
7147 Type *EltTy = T;
7148
7149 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7150 if (auto *ST = dyn_cast<StructType>(EltTy)) {
7151 // Check that struct is homogeneous.
7152 for (const auto *Ty : ST->elements())
7153 if (Ty != *ST->element_begin())
7154 return 0;
7155 N *= ST->getNumElements();
7156 EltTy = *ST->element_begin();
7157 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7158 N *= AT->getNumElements();
7159 EltTy = AT->getElementType();
7160 } else {
7161 auto *VT = cast<FixedVectorType>(EltTy);
7162 N *= VT->getNumElements();
7163 EltTy = VT->getElementType();
7164 }
7165 }
7166
7167 if (!isValidElementType(EltTy))
7168 return 0;
7169 uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
7170 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7171 VTSize != DL->getTypeStoreSizeInBits(T))
7172 return 0;
7173 return N;
7174}
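// A hypothetical illustration: for T = { [2 x i32], [2 x i32] } the loop
// above flattens the aggregate to N = 4 elements of type i32 and returns 4,
// provided <4 x i32> fits into [MinVecRegSize, MaxVecRegSize] and has the
// same store size as T; a struct mixing i32 and float elements returns 0.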
7175
7176bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7177 SmallVectorImpl<unsigned> &CurrentOrder,
7178 bool ResizeAllowed) const {
7179 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7180 assert(It != VL.end() && "Expected at least one extract instruction.");
7181 auto *E0 = cast<Instruction>(*It);
7182 assert(
7183 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7184 "Invalid opcode");
7185 // Check if all of the extracts come from the same vector and from the
7186 // correct offset.
7187 Value *Vec = E0->getOperand(0);
7188
7189 CurrentOrder.clear();
7190
7191 // We have to extract from a vector/aggregate with the same number of elements.
7192 unsigned NElts;
7193 if (E0->getOpcode() == Instruction::ExtractValue) {
7194 NElts = canMapToVector(Vec->getType());
7195 if (!NElts)
7196 return false;
7197 // Check if load can be rewritten as load of vector.
7198 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7199 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7200 return false;
7201 } else {
7202 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7203 }
7204
7205 unsigned E = VL.size();
7206 if (!ResizeAllowed && NElts != E)
7207 return false;
7208 SmallVector<int> Indices(E, PoisonMaskElem);
7209 unsigned MinIdx = NElts, MaxIdx = 0;
7210 for (auto [I, V] : enumerate(VL)) {
7211 auto *Inst = dyn_cast<Instruction>(V);
7212 if (!Inst)
7213 continue;
7214 if (Inst->getOperand(0) != Vec)
7215 return false;
7216 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7217 if (isa<UndefValue>(EE->getIndexOperand()))
7218 continue;
7219 std::optional<unsigned> Idx = getExtractIndex(Inst);
7220 if (!Idx)
7221 return false;
7222 const unsigned ExtIdx = *Idx;
7223 if (ExtIdx >= NElts)
7224 continue;
7225 Indices[I] = ExtIdx;
7226 if (MinIdx > ExtIdx)
7227 MinIdx = ExtIdx;
7228 if (MaxIdx < ExtIdx)
7229 MaxIdx = ExtIdx;
7230 }
7231 if (MaxIdx - MinIdx + 1 > E)
7232 return false;
7233 if (MaxIdx + 1 <= E)
7234 MinIdx = 0;
7235
7236 // Check that all of the indices extract from the correct offset.
7237 bool ShouldKeepOrder = true;
7238 // Assign to all items the initial value E so we can check if the extract
7239 // instruction index was used already.
7240 // Also, later we can check that all the indices are used and we have a
7241 // consecutive access in the extract instructions, by checking that no
7242 // element of CurrentOrder still has value E.
7243 CurrentOrder.assign(E, E);
7244 for (unsigned I = 0; I < E; ++I) {
7245 if (Indices[I] == PoisonMaskElem)
7246 continue;
7247 const unsigned ExtIdx = Indices[I] - MinIdx;
7248 if (CurrentOrder[ExtIdx] != E) {
7249 CurrentOrder.clear();
7250 return false;
7251 }
7252 ShouldKeepOrder &= ExtIdx == I;
7253 CurrentOrder[ExtIdx] = I;
7254 }
7255 if (ShouldKeepOrder)
7256 CurrentOrder.clear();
7257
7258 return ShouldKeepOrder;
7259}
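// A hypothetical illustration: for four extractelement instructions reading
// lanes 2, 3, 0, 1 of the same <4 x i32> source, the function returns false
// (the order is not the identity) but fills CurrentOrder = { 2, 3, 0, 1 },
// so the caller can still vectorize the bundle as a reordered extract
// sequence.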
7260
7261bool BoUpSLP::areAllUsersVectorized(
7262 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7263 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7264 all_of(I->users(), [this](User *U) {
7265 return ScalarToTreeEntry.contains(U) ||
7266 isVectorLikeInstWithConstOps(U) ||
7267 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7268 });
7269}
7270
7271static std::pair<InstructionCost, InstructionCost>
7272 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7273 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7274 ArrayRef<Type *> ArgTys) {
7275 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7276
7277 // Calculate the cost of the scalar and vector calls.
7278 FastMathFlags FMF;
7279 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7280 FMF = FPCI->getFastMathFlags();
7281 SmallVector<const Value *> Arguments(CI->args());
7282 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7283 dyn_cast<IntrinsicInst>(CI));
7284 auto IntrinsicCost =
7285 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
7286
7287 auto Shape = VFShape::get(CI->getFunctionType(),
7288 ElementCount::getFixed(VecTy->getNumElements()),
7289 false /*HasGlobalPred*/);
7290 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7291 auto LibCost = IntrinsicCost;
7292 if (!CI->isNoBuiltin() && VecFunc) {
7293 // Calculate the cost of the vector library call.
7294 // If the corresponding vector call is cheaper, return its cost.
7295 LibCost =
7296 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7297 }
7298 return {IntrinsicCost, LibCost};
7299}
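// A hypothetical illustration: for a bundle of four calls to llvm.fabs.f32,
// the first member of the returned pair is the intrinsic cost of a
// <4 x float> fabs and the second is the cost of a matching vector-library
// call, if VFDatabase knows one and the call is not marked nobuiltin;
// otherwise the two costs are equal.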
7300
7301void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7302 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7303 SmallVectorImpl<Value *> *OpScalars,
7304 SmallVectorImpl<Value *> *AltScalars) const {
7305 unsigned Sz = Scalars.size();
7306 Mask.assign(Sz, PoisonMaskElem);
7307 SmallVector<int> OrderMask;
7308 if (!ReorderIndices.empty())
7309 inversePermutation(ReorderIndices, OrderMask);
7310 for (unsigned I = 0; I < Sz; ++I) {
7311 unsigned Idx = I;
7312 if (!ReorderIndices.empty())
7313 Idx = OrderMask[I];
7314 auto *OpInst = cast<Instruction>(Scalars[Idx]);
7315 if (IsAltOp(OpInst)) {
7316 Mask[I] = Sz + Idx;
7317 if (AltScalars)
7318 AltScalars->push_back(OpInst);
7319 } else {
7320 Mask[I] = Idx;
7321 if (OpScalars)
7322 OpScalars->push_back(OpInst);
7323 }
7324 }
7325 if (!ReuseShuffleIndices.empty()) {
7326 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7327 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7328 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7329 });
7330 Mask.swap(NewMask);
7331 }
7332}
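// A hypothetical illustration: for a 4-wide <add, sub> entry with scalars
// { add, sub, add, sub }, no reordering and no reused scalars, IsAltOp
// selects the two subs and the resulting mask is { 0, 5, 2, 7 }: lanes 0 and
// 2 are taken from the vectorized add and lanes 1 and 3 (offset by Sz = 4)
// from the vectorized sub.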
7333
7334 static bool isAlternateInstruction(const Instruction *I,
7335 const Instruction *MainOp,
7336 const Instruction *AltOp,
7337 const TargetLibraryInfo &TLI) {
7338 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7339 auto *AltCI = cast<CmpInst>(AltOp);
7340 CmpInst::Predicate MainP = MainCI->getPredicate();
7341 CmpInst::Predicate AltP = AltCI->getPredicate();
7342 assert(MainP != AltP && "Expected different main/alternate predicates.");
7343 auto *CI = cast<CmpInst>(I);
7344 if (isCmpSameOrSwapped(MainCI, CI, TLI))
7345 return false;
7346 if (isCmpSameOrSwapped(AltCI, CI, TLI))
7347 return true;
7348 CmpInst::Predicate P = CI->getPredicate();
7349 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7350
7351 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7352 "CmpInst expected to match either main or alternate predicate or "
7353 "their swap.");
7354 (void)AltP;
7355 return MainP != P && MainP != SwappedP;
7356 }
7357 return I->getOpcode() == AltOp->getOpcode();
7358}
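// A hypothetical illustration: for a non-compare alternate node with
// MainOp = add and AltOp = sub, the final opcode comparison classifies an
// add in the bundle as the main operation (false) and a sub as the
// alternate one (true); compares additionally account for swapped
// predicates as handled above.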
7359
7360TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7361 assert(!Ops.empty());
7362 const auto *Op0 = Ops.front();
7363
7364 const bool IsConstant = all_of(Ops, [](Value *V) {
7365 // TODO: We should allow undef elements here
7366 return isConstant(V) && !isa<UndefValue>(V);
7367 });
7368 const bool IsUniform = all_of(Ops, [=](Value *V) {
7369 // TODO: We should allow undef elements here
7370 return V == Op0;
7371 });
7372 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7373 // TODO: We should allow undef elements here
7374 if (auto *CI = dyn_cast<ConstantInt>(V))
7375 return CI->getValue().isPowerOf2();
7376 return false;
7377 });
7378 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7379 // TODO: We should allow undef elements here
7380 if (auto *CI = dyn_cast<ConstantInt>(V))
7381 return CI->getValue().isNegatedPowerOf2();
7382 return false;
7383 });
7384
7385 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7386 if (IsConstant && IsUniform)
7387 VK = TTI::OK_UniformConstantValue;
7388 else if (IsConstant)
7389 VK = TTI::OK_NonUniformConstantValue;
7390 else if (IsUniform)
7391 VK = TTI::OK_UniformValue;
7392
7393 TTI::OperandValueProperties VP = TTI::OP_None;
7394 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7395 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7396
7397 return {VK, VP};
7398}
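// A hypothetical illustration: Ops = { i32 8, i32 8, i32 8, i32 8 } yields
// { OK_UniformConstantValue, OP_PowerOf2 }, while four distinct non-constant
// values yield { OK_AnyValue, OP_None }.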
7399
7400namespace {
7401/// The base class for shuffle instruction emission and shuffle cost estimation.
7402class BaseShuffleAnalysis {
7403protected:
7404 /// Checks if the mask is an identity mask.
7405 /// \param IsStrict if true, the function returns false if the mask size does
7406 /// not match the vector size.
7407 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7408 bool IsStrict) {
7409 int Limit = Mask.size();
7410 int VF = VecTy->getNumElements();
7411 int Index = -1;
7412 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7413 return true;
7414 if (!IsStrict) {
7415 // Consider extract subvector starting from index 0.
7416 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7417 Index == 0)
7418 return true;
7419 // All VF-size submasks are identity (e.g.
7420 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7421 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7422 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7423 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7424 ShuffleVectorInst::isIdentityMask(Slice, VF);
7425 }))
7426 return true;
7427 }
7428 return false;
7429 }
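// A hypothetical illustration: with VF = 4, the mask <0, 1, 2, 3> is an
// identity in both modes, while the size-8 mask
// <0, 1, poison, 3, poison, 1, 2, 3> is rejected when IsStrict is true but
// accepted otherwise, because every 4-wide slice is either all-poison or an
// identity of the source vector.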
7430
7431 /// Tries to combine 2 different masks into a single one.
7432 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7433 /// change the size of the vector, \p LocalVF is the original size of the
7434 /// shuffled vector.
7435 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7436 ArrayRef<int> ExtMask) {
7437 unsigned VF = Mask.size();
7438 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7439 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7440 if (ExtMask[I] == PoisonMaskElem)
7441 continue;
7442 int MaskedIdx = Mask[ExtMask[I] % VF];
7443 NewMask[I] =
7444 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7445 }
7446 Mask.swap(NewMask);
7447 }
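// A hypothetical illustration: with LocalVF = 4, Mask = { 1, 0, 3, 2 } and
// ExtMask = { 2, 3, poison, poison }, the loop rewrites Mask to
// { 3, 2, poison, poison }: ExtMask picks elements 2 and 3 of the permuted
// vector, which correspond to elements 3 and 2 of the original vector.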
7448
7449 /// Looks through shuffles trying to reduce the final number of shuffles in
7450 /// the code. The function looks through the previously emitted shuffle
7451 /// instructions and properly marks indices in the mask as undef.
7452 /// For example, given the code
7453 /// \code
7454 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7455 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7456 /// \endcode
7457 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7458 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7459 /// <0, 1, 2, 3> for the shuffle.
7460 /// If 2 operands are of different size, the smallest one will be resized and
7461 /// the mask recalculated properly.
7462 /// For example, given the code
7463 /// \code
7464 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7465 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7466 /// \endcode
7467 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
7468 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7469 /// <0, 1, 2, 3> for the shuffle.
7470 /// So, it tries to transform permutations into a simple vector merge, if
7471 /// possible.
7472 /// \param V The input vector which must be shuffled using the given \p Mask.
7473 /// If the better candidate is found, \p V is set to this best candidate
7474 /// vector.
7475 /// \param Mask The input mask for the shuffle. If the best candidate is found
7476 /// during looking-through-shuffles attempt, it is updated accordingly.
7477 /// \param SinglePermute true if the shuffle operation is originally a
7478 /// single-value-permutation. In this case the look-through-shuffles procedure
7479 /// may look for resizing shuffles as the best candidates.
7480 /// \return true if the shuffle results in the non-resizing identity shuffle
7481 /// (and thus can be ignored), false - otherwise.
7482 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7483 bool SinglePermute) {
7484 Value *Op = V;
7485 ShuffleVectorInst *IdentityOp = nullptr;
7486 SmallVector<int> IdentityMask;
7487 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7488 // Exit if not a fixed vector type or changing size shuffle.
7489 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7490 if (!SVTy)
7491 break;
7492 // Remember the identity or broadcast mask, if it is not a resizing
7493 // shuffle. If no better candidates are found, this Op and Mask will be
7494 // used in the final shuffle.
7495 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7496 if (!IdentityOp || !SinglePermute ||
7497 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7498 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7499 IdentityMask.size()))) {
7500 IdentityOp = SV;
7501 // Store the current mask in IdentityMask so that we do not lose this
7502 // info later if IdentityOp is selected as the best candidate for the
7503 // permutation.
7504 IdentityMask.assign(Mask);
7505 }
7506 }
7507 // Remember the broadcast mask. If no better candidates are found, this Op
7508 // and Mask will be used in the final shuffle.
7509 // Zero splat can be used as identity too, since it might be used with
7510 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7511 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
7512 // expensive, and the analysis finds out that the source vector is just a
7513 // broadcast, the original mask can be transformed to the identity mask <0,
7514 // 1, 2, 3>.
7515 // \code
7516 // %0 = shuffle %v, poison, zeroinitalizer
7517 // %res = shuffle %0, poison, <3, 1, 2, 0>
7518 // \endcode
7519 // may be transformed to
7520 // \code
7521 // %0 = shuffle %v, poison, zeroinitalizer
7522 // %res = shuffle %0, poison, <0, 1, 2, 3>
7523 // \endcode
7524 if (SV->isZeroEltSplat()) {
7525 IdentityOp = SV;
7526 IdentityMask.assign(Mask);
7527 }
7528 int LocalVF = Mask.size();
7529 if (auto *SVOpTy =
7530 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7531 LocalVF = SVOpTy->getNumElements();
7532 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7533 for (auto [Idx, I] : enumerate(Mask)) {
7534 if (I == PoisonMaskElem ||
7535 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7536 continue;
7537 ExtMask[Idx] = SV->getMaskValue(I);
7538 }
7539 bool IsOp1Undef =
7540 isUndefVector(SV->getOperand(0),
7541 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7542 .all();
7543 bool IsOp2Undef =
7544 isUndefVector(SV->getOperand(1),
7545 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7546 .all();
7547 if (!IsOp1Undef && !IsOp2Undef) {
7548 // Update mask and mark undef elems.
7549 for (int &I : Mask) {
7550 if (I == PoisonMaskElem)
7551 continue;
7552 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
7553 PoisonMaskElem)
7554 I = PoisonMaskElem;
7555 }
7556 break;
7557 }
7558 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7559 SV->getShuffleMask().end());
7560 combineMasks(LocalVF, ShuffleMask, Mask);
7561 Mask.swap(ShuffleMask);
7562 if (IsOp2Undef)
7563 Op = SV->getOperand(0);
7564 else
7565 Op = SV->getOperand(1);
7566 }
7567 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
7568 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7569 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
7570 if (IdentityOp) {
7571 V = IdentityOp;
7572 assert(Mask.size() == IdentityMask.size() &&
7573 "Expected masks of same sizes.");
7574 // Clear known poison elements.
7575 for (auto [I, Idx] : enumerate(Mask))
7576 if (Idx == PoisonMaskElem)
7577 IdentityMask[I] = PoisonMaskElem;
7578 Mask.swap(IdentityMask);
7579 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7580 return SinglePermute &&
7581 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
7582 /*IsStrict=*/true) ||
7583 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7584 Shuffle->isZeroEltSplat() &&
7585 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
7586 }
7587 V = Op;
7588 return false;
7589 }
7590 V = Op;
7591 return true;
7592 }
7593
7594 /// Smart shuffle instruction emission, walks through shuffles trees and
7595 /// tries to find the best matching vector for the actual shuffle
7596 /// instruction.
7597 template <typename T, typename ShuffleBuilderTy>
7598 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7599 ShuffleBuilderTy &Builder) {
7600 assert(V1 && "Expected at least one vector value.");
7601 if (V2)
7602 Builder.resizeToMatch(V1, V2);
7603 int VF = Mask.size();
7604 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
7605 VF = FTy->getNumElements();
7606 if (V2 &&
7607 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
7608 // Peek through shuffles.
7609 Value *Op1 = V1;
7610 Value *Op2 = V2;
7611 int VF =
7612 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7613 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7614 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7615 for (int I = 0, E = Mask.size(); I < E; ++I) {
7616 if (Mask[I] < VF)
7617 CombinedMask1[I] = Mask[I];
7618 else
7619 CombinedMask2[I] = Mask[I] - VF;
7620 }
7621 Value *PrevOp1;
7622 Value *PrevOp2;
7623 do {
7624 PrevOp1 = Op1;
7625 PrevOp2 = Op2;
7626 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
7627 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
7628 // Check if we have 2 resizing shuffles - need to peek through operands
7629 // again.
7630 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7631 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7632 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7633 for (auto [Idx, I] : enumerate(CombinedMask1)) {
7634 if (I == PoisonMaskElem)
7635 continue;
7636 ExtMask1[Idx] = SV1->getMaskValue(I);
7637 }
7638 SmallBitVector UseMask1 = buildUseMask(
7639 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7640 ->getNumElements(),
7641 ExtMask1, UseMask::SecondArg);
7642 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7643 for (auto [Idx, I] : enumerate(CombinedMask2)) {
7644 if (I == PoisonMaskElem)
7645 continue;
7646 ExtMask2[Idx] = SV2->getMaskValue(I);
7647 }
7648 SmallBitVector UseMask2 = buildUseMask(
7649 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7650 ->getNumElements(),
7651 ExtMask2, UseMask::SecondArg);
7652 if (SV1->getOperand(0)->getType() ==
7653 SV2->getOperand(0)->getType() &&
7654 SV1->getOperand(0)->getType() != SV1->getType() &&
7655 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
7656 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
7657 Op1 = SV1->getOperand(0);
7658 Op2 = SV2->getOperand(0);
7659 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7660 SV1->getShuffleMask().end());
7661 int LocalVF = ShuffleMask1.size();
7662 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
7663 LocalVF = FTy->getNumElements();
7664 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7665 CombinedMask1.swap(ShuffleMask1);
7666 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7667 SV2->getShuffleMask().end());
7668 LocalVF = ShuffleMask2.size();
7669 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
7670 LocalVF = FTy->getNumElements();
7671 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7672 CombinedMask2.swap(ShuffleMask2);
7673 }
7674 }
7675 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
7676 Builder.resizeToMatch(Op1, Op2);
7677 VF = std::max(cast<VectorType>(Op1->getType())
7678 ->getElementCount()
7679 .getKnownMinValue(),
7680 cast<VectorType>(Op2->getType())
7681 ->getElementCount()
7682 .getKnownMinValue());
7683 for (int I = 0, E = Mask.size(); I < E; ++I) {
7684 if (CombinedMask2[I] != PoisonMaskElem) {
7685 assert(CombinedMask1[I] == PoisonMaskElem &&
7686 "Expected undefined mask element");
7687 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
7688 }
7689 }
7690 if (Op1 == Op2 &&
7691 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
7692 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
7693 isa<ShuffleVectorInst>(Op1) &&
7694 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7695 ArrayRef(CombinedMask1))))
7696 return Builder.createIdentity(Op1);
7697 return Builder.createShuffleVector(
7698 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
7699 CombinedMask1);
7700 }
7701 if (isa<PoisonValue>(V1))
7702 return Builder.createPoison(
7703 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
7704 SmallVector<int> NewMask(Mask.begin(), Mask.end());
7705 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
7706 assert(V1 && "Expected non-null value after looking through shuffles.");
7707
7708 if (!IsIdentity)
7709 return Builder.createShuffleVector(V1, NewMask);
7710 return Builder.createIdentity(V1);
7711 }
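 // A sketch of the builder interface assumed here (not a formal requirement):
 // ShuffleBuilderTy is expected to provide resizeToMatch(),
 // createShuffleVector(), createIdentity() and createPoison(). For example,
 // the ShuffleCostBuilder below maps these hooks onto TTI shuffle costs
 // instead of emitting real IR, which is what lets the same walk be reused
 // for both cost estimation and code emission.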
7712};
7713} // namespace
7714
7715/// Returns the cost of the shuffle instructions with the given \p Kind, vector
7716/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
7717/// subvector pattern.
7718static InstructionCost
7719 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
7720 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
7721 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
7722 int Index = 0, VectorType *SubTp = nullptr,
7723 ArrayRef<const Value *> Args = std::nullopt) {
7724 if (Kind != TTI::SK_PermuteTwoSrc)
7725 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7726 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7727 int NumSubElts;
7728 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
7729 Mask, NumSrcElts, NumSubElts, Index)) {
7730 if (Index + NumSubElts > NumSrcElts &&
7731 Index + NumSrcElts <= static_cast<int>(Mask.size()))
7732 return TTI.getShuffleCost(
7733 TTI::SK_InsertSubvector,
7734 FixedVectorType::get(Tp->getElementType(), Mask.size()), Mask,
7735 TTI::TCK_RecipThroughput, Index, Tp);
7736 }
7737 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7738}
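// Rough usage sketch (illustrative, mirroring the callers below): the helper is
// a drop-in for TTI.getShuffleCost() that additionally recognizes two-source
// permutations which really insert a subvector.
// \code
//   InstructionCost C = ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, VecTy,
//                                        Mask, TTI::TCK_RecipThroughput);
// \endcode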
7739
7740/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
7741static std::pair<InstructionCost, InstructionCost>
7742 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
7743 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
7744 Type *ScalarTy, VectorType *VecTy) {
7745 InstructionCost ScalarCost = 0;
7746 InstructionCost VecCost = 0;
7747 // Here we differentiate two cases: (1) when Ptrs represent a regular
7748 // vectorization tree node (as they are pointer arguments of scattered
7749 // loads) or (2) when Ptrs are the arguments of loads or stores being
7750 // vectorized as a plain wide unit-stride load/store since all the
7751 // loads/stores are known to be from/to adjacent locations.
7752 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7753 // Case 2: estimate costs for pointer related costs when vectorizing to
7754 // a wide load/store.
7755 // Scalar cost is estimated as a set of pointers with known relationship
7756 // between them.
7757 // For vector code we will use BasePtr as argument for the wide load/store
7758 // but we also need to account all the instructions which are going to
7759 // stay in vectorized code due to uses outside of these scalar
7760 // loads/stores.
7761 ScalarCost = TTI.getPointersChainCost(
7762 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7763 CostKind);
7764
7765 SmallVector<const Value *> PtrsRetainedInVecCode;
7766 for (Value *V : Ptrs) {
7767 if (V == BasePtr) {
7768 PtrsRetainedInVecCode.push_back(V);
7769 continue;
7770 }
7771 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7772 // For simplicity, assume Ptr stays in vectorized code if it's not a
7773 // GEP instruction. We don't care, since its cost is considered free.
7774 // TODO: We should check for any uses outside of vectorizable tree
7775 // rather than just single use.
7776 if (!Ptr || !Ptr->hasOneUse())
7777 PtrsRetainedInVecCode.push_back(V);
7778 }
7779
7780 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
7781 // If all pointers stay in vectorized code then we don't have
7782 // any savings on that.
7783 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
7784 }
7785 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
7786 TTI::PointersChainInfo::getKnownStride(),
7787 VecTy, CostKind);
7788 } else {
7789 // Case 1: Ptrs are the arguments of loads that we are going to transform
7790 // into masked gather load intrinsic.
7791 // All the scalar GEPs will be removed as a result of vectorization.
7792 // For any external uses of some lanes, extractelement instructions will
7793 // be generated (their cost is estimated separately).
7794 TTI::PointersChainInfo PtrsInfo =
7795 all_of(Ptrs,
7796 [](const Value *V) {
7797 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7798 return Ptr && !Ptr->hasAllConstantIndices();
7799 })
7800 ? TTI::PointersChainInfo::getUnknownStride()
7801 : TTI::PointersChainInfo::getKnownStride();
7802
7803 ScalarCost =
7804 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
7805 if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
7806 SmallVector<const Value *> Indices(BaseGEP->indices());
7807 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
7808 BaseGEP->getPointerOperand(), Indices, VecTy,
7809 CostKind);
7810 }
7811 }
7812
7813 return std::make_pair(ScalarCost, VecCost);
7814}
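// Rough usage sketch (names are illustrative): callers fold the difference of
// the two returned costs into their running estimate, so a negative delta means
// the vectorized addressing is cheaper than keeping the scalar GEPs.
// \code
//   auto [ScalarGEPCost, VectorGEPCost] =
//       getGEPCosts(TTI, PointerOps, PointerOps.front(), Instruction::Load,
//                   CostKind, ScalarTy, LoadVecTy);
//   GatherCost += VectorGEPCost - ScalarGEPCost;
// \endcode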
7815
7816/// Merges shuffle masks and emits final shuffle instruction, if required. It
7817/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
7818/// when the actual shuffle instruction is generated only if this is actually
7819/// required. Otherwise, the shuffle instruction emission is delayed till the
7820/// end of the process, to reduce the number of emitted instructions and further
7821/// analysis/transformations.
7822class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7823 bool IsFinalized = false;
7824 SmallVector<int> CommonMask;
7825 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
7826 const TargetTransformInfo &TTI;
7827 InstructionCost Cost = 0;
7828 SmallDenseSet<Value *> VectorizedVals;
7829 BoUpSLP &R;
7830 SmallPtrSetImpl<Value *> &CheckedExtracts;
7831 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7832 /// While set, we are still trying to estimate the cost for the same nodes, so
7833 /// the actual cost estimation (virtual shuffle instruction emission) can be
7834 /// delayed. This may give a better estimate if the same nodes must be permuted
7835 /// and allows moving most of the long-shuffle cost estimation to TTI.
7836 bool SameNodesEstimated = true;
7837
7838 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
7839 if (Ty->getScalarType()->isPointerTy()) {
7840 Constant *Res = ConstantExpr::getIntToPtr(
7841 Constant::getAllOnesValue(IntegerType::get(
7842 Ty->getContext(),
7843 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
7844 Ty->getScalarType());
7845 if (auto *VTy = dyn_cast<VectorType>(Ty))
7846 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
7847 return Res;
7848 }
7849 return Constant::getAllOnesValue(Ty);
7850 }
7851
7852 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
7853 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
7854 return TTI::TCC_Free;
7855 auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
7856 InstructionCost GatherCost = 0;
7857 SmallVector<Value *> Gathers(VL.begin(), VL.end());
7858 // Improve gather cost for gather of loads, if we can group some of the
7859 // loads into vector loads.
7860 InstructionsState S = getSameOpcode(VL, *R.TLI);
7861 const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
7862 unsigned MinVF = R.getMinVF(2 * Sz);
7863 if (VL.size() > 2 &&
7864 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
7865 (InVectors.empty() &&
7866 any_of(seq<unsigned>(0, VL.size() / MinVF),
7867 [&](unsigned Idx) {
7868 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
7869 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
7870 return S.getOpcode() == Instruction::Load &&
7871 !S.isAltShuffle();
7872 }))) &&
7873 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
7874 !isSplat(Gathers)) {
7875 InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
7876 SetVector<Value *> VectorizedLoads;
7877 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
7878 SmallVector<unsigned> ScatterVectorized;
7879 unsigned StartIdx = 0;
7880 unsigned VF = VL.size() / 2;
7881 for (; VF >= MinVF; VF /= 2) {
7882 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
7883 Cnt += VF) {
7884 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7885 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
7886 InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
7887 if (SliceS.getOpcode() != Instruction::Load ||
7888 SliceS.isAltShuffle())
7889 continue;
7890 }
7891 if (!VectorizedLoads.count(Slice.front()) &&
7892 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
7893 SmallVector<Value *> PointerOps;
7894 OrdersType CurrentOrder;
7895 LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
7896 CurrentOrder, PointerOps);
7897 switch (LS) {
7898 case LoadsState::Vectorize:
7899 case LoadsState::ScatterVectorize:
7900 case LoadsState::StridedVectorize:
7901 // Mark the vectorized loads so that we don't vectorize them
7902 // again.
7903 // TODO: better handling of loads with reorders.
7904 if (((LS == LoadsState::Vectorize ||
7905 LS == LoadsState::StridedVectorize) &&
7906 CurrentOrder.empty()) ||
7907 (LS == LoadsState::StridedVectorize &&
7908 isReverseOrder(CurrentOrder)))
7909 VectorizedStarts.emplace_back(Cnt, LS);
7910 else
7911 ScatterVectorized.push_back(Cnt);
7912 VectorizedLoads.insert(Slice.begin(), Slice.end());
7913 // If we vectorized initial block, no need to try to vectorize
7914 // it again.
7915 if (Cnt == StartIdx)
7916 StartIdx += VF;
7917 break;
7918 case LoadsState::Gather:
7919 break;
7920 }
7921 }
7922 }
7923 // Check if the whole array was vectorized already - exit.
7924 if (StartIdx >= VL.size())
7925 break;
7926 // Found vectorizable parts - exit.
7927 if (!VectorizedLoads.empty())
7928 break;
7929 }
7930 if (!VectorizedLoads.empty()) {
7931 unsigned NumParts = TTI.getNumberOfParts(VecTy);
7932 bool NeedInsertSubvectorAnalysis =
7933 !NumParts || (VL.size() / VF) > NumParts;
7934 // Get the cost for gathered loads.
7935 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
7936 if (VectorizedLoads.contains(VL[I]))
7937 continue;
7938 GatherCost +=
7939 getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
7940 }
7941 // Exclude potentially vectorized loads from list of gathered
7942 // scalars.
7943 Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
7944 // The cost for vectorized loads.
7945 InstructionCost ScalarsCost = 0;
7946 for (Value *V : VectorizedLoads) {
7947 auto *LI = cast<LoadInst>(V);
7948 ScalarsCost +=
7949 TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
7950 LI->getAlign(), LI->getPointerAddressSpace(),
7951 CostKind, TTI::OperandValueInfo(), LI);
7952 }
7953 auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
7954 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
7955 auto *LI = cast<LoadInst>(VL[P.first]);
7956 Align Alignment = LI->getAlign();
7957 GatherCost +=
7958 P.second == LoadsState::Vectorize
7959 ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
7960 LI->getPointerAddressSpace(), CostKind,
7961 TTI::OperandValueInfo(), LI)
7962 : TTI.getStridedMemoryOpCost(
7963 Instruction::Load, LoadTy, LI->getPointerOperand(),
7964 /*VariableMask=*/false, Alignment, CostKind, LI);
7965 // Estimate GEP cost.
7966 SmallVector<Value *> PointerOps(VF);
7967 for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
7968 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
7969 auto [ScalarGEPCost, VectorGEPCost] =
7970 getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
7971 Instruction::Load, CostKind, LI->getType(), LoadTy);
7972 GatherCost += VectorGEPCost - ScalarGEPCost;
7973 }
7974 for (unsigned P : ScatterVectorized) {
7975 auto *LI0 = cast<LoadInst>(VL[P]);
7976 ArrayRef<Value *> Slice = VL.slice(P, VF);
7977 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
7978 GatherCost += TTI.getGatherScatterOpCost(
7979 Instruction::Load, LoadTy, LI0->getPointerOperand(),
7980 /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
7981 // Estimate GEP cost.
7982 SmallVector<Value *> PointerOps(VF);
7983 for (auto [I, V] : enumerate(Slice))
7984 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
7985 OrdersType Order;
7986 if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
7987 Order)) {
7988 // TODO: improve checks if GEPs can be vectorized.
7989 Value *Ptr0 = PointerOps.front();
7990 Type *ScalarTy = Ptr0->getType();
7991 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
7992 auto [ScalarGEPCost, VectorGEPCost] =
7993 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
7994 CostKind, ScalarTy, VecTy);
7995 GatherCost += VectorGEPCost - ScalarGEPCost;
7996 if (!Order.empty()) {
7997 SmallVector<int> Mask;
7998 inversePermutation(Order, Mask);
7999 GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8000 VecTy, Mask, CostKind);
8001 }
8002 } else {
8003 GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true);
8004 }
8005 }
8006 if (NeedInsertSubvectorAnalysis) {
8007 // Add the cost for the subvectors insert.
8008 SmallVector<int> ShuffleMask(VL.size());
8009 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8010 for (unsigned Idx : seq<unsigned>(0, E))
8011 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8012 GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
8013 ShuffleMask, CostKind, I, LoadTy);
8014 }
8015 }
8016 GatherCost -= ScalarsCost;
8017 }
8018 GatherCost = std::min(BaseCost, GatherCost);
8019 } else if (!Root && isSplat(VL)) {
8020 // Found the broadcasting of the single scalar, calculate the cost as
8021 // the broadcast.
8022 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8023 assert(It != VL.end() && "Expected at least one non-undef value.");
8024 // Add broadcast for non-identity shuffle only.
8025 bool NeedShuffle =
8026 count(VL, *It) > 1 &&
8027 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
8028 if (!NeedShuffle)
8029 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
8030 CostKind, std::distance(VL.begin(), It),
8031 PoisonValue::get(VecTy), *It);
8032
8033 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8034 transform(VL, ShuffleMask.begin(), [](Value *V) {
8035 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8036 });
8037 InstructionCost InsertCost = TTI.getVectorInstrCost(
8038 Instruction::InsertElement, VecTy, CostKind, 0,
8039 PoisonValue::get(VecTy), *It);
8040 return InsertCost +
8041 TTI.getShuffleCost(TTI::SK_Broadcast, VecTy,
8042 ShuffleMask, CostKind, /*Index=*/0,
8043 /*SubTp=*/nullptr, /*Args=*/*It);
8044 }
8045 return GatherCost +
8046 (all_of(Gathers, IsaPred<UndefValue>)
8047 ? TTI::TCC_Free
8048 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
8049 };
8050
8051 /// Compute the cost of creating a vector containing the extracted values from
8052 /// \p VL.
8053 InstructionCost
8054 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8055 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8056 unsigned NumParts) {
8057 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8058 unsigned NumElts =
8059 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8060 auto *EE = dyn_cast<ExtractElementInst>(V);
8061 if (!EE)
8062 return Sz;
8063 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8064 if (!VecTy)
8065 return Sz;
8066 return std::max(Sz, VecTy->getNumElements());
8067 });
8068 unsigned NumSrcRegs = TTI.getNumberOfParts(
8069 FixedVectorType::get(VL.front()->getType(), NumElts));
8070 if (NumSrcRegs == 0)
8071 NumSrcRegs = 1;
8072 // FIXME: this must be moved to TTI for better estimation.
8073 unsigned EltsPerVector = PowerOf2Ceil(std::max(
8074 divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
8075 auto CheckPerRegistersShuffle =
8076 [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
8077 DenseSet<int> RegIndices;
8078 // Check if we are trying to permute the same single or two input vectors.
8079 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8080 int FirstRegId = -1;
8081 for (int &I : Mask) {
8082 if (I == PoisonMaskElem)
8083 continue;
8084 int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
8085 if (FirstRegId < 0)
8086 FirstRegId = RegId;
8087 RegIndices.insert(RegId);
8088 if (RegIndices.size() > 2)
8089 return std::nullopt;
8090 if (RegIndices.size() == 2)
8091 ShuffleKind = TTI::SK_PermuteTwoSrc;
8092 I = (I % NumElts) % EltsPerVector +
8093 (RegId == FirstRegId ? 0 : EltsPerVector);
8094 }
8095 return ShuffleKind;
8096 };
8097 InstructionCost Cost = 0;
8098
8099 // Process extracts in blocks of EltsPerVector to check if the source vector
8100 // operand can be re-used directly. If not, add the cost of creating a
8101 // shuffle to extract the values into a vector register.
8102 for (unsigned Part = 0; Part < NumParts; ++Part) {
8103 if (!ShuffleKinds[Part])
8104 continue;
8105 ArrayRef<int> MaskSlice =
8106 Mask.slice(Part * EltsPerVector,
8107 (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8108 ? Mask.size() % EltsPerVector
8109 : EltsPerVector);
8110 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8111 copy(MaskSlice, SubMask.begin());
8112 std::optional<TTI::ShuffleKind> RegShuffleKind =
8113 CheckPerRegistersShuffle(SubMask);
8114 if (!RegShuffleKind) {
8115 Cost += ::getShuffleCost(
8116 TTI, *ShuffleKinds[Part],
8117 FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
8118 continue;
8119 }
8120 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8121 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
8122 Cost += ::getShuffleCost(
8123 TTI, *RegShuffleKind,
8124 FixedVectorType::get(VL.front()->getType(), EltsPerVector),
8125 SubMask);
8126 }
8127 }
8128 return Cost;
8129 }
8130 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
8131 /// shuffle emission.
8132 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8133 ArrayRef<int> Mask) {
8134 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8135 if (Mask[Idx] != PoisonMaskElem)
8136 CommonMask[Idx] = Idx;
8137 }
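 // For illustration: once a shuffle has been (virtually) emitted for
 // CommonMask, every lane that was produced simply reads its own position in
 // the result, e.g. transformMaskAfterShuffle() turns the common mask
 // <poison, 5, 2, poison> into <poison, 1, 2, poison>.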
8138 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
8139 /// mask \p Mask, register number \p Part, that includes \p SliceSize
8140 /// elements.
8141 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8142 ArrayRef<int> Mask, unsigned Part,
8143 unsigned SliceSize) {
8144 if (SameNodesEstimated) {
8145 // Delay the cost estimation if the same nodes are reshuffling.
8146 // If we already requested the cost of reshuffling of E1 and E2 before, no
8147 // need to estimate another cost with the sub-Mask, instead include this
8148 // sub-Mask into the CommonMask to estimate it later and avoid double cost
8149 // estimation.
8150 if ((InVectors.size() == 2 &&
8151 InVectors.front().get<const TreeEntry *>() == &E1 &&
8152 InVectors.back().get<const TreeEntry *>() == E2) ||
8153 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8154 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
8155 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8156 "Expected all poisoned elements.");
8157 ArrayRef<int> SubMask =
8158 ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
8159 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8160 return;
8161 }
8162 // Found non-matching nodes - need to estimate the cost for the matched
8163 // and transform mask.
8164 Cost += createShuffle(InVectors.front(),
8165 InVectors.size() == 1 ? nullptr : InVectors.back(),
8166 CommonMask);
8167 transformMaskAfterShuffle(CommonMask, CommonMask);
8168 }
8169 SameNodesEstimated = false;
8170 if (!E2 && InVectors.size() == 1) {
8171 unsigned VF = E1.getVectorFactor();
8172 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8173 VF = std::max(VF,
8174 cast<FixedVectorType>(V1->getType())->getNumElements());
8175 } else {
8176 const auto *E = InVectors.front().get<const TreeEntry *>();
8177 VF = std::max(VF, E->getVectorFactor());
8178 }
8179 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8180 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8181 CommonMask[Idx] = Mask[Idx] + VF;
8182 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8183 transformMaskAfterShuffle(CommonMask, CommonMask);
8184 } else {
8185 Cost += createShuffle(&E1, E2, Mask);
8186 transformMaskAfterShuffle(CommonMask, Mask);
8187 }
8188 }
8189
8190 class ShuffleCostBuilder {
8191 const TargetTransformInfo &TTI;
8192
8193 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8194 int Index = -1;
8195 return Mask.empty() ||
8196 (VF == Mask.size() &&
8197 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
8198 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
8199 Index == 0);
8200 }
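 // E.g., with VF == 4 both the empty mask and the full identity mask
 // <0, 1, 2, 3> fall into this category, so the createShuffleVector()
 // overloads below charge nothing for them.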
8201
8202 public:
8203 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8204 ~ShuffleCostBuilder() = default;
8205 InstructionCost createShuffleVector(Value *V1, Value *,
8206 ArrayRef<int> Mask) const {
8207 // Empty mask or identity mask are free.
8208 unsigned VF =
8209 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8210 if (isEmptyOrIdentity(Mask, VF))
8211 return TTI::TCC_Free;
8212 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8213 cast<VectorType>(V1->getType()), Mask);
8214 }
8215 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8216 // Empty mask or identity mask are free.
8217 unsigned VF =
8218 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8219 if (isEmptyOrIdentity(Mask, VF))
8220 return TTI::TCC_Free;
8221 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8222 cast<VectorType>(V1->getType()), Mask);
8223 }
8224 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8225 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8226 return TTI::TCC_Free;
8227 }
8228 void resizeToMatch(Value *&, Value *&) const {}
8229 };
8230
8231 /// Smart shuffle instruction emission, walks through shuffles trees and
8232 /// tries to find the best matching vector for the actual shuffle
8233 /// instruction.
8234 InstructionCost
8235 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8236 const PointerUnion<Value *, const TreeEntry *> &P2,
8237 ArrayRef<int> Mask) {
8238 ShuffleCostBuilder Builder(TTI);
8239 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8240 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8241 unsigned CommonVF = Mask.size();
8242 if (!V1 && !V2 && !P2.isNull()) {
8243 // Shuffle 2 entry nodes.
8244 const TreeEntry *E = P1.get<const TreeEntry *>();
8245 unsigned VF = E->getVectorFactor();
8246 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8247 CommonVF = std::max(VF, E2->getVectorFactor());
8248 assert(all_of(Mask,
8249 [=](int Idx) {
8250 return Idx < 2 * static_cast<int>(CommonVF);
8251 }) &&
8252 "All elements in mask must be less than 2 * CommonVF.");
8253 if (E->Scalars.size() == E2->Scalars.size()) {
8254 SmallVector<int> EMask = E->getCommonMask();
8255 SmallVector<int> E2Mask = E2->getCommonMask();
8256 if (!EMask.empty() || !E2Mask.empty()) {
8257 for (int &Idx : CommonMask) {
8258 if (Idx == PoisonMaskElem)
8259 continue;
8260 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8261 Idx = EMask[Idx];
8262 else if (Idx >= static_cast<int>(CommonVF))
8263 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8264 E->Scalars.size();
8265 }
8266 }
8267 CommonVF = E->Scalars.size();
8268 }
8269 V1 = Constant::getNullValue(
8270 FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
8271 V2 = getAllOnesValue(
8272 *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
8273 } else if (!V1 && P2.isNull()) {
8274 // Shuffle single entry node.
8275 const TreeEntry *E = P1.get<const TreeEntry *>();
8276 unsigned VF = E->getVectorFactor();
8277 CommonVF = VF;
8278 assert(
8279 all_of(Mask,
8280 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8281 "All elements in mask must be less than CommonVF.");
8282 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8283 SmallVector<int> EMask = E->getCommonMask();
8284 assert(!EMask.empty() && "Expected non-empty common mask.");
8285 for (int &Idx : CommonMask) {
8286 if (Idx != PoisonMaskElem)
8287 Idx = EMask[Idx];
8288 }
8289 CommonVF = E->Scalars.size();
8290 }
8291 V1 = Constant::getNullValue(
8292 FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
8293 // Not identity/broadcast? Try to see if the original vector is better.
8294 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8295 CommonVF == CommonMask.size() &&
8296 any_of(enumerate(CommonMask),
8297 [](const auto &&P) {
8298 return P.value() != PoisonMaskElem &&
8299 static_cast<unsigned>(P.value()) != P.index();
8300 }) &&
8301 any_of(CommonMask,
8302 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8303 SmallVector<int> ReorderMask;
8304 inversePermutation(E->ReorderIndices, ReorderMask);
8305 ::addMask(CommonMask, ReorderMask);
8306 }
8307 } else if (V1 && P2.isNull()) {
8308 // Shuffle single vector.
8309 CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
8310 assert(
8311 all_of(Mask,
8312 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8313 "All elements in mask must be less than CommonVF.");
8314 } else if (V1 && !V2) {
8315 // Shuffle vector and tree node.
8316 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8317 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8318 CommonVF = std::max(VF, E2->getVectorFactor());
8319 assert(all_of(Mask,
8320 [=](int Idx) {
8321 return Idx < 2 * static_cast<int>(CommonVF);
8322 }) &&
8323 "All elements in mask must be less than 2 * CommonVF.");
8324 if (E2->Scalars.size() == VF && VF != CommonVF) {
8325 SmallVector<int> E2Mask = E2->getCommonMask();
8326 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8327 for (int &Idx : CommonMask) {
8328 if (Idx == PoisonMaskElem)
8329 continue;
8330 if (Idx >= static_cast<int>(CommonVF))
8331 Idx = E2Mask[Idx - CommonVF] + VF;
8332 }
8333 CommonVF = VF;
8334 }
8335 V1 = Constant::getNullValue(
8336 FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
8337 V2 = getAllOnesValue(
8338 *R.DL,
8339 FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
8340 } else if (!V1 && V2) {
8341 // Shuffle vector and tree node.
8342 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8343 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8344 CommonVF = std::max(VF, E1->getVectorFactor());
8345 assert(all_of(Mask,
8346 [=](int Idx) {
8347 return Idx < 2 * static_cast<int>(CommonVF);
8348 }) &&
8349 "All elements in mask must be less than 2 * CommonVF.");
8350 if (E1->Scalars.size() == VF && VF != CommonVF) {
8351 SmallVector<int> E1Mask = E1->getCommonMask();
8352 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8353 for (int &Idx : CommonMask) {
8354 if (Idx == PoisonMaskElem)
8355 continue;
8356 if (Idx >= static_cast<int>(CommonVF))
8357 Idx = E1Mask[Idx - CommonVF] + VF;
8358 else
8359 Idx = E1Mask[Idx];
8360 }
8361 CommonVF = VF;
8362 }
8363 V1 = Constant::getNullValue(
8364 FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
8365 V2 = getAllOnesValue(
8366 *R.DL,
8367 FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
8368 } else {
8369 assert(V1 && V2 && "Expected both vectors.");
8370 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8371 CommonVF =
8372 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8373 assert(all_of(Mask,
8374 [=](int Idx) {
8375 return Idx < 2 * static_cast<int>(CommonVF);
8376 }) &&
8377 "All elements in mask must be less than 2 * CommonVF.");
8378 if (V1->getType() != V2->getType()) {
8379 V1 = Constant::getNullValue(FixedVectorType::get(
8380 cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
8381 V2 = getAllOnesValue(
8382 *R.DL, FixedVectorType::get(
8383 cast<FixedVectorType>(V1->getType())->getElementType(),
8384 CommonVF));
8385 }
8386 }
8387 InVectors.front() = Constant::getNullValue(FixedVectorType::get(
8388 cast<FixedVectorType>(V1->getType())->getElementType(),
8389 CommonMask.size()));
8390 if (InVectors.size() == 2)
8391 InVectors.pop_back();
8392 return BaseShuffleAnalysis::createShuffle<InstructionCost>(
8393 V1, V2, CommonMask, Builder);
8394 }
8395
8396public:
8397 ShuffleCostEstimator(TargetTransformInfo &TTI,
8398 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8399 SmallPtrSetImpl<Value *> &CheckedExtracts)
8400 : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
8401 R(R), CheckedExtracts(CheckedExtracts) {}
8402 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8403 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8404 unsigned NumParts, bool &UseVecBaseAsInput) {
8405 UseVecBaseAsInput = false;
8406 if (Mask.empty())
8407 return nullptr;
8408 Value *VecBase = nullptr;
8409 ArrayRef<Value *> VL = E->Scalars;
8410 // If the resulting type is scalarized, do not adjust the cost.
8411 if (NumParts == VL.size())
8412 return nullptr;
8413 // Check if it can be considered reused if same extractelements were
8414 // vectorized already.
8415 bool PrevNodeFound = any_of(
8416 ArrayRef(R.VectorizableTree).take_front(E->Idx),
8417 [&](const std::unique_ptr<TreeEntry> &TE) {
8418 return ((!TE->isAltShuffle() &&
8419 TE->getOpcode() == Instruction::ExtractElement) ||
8420 TE->State == TreeEntry::NeedToGather) &&
8421 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8422 return VL.size() > Data.index() &&
8423 (Mask[Data.index()] == PoisonMaskElem ||
8424 isa<UndefValue>(VL[Data.index()]) ||
8425 Data.value() == VL[Data.index()]);
8426 });
8427 });
8428 SmallPtrSet<Value *, 4> UniqueBases;
8429 unsigned SliceSize = VL.size() / NumParts;
8430 for (unsigned Part = 0; Part < NumParts; ++Part) {
8431 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8432 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
8433 // Ignore non-extractelement scalars.
8434 if (isa<UndefValue>(V) ||
8435 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8436 continue;
8437 // If all users of instruction are going to be vectorized and this
8438 // instruction itself is not going to be vectorized, consider this
8439 // instruction as dead and remove its cost from the final cost of the
8440 // vectorized tree.
8441 // Also, avoid adjusting the cost for extractelements with multiple uses
8442 // in different graph entries.
8443 auto *EE = cast<ExtractElementInst>(V);
8444 VecBase = EE->getVectorOperand();
8445 UniqueBases.insert(VecBase);
8446 const TreeEntry *VE = R.getTreeEntry(V);
8447 if (!CheckedExtracts.insert(V).second ||
8448 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8449 (VE && VE != E))
8450 continue;
8451 std::optional<unsigned> EEIdx = getExtractIndex(EE);
8452 if (!EEIdx)
8453 continue;
8454 unsigned Idx = *EEIdx;
8455 // Take credit for instruction that will become dead.
8456 if (EE->hasOneUse() || !PrevNodeFound) {
8457 Instruction *Ext = EE->user_back();
8458 if (isa<SExtInst, ZExtInst>(Ext) &&
8459 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8460 // Use getExtractWithExtendCost() to calculate the cost of
8461 // extractelement/ext pair.
8462 Cost -=
8463 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
8464 EE->getVectorOperandType(), Idx);
8465 // Add back the cost of s|zext which is subtracted separately.
8466 Cost += TTI.getCastInstrCost(
8467 Ext->getOpcode(), Ext->getType(), EE->getType(),
8468 TTI::getCastContextHint(Ext), CostKind, Ext);
8469 continue;
8470 }
8471 }
8472 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
8473 CostKind, Idx);
8474 }
8475 }
8476 // Check that the gather of extractelements can be represented as just a
8477 // shuffle of a single vector or of the two vectors the scalars are
8478 // extracted from: we found a bunch of extractelement instructions that
8479 // must be gathered into a vector and can be represented as a permutation
8480 // of the elements of one or two input vectors.
8481 // Also done for reuses, if the same extractelements were vectorized already.
8482 if (!PrevNodeFound)
8483 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8484 InVectors.assign(1, E);
8485 CommonMask.assign(Mask.begin(), Mask.end());
8486 transformMaskAfterShuffle(CommonMask, CommonMask);
8487 SameNodesEstimated = false;
8488 if (NumParts != 1 && UniqueBases.size() != 1) {
8489 UseVecBaseAsInput = true;
8490 VecBase = Constant::getNullValue(
8491 FixedVectorType::get(VL.front()->getType(), CommonMask.size()));
8492 }
8493 return VecBase;
8494 }
8495 /// Checks if the specified entry \p E needs to be delayed because of its
8496 /// dependency nodes.
8497 std::optional<InstructionCost>
8498 needToDelay(const TreeEntry *,
8499 ArrayRef<SmallVector<const TreeEntry *>>) const {
8500 // No need to delay the cost estimation during analysis.
8501 return std::nullopt;
8502 }
8503 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
8504 if (&E1 == &E2) {
8505 assert(all_of(Mask,
8506 [&](int Idx) {
8507 return Idx < static_cast<int>(E1.getVectorFactor());
8508 }) &&
8509 "Expected single vector shuffle mask.");
8510 add(E1, Mask);
8511 return;
8512 }
8513 if (InVectors.empty()) {
8514 CommonMask.assign(Mask.begin(), Mask.end());
8515 InVectors.assign({&E1, &E2});
8516 return;
8517 }
8518 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8519 auto *MaskVecTy =
8520 FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
8521 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8522 if (NumParts == 0 || NumParts >= Mask.size())
8523 NumParts = 1;
8524 unsigned SliceSize = Mask.size() / NumParts;
8525 const auto *It =
8526 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8527 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8528 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8529 }
8530 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
8531 if (InVectors.empty()) {
8532 CommonMask.assign(Mask.begin(), Mask.end());
8533 InVectors.assign(1, &E1);
8534 return;
8535 }
8536 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8537 auto *MaskVecTy =
8538 FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
8539 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8540 if (NumParts == 0 || NumParts >= Mask.size())
8541 NumParts = 1;
8542 unsigned SliceSize = Mask.size() / NumParts;
8543 const auto *It =
8544 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8545 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8546 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
8547 if (!SameNodesEstimated && InVectors.size() == 1)
8548 InVectors.emplace_back(&E1);
8549 }
8550 /// Adds 2 input vectors and the mask for their shuffling.
8551 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
8552 // May come only for shuffling of 2 vectors with extractelements, already
8553 // handled in adjustExtracts.
8554 assert(InVectors.size() == 1 &&
8555 all_of(enumerate(CommonMask),
8556 [&](auto P) {
8557 if (P.value() == PoisonMaskElem)
8558 return Mask[P.index()] == PoisonMaskElem;
8559 auto *EI =
8560 cast<ExtractElementInst>(InVectors.front()
8561 .get<const TreeEntry *>()
8562 ->Scalars[P.index()]);
8563 return EI->getVectorOperand() == V1 ||
8564 EI->getVectorOperand() == V2;
8565 }) &&
8566 "Expected extractelement vectors.");
8567 }
8568 /// Adds one more input vector and the mask for the shuffling.
8569 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
8570 if (InVectors.empty()) {
8571 assert(CommonMask.empty() && !ForExtracts &&
8572 "Expected empty input mask/vectors.");
8573 CommonMask.assign(Mask.begin(), Mask.end());
8574 InVectors.assign(1, V1);
8575 return;
8576 }
8577 if (ForExtracts) {
8578 // No need to add vectors here, already handled them in adjustExtracts.
8579 assert(InVectors.size() == 1 &&
8580 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
8581 all_of(enumerate(CommonMask),
8582 [&](auto P) {
8583 Value *Scalar = InVectors.front()
8584 .get<const TreeEntry *>()
8585 ->Scalars[P.index()];
8586 if (P.value() == PoisonMaskElem)
8587 return P.value() == Mask[P.index()] ||
8588 isa<UndefValue>(Scalar);
8589 if (isa<Constant>(V1))
8590 return true;
8591 auto *EI = cast<ExtractElementInst>(Scalar);
8592 return EI->getVectorOperand() == V1;
8593 }) &&
8594 "Expected only tree entry for extractelement vectors.");
8595 return;
8596 }
8597 assert(!InVectors.empty() && !CommonMask.empty() &&
8598 "Expected only tree entries from extracts/reused buildvectors.");
8599 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8600 if (InVectors.size() == 2) {
8601 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
8602 transformMaskAfterShuffle(CommonMask, CommonMask);
8603 VF = std::max<unsigned>(VF, CommonMask.size());
8604 } else if (const auto *InTE =
8605 InVectors.front().dyn_cast<const TreeEntry *>()) {
8606 VF = std::max(VF, InTE->getVectorFactor());
8607 } else {
8608 VF = std::max(
8609 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
8610 ->getNumElements());
8611 }
8612 InVectors.push_back(V1);
8613 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8614 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8615 CommonMask[Idx] = Mask[Idx] + VF;
8616 }
8617 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
8618 Value *Root = nullptr) {
8619 Cost += getBuildVectorCost(VL, Root);
8620 if (!Root) {
8621 // FIXME: Need to find a way to avoid use of getNullValue here.
8622 SmallVector<Constant *> Vals;
8623 unsigned VF = VL.size();
8624 if (MaskVF != 0)
8625 VF = std::min(VF, MaskVF);
8626 for (Value *V : VL.take_front(VF)) {
8627 if (isa<UndefValue>(V)) {
8628 Vals.push_back(cast<Constant>(V));
8629 continue;
8630 }
8631 Vals.push_back(Constant::getNullValue(V->getType()));
8632 }
8633 return ConstantVector::get(Vals);
8634 }
8635 return ConstantVector::getSplat(
8636 ElementCount::getFixed(
8637 cast<FixedVectorType>(Root->getType())->getNumElements()),
8638 getAllOnesValue(*R.DL, VL.front()->getType()));
8639 }
8640 }
8641 /// Finalize emission of the shuffles.
8642 InstructionCost
8643 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
8644 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
8645 IsFinalized = true;
8646 if (Action) {
8647 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
8648 if (InVectors.size() == 2)
8649 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
8650 else
8651 Cost += createShuffle(Vec, nullptr, CommonMask);
8652 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8653 if (CommonMask[Idx] != PoisonMaskElem)
8654 CommonMask[Idx] = Idx;
8655 assert(VF > 0 &&
8656 "Expected vector length for the final value before action.");
8657 Value *V = Vec.get<Value *>();
8658 Action(V, CommonMask);
8659 InVectors.front() = V;
8660 }
8661 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
8662 if (CommonMask.empty()) {
8663 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
8664 return Cost;
8665 }
8666 return Cost +
8667 createShuffle(InVectors.front(),
8668 InVectors.size() == 2 ? InVectors.back() : nullptr,
8669 CommonMask);
8670 }
8671
8672 ~ShuffleCostEstimator() {
8673 assert((IsFinalized || CommonMask.empty()) &&
8674 "Shuffle construction must be finalized.");
8675 }
8676};
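// As used from getEntryCost() below via processBuildVector<ShuffleCostEstimator,
// InstructionCost>(...), the estimator never creates IR; it only accumulates the
// TTI cost of the shuffles it would have emitted for a gathered node.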
8677
8678const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
8679 unsigned Idx) const {
8680 Value *Op = E->getOperand(Idx).front();
8681 if (const TreeEntry *TE = getTreeEntry(Op)) {
8682 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8683 return EI.EdgeIdx == Idx && EI.UserTE == E;
8684 }) != TE->UserTreeIndices.end())
8685 return TE;
8686 auto MIt = MultiNodeScalars.find(Op);
8687 if (MIt != MultiNodeScalars.end()) {
8688 for (const TreeEntry *TE : MIt->second) {
8689 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8690 return EI.EdgeIdx == Idx && EI.UserTE == E;
8691 }) != TE->UserTreeIndices.end())
8692 return TE;
8693 }
8694 }
8695 }
8696 const auto *It =
8697 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8698 return TE->State == TreeEntry::NeedToGather &&
8699 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8700 return EI.EdgeIdx == Idx && EI.UserTE == E;
8701 }) != TE->UserTreeIndices.end();
8702 });
8703 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
8704 return It->get();
8705}
8706
8707TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
8708 if (TE.State == TreeEntry::ScatterVectorize ||
8709 TE.State == TreeEntry::StridedVectorize)
8710 return TTI::CastContextHint::GatherScatter;
8711 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
8712 !TE.isAltShuffle()) {
8713 if (TE.ReorderIndices.empty())
8714 return TTI::CastContextHint::Normal;
8715 SmallVector<int> Mask;
8716 inversePermutation(TE.ReorderIndices, Mask);
8717 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
8718 return TTI::CastContextHint::Reversed;
8719 }
8720 return TTI::CastContextHint::None;
8721}
8722
8723/// Builds the arguments types vector for the given call instruction with the
8724/// given \p ID for the specified vector factor.
8725 static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
8726 const Intrinsic::ID ID,
8727 const unsigned VF,
8728 unsigned MinBW) {
8729 SmallVector<Type *> ArgTys;
8730 for (auto [Idx, Arg] : enumerate(CI->args())) {
8731 if (ID != Intrinsic::not_intrinsic) {
8732 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
8733 ArgTys.push_back(Arg->getType());
8734 continue;
8735 }
8736 if (MinBW > 0) {
8737 ArgTys.push_back(FixedVectorType::get(
8738 IntegerType::get(CI->getContext(), MinBW), VF));
8739 continue;
8740 }
8741 }
8742 ArgTys.push_back(FixedVectorType::get(Arg->getType(), VF));
8743 }
8744 return ArgTys;
8745}
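// For illustration (hypothetical operand types): with VF == 4 and MinBW == 0,
// a call taking (float, i32) is widened to { <4 x float>, <4 x i32> }; operands
// the intrinsic requires to stay scalar keep their original type, and a
// non-zero MinBW switches the remaining widened operands to <4 x iMinBW>.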
8746
8747InstructionCost
8748BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
8749 SmallPtrSetImpl<Value *> &CheckedExtracts) {
8750 ArrayRef<Value *> VL = E->Scalars;
8751
8752 Type *ScalarTy = VL[0]->getType();
8753 if (E->State != TreeEntry::NeedToGather) {
8754 if (auto *SI = dyn_cast<StoreInst>(VL[0]))
8755 ScalarTy = SI->getValueOperand()->getType();
8756 else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
8757 ScalarTy = CI->getOperand(0)->getType();
8758 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
8759 ScalarTy = IE->getOperand(1)->getType();
8760 }
8761 if (!isValidElementType(ScalarTy))
8762 return InstructionCost::getInvalid();
8763 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
8764 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8765
8766 // If we have computed a smaller type for the expression, update VecTy so
8767 // that the costs will be accurate.
8768 auto It = MinBWs.find(E);
8769 Type *OrigScalarTy = ScalarTy;
8770 if (It != MinBWs.end()) {
8771 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
8772 VecTy = FixedVectorType::get(ScalarTy, VL.size());
8773 }
8774 unsigned EntryVF = E->getVectorFactor();
8775 auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
8776
8777 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
8778 if (E->State == TreeEntry::NeedToGather) {
8779 if (allConstant(VL))
8780 return 0;
8781 if (isa<InsertElementInst>(VL[0]))
8782 return InstructionCost::getInvalid();
8783 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
8784 E, *TTI, VectorizedVals, *this, CheckedExtracts);
8785 }
8786 InstructionCost CommonCost = 0;
8787 SmallVector<int> Mask;
8788 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
8789 if (!E->ReorderIndices.empty() &&
8790 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
8791 SmallVector<int> NewMask;
8792 if (E->getOpcode() == Instruction::Store) {
8793 // For stores the order is actually a mask.
8794 NewMask.resize(E->ReorderIndices.size());
8795 copy(E->ReorderIndices, NewMask.begin());
8796 } else {
8797 inversePermutation(E->ReorderIndices, NewMask);
8798 }
8799 ::addMask(Mask, NewMask);
8800 }
8801 if (NeedToShuffleReuses)
8802 ::addMask(Mask, E->ReuseShuffleIndices);
8803 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
8804 CommonCost =
8805 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
8806 assert((E->State == TreeEntry::Vectorize ||
8807 E->State == TreeEntry::ScatterVectorize ||
8808 E->State == TreeEntry::StridedVectorize) &&
8809 "Unhandled state");
8810 assert(E->getOpcode() &&
8811 ((allSameType(VL) && allSameBlock(VL)) ||
8812 (E->getOpcode() == Instruction::GetElementPtr &&
8813 E->getMainOp()->getType()->isPointerTy())) &&
8814 "Invalid VL");
8815 Instruction *VL0 = E->getMainOp();
8816 unsigned ShuffleOrOp =
8817 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
8818 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
8819 const unsigned Sz = UniqueValues.size();
8820 SmallBitVector UsedScalars(Sz, false);
8821 for (unsigned I = 0; I < Sz; ++I) {
8822 if (getTreeEntry(UniqueValues[I]) == E)
8823 continue;
8824 UsedScalars.set(I);
8825 }
8826 auto GetCastContextHint = [&](Value *V) {
8827 if (const TreeEntry *OpTE = getTreeEntry(V))
8828 return getCastContextHint(*OpTE);
8829 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
8830 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
8831 return TTI::CastContextHint::GatherScatter;
8832 return TTI::CastContextHint::None;
8833 };
8834 auto GetCostDiff =
8835 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
8836 function_ref<InstructionCost(InstructionCost)> VectorCost) {
8837 // Calculate the cost of this instruction.
8838 InstructionCost ScalarCost = 0;
8839 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
8840 // For some of the instructions no need to calculate cost for each
8841 // particular instruction, we can use the cost of the single
8842 // instruction x total number of scalar instructions.
8843 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
8844 } else {
8845 for (unsigned I = 0; I < Sz; ++I) {
8846 if (UsedScalars.test(I))
8847 continue;
8848 ScalarCost += ScalarEltCost(I);
8849 }
8850 }
8851
8852 InstructionCost VecCost = VectorCost(CommonCost);
8853 // Check if the current node must be resized, if the parent node is not
8854 // resized.
8855 if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
8856 const EdgeInfo &EI = E->UserTreeIndices.front();
8857 if ((EI.UserTE->getOpcode() != Instruction::Select ||
8858 EI.EdgeIdx != 0) &&
8859 It != MinBWs.end()) {
8860 auto UserBWIt = MinBWs.find(EI.UserTE);
8861 Type *UserScalarTy =
8862 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
8863 if (UserBWIt != MinBWs.end())
8864 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
8865 UserBWIt->second.first);
8866 if (ScalarTy != UserScalarTy) {
8867 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
8868 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
8869 unsigned VecOpcode;
8870 auto *UserVecTy =
8871 FixedVectorType::get(UserScalarTy, E->getVectorFactor());
8872 if (BWSz > SrcBWSz)
8873 VecOpcode = Instruction::Trunc;
8874 else
8875 VecOpcode =
8876 It->second.second ? Instruction::SExt : Instruction::ZExt;
8877 TTI::CastContextHint CCH = GetCastContextHint(VL0);
8878 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
8879 CostKind);
8880 }
8881 }
8882 }
8883 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
8884 ScalarCost, "Calculated costs for Tree"));
8885 return VecCost - ScalarCost;
8886 };
8887 // Calculate cost difference from vectorizing set of GEPs.
8888 // Negative value means vectorizing is profitable.
8889 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
8890 assert((E->State == TreeEntry::Vectorize ||
8891 E->State == TreeEntry::StridedVectorize) &&
8892 "Entry state expected to be Vectorize or StridedVectorize here.");
8893 InstructionCost ScalarCost = 0;
8894 InstructionCost VecCost = 0;
8895 std::tie(ScalarCost, VecCost) = getGEPCosts(
8896 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
8897 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
8898 "Calculated GEPs cost for Tree"));
8899
8900 return VecCost - ScalarCost;
8901 };
8902
8903 switch (ShuffleOrOp) {
8904 case Instruction::PHI: {
8905 // Count reused scalars.
8906 InstructionCost ScalarCost = 0;
8907 SmallPtrSet<const TreeEntry *, 4> CountedOps;
8908 for (Value *V : UniqueValues) {
8909 auto *PHI = dyn_cast<PHINode>(V);
8910 if (!PHI)
8911 continue;
8912
8913 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
8914 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
8915 Value *Op = PHI->getIncomingValue(I);
8916 Operands[I] = Op;
8917 }
8918 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
8919 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
8920 if (!OpTE->ReuseShuffleIndices.empty())
8921 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
8922 OpTE->Scalars.size());
8923 }
8924
8925 return CommonCost - ScalarCost;
8926 }
8927 case Instruction::ExtractValue:
8928 case Instruction::ExtractElement: {
8929 auto GetScalarCost = [&](unsigned Idx) {
8930 auto *I = cast<Instruction>(UniqueValues[Idx]);
8931 VectorType *SrcVecTy;
8932 if (ShuffleOrOp == Instruction::ExtractElement) {
8933 auto *EE = cast<ExtractElementInst>(I);
8934 SrcVecTy = EE->getVectorOperandType();
8935 } else {
8936 auto *EV = cast<ExtractValueInst>(I);
8937 Type *AggregateTy = EV->getAggregateOperand()->getType();
8938 unsigned NumElts;
8939 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
8940 NumElts = ATy->getNumElements();
8941 else
8942 NumElts = AggregateTy->getStructNumElements();
8943 SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts);
8944 }
8945 if (I->hasOneUse()) {
8946 Instruction *Ext = I->user_back();
8947 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
8948 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8949 // Use getExtractWithExtendCost() to calculate the cost of
8950 // extractelement/ext pair.
8951 InstructionCost Cost = TTI->getExtractWithExtendCost(
8952 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
8953 // Subtract the cost of s|zext which is subtracted separately.
8954 Cost -= TTI->getCastInstrCost(
8955 Ext->getOpcode(), Ext->getType(), I->getType(),
8956 TTI::getCastContextHint(Ext), CostKind, Ext);
8957 return Cost;
8958 }
8959 }
8960 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
8961 CostKind, *getExtractIndex(I));
8962 };
8963 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
8964 return GetCostDiff(GetScalarCost, GetVectorCost);
8965 }
8966 case Instruction::InsertElement: {
8967 assert(E->ReuseShuffleIndices.empty() &&
8968 "Unique insertelements only are expected.");
8969 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
8970 unsigned const NumElts = SrcVecTy->getNumElements();
8971 unsigned const NumScalars = VL.size();
8972
8973 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
8974
8975 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
8976 unsigned OffsetBeg = *getInsertIndex(VL.front());
8977 unsigned OffsetEnd = OffsetBeg;
8978 InsertMask[OffsetBeg] = 0;
8979 for (auto [I, V] : enumerate(VL.drop_front())) {
8980 unsigned Idx = *getInsertIndex(V);
8981 if (OffsetBeg > Idx)
8982 OffsetBeg = Idx;
8983 else if (OffsetEnd < Idx)
8984 OffsetEnd = Idx;
8985 InsertMask[Idx] = I + 1;
8986 }
8987 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
8988 if (NumOfParts > 0)
8989 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
8990 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
8991 VecScalarsSz;
8992 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
8993 unsigned InsertVecSz = std::min<unsigned>(
8994 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
8995 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
8996 bool IsWholeSubvector =
8997 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
8998 // Check if we can safely insert a subvector. If it is not possible, just
8999 // generate a whole-sized vector and shuffle the source vector and the new
9000 // subvector.
9001 if (OffsetBeg + InsertVecSz > VecSz) {
9002 // Align OffsetBeg to generate correct mask.
9003 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
9004 InsertVecSz = VecSz;
9005 }
9006
9007 APInt DemandedElts = APInt::getZero(NumElts);
9008 // TODO: Add support for Instruction::InsertValue.
9009 SmallVector<int> Mask;
9010 if (!E->ReorderIndices.empty()) {
9011 inversePermutation(E->ReorderIndices, Mask);
9012 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
9013 } else {
9014 Mask.assign(VecSz, PoisonMaskElem);
9015 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9016 }
9017 bool IsIdentity = true;
9018 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9019 Mask.swap(PrevMask);
9020 for (unsigned I = 0; I < NumScalars; ++I) {
9021 unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
9022 DemandedElts.setBit(InsertIdx);
9023 IsIdentity &= InsertIdx - OffsetBeg == I;
9024 Mask[InsertIdx - OffsetBeg] = I;
9025 }
9026 assert(Offset < NumElts && "Failed to find vector index offset");
9027
9029 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
9030 /*Insert*/ true, /*Extract*/ false,
9031 CostKind);
9032
9033 // First cost - resize to actual vector size if not identity shuffle or
9034 // need to shift the vector.
9035 // Do not calculate the cost if the actual size is the register size and
9036 // we can merge this shuffle with the following SK_Select.
9037 auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz);
9038 if (!IsIdentity)
9039 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
9040 InsertVecTy, Mask);
9041 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9042 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9043 }));
9044 // Second cost - permutation with subvector, if some elements are from the
9045 // initial vector or inserting a subvector.
9046 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9047 // subvector of ActualVecTy.
9048 SmallBitVector InMask =
9049 isUndefVector(FirstInsert->getOperand(0),
9050 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9051 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9052 if (InsertVecSz != VecSz) {
9053 auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz);
9054 Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
9055 std::nullopt, CostKind, OffsetBeg - Offset,
9056 InsertVecTy);
9057 } else {
9058 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9059 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
9060 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9061 I <= End; ++I)
9062 if (Mask[I] != PoisonMaskElem)
9063 Mask[I] = I + VecSz;
9064 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9065 Mask[I] =
9066 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
9067 Cost +=
9068 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
9069 }
9070 }
9071 return Cost;
9072 }
9073 case Instruction::ZExt:
9074 case Instruction::SExt:
9075 case Instruction::FPToUI:
9076 case Instruction::FPToSI:
9077 case Instruction::FPExt:
9078 case Instruction::PtrToInt:
9079 case Instruction::IntToPtr:
9080 case Instruction::SIToFP:
9081 case Instruction::UIToFP:
9082 case Instruction::Trunc:
9083 case Instruction::FPTrunc:
9084 case Instruction::BitCast: {
9085 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9086 Type *SrcScalarTy = VL0->getOperand(0)->getType();
9087 auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9088 unsigned Opcode = ShuffleOrOp;
9089 unsigned VecOpcode = Opcode;
9090 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9091 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9092 // Check if the values are candidates to demote.
9093 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
9094 if (SrcIt != MinBWs.end()) {
9095 SrcBWSz = SrcIt->second.first;
9096 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
9097 SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9098 }
9099 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9100 if (BWSz == SrcBWSz) {
9101 VecOpcode = Instruction::BitCast;
9102 } else if (BWSz < SrcBWSz) {
9103 VecOpcode = Instruction::Trunc;
9104 } else if (It != MinBWs.end()) {
9105 assert(BWSz > SrcBWSz && "Invalid cast!");
9106 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9107 } else if (SrcIt != MinBWs.end()) {
9108 assert(BWSz > SrcBWSz && "Invalid cast!");
9109 VecOpcode =
9110 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9111 }
9112 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9113 !SrcIt->second.second) {
9114 VecOpcode = Instruction::UIToFP;
9115 }
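// Illustrative only (assumed bit widths, not from a real MinBWs entry): if
// the source scalars were demoted to i8 (SrcBWSz = 8) while BWSz = 32, the
// vector cast stays an extension (SExt or ZExt depending on the recorded
// signedness); with SrcBWSz = 32 and BWSz = 8 it becomes a Trunc; and when
// both widths end up equal, it degenerates to a BitCast, which the vector
// cost callback below treats as a no-op.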
9116 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9117 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9118 return TTI->getCastInstrCost(Opcode, VL0->getType(),
9119 VL0->getOperand(0)->getType(),
9120 TTI::getCastContextHint(VI), CostKind, VI);
9121 };
9122 auto GetVectorCost = [=](InstructionCost CommonCost) {
9123 // Do not count cost here if minimum bitwidth is in effect and it is just
9124 // a bitcast (here it is just a noop).
9125 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9126 return CommonCost;
9127 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9128 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
9129 return CommonCost +
9130 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
9131 VecOpcode == Opcode ? VI : nullptr);
9132 };
9133 return GetCostDiff(GetScalarCost, GetVectorCost);
9134 }
9135 case Instruction::FCmp:
9136 case Instruction::ICmp:
9137 case Instruction::Select: {
9138 CmpInst::Predicate VecPred, SwappedVecPred;
9139 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
9140 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
9141 match(VL0, MatchCmp))
9142 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
9143 else
9144 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9145 ? CmpInst::BAD_FCMP_PREDICATE
9146 : CmpInst::BAD_ICMP_PREDICATE;
9147 auto GetScalarCost = [&](unsigned Idx) {
9148 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9149 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9150 ? CmpInst::BAD_FCMP_PREDICATE
9151 : CmpInst::BAD_ICMP_PREDICATE;
9152 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
9153 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
9154 !match(VI, MatchCmp)) ||
9155 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9156 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9157 ? CmpInst::BAD_FCMP_PREDICATE
9158 : CmpInst::BAD_ICMP_PREDICATE;
9159 
9160 return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy,
9161 Builder.getInt1Ty(), CurrentPred, CostKind,
9162 VI);
9163 };
9164 auto GetVectorCost = [&](InstructionCost CommonCost) {
9165 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9166 
9167 InstructionCost VecCost = TTI->getCmpSelInstrCost(
9168 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9169 // Check if it is possible and profitable to use min/max for selects
9170 // in VL.
9171 //
9172 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
9173 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
9174 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
9175 {VecTy, VecTy});
9176 InstructionCost IntrinsicCost =
9177 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9178 // If the selects are the only uses of the compares, the compares will be
9179 // dead and we can subtract their cost.
9180 if (IntrinsicAndUse.second)
9181 IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
9182 MaskTy, VecPred, CostKind);
9183 VecCost = std::min(VecCost, IntrinsicCost);
9184 }
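// For illustration (assuming the target prices the intrinsic cheaper): a
// bundle of selects of the form "a < b ? a : b" can be matched by
// canConvertToMinOrMaxIntrinsic as an smin, in which case the vector cost
// above is replaced by the cost of one llvm.smin call and, if the compares
// have no other uses, the compare cost is subtracted as well.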
9185 return VecCost + CommonCost;
9186 };
9187 return GetCostDiff(GetScalarCost, GetVectorCost);
9188 }
9189 case Instruction::FNeg:
9190 case Instruction::Add:
9191 case Instruction::FAdd:
9192 case Instruction::Sub:
9193 case Instruction::FSub:
9194 case Instruction::Mul:
9195 case Instruction::FMul:
9196 case Instruction::UDiv:
9197 case Instruction::SDiv:
9198 case Instruction::FDiv:
9199 case Instruction::URem:
9200 case Instruction::SRem:
9201 case Instruction::FRem:
9202 case Instruction::Shl:
9203 case Instruction::LShr:
9204 case Instruction::AShr:
9205 case Instruction::And:
9206 case Instruction::Or:
9207 case Instruction::Xor: {
9208 auto GetScalarCost = [&](unsigned Idx) {
9209 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9210 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9211 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
9212 TTI::OperandValueInfo Op2Info =
9213 TTI::getOperandInfo(VI->getOperand(OpIdx));
9214 SmallVector<const Value *> Operands(VI->operand_values());
9215 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
9216 Op1Info, Op2Info, Operands, VI);
9217 };
9218 auto GetVectorCost = [=](InstructionCost CommonCost) {
9219 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9220 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
9221 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
9222 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
9223 Op2Info, std::nullopt, nullptr, TLI) +
9224 CommonCost;
9225 };
9226 return GetCostDiff(GetScalarCost, GetVectorCost);
9227 }
9228 case Instruction::GetElementPtr: {
9229 return CommonCost + GetGEPCostDiff(VL, VL0);
9230 }
9231 case Instruction::Load: {
9232 auto GetScalarCost = [&](unsigned Idx) {
9233 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
9234 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
9235 VI->getAlign(), VI->getPointerAddressSpace(),
9236 CostKind, TTI::OperandValueInfo(), VI);
9237 };
9238 auto *LI0 = cast<LoadInst>(VL0);
9239 auto GetVectorCost = [&](InstructionCost CommonCost) {
9240 InstructionCost VecLdCost;
9241 if (E->State == TreeEntry::Vectorize) {
9242 VecLdCost = TTI->getMemoryOpCost(
9243 Instruction::Load, VecTy, LI0->getAlign(),
9244 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
9245 } else if (E->State == TreeEntry::StridedVectorize) {
9246 Align CommonAlignment =
9247 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9248 VecLdCost = TTI->getStridedMemoryOpCost(
9249 Instruction::Load, VecTy, LI0->getPointerOperand(),
9250 /*VariableMask=*/false, CommonAlignment, CostKind);
9251 } else {
9252 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9253 Align CommonAlignment =
9254 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9255 VecLdCost = TTI->getGatherScatterOpCost(
9256 Instruction::Load, VecTy, LI0->getPointerOperand(),
9257 /*VariableMask=*/false, CommonAlignment, CostKind);
9258 }
9259 return VecLdCost + CommonCost;
9260 };
9261
9262 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9263 // If this node generates a masked gather load then it is not a terminal
9264 // node, hence the address operand cost is estimated separately.
9265 if (E->State == TreeEntry::ScatterVectorize)
9266 return Cost;
9267
9268 // Estimate cost of GEPs since this tree node is a terminator.
9269 SmallVector<Value *> PointerOps(VL.size());
9270 for (auto [I, V] : enumerate(VL))
9271 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
9272 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9273 }
9274 case Instruction::Store: {
9275 bool IsReorder = !E->ReorderIndices.empty();
9276 auto GetScalarCost = [=](unsigned Idx) {
9277 auto *VI = cast<StoreInst>(VL[Idx]);
9278 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
9279 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
9280 VI->getAlign(), VI->getPointerAddressSpace(),
9281 CostKind, OpInfo, VI);
9282 };
9283 auto *BaseSI =
9284 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9285 auto GetVectorCost = [=](InstructionCost CommonCost) {
9286 // We know that we can merge the stores. Calculate the cost.
9287 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
9288 return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
9289 BaseSI->getPointerAddressSpace(), CostKind,
9290 OpInfo) +
9291 CommonCost;
9292 };
9293 SmallVector<Value *> PointerOps(VL.size());
9294 for (auto [I, V] : enumerate(VL)) {
9295 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9296 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
9297 }
9298
9299 return GetCostDiff(GetScalarCost, GetVectorCost) +
9300 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9301 }
9302 case Instruction::Call: {
9303 auto GetScalarCost = [&](unsigned Idx) {
9304 auto *CI = cast<CallInst>(UniqueValues[Idx]);
9305 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9306 if (ID != Intrinsic::not_intrinsic) {
9307 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9308 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9309 }
9310 return TTI->getCallInstrCost(CI->getCalledFunction(),
9311 CI->getFunctionType()->getReturnType(),
9312 CI->getFunctionType()->params(), CostKind);
9313 };
9314 auto GetVectorCost = [=](InstructionCost CommonCost) {
9315 auto *CI = cast<CallInst>(VL0);
9316 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9317 SmallVector<Type *> ArgTys =
9318 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
9319 It != MinBWs.end() ? It->second.first : 0);
9320 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9321 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9322 };
9323 return GetCostDiff(GetScalarCost, GetVectorCost);
9324 }
9325 case Instruction::ShuffleVector: {
9326 assert(E->isAltShuffle() &&
9327 ((Instruction::isBinaryOp(E->getOpcode()) &&
9328 Instruction::isBinaryOp(E->getAltOpcode())) ||
9329 (Instruction::isCast(E->getOpcode()) &&
9330 Instruction::isCast(E->getAltOpcode())) ||
9331 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9332 "Invalid Shuffle Vector Operand");
9333 // Try to find the previous shuffle node with the same operands and same
9334 // main/alternate ops.
9335 auto TryFindNodeWithEqualOperands = [=]() {
9336 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9337 if (TE.get() == E)
9338 break;
9339 if (TE->isAltShuffle() &&
9340 ((TE->getOpcode() == E->getOpcode() &&
9341 TE->getAltOpcode() == E->getAltOpcode()) ||
9342 (TE->getOpcode() == E->getAltOpcode() &&
9343 TE->getAltOpcode() == E->getOpcode())) &&
9344 TE->hasEqualOperands(*E))
9345 return true;
9346 }
9347 return false;
9348 };
9349 auto GetScalarCost = [&](unsigned Idx) {
9350 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9351 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9352 (void)E;
9353 return TTI->getInstructionCost(VI, CostKind);
9354 };
9355 // Need to clear CommonCost since the final shuffle cost is included in the
9356 // vector cost.
9357 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9358 // VecCost is equal to sum of the cost of creating 2 vectors
9359 // and the cost of creating shuffle.
9360 InstructionCost VecCost = 0;
9361 if (TryFindNodeWithEqualOperands()) {
9362 LLVM_DEBUG({
9363 dbgs() << "SLP: diamond match for alternate node found.\n";
9364 E->dump();
9365 });
9366 // No need to add new vector costs here since we're going to reuse
9367 // same main/alternate vector ops, just do different shuffling.
9368 } else if (Instruction::isBinaryOp(E->getOpcode())) {
9369 VecCost =
9370 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
9371 VecCost +=
9372 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
9373 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9374 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9375 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9376 CI0->getPredicate(), CostKind, VL0);
9377 VecCost += TTIRef.getCmpSelInstrCost(
9378 E->getOpcode(), VecTy, MaskTy,
9379 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
9380 E->getAltOp());
9381 } else {
9382 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9383 auto *SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9384 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9385 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9386 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9387 unsigned SrcBWSz =
9388 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9389 if (SrcIt != MinBWs.end()) {
9390 SrcBWSz = SrcIt->second.first;
9391 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
9392 SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9393 }
9394 if (BWSz <= SrcBWSz) {
9395 if (BWSz < SrcBWSz)
9396 VecCost =
9397 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9399 LLVM_DEBUG({
9400 dbgs()
9401 << "SLP: alternate extension, which should be truncated.\n";
9402 E->dump();
9403 });
9404 return VecCost;
9405 }
9406 }
9407 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9409 VecCost +=
9410 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9412 }
9413 SmallVector<int> Mask;
9414 E->buildAltOpShuffleMask(
9415 [E](Instruction *I) {
9416 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9417 return I->getOpcode() == E->getAltOpcode();
9418 },
9419 Mask);
9421 FinalVecTy, Mask);
9422 // Patterns like [fadd,fsub] can be combined into a single instruction
9423 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9424 // need to take into account their order when looking for the most used
9425 // order.
9426 unsigned Opcode0 = E->getOpcode();
9427 unsigned Opcode1 = E->getAltOpcode();
9428 // The opcode mask selects between the two opcodes.
9429 SmallBitVector OpcodeMask(E->Scalars.size(), false);
9430 for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
9431 if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
9432 OpcodeMask.set(Lane);
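// For example (hypothetical lanes, assuming the main opcode is FAdd and the
// alternate opcode is FSub): scalars {fadd, fsub, fadd, fsub} give
// OpcodeMask = {0, 1, 0, 1}; targets such as x86 may lower this shape to a
// single addsub-style instruction, which is why the order is kept.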
9433 // If this pattern is supported by the target then we consider the
9434 // order.
9435 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9436 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9437 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9438 return AltVecCost < VecCost ? AltVecCost : VecCost;
9439 }
9440 // TODO: Check the reverse order too.
9441 return VecCost;
9442 };
9443 return GetCostDiff(GetScalarCost, GetVectorCost);
9444 }
9445 default:
9446 llvm_unreachable("Unknown instruction");
9447 }
9448}
9449
9450bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9451 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
9452 << VectorizableTree.size() << " is fully vectorizable.\n");
9453
9454 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
9455 SmallVector<int> Mask;
9456 return TE->State == TreeEntry::NeedToGather &&
9457 !any_of(TE->Scalars,
9458 [this](Value *V) { return EphValues.contains(V); }) &&
9459 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
9460 TE->Scalars.size() < Limit ||
9461 ((TE->getOpcode() == Instruction::ExtractElement ||
9462 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
9463 isFixedVectorShuffle(TE->Scalars, Mask)) ||
9464 (TE->State == TreeEntry::NeedToGather &&
9465 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
9466 };
9467
9468 // We only handle trees of heights 1 and 2.
9469 if (VectorizableTree.size() == 1 &&
9470 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9471 (ForReduction &&
9472 AreVectorizableGathers(VectorizableTree[0].get(),
9473 VectorizableTree[0]->Scalars.size()) &&
9474 VectorizableTree[0]->getVectorFactor() > 2)))
9475 return true;
9476
9477 if (VectorizableTree.size() != 2)
9478 return false;
9479
9480 // Handle splat and all-constants stores. Also try to vectorize tiny trees
9481 // with a second gather node if it has fewer scalar operands than the
9482 // initial tree element (it may be profitable to shuffle the second gather)
9483 // or its scalars are extractelements, which form a shuffle.
9485 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9486 AreVectorizableGathers(VectorizableTree[1].get(),
9487 VectorizableTree[0]->Scalars.size()))
9488 return true;
9489
9490 // Gathering cost would be too much for tiny trees.
9491 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9492 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9493 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9494 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9495 return false;
9496
9497 return true;
9498}
9499
9500 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
9501 TargetTransformInfo *TTI,
9502 bool MustMatchOrInst) {
9503 // Look past the root to find a source value. Arbitrarily follow the
9504 // path through operand 0 of any 'or'. Also, peek through optional
9505 // shift-left-by-multiple-of-8-bits.
9506 Value *ZextLoad = Root;
9507 const APInt *ShAmtC;
9508 bool FoundOr = false;
9509 while (!isa<ConstantExpr>(ZextLoad) &&
9510 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
9511 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
9512 ShAmtC->urem(8) == 0))) {
9513 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9514 ZextLoad = BinOp->getOperand(0);
9515 if (BinOp->getOpcode() == Instruction::Or)
9516 FoundOr = true;
9517 }
9518 // Check if the input is an extended load of the required or/shift expression.
9519 Value *Load;
9520 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9521 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
9522 return false;
9523
9524 // Require that the total load bit width is a legal integer type.
9525 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
9526 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
9527 Type *SrcTy = Load->getType();
9528 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
9529 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
9530 return false;
9531
9532 // Everything matched - assume that we can fold the whole sequence using
9533 // load combining.
9534 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
9535 << *(cast<Instruction>(Root)) << "\n");
9536
9537 return true;
9538}
9539 
9540 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
9541 if (RdxKind != RecurKind::Or)
9542 return false;
9543
9544 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9545 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9546 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
9547 /* MatchOr */ false);
9548}
9549
9551 // Peek through a final sequence of stores and check if all operations are
9552 // likely to be load-combined.
9553 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9554 for (Value *Scalar : VectorizableTree[0]->Scalars) {
9555 Value *X;
9556 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
9557 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
9558 return false;
9559 }
9560 return true;
9561}
9562
9563bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
9564 // No need to vectorize inserts of gathered values.
9565 if (VectorizableTree.size() == 2 &&
9566 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9567 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9568 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9569 !(isSplat(VectorizableTree[1]->Scalars) ||
9570 allConstant(VectorizableTree[1]->Scalars))))
9571 return true;
9572
9573 // If the graph includes only PHI nodes and gathers, it is definitely not
9574 // profitable to vectorize, so we can skip it if the cost threshold is the
9575 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
9576 // gathers/buildvectors.
9577 constexpr int Limit = 4;
9578 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
9579 !VectorizableTree.empty() &&
9580 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9581 return (TE->State == TreeEntry::NeedToGather &&
9582 TE->getOpcode() != Instruction::ExtractElement &&
9583 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9584 TE->getOpcode() == Instruction::PHI;
9585 }))
9586 return true;
9587
9588 // We can vectorize the tree if its size is greater than or equal to the
9589 // minimum size specified by the MinTreeSize command line option.
9590 if (VectorizableTree.size() >= MinTreeSize)
9591 return false;
9592
9593 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
9594 // can vectorize it if we can prove it fully vectorizable.
9595 if (isFullyVectorizableTinyTree(ForReduction))
9596 return false;
9597
9598 // Check if any of the gather node forms an insertelement buildvector
9599 // somewhere.
9600 bool IsAllowedSingleBVNode =
9601 VectorizableTree.size() > 1 ||
9602 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9603 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9604 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9605 allSameBlock(VectorizableTree.front()->Scalars));
9606 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9607 return TE->State == TreeEntry::NeedToGather &&
9608 all_of(TE->Scalars, [&](Value *V) {
9609 return isa<ExtractElementInst, UndefValue>(V) ||
9610 (IsAllowedSingleBVNode &&
9611 !V->hasNUsesOrMore(UsesLimit) &&
9612 any_of(V->users(), IsaPred<InsertElementInst>));
9613 });
9614 }))
9615 return false;
9616
9617 assert(VectorizableTree.empty()
9618 ? ExternalUses.empty()
9619 : true && "We shouldn't have any external users");
9620
9621 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
9622 // vectorizable.
9623 return true;
9624}
9625 
9626 InstructionCost BoUpSLP::getSpillCost() const {
9627 // Walk from the bottom of the tree to the top, tracking which values are
9628 // live. When we see a call instruction that is not part of our tree,
9629 // query TTI to see if there is a cost to keeping values live over it
9630 // (for example, if spills and fills are required).
9631 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9632 InstructionCost Cost = 0;
9633 
9634 SmallPtrSet<Instruction *, 4> LiveValues;
9635 Instruction *PrevInst = nullptr;
9636
9637 // The entries in VectorizableTree are not necessarily ordered by their
9638 // position in basic blocks. Collect them and order them by dominance so later
9639 // instructions are guaranteed to be visited first. For instructions in
9640 // different basic blocks, we only scan to the beginning of the block, so
9641 // their order does not matter, as long as all instructions in a basic block
9642 // are grouped together. Using dominance ensures a deterministic order.
9643 SmallVector<Instruction *, 16> OrderedScalars;
9644 for (const auto &TEPtr : VectorizableTree) {
9645 if (TEPtr->State != TreeEntry::Vectorize)
9646 continue;
9647 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9648 if (!Inst)
9649 continue;
9650 OrderedScalars.push_back(Inst);
9651 }
9652 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
9653 auto *NodeA = DT->getNode(A->getParent());
9654 auto *NodeB = DT->getNode(B->getParent());
9655 assert(NodeA && "Should only process reachable instructions");
9656 assert(NodeB && "Should only process reachable instructions");
9657 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9658 "Different nodes should have different DFS numbers");
9659 if (NodeA != NodeB)
9660 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9661 return B->comesBefore(A);
9662 });
9663
9664 for (Instruction *Inst : OrderedScalars) {
9665 if (!PrevInst) {
9666 PrevInst = Inst;
9667 continue;
9668 }
9669
9670 // Update LiveValues.
9671 LiveValues.erase(PrevInst);
9672 for (auto &J : PrevInst->operands()) {
9673 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
9674 LiveValues.insert(cast<Instruction>(&*J));
9675 }
9676
9677 LLVM_DEBUG({
9678 dbgs() << "SLP: #LV: " << LiveValues.size();
9679 for (auto *X : LiveValues)
9680 dbgs() << " " << X->getName();
9681 dbgs() << ", Looking at ";
9682 Inst->dump();
9683 });
9684
9685 // Now find the sequence of instructions between PrevInst and Inst.
9686 unsigned NumCalls = 0;
9687 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
9688 PrevInstIt =
9689 PrevInst->getIterator().getReverse();
9690 while (InstIt != PrevInstIt) {
9691 if (PrevInstIt == PrevInst->getParent()->rend()) {
9692 PrevInstIt = Inst->getParent()->rbegin();
9693 continue;
9694 }
9695
9696 auto NoCallIntrinsic = [this](Instruction *I) {
9697 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
9698 if (II->isAssumeLikeIntrinsic())
9699 return true;
9700 FastMathFlags FMF;
9701 SmallVector<Type *, 4> Tys;
9702 for (auto &ArgOp : II->args())
9703 Tys.push_back(ArgOp->getType());
9704 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
9705 FMF = FPMO->getFastMathFlags();
9706 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
9707 FMF);
9708 InstructionCost IntrCost =
9709 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
9710 InstructionCost CallCost = TTI->getCallInstrCost(
9711 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
9712 if (IntrCost < CallCost)
9713 return true;
9714 }
9715 return false;
9716 };
9717
9718 // Debug information does not impact spill cost.
9719 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
9720 &*PrevInstIt != PrevInst)
9721 NumCalls++;
9722
9723 ++PrevInstIt;
9724 }
9725
9726 if (NumCalls) {
9727 SmallVector<Type *, 4> V;
9728 for (auto *II : LiveValues) {
9729 auto *ScalarTy = II->getType();
9730 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
9731 ScalarTy = VectorTy->getElementType();
9732 V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
9733 }
9734 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
9735 }
9736
9737 PrevInst = Inst;
9738 }
9739
9740 return Cost;
9741}
9742
9743 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
9744 /// the buildvector sequence.
9745 static bool isFirstInsertElement(const InsertElementInst *IE1,
9746 const InsertElementInst *IE2) {
9747 if (IE1 == IE2)
9748 return false;
9749 const auto *I1 = IE1;
9750 const auto *I2 = IE2;
9751 const InsertElementInst *PrevI1;
9752 const InsertElementInst *PrevI2;
9753 unsigned Idx1 = *getInsertIndex(IE1);
9754 unsigned Idx2 = *getInsertIndex(IE2);
9755 do {
9756 if (I2 == IE1)
9757 return true;
9758 if (I1 == IE2)
9759 return false;
9760 PrevI1 = I1;
9761 PrevI2 = I2;
9762 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
9763 getInsertIndex(I1).value_or(Idx2) != Idx2)
9764 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
9765 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
9766 getInsertIndex(I2).value_or(Idx1) != Idx1)
9767 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
9768 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
9769 llvm_unreachable("Two different buildvectors not expected.");
9770}
9771
9772namespace {
9773 /// Returns the incoming Value *, if the requested type is Value * too, or a
9774 /// default value otherwise.
9775struct ValueSelect {
9776 template <typename U>
9777 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
9778 return V;
9779 }
9780 template <typename U>
9781 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
9782 return U();
9783 }
9784};
9785} // namespace
9786
9787 /// Does the analysis of the provided shuffle masks and performs the requested
9788 /// actions on the vectors with the given shuffle masks. It tries to do it in
9789 /// several steps (a small mask-merge sketch follows the function below).
9790 /// 1. If the Base vector is not an undef vector, resize the very first mask to
9791 /// have a common VF and perform the action for 2 input vectors (including the
9792 /// non-undef Base). Other shuffle masks are combined with the result of the
9793 /// first stage and processed as a shuffle of 2 elements.
9794 /// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
9795 /// the action only for 1 vector with the given mask, if it is not the identity
9796 /// mask.
9797 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
9798 /// vectors, combining the masks properly between the steps.
9799template <typename T>
9800 static T *performExtractsShuffleAction(
9801 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
9802 function_ref<unsigned(T *)> GetVF,
9803 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
9804 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
9805 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
9806 SmallVector<int> Mask(ShuffleMask.begin()->second);
9807 auto VMIt = std::next(ShuffleMask.begin());
9808 T *Prev = nullptr;
9809 SmallBitVector UseMask =
9810 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
9811 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
9812 if (!IsBaseUndef.all()) {
9813 // Base is not undef, need to combine it with the next subvectors.
9814 std::pair<T *, bool> Res =
9815 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
9816 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
9817 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
9818 if (Mask[Idx] == PoisonMaskElem)
9819 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
9820 else
9821 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
9822 }
9823 auto *V = ValueSelect::get<T *>(Base);
9824 (void)V;
9825 assert((!V || GetVF(V) == Mask.size()) &&
9826 "Expected base vector of VF number of elements.");
9827 Prev = Action(Mask, {nullptr, Res.first});
9828 } else if (ShuffleMask.size() == 1) {
9829 // Base is undef and only 1 vector is shuffled - perform the action only for
9830 // a single vector, if the mask is not the identity mask.
9831 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
9832 /*ForSingleMask=*/true);
9833 if (Res.second)
9834 // Identity mask is found.
9835 Prev = Res.first;
9836 else
9837 Prev = Action(Mask, {ShuffleMask.begin()->first});
9838 } else {
9839 // Base is undef and at least 2 input vectors are shuffled - perform 2-vector
9840 // shuffles step by step, combining the shuffles between the steps.
9841 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
9842 unsigned Vec2VF = GetVF(VMIt->first);
9843 if (Vec1VF == Vec2VF) {
9844 // No need to resize the input vectors since they are of the same size; we
9845 // can shuffle them directly.
9846 ArrayRef<int> SecMask = VMIt->second;
9847 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9848 if (SecMask[I] != PoisonMaskElem) {
9849 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9850 Mask[I] = SecMask[I] + Vec1VF;
9851 }
9852 }
9853 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
9854 } else {
9855 // Vectors of different sizes - resize and reshuffle.
9856 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
9857 /*ForSingleMask=*/false);
9858 std::pair<T *, bool> Res2 =
9859 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9860 ArrayRef<int> SecMask = VMIt->second;
9861 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9862 if (Mask[I] != PoisonMaskElem) {
9863 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9864 if (Res1.second)
9865 Mask[I] = I;
9866 } else if (SecMask[I] != PoisonMaskElem) {
9867 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9868 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
9869 }
9870 }
9871 Prev = Action(Mask, {Res1.first, Res2.first});
9872 }
9873 VMIt = std::next(VMIt);
9874 }
9875 bool IsBaseNotUndef = !IsBaseUndef.all();
9876 (void)IsBaseNotUndef;
9877 // Perform requested actions for the remaining masks/vectors.
9878 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
9879 // Shuffle other input vectors, if any.
9880 std::pair<T *, bool> Res =
9881 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9882 ArrayRef<int> SecMask = VMIt->second;
9883 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9884 if (SecMask[I] != PoisonMaskElem) {
9885 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
9886 "Multiple uses of scalars.");
9887 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
9888 } else if (Mask[I] != PoisonMaskElem) {
9889 Mask[I] = I;
9890 }
9891 }
9892 Prev = Action(Mask, {Prev, Res.first});
9893 }
9894 return Prev;
9895}
9896 
9897 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
9898 InstructionCost Cost = 0;
9899 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
9900 << VectorizableTree.size() << ".\n");
9901
9902 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
9903
9904 SmallPtrSet<Value *, 4> CheckedExtracts;
9905 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
9906 TreeEntry &TE = *VectorizableTree[I];
9907 if (TE.State == TreeEntry::NeedToGather) {
9908 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
9909 E && E->getVectorFactor() == TE.getVectorFactor() &&
9910 E->isSame(TE.Scalars)) {
9911 // Some gather nodes might be absolutely the same as some vectorizable
9912 // nodes after reordering; this needs to be handled.
9913 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
9914 << shortBundleName(TE.Scalars) << ".\n"
9915 << "SLP: Current total cost = " << Cost << "\n");
9916 continue;
9917 }
9918 }
9919
9920 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
9921 Cost += C;
9922 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
9923 << shortBundleName(TE.Scalars) << ".\n"
9924 << "SLP: Current total cost = " << Cost << "\n");
9925 }
9926
9927 SmallPtrSet<Value *, 16> ExtractCostCalculated;
9928 InstructionCost ExtractCost = 0;
9929 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
9930 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
9931 SmallVector<APInt> DemandedElts;
9932 SmallDenseSet<Value *, 4> UsedInserts;
9933 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
9934 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
9935 for (ExternalUser &EU : ExternalUses) {
9936 // We only add extract cost once for the same scalar.
9937 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
9938 !ExtractCostCalculated.insert(EU.Scalar).second)
9939 continue;
9940
9941 // Uses by ephemeral values are free (because the ephemeral value will be
9942 // removed prior to code generation, and so the extraction will be
9943 // removed as well).
9944 if (EphValues.count(EU.User))
9945 continue;
9946
9947 // No extract cost for vector "scalar"
9948 if (isa<FixedVectorType>(EU.Scalar->getType()))
9949 continue;
9950
9951 // If the found user is an insertelement, do not calculate the extract cost
9952 // but try to detect it as a final shuffled/identity match.
9953 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
9954 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
9955 if (!UsedInserts.insert(VU).second)
9956 continue;
9957 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
9958 if (InsertIdx) {
9959 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
9960 auto *It = find_if(
9961 FirstUsers,
9962 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
9963 return areTwoInsertFromSameBuildVector(
9964 VU, cast<InsertElementInst>(Pair.first),
9965 [this](InsertElementInst *II) -> Value * {
9966 Value *Op0 = II->getOperand(0);
9967 if (getTreeEntry(II) && !getTreeEntry(Op0))
9968 return nullptr;
9969 return Op0;
9970 });
9971 });
9972 int VecId = -1;
9973 if (It == FirstUsers.end()) {
9974 (void)ShuffleMasks.emplace_back();
9975 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
9976 if (Mask.empty())
9977 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
9978 // Find the insertvector, vectorized in tree, if any.
9979 Value *Base = VU;
9980 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
9981 if (IEBase != EU.User &&
9982 (!IEBase->hasOneUse() ||
9983 getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
9984 break;
9985 // Build the mask for the vectorized insertelement instructions.
9986 if (const TreeEntry *E = getTreeEntry(IEBase)) {
9987 VU = IEBase;
9988 do {
9989 IEBase = cast<InsertElementInst>(Base);
9990 int Idx = *getInsertIndex(IEBase);
9991 assert(Mask[Idx] == PoisonMaskElem &&
9992 "InsertElementInstruction used already.");
9993 Mask[Idx] = Idx;
9994 Base = IEBase->getOperand(0);
9995 } while (E == getTreeEntry(Base));
9996 break;
9997 }
9998 Base = cast<InsertElementInst>(Base)->getOperand(0);
9999 }
10000 FirstUsers.emplace_back(VU, ScalarTE);
10001 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10002 VecId = FirstUsers.size() - 1;
10003 auto It = MinBWs.find(ScalarTE);
10004 if (It != MinBWs.end() &&
10005 VectorCasts
10006 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10007 .second) {
10008 unsigned BWSz = It->second.first;
10009 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10010 unsigned VecOpcode;
10011 if (DstBWSz < BWSz)
10012 VecOpcode = Instruction::Trunc;
10013 else
10014 VecOpcode =
10015 It->second.second ? Instruction::SExt : Instruction::ZExt;
10018 VecOpcode, FTy,
10020 IntegerType::get(FTy->getContext(), BWSz),
10021 FTy->getNumElements()),
10023 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10024 << " for extending externally used vector with "
10025 "non-equal minimum bitwidth.\n");
10026 Cost += C;
10027 }
10028 } else {
10029 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10030 It->first = VU;
10031 VecId = std::distance(FirstUsers.begin(), It);
10032 }
10033 int InIdx = *InsertIdx;
10034 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10035 if (Mask.empty())
10036 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10037 Mask[InIdx] = EU.Lane;
10038 DemandedElts[VecId].setBit(InIdx);
10039 continue;
10040 }
10041 }
10042 }
10043 // Leave the GEPs as-is; they are free in most cases, and it is better to
10044 // keep them as GEPs.
10045 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10046 if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10047 if (!ValueToExtUses) {
10048 ValueToExtUses.emplace();
10049 for_each(enumerate(ExternalUses), [&](const auto &P) {
10050 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10051 });
10052 }
10053 // The original GEP can be used if none of its operands are vectorized or
10054 // they are already marked as externally used.
10055 bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10056 if (!getTreeEntry(V))
10057 return true;
10058 auto It = ValueToExtUses->find(V);
10059 if (It != ValueToExtUses->end()) {
10060 // Replace all uses to avoid compiler crash.
10061 ExternalUses[It->second].User = nullptr;
10062 return true;
10063 }
10064 return false;
10065 });
10066 if (CanBeUsedAsGEP) {
10067 ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10068 ExternalUsesAsGEPs.insert(EU.Scalar);
10069 continue;
10070 }
10071 }
10072
10073 // If we plan to rewrite the tree in a smaller type, we will need to sign
10074 // extend the extracted value back to the original type. Here, we account
10075 // for the extract and the added cost of the sign extend if needed.
10076 auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
10077 auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10078 if (It != MinBWs.end()) {
10079 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10080 unsigned Extend =
10081 It->second.second ? Instruction::SExt : Instruction::ZExt;
10082 VecTy = FixedVectorType::get(MinTy, BundleWidth);
10083 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10084 VecTy, EU.Lane);
10085 } else {
10086 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10087 CostKind, EU.Lane);
10088 }
10089 }
10090 // Add reduced value cost, if resized.
10091 if (!VectorizedVals.empty()) {
10092 const TreeEntry &Root = *VectorizableTree.front().get();
10093 auto BWIt = MinBWs.find(&Root);
10094 if (BWIt != MinBWs.end()) {
10095 Type *DstTy = Root.Scalars.front()->getType();
10096 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10097 unsigned SrcSz =
10098 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10099 if (OriginalSz != SrcSz) {
10100 unsigned Opcode = Instruction::Trunc;
10101 if (OriginalSz > SrcSz)
10102 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10103 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10104 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10107 }
10108 }
10109 }
10110
10111 InstructionCost SpillCost = getSpillCost();
10112 Cost += SpillCost + ExtractCost;
10113 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10114 bool) {
10115 InstructionCost C = 0;
10116 unsigned VF = Mask.size();
10117 unsigned VecVF = TE->getVectorFactor();
10118 if (VF != VecVF &&
10119 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10121 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10122 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10123 OrigMask.begin());
10124 C = TTI->getShuffleCost(
10126 FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
10127 LLVM_DEBUG(
10128 dbgs() << "SLP: Adding cost " << C
10129 << " for final shuffle of insertelement external users.\n";
10130 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10131 Cost += C;
10132 return std::make_pair(TE, true);
10133 }
10134 return std::make_pair(TE, false);
10135 };
10136 // Calculate the cost of the reshuffled vectors, if any.
10137 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10138 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10139 auto Vector = ShuffleMasks[I].takeVector();
10140 unsigned VF = 0;
10141 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10142 ArrayRef<const TreeEntry *> TEs) {
10143 assert((TEs.size() == 1 || TEs.size() == 2) &&
10144 "Expected exactly 1 or 2 tree entries.");
10145 if (TEs.size() == 1) {
10146 if (VF == 0)
10147 VF = TEs.front()->getVectorFactor();
10148 auto *FTy =
10149 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10150 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
10151 !all_of(enumerate(Mask), [=](const auto &Data) {
10152 return Data.value() == PoisonMaskElem ||
10153 (Data.index() < VF &&
10154 static_cast<int>(Data.index()) == Data.value());
10155 })) {
10158 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10159 << " for final shuffle of insertelement "
10160 "external users.\n";
10161 TEs.front()->dump();
10162 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10163 Cost += C;
10164 }
10165 } else {
10166 if (VF == 0) {
10167 if (TEs.front() &&
10168 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10169 VF = TEs.front()->getVectorFactor();
10170 else
10171 VF = Mask.size();
10172 }
10173 auto *FTy =
10174 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10177 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10178 << " for final shuffle of vector node and external "
10179 "insertelement users.\n";
10180 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10181 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10182 Cost += C;
10183 }
10184 VF = Mask.size();
10185 return TEs.back();
10186 };
10187 (void)performExtractsShuffleAction<const TreeEntry>(
10188 MutableArrayRef(Vector.data(), Vector.size()), Base,
10189 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10190 EstimateShufflesCost);
10191 InstructionCost InsertCost = TTI->getScalarizationOverhead(
10192 cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10193 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
10194 Cost -= InsertCost;
10195 }
10196
10197 // Add the cost for reduced value resize (if required).
10198 if (ReductionBitWidth != 0) {
10199 assert(UserIgnoreList && "Expected reduction tree.");
10200 const TreeEntry &E = *VectorizableTree.front().get();
10201 auto It = MinBWs.find(&E);
10202 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10203 unsigned SrcSize = It->second.first;
10204 unsigned DstSize = ReductionBitWidth;
10205 unsigned Opcode = Instruction::Trunc;
10206 if (SrcSize < DstSize)
10207 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10208 auto *SrcVecTy =
10209 FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor());
10210 auto *DstVecTy =
10211 FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor());
10212 TTI::CastContextHint CCH = getCastContextHint(E);
10213 InstructionCost CastCost;
10214 switch (E.getOpcode()) {
10215 case Instruction::SExt:
10216 case Instruction::ZExt:
10217 case Instruction::Trunc: {
10218 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10219 CCH = getCastContextHint(*OpTE);
10220 break;
10221 }
10222 default:
10223 break;
10224 }
10225 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
10227 Cost += CastCost;
10228 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10229 << " for final resize for reduction from " << SrcVecTy
10230 << " to " << DstVecTy << "\n";
10231 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10232 }
10233 }
10234
10235#ifndef NDEBUG
10236 SmallString<256> Str;
10237 {
10238 raw_svector_ostream OS(Str);
10239 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10240 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10241 << "SLP: Total Cost = " << Cost << ".\n";
10242 }
10243 LLVM_DEBUG(dbgs() << Str);
10244 if (ViewSLPTree)
10245 ViewGraph(this, "SLP" + F->getName(), false, Str);
10246#endif
10247
10248 return Cost;
10249}
10250
10251 /// Tries to find extractelement instructions with constant indices from a
10252 /// fixed vector type and gather such instructions into a bunch, which is
10253 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
10254 /// attempt was successful, the matched scalars are replaced by poison values
10255 /// in \p VL for future analysis.
10256std::optional<TTI::ShuffleKind>
10257 BoUpSLP::tryToGatherSingleRegisterExtractElements(
10258 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10259 // Scan the list of gathered scalars for extractelements that can be
10260 // represented as shuffles.
10261 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10262 SmallVector<int> UndefVectorExtracts;
10263 for (int I = 0, E = VL.size(); I < E; ++I) {
10264 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10265 if (!EI) {
10266 if (isa<UndefValue>(VL[I]))
10267 UndefVectorExtracts.push_back(I);
10268 continue;
10269 }
10270 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10271 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10272 continue;
10273 std::optional<unsigned> Idx = getExtractIndex(EI);
10274 // Undefined index.
10275 if (!Idx) {
10276 UndefVectorExtracts.push_back(I);
10277 continue;
10278 }
10279 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10280 ExtractMask.reset(*Idx);
10281 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
10282 UndefVectorExtracts.push_back(I);
10283 continue;
10284 }
10285 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10286 }
10287 // Sort the vector operands by the maximum number of uses in extractelements.
10288 MapVector<unsigned, SmallVector<Value *>> VFToVector;
10289 for (const auto &Data : VectorOpToIdx)
10290 VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
10291 .push_back(Data.first);
10292 for (auto &Data : VFToVector) {
10293 stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
10294 return VectorOpToIdx.find(V1)->second.size() >
10295 VectorOpToIdx.find(V2)->second.size();
10296 });
10297 }
10298 // Find the best pair of the vectors with the same number of elements or a
10299 // single vector.
10300 const int UndefSz = UndefVectorExtracts.size();
10301 unsigned SingleMax = 0;
10302 Value *SingleVec = nullptr;
10303 unsigned PairMax = 0;
10304 std::pair<Value *, Value *> PairVec(nullptr, nullptr);
10305 for (auto &Data : VFToVector) {
10306 Value *V1 = Data.second.front();
10307 if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
10308 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10309 SingleVec = V1;
10310 }
10311 Value *V2 = nullptr;
10312 if (Data.second.size() > 1)
10313 V2 = *std::next(Data.second.begin());
10314 if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
10315 UndefSz) {
10316 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
10317 PairVec = std::make_pair(V1, V2);
10318 }
10319 }
10320 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10321 return std::nullopt;
10322 // Check if it is better to perform a shuffle of 2 vectors or just of a
10323 // single vector.
10324 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10325 SmallVector<Value *> GatheredExtracts(
10326 VL.size(), PoisonValue::get(VL.front()->getType()));
10327 if (SingleMax >= PairMax && SingleMax) {
10328 for (int Idx : VectorOpToIdx[SingleVec])
10329 std::swap(GatheredExtracts[Idx], VL[Idx]);
10330 } else {
10331 for (Value *V : {PairVec.first, PairVec.second})
10332 for (int Idx : VectorOpToIdx[V])
10333 std::swap(GatheredExtracts[Idx], VL[Idx]);
10334 }
10335 // Add extracts from undefs too.
10336 for (int Idx : UndefVectorExtracts)
10337 std::swap(GatheredExtracts[Idx], VL[Idx]);
10338 // Check that the gather of extractelements can be represented as just a
10339 // shuffle of one or two vectors from which the scalars are extracted.
10340 std::optional<TTI::ShuffleKind> Res =
10341 isFixedVectorShuffle(GatheredExtracts, Mask);
10342 if (!Res) {
10343 // TODO: try to check other subsets if possible.
10344 // Restore the original VL if attempt was not successful.
10345 copy(SavedVL, VL.begin());
10346 return std::nullopt;
10347 }
10348 // Restore unused scalars from the mask, if some of the extractelements were
10349 // not selected for the shuffle.
10350 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10351 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
10352 isa<UndefValue>(GatheredExtracts[I])) {
10353 std::swap(VL[I], GatheredExtracts[I]);
10354 continue;
10355 }
10356 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10357 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10358 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10359 is_contained(UndefVectorExtracts, I))
10360 continue;
10361 }
10362 return Res;
10363}
10364
10365 /// Tries to find extractelement instructions with constant indices from a
10366 /// fixed vector type and gather such instructions into a bunch, which is
10367 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
10368 /// attempt was successful, the matched scalars are replaced by poison values
10369 /// in \p VL for future analysis.
10370 SmallVector<std::optional<TTI::ShuffleKind>>
10371 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10372 SmallVectorImpl<int> &Mask,
10373 unsigned NumParts) const {
10374 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
10375 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10376 Mask.assign(VL.size(), PoisonMaskElem);
10377 unsigned SliceSize = VL.size() / NumParts;
10378 for (unsigned Part = 0; Part < NumParts; ++Part) {
10379 // Scan list of gathered scalars for extractelements that can be represented
10380 // as shuffles.
10381 MutableArrayRef<Value *> SubVL =
10382 MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
10383 SmallVector<int> SubMask;
10384 std::optional<TTI::ShuffleKind> Res =
10385 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10386 ShufflesRes[Part] = Res;
10387 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
10388 }
10389 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
10390 return Res.has_value();
10391 }))
10392 ShufflesRes.clear();
10393 return ShufflesRes;
10394}
10395
10396std::optional<TargetTransformInfo::ShuffleKind>
10397BoUpSLP::isGatherShuffledSingleRegisterEntry(
10398 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10399 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10400 Entries.clear();
10401 // TODO: currently checking only for Scalars in the tree entry, need to count
10402 // reused elements too for better cost estimation.
10403 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10404 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10405 const BasicBlock *TEInsertBlock = nullptr;
10406 // Main node of PHI entries keeps the correct order of operands/incoming
10407 // blocks.
10408 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10409 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10410 TEInsertPt = TEInsertBlock->getTerminator();
10411 } else {
10412 TEInsertBlock = TEInsertPt->getParent();
10413 }
10414 if (!DT->isReachableFromEntry(TEInsertBlock))
10415 return std::nullopt;
10416 auto *NodeUI = DT->getNode(TEInsertBlock);
10417 assert(NodeUI && "Should only process reachable instructions");
10418 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10419 auto CheckOrdering = [&](const Instruction *InsertPt) {
10420 // Argument InsertPt is an instruction where vector code for some other
10421 // tree entry (one that shares one or more scalars with TE) is going to be
10422 // generated. This lambda returns true if insertion point of vector code
10423 // for the TE dominates that point (otherwise dependency is the other way
10424 // around). The other node is not limited to be of a gather kind. Gather
10425 // nodes are not scheduled and their vector code is inserted before their
10426 // first user. If user is PHI, that is supposed to be at the end of a
10427 // predecessor block. Otherwise it is the last instruction among scalars of
10428 // the user node. So, instead of checking dependency between instructions
10429 // themselves, we check dependency between their insertion points for vector
10430 // code (since each scalar instruction ends up as a lane of a vector
10431 // instruction).
10432 const BasicBlock *InsertBlock = InsertPt->getParent();
10433 auto *NodeEUI = DT->getNode(InsertBlock);
10434 if (!NodeEUI)
10435 return false;
10436 assert((NodeUI == NodeEUI) ==
10437 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10438 "Different nodes should have different DFS numbers");
10439 // Check the order of the gather nodes users.
10440 if (TEInsertPt->getParent() != InsertBlock &&
10441 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
10442 return false;
10443 if (TEInsertPt->getParent() == InsertBlock &&
10444 TEInsertPt->comesBefore(InsertPt))
10445 return false;
10446 return true;
10447 };
10448 // Find all tree entries used by the gathered values. If no common entries
10449 // found - not a shuffle.
10450 // Here we build a set of tree nodes for each gathered value and try to
10451 // find the intersection between these sets. If we have at least one common
10452 // tree node for each gathered value - we have just a permutation of a
10453 // single vector. If we have 2 different sets, we're in a situation where we
10454 // have a permutation of 2 input vectors.
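// Worked example (hypothetical scalars): for VL = {a, b, c, d} with a, b and
// c appearing in tree entry T1 and d appearing in entries T1 and T2, every
// scalar shares T1, so UsedTEs ends up with the single set {T1} and the
// gather is a permutation of one vector; had d only appeared in T2, UsedTEs
// would hold {T1} and {T2} and a two-vector permutation would be formed.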
10455 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10456 DenseMap<Value *, int> UsedValuesEntry;
10457 for (Value *V : VL) {
10458 if (isConstant(V))
10459 continue;
10460 // Build a list of tree entries where V is used.
10461 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10462 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10463 if (TEPtr == TE)
10464 continue;
10465 assert(any_of(TEPtr->Scalars,
10466 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10467 "Must contain at least single gathered value.");
10468 assert(TEPtr->UserTreeIndices.size() == 1 &&
10469 "Expected only single user of a gather node.");
10470 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10471
10472 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10473 const Instruction *InsertPt =
10474 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
10475 : &getLastInstructionInBundle(UseEI.UserTE);
10476 if (TEInsertPt == InsertPt) {
10477 // If 2 gathers are operands of the same entry (regardless of whether
10478 // the user is a PHI or not), compare operand indices and use the earlier
10479 // one as the base.
10480 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10481 continue;
10482 // If the user instruction is used for some reason in different
10483 // vectorized nodes - make it depend on index.
10484 if (TEUseEI.UserTE != UseEI.UserTE &&
10485 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10486 continue;
10487 }
10488
10489 // Check if the user node of the TE comes after the user node of TEPtr;
10490 // otherwise TEPtr depends on TE.
10491 if ((TEInsertBlock != InsertPt->getParent() ||
10492 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10493 !CheckOrdering(InsertPt))
10494 continue;
10495 VToTEs.insert(TEPtr);
10496 }
10497 if (const TreeEntry *VTE = getTreeEntry(V)) {
10498 if (ForOrder) {
10499 if (VTE->State != TreeEntry::Vectorize) {
10500 auto It = MultiNodeScalars.find(V);
10501 if (It == MultiNodeScalars.end())
10502 continue;
10503 VTE = *It->getSecond().begin();
10504 // Iterate through all vectorized nodes.
10505 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
10506 return MTE->State == TreeEntry::Vectorize;
10507 });
10508 if (MIt == It->getSecond().end())
10509 continue;
10510 VTE = *MIt;
10511 }
10512 }
10513 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10514 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10515 continue;
10516 VToTEs.insert(VTE);
10517 }
10518 if (VToTEs.empty())
10519 continue;
10520 if (UsedTEs.empty()) {
10521 // On the first iteration, just record the list of nodes for this value.
10522 UsedTEs.push_back(VToTEs);
10523 UsedValuesEntry.try_emplace(V, 0);
10524 } else {
10525 // Need to check if there are any previously used tree nodes that use V.
10526 // If there are no such nodes, consider that we have another input
10527 // vector.
10528 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
10529 unsigned Idx = 0;
10530 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
10531 // Do we have a non-empty intersection of previously listed tree entries
10532 // and tree entries using current V?
10533 set_intersect(VToTEs, Set);
10534 if (!VToTEs.empty()) {
10535 // Yes, write the new subset and continue analysis for the next
10536 // scalar.
10537 Set.swap(VToTEs);
10538 break;
10539 }
10540 VToTEs = SavedVToTEs;
10541 ++Idx;
10542 }
10543 // No non-empty intersection found - need to add a second set of possible
10544 // source vectors.
10545 if (Idx == UsedTEs.size()) {
10546 // If the number of input vectors would be greater than 2, this is not a
10547 // permutation; fall back to the regular gather.
10548 // TODO: support multiple reshuffled nodes.
10549 if (UsedTEs.size() == 2)
10550 continue;
10551 UsedTEs.push_back(SavedVToTEs);
10552 Idx = UsedTEs.size() - 1;
10553 }
10554 UsedValuesEntry.try_emplace(V, Idx);
10555 }
10556 }
10557
10558 if (UsedTEs.empty()) {
10559 Entries.clear();
10560 return std::nullopt;
10561 }
10562
10563 unsigned VF = 0;
10564 if (UsedTEs.size() == 1) {
10565 // Keep the order to avoid non-determinism.
10566 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
10567 UsedTEs.front().end());
10568 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10569 return TE1->Idx < TE2->Idx;
10570 });
10571 // Try to find a perfect match in another gather node first.
10572 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
10573 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
10574 });
10575 if (It != FirstEntries.end() &&
10576 ((*It)->getVectorFactor() == VL.size() ||
10577 ((*It)->getVectorFactor() == TE->Scalars.size() &&
10578 TE->ReuseShuffleIndices.size() == VL.size() &&
10579 (*It)->isSame(TE->Scalars)))) {
10580 Entries.push_back(*It);
10581 if ((*It)->getVectorFactor() == VL.size()) {
10582 std::iota(std::next(Mask.begin(), Part * VL.size()),
10583 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
10584 } else {
10585 SmallVector<int> CommonMask = TE->getCommonMask();
10586 copy(CommonMask, Mask.begin());
10587 }
10588 // Clear undef scalars.
10589 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10590 if (isa<PoisonValue>(VL[I]))
10591 Mask[Part * VL.size() + I] = PoisonMaskElem;
10592 return TargetTransformInfo::SK_PermuteSingleSrc;
10593 }
10594 // No perfect match, just a shuffle, so choose the first tree node from
10595 // the tree.
10596 Entries.push_back(FirstEntries.front());
10597 } else {
10598 // Try to find nodes with the same vector factor.
10599 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
10600 // Keep the order of tree nodes to avoid non-determinism.
10601 DenseMap<unsigned, const TreeEntry *> VFToTE;
10602 for (const TreeEntry *TE : UsedTEs.front()) {
10603 unsigned VF = TE->getVectorFactor();
10604 auto It = VFToTE.find(VF);
10605 if (It != VFToTE.end()) {
10606 if (It->second->Idx > TE->Idx)
10607 It->getSecond() = TE;
10608 continue;
10609 }
10610 VFToTE.try_emplace(VF, TE);
10611 }
10612 // Same, keep the order to avoid non-determinism.
10613 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
10614 UsedTEs.back().end());
10615 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10616 return TE1->Idx < TE2->Idx;
10617 });
10618 for (const TreeEntry *TE : SecondEntries) {
10619 auto It = VFToTE.find(TE->getVectorFactor());
10620 if (It != VFToTE.end()) {
10621 VF = It->first;
10622 Entries.push_back(It->second);
10623 Entries.push_back(TE);
10624 break;
10625 }
10626 }
10627 // No 2 source vectors with the same vector factor found - just choose the
10628 // 2 with the maximum index.
10629 if (Entries.empty()) {
10630 Entries.push_back(*llvm::max_element(
10631 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
10632 return TE1->Idx < TE2->Idx;
10633 }));
10634 Entries.push_back(SecondEntries.front());
10635 VF = std::max(Entries.front()->getVectorFactor(),
10636 Entries.back()->getVectorFactor());
10637 }
10638 }
10639
10640 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
10641 // Checks if the 2 PHIs are compatible, i.e. have a high chance of being
10642 // vectorized together.
10643 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
10644 auto *PHI = cast<PHINode>(V);
10645 auto *PHI1 = cast<PHINode>(V1);
10646 // Check that all incoming values are compatible/from the same parent (if
10647 // they are instructions).
10648 // The incoming values are compatible if they are all constants or are
10649 // instructions with the same/alternate opcodes from the same basic block.
10650 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
10651 Value *In = PHI->getIncomingValue(I);
10652 Value *In1 = PHI1->getIncomingValue(I);
10653 if (isConstant(In) && isConstant(In1))
10654 continue;
10655 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
10656 return false;
10657 if (cast<Instruction>(In)->getParent() !=
10658 cast<Instruction>(In1)->getParent())
10659 return false;
10660 }
10661 return true;
10662 };
10663 // Check if the value can be ignored during analysis for shuffled gathers.
10664 // We assume it is better to ignore instructions that do not form splats,
10665 // are not vectorized and are not extractelements (those are handled by the
10666 // extractelements processing) or that may form a vector node in the future.
10667 auto MightBeIgnored = [=](Value *V) {
10668 auto *I = dyn_cast<Instruction>(V);
10669 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
10670 !isVectorLikeInstWithConstOps(I) &&
10671 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
10672 };
10673 // Check that the neighboring instruction may form a full vector node with
10674 // the current instruction V. It is possible if they have the same/alternate
10675 // opcode and the same parent basic block.
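 // Illustrative example: if V and its neighbor in VL are two not-yet-vectorized
 // instructions with the same opcode from the same block, they may later form
 // their own vector node, so it is better not to force this gather to shuffle
 // around them now.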
10676 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
10677 Value *V1 = VL[Idx];
10678 bool UsedInSameVTE = false;
10679 auto It = UsedValuesEntry.find(V1);
10680 if (It != UsedValuesEntry.end())
10681 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
10682 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
10683 getSameOpcode({V, V1}, *TLI).getOpcode() &&
10684 cast<Instruction>(V)->getParent() ==
10685 cast<Instruction>(V1)->getParent() &&
10686 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
10687 };
10688 // Build a shuffle mask for better cost estimation and vector emission.
10689 SmallBitVector UsedIdxs(Entries.size());
10690 SmallVector<std::pair<unsigned, int>> EntryLanes;
10691 for (int I = 0, E = VL.size(); I < E; ++I) {
10692 Value *V = VL[I];
10693 auto It = UsedValuesEntry.find(V);
10694 if (It == UsedValuesEntry.end())
10695 continue;
10696 // Do not try to shuffle scalars if they are constants or instructions
10697 // that may be vectorized as a result of the subsequent vector build
10698 // vectorization.
10699 if (isConstant(V) || (MightBeIgnored(V) &&
10700 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
10701 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
10702 continue;
10703 unsigned Idx = It->second;
10704 EntryLanes.emplace_back(Idx, I);
10705 UsedIdxs.set(Idx);
10706 }
10707 // Iterate through all shuffled scalars and select the entries that can be
10708 // used for the final shuffle.
10709 SmallVector<const TreeEntry *> TempEntries;
10710 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
10711 if (!UsedIdxs.test(I))
10712 continue;
10713 // Fix the entry number for the given scalar. If it is the first entry, set
10714 // Pair.first to 0, otherwise to 1 (we currently select at most 2 nodes).
10715 // These indices are used as the vector offset when calculating the final
10716 // shuffle mask.
10717 for (std::pair<unsigned, int> &Pair : EntryLanes)
10718 if (Pair.first == I)
10719 Pair.first = TempEntries.size();
10720 TempEntries.push_back(Entries[I]);
10721 }
10722 Entries.swap(TempEntries);
10723 if (EntryLanes.size() == Entries.size() &&
10724 !VL.equals(ArrayRef(TE->Scalars)
10725 .slice(Part * VL.size(),
10726 std::min<int>(VL.size(), TE->Scalars.size())))) {
10727 // We may have only 1 or 2 entries here. If the number of scalars is equal
10728 // to the number of entries, there is no need to do the analysis, it is not
10729 // very profitable. Since VL is not the same as TE->Scalars, we already
10730 // have some shuffles before this one. Cut off the unprofitable case.
10731 Entries.clear();
10732 return std::nullopt;
10733 }
10734 // Build the final mask, check for the identity shuffle, if possible.
10735 bool IsIdentity = Entries.size() == 1;
10736 // Pair.first is the offset into the vector, while Pair.second is the index
10737 // of the scalar in the list.
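 // Illustrative example: with two selected entries of vector factor VF = 4, a
 // scalar taken from lane 1 of the second entry gets mask value 1 * 4 + 1 = 5
 // in the combined two-source shuffle mask.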
10738 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
10739 unsigned Idx = Part * VL.size() + Pair.second;
10740 Mask[Idx] =
10741 Pair.first * VF +
10742 (ForOrder ? std::distance(
10743 Entries[Pair.first]->Scalars.begin(),
10744 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
10745 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
10746 IsIdentity &= Mask[Idx] == Pair.second;
10747 }
10748 switch (Entries.size()) {
10749 case 1:
10750 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
10751 return TargetTransformInfo::SK_PermuteSingleSrc;
10752 break;
10753 case 2:
10754 if (EntryLanes.size() > 2 || VL.size() <= 2)
10755 return TargetTransformInfo::SK_PermuteTwoSrc;
10756 break;
10757 default:
10758 break;
10759 }
10760 Entries.clear();
10761 // Clear the corresponding mask elements.
10762 std::fill(std::next(Mask.begin(), Part * VL.size()),
10763 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
10764 return std::nullopt;
10765}
10766
10767SmallVector<std::optional<TTI::ShuffleKind>>
10768BoUpSLP::isGatherShuffledEntry(
10769 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
10770 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
10771 bool ForOrder) {
10772 assert(NumParts > 0 && NumParts < VL.size() &&
10773 "Expected positive number of registers.");
10774 Entries.clear();
10775 // No need to check for the topmost gather node.
10776 if (TE == VectorizableTree.front().get())
10777 return {};
10778 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
10779 if (TE->isNonPowOf2Vec())
10780 return {};
10781 Mask.assign(VL.size(), PoisonMaskElem);
10782 assert(TE->UserTreeIndices.size() == 1 &&
10783 "Expected only single user of the gather node.");
10784 assert(VL.size() % NumParts == 0 &&
10785 "Number of scalars must be divisible by NumParts.");
10786 unsigned SliceSize = VL.size() / NumParts;
10787 SmallVector<std::optional<TTI::ShuffleKind>> Res;
10788 for (unsigned Part = 0; Part < NumParts; ++Part) {
10789 ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
10790 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
10791 std::optional<TTI::ShuffleKind> SubRes =
10792 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
10793 ForOrder);
10794 if (!SubRes)
10795 SubEntries.clear();
10796 Res.push_back(SubRes);
10797 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
10798 SubEntries.front()->getVectorFactor() == VL.size() &&
10799 (SubEntries.front()->isSame(TE->Scalars) ||
10800 SubEntries.front()->isSame(VL))) {
10801 SmallVector<const TreeEntry *> LocalSubEntries;
10802 LocalSubEntries.swap(SubEntries);
10803 Entries.clear();
10804 Res.clear();
10805 std::iota(Mask.begin(), Mask.end(), 0);
10806 // Clear undef scalars.
10807 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10808 if (isa<PoisonValue>(VL[I]))
10809 Mask[I] = PoisonMaskElem;
10810 Entries.emplace_back(1, LocalSubEntries.front());
10811 Res.push_back(TTI::SK_PermuteSingleSrc);
10812 return Res;
10813 }
10814 }
10815 if (all_of(Res,
10816 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
10817 Entries.clear();
10818 return {};
10819 }
10820 return Res;
10821}
10822
10823InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
10824 bool ForPoisonSrc) const {
10825 // Find the type of the operands in VL.
10826 Type *ScalarTy = VL[0]->getType();
10827 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
10828 ScalarTy = SI->getValueOperand()->getType();
10829 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
10830 bool DuplicateNonConst = false;
10831 // Find the cost of inserting/extracting values from the vector.
10832 // Check if the same elements are inserted several times and count them as
10833 // shuffle candidates.
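 // Illustrative example (assuming non-constant scalars): for VL = {a, b, a, c}
 // the second occurrence of 'a' is marked as a shuffled element and ShuffleMask
 // becomes {0, 1, 0, 3}, so only the unique scalars are costed as inserts and
 // the repeat is costed as part of a shuffle.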
10834 APInt ShuffledElements = APInt::getZero(VL.size());
10835 DenseMap<Value *, unsigned> UniqueElements;
10836 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10837 InstructionCost Cost;
10838 auto EstimateInsertCost = [&](unsigned I, Value *V) {
10839 if (!ForPoisonSrc)
10840 Cost +=
10841 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
10842 I, Constant::getNullValue(VecTy), V);
10843 };
10844 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10845 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
10846 Value *V = VL[I];
10847 // No need to shuffle duplicates for constants.
10848 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
10849 ShuffledElements.setBit(I);
10850 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
10851 continue;
10852 }
10853
10854 auto Res = UniqueElements.try_emplace(V, I);
10855 if (Res.second) {
10856 EstimateInsertCost(I, V);
10857 ShuffleMask[I] = I;
10858 continue;
10859 }
10860
10861 DuplicateNonConst = true;
10862 ShuffledElements.setBit(I);
10863 ShuffleMask[I] = Res.first->second;
10864 }
10865 if (ForPoisonSrc)
10866 Cost =
10867 TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
10868 /*Extract*/ false, CostKind);
10869 if (DuplicateNonConst)
10870 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
10871 VecTy, ShuffleMask);
10872 return Cost;
10873}
10874
10875// Perform operand reordering on the instructions in VL and return the reordered
10876// operands in Left and Right.
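// Illustrative example: for two commutative adds {a0 + b0, b1 + a1}, the
// reordering may swap the operands of the second add so that Left = {a0, a1}
// and Right = {b0, b1}, grouping the more compatible scalars (e.g. consecutive
// loads) into the same operand vector.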
10877void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
10878 SmallVectorImpl<Value *> &Left,
10879 SmallVectorImpl<Value *> &Right,
10880 const BoUpSLP &R) {
10881 if (VL.empty())
10882 return;
10883 VLOperands Ops(VL, R);
10884 // Reorder the operands in place.
10885 Ops.reorder();
10886 Left = Ops.getVL(0);
10887 Right = Ops.getVL(1);
10888}
10889
10890Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
10891 auto &Res = EntryToLastInstruction.FindAndConstruct(E);
10892 if (Res.second)
10893 return *Res.second;
10894 // Get the basic block this bundle is in. All instructions in the bundle
10895 // should be in this block (except for extractelement-like instructions with
10896 // constant indices).
10897 auto *Front = E->getMainOp();
10898 auto *BB = Front->getParent();
10899 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
10900 if (E->getOpcode() == Instruction::GetElementPtr &&
10901 !isa<GetElementPtrInst>(V))
10902 return true;
10903 auto *I = cast<Instruction>(V);
10904 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
10905 isVectorLikeInstWithConstOps(I);
10906 }));
10907
10908 auto FindLastInst = [&]() {
10909 Instruction *LastInst = Front;
10910 for (Value *V : E->Scalars) {
10911 auto *I = dyn_cast<Instruction>(V);
10912 if (!I)
10913 continue;
10914 if (LastInst->getParent() == I->getParent()) {
10915 if (LastInst->comesBefore(I))
10916 LastInst = I;
10917 continue;
10918 }
10919 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10920 !isa<GetElementPtrInst>(I)) ||
10921 (isVectorLikeInstWithConstOps(LastInst) &&
10922 isVectorLikeInstWithConstOps(I))) &&
10923 "Expected vector-like or non-GEP in GEP node insts only.");
10924 if (!DT->isReachableFromEntry(LastInst->getParent())) {
10925 LastInst = I;
10926 continue;
10927 }
10928 if (!DT->isReachableFromEntry(I->getParent()))
10929 continue;
10930 auto *NodeA = DT->getNode(LastInst->getParent());
10931 auto *NodeB = DT->getNode(I->getParent());
10932 assert(NodeA && "Should only process reachable instructions");
10933 assert(NodeB && "Should only process reachable instructions");
10934 assert((NodeA == NodeB) ==
10935 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10936 "Different nodes should have different DFS numbers");
10937 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
10938 LastInst = I;
10939 }
10940 BB = LastInst->getParent();
10941 return LastInst;
10942 };
10943
10944 auto FindFirstInst = [&]() {
10945 Instruction *FirstInst = Front;
10946 for (Value *V : E->Scalars) {
10947 auto *I = dyn_cast<Instruction>(V);
10948 if (!I)
10949 continue;
10950 if (FirstInst->getParent() == I->getParent()) {
10951 if (I->comesBefore(FirstInst))
10952 FirstInst = I;
10953 continue;
10954 }
10955 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10956 !isa<GetElementPtrInst>(I)) ||
10957 (isVectorLikeInstWithConstOps(FirstInst) &&
10958 isVectorLikeInstWithConstOps(I))) &&
10959 "Expected vector-like or non-GEP in GEP node insts only.");
10960 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
10961 FirstInst = I;
10962 continue;
10963 }
10964 if (!DT->isReachableFromEntry(I->getParent()))
10965 continue;
10966 auto *NodeA = DT->getNode(FirstInst->getParent());
10967 auto *NodeB = DT->getNode(I->getParent());
10968 assert(NodeA && "Should only process reachable instructions");
10969 assert(NodeB && "Should only process reachable instructions");
10970 assert((NodeA == NodeB) ==
10971 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10972 "Different nodes should have different DFS numbers");
10973 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
10974 FirstInst = I;
10975 }
10976 return FirstInst;
10977 };
10978
10979 // Set the insert point to the beginning of the basic block if the entry
10980 // should not be scheduled.
10981 if (doesNotNeedToSchedule(E->Scalars) ||
10982 (E->State != TreeEntry::NeedToGather &&
10983 all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
10984 if ((E->getOpcode() == Instruction::GetElementPtr &&
10985 any_of(E->Scalars,
10986 [](Value *V) {
10987 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
10988 })) ||
10989 all_of(E->Scalars,
10990 [](Value *V) {
10991 return !isVectorLikeInstWithConstOps(V) &&
10992 isUsedOutsideBlock(V);
10993 }) ||
10994 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
10995 all_of(E->Scalars, IsaPred<ExtractElementInst, UndefValue>)))
10996 Res.second = FindLastInst();
10997 else
10998 Res.second = FindFirstInst();
10999 return *Res.second;
11000 }
11001
11002 // Find the last instruction. The common case should be that BB has been
11003 // scheduled, and the last instruction is VL.back(). So we start with
11004 // VL.back() and iterate over schedule data until we reach the end of the
11005 // bundle. The end of the bundle is marked by null ScheduleData.
11006 if (BlocksSchedules.count(BB)) {
11007 Value *V = E->isOneOf(E->Scalars.back());
11008 if (doesNotNeedToBeScheduled(V))
11009 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11010 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11011 if (Bundle && Bundle->isPartOfBundle())
11012 for (; Bundle; Bundle = Bundle->NextInBundle)
11013 if (Bundle->OpValue == Bundle->Inst)
11014 Res.second = Bundle->Inst;
11015 }
11016
11017 // LastInst can still be null at this point if there's either not an entry
11018 // for BB in BlocksSchedules or there's no ScheduleData available for
11019 // VL.back(). This can be the case if buildTree_rec aborts for various
11020 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11021 // size is reached, etc.). ScheduleData is initialized in the scheduling
11022 // "dry-run".
11023 //
11024 // If this happens, we can still find the last instruction by brute force. We
11025 // iterate forwards from Front (inclusive) until we either see all
11026 // instructions in the bundle or reach the end of the block. If Front is the
11027 // last instruction in program order, LastInst will be set to Front, and we
11028 // will visit all the remaining instructions in the block.
11029 //
11030 // One of the reasons we exit early from buildTree_rec is to place an upper
11031 // bound on compile-time. Thus, taking an additional compile-time hit here is
11032 // not ideal. However, this should be exceedingly rare since it requires that
11033 // we both exit early from buildTree_rec and that the bundle be out-of-order
11034 // (causing us to iterate all the way to the end of the block).
11035 if (!Res.second)
11036 Res.second = FindLastInst();
11037 assert(Res.second && "Failed to find last instruction in bundle");
11038 return *Res.second;
11039}
11040
11041void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11042 auto *Front = E->getMainOp();
11043 Instruction *LastInst = &getLastInstructionInBundle(E);
11044 assert(LastInst && "Failed to find last instruction in bundle");
11045 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11046 // If the instruction is PHI, set the insert point after all the PHIs.
11047 bool IsPHI = isa<PHINode>(LastInst);
11048 if (IsPHI)
11049 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11050 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
11051 doesNotNeedToSchedule(E->Scalars))) {
11052 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11053 } else {
11054 // Set the insertion point after the last instruction in the bundle. Set the
11055 // debug location to Front.
11056 Builder.SetInsertPoint(
11057 LastInst->getParent(),
11058 LastInst->getNextNonDebugInstruction()->getIterator());
11059 }
11060 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11061}
11062
11063Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
11064 // List of instructions/lanes from the current block and/or the blocks that
11065 // are part of the current loop. These instructions will be inserted at the
11066 // end to make it possible to optimize loops and hoist invariant instructions
11067 // out of the loop body with better chances for success.
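 // E.g. a lane whose scalar is defined inside the loop containing the insertion
 // point is inserted last, so the chain of insertelements built from the
 // loop-invariant values has a better chance to be hoisted out of the loop
 // later.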
11068 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
11069 SmallSet<int, 4> PostponedIndices;
11070 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11071 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11072 SmallPtrSet<BasicBlock *, 4> Visited;
11073 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11074 InsertBB = InsertBB->getSinglePredecessor();
11075 return InsertBB && InsertBB == InstBB;
11076 };
11077 for (int I = 0, E = VL.size(); I < E; ++I) {
11078 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11079 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11080 getTreeEntry(Inst) ||
11081 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11082 PostponedIndices.insert(I).second)
11083 PostponedInsts.emplace_back(Inst, I);
11084 }
11085
11086 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11087 Type *Ty) {
11088 Value *Scalar = V;
11089 if (cast<VectorType>(Vec->getType())->getElementType() != Ty) {
11090 assert(V->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11091 "Expected integer types only.");
11092 Vec = Builder.CreateIntCast(
11093 Vec,
11094 VectorType::get(Ty,
11095 cast<VectorType>(Vec->getType())->getElementCount()),
11096 !isKnownNonNegative(Vec, SimplifyQuery(*DL)));
11097 }
11098
11099 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11100 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11101 if (!InsElt)
11102 return Vec;
11103 GatherShuffleExtractSeq.insert(InsElt);
11104 CSEBlocks.insert(InsElt->getParent());
11105 // Add to our 'need-to-extract' list.
11106 if (isa<Instruction>(V)) {
11107 if (TreeEntry *Entry = getTreeEntry(V)) {
11108 // Find which lane we need to extract.
11109 User *UserOp = nullptr;
11110 if (Scalar != V) {
11111 if (auto *SI = dyn_cast<Instruction>(Scalar))
11112 UserOp = SI;
11113 } else {
11114 UserOp = InsElt;
11115 }
11116 if (UserOp) {
11117 unsigned FoundLane = Entry->findLaneForValue(V);
11118 ExternalUses.emplace_back(V, UserOp, FoundLane);
11119 }
11120 }
11121 }
11122 return Vec;
11123 };
11124 Value *Val0 =
11125 isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
11126 Type *ScalarTy = Val0->getType();
11127 FixedVectorType *VecTy = FixedVectorType::get(ScalarTy, VL.size());
11128 Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11129 SmallVector<int> NonConsts;
11130 // Insert the constant values first.
11131 for (int I = 0, E = VL.size(); I < E; ++I) {
11132 if (PostponedIndices.contains(I))
11133 continue;
11134 if (!isConstant(VL[I])) {
11135 NonConsts.push_back(I);
11136 continue;
11137 }
11138 if (Root) {
11139 if (!isa<UndefValue>(VL[I])) {
11140 NonConsts.push_back(I);
11141 continue;
11142 }
11143 if (isa<PoisonValue>(VL[I]))
11144 continue;
11145 if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11146 if (SV->getMaskValue(I) == PoisonMaskElem)
11147 continue;
11148 }
11149 }
11150 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11151 }
11152 // Insert non-constant values.
11153 for (int I : NonConsts)
11154 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11155 // Append instructions that are/may be part of the loop at the end, to make
11156 // it possible to hoist non-loop-based instructions.
11157 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11158 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11159
11160 return Vec;
11161}
11162
11163 /// Merges shuffle masks and emits the final shuffle instruction, if required.
11164 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
11165 /// emission: the actual shuffle instruction is generated only when it is
11166 /// actually required. Otherwise, the shuffle instruction emission is delayed
11167 /// till the end of the process, to reduce the number of emitted instructions
11168 /// and to enable further analysis/transformations.
11169 /// The class will also look through the previously emitted shuffle
11170 /// instructions and properly mark the mask indices as undef.
11171/// For example, given the code
11172/// \code
11173/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11174/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11175/// \endcode
11176 /// and if a shuffle of %s1 and %s2 with mask <1, 0, 3, 2> needs to be emitted, it will
11177/// look through %s1 and %s2 and emit
11178/// \code
11179/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11180/// \endcode
11181/// instead.
11182 /// If the 2 operands are of different sizes, the smaller one will be resized
11183 /// and the mask recalculated properly.
11184/// For example, given the code
11185/// \code
11186/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11187/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11188/// \endcode
11189 /// and if a shuffle of %s1 and %s2 with mask <1, 0, 5, 4> needs to be emitted, it will
11190/// look through %s1 and %s2 and emit
11191/// \code
11192/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11193/// \endcode
11194/// instead.
11195class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11196 bool IsFinalized = false;
11197 /// Combined mask for all applied operands and masks. It is built during
11198 /// analysis and actual emission of shuffle vector instructions.
11199 SmallVector<int> CommonMask;
11200 /// List of operands for the shuffle vector instruction. It holds at most 2
11201 /// operands. If a 3rd one is going to be added, the first 2 are combined into
11202 /// a shuffle with the \p CommonMask mask, the first operand is set to the
11203 /// resulting shuffle and the second operand is set to the newly added
11204 /// operand. \p CommonMask is then transformed in the proper way.
11205 SmallVector<Value *, 2> InVectors;
11206 IRBuilderBase &Builder;
11207 BoUpSLP &R;
11208
11209 class ShuffleIRBuilder {
11210 IRBuilderBase &Builder;
11211 /// Holds all of the instructions that we gathered.
11212 SetVector<Instruction *> &GatherShuffleExtractSeq;
11213 /// A list of blocks that we are going to CSE.
11214 DenseSet<BasicBlock *> &CSEBlocks;
11215 /// Data layout.
11216 const DataLayout &DL;
11217
11218 public:
11219 ShuffleIRBuilder(IRBuilderBase &Builder,
11220 SetVector<Instruction *> &GatherShuffleExtractSeq,
11221 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11222 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11223 CSEBlocks(CSEBlocks), DL(DL) {}
11224 ~ShuffleIRBuilder() = default;
11225 /// Creates shufflevector for the 2 operands with the given mask.
11226 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11227 if (V1->getType() != V2->getType()) {
11228 assert(V1->getType()->isIntOrIntVectorTy() &&
11229 V2->getType()->isIntOrIntVectorTy() &&
11230 "Expected integer vector types only.");
11231 if (V1->getType() != V2->getType()) {
11232 if (cast<VectorType>(V2->getType())
11233 ->getElementType()
11234 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
11235 ->getElementType()
11236 ->getIntegerBitWidth())
11237 V2 = Builder.CreateIntCast(
11238 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
11239 else
11240 V1 = Builder.CreateIntCast(
11241 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
11242 }
11243 }
11244 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11245 if (auto *I = dyn_cast<Instruction>(Vec)) {
11246 GatherShuffleExtractSeq.insert(I);
11247 CSEBlocks.insert(I->getParent());
11248 }
11249 return Vec;
11250 }
11251 /// Creates a permutation of the single vector operand with the given mask,
11252 /// if it is not an identity mask.
11253 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11254 if (Mask.empty())
11255 return V1;
11256 unsigned VF = Mask.size();
11257 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
11258 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
11259 return V1;
11260 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
11261 if (auto *I = dyn_cast<Instruction>(Vec)) {
11262 GatherShuffleExtractSeq.insert(I);
11263 CSEBlocks.insert(I->getParent());
11264 }
11265 return Vec;
11266 }
11267 Value *createIdentity(Value *V) { return V; }
11268 Value *createPoison(Type *Ty, unsigned VF) {
11269 return PoisonValue::get(FixedVectorType::get(Ty, VF));
11270 }
11271 /// Resizes the 2 input vectors to match their sizes, if they are not equal
11272 /// yet. The smaller vector is resized to the size of the larger one.
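 /// For example (illustrative), given a <2 x i32> and a <4 x i32> operand, the
 /// <2 x i32> one is widened to <4 x i32> with the mask <0, 1, poison, poison>
 /// before the two-operand shuffle is emitted.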
11273 void resizeToMatch(Value *&V1, Value *&V2) {
11274 if (V1->getType() == V2->getType())
11275 return;
11276 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
11277 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11278 int VF = std::max(V1VF, V2VF);
11279 int MinVF = std::min(V1VF, V2VF);
11280 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11281 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
11282 0);
11283 Value *&Op = MinVF == V1VF ? V1 : V2;
11284 Op = Builder.CreateShuffleVector(Op, IdentityMask);
11285 if (auto *I = dyn_cast<Instruction>(Op)) {
11286 GatherShuffleExtractSeq.insert(I);
11287 CSEBlocks.insert(I->getParent());
11288 }
11289 if (MinVF == V1VF)
11290 V1 = Op;
11291 else
11292 V2 = Op;
11293 }
11294 };
11295
11296 /// Smart shuffle instruction emission, walks through shuffles trees and
11297 /// tries to find the best matching vector for the actual shuffle
11298 /// instruction.
11299 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11300 assert(V1 && "Expected at least one vector value.");
11301 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11302 R.CSEBlocks, *R.DL);
11303 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11304 ShuffleBuilder);
11305 }
11306
11307 /// Transforms the mask \p CommonMask per the given \p Mask to keep it a
11308 /// proper mask after the shuffle emission.
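 /// For example, if \p Mask is <2, poison, 0, poison>, lanes 0 and 2 of
 /// \p CommonMask are reset to 0 and 2 (identity positions), since those lanes
 /// now come directly from the just-emitted shuffle.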
11309 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11310 ArrayRef<int> Mask) {
11311 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11312 if (Mask[Idx] != PoisonMaskElem)
11313 CommonMask[Idx] = Idx;
11314 }
11315
11316public:
11317 ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
11318 : Builder(Builder), R(R) {}
11319
11320 /// Adjusts extractelements after reusing them.
11321 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11322 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11323 unsigned NumParts, bool &UseVecBaseAsInput) {
11324 UseVecBaseAsInput = false;
11325 SmallPtrSet<Value *, 4> UniqueBases;
11326 Value *VecBase = nullptr;
11327 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11328 int Idx = Mask[I];
11329 if (Idx == PoisonMaskElem)
11330 continue;
11331 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
11332 VecBase = EI->getVectorOperand();
11333 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
11334 VecBase = TE->VectorizedValue;
11335 assert(VecBase && "Expected vectorized value.");
11336 UniqueBases.insert(VecBase);
11337 // If the extractelement has only one use and that use is vectorized -
11338 // the extractelement itself can be deleted.
11339 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
11340 any_of(EI->users(), [&](User *U) {
11341 const TreeEntry *UTE = R.getTreeEntry(U);
11342 return !UTE || R.MultiNodeScalars.contains(U) ||
11343 count_if(R.VectorizableTree,
11344 [&](const std::unique_ptr<TreeEntry> &TE) {
11345 return any_of(TE->UserTreeIndices,
11346 [&](const EdgeInfo &Edge) {
11347 return Edge.UserTE == UTE;
11348 }) &&
11349 is_contained(TE->Scalars, EI);
11350 }) != 1;
11351 }))
11352 continue;
11353 R.eraseInstruction(EI);
11354 }
11355 if (NumParts == 1 || UniqueBases.size() == 1)
11356 return VecBase;
11357 UseVecBaseAsInput = true;
11358 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11359 for (auto [I, Idx] : enumerate(Mask))
11360 if (Idx != PoisonMaskElem)
11361 Idx = I;
11362 };
11363 // Perform a multi-register vector shuffle, joining the parts into a single
11364 // virtual long vector.
11365 // Need to shuffle each part independently and then insert all these parts
11366 // into a long virtual vector register, forming the original vector.
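 // Illustrative example: with NumParts = 2 and SliceSize = 4, lanes 0-3 are
 // shuffled from the bases of part 0 and lanes 4-7 from the bases of part 1;
 // the indices of the second sub-vector are offset by the current VF before the
 // parts are combined with a final two-source shuffle.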
11367 Value *Vec = nullptr;
11368 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11369 unsigned SliceSize = E->Scalars.size() / NumParts;
11370 for (unsigned Part = 0; Part < NumParts; ++Part) {
11371 ArrayRef<Value *> VL =
11372 ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
11373 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
11374 constexpr int MaxBases = 2;
11375 SmallVector<Value *, MaxBases> Bases(MaxBases);
11376#ifndef NDEBUG
11377 int PrevSize = 0;
11378#endif // NDEBUG
11379 for (const auto [I, V]: enumerate(VL)) {
11380 if (SubMask[I] == PoisonMaskElem)
11381 continue;
11382 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11383 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11384 VecOp = TE->VectorizedValue;
11385 assert(VecOp && "Expected vectorized value.");
11386 const int Size =
11387 cast<FixedVectorType>(VecOp->getType())->getNumElements();
11388#ifndef NDEBUG
11389 assert((PrevSize == Size || PrevSize == 0) &&
11390 "Expected vectors of the same size.");
11391 PrevSize = Size;
11392#endif // NDEBUG
11393 Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
11394 }
11395 if (!Bases.front())
11396 continue;
11397 Value *SubVec;
11398 if (Bases.back()) {
11399 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11400 TransformToIdentity(SubMask);
11401 } else {
11402 SubVec = Bases.front();
11403 }
11404 if (!Vec) {
11405 Vec = SubVec;
11406 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11407 [&](unsigned P) {
11408 ArrayRef<int> SubMask =
11409 Mask.slice(P * SliceSize, SliceSize);
11410 return all_of(SubMask, [](int Idx) {
11411 return Idx == PoisonMaskElem;
11412 });
11413 })) &&
11414 "Expected first part or all previous parts masked.");
11415 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11416 } else {
11417 unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11418 if (Vec->getType() != SubVec->getType()) {
11419 unsigned SubVecVF =
11420 cast<FixedVectorType>(SubVec->getType())->getNumElements();
11421 VF = std::max(VF, SubVecVF);
11422 }
11423 // Adjust SubMask.
11424 for (int &Idx : SubMask)
11425 if (Idx != PoisonMaskElem)
11426 Idx += VF;
11427 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11428 Vec = createShuffle(Vec, SubVec, VecMask);
11429 TransformToIdentity(VecMask);
11430 }
11431 }
11432 copy(VecMask, Mask.begin());
11433 return Vec;
11434 }
11435 /// Checks if the specified entry \p E needs to be delayed because of its
11436 /// dependency nodes.
11437 std::optional<Value *>
11438 needToDelay(const TreeEntry *E,
11439 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
11440 // No need to delay emission if all deps are ready.
11441 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
11442 return all_of(
11443 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
11444 }))
11445 return std::nullopt;
11446 // Postpone gather emission, will be emitted after the end of the
11447 // process to keep correct order.
11448 auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
11449 E->getVectorFactor());
11450 return Builder.CreateAlignedLoad(
11451 VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
11452 MaybeAlign());
11453 }
11454 /// Adds 2 input vectors (in the form of tree entries) and the mask for
11455 /// their shuffling.
11456 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
11457 add(E1.VectorizedValue, E2.VectorizedValue, Mask);
11458 }
11459 /// Adds a single input vector (in the form of a tree entry) and the mask
11460 /// for its shuffling.
11461 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
11462 add(E1.VectorizedValue, Mask);
11463 }
11464 /// Adds 2 input vectors and the mask for their shuffling.
11465 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
11466 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
11467 if (InVectors.empty()) {
11468 InVectors.push_back(V1);
11469 InVectors.push_back(V2);
11470 CommonMask.assign(Mask.begin(), Mask.end());
11471 return;
11472 }
11473 Value *Vec = InVectors.front();
11474 if (InVectors.size() == 2) {
11475 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11476 transformMaskAfterShuffle(CommonMask, CommonMask);
11477 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
11478 Mask.size()) {
11479 Vec = createShuffle(Vec, nullptr, CommonMask);
11480 transformMaskAfterShuffle(CommonMask, CommonMask);
11481 }
11482 V1 = createShuffle(V1, V2, Mask);
11483 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11484 if (Mask[Idx] != PoisonMaskElem)
11485 CommonMask[Idx] = Idx + Sz;
11486 InVectors.front() = Vec;
11487 if (InVectors.size() == 2)
11488 InVectors.back() = V1;
11489 else
11490 InVectors.push_back(V1);
11491 }
11492 /// Adds one more input vector and the mask for the shuffling.
11493 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
11494 if (InVectors.empty()) {
11495 if (!isa<FixedVectorType>(V1->getType())) {
11496 V1 = createShuffle(V1, nullptr, CommonMask);
11497 CommonMask.assign(Mask.size(), PoisonMaskElem);
11498 transformMaskAfterShuffle(CommonMask, Mask);
11499 }
11500 InVectors.push_back(V1);
11501 CommonMask.assign(Mask.begin(), Mask.end());
11502 return;
11503 }
11504 const auto *It = find(InVectors, V1);
11505 if (It == InVectors.end()) {
11506 if (InVectors.size() == 2 ||
11507 InVectors.front()->getType() != V1->getType() ||
11508 !isa<FixedVectorType>(V1->getType())) {
11509 Value *V = InVectors.front();
11510 if (InVectors.size() == 2) {
11511 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11512 transformMaskAfterShuffle(CommonMask, CommonMask);
11513 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11514 CommonMask.size()) {
11515 V = createShuffle(InVectors.front(), nullptr, CommonMask);
11516 transformMaskAfterShuffle(CommonMask, CommonMask);
11517 }
11518 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11519 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
11520 CommonMask[Idx] =
11521 V->getType() != V1->getType()
11522 ? Idx + Sz
11523 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
11524 ->getNumElements();
11525 if (V->getType() != V1->getType())
11526 V1 = createShuffle(V1, nullptr, Mask);
11527 InVectors.front() = V;
11528 if (InVectors.size() == 2)
11529 InVectors.back() = V1;
11530 else
11531 InVectors.push_back(V1);
11532 return;
11533 }
11534 // Check if the second vector is required, i.e. whether the new mask uses
11535 // lanes that are not already covered by the first one.
11536 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11537 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
11538 InVectors.push_back(V1);
11539 break;
11540 }
11541 }
11542 int VF = CommonMask.size();
11543 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
11544 VF = FTy->getNumElements();
11545 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11546 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
11547 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
11548 }
11549 /// Adds one more input vector and the mask for the shuffling.
11550 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
11551 SmallVector<int> NewMask;
11552 inversePermutation(Order, NewMask);
11553 add(V1, NewMask);
11554 }
11555 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
11556 Value *Root = nullptr) {
11557 return R.gather(VL, Root);
11558 }
11559 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
11560 /// Finalize emission of the shuffles.
11561 /// \param Action the action (if any) to be performed before the final
11562 /// application of the \p ExtMask mask.
11563 Value *
11564 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
11565 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
11566 IsFinalized = true;
11567 if (Action) {
11568 Value *Vec = InVectors.front();
11569 if (InVectors.size() == 2) {
11570 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11571 InVectors.pop_back();
11572 } else {
11573 Vec = createShuffle(Vec, nullptr, CommonMask);
11574 }
11575 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11576 if (CommonMask[Idx] != PoisonMaskElem)
11577 CommonMask[Idx] = Idx;
11578 assert(VF > 0 &&
11579 "Expected vector length for the final value before action.");
11580 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11581 if (VecVF < VF) {
11582 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
11583 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11584 Vec = createShuffle(Vec, nullptr, ResizeMask);
11585 }
11586 Action(Vec, CommonMask);
11587 InVectors.front() = Vec;
11588 }
11589 if (!ExtMask.empty()) {
11590 if (CommonMask.empty()) {
11591 CommonMask.assign(ExtMask.begin(), ExtMask.end());
11592 } else {
11593 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11594 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11595 if (ExtMask[I] == PoisonMaskElem)
11596 continue;
11597 NewMask[I] = CommonMask[ExtMask[I]];
11598 }
11599 CommonMask.swap(NewMask);
11600 }
11601 }
11602 if (CommonMask.empty()) {
11603 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11604 return InVectors.front();
11605 }
11606 if (InVectors.size() == 2)
11607 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11608 return createShuffle(InVectors.front(), nullptr, CommonMask);
11609 }
11610
11611 ~ShuffleInstructionBuilder() {
11612 assert((IsFinalized || CommonMask.empty()) &&
11613 "Shuffle construction must be finalized.");
11614 }
11615};
11616
11617Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
11618 bool PostponedPHIs) {
11619 ValueList &VL = E->getOperand(NodeIdx);
11620 const unsigned VF = VL.size();
11621 InstructionsState S = getSameOpcode(VL, *TLI);
11622 // Special processing for a GEP bundle, which may include non-GEP values.
11623 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11624 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11625 if (It != VL.end())
11626 S = getSameOpcode(*It, *TLI);
11627 }
11628 if (S.getOpcode()) {
11629 auto CheckSameVE = [&](const TreeEntry *VE) {
11630 return VE->isSame(VL) &&
11631 (any_of(VE->UserTreeIndices,
11632 [E, NodeIdx](const EdgeInfo &EI) {
11633 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11634 }) ||
11635 any_of(VectorizableTree,
11636 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
11637 return TE->isOperandGatherNode({E, NodeIdx}) &&
11638 VE->isSame(TE->Scalars);
11639 }));
11640 };
11641 TreeEntry *VE = getTreeEntry(S.OpValue);
11642 bool IsSameVE = VE && CheckSameVE(VE);
11643 if (!IsSameVE) {
11644 auto It = MultiNodeScalars.find(S.OpValue);
11645 if (It != MultiNodeScalars.end()) {
11646 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
11647 return TE != VE && CheckSameVE(TE);
11648 });
11649 if (I != It->getSecond().end()) {
11650 VE = *I;
11651 IsSameVE = true;
11652 }
11653 }
11654 }
11655 if (IsSameVE) {
11656 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
11657 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
11658 ShuffleBuilder.add(V, Mask);
11659 return ShuffleBuilder.finalize(std::nullopt);
11660 };
11661 Value *V = vectorizeTree(VE, PostponedPHIs);
11662 if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
11663 if (!VE->ReuseShuffleIndices.empty()) {
11664 // Reshuffle to get only unique values.
11665 // If some of the scalars are duplicated in the vectorization
11666 // tree entry, we do not vectorize them but instead generate a
11667 // mask for the reuses. But if there are several users of the
11668 // same entry, they may have different vectorization factors.
11669 // This is especially important for PHI nodes. In this case, we
11670 // need to adapt the resulting instruction for the user
11671 // vectorization factor and have to reshuffle it again to take
11672 // only the unique elements of the vector. Without this code the
11673 // function would incorrectly return a reduced vector instruction with
11674 // the same elements, not with the unique ones.
11675
11676 // block:
11677 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
11678 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
11679 // ... (use %2)
11680 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
11681 // br %block
11682 SmallVector<int> Mask(VF, PoisonMaskElem);
11683 for (auto [I, V] : enumerate(VL)) {
11684 if (isa<PoisonValue>(V))
11685 continue;
11686 Mask[I] = VE->findLaneForValue(V);
11687 }
11688 V = FinalShuffle(V, Mask);
11689 } else {
11690 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
11691 "Expected vectorization factor less "
11692 "than original vector size.");
11693 SmallVector<int> UniformMask(VF, 0);
11694 std::iota(UniformMask.begin(), UniformMask.end(), 0);
11695 V = FinalShuffle(V, UniformMask);
11696 }
11697 }
11698 // Need to update the operand gather node if the operand is actually not a
11699 // vectorized node but a buildvector/gather node that matches one of
11700 // the vectorized nodes.
11701 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
11702 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11703 }) == VE->UserTreeIndices.end()) {
11704 auto *It = find_if(
11705 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11706 return TE->State == TreeEntry::NeedToGather &&
11707 TE->UserTreeIndices.front().UserTE == E &&
11708 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
11709 });
11710 assert(It != VectorizableTree.end() && "Expected gather node operand.");
11711 (*It)->VectorizedValue = V;
11712 }
11713 return V;
11714 }
11715 }
11716
11717 // Find the corresponding gather entry and vectorize it.
11718 // This allows the tree/graph transformations to be more accurate and, in
11719 // many cases, checks the correctness of the transformations.
11720 auto *I = find_if(VectorizableTree,
11721 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
11722 return TE->isOperandGatherNode({E, NodeIdx});
11723 });
11724 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
11725 assert(I->get()->UserTreeIndices.size() == 1 &&
11726 "Expected only single user for the gather node.");
11727 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
11728 return vectorizeTree(I->get(), PostponedPHIs);
11729}
11730
11731template <typename BVTy, typename ResTy, typename... Args>
11732ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
11733 assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
11734 unsigned VF = E->getVectorFactor();
11735
11736 bool NeedFreeze = false;
11737 SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
11738 E->ReuseShuffleIndices.end());
11739 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
11740 // Build a mask out of the reorder indices and reorder scalars per this
11741 // mask.
11742 SmallVector<int> ReorderMask;
11743 inversePermutation(E->ReorderIndices, ReorderMask);
11744 if (!ReorderMask.empty())
11745 reorderScalars(GatheredScalars, ReorderMask);
11746 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
11747 unsigned I, unsigned SliceSize) {
11748 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
11749 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
11750 }))
11751 return false;
11752 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
11753 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
11754 if (UserTE->getNumOperands() != 2)
11755 return false;
11756 auto *It =
11757 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
11758 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
11759 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
11760 }) != TE->UserTreeIndices.end();
11761 });
11762 if (It == VectorizableTree.end())
11763 return false;
11764 int Idx;
11765 if ((Mask.size() < InputVF &&
11766 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
11767 Idx == 0) ||
11768 (Mask.size() == InputVF &&
11769 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
11770 std::iota(std::next(Mask.begin(), I * SliceSize),
11771 std::next(Mask.begin(), (I + 1) * SliceSize), 0);
11772 } else {
11773 unsigned IVal =
11774 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
11775 std::fill(std::next(Mask.begin(), I * SliceSize),
11776 std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
11777 }
11778 return true;
11779 };
11780 BVTy ShuffleBuilder(Params...);
11781 ResTy Res = ResTy();
11782 SmallVector<int> Mask;
11783 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
11784 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
11785 Value *ExtractVecBase = nullptr;
11786 bool UseVecBaseAsInput = false;
11787 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
11788 SmallVector<SmallVector<const TreeEntry *>> Entries;
11789 Type *ScalarTy = GatheredScalars.front()->getType();
11790 auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
11791 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11792 if (NumParts == 0 || NumParts >= GatheredScalars.size())
11793 NumParts = 1;
11794 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
11795 // Check for gathered extracts.
11796 bool Resized = false;
11797 ExtractShuffles =
11798 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
11799 if (!ExtractShuffles.empty()) {
11800 SmallVector<const TreeEntry *> ExtractEntries;
11801 for (auto [Idx, I] : enumerate(ExtractMask)) {
11802 if (I == PoisonMaskElem)
11803 continue;
11804 if (const auto *TE = getTreeEntry(
11805 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
11806 ExtractEntries.push_back(TE);
11807 }
11808 if (std::optional<ResTy> Delayed =
11809 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
11810 // Delay emission of gathers which are not ready yet.
11811 PostponedGathers.insert(E);
11812 // Postpone gather emission, will be emitted after the end of the
11813 // process to keep correct order.
11814 return *Delayed;
11815 }
11816 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
11817 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
11818 ExtractVecBase = VecBase;
11819 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
11820 if (VF == VecBaseTy->getNumElements() &&
11821 GatheredScalars.size() != VF) {
11822 Resized = true;
11823 GatheredScalars.append(VF - GatheredScalars.size(),
11824 PoisonValue::get(ScalarTy));
11825 }
11826 }
11827 }
11828 // Gather extracts after we check for full matched gathers only.
11829 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
11830 E->isAltShuffle() ||
11831 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
11832 isSplat(E->Scalars) ||
11833 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
11834 GatherShuffles =
11835 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
11836 }
11837 if (!GatherShuffles.empty()) {
11838 if (std::optional<ResTy> Delayed =
11839 ShuffleBuilder.needToDelay(E, Entries)) {
11840 // Delay emission of gathers which are not ready yet.
11841 PostponedGathers.insert(E);
11842 // Postpone gather emission, will be emitted after the end of the
11843 // process to keep correct order.
11844 return *Delayed;
11845 }
11846 if (GatherShuffles.size() == 1 &&
11847 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
11848 Entries.front().front()->isSame(E->Scalars)) {
11849 // Perfect match in the graph, will reuse the previously vectorized
11850 // node. Cost is 0.
11851 LLVM_DEBUG(
11852 dbgs()
11853 << "SLP: perfect diamond match for gather bundle "
11854 << shortBundleName(E->Scalars) << ".\n");
11855 // Restore the mask for previous partially matched values.
11856 Mask.resize(E->Scalars.size());
11857 const TreeEntry *FrontTE = Entries.front().front();
11858 if (FrontTE->ReorderIndices.empty() &&
11859 ((FrontTE->ReuseShuffleIndices.empty() &&
11860 E->Scalars.size() == FrontTE->Scalars.size()) ||
11861 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
11862 std::iota(Mask.begin(), Mask.end(), 0);
11863 } else {
11864 for (auto [I, V] : enumerate(E->Scalars)) {
11865 if (isa<PoisonValue>(V)) {
11866 Mask[I] = PoisonMaskElem;
11867 continue;
11868 }
11869 Mask[I] = FrontTE->findLaneForValue(V);
11870 }
11871 }
11872 ShuffleBuilder.add(*FrontTE, Mask);
11873 Res = ShuffleBuilder.finalize(E->getCommonMask());
11874 return Res;
11875 }
11876 if (!Resized) {
11877 if (GatheredScalars.size() != VF &&
11878 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
11879 return any_of(TEs, [&](const TreeEntry *TE) {
11880 return TE->getVectorFactor() == VF;
11881 });
11882 }))
11883 GatheredScalars.append(VF - GatheredScalars.size(),
11884 PoisonValue::get(ScalarTy));
11885 }
11886 // Remove shuffled elements from list of gathers.
11887 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11888 if (Mask[I] != PoisonMaskElem)
11889 GatheredScalars[I] = PoisonValue::get(ScalarTy);
11890 }
11891 }
11892 }
11893 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
11894 SmallVectorImpl<int> &ReuseMask,
11895 bool IsRootPoison) {
11896 // For splats we can emit broadcasts instead of gathers, so try to find
11897 // such sequences.
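 // Illustrative example: for Scalars = {a, a, a, a} the lane-0 value is kept
 // and ReuseMask becomes {0, 0, 0, 0}, which can typically be lowered to a
 // single broadcast instead of four separate inserts.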
11898 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
11899 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
11900 Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
11901 SmallVector<int> UndefPos;
11902 DenseMap<Value *, unsigned> UniquePositions;
11903 // Gather unique non-const values and all constant values.
11904 // For repeated values, just shuffle them.
11905 int NumNonConsts = 0;
11906 int SinglePos = 0;
11907 for (auto [I, V] : enumerate(Scalars)) {
11908 if (isa<UndefValue>(V)) {
11909 if (!isa<PoisonValue>(V)) {
11910 ReuseMask[I] = I;
11911 UndefPos.push_back(I);
11912 }
11913 continue;
11914 }
11915 if (isConstant(V)) {
11916 ReuseMask[I] = I;
11917 continue;
11918 }
11919 ++NumNonConsts;
11920 SinglePos = I;
11921 Value *OrigV = V;
11922 Scalars[I] = PoisonValue::get(ScalarTy);
11923 if (IsSplat) {
11924 Scalars.front() = OrigV;
11925 ReuseMask[I] = 0;
11926 } else {
11927 const auto Res = UniquePositions.try_emplace(OrigV, I);
11928 Scalars[Res.first->second] = OrigV;
11929 ReuseMask[I] = Res.first->second;
11930 }
11931 }
11932 if (NumNonConsts == 1) {
11933 // Restore single insert element.
11934 if (IsSplat) {
11935 ReuseMask.assign(VF, PoisonMaskElem);
11936 std::swap(Scalars.front(), Scalars[SinglePos]);
11937 if (!UndefPos.empty() && UndefPos.front() == 0)
11938 Scalars.front() = UndefValue::get(ScalarTy);
11939 }
11940 ReuseMask[SinglePos] = SinglePos;
11941 } else if (!UndefPos.empty() && IsSplat) {
11942 // For undef values, try to replace them with the simple broadcast.
11943 // We can do it if the broadcasted value is guaranteed to be
11944 // non-poisonous, or by freezing the incoming scalar value first.
11945 auto *It = find_if(Scalars, [this, E](Value *V) {
11946 return !isa<UndefValue>(V) &&
11947 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
11948 (E->UserTreeIndices.size() == 1 &&
11949 any_of(V->uses(), [E](const Use &U) {
11950 // Check if the value is already used in the same operation in
11951 // one of the nodes.
11952 return E->UserTreeIndices.front().EdgeIdx !=
11953 U.getOperandNo() &&
11954 is_contained(
11955 E->UserTreeIndices.front().UserTE->Scalars,
11956 U.getUser());
11957 })));
11958 });
11959 if (It != Scalars.end()) {
11960 // Replace undefs by the non-poisoned scalars and emit broadcast.
11961 int Pos = std::distance(Scalars.begin(), It);
11962 for (int I : UndefPos) {
11963 // Set the undef position to the non-poisoned scalar.
11964 ReuseMask[I] = Pos;
11965 // Replace the undef with poison; in the mask it is already replaced
11966 // by the non-poisoned scalar.
11967 if (I != Pos)
11968 Scalars[I] = PoisonValue::get(ScalarTy);
11969 }
11970 } else {
11971 // Replace undefs by the poisons, emit broadcast and then emit
11972 // freeze.
11973 for (int I : UndefPos) {
11974 ReuseMask[I] = PoisonMaskElem;
11975 if (isa<UndefValue>(Scalars[I]))
11976 Scalars[I] = PoisonValue::get(ScalarTy);
11977 }
11978 NeedFreeze = true;
11979 }
11980 }
11981 };
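// Note (added explanation, not in the original source): TryPackScalars
// deduplicates the gathered scalars and records where each lane should be
// taken from. An illustrative, hypothetical example: for Scalars =
// {%a, 7, %a, %b} with VF = 4 and IsRootPoison = true (not a splat), the
// loop keeps the constant in place and maps repeated values to their first
// occurrence, producing Scalars = {%a, 7, poison, %b} and
// ReuseMask = {0, 1, 0, 3}; the buildvector then materializes only the
// unique scalars while the mask re-broadcasts the repeated %a from lane 0.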
11982 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
11983 bool IsNonPoisoned = true;
11984 bool IsUsedInExpr = true;
11985 Value *Vec1 = nullptr;
11986 if (!ExtractShuffles.empty()) {
11987 // A gather of extractelements can be represented as just a shuffle of
11988 // the one or two vectors the scalars are extracted from.
11989 // Find the input vectors.
11990 Value *Vec2 = nullptr;
11991 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
11992 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
11993 ExtractMask[I] = PoisonMaskElem;
11994 }
11995 if (UseVecBaseAsInput) {
11996 Vec1 = ExtractVecBase;
11997 } else {
11998 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
11999 if (ExtractMask[I] == PoisonMaskElem)
12000 continue;
12001 if (isa<UndefValue>(E->Scalars[I]))
12002 continue;
12003 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12004 Value *VecOp = EI->getVectorOperand();
12005 if (const auto *TE = getTreeEntry(VecOp))
12006 if (TE->VectorizedValue)
12007 VecOp = TE->VectorizedValue;
12008 if (!Vec1) {
12009 Vec1 = VecOp;
12010 } else if (Vec1 != VecOp) {
12011 assert((!Vec2 || Vec2 == VecOp) &&
12012 "Expected only 1 or 2 vectors shuffle.");
12013 Vec2 = VecOp;
12014 }
12015 }
12016 }
12017 if (Vec2) {
12018 IsUsedInExpr = false;
12019 IsNonPoisoned &=
12020 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12021 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12022 } else if (Vec1) {
12023 IsUsedInExpr &= FindReusedSplat(
12024 ExtractMask,
12025 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12026 ExtractMask.size());
12027 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12028 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12029 } else {
12030 IsUsedInExpr = false;
12031 ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
12032 ScalarTy, GatheredScalars.size())),
12033 ExtractMask, /*ForExtracts=*/true);
12034 }
12035 }
12036 if (!GatherShuffles.empty()) {
12037 unsigned SliceSize = E->Scalars.size() / NumParts;
12038 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12039 for (const auto [I, TEs] : enumerate(Entries)) {
12040 if (TEs.empty()) {
12041 assert(!GatherShuffles[I] &&
12042 "No shuffles with empty entries list expected.");
12043 continue;
12044 }
12045 assert((TEs.size() == 1 || TEs.size() == 2) &&
12046 "Expected shuffle of 1 or 2 entries.");
12047 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
12048 VecMask.assign(VecMask.size(), PoisonMaskElem);
12049 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12050 if (TEs.size() == 1) {
12051 IsUsedInExpr &= FindReusedSplat(
12052 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12053 ShuffleBuilder.add(*TEs.front(), VecMask);
12054 if (TEs.front()->VectorizedValue)
12055 IsNonPoisoned &=
12056 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12057 } else {
12058 IsUsedInExpr = false;
12059 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12060 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12061 IsNonPoisoned &=
12062 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12063 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12064 }
12065 }
12066 }
12067 // Try to figure out the best way to combine the values: build a shuffle
12068 // and insert elements, or just build several shuffles.
12069 // Insert non-constant scalars.
12070 SmallVector<Value *> NonConstants(GatheredScalars);
12071 int EMSz = ExtractMask.size();
12072 int MSz = Mask.size();
12073 // Try to build a constant vector and shuffle with it only if we currently
12074 // have a single permutation and more than one scalar constant.
12075 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12076 bool IsIdentityShuffle =
12077 ((UseVecBaseAsInput ||
12078 all_of(ExtractShuffles,
12079 [](const std::optional<TTI::ShuffleKind> &SK) {
12080 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12081 TTI::SK_PermuteSingleSrc;
12082 })) &&
12083 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12084 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
12085 (!GatherShuffles.empty() &&
12086 all_of(GatherShuffles,
12087 [](const std::optional<TTI::ShuffleKind> &SK) {
12088 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12089 TTI::SK_PermuteSingleSrc;
12090 }) &&
12091 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12092 ShuffleVectorInst::isIdentityMask(Mask, MSz));
12093 bool EnoughConstsForShuffle =
12094 IsSingleShuffle &&
12095 (none_of(GatheredScalars,
12096 [](Value *V) {
12097 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12098 }) ||
12099 any_of(GatheredScalars,
12100 [](Value *V) {
12101 return isa<Constant>(V) && !isa<UndefValue>(V);
12102 })) &&
12103 (!IsIdentityShuffle ||
12104 (GatheredScalars.size() == 2 &&
12105 any_of(GatheredScalars,
12106 [](Value *V) { return !isa<UndefValue>(V); })) ||
12107 count_if(GatheredScalars, [](Value *V) {
12108 return isa<Constant>(V) && !isa<PoisonValue>(V);
12109 }) > 1);
12110 // The NonConstants array contains just the non-constant values; GatheredScalars
12111 // contains only the constants used to build the final vector, which is then shuffled.
12112 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12113 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
12114 NonConstants[I] = PoisonValue::get(ScalarTy);
12115 else
12116 GatheredScalars[I] = PoisonValue::get(ScalarTy);
12117 }
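// Added illustration (not in the original source): with EnoughConstsForShuffle
// and GatheredScalars = {%x, 3, %y, 0}, the loop above leaves the constants in
// GatheredScalars = {poison, 3, poison, 0} and keeps the rest in
// NonConstants = {%x, poison, %y, poison}; the constant vector is shuffled in
// below and the non-constant scalars are inserted by the finalize() callback.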
12118 // Generate constants for final shuffle and build a mask for them.
12119 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12120 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12121 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12122 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12123 ShuffleBuilder.add(BV, BVMask);
12124 }
12125 if (all_of(NonConstants, [=](Value *V) {
12126 return isa<PoisonValue>(V) ||
12127 (IsSingleShuffle && ((IsIdentityShuffle &&
12128 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12129 }))
12130 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12131 else
12132 Res = ShuffleBuilder.finalize(
12133 E->ReuseShuffleIndices, E->Scalars.size(),
12134 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12135 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12136 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12137 });
12138 } else if (!allConstant(GatheredScalars)) {
12139 // Gather unique scalars and all constants.
12140 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12141 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12142 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12143 ShuffleBuilder.add(BV, ReuseMask);
12144 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12145 } else {
12146 // Gather all constants.
12147 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12148 for (auto [I, V] : enumerate(E->Scalars)) {
12149 if (!isa<PoisonValue>(V))
12150 Mask[I] = I;
12151 }
12152 Value *BV = ShuffleBuilder.gather(E->Scalars);
12153 ShuffleBuilder.add(BV, Mask);
12154 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12155 }
12156
12157 if (NeedFreeze)
12158 Res = ShuffleBuilder.createFreeze(Res);
12159 return Res;
12160}
12161
12162Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
12163 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
12164 *this);
12165}
12166
12167Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12168 IRBuilderBase::InsertPointGuard Guard(Builder);
12169
12170 if (E->VectorizedValue &&
12171 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12172 E->isAltShuffle())) {
12173 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12174 return E->VectorizedValue;
12175 }
12176
12177 if (E->State == TreeEntry::NeedToGather) {
12178 // Set insert point for non-reduction initial nodes.
12179 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12180 setInsertPointAfterBundle(E);
12181 Value *Vec = createBuildVector(E);
12182 E->VectorizedValue = Vec;
12183 return Vec;
12184 }
12185
12186 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
12187 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12188 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
12189 if (E->getOpcode() == Instruction::Store) {
12190 ArrayRef<int> Mask =
12191 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12192 E->ReorderIndices.size());
12193 ShuffleBuilder.add(V, Mask);
12194 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12195 ShuffleBuilder.addOrdered(V, std::nullopt);
12196 } else {
12197 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12198 }
12199 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12200 };
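// Added note (not in the original source): FinalShuffle re-applies the entry's
// ReorderIndices (directly as a shuffle mask for stores, as an ordered mask
// otherwise; reversed strided nodes skip it since the negative stride already
// accounts for the reversal) and then the ReuseShuffleIndices via finalize(),
// so the value recorded in E->VectorizedValue is already in the lane order
// expected by the entry's users.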
12201
12202 assert((E->State == TreeEntry::Vectorize ||
12203 E->State == TreeEntry::ScatterVectorize ||
12204 E->State == TreeEntry::StridedVectorize) &&
12205 "Unhandled state");
12206 unsigned ShuffleOrOp =
12207 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12208 Instruction *VL0 = E->getMainOp();
12209 Type *ScalarTy = VL0->getType();
12210 if (auto *Store = dyn_cast<StoreInst>(VL0))
12211 ScalarTy = Store->getValueOperand()->getType();
12212 else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
12213 ScalarTy = IE->getOperand(1)->getType();
12214 auto It = MinBWs.find(E);
12215 if (It != MinBWs.end())
12216 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
12217 auto GetOperandSignedness = [&](unsigned Idx) {
12218 const TreeEntry *OpE = getOperandEntry(E, Idx);
12219 bool IsSigned = false;
12220 auto It = MinBWs.find(OpE);
12221 if (It != MinBWs.end())
12222 IsSigned = It->second.second;
12223 else
12224 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
12225 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12226 });
12227 return IsSigned;
12228 };
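// Added note (not in the original source): an operand is treated as signed
// either if MinBWs recorded it as signed during bitwidth minimization, or,
// when it is not in MinBWs, if any of its scalars is not known to be
// non-negative; the flag selects between sext and zext when casting operands
// to the node's (possibly narrowed) vector type.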
12229 auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
12230 switch (ShuffleOrOp) {
12231 case Instruction::PHI: {
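// Added note (not in the original source): PHIs are emitted in two phases to
// break cyclic dependencies. On the first pass (PostponedPHIs == true) only an
// empty vector PHI is created, shuffled into the expected order and cached in
// E->PHI/E->VectorizedValue; the incoming values are vectorized and attached
// on a later pass with PostponedPHIs == false, casting them to the PHI's
// vector type if their bitwidth was minimized.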
12232 assert((E->ReorderIndices.empty() ||
12233 E != VectorizableTree.front().get() ||
12234 !E->UserTreeIndices.empty()) &&
12235 "PHI reordering is free.");
12236 if (PostponedPHIs && E->VectorizedValue)
12237 return E->VectorizedValue;
12238 auto *PH = cast<PHINode>(VL0);
12239 Builder.SetInsertPoint(PH->getParent(),
12240 PH->getParent()->getFirstNonPHIIt());
12241 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12242 if (PostponedPHIs || !E->VectorizedValue) {
12243 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
12244 E->PHI = NewPhi;
12245 Value *V = NewPhi;
12246
12247 // Adjust insertion point once all PHI's have been generated.
12248 Builder.SetInsertPoint(PH->getParent(),
12249 PH->getParent()->getFirstInsertionPt());
12250 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12251
12252 V = FinalShuffle(V, E, VecTy);
12253
12254 E->VectorizedValue = V;
12255 if (PostponedPHIs)
12256 return V;
12257 }
12258 PHINode *NewPhi = cast<PHINode>(E->PHI);
12259 // If phi node is fully emitted - exit.
12260 if (NewPhi->getNumIncomingValues() != 0)
12261 return NewPhi;
12262
12263 // PHINodes may have multiple entries from the same block. We want to
12264 // visit every block once.
12265 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12266
12267 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12269 BasicBlock *IBB = PH->getIncomingBlock(I);
12270
12271 // Stop emission if all incoming values are generated.
12272 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12273 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12274 return NewPhi;
12275 }
12276
12277 if (!VisitedBBs.insert(IBB).second) {
12278 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
12279 continue;
12280 }
12281
12282 Builder.SetInsertPoint(IBB->getTerminator());
12283 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12284 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
12285 if (VecTy != Vec->getType()) {
12286 assert((It != MinBWs.end() ||
12287 getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
12288 MinBWs.contains(getOperandEntry(E, I))) &&
12289 "Expected item in MinBWs.");
12290 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
12291 }
12292 NewPhi->addIncoming(Vec, IBB);
12293 }
12294
12295 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12296 "Invalid number of incoming values");
12297 return NewPhi;
12298 }
12299
12300 case Instruction::ExtractElement: {
12301 Value *V = E->getSingleOperand(0);
12302 if (const TreeEntry *TE = getTreeEntry(V))
12303 V = TE->VectorizedValue;
12304 setInsertPointAfterBundle(E);
12305 V = FinalShuffle(V, E, VecTy);
12306 E->VectorizedValue = V;
12307 return V;
12308 }
12309 case Instruction::ExtractValue: {
12310 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12311 Builder.SetInsertPoint(LI);
12312 Value *Ptr = LI->getPointerOperand();
12313 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
12314 Value *NewV = propagateMetadata(V, E->Scalars);
12315 NewV = FinalShuffle(NewV, E, VecTy);
12316 E->VectorizedValue = NewV;
12317 return NewV;
12318 }
12319 case Instruction::InsertElement: {
12320 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12321 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
12322 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
12323 ArrayRef<Value *> Op = E->getOperand(1);
12324 Type *ScalarTy = Op.front()->getType();
12325 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
12326 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12327 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
12328 assert(Res.first > 0 && "Expected item in MinBWs.");
12329 V = Builder.CreateIntCast(
12330 V,
12331 FixedVectorType::get(
12332 ScalarTy,
12333 cast<FixedVectorType>(V->getType())->getNumElements()),
12334 Res.second);
12335 }
12336
12337 // Create InsertVector shuffle if necessary
12338 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
12339 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12340 }));
12341 const unsigned NumElts =
12342 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12343 const unsigned NumScalars = E->Scalars.size();
12344
12345 unsigned Offset = *getInsertIndex(VL0);
12346 assert(Offset < NumElts && "Failed to find vector index offset");
12347
12348 // Create shuffle to resize vector
12349 SmallVector<int> Mask;
12350 if (!E->ReorderIndices.empty()) {
12351 inversePermutation(E->ReorderIndices, Mask);
12352 Mask.append(NumElts - NumScalars, PoisonMaskElem);
12353 } else {
12354 Mask.assign(NumElts, PoisonMaskElem);
12355 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
12356 }
12357 // Create InsertVector shuffle if necessary
12358 bool IsIdentity = true;
12359 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12360 Mask.swap(PrevMask);
12361 for (unsigned I = 0; I < NumScalars; ++I) {
12362 Value *Scalar = E->Scalars[PrevMask[I]];
12363 unsigned InsertIdx = *getInsertIndex(Scalar);
12364 IsIdentity &= InsertIdx - Offset == I;
12365 Mask[InsertIdx - Offset] = I;
12366 }
12367 if (!IsIdentity || NumElts != NumScalars) {
12368 Value *V2 = nullptr;
12369 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12370 SmallVector<int> InsertMask(Mask);
12371 if (NumElts != NumScalars && Offset == 0) {
12372 // Follow all insert element instructions from the current buildvector
12373 // sequence.
12374 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
12375 do {
12376 std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
12377 if (!InsertIdx)
12378 break;
12379 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12380 InsertMask[*InsertIdx] = *InsertIdx;
12381 if (!Ins->hasOneUse())
12382 break;
12383 Ins = dyn_cast_or_null<InsertElementInst>(
12384 Ins->getUniqueUndroppableUser());
12385 } while (Ins);
12386 SmallBitVector UseMask =
12387 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12388 SmallBitVector IsFirstPoison =
12389 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12390 SmallBitVector IsFirstUndef =
12391 isUndefVector(FirstInsert->getOperand(0), UseMask);
12392 if (!IsFirstPoison.all()) {
12393 unsigned Idx = 0;
12394 for (unsigned I = 0; I < NumElts; I++) {
12395 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
12396 IsFirstUndef.test(I)) {
12397 if (IsVNonPoisonous) {
12398 InsertMask[I] = I < NumScalars ? I : 0;
12399 continue;
12400 }
12401 if (!V2)
12402 V2 = UndefValue::get(V->getType());
12403 if (Idx >= NumScalars)
12404 Idx = NumScalars - 1;
12405 InsertMask[I] = NumScalars + Idx;
12406 ++Idx;
12407 } else if (InsertMask[I] != PoisonMaskElem &&
12408 Mask[I] == PoisonMaskElem) {
12409 InsertMask[I] = PoisonMaskElem;
12410 }
12411 }
12412 } else {
12413 InsertMask = Mask;
12414 }
12415 }
12416 if (!V2)
12417 V2 = PoisonValue::get(V->getType());
12418 V = Builder.CreateShuffleVector(V, V2, InsertMask);
12419 if (auto *I = dyn_cast<Instruction>(V)) {
12420 GatherShuffleExtractSeq.insert(I);
12421 CSEBlocks.insert(I->getParent());
12422 }
12423 }
12424
12425 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
12426 for (unsigned I = 0; I < NumElts; I++) {
12427 if (Mask[I] != PoisonMaskElem)
12428 InsertMask[Offset + I] = I;
12429 }
12430 SmallBitVector UseMask =
12431 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12432 SmallBitVector IsFirstUndef =
12433 isUndefVector(FirstInsert->getOperand(0), UseMask);
12434 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
12435 NumElts != NumScalars) {
12436 if (IsFirstUndef.all()) {
12437 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
12438 SmallBitVector IsFirstPoison =
12439 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12440 if (!IsFirstPoison.all()) {
12441 for (unsigned I = 0; I < NumElts; I++) {
12442 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
12443 InsertMask[I] = I + NumElts;
12444 }
12445 }
12446 V = Builder.CreateShuffleVector(
12447 V,
12448 IsFirstPoison.all() ? PoisonValue::get(V->getType())
12449 : FirstInsert->getOperand(0),
12450 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
12451 if (auto *I = dyn_cast<Instruction>(V)) {
12452 GatherShuffleExtractSeq.insert(I);
12453 CSEBlocks.insert(I->getParent());
12454 }
12455 }
12456 } else {
12457 SmallBitVector IsFirstPoison =
12458 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12459 for (unsigned I = 0; I < NumElts; I++) {
12460 if (InsertMask[I] == PoisonMaskElem)
12461 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
12462 else
12463 InsertMask[I] += NumElts;
12464 }
12465 V = Builder.CreateShuffleVector(
12466 FirstInsert->getOperand(0), V, InsertMask,
12467 cast<Instruction>(E->Scalars.back())->getName());
12468 if (auto *I = dyn_cast<Instruction>(V)) {
12469 GatherShuffleExtractSeq.insert(I);
12470 CSEBlocks.insert(I->getParent());
12471 }
12472 }
12473 }
12474
12475 ++NumVectorInstructions;
12476 E->VectorizedValue = V;
12477 return V;
12478 }
12479 case Instruction::ZExt:
12480 case Instruction::SExt:
12481 case Instruction::FPToUI:
12482 case Instruction::FPToSI:
12483 case Instruction::FPExt:
12484 case Instruction::PtrToInt:
12485 case Instruction::IntToPtr:
12486 case Instruction::SIToFP:
12487 case Instruction::UIToFP:
12488 case Instruction::Trunc:
12489 case Instruction::FPTrunc:
12490 case Instruction::BitCast: {
12491 setInsertPointAfterBundle(E);
12492
12493 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
12494 if (E->VectorizedValue) {
12495 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12496 return E->VectorizedValue;
12497 }
12498
12499 auto *CI = cast<CastInst>(VL0);
12500 Instruction::CastOps VecOpcode = CI->getOpcode();
12501 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
12502 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
12503 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
12504 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
12505 SrcScalarTy != CI->getOperand(0)->getType())) {
12506 // Check if the values are candidates to demote.
12507 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
12508 if (SrcIt != MinBWs.end())
12509 SrcBWSz = SrcIt->second.first;
12510 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
12511 if (BWSz == SrcBWSz) {
12512 VecOpcode = Instruction::BitCast;
12513 } else if (BWSz < SrcBWSz) {
12514 VecOpcode = Instruction::Trunc;
12515 } else if (It != MinBWs.end()) {
12516 assert(BWSz > SrcBWSz && "Invalid cast!");
12517 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12518 } else if (SrcIt != MinBWs.end()) {
12519 assert(BWSz > SrcBWSz && "Invalid cast!");
12520 VecOpcode =
12521 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12522 }
12523 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
12524 !SrcIt->second.second) {
12525 VecOpcode = Instruction::UIToFP;
12526 }
12527 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12528 ? InVec
12529 : Builder.CreateCast(VecOpcode, InVec, VecTy);
12530 V = FinalShuffle(V, E, VecTy);
12531
12532 E->VectorizedValue = V;
12533 ++NumVectorInstructions;
12534 return V;
12535 }
12536 case Instruction::FCmp:
12537 case Instruction::ICmp: {
12538 setInsertPointAfterBundle(E);
12539
12540 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
12541 if (E->VectorizedValue) {
12542 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12543 return E->VectorizedValue;
12544 }
12545 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
12546 if (E->VectorizedValue) {
12547 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12548 return E->VectorizedValue;
12549 }
12550 if (L->getType() != R->getType()) {
12551 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12552 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12553 MinBWs.contains(getOperandEntry(E, 0)) ||
12554 MinBWs.contains(getOperandEntry(E, 1))) &&
12555 "Expected item in MinBWs.");
12556 if (cast<VectorType>(L->getType())
12557 ->getElementType()
12558 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
12559 ->getElementType()
12560 ->getIntegerBitWidth()) {
12561 Type *CastTy = R->getType();
12562 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
12563 } else {
12564 Type *CastTy = L->getType();
12565 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
12566 }
12567 }
12568
12569 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
12570 Value *V = Builder.CreateCmp(P0, L, R);
12571 propagateIRFlags(V, E->Scalars, VL0);
12572 // Do not cast for cmps.
12573 VecTy = cast<FixedVectorType>(V->getType());
12574 V = FinalShuffle(V, E, VecTy);
12575
12576 E->VectorizedValue = V;
12577 ++NumVectorInstructions;
12578 return V;
12579 }
12580 case Instruction::Select: {
12581 setInsertPointAfterBundle(E);
12582
12583 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
12584 if (E->VectorizedValue) {
12585 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12586 return E->VectorizedValue;
12587 }
12588 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
12589 if (E->VectorizedValue) {
12590 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12591 return E->VectorizedValue;
12592 }
12593 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
12594 if (E->VectorizedValue) {
12595 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12596 return E->VectorizedValue;
12597 }
12598 if (True->getType() != VecTy || False->getType() != VecTy) {
12599 assert((It != MinBWs.end() ||
12600 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12601 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
12602 MinBWs.contains(getOperandEntry(E, 1)) ||
12603 MinBWs.contains(getOperandEntry(E, 2))) &&
12604 "Expected item in MinBWs.");
12605 if (True->getType() != VecTy)
12606 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
12607 if (False->getType() != VecTy)
12608 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
12609 }
12610
12611 Value *V = Builder.CreateSelect(Cond, True, False);
12612 V = FinalShuffle(V, E, VecTy);
12613
12614 E->VectorizedValue = V;
12615 ++NumVectorInstructions;
12616 return V;
12617 }
12618 case Instruction::FNeg: {
12619 setInsertPointAfterBundle(E);
12620
12621 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
12622
12623 if (E->VectorizedValue) {
12624 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12625 return E->VectorizedValue;
12626 }
12627
12628 Value *V = Builder.CreateUnOp(
12629 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
12630 propagateIRFlags(V, E->Scalars, VL0);
12631 if (auto *I = dyn_cast<Instruction>(V))
12632 V = propagateMetadata(I, E->Scalars);
12633
12634 V = FinalShuffle(V, E, VecTy);
12635
12636 E->VectorizedValue = V;
12637 ++NumVectorInstructions;
12638
12639 return V;
12640 }
12641 case Instruction::Add:
12642 case Instruction::FAdd:
12643 case Instruction::Sub:
12644 case Instruction::FSub:
12645 case Instruction::Mul:
12646 case Instruction::FMul:
12647 case Instruction::UDiv:
12648 case Instruction::SDiv:
12649 case Instruction::FDiv:
12650 case Instruction::URem:
12651 case Instruction::SRem:
12652 case Instruction::FRem:
12653 case Instruction::Shl:
12654 case Instruction::LShr:
12655 case Instruction::AShr:
12656 case Instruction::And:
12657 case Instruction::Or:
12658 case Instruction::Xor: {
12659 setInsertPointAfterBundle(E);
12660
12661 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
12662 if (E->VectorizedValue) {
12663 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12664 return E->VectorizedValue;
12665 }
12666 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
12667 if (E->VectorizedValue) {
12668 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12669 return E->VectorizedValue;
12670 }
12671 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
12672 assert((It != MinBWs.end() ||
12673 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12674 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12675 MinBWs.contains(getOperandEntry(E, 0)) ||
12676 MinBWs.contains(getOperandEntry(E, 1))) &&
12677 "Expected item in MinBWs.");
12678 if (LHS->getType() != VecTy)
12679 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
12680 if (RHS->getType() != VecTy)
12681 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
12682 }
12683
12684 Value *V = Builder.CreateBinOp(
12685 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
12686 RHS);
12687 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
12688 if (auto *I = dyn_cast<Instruction>(V)) {
12689 V = propagateMetadata(I, E->Scalars);
12690 // Drop nuw flags for abs(sub(commutative), true).
12691 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
12692 any_of(E->Scalars, [](Value *V) {
12693 return isCommutative(cast<Instruction>(V));
12694 }))
12695 I->setHasNoUnsignedWrap(/*b=*/false);
12696 }
12697
12698 V = FinalShuffle(V, E, VecTy);
12699
12700 E->VectorizedValue = V;
12701 ++NumVectorInstructions;
12702
12703 return V;
12704 }
12705 case Instruction::Load: {
12706 // Loads are inserted at the head of the tree because we don't want to
12707 // sink them all the way down past store instructions.
12708 setInsertPointAfterBundle(E);
12709
12710 LoadInst *LI = cast<LoadInst>(VL0);
12711 Instruction *NewLI;
12712 Value *PO = LI->getPointerOperand();
12713 if (E->State == TreeEntry::Vectorize) {
12714 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
12715 } else if (E->State == TreeEntry::StridedVectorize) {
12716 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
12717 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
12718 PO = IsReverseOrder ? PtrN : Ptr0;
12719 std::optional<int> Diff = getPointersDiff(
12720 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
12721 Type *StrideTy = DL->getIndexType(PO->getType());
12722 Value *StrideVal;
12723 if (Diff) {
12724 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
12725 StrideVal =
12726 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
12727 DL->getTypeAllocSize(ScalarTy));
12728 } else {
12729 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
12730 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
12731 return cast<LoadInst>(V)->getPointerOperand();
12732 });
12733 OrdersType Order;
12734 std::optional<Value *> Stride =
12735 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
12736 &*Builder.GetInsertPoint());
12737 Value *NewStride =
12738 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
12739 StrideVal = Builder.CreateMul(
12740 NewStride,
12741 ConstantInt::get(
12742 StrideTy,
12743 (IsReverseOrder ? -1 : 1) *
12744 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
12745 }
12746 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
12747 auto *Inst = Builder.CreateIntrinsic(
12748 Intrinsic::experimental_vp_strided_load,
12749 {VecTy, PO->getType(), StrideTy},
12750 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
12751 Builder.getInt32(E->Scalars.size())});
12752 Inst->addParamAttr(
12753 /*ArgNo=*/0,
12754 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
12755 NewLI = Inst;
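// Added note (not in the original source): the strided load above is emitted
// roughly as (illustrative types and names):
// %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
// ptr align A %PO, i64 %StrideInBytes, <4 x i1> splat (i1 true), i32 4)
// where the stride is in bytes (element alloc size times the constant or
// runtime element stride, negated for reversed orders) and the alignment is
// the minimum over the scalar loads.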
12756 } else {
12757 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
12758 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
12759 if (E->VectorizedValue) {
12760 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12761 return E->VectorizedValue;
12762 }
12763 // Use the minimum alignment of the gathered loads.
12764 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
12765 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
12766 }
12767 Value *V = propagateMetadata(NewLI, E->Scalars);
12768
12769 V = FinalShuffle(V, E, VecTy);
12770 E->VectorizedValue = V;
12771 ++NumVectorInstructions;
12772 return V;
12773 }
12774 case Instruction::Store: {
12775 auto *SI = cast<StoreInst>(VL0);
12776
12777 setInsertPointAfterBundle(E);
12778
12779 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
12780 if (VecValue->getType() != VecTy)
12781 VecValue =
12782 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
12783 VecValue = FinalShuffle(VecValue, E, VecTy);
12784
12785 Value *Ptr = SI->getPointerOperand();
12786 StoreInst *ST =
12787 Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
12788
12789 Value *V = propagateMetadata(ST, E->Scalars);
12790
12791 E->VectorizedValue = V;
12792 ++NumVectorInstructions;
12793 return V;
12794 }
12795 case Instruction::GetElementPtr: {
12796 auto *GEP0 = cast<GetElementPtrInst>(VL0);
12797 setInsertPointAfterBundle(E);
12798
12799 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
12800 if (E->VectorizedValue) {
12801 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12802 return E->VectorizedValue;
12803 }
12804
12805 SmallVector<Value *> OpVecs;
12806 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
12807 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
12808 if (E->VectorizedValue) {
12809 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12810 return E->VectorizedValue;
12811 }
12812 OpVecs.push_back(OpVec);
12813 }
12814
12815 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
12816 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
12817 SmallVector<Value *> GEPs;
12818 for (Value *V : E->Scalars) {
12819 if (isa<GetElementPtrInst>(V))
12820 GEPs.push_back(V);
12821 }
12822 V = propagateMetadata(I, GEPs);
12823 }
12824
12825 V = FinalShuffle(V, E, VecTy);
12826
12827 E->VectorizedValue = V;
12828 ++NumVectorInstructions;
12829
12830 return V;
12831 }
12832 case Instruction::Call: {
12833 CallInst *CI = cast<CallInst>(VL0);
12834 setInsertPointAfterBundle(E);
12835
12836 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12837
12838 SmallVector<Type *> ArgTys =
12839 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
12840 It != MinBWs.end() ? It->second.first : 0);
12841 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
12842 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
12843 VecCallCosts.first <= VecCallCosts.second;
12844
12845 Value *ScalarArg = nullptr;
12846 SmallVector<Value *> OpVecs;
12847 SmallVector<Type *, 2> TysForDecl;
12848 // Add return type if intrinsic is overloaded on it.
12849 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
12850 TysForDecl.push_back(VecTy);
12851 auto *CEI = cast<CallInst>(VL0);
12852 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
12853 ValueList OpVL;
12854 // Some intrinsics have scalar arguments. This argument should not be
12855 // vectorized.
12856 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
12857 ScalarArg = CEI->getArgOperand(I);
12858 // If we decided to reduce the bitwidth of the abs intrinsic, its second
12859 // argument must be set to false (do not return poison if the value is the signed minimum).
12860 if (ID == Intrinsic::abs && It != MinBWs.end() &&
12861 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
12862 ScalarArg = Builder.getFalse();
12863 OpVecs.push_back(ScalarArg);
12864 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
12865 TysForDecl.push_back(ScalarArg->getType());
12866 continue;
12867 }
12868
12869 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
12870 if (E->VectorizedValue) {
12871 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12872 return E->VectorizedValue;
12873 }
12874 ScalarArg = CEI->getArgOperand(I);
12875 if (cast<VectorType>(OpVec->getType())->getElementType() !=
12876 ScalarArg->getType() &&
12877 It == MinBWs.end()) {
12878 auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
12879 VecTy->getNumElements());
12880 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
12881 } else if (It != MinBWs.end()) {
12882 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
12883 }
12884 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
12885 OpVecs.push_back(OpVec);
12886 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
12887 TysForDecl.push_back(OpVec->getType());
12888 }
12889
12890 Function *CF;
12891 if (!UseIntrinsic) {
12892 VFShape Shape =
12893 VFShape::get(CI->getFunctionType(),
12894 ElementCount::getFixed(
12895 static_cast<unsigned>(VecTy->getNumElements())),
12896 false /*HasGlobalPred*/);
12897 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
12898 } else {
12899 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
12900 }
12901
12902 SmallVector<OperandBundleDef, 1> OpBundles;
12903 CI->getOperandBundlesAsDefs(OpBundles);
12904 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
12905
12906 propagateIRFlags(V, E->Scalars, VL0);
12907 V = FinalShuffle(V, E, VecTy);
12908
12909 E->VectorizedValue = V;
12910 ++NumVectorInstructions;
12911 return V;
12912 }
12913 case Instruction::ShuffleVector: {
12914 assert(E->isAltShuffle() &&
12915 ((Instruction::isBinaryOp(E->getOpcode()) &&
12916 Instruction::isBinaryOp(E->getAltOpcode())) ||
12917 (Instruction::isCast(E->getOpcode()) &&
12918 Instruction::isCast(E->getAltOpcode())) ||
12919 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
12920 "Invalid Shuffle Vector Operand");
12921
12922 Value *LHS = nullptr, *RHS = nullptr;
12923 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
12924 setInsertPointAfterBundle(E);
12925 LHS = vectorizeOperand(E, 0, PostponedPHIs);
12926 if (E->VectorizedValue) {
12927 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12928 return E->VectorizedValue;
12929 }
12930 RHS = vectorizeOperand(E, 1, PostponedPHIs);
12931 } else {
12932 setInsertPointAfterBundle(E);
12933 LHS = vectorizeOperand(E, 0, PostponedPHIs);
12934 }
12935 if (E->VectorizedValue) {
12936 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12937 return E->VectorizedValue;
12938 }
12939 if (LHS && RHS &&
12940 ((Instruction::isBinaryOp(E->getOpcode()) &&
12941 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
12942 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
12943 assert((It != MinBWs.end() ||
12944 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12945 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12946 MinBWs.contains(getOperandEntry(E, 0)) ||
12947 MinBWs.contains(getOperandEntry(E, 1))) &&
12948 "Expected item in MinBWs.");
12949 Type *CastTy = VecTy;
12950 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
12951 if (cast<VectorType>(LHS->getType())
12952 ->getElementType()
12953 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
12954 ->getElementType()
12955 ->getIntegerBitWidth())
12956 CastTy = RHS->getType();
12957 else
12958 CastTy = LHS->getType();
12959 }
12960 if (LHS->getType() != CastTy)
12961 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
12962 if (RHS->getType() != CastTy)
12963 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
12964 }
12965
12966 Value *V0, *V1;
12967 if (Instruction::isBinaryOp(E->getOpcode())) {
12968 V0 = Builder.CreateBinOp(
12969 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
12970 V1 = Builder.CreateBinOp(
12971 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
12972 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
12973 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
12974 auto *AltCI = cast<CmpInst>(E->getAltOp());
12975 CmpInst::Predicate AltPred = AltCI->getPredicate();
12976 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
12977 } else {
12978 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
12979 unsigned SrcBWSz = DL->getTypeSizeInBits(
12980 cast<VectorType>(LHS->getType())->getElementType());
12981 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
12982 if (BWSz <= SrcBWSz) {
12983 if (BWSz < SrcBWSz)
12984 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
12985 assert(LHS->getType() == VecTy && "Expected same type as operand.");
12986 if (auto *I = dyn_cast<Instruction>(LHS))
12987 LHS = propagateMetadata(I, E->Scalars);
12988 E->VectorizedValue = LHS;
12989 ++NumVectorInstructions;
12990 return LHS;
12991 }
12992 }
12993 V0 = Builder.CreateCast(
12994 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
12995 V1 = Builder.CreateCast(
12996 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
12997 }
12998 // Add V0 and V1 to later analysis to try to find and remove matching
12999 // instruction, if any.
13000 for (Value *V : {V0, V1}) {
13001 if (auto *I = dyn_cast<Instruction>(V)) {
13002 GatherShuffleExtractSeq.insert(I);
13003 CSEBlocks.insert(I->getParent());
13004 }
13005 }
13006
13007 // Create shuffle to take alternate operations from the vector.
13008 // Also, gather up main and alt scalar ops to propagate IR flags to
13009 // each vector operation.
13010 ValueList OpScalars, AltScalars;
13011 SmallVector<int> Mask;
13012 E->buildAltOpShuffleMask(
13013 [E, this](Instruction *I) {
13014 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13015 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
13016 *TLI);
13017 },
13018 Mask, &OpScalars, &AltScalars);
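// Added illustration (not in the original source): for an alternate-opcode
// node with scalars {a0+b0, a1-b1, a2+b2, a3-b3}, V0 is the vector add, V1 the
// vector sub, and the mask built above selects add lanes from V0 and sub lanes
// from V1 (offset by the vector width), e.g. Mask = {0, 5, 2, 7}.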
13019
13020 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
13021 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
13022 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13023 // Drop nuw flags for abs(sub(commutative), true).
13024 if (auto *I = dyn_cast<Instruction>(Vec);
13025 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
13026 any_of(E->Scalars, [](Value *V) {
13027 auto *IV = cast<Instruction>(V);
13028 return IV->getOpcode() == Instruction::Sub &&
13029 isCommutative(cast<Instruction>(IV));
13030 }))
13031 I->setHasNoUnsignedWrap(/*b=*/false);
13032 };
13033 DropNuwFlag(V0, E->getOpcode());
13034 DropNuwFlag(V1, E->getAltOpcode());
13035
13036 Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
13037 if (auto *I = dyn_cast<Instruction>(V)) {
13038 V = propagateMetadata(I, E->Scalars);
13039 GatherShuffleExtractSeq.insert(I);
13040 CSEBlocks.insert(I->getParent());
13041 }
13042
13043 E->VectorizedValue = V;
13044 ++NumVectorInstructions;
13045
13046 return V;
13047 }
13048 default:
13049 llvm_unreachable("unknown inst");
13050 }
13051 return nullptr;
13052}
13053
13054 Value *BoUpSLP::vectorizeTree() {
13055 ExtraValueToDebugLocsMap ExternallyUsedValues;
13056 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13057 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13058}
13059
13060namespace {
13061/// Data type for handling buildvector sequences with the reused scalars from
13062/// other tree entries.
13063struct ShuffledInsertData {
13064 /// List of insertelements to be replaced by shuffles.
13065 SmallVector<InsertElementInst *> InsertElements;
13066 /// The parent vectors and shuffle mask for the given list of inserts.
13067 MapVector<Value *, SmallVector<int>> ValueMasks;
13068};
13069} // namespace
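// Added note (not in the original source): ShuffledInsertData groups the
// insertelement chain of one buildvector together with a per-source-vector
// mask (ValueMasks), so the whole chain can later be replaced by one or two
// shuffles of the vectorized values instead of a sequence of scalar inserts.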
13070
13071 Value *BoUpSLP::vectorizeTree(
13072 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13073 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13074 Instruction *ReductionRoot) {
13075 // All blocks must be scheduled before any instructions are inserted.
13076 for (auto &BSIter : BlocksSchedules) {
13077 scheduleBlock(BSIter.second.get());
13078 }
13079 // Clear the Entry-to-LastInstruction table. It can be affected by scheduling,
13080 // so we need to rebuild it.
13081 EntryToLastInstruction.clear();
13082
13083 if (ReductionRoot)
13084 Builder.SetInsertPoint(ReductionRoot->getParent(),
13085 ReductionRoot->getIterator());
13086 else
13087 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13088
13089 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
13090 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13091 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13092 if (TE->State == TreeEntry::Vectorize &&
13093 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13094 TE->VectorizedValue)
13095 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
13096 // Run through the list of postponed gathers and emit them, replacing the temp
13097 // emitted allocas with actual vector instructions.
13098 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13099 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13100 for (const TreeEntry *E : PostponedNodes) {
13101 auto *TE = const_cast<TreeEntry *>(E);
13102 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
13103 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13104 TE->UserTreeIndices.front().EdgeIdx)))
13105 // Found a gather node which is exactly the same as one of the
13106 // vectorized nodes. It may happen after reordering.
13107 continue;
13108 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13109 TE->VectorizedValue = nullptr;
13110 auto *UserI =
13111 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13112 // If the user is a PHI node, its vector code has to be inserted right before
13113 // the block terminator. Since the node was delayed, there were some unresolved
13114 // dependencies at the moment when the stub instruction was emitted. In a case
13115 // when any of these dependencies turn out to be an operand of another PHI coming
13116 // from this same block, the position of the stub instruction becomes invalid.
13117 // This is because the source vector that is supposed to feed this gather node was
13118 // inserted at the end of the block [after the stub instruction]. So we need
13119 // to adjust the insertion point again to the end of the block.
13120 if (isa<PHINode>(UserI)) {
13121 // Insert before all users.
13122 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13123 for (User *U : PrevVec->users()) {
13124 if (U == UserI)
13125 continue;
13126 auto *UI = dyn_cast<Instruction>(U);
13127 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
13128 continue;
13129 if (UI->comesBefore(InsertPt))
13130 InsertPt = UI;
13131 }
13132 Builder.SetInsertPoint(InsertPt);
13133 } else {
13134 Builder.SetInsertPoint(PrevVec);
13135 }
13136 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13137 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
13138 if (Vec->getType() != PrevVec->getType()) {
13139 assert(Vec->getType()->isIntOrIntVectorTy() &&
13140 PrevVec->getType()->isIntOrIntVectorTy() &&
13141 "Expected integer vector types only.");
13142 assert(MinBWs.contains(TE->UserTreeIndices.front().UserTE) &&
13143 "Expected user in MinBWs.");
13144 bool IsSigned = MinBWs.lookup(TE->UserTreeIndices.front().UserTE).second;
13145 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), IsSigned);
13146 }
13147 PrevVec->replaceAllUsesWith(Vec);
13148 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
13149 // Replace the stub vector node, if it was used before for one of the
13150 // buildvector nodes already.
13151 auto It = PostponedValues.find(PrevVec);
13152 if (It != PostponedValues.end()) {
13153 for (TreeEntry *VTE : It->getSecond())
13154 VTE->VectorizedValue = Vec;
13155 }
13156 eraseInstruction(PrevVec);
13157 }
13158
13159 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13160 << " values .\n");
13161
13162 SmallVector<ShuffledInsertData> ShuffledInserts;
13163 // Maps vector instruction to original insertelement instruction
13164 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13165 // Maps extract Scalar to the corresponding extractelement instruction in the
13166 // basic block. Only one extractelement per block should be emitted.
13167 DenseMap<Value *,
13168 DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13169 ScalarToEEs;
13170 SmallDenseSet<Value *, 4> UsedInserts;
13171 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13172 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13173 // Extract all of the elements with the external uses.
13174 for (const auto &ExternalUse : ExternalUses) {
13175 Value *Scalar = ExternalUse.Scalar;
13176 llvm::User *User = ExternalUse.User;
13177
13178 // Skip users that we have already RAUWed. This happens when one instruction
13179 // has multiple uses of the same value.
13180 if (User && !is_contained(Scalar->users(), User))
13181 continue;
13182 TreeEntry *E = getTreeEntry(Scalar);
13183 assert(E && "Invalid scalar");
13184 assert(E->State != TreeEntry::NeedToGather &&
13185 "Extracting from a gather list");
13186 // Non-instruction pointers are not deleted, just skip them.
13187 if (E->getOpcode() == Instruction::GetElementPtr &&
13188 !isa<GetElementPtrInst>(Scalar))
13189 continue;
13190
13191 Value *Vec = E->VectorizedValue;
13192 assert(Vec && "Can't find vectorizable value");
13193
13194 Value *Lane = Builder.getInt32(ExternalUse.Lane);
13195 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13196 if (Scalar->getType() != Vec->getType()) {
13197 Value *Ex = nullptr;
13198 Value *ExV = nullptr;
13199 auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
13200 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
13201 auto It = ScalarToEEs.find(Scalar);
13202 if (It != ScalarToEEs.end()) {
13203 // No need to emit many extracts, just move the only one in the
13204 // current block.
13205 auto EEIt = It->second.find(Builder.GetInsertBlock());
13206 if (EEIt != It->second.end()) {
13207 Instruction *I = EEIt->second.first;
13208 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13209 Builder.GetInsertPoint()->comesBefore(I)) {
13210 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
13211 Builder.GetInsertPoint());
13212 if (auto *CI = EEIt->second.second)
13213 CI->moveAfter(I);
13214 }
13215 Ex = I;
13216 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13217 }
13218 }
13219 if (!Ex) {
13220 // "Reuse" the existing extract to improve final codegen.
13221 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13222 Value *V = ES->getVectorOperand();
13223 if (const TreeEntry *ETE = getTreeEntry(V))
13224 V = ETE->VectorizedValue;
13225 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
13226 } else if (ReplaceGEP) {
13227 // Leave the GEPs as is, they are free in most cases and better to
13228 // keep them as GEPs.
13229 auto *CloneGEP = GEP->clone();
13230 CloneGEP->insertBefore(*Builder.GetInsertBlock(),
13231 Builder.GetInsertPoint());
13232 if (GEP->hasName())
13233 CloneGEP->takeName(GEP);
13234 Ex = CloneGEP;
13235 } else {
13236 Ex = Builder.CreateExtractElement(Vec, Lane);
13237 }
13238 // If necessary, sign-extend or zero-extend ScalarRoot
13239 // to the larger type.
13240 ExV = Ex;
13241 if (Scalar->getType() != Ex->getType())
13242 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
13243 MinBWs.find(E)->second.second);
13244 if (auto *I = dyn_cast<Instruction>(Ex))
13245 ScalarToEEs[Scalar].try_emplace(
13246 Builder.GetInsertBlock(),
13247 std::make_pair(I, cast<Instruction>(ExV)));
13248 }
13249 // The then-branch of the previous if may produce constants, since
13250 // operand 0 might be a constant.
13251 if (auto *ExI = dyn_cast<Instruction>(Ex)) {
13252 GatherShuffleExtractSeq.insert(ExI);
13253 CSEBlocks.insert(ExI->getParent());
13254 }
13255 return ExV;
13256 }
13257 assert(isa<FixedVectorType>(Scalar->getType()) &&
13258 isa<InsertElementInst>(Scalar) &&
13259 "In-tree scalar of vector type is not insertelement?");
13260 auto *IE = cast<InsertElementInst>(Scalar);
13261 VectorToInsertElement.try_emplace(Vec, IE);
13262 return Vec;
13263 };
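// Added note (not in the original source): for every externally used scalar
// the lambda above emits (or reuses) a single extractelement per basic block,
// optionally reusing an existing extract or a cloned GEP, and sign-/zero-
// extends the result when the tree entry was narrowed in MinBWs.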
13264 // If User == nullptr, the Scalar remains as scalar in vectorized
13265 // instructions or is used as extra arg. Generate ExtractElement instruction
13266 // and update the record for this scalar in ExternallyUsedValues.
13267 if (!User) {
13268 if (!ScalarsWithNullptrUser.insert(Scalar).second)
13269 continue;
13270 assert((ExternallyUsedValues.count(Scalar) ||
13271 any_of(Scalar->users(),
13272 [&](llvm::User *U) {
13273 if (ExternalUsesAsGEPs.contains(U))
13274 return true;
13275 TreeEntry *UseEntry = getTreeEntry(U);
13276 return UseEntry &&
13277 (UseEntry->State == TreeEntry::Vectorize ||
13278 UseEntry->State ==
13279 TreeEntry::StridedVectorize) &&
13280 (E->State == TreeEntry::Vectorize ||
13281 E->State == TreeEntry::StridedVectorize) &&
13282 doesInTreeUserNeedToExtract(
13283 Scalar,
13284 cast<Instruction>(UseEntry->Scalars.front()),
13285 TLI);
13286 })) &&
13287 "Scalar with nullptr User must be registered in "
13288 "ExternallyUsedValues map or remain as scalar in vectorized "
13289 "instructions");
13290 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13291 if (auto *PHI = dyn_cast<PHINode>(VecI))
13292 Builder.SetInsertPoint(PHI->getParent(),
13293 PHI->getParent()->getFirstNonPHIIt());
13294 else
13295 Builder.SetInsertPoint(VecI->getParent(),
13296 std::next(VecI->getIterator()));
13297 } else {
13298 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13299 }
13300 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13301 // Required to update internally referenced instructions.
13302 Scalar->replaceAllUsesWith(NewInst);
13303 ReplacedExternals.emplace_back(Scalar, NewInst);
13304 continue;
13305 }
13306
13307 if (auto *VU = dyn_cast<InsertElementInst>(User)) {
13308 // Skip if the scalar is another vector op or Vec is not an instruction.
13309 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13310 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
13311 if (!UsedInserts.insert(VU).second)
13312 continue;
13313 // Need to use original vector, if the root is truncated.
13314 auto BWIt = MinBWs.find(E);
13315 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13316 auto *ScalarTy = FTy->getElementType();
13317 auto Key = std::make_pair(Vec, ScalarTy);
13318 auto VecIt = VectorCasts.find(Key);
13319 if (VecIt == VectorCasts.end()) {
13320 IRBuilderBase::InsertPointGuard Guard(Builder);
13321 if (auto *IVec = dyn_cast<Instruction>(Vec))
13322 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
13323 Vec = Builder.CreateIntCast(
13324 Vec,
13325 FixedVectorType::get(
13326 ScalarTy,
13327 cast<FixedVectorType>(Vec->getType())->getNumElements()),
13328 BWIt->second.second);
13329 VectorCasts.try_emplace(Key, Vec);
13330 } else {
13331 Vec = VecIt->second;
13332 }
13333 }
13334
13335 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
13336 if (InsertIdx) {
13337 auto *It =
13338 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
13339 // Checks if 2 insertelements are from the same buildvector.
13340 InsertElementInst *VecInsert = Data.InsertElements.front();
13341 return areTwoInsertFromSameBuildVector(
13342 VU, VecInsert,
13343 [](InsertElementInst *II) { return II->getOperand(0); });
13344 });
13345 unsigned Idx = *InsertIdx;
13346 if (It == ShuffledInserts.end()) {
13347 (void)ShuffledInserts.emplace_back();
13348 It = std::next(ShuffledInserts.begin(),
13349 ShuffledInserts.size() - 1);
13350 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13351 if (Mask.empty())
13352 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13353 // Find the insertvector, vectorized in tree, if any.
13354 Value *Base = VU;
13355 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
13356 if (IEBase != User &&
13357 (!IEBase->hasOneUse() ||
13358 getInsertIndex(IEBase).value_or(Idx) == Idx))
13359 break;
13360 // Build the mask for the vectorized insertelement instructions.
13361 if (const TreeEntry *E = getTreeEntry(IEBase)) {
13362 do {
13363 IEBase = cast<InsertElementInst>(Base);
13364 int IEIdx = *getInsertIndex(IEBase);
13365 assert(Mask[Idx] == PoisonMaskElem &&
13366 "InsertElementInstruction used already.");
13367 Mask[IEIdx] = IEIdx;
13368 Base = IEBase->getOperand(0);
13369 } while (E == getTreeEntry(Base));
13370 break;
13371 }
13372 Base = cast<InsertElementInst>(Base)->getOperand(0);
13373 // After vectorization the def-use chain has changed, so we need
13374 // to look through the original insertelement instructions in case they
13375 // got replaced by vector instructions.
13376 auto It = VectorToInsertElement.find(Base);
13377 if (It != VectorToInsertElement.end())
13378 Base = It->second;
13379 }
13380 }
13381 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13382 if (Mask.empty())
13383 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13384 Mask[Idx] = ExternalUse.Lane;
13385 It->InsertElements.push_back(cast<InsertElementInst>(User));
13386 continue;
13387 }
13388 }
13389 }
13390 }
13391
13392 // Generate extracts for out-of-tree users.
13393 // Find the insertion point for the extractelement lane.
13394 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13395 if (PHINode *PH = dyn_cast<PHINode>(User)) {
13396 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13397 if (PH->getIncomingValue(I) == Scalar) {
13398 Instruction *IncomingTerminator =
13399 PH->getIncomingBlock(I)->getTerminator();
13400 if (isa<CatchSwitchInst>(IncomingTerminator)) {
13401 Builder.SetInsertPoint(VecI->getParent(),
13402 std::next(VecI->getIterator()));
13403 } else {
13404 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
13405 }
13406 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13407 PH->setOperand(I, NewInst);
13408 }
13409 }
13410 } else {
13411 Builder.SetInsertPoint(cast<Instruction>(User));
13412 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13413 User->replaceUsesOfWith(Scalar, NewInst);
13414 }
13415 } else {
13416 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13417 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13418 User->replaceUsesOfWith(Scalar, NewInst);
13419 }
13420
13421 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
13422 }
13423
13424 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
13425 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
13426 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
13427 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
13428 for (int I = 0, E = Mask.size(); I < E; ++I) {
13429 if (Mask[I] < VF)
13430 CombinedMask1[I] = Mask[I];
13431 else
13432 CombinedMask2[I] = Mask[I] - VF;
13433 }
13434 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
13435 ShuffleBuilder.add(V1, CombinedMask1);
13436 if (V2)
13437 ShuffleBuilder.add(V2, CombinedMask2);
13438 return ShuffleBuilder.finalize(std::nullopt);
13439 };
13440
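  // ResizeToVF adjusts Vec to the number of lanes expected by Mask. If the
  // mask already references lanes beyond that width, the mask is applied here
  // and the returned flag reports that no further shuffling is required;
  // otherwise only an identity-style resize shuffle is emitted.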
13441 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
13442 bool ForSingleMask) {
13443 unsigned VF = Mask.size();
13444 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
13445 if (VF != VecVF) {
13446 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
13447 Vec = CreateShuffle(Vec, nullptr, Mask);
13448 return std::make_pair(Vec, true);
13449 }
13450 if (!ForSingleMask) {
13451 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
13452 for (unsigned I = 0; I < VF; ++I) {
13453 if (Mask[I] != PoisonMaskElem)
13454 ResizeMask[Mask[I]] = Mask[I];
13455 }
13456 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
13457 }
13458 }
13459
13460 return std::make_pair(Vec, false);
13461 };
13462   // Perform shuffling of the vectorized tree entries for better handling of
13463   // external extracts.
13464 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
13465 // Find the first and the last instruction in the list of insertelements.
13466 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
13467 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
13468 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
13469 Builder.SetInsertPoint(LastInsert);
13470 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
13471 Value *NewInst = performExtractsShuffleAction<Value>(
13472 MutableArrayRef(Vector.data(), Vector.size()),
13473 FirstInsert->getOperand(0),
13474 [](Value *Vec) {
13475 return cast<VectorType>(Vec->getType())
13476 ->getElementCount()
13477 .getKnownMinValue();
13478 },
13479 ResizeToVF,
13480 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
13481 ArrayRef<Value *> Vals) {
13482 assert((Vals.size() == 1 || Vals.size() == 2) &&
13483 "Expected exactly 1 or 2 input values.");
13484 if (Vals.size() == 1) {
13485             // Do not create a shuffle if the mask is a simple identity,
13486             // non-resizing mask.
13487 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
13488 ->getNumElements() ||
13489 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
13490 return CreateShuffle(Vals.front(), nullptr, Mask);
13491 return Vals.front();
13492 }
13493 return CreateShuffle(Vals.front() ? Vals.front()
13494 : FirstInsert->getOperand(0),
13495 Vals.back(), Mask);
13496 });
13497 auto It = ShuffledInserts[I].InsertElements.rbegin();
13498 // Rebuild buildvector chain.
13499 InsertElementInst *II = nullptr;
13500 if (It != ShuffledInserts[I].InsertElements.rend())
13501 II = *It;
13502     SmallVector<Instruction *> Inserts;
13503     while (It != ShuffledInserts[I].InsertElements.rend()) {
13504 assert(II && "Must be an insertelement instruction.");
13505 if (*It == II)
13506 ++It;
13507 else
13508 Inserts.push_back(cast<Instruction>(II));
13509 II = dyn_cast<InsertElementInst>(II->getOperand(0));
13510 }
13511 for (Instruction *II : reverse(Inserts)) {
13512 II->replaceUsesOfWith(II->getOperand(0), NewInst);
13513 if (auto *NewI = dyn_cast<Instruction>(NewInst))
13514 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
13515 II->moveAfter(NewI);
13516 NewInst = II;
13517 }
13518 LastInsert->replaceAllUsesWith(NewInst);
13519 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
13520 IE->replaceUsesOfWith(IE->getOperand(0),
13521 PoisonValue::get(IE->getOperand(0)->getType()));
13522 IE->replaceUsesOfWith(IE->getOperand(1),
13523 PoisonValue::get(IE->getOperand(1)->getType()));
13524 eraseInstruction(IE);
13525 }
13526 CSEBlocks.insert(LastInsert->getParent());
13527 }
13528
13529 SmallVector<Instruction *> RemovedInsts;
13530 // For each vectorized value:
13531 for (auto &TEPtr : VectorizableTree) {
13532 TreeEntry *Entry = TEPtr.get();
13533
13534 // No need to handle users of gathered values.
13535 if (Entry->State == TreeEntry::NeedToGather)
13536 continue;
13537
13538 assert(Entry->VectorizedValue && "Can't find vectorizable value");
13539
13540 // For each lane:
13541 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
13542 Value *Scalar = Entry->Scalars[Lane];
13543
13544 if (Entry->getOpcode() == Instruction::GetElementPtr &&
13545 !isa<GetElementPtrInst>(Scalar))
13546 continue;
13547#ifndef NDEBUG
13548 Type *Ty = Scalar->getType();
13549 if (!Ty->isVoidTy()) {
13550 for (User *U : Scalar->users()) {
13551 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
13552
13553 // It is legal to delete users in the ignorelist.
13554 assert((getTreeEntry(U) ||
13555 (UserIgnoreList && UserIgnoreList->contains(U)) ||
13556 (isa_and_nonnull<Instruction>(U) &&
13557 isDeleted(cast<Instruction>(U)))) &&
13558 "Deleting out-of-tree value");
13559 }
13560 }
13561#endif
13562 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
13563 eraseInstruction(cast<Instruction>(Scalar));
13564 // Retain to-be-deleted instructions for some debug-info
13565 // bookkeeping. NOTE: eraseInstruction only marks the instruction for
13566 // deletion - instructions are not deleted until later.
13567 RemovedInsts.push_back(cast<Instruction>(Scalar));
13568 }
13569 }
13570
13571 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
13572 // new vector instruction.
13573 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
13574 V->mergeDIAssignID(RemovedInsts);
13575
13576 Builder.ClearInsertionPoint();
13577 InstrElementSize.clear();
13578
13579 const TreeEntry &RootTE = *VectorizableTree.front().get();
13580 Value *Vec = RootTE.VectorizedValue;
13581 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
13582 It != MinBWs.end() &&
13583 ReductionBitWidth != It->second.first) {
13584 IRBuilder<>::InsertPointGuard Guard(Builder);
13585 Builder.SetInsertPoint(ReductionRoot->getParent(),
13586 ReductionRoot->getIterator());
13587 Vec = Builder.CreateIntCast(
13588 Vec,
13589 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
13590 cast<VectorType>(Vec->getType())->getElementCount()),
13591 It->second.second);
13592 }
13593 return Vec;
13594}
13595
13596void BoUpSLP::optimizeGatherSequence() {
13597   LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
13598 << " gather sequences instructions.\n");
13599 // LICM InsertElementInst sequences.
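  // E.g. a gather (insertelement/shufflevector) sequence built inside a loop
  // entirely from loop-invariant values can be moved to the preheader so it is
  // materialized only once per loop execution.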
13600 for (Instruction *I : GatherShuffleExtractSeq) {
13601 if (isDeleted(I))
13602 continue;
13603
13604 // Check if this block is inside a loop.
13605 Loop *L = LI->getLoopFor(I->getParent());
13606 if (!L)
13607 continue;
13608
13609 // Check if it has a preheader.
13610 BasicBlock *PreHeader = L->getLoopPreheader();
13611 if (!PreHeader)
13612 continue;
13613
13614 // If the vector or the element that we insert into it are
13615 // instructions that are defined in this basic block then we can't
13616 // hoist this instruction.
13617 if (any_of(I->operands(), [L](Value *V) {
13618 auto *OpI = dyn_cast<Instruction>(V);
13619 return OpI && L->contains(OpI);
13620 }))
13621 continue;
13622
13623 // We can hoist this instruction. Move it to the pre-header.
13624 I->moveBefore(PreHeader->getTerminator());
13625 CSEBlocks.insert(PreHeader);
13626 }
13627
13628 // Make a list of all reachable blocks in our CSE queue.
13629   SmallVector<const DomTreeNode *, 8> CSEWorkList;
13630   CSEWorkList.reserve(CSEBlocks.size());
13631 for (BasicBlock *BB : CSEBlocks)
13632 if (DomTreeNode *N = DT->getNode(BB)) {
13633       assert(DT->isReachableFromEntry(N));
13634       CSEWorkList.push_back(N);
13635 }
13636
13637 // Sort blocks by domination. This ensures we visit a block after all blocks
13638 // dominating it are visited.
13639 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
13640 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
13641 "Different nodes should have different DFS numbers");
13642 return A->getDFSNumIn() < B->getDFSNumIn();
13643 });
13644
13645   // Less defined shuffles can be replaced by more defined copies.
13646   // Of two shuffles, one is less defined if it has the same vector operands
13647   // and its mask indices are the same as in the other one or undefs. E.g.
13648   // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
13649   // poison, <0, 0, 0, 0>.
13650 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
13651 SmallVectorImpl<int> &NewMask) {
13652 if (I1->getType() != I2->getType())
13653 return false;
13654 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
13655 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
13656 if (!SI1 || !SI2)
13657 return I1->isIdenticalTo(I2);
13658 if (SI1->isIdenticalTo(SI2))
13659 return true;
13660 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
13661 if (SI1->getOperand(I) != SI2->getOperand(I))
13662 return false;
13663 // Check if the second instruction is more defined than the first one.
13664 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
13665 ArrayRef<int> SM1 = SI1->getShuffleMask();
13666 // Count trailing undefs in the mask to check the final number of used
13667 // registers.
13668 unsigned LastUndefsCnt = 0;
13669 for (int I = 0, E = NewMask.size(); I < E; ++I) {
13670 if (SM1[I] == PoisonMaskElem)
13671 ++LastUndefsCnt;
13672 else
13673 LastUndefsCnt = 0;
13674 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
13675 NewMask[I] != SM1[I])
13676 return false;
13677 if (NewMask[I] == PoisonMaskElem)
13678 NewMask[I] = SM1[I];
13679 }
13680 // Check if the last undefs actually change the final number of used vector
13681 // registers.
13682 return SM1.size() - LastUndefsCnt > 1 &&
13683 TTI->getNumberOfParts(SI1->getType()) ==
13684              TTI->getNumberOfParts(
13685                  FixedVectorType::get(SI1->getType()->getElementType(),
13686 SM1.size() - LastUndefsCnt));
13687 };
13688 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
13689 // instructions. TODO: We can further optimize this scan if we split the
13690 // instructions into different buckets based on the insert lane.
13691   SmallVector<Instruction *, 16> Visited;
13692   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
13693 assert(*I &&
13694 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
13695 "Worklist not sorted properly!");
13696 BasicBlock *BB = (*I)->getBlock();
13697 // For all instructions in blocks containing gather sequences:
13698 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
13699 if (isDeleted(&In))
13700 continue;
13701 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
13702 !GatherShuffleExtractSeq.contains(&In))
13703 continue;
13704
13705 // Check if we can replace this instruction with any of the
13706 // visited instructions.
13707 bool Replaced = false;
13708 for (Instruction *&V : Visited) {
13709 SmallVector<int> NewMask;
13710 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
13711 DT->dominates(V->getParent(), In.getParent())) {
13712 In.replaceAllUsesWith(V);
13713 eraseInstruction(&In);
13714 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
13715 if (!NewMask.empty())
13716 SI->setShuffleMask(NewMask);
13717 Replaced = true;
13718 break;
13719 }
13720 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
13721 GatherShuffleExtractSeq.contains(V) &&
13722 IsIdenticalOrLessDefined(V, &In, NewMask) &&
13723 DT->dominates(In.getParent(), V->getParent())) {
13724 In.moveAfter(V);
13725 V->replaceAllUsesWith(&In);
13726         eraseInstruction(V);
13727         if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
13728 if (!NewMask.empty())
13729 SI->setShuffleMask(NewMask);
13730 V = &In;
13731 Replaced = true;
13732 break;
13733 }
13734 }
13735 if (!Replaced) {
13736 assert(!is_contained(Visited, &In));
13737 Visited.push_back(&In);
13738 }
13739 }
13740 }
13741 CSEBlocks.clear();
13742 GatherShuffleExtractSeq.clear();
13743}
13744
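// Links the ScheduleData of all schedulable values in VL into a single bundle:
// the members are chained via NextInBundle and each of them points back to the
// first member through FirstInBundle.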
13745BoUpSLP::ScheduleData *
13746BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
13747 ScheduleData *Bundle = nullptr;
13748 ScheduleData *PrevInBundle = nullptr;
13749 for (Value *V : VL) {
13750     if (doesNotNeedToBeScheduled(V))
13751       continue;
13752 ScheduleData *BundleMember = getScheduleData(V);
13753 assert(BundleMember &&
13754 "no ScheduleData for bundle member "
13755 "(maybe not in same basic block)");
13756 assert(BundleMember->isSchedulingEntity() &&
13757 "bundle member already part of other bundle");
13758 if (PrevInBundle) {
13759 PrevInBundle->NextInBundle = BundleMember;
13760 } else {
13761 Bundle = BundleMember;
13762 }
13763
13764 // Group the instructions to a bundle.
13765 BundleMember->FirstInBundle = Bundle;
13766 PrevInBundle = BundleMember;
13767 }
13768 assert(Bundle && "Failed to find schedule bundle");
13769 return Bundle;
13770}
13771
13772// Groups the instructions into a bundle (which is then a single scheduling
13773// entity) and schedules instructions until the bundle gets ready.
13774std::optional<BoUpSLP::ScheduleData *>
13775BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
13776 const InstructionsState &S) {
13777 // No need to schedule PHIs, insertelement, extractelement and extractvalue
13778 // instructions.
13779 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
13780       doesNotNeedToBeScheduled(S.OpValue))
13781     return nullptr;
13782
13783 // Initialize the instruction bundle.
13784 Instruction *OldScheduleEnd = ScheduleEnd;
13785 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
13786
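  // TryScheduleBundleImpl re-computes all dependencies if the scheduling
  // region grew, then pops ready bundles off the ready list until the new
  // bundle itself becomes ready (or no more ready instructions are left).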
13787 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
13788 ScheduleData *Bundle) {
13789 // The scheduling region got new instructions at the lower end (or it is a
13790 // new region for the first bundle). This makes it necessary to
13791 // recalculate all dependencies.
13792 // It is seldom that this needs to be done a second time after adding the
13793 // initial bundle to the region.
13794 if (ScheduleEnd != OldScheduleEnd) {
13795 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
13796 doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
13797 ReSchedule = true;
13798 }
13799 if (Bundle) {
13800 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
13801 << " in block " << BB->getName() << "\n");
13802 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
13803 }
13804
13805 if (ReSchedule) {
13806 resetSchedule();
13807 initialFillReadyList(ReadyInsts);
13808 }
13809
13810     // Now try to schedule the new bundle or (if no bundle) just calculate
13811     // dependencies. As soon as the bundle is "ready" it means that there are no
13812     // cyclic dependencies and we can schedule it. Note that it's important that
13813     // we don't "schedule" the bundle yet (see cancelScheduling).
13814 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
13815 !ReadyInsts.empty()) {
13816 ScheduleData *Picked = ReadyInsts.pop_back_val();
13817 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
13818 "must be ready to schedule");
13819 schedule(Picked, ReadyInsts);
13820 }
13821 };
13822
13823 // Make sure that the scheduling region contains all
13824 // instructions of the bundle.
13825 for (Value *V : VL) {
13826     if (doesNotNeedToBeScheduled(V))
13827       continue;
13828 if (!extendSchedulingRegion(V, S)) {
13829       // If the scheduling region got new instructions at the lower end (or it
13830       // is a new region for the first bundle), all dependencies have to be
13831       // recalculated.
13832       // Otherwise the compiler may crash trying to incorrectly calculate
13833       // dependencies and emit instructions in the wrong order at the actual
13834       // scheduling.
13835 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
13836 return std::nullopt;
13837 }
13838 }
13839
13840 bool ReSchedule = false;
13841 for (Value *V : VL) {
13842     if (doesNotNeedToBeScheduled(V))
13843       continue;
13844 ScheduleData *BundleMember = getScheduleData(V);
13845 assert(BundleMember &&
13846 "no ScheduleData for bundle member (maybe not in same basic block)");
13847
13848     // Make sure we don't leave the pieces of the bundle in the ready list when
13849     // the whole bundle might not be ready.
13850 ReadyInsts.remove(BundleMember);
13851
13852 if (!BundleMember->IsScheduled)
13853 continue;
13854     // A bundle member was scheduled as a single instruction before and now
13855     // needs to be scheduled as part of the bundle. We just get rid of the
13856     // existing schedule.
13857 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
13858 << " was already scheduled\n");
13859 ReSchedule = true;
13860 }
13861
13862 auto *Bundle = buildBundle(VL);
13863 TryScheduleBundleImpl(ReSchedule, Bundle);
13864 if (!Bundle->isReady()) {
13865 cancelScheduling(VL, S.OpValue);
13866 return std::nullopt;
13867 }
13868 return Bundle;
13869}
13870
13871void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
13872 Value *OpValue) {
13873 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
13874       doesNotNeedToBeScheduled(OpValue))
13875     return;
13876
13877 if (doesNotNeedToBeScheduled(OpValue))
13878 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
13879 ScheduleData *Bundle = getScheduleData(OpValue);
13880 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
13881 assert(!Bundle->IsScheduled &&
13882 "Can't cancel bundle which is already scheduled");
13883 assert(Bundle->isSchedulingEntity() &&
13884 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
13885 "tried to unbundle something which is not a bundle");
13886
13887 // Remove the bundle from the ready list.
13888 if (Bundle->isReady())
13889 ReadyInsts.remove(Bundle);
13890
13891 // Un-bundle: make single instructions out of the bundle.
13892 ScheduleData *BundleMember = Bundle;
13893 while (BundleMember) {
13894 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
13895 BundleMember->FirstInBundle = BundleMember;
13896 ScheduleData *Next = BundleMember->NextInBundle;
13897 BundleMember->NextInBundle = nullptr;
13898 BundleMember->TE = nullptr;
13899 if (BundleMember->unscheduledDepsInBundle() == 0) {
13900 ReadyInsts.insert(BundleMember);
13901 }
13902 BundleMember = Next;
13903 }
13904}
13905
13906BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
13907 // Allocate a new ScheduleData for the instruction.
13908 if (ChunkPos >= ChunkSize) {
13909 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
13910 ChunkPos = 0;
13911 }
13912 return &(ScheduleDataChunks.back()[ChunkPos++]);
13913}
13914
13915bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
13916 const InstructionsState &S) {
13917 if (getScheduleData(V, isOneOf(S, V)))
13918 return true;
13919 Instruction *I = dyn_cast<Instruction>(V);
13920 assert(I && "bundle member must be an instruction");
13921 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
13922          !doesNotNeedToBeScheduled(I) &&
13923          "phi nodes/insertelements/extractelements/extractvalues don't need to "
13924 "be scheduled");
13925 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
13926 ScheduleData *ISD = getScheduleData(I);
13927 if (!ISD)
13928 return false;
13929 assert(isInSchedulingRegion(ISD) &&
13930 "ScheduleData not in scheduling region");
13931 ScheduleData *SD = allocateScheduleDataChunks();
13932 SD->Inst = I;
13933 SD->init(SchedulingRegionID, S.OpValue);
13934 ExtraScheduleDataMap[I][S.OpValue] = SD;
13935 return true;
13936 };
13937 if (CheckScheduleForI(I))
13938 return true;
13939 if (!ScheduleStart) {
13940 // It's the first instruction in the new region.
13941 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
13942 ScheduleStart = I;
13943 ScheduleEnd = I->getNextNode();
13944 if (isOneOf(S, I) != I)
13945 CheckScheduleForI(I);
13946 assert(ScheduleEnd && "tried to vectorize a terminator?");
13947 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
13948 return true;
13949 }
13950 // Search up and down at the same time, because we don't know if the new
13951 // instruction is above or below the existing scheduling region.
13952   // Ignore debug info (and other "AssumeLike" intrinsics) so they are not counted
13953 // against the budget. Otherwise debug info could affect codegen.
13954   BasicBlock::reverse_iterator UpIter =
13955       ++ScheduleStart->getIterator().getReverse();
13956 BasicBlock::reverse_iterator UpperEnd = BB->rend();
13957 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
13958 BasicBlock::iterator LowerEnd = BB->end();
13959 auto IsAssumeLikeIntr = [](const Instruction &I) {
13960 if (auto *II = dyn_cast<IntrinsicInst>(&I))
13961 return II->isAssumeLikeIntrinsic();
13962 return false;
13963 };
13964 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
13965 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
13966 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
13967 &*DownIter != I) {
13968 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
13969 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
13970 return false;
13971 }
13972
13973 ++UpIter;
13974 ++DownIter;
13975
13976 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
13977 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
13978 }
13979 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
13980 assert(I->getParent() == ScheduleStart->getParent() &&
13981 "Instruction is in wrong basic block.");
13982 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
13983 ScheduleStart = I;
13984 if (isOneOf(S, I) != I)
13985 CheckScheduleForI(I);
13986 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
13987 << "\n");
13988 return true;
13989 }
13990 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
13991 "Expected to reach top of the basic block or instruction down the "
13992 "lower end.");
13993 assert(I->getParent() == ScheduleEnd->getParent() &&
13994 "Instruction is in wrong basic block.");
13995 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
13996 nullptr);
13997 ScheduleEnd = I->getNextNode();
13998 if (isOneOf(S, I) != I)
13999 CheckScheduleForI(I);
14000 assert(ScheduleEnd && "tried to vectorize a terminator?");
14001 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14002 return true;
14003}
14004
14005void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14006 Instruction *ToI,
14007 ScheduleData *PrevLoadStore,
14008 ScheduleData *NextLoadStore) {
14009 ScheduleData *CurrentLoadStore = PrevLoadStore;
14010 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14011 // No need to allocate data for non-schedulable instructions.
14012     if (doesNotNeedToBeScheduled(I))
14013       continue;
14014 ScheduleData *SD = ScheduleDataMap.lookup(I);
14015 if (!SD) {
14016 SD = allocateScheduleDataChunks();
14017 ScheduleDataMap[I] = SD;
14018 SD->Inst = I;
14019 }
14020 assert(!isInSchedulingRegion(SD) &&
14021 "new ScheduleData already in scheduling region");
14022 SD->init(SchedulingRegionID, I);
14023
14024 if (I->mayReadOrWriteMemory() &&
14025 (!isa<IntrinsicInst>(I) ||
14026 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14027 cast<IntrinsicInst>(I)->getIntrinsicID() !=
14028 Intrinsic::pseudoprobe))) {
14029 // Update the linked list of memory accessing instructions.
14030 if (CurrentLoadStore) {
14031 CurrentLoadStore->NextLoadStore = SD;
14032 } else {
14033 FirstLoadStoreInRegion = SD;
14034 }
14035 CurrentLoadStore = SD;
14036 }
14037
14038 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14039 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14040 RegionHasStackSave = true;
14041 }
14042 if (NextLoadStore) {
14043 if (CurrentLoadStore)
14044 CurrentLoadStore->NextLoadStore = NextLoadStore;
14045 } else {
14046 LastLoadStoreInRegion = CurrentLoadStore;
14047 }
14048}
14049
14050void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14051 bool InsertInReadyList,
14052 BoUpSLP *SLP) {
14053 assert(SD->isSchedulingEntity());
14054
14055   SmallVector<ScheduleData *, 10> WorkList;
14056   WorkList.push_back(SD);
14057
14058 while (!WorkList.empty()) {
14059 ScheduleData *SD = WorkList.pop_back_val();
14060 for (ScheduleData *BundleMember = SD; BundleMember;
14061 BundleMember = BundleMember->NextInBundle) {
14062 assert(isInSchedulingRegion(BundleMember));
14063 if (BundleMember->hasValidDependencies())
14064 continue;
14065
14066 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14067 << "\n");
14068 BundleMember->Dependencies = 0;
14069 BundleMember->resetUnscheduledDeps();
14070
14071 // Handle def-use chain dependencies.
14072 if (BundleMember->OpValue != BundleMember->Inst) {
14073 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14074 BundleMember->Dependencies++;
14075 ScheduleData *DestBundle = UseSD->FirstInBundle;
14076 if (!DestBundle->IsScheduled)
14077 BundleMember->incrementUnscheduledDeps(1);
14078 if (!DestBundle->hasValidDependencies())
14079 WorkList.push_back(DestBundle);
14080 }
14081 } else {
14082 for (User *U : BundleMember->Inst->users()) {
14083 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14084 BundleMember->Dependencies++;
14085 ScheduleData *DestBundle = UseSD->FirstInBundle;
14086 if (!DestBundle->IsScheduled)
14087 BundleMember->incrementUnscheduledDeps(1);
14088 if (!DestBundle->hasValidDependencies())
14089 WorkList.push_back(DestBundle);
14090 }
14091 }
14092 }
14093
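      // MakeControlDependent registers BundleMember in the control-dependence
      // list of I's ScheduleData and bumps BundleMember's dependency counters,
      // queueing I's bundle if its own dependencies are not computed yet.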
14094 auto MakeControlDependent = [&](Instruction *I) {
14095 auto *DepDest = getScheduleData(I);
14096 assert(DepDest && "must be in schedule window");
14097 DepDest->ControlDependencies.push_back(BundleMember);
14098 BundleMember->Dependencies++;
14099 ScheduleData *DestBundle = DepDest->FirstInBundle;
14100 if (!DestBundle->IsScheduled)
14101 BundleMember->incrementUnscheduledDeps(1);
14102 if (!DestBundle->hasValidDependencies())
14103 WorkList.push_back(DestBundle);
14104 };
14105
14106           // Any instruction which isn't safe to speculate at the beginning of
14107           // the block is control dependent on any early exit or non-willreturn
14108           // call which precedes it.
14109 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
14110 for (Instruction *I = BundleMember->Inst->getNextNode();
14111 I != ScheduleEnd; I = I->getNextNode()) {
14112 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
14113 continue;
14114
14115 // Add the dependency
14116 MakeControlDependent(I);
14117
14119 // Everything past here must be control dependent on I.
14120 break;
14121 }
14122 }
14123
14124 if (RegionHasStackSave) {
14125         // If we have an inalloca alloca instruction, it needs to be scheduled
14126         // after any preceding stacksave. We also need to prevent any alloca
14127         // from reordering above a preceding stackrestore.
14128 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14129 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14130 for (Instruction *I = BundleMember->Inst->getNextNode();
14131 I != ScheduleEnd; I = I->getNextNode()) {
14132 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14133 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14134               // Any allocas past here must be control dependent on I, and I
14135               // must be memory dependent on BundleMember->Inst.
14136 break;
14137
14138 if (!isa<AllocaInst>(I))
14139 continue;
14140
14141 // Add the dependency
14142 MakeControlDependent(I);
14143 }
14144 }
14145
14146         // In addition to the cases handled just above, we need to prevent
14147         // allocas and loads/stores from moving below a stacksave or a
14148         // stackrestore. Avoiding moving allocas below a stackrestore is
14149         // currently thought to be conservative. Moving loads/stores below a
14150         // stackrestore can lead to incorrect code.
14151 if (isa<AllocaInst>(BundleMember->Inst) ||
14152 BundleMember->Inst->mayReadOrWriteMemory()) {
14153 for (Instruction *I = BundleMember->Inst->getNextNode();
14154 I != ScheduleEnd; I = I->getNextNode()) {
14155 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14156 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14157 continue;
14158
14159 // Add the dependency
14160 MakeControlDependent(I);
14161 break;
14162 }
14163 }
14164 }
14165
14166 // Handle the memory dependencies (if any).
14167 ScheduleData *DepDest = BundleMember->NextLoadStore;
14168 if (!DepDest)
14169 continue;
14170 Instruction *SrcInst = BundleMember->Inst;
14171 assert(SrcInst->mayReadOrWriteMemory() &&
14172 "NextLoadStore list for non memory effecting bundle?");
14173 MemoryLocation SrcLoc = getLocation(SrcInst);
14174 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14175 unsigned NumAliased = 0;
14176 unsigned DistToSrc = 1;
14177
14178 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14179 assert(isInSchedulingRegion(DepDest));
14180
14181 // We have two limits to reduce the complexity:
14182 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14183 // SLP->isAliased (which is the expensive part in this loop).
14184 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14185 // the whole loop (even if the loop is fast, it's quadratic).
14186 // It's important for the loop break condition (see below) to
14187 // check this limit even between two read-only instructions.
14188 if (DistToSrc >= MaxMemDepDistance ||
14189 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14190 (NumAliased >= AliasedCheckLimit ||
14191 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14192
14193 // We increment the counter only if the locations are aliased
14194 // (instead of counting all alias checks). This gives a better
14195 // balance between reduced runtime and accurate dependencies.
14196 NumAliased++;
14197
14198 DepDest->MemoryDependencies.push_back(BundleMember);
14199 BundleMember->Dependencies++;
14200 ScheduleData *DestBundle = DepDest->FirstInBundle;
14201 if (!DestBundle->IsScheduled) {
14202 BundleMember->incrementUnscheduledDeps(1);
14203 }
14204 if (!DestBundle->hasValidDependencies()) {
14205 WorkList.push_back(DestBundle);
14206 }
14207 }
14208
14209 // Example, explaining the loop break condition: Let's assume our
14210 // starting instruction is i0 and MaxMemDepDistance = 3.
14211 //
14212 // +--------v--v--v
14213 // i0,i1,i2,i3,i4,i5,i6,i7,i8
14214 // +--------^--^--^
14215 //
14216 // MaxMemDepDistance let us stop alias-checking at i3 and we add
14217 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14218 // Previously we already added dependencies from i3 to i6,i7,i8
14219 // (because of MaxMemDepDistance). As we added a dependency from
14220 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14221 // and we can abort this loop at i6.
14222 if (DistToSrc >= 2 * MaxMemDepDistance)
14223 break;
14224 DistToSrc++;
14225 }
14226 }
14227 if (InsertInReadyList && SD->isReady()) {
14228 ReadyInsts.insert(SD);
14229 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
14230 << "\n");
14231 }
14232 }
14233}
14234
14235void BoUpSLP::BlockScheduling::resetSchedule() {
14236 assert(ScheduleStart &&
14237 "tried to reset schedule on block which has not been scheduled");
14238 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14239 doForAllOpcodes(I, [&](ScheduleData *SD) {
14240 assert(isInSchedulingRegion(SD) &&
14241 "ScheduleData not in scheduling region");
14242 SD->IsScheduled = false;
14243 SD->resetUnscheduledDeps();
14244 });
14245 }
14246 ReadyInsts.clear();
14247}
14248
14249void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14250 if (!BS->ScheduleStart)
14251 return;
14252
14253 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14254
14255 // A key point - if we got here, pre-scheduling was able to find a valid
14256 // scheduling of the sub-graph of the scheduling window which consists
14257 // of all vector bundles and their transitive users. As such, we do not
14258 // need to reschedule anything *outside of* that subgraph.
14259
14260 BS->resetSchedule();
14261
14262 // For the real scheduling we use a more sophisticated ready-list: it is
14263 // sorted by the original instruction location. This lets the final schedule
14264 // be as close as possible to the original instruction order.
14265 // WARNING: If changing this order causes a correctness issue, that means
14266 // there is some missing dependence edge in the schedule data graph.
14267 struct ScheduleDataCompare {
14268 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14269 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14270 }
14271 };
14272 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14273
14274 // Ensure that all dependency data is updated (for nodes in the sub-graph)
14275 // and fill the ready-list with initial instructions.
14276 int Idx = 0;
14277 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14278 I = I->getNextNode()) {
14279 BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
14280 TreeEntry *SDTE = getTreeEntry(SD->Inst);
14281 (void)SDTE;
14282       assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14283               SD->isPartOfBundle() ==
14284 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14285 "scheduler and vectorizer bundle mismatch");
14286 SD->FirstInBundle->SchedulingPriority = Idx++;
14287
14288 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14289 BS->calculateDependencies(SD, false, this);
14290 });
14291 }
14292 BS->initialFillReadyList(ReadyInsts);
14293
14294 Instruction *LastScheduledInst = BS->ScheduleEnd;
14295
14296 // Do the "real" scheduling.
14297 while (!ReadyInsts.empty()) {
14298 ScheduleData *Picked = *ReadyInsts.begin();
14299 ReadyInsts.erase(ReadyInsts.begin());
14300
14301 // Move the scheduled instruction(s) to their dedicated places, if not
14302 // there yet.
14303 for (ScheduleData *BundleMember = Picked; BundleMember;
14304 BundleMember = BundleMember->NextInBundle) {
14305 Instruction *PickedInst = BundleMember->Inst;
14306 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
14307 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
14308 LastScheduledInst = PickedInst;
14309 }
14310
14311 BS->schedule(Picked, ReadyInsts);
14312 }
14313
14314 // Check that we didn't break any of our invariants.
14315#ifdef EXPENSIVE_CHECKS
14316 BS->verify();
14317#endif
14318
14319#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
14320 // Check that all schedulable entities got scheduled
14321 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
14322 BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
14323 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
14324 assert(SD->IsScheduled && "must be scheduled at this point");
14325 }
14326 });
14327 }
14328#endif
14329
14330 // Avoid duplicate scheduling of the block.
14331 BS->ScheduleStart = nullptr;
14332}
14333
14334unsigned BoUpSLP::getVectorElementSize(Value *V) {
14335   // If V is a store, just return the width of the stored value (or value
14336 // truncated just before storing) without traversing the expression tree.
14337 // This is the common case.
14338 if (auto *Store = dyn_cast<StoreInst>(V))
14339 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
14340
14341 if (auto *IEI = dyn_cast<InsertElementInst>(V))
14342 return getVectorElementSize(IEI->getOperand(1));
14343
14344 auto E = InstrElementSize.find(V);
14345 if (E != InstrElementSize.end())
14346 return E->second;
14347
14348 // If V is not a store, we can traverse the expression tree to find loads
14349 // that feed it. The type of the loaded value may indicate a more suitable
14350 // width than V's type. We want to base the vector element size on the width
14351 // of memory operations where possible.
14352   SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
14353   SmallPtrSet<Instruction *, 16> Visited;
14354   if (auto *I = dyn_cast<Instruction>(V)) {
14355 Worklist.emplace_back(I, I->getParent(), 0);
14356 Visited.insert(I);
14357 }
14358
14359 // Traverse the expression tree in bottom-up order looking for loads. If we
14360 // encounter an instruction we don't yet handle, we give up.
14361 auto Width = 0u;
14362 Value *FirstNonBool = nullptr;
14363 while (!Worklist.empty()) {
14364 auto [I, Parent, Level] = Worklist.pop_back_val();
14365
14366 // We should only be looking at scalar instructions here. If the current
14367 // instruction has a vector type, skip.
14368 auto *Ty = I->getType();
14369 if (isa<VectorType>(Ty))
14370 continue;
14371 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
14372 FirstNonBool = I;
14373 if (Level > RecursionMaxDepth)
14374 continue;
14375
14376     // If the current instruction is a load, update Width to reflect the
14377     // width of the loaded value.
14378 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
14379 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
14380
14381 // Otherwise, we need to visit the operands of the instruction. We only
14382 // handle the interesting cases from buildTree here. If an operand is an
14383 // instruction we haven't yet visited and from the same basic block as the
14384 // user or the use is a PHI node, we add it to the worklist.
14385     else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
14386                  BinaryOperator, UnaryOperator>(I)) {
14387       for (Use &U : I->operands()) {
14388 if (auto *J = dyn_cast<Instruction>(U.get()))
14389 if (Visited.insert(J).second &&
14390 (isa<PHINode>(I) || J->getParent() == Parent)) {
14391 Worklist.emplace_back(J, J->getParent(), Level + 1);
14392 continue;
14393 }
14394 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
14395 FirstNonBool = U.get();
14396 }
14397 } else {
14398 break;
14399 }
14400 }
14401
14402 // If we didn't encounter a memory access in the expression tree, or if we
14403 // gave up for some reason, just return the width of V. Otherwise, return the
14404 // maximum width we found.
14405 if (!Width) {
14406 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
14407 V = FirstNonBool;
14408 Width = DL->getTypeSizeInBits(V->getType());
14409 }
14410
14411 for (Instruction *I : Visited)
14412 InstrElementSize[I] = Width;
14413
14414 return Width;
14415}
14416
14417bool BoUpSLP::collectValuesToDemote(
14418 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
14419     SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
14420     unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
14421 bool IsTruncRoot) const {
14422 // We can always demote constants.
14423 if (all_of(E.Scalars, IsaPred<Constant>))
14424 return true;
14425
14426 unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
14427 if (OrigBitWidth == BitWidth) {
14428 MaxDepthLevel = 1;
14429 return true;
14430 }
14431
14432 // If the value is not a vectorized instruction in the expression and not used
14433 // by the insertelement instruction and not used in multiple vector nodes, it
14434 // cannot be demoted.
14435 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
14436 if (MultiNodeScalars.contains(V))
14437 return false;
14438 if (OrigBitWidth > BitWidth) {
14439 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14440 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14441 return true;
14442 }
14443 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
14444 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14445 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
14446 ++BitWidth1;
14447 if (auto *I = dyn_cast<Instruction>(V)) {
14448 APInt Mask = DB->getDemandedBits(I);
14449 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
14450 BitWidth1 = std::min(BitWidth1, BitWidth2);
14451 }
14452 BitWidth = std::max(BitWidth, BitWidth1);
14453 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
14454 };
14455 using namespace std::placeholders;
14456 auto FinalAnalysis = [&]() {
14457 if (!IsProfitableToDemote)
14458 return false;
14459 bool Res = all_of(
14460 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
14461 // Gather demoted constant operands.
14462 if (Res && E.State == TreeEntry::NeedToGather &&
14463 all_of(E.Scalars, IsaPred<Constant>))
14464 ToDemote.push_back(E.Idx);
14465 return Res;
14466 };
14467 // TODO: improve handling of gathered values and others.
14468 if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
14469 any_of(E.Scalars, [&](Value *V) {
14470 return all_of(V->users(), [&](User *U) {
14471 return isa<InsertElementInst>(U) && !getTreeEntry(U);
14472 });
14473 }))
14474 return FinalAnalysis();
14475
14476 if (any_of(E.Scalars, [&](Value *V) {
14477 return !all_of(V->users(), [=](User *U) {
14478 return getTreeEntry(U) ||
14479 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14480 (U->getType()->isSized() && !U->getType()->isScalableTy() &&
14481 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
14482 }) && !IsPotentiallyTruncated(V, BitWidth);
14483 }))
14484 return false;
14485
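  // ProcessOperands recurses into the given operand entries. If any operand
  // cannot be demoted the attempt is abandoned (unless demotion is still
  // considered profitable, in which case the final analysis decides), and
  // MaxDepthLevel tracks the deepest chain that was examined.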
14486 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
14487 bool &NeedToExit) {
14488 NeedToExit = false;
14489 unsigned InitLevel = MaxDepthLevel;
14490 for (const TreeEntry *Op : Operands) {
14491 unsigned Level = InitLevel;
14492 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
14493 ToDemote, Visited, Level, IsProfitableToDemote,
14494 IsTruncRoot)) {
14495 if (!IsProfitableToDemote)
14496 return false;
14497 NeedToExit = true;
14498 if (!FinalAnalysis())
14499 return false;
14500 continue;
14501 }
14502 MaxDepthLevel = std::max(MaxDepthLevel, Level);
14503 }
14504 return true;
14505 };
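  // AttemptCheckBitwidth probes candidate bit widths, doubling from the
  // current BitWidth up to the original width, until Checker accepts one. If
  // none is accepted it falls back to the smallest width for which the final
  // analysis succeeded, or restores the original width and fails.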
14506 auto AttemptCheckBitwidth =
14507 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
14508 // Try all bitwidth < OrigBitWidth.
14509 NeedToExit = false;
14510 unsigned BestFailBitwidth = 0;
14511 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
14512 if (Checker(BitWidth, OrigBitWidth))
14513 return true;
14514 if (BestFailBitwidth == 0 && FinalAnalysis())
14515 BestFailBitwidth = BitWidth;
14516 }
14517 if (BitWidth >= OrigBitWidth) {
14518 if (BestFailBitwidth == 0) {
14519 BitWidth = OrigBitWidth;
14520 return false;
14521 }
14522 MaxDepthLevel = 1;
14523 BitWidth = BestFailBitwidth;
14524 NeedToExit = true;
14525 return true;
14526 }
14527 return false;
14528 };
14529 auto TryProcessInstruction =
14530 [&](unsigned &BitWidth,
14531           ArrayRef<const TreeEntry *> Operands = std::nullopt,
14532           function_ref<bool(unsigned, unsigned)> Checker = {}) {
14533 if (Operands.empty()) {
14534 if (!IsTruncRoot)
14535 MaxDepthLevel = 1;
14536 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
14537 std::ref(BitWidth)));
14538 } else {
14539 // Several vectorized uses? Check if we can truncate it, otherwise -
14540 // exit.
14541 if (E.UserTreeIndices.size() > 1 &&
14542 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
14543 std::ref(BitWidth))))
14544 return false;
14545 bool NeedToExit = false;
14546 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
14547 return false;
14548 if (NeedToExit)
14549 return true;
14550 if (!ProcessOperands(Operands, NeedToExit))
14551 return false;
14552 if (NeedToExit)
14553 return true;
14554 }
14555
14556 ++MaxDepthLevel;
14557 // Record the entry that we can demote.
14558 ToDemote.push_back(E.Idx);
14559 return IsProfitableToDemote;
14560 };
14561 switch (E.getOpcode()) {
14562
14563 // We can always demote truncations and extensions. Since truncations can
14564 // seed additional demotion, we save the truncated value.
14565 case Instruction::Trunc:
14566 if (IsProfitableToDemoteRoot)
14567 IsProfitableToDemote = true;
14568 return TryProcessInstruction(BitWidth);
14569 case Instruction::ZExt:
14570 case Instruction::SExt:
14571 IsProfitableToDemote = true;
14572 return TryProcessInstruction(BitWidth);
14573
14574 // We can demote certain binary operations if we can demote both of their
14575 // operands.
14576 case Instruction::Add:
14577 case Instruction::Sub:
14578 case Instruction::Mul:
14579 case Instruction::And:
14580 case Instruction::Or:
14581 case Instruction::Xor: {
14582 return TryProcessInstruction(
14583 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
14584 }
14585 case Instruction::Shl: {
14586     // If we are truncating the result of this SHL, and if it's a shift of an
14587     // in-range amount, we can always perform a SHL in a smaller type.
14588 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
14589 return all_of(E.Scalars, [&](Value *V) {
14590 auto *I = cast<Instruction>(V);
14591 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14592 return AmtKnownBits.getMaxValue().ult(BitWidth);
14593 });
14594 };
14595 return TryProcessInstruction(
14596 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
14597 }
14598 case Instruction::LShr: {
14599 // If this is a truncate of a logical shr, we can truncate it to a smaller
14600 // lshr iff we know that the bits we would otherwise be shifting in are
14601 // already zeros.
14602 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14603 return all_of(E.Scalars, [&](Value *V) {
14604 auto *I = cast<Instruction>(V);
14605 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14606 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14607 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
14608 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
14609 SimplifyQuery(*DL));
14610 });
14611 };
14612 return TryProcessInstruction(
14613 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
14614 LShrChecker);
14615 }
14616 case Instruction::AShr: {
14617 // If this is a truncate of an arithmetic shr, we can truncate it to a
14618 // smaller ashr iff we know that all the bits from the sign bit of the
14619 // original type and the sign bit of the truncate type are similar.
14620 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14621 return all_of(E.Scalars, [&](Value *V) {
14622 auto *I = cast<Instruction>(V);
14623 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14624 unsigned ShiftedBits = OrigBitWidth - BitWidth;
14625 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
14626 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
14627 nullptr, DT);
14628 });
14629 };
14630 return TryProcessInstruction(
14631 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
14632 AShrChecker);
14633 }
14634 case Instruction::UDiv:
14635 case Instruction::URem: {
14636 // UDiv and URem can be truncated if all the truncated bits are zero.
14637 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14638 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14639 return all_of(E.Scalars, [&](Value *V) {
14640 auto *I = cast<Instruction>(V);
14641 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14642 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
14643 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
14644 });
14645 };
14646 return TryProcessInstruction(
14647 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
14648 }
14649
14650 // We can demote selects if we can demote their true and false values.
14651 case Instruction::Select: {
14652 return TryProcessInstruction(
14653 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
14654 }
14655
14656 // We can demote phis if we can demote all their incoming operands. Note that
14657 // we don't need to worry about cycles since we ensure single use above.
14658 case Instruction::PHI: {
14659 const unsigned NumOps = E.getNumOperands();
14660     SmallVector<const TreeEntry *> Ops(NumOps);
14661     transform(seq<unsigned>(0, NumOps), Ops.begin(),
14662 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
14663
14664 return TryProcessInstruction(BitWidth, Ops);
14665 }
14666
14667 case Instruction::Call: {
14668 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
14669 if (!IC)
14670 break;
14671     Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
14672     if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
14673 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
14674 break;
14675 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
14676 function_ref<bool(unsigned, unsigned)> CallChecker;
14677 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14678 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14679 return all_of(E.Scalars, [&](Value *V) {
14680 auto *I = cast<Instruction>(V);
14681 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
14682 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14683 return MaskedValueIsZero(I->getOperand(0), Mask,
14684 SimplifyQuery(*DL)) &&
14685 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
14686 }
14687 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
14688 "Expected min/max intrinsics only.");
14689 unsigned SignBits = OrigBitWidth - BitWidth;
14690 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
14691 return SignBits <= ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
14692 nullptr, DT) &&
14693 (!isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL)) ||
14694 MaskedValueIsZero(I->getOperand(0), Mask,
14695 SimplifyQuery(*DL))) &&
14696 SignBits <= ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
14697 nullptr, DT) &&
14698 (!isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL)) ||
14699 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
14700 });
14701 };
14702 if (ID != Intrinsic::abs) {
14703 Operands.push_back(getOperandEntry(&E, 1));
14704 CallChecker = CompChecker;
14705 }
14706 InstructionCost BestCost =
14707 std::numeric_limits<InstructionCost::CostType>::max();
14708 unsigned BestBitWidth = BitWidth;
14709 unsigned VF = E.Scalars.size();
14710 // Choose the best bitwidth based on cost estimations.
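    // The checker below deliberately returns false for every candidate width
    // so that AttemptCheckBitwidth walks the whole range; the width with the
    // cheapest vector call cost seen so far is remembered in BestBitWidth.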
14711 auto Checker = [&](unsigned BitWidth, unsigned) {
14712 unsigned MinBW = PowerOf2Ceil(BitWidth);
14713 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
14714 auto VecCallCosts = getVectorCallCosts(
14715 IC,
14716 FixedVectorType::get(IntegerType::get(IC->getContext(), MinBW), VF),
14717 TTI, TLI, ArgTys);
14718 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
14719 if (Cost < BestCost) {
14720 BestCost = Cost;
14721 BestBitWidth = BitWidth;
14722 }
14723 return false;
14724 };
14725 [[maybe_unused]] bool NeedToExit;
14726 (void)AttemptCheckBitwidth(Checker, NeedToExit);
14727 BitWidth = BestBitWidth;
14728 return TryProcessInstruction(BitWidth, Operands, CallChecker);
14729 }
14730
14731 // Otherwise, conservatively give up.
14732 default:
14733 break;
14734 }
14735 MaxDepthLevel = 1;
14736 return FinalAnalysis();
14737}
14738
14739static RecurKind getRdxKind(Value *V);
14740
14741void BoUpSLP::computeMinimumValueSizes() {
14742   // We only attempt to truncate integer expressions.
14743 bool IsStoreOrInsertElt =
14744 VectorizableTree.front()->getOpcode() == Instruction::Store ||
14745 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
14746 if ((IsStoreOrInsertElt || UserIgnoreList) &&
14747 ExtraBitWidthNodes.size() <= 1 &&
14748 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
14749 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
14750 return;
14751
14752 unsigned NodeIdx = 0;
14753 if (IsStoreOrInsertElt &&
14754 VectorizableTree.front()->State != TreeEntry::NeedToGather)
14755 NodeIdx = 1;
14756
14757 // Ensure the roots of the vectorizable tree don't form a cycle.
14758 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
14759 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
14760 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
14761 [NodeIdx](const EdgeInfo &EI) {
14762 return EI.UserTE->Idx >
14763 static_cast<int>(NodeIdx);
14764 })))
14765 return;
14766
14767   // If the first value node for store/insertelement is sext/zext/trunc, skip
14768   // it and resize to the final type.
14769 bool IsTruncRoot = false;
14770 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
14771 SmallVector<unsigned> RootDemotes;
14772 if (NodeIdx != 0 &&
14773 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
14774 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
14775 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
14776 IsTruncRoot = true;
14777 RootDemotes.push_back(NodeIdx);
14778 IsProfitableToDemoteRoot = true;
14779 ++NodeIdx;
14780 }
14781
14782   // The reduction was already analyzed and found not profitable - exit.
14783 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
14784 return;
14785
14786 SmallVector<unsigned> ToDemote;
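  // ComputeMaxBitWidth returns the narrowest power-of-two bit width that still
  // preserves the values of tree entry E (or 0 if no narrowing is possible)
  // and fills ToDemote with the indices of the entries that can be demoted.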
14787 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
14788 bool IsProfitableToDemoteRoot, unsigned Opcode,
14789 unsigned Limit, bool IsTruncRoot,
14790 bool IsSignedCmp) {
14791 ToDemote.clear();
14792 unsigned VF = E.getVectorFactor();
14793 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
14794 if (!TreeRootIT || !Opcode)
14795 return 0u;
14796
14797 if (any_of(E.Scalars,
14798 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
14799 return 0u;
14800
14801 unsigned NumParts =
14802 TTI->getNumberOfParts(FixedVectorType::get(TreeRootIT, VF));
14803
14804 // The maximum bit width required to represent all the values that can be
14805 // demoted without loss of precision. It would be safe to truncate the roots
14806 // of the expression to this width.
14807 unsigned MaxBitWidth = 1u;
14808
14809 // True if the roots can be zero-extended back to their original type,
14810 // rather than sign-extended. We know that if the leading bits are not
14811 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
14812 // True.
14813 // Determine if the sign bit of all the roots is known to be zero. If not,
14814 // IsKnownPositive is set to False.
14815 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
14816 KnownBits Known = computeKnownBits(R, *DL);
14817 return Known.isNonNegative();
14818 });
14819
14820 // We first check if all the bits of the roots are demanded. If they're not,
14821 // we can truncate the roots to this narrower type.
14822 for (Value *Root : E.Scalars) {
14823 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
14824 TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
14825 unsigned BitWidth1 = NumTypeBits - NumSignBits;
14826 // If we can't prove that the sign bit is zero, we must add one to the
14827 // maximum bit width to account for the unknown sign bit. This preserves
14828 // the existing sign bit so we can safely sign-extend the root back to the
14829 // original type. Otherwise, if we know the sign bit is zero, we will
14830 // zero-extend the root instead.
14831 //
14832 // FIXME: This is somewhat suboptimal, as there will be cases where adding
14833 // one to the maximum bit width will yield a larger-than-necessary
14834 // type. In general, we need to add an extra bit only if we can't
14835 // prove that the upper bit of the original type is equal to the
14836 // upper bit of the proposed smaller type. If these two bits are
14837 // the same (either zero or one) we know that sign-extending from
14838 // the smaller type will result in the same value. Here, since we
14839 // can't yet prove this, we are just making the proposed smaller
14840 // type larger to ensure correctness.
14841 if (!IsKnownPositive)
14842 ++BitWidth1;
14843
14844 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
14845 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
14846 MaxBitWidth =
14847 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
14848 }
14849
14850 if (MaxBitWidth < 8 && MaxBitWidth > 1)
14851 MaxBitWidth = 8;
14852
14853     // If the original type is large but the reduced type does not improve
14854     // register usage, ignore it.
14855 if (NumParts > 1 &&
14856 NumParts ==
14857             TTI->getNumberOfParts(FixedVectorType::get(
14858                 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
14859 return 0u;
14860
14861 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
14862 Opcode == Instruction::SExt ||
14863 Opcode == Instruction::ZExt || NumParts > 1;
14864 // Conservatively determine if we can actually truncate the roots of the
14865 // expression. Collect the values that can be demoted in ToDemote and
14866 // additional roots that require investigating in Roots.
14867     DenseSet<const TreeEntry *> Visited;
14868     unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
14869 bool NeedToDemote = IsProfitableToDemote;
14870
14871 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
14872 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
14873 IsTruncRoot) ||
14874 (MaxDepthLevel <= Limit &&
14875 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
14876 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
14877 DL->getTypeSizeInBits(TreeRootIT) /
14878 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
14879 ->getOperand(0)
14880 ->getType()) >
14881 2)))))
14882 return 0u;
14883 // Round MaxBitWidth up to the next power-of-two.
14884 MaxBitWidth = bit_ceil(MaxBitWidth);
14885
14886 return MaxBitWidth;
14887 };
14888
14889 // If we can truncate the root, we must collect additional values that might
14890 // be demoted as a result. That is, those seeded by truncations we will
14891 // modify.
14892 // Add reduction ops sizes, if any.
14893 if (UserIgnoreList &&
14894 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
14895 for (Value *V : *UserIgnoreList) {
14896 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
14897 auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
14898 unsigned BitWidth1 = NumTypeBits - NumSignBits;
14900 ++BitWidth1;
14901 unsigned BitWidth2 = BitWidth1;
14903 auto Mask = DB->getDemandedBits(cast<Instruction>(V));
14904 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
14905 }
14906 ReductionBitWidth =
14907 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
14908 }
14909 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
14910 ReductionBitWidth = 8;
14911
14912 ReductionBitWidth = bit_ceil(ReductionBitWidth);
14913 }
14914 bool IsTopRoot = NodeIdx == 0;
14915 while (NodeIdx < VectorizableTree.size() &&
14916 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
14917 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
14918 RootDemotes.push_back(NodeIdx);
14919 ++NodeIdx;
14920 IsTruncRoot = true;
14921 }
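// For example (hypothetical IR), if the root node is a vectorizable group of
// truncs such as
//   %t0 = trunc i64 %a to i32
//   %t1 = trunc i64 %b to i32
// those nodes are queued in RootDemotes and IsTruncRoot is set, so the walk
// below starts from the first non-trunc node.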
14922 bool IsSignedCmp = false;
14923 while (NodeIdx < VectorizableTree.size()) {
14924 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
14925 unsigned Limit = 2;
14926 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
14927 if (IsTopRoot &&
14928 ReductionBitWidth ==
14929 DL->getTypeSizeInBits(
14930 VectorizableTree.front()->Scalars.front()->getType()))
14931 Limit = 3;
14932 unsigned MaxBitWidth = ComputeMaxBitWidth(
14933 *VectorizableTree[NodeIdx].get(), IsTopRoot, IsProfitableToDemoteRoot,
14934 Opcode, Limit, IsTruncRoot, IsSignedCmp);
14935 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
14936 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
14937 ReductionBitWidth = bit_ceil(MaxBitWidth);
14938 else if (MaxBitWidth == 0)
14939 ReductionBitWidth = 0;
14940 }
14941
14942 for (unsigned Idx : RootDemotes) {
14943 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
14944 uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
14945 if (OrigBitWidth > MaxBitWidth) {
14946 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
14947 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
14948 }
14949 return false;
14950 }))
14951 ToDemote.push_back(Idx);
14952 }
14953 RootDemotes.clear();
14954 IsTopRoot = false;
14955 IsProfitableToDemoteRoot = true;
14956
14957 if (ExtraBitWidthNodes.empty()) {
14958 NodeIdx = VectorizableTree.size();
14959 } else {
14960 unsigned NewIdx = 0;
14961 do {
14962 NewIdx = *ExtraBitWidthNodes.begin();
14963 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
14964 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
14965 NodeIdx = NewIdx;
14966 IsTruncRoot =
14967 NodeIdx < VectorizableTree.size() &&
14968 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
14969 [](const EdgeInfo &EI) {
14970 return EI.EdgeIdx == 0 &&
14971 EI.UserTE->getOpcode() == Instruction::Trunc &&
14972 !EI.UserTE->isAltShuffle();
14973 });
14974 IsSignedCmp =
14975 NodeIdx < VectorizableTree.size() &&
14976 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
14977 [](const EdgeInfo &EI) {
14978 return EI.UserTE->getOpcode() == Instruction::ICmp &&
14979 any_of(EI.UserTE->Scalars, [](Value *V) {
14980 auto *IC = dyn_cast<ICmpInst>(V);
14981 return IC && IC->isSigned();
14982 });
14983 });
14984 }
14985
14986 // If the maximum bit width we compute is less than the width of the roots'
14987 // type, we can proceed with the narrowing. Otherwise, do nothing.
14988 if (MaxBitWidth == 0 ||
14989 MaxBitWidth >=
14990 cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
14991 if (UserIgnoreList)
14992 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
14993 continue;
14994 }
14995
14996 // Finally, map the values we can demote to the maximum bit width we
14997 // computed.
14998 for (unsigned Idx : ToDemote) {
14999 TreeEntry *TE = VectorizableTree[Idx].get();
15000 if (MinBWs.contains(TE))
15001 continue;
15002 bool IsSigned = TE->getOpcode() == Instruction::SExt ||
15003 any_of(TE->Scalars, [&](Value *R) {
15004 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15005 });
15006 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
15007 }
15008 }
15009}
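// A sketch of the overall effect (hypothetical IR): if every lane of a
// vectorized i32 subtree needs only 8 significant bits, MinBWs records the
// pair (8, IsSigned) for its tree entries and codegen can emit the operations
// on narrower vectors, e.g.
//   %v = add <4 x i8> %a, %b
//   %w = sext <4 x i8> %v to <4 x i32>
// instead of performing the whole computation on <4 x i32>.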
15010
15011PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15012 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
15013 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
15014 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
15015 auto *AA = &AM.getResult<AAManager>(F);
15016 auto *LI = &AM.getResult<LoopAnalysis>(F);
15017 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
15018 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
15019 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
15020 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
15021
15022 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
15023 if (!Changed)
15024 return PreservedAnalyses::all();
15025
15026 PreservedAnalyses PA;
15027 PA.preserveSet<CFGAnalyses>();
15028 return PA;
15029}
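// A minimal way to exercise this entry point in isolation is through opt's
// new pass manager, e.g. opt -passes=slp-vectorizer -S input.ll.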
15030
15031bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15032 TargetTransformInfo *TTI_,
15033 TargetLibraryInfo *TLI_, AAResults *AA_,
15034 LoopInfo *LI_, DominatorTree *DT_,
15035 AssumptionCache *AC_, DemandedBits *DB_,
15036 OptimizationRemarkEmitter *ORE_) {
15037 if (!RunSLPVectorization)
15038 return false;
15039 SE = SE_;
15040 TTI = TTI_;
15041 TLI = TLI_;
15042 AA = AA_;
15043 LI = LI_;
15044 DT = DT_;
15045 AC = AC_;
15046 DB = DB_;
15047 DL = &F.getParent()->getDataLayout();
15048
15049 Stores.clear();
15050 GEPs.clear();
15051 bool Changed = false;
15052
15053 // If the target claims to have no vector registers don't attempt
15054 // vectorization.
15056 LLVM_DEBUG(
15057 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15058 return false;
15059 }
15060
15061 // Don't vectorize when the attribute NoImplicitFloat is used.
15062 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
15063 return false;
15064
15065 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15066
15067 // Use the bottom up slp vectorizer to construct chains that start with
15068 // store instructions.
15069 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15070
15071 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15072 // delete instructions.
15073
15074 // Update DFS numbers now so that we can use them for ordering.
15075 DT->updateDFSNumbers();
15076
15077 // Scan the blocks in the function in post order.
15078 for (auto *BB : post_order(&F.getEntryBlock())) {
15079 // Start new block - clear the list of reduction roots.
15080 R.clearReductionData();
15081 collectSeedInstructions(BB);
15082
15083 // Vectorize trees that end at stores.
15084 if (!Stores.empty()) {
15085 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15086 << " underlying objects.\n");
15087 Changed |= vectorizeStoreChains(R);
15088 }
15089
15090 // Vectorize trees that end at reductions.
15091 Changed |= vectorizeChainsInBlock(BB, R);
15092
15093 // Vectorize the index computations of getelementptr instructions. This
15094 // is primarily intended to catch gather-like idioms ending at
15095 // non-consecutive loads.
15096 if (!GEPs.empty()) {
15097 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15098 << " underlying objects.\n");
15099 Changed |= vectorizeGEPIndices(BB, R);
15100 }
15101 }
15102
15103 if (Changed) {
15104 R.optimizeGatherSequence();
15105 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15106 }
15107 return Changed;
15108}
15109
15110bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15111 unsigned Idx, unsigned MinVF) {
15112 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15113 << "\n");
15114 const unsigned Sz = R.getVectorElementSize(Chain[0]);
15115 unsigned VF = Chain.size();
15116
15117 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
15118 // Check if vectorizing with a non-power-of-2 VF should be considered. At
15119 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15120 // all vector lanes are used.
15121 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15122 return false;
15123 }
15124
15125 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15126 << "\n");
15127
15128 R.buildTree(Chain);
15129 if (R.isTreeTinyAndNotFullyVectorizable())
15130 return false;
15131 if (R.isLoadCombineCandidate())
15132 return false;
15133 R.reorderTopToBottom();
15134 R.reorderBottomToTop();
15135 R.buildExternalUses();
15136
15137 R.computeMinimumValueSizes();
15138
15139 InstructionCost Cost = R.getTreeCost();
15140
15141 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15142 if (Cost < -SLPCostThreshold) {
15143 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15144
15145 using namespace ore;
15146
15147 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
15148 cast<StoreInst>(Chain[0]))
15149 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15150 << " and with tree size "
15151 << NV("TreeSize", R.getTreeSize()));
15152
15153 R.vectorizeTree();
15154 return true;
15155 }
15156
15157 return false;
15158}
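// As a concrete (hypothetical) example of the kind of chain handled above,
// four consecutive i32 stores
//   store i32 %a, ptr %p
//   store i32 %b, ptr %p.1    ; %p.1 = %p + 4 bytes
//   store i32 %c, ptr %p.2
//   store i32 %d, ptr %p.3
// can be replaced, when the cost model agrees, by building a <4 x i32> vector
// from %a..%d and issuing a single wide store to %p.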
15159
15160bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
15161 BoUpSLP &R) {
15162 // We may run into multiple chains that merge into a single chain. We mark the
15163 // stores that we vectorized so that we don't visit the same store twice.
15164 BoUpSLP::ValueSet VectorizedStores;
15165 bool Changed = false;
15166
15167 // Stores the pairs (first_store, last_store) of store ranges that we have
15168 // already tried to vectorize, allowing us to skip ranges whose earlier
15169 // vectorization attempts were unsuccessful.
15171 struct StoreDistCompare {
15172 bool operator()(const std::pair<unsigned, int> &Op1,
15173 const std::pair<unsigned, int> &Op2) const {
15174 return Op1.second < Op2.second;
15175 }
15176 };
15177 // A set of pairs (index of store in Stores array ref, Distance of the store
15178 // address relative to base store address in units).
15179 using StoreIndexToDistSet =
15180 std::set<std::pair<unsigned, int>, StoreDistCompare>;
15181 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
15182 int PrevDist = -1;
15184 // Collect the chain into a list.
15185 for (auto [Idx, Data] : enumerate(Set)) {
15186 if (Operands.empty() || Data.second - PrevDist == 1) {
15187 Operands.push_back(Stores[Data.first]);
15188 PrevDist = Data.second;
15189 if (Idx != Set.size() - 1)
15190 continue;
15191 }
15192 auto E = make_scope_exit([&, &DataVar = Data]() {
15193 Operands.clear();
15194 Operands.push_back(Stores[DataVar.first]);
15195 PrevDist = DataVar.second;
15196 });
15197
15198 if (Operands.size() <= 1)
15199 continue;
15200
15201 unsigned MaxVecRegSize = R.getMaxVecRegSize();
15202 unsigned EltSize = R.getVectorElementSize(Operands[0]);
15203 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
15204
15205 unsigned MaxVF =
15206 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
15207 auto *Store = cast<StoreInst>(Operands[0]);
15208 Type *StoreTy = Store->getValueOperand()->getType();
15209 Type *ValueTy = StoreTy;
15210 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
15211 ValueTy = Trunc->getSrcTy();
15212 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
15213 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));
15214
15215 if (MaxVF < MinVF) {
15216 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
15217 << ") < "
15218 << "MinVF (" << MinVF << ")\n");
15219 continue;
15220 }
15221
15222 unsigned NonPowerOf2VF = 0;
15223 if (VectorizeNonPowerOf2) {
15224 // First try vectorizing with a non-power-of-2 VF. At the moment, only
15225 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
15226 // lanes are used.
15227 unsigned CandVF = Operands.size();
15228 if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF)
15229 NonPowerOf2VF = CandVF;
15230 }
15231
15232 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
15233 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
15234 unsigned Size = MinVF;
15235 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
15236 VF = Size > MaxVF ? NonPowerOf2VF : Size;
15237 Size *= 2;
15238 });
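// For instance (hypothetical numbers), with MinVF = 2, MaxVF = 16 and no
// non-power-of-2 candidate, the loop above fills CandidateVFs with
// {16, 8, 4, 2}, so the widest factor is attempted first below.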
15239 unsigned StartIdx = 0;
15240 for (unsigned Size : CandidateVFs) {
15241 for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
15243 assert(
15244 all_of(
15245 Slice,
15246 [&](Value *V) {
15247 return cast<StoreInst>(V)->getValueOperand()->getType() ==
15248 cast<StoreInst>(Slice.front())
15249 ->getValueOperand()
15250 ->getType();
15251 }) &&
15252 "Expected all operands of same type.");
15253 if (!VectorizedStores.count(Slice.front()) &&
15254 !VectorizedStores.count(Slice.back()) &&
15255 TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
15256 .second &&
15257 vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
15258 // Mark the vectorized stores so that we don't vectorize them again.
15259 VectorizedStores.insert(Slice.begin(), Slice.end());
15260 Changed = true;
15261 // If we vectorized initial block, no need to try to vectorize it
15262 // again.
15263 if (Cnt == StartIdx)
15264 StartIdx += Size;
15265 Cnt += Size;
15266 continue;
15267 }
15268 ++Cnt;
15269 }
15270 // Check if the whole array was vectorized already - exit.
15271 if (StartIdx >= Operands.size())
15272 break;
15273 }
15274 }
15275 };
15276
15277 // Stores pairs (first: index of the store in the Stores array ref whose
15278 // address is taken as the base, second: sorted set of pairs {index, dist},
15279 // which are the indices of stores in the set and their store location
15280 // distances relative to the base address).
15281
15282 // Need to store the index of the very first store separately, since the set
15283 // may be reordered after the insertion and the first store may be moved. This
15284 // container reduces the number of calls to the getPointersDiff() function.
15286 // Inserts the specified store SI with the given index Idx into the set of
15287 // stores. If a store with the same distance has already been found, stop the
15288 // insertion and try to vectorize the stores found so far. If some stores from
15289 // this sequence were not vectorized, try to vectorize them together with the
15290 // new store later. But this logic is applied only to the stores that come
15291 // before the previous store with the same distance.
15292 // Example:
15293 // 1. store x, %p
15294 // 2. store y, %p+1
15295 // 3. store z, %p+2
15296 // 4. store a, %p
15297 // 5. store b, %p+3
15298 // - Scan this from the last to first store. The very first bunch of stores is
15299 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
15300 // vector).
15301 // - The next store in the list - #1 - has the same distance from store #5 as
15302 // the store #4.
15303 // - Try to vectorize sequence of stores 4,2,3,5.
15304 // - If all these stores are vectorized - just drop them.
15305 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
15306 // - Start new stores sequence.
15307 // The new bunch of stores is {1, {1, 0}}.
15308 // - Add the stores from previous sequence, that were not vectorized.
15309 // Here we consider the stores in reverse order relative to how they appear
15310 // in the IR (Stores is already reversed, see the vectorizeStoreChains() function).
15311 // Store #3 can be added -> comes after store #4 with the same distance as
15312 // store #1.
15313 // Store #5 cannot be added - comes before store #4.
15314 // This logic improves compile time: we assume that stores coming after a
15315 // previous store with the same distance most likely have memory dependencies,
15316 // so there is no need to waste compile time trying to vectorize them.
15317 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
15318 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
15319 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
15320 std::optional<int> Diff = getPointersDiff(
15321 Stores[Set.first]->getValueOperand()->getType(),
15322 Stores[Set.first]->getPointerOperand(),
15323 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
15324 /*StrictCheck=*/true);
15325 if (!Diff)
15326 continue;
15327 auto It = Set.second.find(std::make_pair(Idx, *Diff));
15328 if (It == Set.second.end()) {
15329 Set.second.emplace(Idx, *Diff);
15330 return;
15331 }
15332 // Try to vectorize the first found set to avoid duplicate analysis.
15333 TryToVectorize(Set.second);
15334 StoreIndexToDistSet PrevSet;
15335 PrevSet.swap(Set.second);
15336 Set.first = Idx;
15337 Set.second.emplace(Idx, 0);
15338 // Insert stores that followed previous match to try to vectorize them
15339 // with this store.
15340 unsigned StartIdx = It->first + 1;
15341 SmallBitVector UsedStores(Idx - StartIdx);
15342 // Distances to the previously found duplicate store (or this store, since
15343 // they store to the same address).
15344 SmallVector<int> Dists(Idx - StartIdx, 0);
15345 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
15346 // Do not try to vectorize sequences, we already tried.
15347 if (Pair.first <= It->first ||
15348 VectorizedStores.contains(Stores[Pair.first]))
15349 break;
15350 unsigned BI = Pair.first - StartIdx;
15351 UsedStores.set(BI);
15352 Dists[BI] = Pair.second - It->second;
15353 }
15354 for (unsigned I = StartIdx; I < Idx; ++I) {
15355 unsigned BI = I - StartIdx;
15356 if (UsedStores.test(BI))
15357 Set.second.emplace(I, Dists[BI]);
15358 }
15359 return;
15360 }
15361 auto &Res = SortedStores.emplace_back();
15362 Res.first = Idx;
15363 Res.second.emplace(Idx, 0);
15364 };
15365 StoreInst *PrevStore = Stores.front();
15366 for (auto [I, SI] : enumerate(Stores)) {
15367 // Check that we do not try to vectorize stores of different types.
15368 if (PrevStore->getValueOperand()->getType() !=
15369 SI->getValueOperand()->getType()) {
15370 for (auto &Set : SortedStores)
15371 TryToVectorize(Set.second);
15372 SortedStores.clear();
15373 PrevStore = SI;
15374 }
15375 FillStoresSet(I, SI);
15376 }
15377
15378 // Final vectorization attempt.
15379 for (auto &Set : SortedStores)
15380 TryToVectorize(Set.second);
15381
15382 return Changed;
15383}
15384
15385void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
15386 // Initialize the collections. We will make a single pass over the block.
15387 Stores.clear();
15388 GEPs.clear();
15389
15390 // Visit the store and getelementptr instructions in BB and organize them in
15391 // Stores and GEPs according to the underlying objects of their pointer
15392 // operands.
15393 for (Instruction &I : *BB) {
15394 // Ignore store instructions that are volatile or have a pointer operand
15395 // that doesn't point to a scalar type.
15396 if (auto *SI = dyn_cast<StoreInst>(&I)) {
15397 if (!SI->isSimple())
15398 continue;
15399 if (!isValidElementType(SI->getValueOperand()->getType()))
15400 continue;
15401 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
15402 }
15403
15404 // Ignore getelementptr instructions that have more than one index, a
15405 // constant index, or a pointer operand that doesn't point to a scalar
15406 // type.
15407 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
15408 if (GEP->getNumIndices() != 1)
15409 continue;
15410 Value *Idx = GEP->idx_begin()->get();
15411 if (isa<Constant>(Idx))
15412 continue;
15413 if (!isValidElementType(Idx->getType()))
15414 continue;
15415 if (GEP->getType()->isVectorTy())
15416 continue;
15417 GEPs[GEP->getPointerOperand()].push_back(GEP);
15418 }
15419 }
15420}
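// For example (hypothetical IR), given a block containing
//   %a.1 = getelementptr i32, ptr %a, i64 1
//   store i32 %x, ptr %a
//   store i32 %y, ptr %a.1
//   %g = getelementptr i32, ptr %b, i64 %i
// both stores land in the same Stores bucket (their underlying object is %a),
// while %g is recorded in GEPs keyed by its pointer operand %b.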
15421
15422bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
15423 bool MaxVFOnly) {
15424 if (VL.size() < 2)
15425 return false;
15426
15427 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
15428 << VL.size() << ".\n");
15429
15430 // Check that all of the parts are instructions of the same type,
15431 // we permit an alternate opcode via InstructionsState.
15432 InstructionsState S = getSameOpcode(VL, *TLI);
15433 if (!S.getOpcode())
15434 return false;
15435
15436 Instruction *I0 = cast<Instruction>(S.OpValue);
15437 // Make sure invalid types (including vector type) are rejected before
15438 // determining vectorization factor for scalar instructions.
15439 for (Value *V : VL) {
15440 Type *Ty = V->getType();
15441 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
15442 // NOTE: the following will give the user an internal llvm type name, which
15443 // may not be useful.
15444 R.getORE()->emit([&]() {
15445 std::string TypeStr;
15446 llvm::raw_string_ostream rso(TypeStr);
15447 Ty->print(rso);
15448 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
15449 << "Cannot SLP vectorize list: type "
15450 << rso.str() + " is unsupported by vectorizer";
15451 });
15452 return false;
15453 }
15454 }
15455
15456 unsigned Sz = R.getVectorElementSize(I0);
15457 unsigned MinVF = R.getMinVF(Sz);
15458 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
15459 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
15460 if (MaxVF < 2) {
15461 R.getORE()->emit([&]() {
15462 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
15463 << "Cannot SLP vectorize list: vectorization factor "
15464 << "less than 2 is not supported";
15465 });
15466 return false;
15467 }
15468
15469 bool Changed = false;
15470 bool CandidateFound = false;
15471 InstructionCost MinCost = SLPCostThreshold.getValue();
15472 Type *ScalarTy = VL[0]->getType();
15473 if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
15474 ScalarTy = IE->getOperand(1)->getType();
15475
15476 unsigned NextInst = 0, MaxInst = VL.size();
15477 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
15478 // No actual vectorization should happen if the number of parts is the same
15479 // as the provided vectorization factor (i.e. the scalar type is used for the
15480 // vector code during codegen).
15481 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
15482 if (TTI->getNumberOfParts(VecTy) == VF)
15483 continue;
15484 for (unsigned I = NextInst; I < MaxInst; ++I) {
15485 unsigned ActualVF = std::min(MaxInst - I, VF);
15486
15487 if (!isPowerOf2_32(ActualVF))
15488 continue;
15489
15490 if (MaxVFOnly && ActualVF < MaxVF)
15491 break;
15492 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
15493 break;
15494
15495 ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
15496 // Check that a previous iteration of this loop did not delete the Value.
15497 if (llvm::any_of(Ops, [&R](Value *V) {
15498 auto *I = dyn_cast<Instruction>(V);
15499 return I && R.isDeleted(I);
15500 }))
15501 continue;
15502
15503 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
15504 << "\n");
15505
15506 R.buildTree(Ops);
15507 if (R.isTreeTinyAndNotFullyVectorizable())
15508 continue;
15509 R.reorderTopToBottom();
15510 R.reorderBottomToTop(
15511 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
15512 !R.doesRootHaveInTreeUses());
15513 R.buildExternalUses();
15514
15515 R.computeMinimumValueSizes();
15516 InstructionCost Cost = R.getTreeCost();
15517 CandidateFound = true;
15518 MinCost = std::min(MinCost, Cost);
15519
15520 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
15521 << " for VF=" << ActualVF << "\n");
15522 if (Cost < -SLPCostThreshold) {
15523 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
15524 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
15525 cast<Instruction>(Ops[0]))
15526 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
15527 << " and with tree size "
15528 << ore::NV("TreeSize", R.getTreeSize()));
15529
15530 R.vectorizeTree();
15531 // Move to the next bundle.
15532 I += VF - 1;
15533 NextInst = I + 1;
15534 Changed = true;
15535 }
15536 }
15537 }
15538
15539 if (!Changed && CandidateFound) {
15540 R.getORE()->emit([&]() {
15541 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
15542 << "List vectorization was possible but not beneficial with cost "
15543 << ore::NV("Cost", MinCost) << " >= "
15544 << ore::NV("Treshold", -SLPCostThreshold);
15545 });
15546 } else if (!Changed) {
15547 R.getORE()->emit([&]() {
15548 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
15549 << "Cannot SLP vectorize list: vectorization was impossible"
15550 << " with available vectorization factors";
15551 });
15552 }
15553 return Changed;
15554}
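// A rough walk-through (hypothetical sizes, assuming MaxVFOnly is false): for
// 6 candidate scalars with MinVF = 2 and MaxVF = 4, the loops above first try
// the slice [0, 4) at VF = 4 and, if that is vectorized, retry the remaining
// pair [4, 6) at VF = 2 on the next iteration of the outer VF loop.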
15555
15556bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
15557 if (!I)
15558 return false;
15559
15560 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
15561 return false;
15562
15563 Value *P = I->getParent();
15564
15565 // Vectorize in current basic block only.
15566 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
15567 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
15568 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
15569 return false;
15570
15571 // First collect all possible candidates
15573 Candidates.emplace_back(Op0, Op1);
15574
15575 auto *A = dyn_cast<BinaryOperator>(Op0);
15576 auto *B = dyn_cast<BinaryOperator>(Op1);
15577 // Try to skip B.
15578 if (A && B && B->hasOneUse()) {
15579 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
15580 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
15581 if (B0 && B0->getParent() == P)
15582 Candidates.emplace_back(A, B0);
15583 if (B1 && B1->getParent() == P)
15584 Candidates.emplace_back(A, B1);
15585 }
15586 // Try to skip A.
15587 if (B && A && A->hasOneUse()) {
15588 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
15589 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
15590 if (A0 && A0->getParent() == P)
15591 Candidates.emplace_back(A0, B);
15592 if (A1 && A1->getParent() == P)
15593 Candidates.emplace_back(A1, B);
15594 }
15595
15596 if (Candidates.size() == 1)
15597 return tryToVectorizeList({Op0, Op1}, R);
15598
15599 // We have multiple options. Try to pick the single best.
15600 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
15601 if (!BestCandidate)
15602 return false;
15603 return tryToVectorizeList(
15604 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
15605}
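// As a rough illustration (hypothetical IR), for
//   %op0 = mul i32 %a, %b
//   %op1 = add i32 %c, %d
//   %r = add i32 %op0, %op1
// the initial candidate pair is {%op0, %op1}; if %op1 has a single use and one
// of its operands is itself a binary operator in the same block, pairs that
// "skip" %op1 are added as well (and symmetrically for %op0), and
// findBestRootPair picks the most promising pair to seed tryToVectorizeList.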
15606
15607namespace {
15608
15609/// Model horizontal reductions.
15610///
15611/// A horizontal reduction is a tree of reduction instructions that has values
15612/// that can be put into a vector as its leaves. For example:
15613///
15614/// mul mul mul mul
15615/// \ / \ /
15616/// + +
15617/// \ /
15618/// +
15619/// This tree has "mul" as its leaf values and "+" as its reduction
15620/// instructions. A reduction can feed into a store or a binary operation
15621/// feeding a phi.
15622/// ...
15623/// \ /
15624/// +
15625/// |
15626/// phi +=
15627///
15628/// Or:
15629/// ...
15630/// \ /
15631/// +
15632/// |
15633/// *p =
15634///
15635class HorizontalReduction {
15636 using ReductionOpsType = SmallVector<Value *, 16>;
15637 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
15638 ReductionOpsListType ReductionOps;
15639 /// List of possibly reduced values.
15641 /// Maps reduced value to the corresponding reduction operation.
15643 // Use map vector to make stable output.
15645 WeakTrackingVH ReductionRoot;
15646 /// The type of reduction operation.
15647 RecurKind RdxKind;
15648 /// Checks if the optimization of original scalar identity operations on
15649 /// matched horizontal reductions is enabled and allowed.
15650 bool IsSupportedHorRdxIdentityOp = false;
15651
15652 static bool isCmpSelMinMax(Instruction *I) {
15653 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
15655 }
15656
15657 // And/or are potentially poison-safe logical patterns like:
15658 // select x, y, false
15659 // select x, true, y
15660 static bool isBoolLogicOp(Instruction *I) {
15661 return isa<SelectInst>(I) &&
15662 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
15663 }
15664
15665 /// Checks if instruction is associative and can be vectorized.
15666 static bool isVectorizable(RecurKind Kind, Instruction *I) {
15667 if (Kind == RecurKind::None)
15668 return false;
15669
15670 // Integer ops that map to select instructions or intrinsics are fine.
15672 isBoolLogicOp(I))
15673 return true;
15674
15675 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
15676 // FP min/max are associative except for NaN and -0.0. We do not
15677 // have to rule out -0.0 here because the intrinsic semantics do not
15678 // specify a fixed result for it.
15679 return I->getFastMathFlags().noNaNs();
15680 }
15681
15682 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
15683 return true;
15684
15685 return I->isAssociative();
15686 }
15687
15688 static Value *getRdxOperand(Instruction *I, unsigned Index) {
15689 // Poison-safe 'or' takes the form: select X, true, Y
15690 // To make that work with the normal operand processing, we skip the
15691 // true value operand.
15692 // TODO: Change the code and data structures to handle this without a hack.
15693 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
15694 return I->getOperand(2);
15695 return I->getOperand(Index);
15696 }
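 // E.g. for a poison-safe or such as %r = select i1 %x, i1 true, i1 %y
 // (hypothetical IR), asking for operand index 1 yields %y, so the reduction
 // walker sees %x and %y rather than the constant true value.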
15697
15698 /// Creates reduction operation with the current opcode.
15699 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
15700 Value *RHS, const Twine &Name, bool UseSelect) {
15701 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
15702 switch (Kind) {
15703 case RecurKind::Or:
15704 if (UseSelect &&
15706 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
15707 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
15708 Name);
15709 case RecurKind::And:
15710 if (UseSelect &&
15712 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
15713 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
15714 Name);
15715 case RecurKind::Add:
15716 case RecurKind::Mul:
15717 case RecurKind::Xor:
15718 case RecurKind::FAdd:
15719 case RecurKind::FMul:
15720 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
15721 Name);
15722 case RecurKind::FMax:
15723 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
15724 case RecurKind::FMin:
15725 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
15726 case RecurKind::FMaximum:
15727 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
15728 case RecurKind::FMinimum:
15729 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
15730 case RecurKind::SMax:
15731 if (UseSelect) {
15732 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
15733 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
15734 }
15735 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
15736 case RecurKind::SMin:
15737 if (UseSelect) {
15738 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
15739 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
15740 }
15741 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
15742 case RecurKind::UMax:
15743 if (UseSelect) {
15744 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
15745 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
15746 }
15747 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
15748 case RecurKind::UMin:
15749 if (UseSelect) {
15750 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
15751 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
15752 }
15753 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
15754 default:
15755 llvm_unreachable("Unknown reduction operation.");
15756 }
15757 }
15758
15759 /// Creates reduction operation with the current opcode with the IR flags
15760 /// from \p ReductionOps, dropping nuw/nsw flags.
15761 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
15762 Value *RHS, const Twine &Name,
15763 const ReductionOpsListType &ReductionOps) {
15764 bool UseSelect = ReductionOps.size() == 2 ||
15765 // Logical or/and.
15766 (ReductionOps.size() == 1 &&
15767 any_of(ReductionOps.front(), IsaPred<SelectInst>));
15768 assert((!UseSelect || ReductionOps.size() != 2 ||
15769 isa<SelectInst>(ReductionOps[1][0])) &&
15770 "Expected cmp + select pairs for reduction");
15771 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
15773 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
15774 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
15775 /*IncludeWrapFlags=*/false);
15776 propagateIRFlags(Op, ReductionOps[1], nullptr,
15777 /*IncludeWrapFlags=*/false);
15778 return Op;
15779 }
15780 }
15781 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
15782 return Op;
15783 }
15784
15785public:
15786 static RecurKind getRdxKind(Value *V) {
15787 auto *I = dyn_cast<Instruction>(V);
15788 if (!I)
15789 return RecurKind::None;
15790 if (match(I, m_Add(m_Value(), m_Value())))
15791 return RecurKind::Add;
15792 if (match(I, m_Mul(m_Value(), m_Value())))
15793 return RecurKind::Mul;
15794 if (match(I, m_And(m_Value(), m_Value())) ||
15796 return RecurKind::And;
15797 if (match(I, m_Or(m_Value(), m_Value())) ||
15799 return RecurKind::Or;
15800 if (match(I, m_Xor(m_Value(), m_Value())))
15801 return RecurKind::Xor;
15802 if (match(I, m_FAdd(m_Value(), m_Value())))
15803 return RecurKind::FAdd;
15804 if (match(I, m_FMul(m_Value(), m_Value())))
15805 return RecurKind::FMul;
15806
15807 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
15808 return RecurKind::FMax;
15809 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
15810 return RecurKind::FMin;
15811
15812 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
15813 return RecurKind::FMaximum;
15814 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
15815 return RecurKind::FMinimum;
15816 // This matches either cmp+select or intrinsics. SLP is expected to handle
15817 // either form.
15818 // TODO: If we are canonicalizing to intrinsics, we can remove several
15819 // special-case paths that deal with selects.
15820 if (match(I, m_SMax(m_Value(), m_Value())))
15821 return RecurKind::SMax;
15822 if (match(I, m_SMin(m_Value(), m_Value())))
15823 return RecurKind::SMin;
15824 if (match(I, m_UMax(m_Value(), m_Value())))
15825 return RecurKind::UMax;
15826 if (match(I, m_UMin(m_Value(), m_Value())))
15827 return RecurKind::UMin;
15828
15829 if (auto *Select = dyn_cast<SelectInst>(I)) {
15830 // Try harder: look for a min/max pattern based on instructions producing
15831 // the same values, such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
15832 // During the intermediate stages of SLP, it's very common to have a
15833 // pattern like this (since optimizeGatherSequence is run only once
15834 // at the end):
15835 // %1 = extractelement <2 x i32> %a, i32 0
15836 // %2 = extractelement <2 x i32> %a, i32 1
15837 // %cond = icmp sgt i32 %1, %2
15838 // %3 = extractelement <2 x i32> %a, i32 0
15839 // %4 = extractelement <2 x i32> %a, i32 1
15840 // %select = select i1 %cond, i32 %3, i32 %4
15841 CmpInst::Predicate Pred;
15842 Instruction *L1;
15843 Instruction *L2;
15844
15845 Value *LHS = Select->getTrueValue();
15846 Value *RHS = Select->getFalseValue();
15847 Value *Cond = Select->getCondition();
15848
15849 // TODO: Support inverse predicates.
15850 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
15851 if (!isa<ExtractElementInst>(RHS) ||
15852 !L2->isIdenticalTo(cast<Instruction>(RHS)))
15853 return RecurKind::None;
15854 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
15855 if (!isa<ExtractElementInst>(LHS) ||
15856 !L1->isIdenticalTo(cast<Instruction>(LHS)))
15857 return RecurKind::None;
15858 } else {
15859 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
15860 return RecurKind::None;
15861 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
15862 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
15863 !L2->isIdenticalTo(cast<Instruction>(RHS)))
15864 return RecurKind::None;
15865 }
15866
15867 switch (Pred) {
15868 default:
15869 return RecurKind::None;
15870 case CmpInst::ICMP_SGT:
15871 case CmpInst::ICMP_SGE:
15872 return RecurKind::SMax;
15873 case CmpInst::ICMP_SLT:
15874 case CmpInst::ICMP_SLE:
15875 return RecurKind::SMin;
15876 case CmpInst::ICMP_UGT:
15877 case CmpInst::ICMP_UGE:
15878 return RecurKind::UMax;
15879 case CmpInst::ICMP_ULT:
15880 case CmpInst::ICMP_ULE:
15881 return RecurKind::UMin;
15882 }
15883 }
15884 return RecurKind::None;
15885 }
15886
15887 /// Get the index of the first operand.
15888 static unsigned getFirstOperandIndex(Instruction *I) {
15889 return isCmpSelMinMax(I) ? 1 : 0;
15890 }
15891
15892private:
15893 /// Total number of operands in the reduction operation.
15894 static unsigned getNumberOfOperands(Instruction *I) {
15895 return isCmpSelMinMax(I) ? 3 : 2;
15896 }
15897
15898 /// Checks if the instruction is in basic block \p BB.
15899 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
15900 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
15901 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
15902 auto *Sel = cast<SelectInst>(I);
15903 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
15904 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
15905 }
15906 return I->getParent() == BB;
15907 }
15908
15909 /// Expected number of uses for reduction operations/reduced values.
15910 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
15911 if (IsCmpSelMinMax) {
15912 // The SelectInst must be used twice while the condition op must have a
15913 // single use only.
15914 if (auto *Sel = dyn_cast<SelectInst>(I))
15915 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
15916 return I->hasNUses(2);
15917 }
15918
15919 // Arithmetic reduction operation must be used once only.
15920 return I->hasOneUse();
15921 }
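 // For instance, in a chained cmp+select smax reduction (hypothetical IR):
 //   %c0 = icmp sgt i32 %m0, %x
 //   %m1 = select i1 %c0, i32 %m0, i32 %x
 //   %c1 = icmp sgt i32 %m1, %y
 //   %m2 = select i1 %c1, i32 %m1, i32 %y
 // %m1 is used by both %c1 and %m2 (two uses) while %c0 feeds only %m1, which
 // is exactly the shape the check above expects.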
15922
15923 /// Initializes the list of reduction operations.
15924 void initReductionOps(Instruction *I) {
15925 if (isCmpSelMinMax(I))
15926 ReductionOps.assign(2, ReductionOpsType());
15927 else
15928 ReductionOps.assign(1, ReductionOpsType());
15929 }
15930
15931 /// Add all reduction operations for the reduction instruction \p I.
15932 void addReductionOps(Instruction *I) {
15933 if (isCmpSelMinMax(I)) {
15934 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
15935 ReductionOps[1].emplace_back(I);
15936 } else {
15937 ReductionOps[0].emplace_back(I);
15938 }
15939 }
15940
15941 static bool isGoodForReduction(ArrayRef<Value *> Data) {
15942 int Sz = Data.size();
15943 auto *I = dyn_cast<Instruction>(Data.front());
15944 return Sz > 1 || isConstant(Data.front()) ||
15945 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
15946 }
15947
15948public:
15949 HorizontalReduction() = default;
15950
15951 /// Try to find a reduction tree.
15952 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
15953 ScalarEvolution &SE, const DataLayout &DL,
15954 const TargetLibraryInfo &TLI) {
15955 RdxKind = HorizontalReduction::getRdxKind(Root);
15956 if (!isVectorizable(RdxKind, Root))
15957 return false;
15958
15959 // Analyze "regular" integer/FP types for reductions - no target-specific
15960 // types or pointers.
15961 Type *Ty = Root->getType();
15962 if (!isValidElementType(Ty) || Ty->isPointerTy())
15963 return false;
15964
15965 // Though the ultimate reduction may have multiple uses, its condition must
15966 // have only a single use.
15967 if (auto *Sel = dyn_cast<SelectInst>(Root))
15968 if (!Sel->getCondition()->hasOneUse())
15969 return false;
15970
15971 ReductionRoot = Root;
15972
15973 // Iterate through all the operands of the possible reduction tree and
15974 // gather all the reduced values, sorting them by their value id.
15975 BasicBlock *BB = Root->getParent();
15976 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
15977 SmallVector<Instruction *> Worklist(1, Root);
15978 // Checks if the operands of the \p TreeN instruction are also reduction
15979 // operations or should be treated as reduced values or an extra argument,
15980 // which is not part of the reduction.
15981 auto CheckOperands = [&](Instruction *TreeN,
15982 SmallVectorImpl<Value *> &ExtraArgs,
15983 SmallVectorImpl<Value *> &PossibleReducedVals,
15984 SmallVectorImpl<Instruction *> &ReductionOps) {
15985 for (int I = getFirstOperandIndex(TreeN),
15986 End = getNumberOfOperands(TreeN);
15987 I < End; ++I) {
15988 Value *EdgeVal = getRdxOperand(TreeN, I);
15989 ReducedValsToOps[EdgeVal].push_back(TreeN);
15990 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
15991 // Edge has wrong parent - mark as an extra argument.
15992 if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
15993 !hasSameParent(EdgeInst, BB)) {
15994 ExtraArgs.push_back(EdgeVal);
15995 continue;
15996 }
15997 // If the edge is not an instruction, differs from the main reduction
15998 // opcode, or has too many uses, treat it as a possible reduced value.
15999 // Also, do not try to reduce constant values if the operation is not
16000 // foldable.
16001 if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
16002 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
16003 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
16004 !isVectorizable(RdxKind, EdgeInst) ||
16005 (R.isAnalyzedReductionRoot(EdgeInst) &&
16006 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
16007 PossibleReducedVals.push_back(EdgeVal);
16008 continue;
16009 }
16010 ReductionOps.push_back(EdgeInst);
16011 }
16012 };
16013 // Try to regroup the reduced values so that it becomes more profitable to
16014 // reduce them. Values are grouped by their value ids, instructions by their
16015 // opcode and/or alternate opcode, with extra analysis for loads (grouping
16016 // them by the distance between pointers) and cmp instructions (grouping
16017 // them by the predicate).
16019 PossibleReducedVals;
16020 initReductionOps(Root);
16022 SmallSet<size_t, 2> LoadKeyUsed;
16023 SmallPtrSet<Value *, 4> DoNotReverseVals;
16024
16025 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
16027 if (LoadKeyUsed.contains(Key)) {
16028 auto LIt = LoadsMap.find(Ptr);
16029 if (LIt != LoadsMap.end()) {
16030 for (LoadInst *RLI : LIt->second) {
16031 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
16032 LI->getType(), LI->getPointerOperand(), DL, SE,
16033 /*StrictCheck=*/true))
16034 return hash_value(RLI->getPointerOperand());
16035 }
16036 for (LoadInst *RLI : LIt->second) {
16038 LI->getPointerOperand(), TLI)) {
16039 hash_code SubKey = hash_value(RLI->getPointerOperand());
16040 DoNotReverseVals.insert(RLI);
16041 return SubKey;
16042 }
16043 }
16044 if (LIt->second.size() > 2) {
16045 hash_code SubKey =
16046 hash_value(LIt->second.back()->getPointerOperand());
16047 DoNotReverseVals.insert(LIt->second.back());
16048 return SubKey;
16049 }
16050 }
16051 }
16052 LoadKeyUsed.insert(Key);
16053 LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
16054 return hash_value(LI->getPointerOperand());
16055 };
16056
16057 while (!Worklist.empty()) {
16058 Instruction *TreeN = Worklist.pop_back_val();
16060 SmallVector<Value *> PossibleRedVals;
16061 SmallVector<Instruction *> PossibleReductionOps;
16062 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16063 // If there are too many extra args, mark the instruction itself as a
16064 // reduction value rather than a reduction operation.
16065 if (Args.size() < 2) {
16066 addReductionOps(TreeN);
16067 // Add extra args.
16068 if (!Args.empty()) {
16069 assert(Args.size() == 1 && "Expected only single argument.");
16070 ExtraArgs[TreeN] = Args.front();
16071 }
16072 // Add reduction values. The values are sorted for better vectorization
16073 // results.
16074 for (Value *V : PossibleRedVals) {
16075 size_t Key, Idx;
16076 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
16077 /*AllowAlternate=*/false);
16078 ++PossibleReducedVals[Key][Idx]
16079 .insert(std::make_pair(V, 0))
16080 .first->second;
16081 }
16082 Worklist.append(PossibleReductionOps.rbegin(),
16083 PossibleReductionOps.rend());
16084 } else {
16085 size_t Key, Idx;
16086 std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
16087 /*AllowAlternate=*/false);
16088 ++PossibleReducedVals[Key][Idx]
16089 .insert(std::make_pair(TreeN, 0))
16090 .first->second;
16091 }
16092 }
16093 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
16094 // Sort the values by the total number of value kinds so that the reduction
16095 // starts from the longest possible sequences of reduced values.
16096 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
16097 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
16098 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
16099 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
16100 It != E; ++It) {
16101 PossibleRedValsVect.emplace_back();
16102 auto RedValsVect = It->second.takeVector();
16103 stable_sort(RedValsVect, llvm::less_second());
16104 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
16105 PossibleRedValsVect.back().append(Data.second, Data.first);
16106 }
16107 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
16108 return P1.size() > P2.size();
16109 });
16110 int NewIdx = -1;
16111 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
16112 if (isGoodForReduction(Data) ||
16113 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
16114 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
16116 cast<LoadInst>(Data.front())->getPointerOperand()) ==
16117 getUnderlyingObject(cast<LoadInst>(ReducedVals[NewIdx].front())
16118 ->getPointerOperand()))) {
16119 if (NewIdx < 0) {
16120 NewIdx = ReducedVals.size();
16121 ReducedVals.emplace_back();
16122 }
16123 if (DoNotReverseVals.contains(Data.front()))
16124 ReducedVals[NewIdx].append(Data.begin(), Data.end());
16125 else
16126 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
16127 } else {
16128 ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
16129 }
16130 }
16131 }
16132 // Sort the reduced values by the number of same/alternate opcodes and/or
16133 // pointer operands.
16134 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
16135 return P1.size() > P2.size();
16136 });
16137 return true;
16138 }
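 // As a rough illustration (hypothetical): if the reduced values are three
 // loads from nearby addresses plus one phi, the loads typically end up in one
 // ReducedVals group and the phi in another, and the groups are sorted so that
 // the longest one is attempted first by tryToReduce().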
16139
16140 /// Attempt to vectorize the tree found by matchAssociativeReduction.
16141 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
16142 const TargetLibraryInfo &TLI) {
16143 constexpr int ReductionLimit = 4;
16144 constexpr unsigned RegMaxNumber = 4;
16145 constexpr unsigned RedValsMaxNumber = 128;
16146 // If there are a sufficient number of reduction values, reduce
16147 // to a nearby power-of-2. We can safely generate oversized
16148 // vectors and rely on the backend to split them to legal sizes.
16149 unsigned NumReducedVals =
16150 std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
16151 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
16152 if (!isGoodForReduction(Vals))
16153 return Num;
16154 return Num + Vals.size();
16155 });
16156 if (NumReducedVals < ReductionLimit &&
16158 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
16159 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
16160 }))) {
16161 for (ReductionOpsType &RdxOps : ReductionOps)
16162 for (Value *RdxOp : RdxOps)
16163 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16164 return nullptr;
16165 }
16166
16167 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
16168 TargetFolder(DL));
16169 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
16170
16171 // Track the reduced values in case they are replaced by extractelement
16172 // instructions because of the vectorization.
16174 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
16175 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
16176 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
16177 ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
16178 // The same extra argument may be used several times, so log each attempt
16179 // to use it.
16180 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16181 assert(Pair.first && "DebugLoc must be set.");
16182 ExternallyUsedValues[Pair.second].push_back(Pair.first);
16183 TrackedVals.try_emplace(Pair.second, Pair.second);
16184 }
16185
16186 // The compare instruction of a min/max is the insertion point for new
16187 // instructions and may be replaced with a new compare instruction.
16188 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
16189 assert(isa<SelectInst>(RdxRootInst) &&
16190 "Expected min/max reduction to have select root instruction");
16191 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
16192 assert(isa<Instruction>(ScalarCond) &&
16193 "Expected min/max reduction to have compare condition");
16194 return cast<Instruction>(ScalarCond);
16195 };
16196
16197 // Return new VectorizedTree, based on previous value.
16198 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
16199 if (VectorizedTree) {
16200 // Update the final value in the reduction.
16202 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
16203 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
16205 !isGuaranteedNotToBePoison(VectorizedTree))) {
16206 auto It = ReducedValsToOps.find(Res);
16207 if (It != ReducedValsToOps.end() &&
16208 any_of(It->getSecond(),
16209 [](Instruction *I) { return isBoolLogicOp(I); }))
16210 std::swap(VectorizedTree, Res);
16211 }
16212
16213 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
16214 ReductionOps);
16215 }
16216 // Initialize the final value in the reduction.
16217 return Res;
16218 };
16219 bool AnyBoolLogicOp =
16220 any_of(ReductionOps.back(), [](Value *V) {
16221 return isBoolLogicOp(cast<Instruction>(V));
16222 });
16223 // The reduction root is used as the insertion point for new instructions,
16224 // so set it as externally used to prevent it from being deleted.
16225 ExternallyUsedValues[ReductionRoot];
16226 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
16227 ReductionOps.front().size());
16228 for (ReductionOpsType &RdxOps : ReductionOps)
16229 for (Value *RdxOp : RdxOps) {
16230 if (!RdxOp)
16231 continue;
16232 IgnoreList.insert(RdxOp);
16233 }
16234 // Intersect the fast-math-flags from all reduction operations.
16235 FastMathFlags RdxFMF;
16236 RdxFMF.set();
16237 for (Value *U : IgnoreList)
16238 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
16239 RdxFMF &= FPMO->getFastMathFlags();
16240 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
16241
16242 // Need to track the reduced values, as they may be changed during the
16243 // vectorization of subvectors.
16244 for (ArrayRef<Value *> Candidates : ReducedVals)
16245 for (Value *V : Candidates)
16246 TrackedVals.try_emplace(V, V);
16247
16248 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
16249 // List of the values that were reduced in other trees as part of gather
16250 // nodes and thus require an extract if fully vectorized in other trees.
16251 SmallPtrSet<Value *, 4> RequiredExtract;
16252 Value *VectorizedTree = nullptr;
16253 bool CheckForReusedReductionOps = false;
16254 // Try to vectorize elements based on their type.
16255 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
16256 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
16257 InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
16258 SmallVector<Value *> Candidates;
16259 Candidates.reserve(2 * OrigReducedVals.size());
16260 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
16261 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
16262 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
16263 // Check if the reduction value was not overridden by an extractelement
16264 // instruction because of the vectorization, and exclude it if it is not
16265 // compatible with the other values.
16266 // Also check if the instruction was folded to a constant/other value.
16267 auto *Inst = dyn_cast<Instruction>(RdxVal);
16268 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
16269 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
16270 (S.getOpcode() && !Inst))
16271 continue;
16272 Candidates.push_back(RdxVal);
16273 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
16274 }
16275 bool ShuffledExtracts = false;
16276 // Try to handle shuffled extractelements.
16277 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
16278 I + 1 < E) {
16279 InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
16280 if (NextS.getOpcode() == Instruction::ExtractElement &&
16281 !NextS.isAltShuffle()) {
16282 SmallVector<Value *> CommonCandidates(Candidates);
16283 for (Value *RV : ReducedVals[I + 1]) {
16284 Value *RdxVal = TrackedVals.find(RV)->second;
16285 // Check if the reduction value was not overridden by the
16286 // extractelement instruction because of the vectorization, and
16287 // exclude it if it is not compatible with the other values.
16288 if (auto *Inst = dyn_cast<Instruction>(RdxVal))
16289 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
16290 continue;
16291 CommonCandidates.push_back(RdxVal);
16292 TrackedToOrig.try_emplace(RdxVal, RV);
16293 }
16295 if (isFixedVectorShuffle(CommonCandidates, Mask)) {
16296 ++I;
16297 Candidates.swap(CommonCandidates);
16298 ShuffledExtracts = true;
16299 }
16300 }
16301 }
16302
16303 // Emit code for constant values.
16304 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
16305 allConstant(Candidates)) {
16306 Value *Res = Candidates.front();
16307 ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
16308 for (Value *VC : ArrayRef(Candidates).drop_front()) {
16309 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
16310 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
16311 if (auto *ResI = dyn_cast<Instruction>(Res))
16312 V.analyzedReductionRoot(ResI);
16313 }
16314 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
16315 continue;
16316 }
16317
16318 unsigned NumReducedVals = Candidates.size();
16319 if (NumReducedVals < ReductionLimit &&
16320 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
16321 !isSplat(Candidates)))
16322 continue;
16323
16324 // Check if we support processing of repeated scalar values (optimization of
16325 // original scalar identity operations on matched horizontal reductions).
16326 IsSupportedHorRdxIdentityOp =
16327 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
16328 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
16329 // Gather same values.
16330 MapVector<Value *, unsigned> SameValuesCounter;
16331 if (IsSupportedHorRdxIdentityOp)
16332 for (Value *V : Candidates)
16333 ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
16334 // Used to check if the reduced values are used the same number of times.
16335 // In this case the compiler may produce better code. E.g. if the reduced
16336 // values are aabbccdd (8 x values), then the first node of the tree will
16337 // have a node for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
16338 // Plus, the final reduction will be performed on <8 x aabbccdd>.
16339 // Instead the compiler may build the <4 x abcd> tree immediately, followed
16340 // by a reduction of <4 x abcd> multiplied by 2.
16341 // Currently this only handles add/fadd/xor; and/or/min/max do not require
16342 // this analysis, and other operations may require an extra estimation of
16343 // the profitability.
16344 bool SameScaleFactor = false;
16345 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
16346 SameValuesCounter.size() != Candidates.size();
16347 if (OptReusedScalars) {
16348 SameScaleFactor =
16349 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
16350 RdxKind == RecurKind::Xor) &&
16351 all_of(drop_begin(SameValuesCounter),
16352 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
16353 return P.second == SameValuesCounter.front().second;
16354 });
16355 Candidates.resize(SameValuesCounter.size());
16356 transform(SameValuesCounter, Candidates.begin(),
16357 [](const auto &P) { return P.first; });
16358 NumReducedVals = Candidates.size();
16359 // Have a reduction of the same element.
16360 if (NumReducedVals == 1) {
16361 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
16362 unsigned Cnt = SameValuesCounter.lookup(OrigV);
16363 Value *RedVal =
16364 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
16365 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16366 VectorizedVals.try_emplace(OrigV, Cnt);
16367 continue;
16368 }
16369 }
16370
16371 unsigned MaxVecRegSize = V.getMaxVecRegSize();
16372 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
16373 unsigned MaxElts =
16374 RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
16375
16376 unsigned ReduxWidth = std::min<unsigned>(
16377 llvm::bit_floor(NumReducedVals),
16378 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
16379 RegMaxNumber * RedValsMaxNumber));
16380 unsigned Start = 0;
16381 unsigned Pos = Start;
16382 // Restarts vectorization attempt with lower vector factor.
16383 unsigned PrevReduxWidth = ReduxWidth;
16384 bool CheckForReusedReductionOpsLocal = false;
16385 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
16386 &CheckForReusedReductionOpsLocal,
16387 &PrevReduxWidth, &V,
16388 &IgnoreList](bool IgnoreVL = false) {
16389 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
16390 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
16391 // Check if any of the reduction ops are gathered. If so, it is worth
16392 // trying again with fewer reduction ops.
16393 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
16394 }
16395 ++Pos;
16396 if (Pos < NumReducedVals - ReduxWidth + 1)
16397 return IsAnyRedOpGathered;
16398 Pos = Start;
16399 ReduxWidth /= 2;
16400 return IsAnyRedOpGathered;
16401 };
16402 bool AnyVectorized = false;
16403 while (Pos < NumReducedVals - ReduxWidth + 1 &&
16404 ReduxWidth >= ReductionLimit) {
16405 // Dependency in tree of the reduction ops - drop this attempt, try
16406 // later.
16407 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
16408 Start == 0) {
16409 CheckForReusedReductionOps = true;
16410 break;
16411 }
16412 PrevReduxWidth = ReduxWidth;
16413 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
16414 // Being analyzed already - skip.
16415 if (V.areAnalyzedReductionVals(VL)) {
16416 (void)AdjustReducedVals(/*IgnoreVL=*/true);
16417 continue;
16418 }
16419 // Early exit if any of the reduction values were deleted during
16420 // previous vectorization attempts.
16421 if (any_of(VL, [&V](Value *RedVal) {
16422 auto *RedValI = dyn_cast<Instruction>(RedVal);
16423 if (!RedValI)
16424 return false;
16425 return V.isDeleted(RedValI);
16426 }))
16427 break;
16428 V.buildTree(VL, IgnoreList);
16429 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
16430 if (!AdjustReducedVals())
16431 V.analyzedReductionVals(VL);
16432 continue;
16433 }
16434 if (V.isLoadCombineReductionCandidate(RdxKind)) {
16435 if (!AdjustReducedVals())
16436 V.analyzedReductionVals(VL);
16437 continue;
16438 }
16439 V.reorderTopToBottom();
16440 // No need to reorder the root node at all.
16441 V.reorderBottomToTop(/*IgnoreReorder=*/true);
16442 // Keep extracted other reduction values, if they are used in the
16443 // vectorization trees.
16444 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
16445 ExternallyUsedValues);
16446 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
16447 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
16448 continue;
16449 for (Value *V : ReducedVals[Cnt])
16450 if (isa<Instruction>(V))
16451 LocalExternallyUsedValues[TrackedVals[V]];
16452 }
16453 if (!IsSupportedHorRdxIdentityOp) {
16454 // Number of uses of the candidates in the vector of values.
16455 assert(SameValuesCounter.empty() &&
16456 "Reused values counter map is not empty");
16457 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16458 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16459 continue;
16460 Value *V = Candidates[Cnt];
16461 Value *OrigV = TrackedToOrig.find(V)->second;
16462 ++SameValuesCounter[OrigV];
16463 }
16464 }
16465 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
16466 // Gather externally used values.
16467 SmallPtrSet<Value *, 4> Visited;
16468 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16469 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16470 continue;
16471 Value *RdxVal = Candidates[Cnt];
16472 if (!Visited.insert(RdxVal).second)
16473 continue;
16474 // Check if the scalar was vectorized as part of the vectorization
16475 // tree but not the top node.
16476 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
16477 LocalExternallyUsedValues[RdxVal];
16478 continue;
16479 }
16480 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
16481 unsigned NumOps =
16482 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
16483 if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
16484 LocalExternallyUsedValues[RdxVal];
16485 }
16486 // Do not need the list of reused scalars in regular mode anymore.
16487 if (!IsSupportedHorRdxIdentityOp)
16488 SameValuesCounter.clear();
16489 for (Value *RdxVal : VL)
16490 if (RequiredExtract.contains(RdxVal))
16491 LocalExternallyUsedValues[RdxVal];
16492 // Update LocalExternallyUsedValues for the scalar, replaced by
16493 // extractelement instructions.
16494 DenseMap<Value *, Value *> ReplacementToExternal;
16495 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
16496 ReplacementToExternal.try_emplace(Pair.second, Pair.first);
16497 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
16498 Value *Ext = Pair.first;
16499 auto RIt = ReplacementToExternal.find(Ext);
16500 while (RIt != ReplacementToExternal.end()) {
16501 Ext = RIt->second;
16502 RIt = ReplacementToExternal.find(Ext);
16503 }
16504 auto *It = ExternallyUsedValues.find(Ext);
16505 if (It == ExternallyUsedValues.end())
16506 continue;
16507 LocalExternallyUsedValues[Pair.second].append(It->second);
16508 }
16509 V.buildExternalUses(LocalExternallyUsedValues);
16510
16511 V.computeMinimumValueSizes();
16512
16513 // Estimate cost.
16514 InstructionCost TreeCost = V.getTreeCost(VL);
16515 InstructionCost ReductionCost =
16516 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
16517 InstructionCost Cost = TreeCost + ReductionCost;
16518 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16519 << " for reduction\n");
16520 if (!Cost.isValid())
16521 break;
16522 if (Cost >= -SLPCostThreshold) {
16523 V.getORE()->emit([&]() {
16525 SV_NAME, "HorSLPNotBeneficial",
16526 ReducedValsToOps.find(VL[0])->second.front())
16527 << "Vectorizing horizontal reduction is possible "
16528 << "but not beneficial with cost " << ore::NV("Cost", Cost)
16529 << " and threshold "
16530 << ore::NV("Threshold", -SLPCostThreshold);
16531 });
16532 if (!AdjustReducedVals())
16533 V.analyzedReductionVals(VL);
16534 continue;
16535 }
16536
16537 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
16538 << Cost << ". (HorRdx)\n");
16539 V.getORE()->emit([&]() {
16540 return OptimizationRemark(
16541 SV_NAME, "VectorizedHorizontalReduction",
16542 ReducedValsToOps.find(VL[0])->second.front())
16543 << "Vectorized horizontal reduction with cost "
16544 << ore::NV("Cost", Cost) << " and with tree size "
16545 << ore::NV("TreeSize", V.getTreeSize());
16546 });
16547
16548 Builder.setFastMathFlags(RdxFMF);
16549
16550 // Emit a reduction. If the root is a select (min/max idiom), the insert
16551 // point is the compare condition of that select.
16552 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
16553 Instruction *InsertPt = RdxRootInst;
16554 if (IsCmpSelMinMax)
16555 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
16556
16557 // Vectorize a tree.
16558 Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
16559 ReplacedExternals, InsertPt);
16560
16561 Builder.SetInsertPoint(InsertPt);
16562
16563 // To prevent poison from leaking across what used to be sequential,
16564 // safe, scalar boolean logic operations, the reduction operand must be
16565 // frozen.
16566 if ((isBoolLogicOp(RdxRootInst) ||
16567 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
16568 !isGuaranteedNotToBePoison(VectorizedRoot))
16569 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
16570
16571 // Emit code to correctly handle reused reduced values, if required.
16572 if (OptReusedScalars && !SameScaleFactor) {
16573 VectorizedRoot =
16574 emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
16575 SameValuesCounter, TrackedToOrig);
16576 }
16577
16578 Value *ReducedSubTree =
16579 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
16580 if (ReducedSubTree->getType() != VL.front()->getType()) {
16581 ReducedSubTree = Builder.CreateIntCast(
16582 ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
16583 KnownBits Known = computeKnownBits(
16584 R, cast<Instruction>(ReductionOps.front().front())
16585 ->getModule()
16586 ->getDataLayout());
16587 return !Known.isNonNegative();
16588 }));
16589 }
16590
16591 // Improved analysis for add/fadd/xor reductions with same scale factor
16592 // for all operands of reductions. We can emit scalar ops for them
16593 // instead.
16594 if (OptReusedScalars && SameScaleFactor)
16595 ReducedSubTree = emitScaleForReusedOps(
16596 ReducedSubTree, Builder, SameValuesCounter.front().second);
16597
16598 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
16599 // Count vectorized reduced values to exclude them from final reduction.
16600 for (Value *RdxVal : VL) {
16601 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
16602 if (IsSupportedHorRdxIdentityOp) {
16603 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
16604 continue;
16605 }
16606 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
16607 if (!V.isVectorized(RdxVal))
16608 RequiredExtract.insert(RdxVal);
16609 }
16610 Pos += ReduxWidth;
16611 Start = Pos;
16612 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
16613 AnyVectorized = true;
16614 }
16615 if (OptReusedScalars && !AnyVectorized) {
16616 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
16617 Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
16618 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16619 Value *OrigV = TrackedToOrig.find(P.first)->second;
16620 VectorizedVals.try_emplace(OrigV, P.second);
16621 }
16622 continue;
16623 }
16624 }
16625 if (VectorizedTree) {
16626 // Reorder operands of bool logical op in the natural order to avoid
16627 // possible problem with poison propagation. If not possible to reorder
16628 // (both operands are originally RHS), emit an extra freeze instruction
16629 // for the LHS operand.
16630 // I.e., if we have original code like this:
16631 // RedOp1 = select i1 ?, i1 LHS, i1 false
16632 // RedOp2 = select i1 RHS, i1 ?, i1 false
16633
16634 // Then, we swap LHS/RHS to create a new op that matches the poison
16635 // semantics of the original code.
16636
16637 // If we have original code like this and both values could be poison:
16638 // RedOp1 = select i1 ?, i1 LHS, i1 false
16639 // RedOp2 = select i1 ?, i1 RHS, i1 false
16640
16641 // Then, we must freeze LHS in the new op.
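// Rough illustration of why the freeze is needed (assumed IR): the original
//   %r = select i1 %c, i1 %x, i1 false
// only observes %x when %c is true, but the flattened form
//   %r = and i1 %c, %x
// is poison whenever %x is poison, so a potentially-poison %x must be
// frozen before it is used as the LHS of the new boolean op.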
16642 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
16643 Instruction *RedOp1,
16644 Instruction *RedOp2,
16645 bool InitStep) {
16646 if (!AnyBoolLogicOp)
16647 return;
16648 if (isBoolLogicOp(RedOp1) &&
16649 ((!InitStep && LHS == VectorizedTree) ||
16650 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
16651 return;
16652 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
16653 getRdxOperand(RedOp2, 0) == RHS ||
16654 isGuaranteedNotToBePoison(RHS))) {
16655 std::swap(LHS, RHS);
16656 return;
16657 }
16658 if (LHS != VectorizedTree)
16659 LHS = Builder.CreateFreeze(LHS);
16660 };
16661 // Finish the reduction.
16662 // Need to add the extra arguments and the possibly non-vectorized
16663 // reduction values.
16664 // Try to avoid dependencies between the scalar remainders after
16665 // reductions.
16666 auto FinalGen =
16667 [this, &Builder, &TrackedVals, &FixBoolLogicalOps](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
16668 bool InitStep) {
16669 unsigned Sz = InstVals.size();
16670 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
16671 Sz % 2);
16672 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
16673 Instruction *RedOp = InstVals[I + 1].first;
16674 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
16675 Value *RdxVal1 = InstVals[I].second;
16676 Value *StableRdxVal1 = RdxVal1;
16677 auto It1 = TrackedVals.find(RdxVal1);
16678 if (It1 != TrackedVals.end())
16679 StableRdxVal1 = It1->second;
16680 Value *RdxVal2 = InstVals[I + 1].second;
16681 Value *StableRdxVal2 = RdxVal2;
16682 auto It2 = TrackedVals.find(RdxVal2);
16683 if (It2 != TrackedVals.end())
16684 StableRdxVal2 = It2->second;
16685 // To prevent poison from leaking across what used to be
16686 // sequential, safe, scalar boolean logic operations, the
16687 // reduction operand must be frozen.
16688 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
16689 RedOp, InitStep);
16690 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
16691 StableRdxVal2, "op.rdx", ReductionOps);
16692 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
16693 }
16694 if (Sz % 2 == 1)
16695 ExtraReds[Sz / 2] = InstVals.back();
16696 return ExtraReds;
16697 };
16698 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
16699 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
16700 VectorizedTree);
16701 SmallPtrSet<Value *, 8> Visited;
16702 for (ArrayRef<Value *> Candidates : ReducedVals) {
16703 for (Value *RdxVal : Candidates) {
16704 if (!Visited.insert(RdxVal).second)
16705 continue;
16706 unsigned NumOps = VectorizedVals.lookup(RdxVal);
16707 for (Instruction *RedOp :
16708 ArrayRef(ReducedValsToOps.find(RdxVal)->second)
16709 .drop_back(NumOps))
16710 ExtraReductions.emplace_back(RedOp, RdxVal);
16711 }
16712 }
16713 for (auto &Pair : ExternallyUsedValues) {
16714 // Add each externally used value to the final reduction.
16715 for (auto *I : Pair.second)
16716 ExtraReductions.emplace_back(I, Pair.first);
16717 }
16718 // Iterate through all not-vectorized reduction values/extra arguments.
16719 bool InitStep = true;
16720 while (ExtraReductions.size() > 1) {
16721 VectorizedTree = ExtraReductions.front().second;
16722 SmallVector<std::pair<Instruction *, Value *>> NewReds =
16723 FinalGen(ExtraReductions, InitStep);
16724 ExtraReductions.swap(NewReds);
16725 InitStep = false;
16726 }
16727 VectorizedTree = ExtraReductions.front().second;
16728
16729 ReductionRoot->replaceAllUsesWith(VectorizedTree);
16730
16731 // The original scalar reduction is expected to have no remaining
16732 // uses outside the reduction tree itself. Assert that we got this
16733 // correct, replace internal uses with undef, and mark for eventual
16734 // deletion.
16735#ifndef NDEBUG
16736 SmallSet<Value *, 4> IgnoreSet;
16737 for (ArrayRef<Value *> RdxOps : ReductionOps)
16738 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
16739#endif
16740 for (ArrayRef<Value *> RdxOps : ReductionOps) {
16741 for (Value *Ignore : RdxOps) {
16742 if (!Ignore)
16743 continue;
16744#ifndef NDEBUG
16745 for (auto *U : Ignore->users()) {
16746 assert(IgnoreSet.count(U) &&
16747 "All users must be either in the reduction ops list.");
16748 }
16749#endif
16750 if (!Ignore->use_empty()) {
16751 Value *Undef = UndefValue::get(Ignore->getType());
16752 Ignore->replaceAllUsesWith(Undef);
16753 }
16754 V.eraseInstruction(cast<Instruction>(Ignore));
16755 }
16756 }
16757 } else if (!CheckForReusedReductionOps) {
16758 for (ReductionOpsType &RdxOps : ReductionOps)
16759 for (Value *RdxOp : RdxOps)
16760 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16761 }
16762 return VectorizedTree;
16763 }
16764
16765private:
16766 /// Calculate the cost of a reduction.
16767 InstructionCost getReductionCost(TargetTransformInfo *TTI,
16768 ArrayRef<Value *> ReducedVals,
16769 bool IsCmpSelMinMax, unsigned ReduxWidth,
16770 FastMathFlags FMF) {
16771 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
16772 Type *ScalarTy = ReducedVals.front()->getType();
16773 FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
16774 InstructionCost VectorCost = 0, ScalarCost;
16775 // If all of the reduced values are constant, the vector cost is 0, since
16776 // the reduction value can be calculated at compile time.
16777 bool AllConsts = allConstant(ReducedVals);
16778 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
16779 InstructionCost Cost = 0;
16780 // Scalar cost is repeated for N-1 elements.
16781 int Cnt = ReducedVals.size();
16782 for (Value *RdxVal : ReducedVals) {
16783 if (Cnt == 1)
16784 break;
16785 --Cnt;
16786 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
16787 Cost += GenCostFn();
16788 continue;
16789 }
16790 InstructionCost ScalarCost = 0;
16791 for (User *U : RdxVal->users()) {
16792 auto *RdxOp = cast<Instruction>(U);
16793 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
16794 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
16795 continue;
16796 }
16797 ScalarCost = InstructionCost::getInvalid();
16798 break;
16799 }
16800 if (ScalarCost.isValid())
16801 Cost += ScalarCost;
16802 else
16803 Cost += GenCostFn();
16804 }
16805 return Cost;
16806 };
16807 switch (RdxKind) {
16808 case RecurKind::Add:
16809 case RecurKind::Mul:
16810 case RecurKind::Or:
16811 case RecurKind::And:
16812 case RecurKind::Xor:
16813 case RecurKind::FAdd:
16814 case RecurKind::FMul: {
16815 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
16816 if (!AllConsts)
16817 VectorCost =
16818 TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
16819 ScalarCost = EvaluateScalarCost([&]() {
16820 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
16821 });
16822 break;
16823 }
16824 case RecurKind::FMax:
16825 case RecurKind::FMin:
16826 case RecurKind::FMaximum:
16827 case RecurKind::FMinimum:
16828 case RecurKind::SMax:
16829 case RecurKind::SMin:
16830 case RecurKind::UMax:
16831 case RecurKind::UMin: {
16832 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
16833 if (!AllConsts)
16834 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
16835 ScalarCost = EvaluateScalarCost([&]() {
16836 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
16837 return TTI->getIntrinsicInstrCost(ICA, CostKind);
16838 });
16839 break;
16840 }
16841 default:
16842 llvm_unreachable("Expected arithmetic or min/max reduction operation");
16843 }
16844
16845 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
16846 << " for reduction of " << shortBundleName(ReducedVals)
16847 << " (It is a splitting reduction)\n");
16848 return VectorCost - ScalarCost;
16849 }
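  // Illustrative cost accounting with made-up target numbers: for an 8-wide
  // integer add reduction, if the vector reduction costs 3 and each of the
  // replaced scalar adds costs 1 (7 of them are counted above), the returned
  // delta is 3 - 7 = -4. The caller adds this to the tree cost and only
  // vectorizes if the total stays below -SLPCostThreshold.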
16850
16851 /// Emit a horizontal reduction of the vectorized value.
16852 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
16853 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
16854 assert(VectorizedValue && "Need to have a vectorized tree node");
16855 assert(isPowerOf2_32(ReduxWidth) &&
16856 "We only handle power-of-two reductions for now");
16857 assert(RdxKind != RecurKind::FMulAdd &&
16858 "A call to the llvm.fmuladd intrinsic is not handled yet");
16859
16860 ++NumVectorInstructions;
16861 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
16862 }
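  // For reference (illustrative IR): with RdxKind == RecurKind::Add and an
  // 8 x i32 input, the call above is expected to produce something like
  //   %rdx = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %vec)
  // with the builder's fast-math flags applied to the FP reduction variants.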
16863
16864 /// Emits optimized code for unique scalar value reused \p Cnt times.
16865 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
16866 unsigned Cnt) {
16867 assert(IsSupportedHorRdxIdentityOp &&
16868 "The optimization of matched scalar identity horizontal reductions "
16869 "must be supported.");
16870 switch (RdxKind) {
16871 case RecurKind::Add: {
16872 // res = mul vv, n
16873 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
16874 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
16875 << VectorizedValue << ". (HorRdx)\n");
16876 return Builder.CreateMul(VectorizedValue, Scale);
16877 }
16878 case RecurKind::Xor: {
16879 // res = (n % 2) ? vv : 0, since an even number of xors of the same value cancels out
16880 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
16881 << ". (HorRdx)\n");
16882 if (Cnt % 2 == 0)
16883 return Constant::getNullValue(VectorizedValue->getType());
16884 return VectorizedValue;
16885 }
16886 case RecurKind::FAdd: {
16887 // res = fmul v, n
16888 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
16889 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
16890 << VectorizedValue << ". (HorRdx)\n");
16891 return Builder.CreateFMul(VectorizedValue, Scale);
16892 }
16893 case RecurKind::And:
16894 case RecurKind::Or:
16895 case RecurKind::SMax:
16896 case RecurKind::SMin:
16897 case RecurKind::UMax:
16898 case RecurKind::UMin:
16899 case RecurKind::FMax:
16900 case RecurKind::FMin:
16901 case RecurKind::FMaximum:
16902 case RecurKind::FMinimum:
16903 // res = vv
16904 return VectorizedValue;
16905 case RecurKind::Mul:
16906 case RecurKind::FMul:
16907 case RecurKind::FMulAdd:
16908 case RecurKind::IAnyOf:
16909 case RecurKind::FAnyOf:
16910 case RecurKind::None:
16911 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
16912 }
16913 return nullptr;
16914 }
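  // Worked example (illustrative): for a scalar %x reused Cnt == 3 times,
  //   add  -> %res = mul i32 %x, 3
  //   xor  -> %x (an odd number of xors of the same value is the value itself)
  //   fadd -> %res = fmul float %x, 3.0
  // while and/or/min/max simply return %x unchanged.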
16915
16916 /// Emits actual operation for the scalar identity values, found during
16917 /// horizontal reduction analysis.
16918 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
16919 ArrayRef<Value *> VL,
16920 const MapVector<Value *, unsigned> &SameValuesCounter,
16921 const DenseMap<Value *, Value *> &TrackedToOrig) {
16922 assert(IsSupportedHorRdxIdentityOp &&
16923 "The optimization of matched scalar identity horizontal reductions "
16924 "must be supported.");
16925 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
16926 if (VTy->getElementType() != VL.front()->getType()) {
16927 VectorizedValue = Builder.CreateIntCast(
16928 VectorizedValue,
16929 FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
16930 any_of(VL, [&](Value *R) {
16931 KnownBits Known = computeKnownBits(
16932 R, cast<Instruction>(ReductionOps.front().front())
16933 ->getModule()
16934 ->getDataLayout());
16935 return !Known.isNonNegative();
16936 }));
16937 }
16938 switch (RdxKind) {
16939 case RecurKind::Add: {
16940 // root = mul prev_root, <1, 1, n, 1>
16941 SmallVector<Constant *> Vals;
16942 for (Value *V : VL) {
16943 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
16944 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
16945 }
16946 auto *Scale = ConstantVector::get(Vals);
16947 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
16948 << VectorizedValue << ". (HorRdx)\n");
16949 return Builder.CreateMul(VectorizedValue, Scale);
16950 }
16951 case RecurKind::And:
16952 case RecurKind::Or:
16953 // No need for multiple or/and(s).
16954 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
16955 << ". (HorRdx)\n");
16956 return VectorizedValue;
16957 case RecurKind::SMax:
16958 case RecurKind::SMin:
16959 case RecurKind::UMax:
16960 case RecurKind::UMin:
16961 case RecurKind::FMax:
16962 case RecurKind::FMin:
16963 case RecurKind::FMaximum:
16964 case RecurKind::FMinimum:
16965 // No need for multiple min/max(s) of the same value.
16966 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
16967 << ". (HorRdx)\n");
16968 return VectorizedValue;
16969 case RecurKind::Xor: {
16970 // Replace values with an even number of repeats with 0, since
16971 // x xor x = 0.
16972 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 6, 7>,
16973 // if the 4th and 6th elements have an even number of repeats.
16974 SmallVector<int> Mask(
16975 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
16976 PoisonMaskElem);
16977 std::iota(Mask.begin(), Mask.end(), 0);
16978 bool NeedShuffle = false;
16979 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
16980 Value *V = VL[I];
16981 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
16982 if (Cnt % 2 == 0) {
16983 Mask[I] = VF;
16984 NeedShuffle = true;
16985 }
16986 }
16987 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
16988 : Mask) dbgs()
16989 << I << " ";
16990 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
16991 if (NeedShuffle)
16992 VectorizedValue = Builder.CreateShuffleVector(
16993 VectorizedValue,
16994 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
16995 return VectorizedValue;
16996 }
16997 case RecurKind::FAdd: {
16998 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
16999 SmallVector<Constant *> Vals;
17000 for (Value *V : VL) {
17001 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17002 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
17003 }
17004 auto *Scale = ConstantVector::get(Vals);
17005 return Builder.CreateFMul(VectorizedValue, Scale);
17006 }
17007 case RecurKind::Mul:
17008 case RecurKind::FMul:
17009 case RecurKind::FMulAdd:
17010 case RecurKind::IAnyOf:
17011 case RecurKind::FAnyOf:
17012 case RecurKind::None:
17013 llvm_unreachable("Unexpected reduction kind for reused scalars.");
17014 }
17015 return nullptr;
17016 }
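  // Worked example (illustrative): if the vectorized root scalars are
  // <a, b, c, d> and the original reduction used a and b twice each, the Add
  // case above multiplies the vector by <2, 2, 1, 1>, while the Xor case
  // zeroes out the lanes whose scalars repeat an even number of times.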
17017};
17018} // end anonymous namespace
17019
17020/// Gets recurrence kind from the specified value.
17021static RecurKind getRdxKind(Value *V) {
17022 return HorizontalReduction::getRdxKind(V);
17023}
17024static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
17025 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
17026 return cast<FixedVectorType>(IE->getType())->getNumElements();
17027
17028 unsigned AggregateSize = 1;
17029 auto *IV = cast<InsertValueInst>(InsertInst);
17030 Type *CurrentType = IV->getType();
17031 do {
17032 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
17033 for (auto *Elt : ST->elements())
17034 if (Elt != ST->getElementType(0)) // check homogeneity
17035 return std::nullopt;
17036 AggregateSize *= ST->getNumElements();
17037 CurrentType = ST->getElementType(0);
17038 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
17039 AggregateSize *= AT->getNumElements();
17040 CurrentType = AT->getElementType();
17041 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
17042 AggregateSize *= VT->getNumElements();
17043 return AggregateSize;
17044 } else if (CurrentType->isSingleValueType()) {
17045 return AggregateSize;
17046 } else {
17047 return std::nullopt;
17048 }
17049 } while (true);
17050}
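// Example (illustrative): for an aggregate of type
//   {<2 x float>, <2 x float>}
// the struct contributes a factor of 2 and each homogeneous element is a
// <2 x float>, so the returned size is 4. A struct with differing element
// types yields std::nullopt.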
17051
17052static void findBuildAggregate_rec(Instruction *LastInsertInst,
17053 TargetTransformInfo *TTI,
17054 SmallVectorImpl<Value *> &BuildVectorOpds,
17055 SmallVectorImpl<Value *> &InsertElts,
17056 unsigned OperandOffset) {
17057 do {
17058 Value *InsertedOperand = LastInsertInst->getOperand(1);
17059 std::optional<unsigned> OperandIndex =
17060 getInsertIndex(LastInsertInst, OperandOffset);
17061 if (!OperandIndex)
17062 return;
17063 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
17064 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
17065 BuildVectorOpds, InsertElts, *OperandIndex);
17066
17067 } else {
17068 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17069 InsertElts[*OperandIndex] = LastInsertInst;
17070 }
17071 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
17072 } while (LastInsertInst != nullptr &&
17073 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
17074 LastInsertInst->hasOneUse());
17075}
17076
17077/// Recognize construction of vectors like
17078/// %ra = insertelement <4 x float> poison, float %s0, i32 0
17079/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
17080/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
17081/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
17082/// starting from the last insertelement or insertvalue instruction.
17083///
17084/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
17085/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
17086/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
17087///
17088/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
17089///
17090/// \return true if it matches.
17091static bool findBuildAggregate(Instruction *LastInsertInst,
17092 TargetTransformInfo *TTI,
17093 SmallVectorImpl<Value *> &BuildVectorOpds,
17094 SmallVectorImpl<Value *> &InsertElts) {
17095
17096 assert((isa<InsertElementInst>(LastInsertInst) ||
17097 isa<InsertValueInst>(LastInsertInst)) &&
17098 "Expected insertelement or insertvalue instruction!");
17099
17100 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
17101 "Expected empty result vectors!");
17102
17103 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
17104 if (!AggregateSize)
17105 return false;
17106 BuildVectorOpds.resize(*AggregateSize);
17107 InsertElts.resize(*AggregateSize);
17108
17109 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
17110 llvm::erase(BuildVectorOpds, nullptr);
17111 llvm::erase(InsertElts, nullptr);
17112 if (BuildVectorOpds.size() >= 2)
17113 return true;
17114
17115 return false;
17116}
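// Usage sketch (illustrative): for the 4-element insertelement chain shown
// in the comment above, BuildVectorOpds ends up as {%s0, %s1, %s2, %s3} and
// InsertElts holds the matching insertelement instructions, so the caller
// can try to vectorize the scalar operands as a single bundle.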
17117
17118/// Try and get a reduction instruction from a phi node.
17119///
17120/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
17121/// if they come from either \p ParentBB or a containing loop latch.
17122///
17123/// \returns A candidate reduction value if possible, or \code nullptr \endcode
17124/// if not possible.
17125static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
17126 BasicBlock *ParentBB, LoopInfo *LI) {
17127 // There are situations where the reduction value is not dominated by the
17128 // reduction phi. Vectorizing such cases has been reported to cause
17129 // miscompiles. See PR25787.
17130 auto DominatedReduxValue = [&](Value *R) {
17131 return isa<Instruction>(R) &&
17132 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
17133 };
17134
17135 Instruction *Rdx = nullptr;
17136
17137 // Return the incoming value if it comes from the same BB as the phi node.
17138 if (P->getIncomingBlock(0) == ParentBB) {
17139 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17140 } else if (P->getIncomingBlock(1) == ParentBB) {
17141 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17142 }
17143
17144 if (Rdx && DominatedReduxValue(Rdx))
17145 return Rdx;
17146
17147 // Otherwise, check whether we have a loop latch to look at.
17148 Loop *BBL = LI->getLoopFor(ParentBB);
17149 if (!BBL)
17150 return nullptr;
17151 BasicBlock *BBLatch = BBL->getLoopLatch();
17152 if (!BBLatch)
17153 return nullptr;
17154
17155 // There is a loop latch, return the incoming value if it comes from
17156 // that. This reduction pattern occasionally turns up.
17157 if (P->getIncomingBlock(0) == BBLatch) {
17158 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17159 } else if (P->getIncomingBlock(1) == BBLatch) {
17160 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17161 }
17162
17163 if (Rdx && DominatedReduxValue(Rdx))
17164 return Rdx;
17165
17166 return nullptr;
17167}
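// Illustrative pattern (assumed IR) that this helper matches:
//   loop:
//     %sum = phi float [ 0.0, %entry ], [ %sum.next, %loop ]
//     %x = load float, ptr %p
//     %sum.next = fadd float %sum, %x
//     br i1 %cond, label %loop, label %exit
// For P == %sum and ParentBB == %loop (which is also the loop latch), the
// returned reduction candidate is %sum.next.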
17168
17169static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
17170 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
17171 return true;
17172 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
17173 return true;
17174 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
17175 return true;
17176 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
17177 return true;
17178 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
17179 return true;
17180 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
17181 return true;
17182 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
17183 return true;
17184 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
17185 return true;
17186 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
17187 return true;
17188 return false;
17189}
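// Examples (illustrative) of instructions accepted here:
//   %a = fadd fast float %x, %y                  ; any binary operator
//   %m = call i32 @llvm.smax.i32(i32 %x, i32 %y) ; min/max intrinsics
// In both cases V0 and V1 are bound to %x and %y; other calls do not match.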
17190
17191/// We could have an initial reduction that is not an add.
17192/// r *= v1 + v2 + v3 + v4
17193/// In such a case start looking for a tree rooted in the first '+'.
17194/// \Returns the new root if found, which may be nullptr if not an instruction.
17195static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
17196 Instruction *Root) {
17197 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17198 isa<IntrinsicInst>(Root)) &&
17199 "Expected binop, select, or intrinsic for reduction matching");
17200 Value *LHS =
17201 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
17202 Value *RHS =
17203 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
17204 if (LHS == Phi)
17205 return dyn_cast<Instruction>(RHS);
17206 if (RHS == Phi)
17207 return dyn_cast<Instruction>(LHS);
17208 return nullptr;
17209}
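// Illustrative case (assumed IR) for the comment above:
//   %sum = fadd float %v1, %v2   ; the inner '+' chain
//   %r   = fmul float %phi, %sum ; the initial reduction is a multiply
// With Phi == %phi and Root == %r, the helper returns %sum so that matching
// can restart from the add chain.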
17210
17211/// \p Returns the first operand of \p I that does not match \p Phi. If
17212/// operand is not an instruction it returns nullptr.
17213static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
17214 Value *Op0 = nullptr;
17215 Value *Op1 = nullptr;
17216 if (!matchRdxBop(I, Op0, Op1))
17217 return nullptr;
17218 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
17219}
17220
17221/// \Returns true if \p I is a candidate instruction for reduction vectorization.
17222static bool isReductionCandidate(Instruction *I) {
17223 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
17224 Value *B0 = nullptr, *B1 = nullptr;
17225 bool IsBinop = matchRdxBop(I, B0, B1);
17226 return IsBinop || IsSelect;
17227}
17228
17229bool SLPVectorizerPass::vectorizeHorReduction(
17230 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
17231 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
17232 if (!ShouldVectorizeHor)
17233 return false;
17234 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
17235
17236 if (Root->getParent() != BB || isa<PHINode>(Root))
17237 return false;
17238
17239 // If we can find a secondary reduction root, use that instead.
17240 auto SelectRoot = [&]() {
17241 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
17242 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
17243 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
17244 return NewRoot;
17245 return Root;
17246 };
17247
17248 // Start analysis starting from Root instruction. If horizontal reduction is
17249 // found, try to vectorize it. If it is not a horizontal reduction or
17250 // vectorization is not possible or not effective, and currently analyzed
17251 // instruction is a binary operation, try to vectorize the operands, using
17252 // pre-order DFS traversal order. If the operands were not vectorized, repeat
17253 // the same procedure considering each operand as a possible root of the
17254 // horizontal reduction.
17255 // Interrupt the process if the Root instruction itself was vectorized or all
17256 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
17257 // If a horizontal reduction was not matched or vectorized, we collect
17258 // instructions for possible later vectorization attempts.
17259 std::queue<std::pair<Instruction *, unsigned>> Stack;
17260 Stack.emplace(SelectRoot(), 0);
17261 SmallPtrSet<Value *, 8> VisitedInstrs;
17262 bool Res = false;
17263 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
17264 if (R.isAnalyzedReductionRoot(Inst))
17265 return nullptr;
17266 if (!isReductionCandidate(Inst))
17267 return nullptr;
17268 HorizontalReduction HorRdx;
17269 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
17270 return nullptr;
17271 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
17272 };
17273 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
17274 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
17275 FutureSeed = getNonPhiOperand(Root, P);
17276 if (!FutureSeed)
17277 return false;
17278 }
17279 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
17280 // analysis is done separately.
17281 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
17282 PostponedInsts.push_back(FutureSeed);
17283 return true;
17284 };
17285
17286 while (!Stack.empty()) {
17287 Instruction *Inst;
17288 unsigned Level;
17289 std::tie(Inst, Level) = Stack.front();
17290 Stack.pop();
17291 // Do not try to analyze instruction that has already been vectorized.
17292 // This may happen when we vectorize instruction operands on a previous
17293 // iteration while stack was populated before that happened.
17294 if (R.isDeleted(Inst))
17295 continue;
17296 if (Value *VectorizedV = TryToReduce(Inst)) {
17297 Res = true;
17298 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
17299 // Try to find another reduction.
17300 Stack.emplace(I, Level);
17301 continue;
17302 }
17303 } else {
17304 // We could not vectorize `Inst` so try to use it as a future seed.
17305 if (!TryAppendToPostponedInsts(Inst)) {
17306 assert(Stack.empty() && "Expected empty stack");
17307 break;
17308 }
17309 }
17310
17311 // Try to vectorize operands.
17312 // Continue analysis for the instruction from the same basic block only to
17313 // save compile time.
17314 if (++Level < RecursionMaxDepth)
17315 for (auto *Op : Inst->operand_values())
17316 if (VisitedInstrs.insert(Op).second)
17317 if (auto *I = dyn_cast<Instruction>(Op))
17318 // Do not try to vectorize CmpInst operands, this is done
17319 // separately.
17320 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
17321 !R.isDeleted(I) && I->getParent() == BB)
17322 Stack.emplace(I, Level);
17323 }
17324 return Res;
17325}
17326
17327bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
17328 BasicBlock *BB, BoUpSLP &R,
17329 TargetTransformInfo *TTI) {
17330 SmallVector<WeakTrackingVH> PostponedInsts;
17331 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
17332 Res |= tryToVectorize(PostponedInsts, R);
17333 return Res;
17334}
17335
17336bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
17337 BoUpSLP &R) {
17338 bool Res = false;
17339 for (Value *V : Insts)
17340 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
17341 Res |= tryToVectorize(Inst, R);
17342 return Res;
17343}
17344
17345bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
17346 BasicBlock *BB, BoUpSLP &R) {
17347 if (!R.canMapToVector(IVI->getType()))
17348 return false;
17349
17350 SmallVector<Value *, 16> BuildVectorOpds;
17351 SmallVector<Value *, 16> BuildVectorInsts;
17352 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
17353 return false;
17354
17355 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
17356 // An aggregate value is unlikely to be processed in a vector register.
17357 return tryToVectorizeList(BuildVectorOpds, R);
17358}
17359
17360bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
17361 BasicBlock *BB, BoUpSLP &R) {
17362 SmallVector<Value *, 16> BuildVectorInsts;
17363 SmallVector<Value *, 16> BuildVectorOpds;
17364 SmallVector<int> Mask;
17365 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
17366 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
17367 isFixedVectorShuffle(BuildVectorOpds, Mask)))
17368 return false;
17369
17370 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
17371 return tryToVectorizeList(BuildVectorInsts, R);
17372}
17373
17374template <typename T>
17375static bool tryToVectorizeSequence(
17376 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
17377 function_ref<bool(T *, T *)> AreCompatible,
17378 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
17379 bool MaxVFOnly, BoUpSLP &R) {
17380 bool Changed = false;
17381 // Sort by type, parent, operands.
17382 stable_sort(Incoming, Comparator);
17383
17384 // Try to vectorize elements based on their type.
17385 SmallVector<T *> Candidates;
17386 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
17387 // Look for the next elements with the same type, parent and operand
17388 // kinds.
17389 auto *SameTypeIt = IncIt;
17390 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
17391 ++SameTypeIt;
17392
17393 // Try to vectorize them.
17394 unsigned NumElts = (SameTypeIt - IncIt);
17395 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
17396 << NumElts << ")\n");
17397 // The vectorization is a 3-stage attempt:
17398 // 1. Try to vectorize instructions with the same/alternate opcodes with the
17399 // size of the maximal register at first.
17400 // 2. Try to vectorize remaining instructions with the same type, if
17401 // possible. This may result in better vectorization than trying to
17402 // vectorize only instructions with the same/alternate opcodes.
17403 // 3. Finally, try to vectorize all instructions with the
17404 // same/alternate ops only; this may result in some extra final
17405 // vectorization.
17406 if (NumElts > 1 &&
17407 TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
17408 // Success, start over because instructions might have been changed.
17409 Changed = true;
17410 } else {
17411 /// \Returns the minimum number of elements that we will attempt to
17412 /// vectorize.
17413 auto GetMinNumElements = [&R](Value *V) {
17414 unsigned EltSize = R.getVectorElementSize(V);
17415 return std::max(2U, R.getMaxVecRegSize() / EltSize);
17416 };
17417 if (NumElts < GetMinNumElements(*IncIt) &&
17418 (Candidates.empty() ||
17419 Candidates.front()->getType() == (*IncIt)->getType())) {
17420 Candidates.append(IncIt, std::next(IncIt, NumElts));
17421 }
17422 }
17423 // Final attempt to vectorize instructions with the same types.
17424 if (Candidates.size() > 1 &&
17425 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
17426 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
17427 // Success, start over because instructions might have been changed.
17428 Changed = true;
17429 } else if (MaxVFOnly) {
17430 // Try to vectorize using small vectors.
17431 for (auto *It = Candidates.begin(), *End = Candidates.end();
17432 It != End;) {
17433 auto *SameTypeIt = It;
17434 while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
17435 ++SameTypeIt;
17436 unsigned NumElts = (SameTypeIt - It);
17437 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
17438 /*MaxVFOnly=*/false))
17439 Changed = true;
17440 It = SameTypeIt;
17441 }
17442 }
17443 Candidates.clear();
17444 }
17445
17446 // Start over at the next instruction of a different type (or the end).
17447 IncIt = SameTypeIt;
17448 }
17449 return Changed;
17450}
17451
17452/// Compare two cmp instructions. If IsCompatibility is true, function returns
17453 /// true if 2 cmps have same/swapped predicates and compatible corresponding
17454 /// operands. If IsCompatibility is false, the function implements a strict weak
17455/// ordering relation between two cmp instructions, returning true if the first
17456/// instruction is "less" than the second, i.e. its predicate is less than the
17457/// predicate of the second or the operands IDs are less than the operands IDs
17458/// of the second cmp instruction.
17459template <bool IsCompatibility>
17460static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
17461 const DominatorTree &DT) {
17462 assert(isValidElementType(V->getType()) &&
17463 isValidElementType(V2->getType()) &&
17464 "Expected valid element types only.");
17465 if (V == V2)
17466 return IsCompatibility;
17467 auto *CI1 = cast<CmpInst>(V);
17468 auto *CI2 = cast<CmpInst>(V2);
17469 if (CI1->getOperand(0)->getType()->getTypeID() <
17470 CI2->getOperand(0)->getType()->getTypeID())
17471 return !IsCompatibility;
17472 if (CI1->getOperand(0)->getType()->getTypeID() >
17473 CI2->getOperand(0)->getType()->getTypeID())
17474 return false;
17475 CmpInst::Predicate Pred1 = CI1->getPredicate();
17476 CmpInst::Predicate Pred2 = CI2->getPredicate();
17477 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
17478 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
17479 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
17480 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
17481 if (BasePred1 < BasePred2)
17482 return !IsCompatibility;
17483 if (BasePred1 > BasePred2)
17484 return false;
17485 // Compare operands.
17486 bool CI1Preds = Pred1 == BasePred1;
17487 bool CI2Preds = Pred2 == BasePred1;
17488 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
17489 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
17490 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
17491 if (Op1 == Op2)
17492 continue;
17493 if (Op1->getValueID() < Op2->getValueID())
17494 return !IsCompatibility;
17495 if (Op1->getValueID() > Op2->getValueID())
17496 return false;
17497 if (auto *I1 = dyn_cast<Instruction>(Op1))
17498 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
17499 if (IsCompatibility) {
17500 if (I1->getParent() != I2->getParent())
17501 return false;
17502 } else {
17503 // Try to compare nodes with same parent.
17504 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
17505 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
17506 if (!NodeI1)
17507 return NodeI2 != nullptr;
17508 if (!NodeI2)
17509 return false;
17510 assert((NodeI1 == NodeI2) ==
17511 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
17512 "Different nodes should have different DFS numbers");
17513 if (NodeI1 != NodeI2)
17514 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
17515 }
17516 InstructionsState S = getSameOpcode({I1, I2}, TLI);
17517 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
17518 continue;
17519 if (IsCompatibility)
17520 return false;
17521 if (I1->getOpcode() != I2->getOpcode())
17522 return I1->getOpcode() < I2->getOpcode();
17523 }
17524 }
17525 return IsCompatibility;
17526}
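// Compatibility example (illustrative):
//   %c1 = icmp slt i32 %a, %b
//   %c2 = icmp sgt i32 %b, %a
// Both compares normalize to the same base predicate with swapped operands,
// so compareCmp<true> treats them as compatible, and compareCmp<false> does
// not order either one strictly before the other.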
17527
17528template <typename ItT>
17529bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
17530 BasicBlock *BB, BoUpSLP &R) {
17531 bool Changed = false;
17532 // Try to find reductions first.
17533 for (CmpInst *I : CmpInsts) {
17534 if (R.isDeleted(I))
17535 continue;
17536 for (Value *Op : I->operands())
17537 if (auto *RootOp = dyn_cast<Instruction>(Op))
17538 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
17539 }
17540 // Try to vectorize operands as vector bundles.
17541 for (CmpInst *I : CmpInsts) {
17542 if (R.isDeleted(I))
17543 continue;
17544 Changed |= tryToVectorize(I, R);
17545 }
17546 // Try to vectorize list of compares.
17547 // Sort by type, compare predicate, etc.
17548 auto CompareSorter = [&](Value *V, Value *V2) {
17549 if (V == V2)
17550 return false;
17551 return compareCmp<false>(V, V2, *TLI, *DT);
17552 };
17553
17554 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
17555 if (V1 == V2)
17556 return true;
17557 return compareCmp<true>(V1, V2, *TLI, *DT);
17558 };
17559
17560 SmallVector<Value *> Vals;
17561 for (Instruction *V : CmpInsts)
17562 if (!R.isDeleted(V) && isValidElementType(V->getType()))
17563 Vals.push_back(V);
17564 if (Vals.size() <= 1)
17565 return Changed;
17566 Changed |= tryToVectorizeSequence<Value>(
17567 Vals, CompareSorter, AreCompatibleCompares,
17568 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
17569 // Exclude possible reductions from other blocks.
17570 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
17571 return any_of(V->users(), [V](User *U) {
17572 auto *Select = dyn_cast<SelectInst>(U);
17573 return Select &&
17574 Select->getParent() != cast<Instruction>(V)->getParent();
17575 });
17576 });
17577 if (ArePossiblyReducedInOtherBlock)
17578 return false;
17579 return tryToVectorizeList(Candidates, R, MaxVFOnly);
17580 },
17581 /*MaxVFOnly=*/true, R);
17582 return Changed;
17583}
17584
17585bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
17586 BasicBlock *BB, BoUpSLP &R) {
17587 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
17588 "This function only accepts Insert instructions");
17589 bool OpsChanged = false;
17590 SmallVector<WeakTrackingVH> PostponedInsts;
17591 // pass1 - try to vectorize reductions only
17592 for (auto *I : reverse(Instructions)) {
17593 if (R.isDeleted(I))
17594 continue;
17595 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
17596 }
17597 // pass2 - try to match and vectorize a buildvector sequence.
17598 for (auto *I : reverse(Instructions)) {
17599 if (R.isDeleted(I) || isa<CmpInst>(I))
17600 continue;
17601 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
17602 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
17603 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
17604 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
17605 }
17606 }
17607 // Now try to vectorize postponed instructions.
17608 OpsChanged |= tryToVectorize(PostponedInsts, R);
17609
17610 Instructions.clear();
17611 return OpsChanged;
17612}
17613
17614bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
17615 bool Changed = false;
17616 SmallVector<Value *, 4> Incoming;
17617 SmallPtrSet<Value *, 16> VisitedInstrs;
17618 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
17619 // node. This makes it easier to identify the chains that can be vectorized
17620 // in a better way.
17621 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
17622 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
17623 assert(isValidElementType(V1->getType()) &&
17624 isValidElementType(V2->getType()) &&
17625 "Expected vectorizable types only.");
17626 // It is fine to compare type IDs here, since we expect only vectorizable
17627 // types, like ints, floats and pointers; we don't care about other types.
17628 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
17629 return true;
17630 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
17631 return false;
17632 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
17633 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
17634 if (Opcodes1.size() < Opcodes2.size())
17635 return true;
17636 if (Opcodes1.size() > Opcodes2.size())
17637 return false;
17638 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
17639 {
17640 // Instructions come first.
17641 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
17642 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
17643 if (I1 && I2) {
17644 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
17645 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
17646 if (!NodeI1)
17647 return NodeI2 != nullptr;
17648 if (!NodeI2)
17649 return false;
17650 assert((NodeI1 == NodeI2) ==
17651 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
17652 "Different nodes should have different DFS numbers");
17653 if (NodeI1 != NodeI2)
17654 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
17655 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
17656 if (S.getOpcode() && !S.isAltShuffle())
17657 continue;
17658 return I1->getOpcode() < I2->getOpcode();
17659 }
17660 if (I1)
17661 return true;
17662 if (I2)
17663 return false;
17664 }
17665 {
17666 // Non-undef constants come next.
17667 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
17668 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
17669 if (C1 && C2)
17670 continue;
17671 if (C1)
17672 return true;
17673 if (C2)
17674 return false;
17675 }
17676 bool U1 = isa<UndefValue>(Opcodes1[I]);
17677 bool U2 = isa<UndefValue>(Opcodes2[I]);
17678 {
17679 // Non-constant non-instructions come next.
17680 if (!U1 && !U2) {
17681 auto ValID1 = Opcodes1[I]->getValueID();
17682 auto ValID2 = Opcodes2[I]->getValueID();
17683 if (ValID1 == ValID2)
17684 continue;
17685 if (ValID1 < ValID2)
17686 return true;
17687 if (ValID1 > ValID2)
17688 return false;
17689 }
17690 if (!U1)
17691 return true;
17692 if (!U2)
17693 return false;
17694 }
17695 // Undefs come last.
17696 assert(U1 && U2 && "The only thing left should be undef & undef.");
17697 continue;
17698 }
17699 return false;
17700 };
17701 auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
17702 if (V1 == V2)
17703 return true;
17704 if (V1->getType() != V2->getType())
17705 return false;
17706 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
17707 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
17708 if (Opcodes1.size() != Opcodes2.size())
17709 return false;
17710 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
17711 // Undefs are compatible with any other value.
17712 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
17713 continue;
17714 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
17715 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
17716 if (I1->getParent() != I2->getParent())
17717 return false;
17718 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
17719 if (S.getOpcode())
17720 continue;
17721 return false;
17722 }
17723 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
17724 continue;
17725 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
17726 return false;
17727 }
17728 return true;
17729 };
17730
17731 bool HaveVectorizedPhiNodes = false;
17732 do {
17733 // Collect the incoming values from the PHIs.
17734 Incoming.clear();
17735 for (Instruction &I : *BB) {
17736 PHINode *P = dyn_cast<PHINode>(&I);
17737 if (!P)
17738 break;
17739
17740 // No need to analyze deleted, vectorized and non-vectorizable
17741 // instructions.
17742 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
17743 isValidElementType(P->getType()))
17744 Incoming.push_back(P);
17745 }
17746
17747 if (Incoming.size() <= 1)
17748 break;
17749
17750 // Find the corresponding non-phi nodes for better matching when trying to
17751 // build the tree.
17752 for (Value *V : Incoming) {
17753 SmallVectorImpl<Value *> &Opcodes =
17754 PHIToOpcodes.try_emplace(V).first->getSecond();
17755 if (!Opcodes.empty())
17756 continue;
17757 SmallVector<Value *, 4> Nodes(1, V);
17758 SmallPtrSet<Value *, 4> Visited;
17759 while (!Nodes.empty()) {
17760 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
17761 if (!Visited.insert(PHI).second)
17762 continue;
17763 for (Value *V : PHI->incoming_values()) {
17764 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
17765 Nodes.push_back(PHI1);
17766 continue;
17767 }
17768 Opcodes.emplace_back(V);
17769 }
17770 }
17771 }
17772
17773 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
17774 Incoming, PHICompare, AreCompatiblePHIs,
17775 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
17776 return tryToVectorizeList(Candidates, R, MaxVFOnly);
17777 },
17778 /*MaxVFOnly=*/true, R);
17779 Changed |= HaveVectorizedPhiNodes;
17780 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
17781 } while (HaveVectorizedPhiNodes);
17782
17783 VisitedInstrs.clear();
17784
17785 InstSetVector PostProcessInserts;
17786 SmallSetVector<CmpInst *, 8> PostProcessCmps;
17787 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
17788 // also vectorizes `PostProcessCmps`.
17789 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
17790 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
17791 if (VectorizeCmps) {
17792 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
17793 PostProcessCmps.clear();
17794 }
17795 PostProcessInserts.clear();
17796 return Changed;
17797 };
17798 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
17799 auto IsInPostProcessInstrs = [&](Instruction *I) {
17800 if (auto *Cmp = dyn_cast<CmpInst>(I))
17801 return PostProcessCmps.contains(Cmp);
17802 return isa<InsertElementInst, InsertValueInst>(I) &&
17803 PostProcessInserts.contains(I);
17804 };
17805 // Returns true if `I` is an instruction without users, like a terminator or
17806 // a store, or a function call with an ignored return value. Other unused
17807 // instructions are not considered (based on the instruction type, except for
17808 // CallInst and InvokeInst).
17808 auto HasNoUsers = [](Instruction *I) {
17809 return I->use_empty() &&
17810 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
17811 };
17812 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
17813 // Skip instructions with scalable type. The number of elements is unknown
17814 // at compile time for scalable types.
17815 if (isa<ScalableVectorType>(It->getType()))
17816 continue;
17817
17818 // Skip instructions marked for deletion.
17819 if (R.isDeleted(&*It))
17820 continue;
17821 // We may go through BB multiple times so skip the one we have checked.
17822 if (!VisitedInstrs.insert(&*It).second) {
17823 if (HasNoUsers(&*It) &&
17824 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
17825 // We would like to start over since some instructions are deleted
17826 // and the iterator may become invalid.
17827 Changed = true;
17828 It = BB->begin();
17829 E = BB->end();
17830 }
17831 continue;
17832 }
17833
17834 if (isa<DbgInfoIntrinsic>(It))
17835 continue;
17836
17837 // Try to vectorize reductions that use PHINodes.
17838 if (PHINode *P = dyn_cast<PHINode>(It)) {
17839 // Check that the PHI is a reduction PHI.
17840 if (P->getNumIncomingValues() == 2) {
17841 // Try to match and vectorize a horizontal reduction.
17842 Instruction *Root = getReductionInstr(DT, P, BB, LI);
17843 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
17844 Changed = true;
17845 It = BB->begin();
17846 E = BB->end();
17847 continue;
17848 }
17849 }
17850 // Try to vectorize the incoming values of the PHI, to catch reductions
17851 // that feed into PHIs.
17852 for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
17853 // Skip if the incoming block is the current BB for now. Also, bypass
17854 // unreachable IR for efficiency and to avoid crashing.
17855 // TODO: Collect the skipped incoming values and try to vectorize them
17856 // after processing BB.
17857 if (BB == P->getIncomingBlock(I) ||
17858 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
17859 continue;
17860
17861 // Postponed instructions should not be vectorized here, delay their
17862 // vectorization.
17863 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
17864 PI && !IsInPostProcessInstrs(PI))
17865 Changed |= vectorizeRootInstruction(nullptr, PI,
17866 P->getIncomingBlock(I), R, TTI);
17867 }
17868 continue;
17869 }
17870
17871 if (HasNoUsers(&*It)) {
17872 bool OpsChanged = false;
17873 auto *SI = dyn_cast<StoreInst>(It);
17874 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
17875 if (SI) {
17876 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
17877 // Try to vectorize chain in store, if this is the only store to the
17878 // address in the block.
17879 // TODO: This is just a temporary solution to save compile time. Need
17880 // to investigate if we can safely turn on slp-vectorize-hor-store
17881 // instead to allow lookup for reduction chains in all non-vectorized
17882 // stores (need to check side effects and compile time).
17883 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
17884 SI->getValueOperand()->hasOneUse();
17885 }
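 // Illustrative case (not from this file): for `x[0] = a + b + c + d;` the
 // stored value is a reduction root; the operand walk below hands it to
 // vectorizeRootInstruction() to try to match a horizontal reduction.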
17886 if (TryToVectorizeRoot) {
17887 for (auto *V : It->operand_values()) {
 17888 // Postponed instructions should not be vectorized here; delay their
 17889 // vectorization.
17890 if (auto *VI = dyn_cast<Instruction>(V);
17891 VI && !IsInPostProcessInstrs(VI))
17892 // Try to match and vectorize a horizontal reduction.
17893 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
17894 }
17895 }
 17896 // Start vectorization of the post-process list of instructions from the
 17897 // top-of-tree instructions, to try to vectorize as many instructions as
 17898 // possible.
17899 OpsChanged |=
17900 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
17901 if (OpsChanged) {
 17902 // We would like to start over since some instructions were deleted
 17903 // and the iterator may have been invalidated.
17904 Changed = true;
17905 It = BB->begin();
17906 E = BB->end();
17907 continue;
17908 }
17909 }
17910
17911 if (isa<InsertElementInst, InsertValueInst>(It))
17912 PostProcessInserts.insert(&*It);
17913 else if (isa<CmpInst>(It))
17914 PostProcessCmps.insert(cast<CmpInst>(&*It));
17915 }
17916
17917 return Changed;
17918}
17919
17920bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
17921 auto Changed = false;
17922 for (auto &Entry : GEPs) {
17923 // If the getelementptr list has fewer than two elements, there's nothing
17924 // to do.
17925 if (Entry.second.size() < 2)
17926 continue;
17927
17928 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
17929 << Entry.second.size() << ".\n");
17930
17931 // Process the GEP list in chunks suitable for the target's supported
17932 // vector size. If a vector register can't hold 1 element, we are done. We
17933 // are trying to vectorize the index computations, so the maximum number of
17934 // elements is based on the size of the index expression, rather than the
17935 // size of the GEP itself (the target's pointer size).
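 // For example (sketch): with a 128-bit maximum vector register and i64
 // indices, MaxElts below is 128 / 64 = 2, so the GEP list is processed two
 // getelementptrs at a time.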
17936 unsigned MaxVecRegSize = R.getMaxVecRegSize();
17937 unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
17938 if (MaxVecRegSize < EltSize)
17939 continue;
17940
17941 unsigned MaxElts = MaxVecRegSize / EltSize;
17942 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
17943 auto Len = std::min<unsigned>(BE - BI, MaxElts);
17944 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
17945
 17946 // Initialize a set of candidate getelementptrs. Note that we use a
17947 // SetVector here to preserve program order. If the index computations
17948 // are vectorizable and begin with loads, we want to minimize the chance
17949 // of having to reorder them later.
17950 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
17951
 17952 // Some of the candidates may have already been vectorized after we
 17953 // initially collected them (in which case they are marked as deleted), or
 17954 // their index may have been optimized to a constant. Either way, remove
 17955 // them from the set of candidates.
17956 Candidates.remove_if([&R](Value *I) {
17957 return R.isDeleted(cast<Instruction>(I)) ||
17958 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
17959 });
17960
17961 // Remove from the set of candidates all pairs of getelementptrs with
17962 // constant differences. Such getelementptrs are likely not good
17963 // candidates for vectorization in a bottom-up phase since one can be
17964 // computed from the other. We also ensure all candidate getelementptr
17965 // indices are unique.
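 // For example (sketch): getelementptrs for `p[i]` and `p[i + 1]` have a
 // constant SCEV difference, so vectorizing one index computation yields the
 // other for free and both are dropped from the candidate set.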
17966 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
17967 auto *GEPI = GEPList[I];
17968 if (!Candidates.count(GEPI))
17969 continue;
17970 auto *SCEVI = SE->getSCEV(GEPList[I]);
17971 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
17972 auto *GEPJ = GEPList[J];
17973 auto *SCEVJ = SE->getSCEV(GEPList[J]);
17974 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
17975 Candidates.remove(GEPI);
17976 Candidates.remove(GEPJ);
17977 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
17978 Candidates.remove(GEPJ);
17979 }
17980 }
17981 }
17982
17983 // We break out of the above computation as soon as we know there are
17984 // fewer than two candidates remaining.
17985 if (Candidates.size() < 2)
17986 continue;
17987
17988 // Add the single, non-constant index of each candidate to the bundle. We
17989 // ensured the indices met these constraints when we originally collected
17990 // the getelementptrs.
17991 SmallVector<Value *, 16> Bundle(Candidates.size());
17992 auto BundleIndex = 0u;
17993 for (auto *V : Candidates) {
17994 auto *GEP = cast<GetElementPtrInst>(V);
17995 auto *GEPIdx = GEP->idx_begin()->get();
17996 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
17997 Bundle[BundleIndex++] = GEPIdx;
17998 }
17999
18000 // Try and vectorize the indices. We are currently only interested in
18001 // gather-like cases of the form:
18002 //
18003 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
18004 //
18005 // where the loads of "a", the loads of "b", and the subtractions can be
18006 // performed in parallel. It's likely that detecting this pattern in a
18007 // bottom-up phase will be simpler and less costly than building a
18008 // full-blown top-down phase beginning at the consecutive loads.
18009 Changed |= tryToVectorizeList(Bundle, R);
18010 }
18011 }
18012 return Changed;
18013}
18014
18015bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
18016 bool Changed = false;
 18017 // Sort by value type, pointer operand type, and value operand. Value operands
 18018 // must be compatible (have the same opcode, same parent); otherwise it is
 18019 // definitely not profitable to try to vectorize them.
18020 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
18021 if (V->getValueOperand()->getType()->getTypeID() <
18022 V2->getValueOperand()->getType()->getTypeID())
18023 return true;
18024 if (V->getValueOperand()->getType()->getTypeID() >
18025 V2->getValueOperand()->getType()->getTypeID())
18026 return false;
18027 if (V->getPointerOperandType()->getTypeID() <
18028 V2->getPointerOperandType()->getTypeID())
18029 return true;
18030 if (V->getPointerOperandType()->getTypeID() >
18031 V2->getPointerOperandType()->getTypeID())
18032 return false;
18033 // UndefValues are compatible with all other values.
18034 if (isa<UndefValue>(V->getValueOperand()) ||
18035 isa<UndefValue>(V2->getValueOperand()))
18036 return false;
18037 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
18038 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
 18039 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
 18040 DT->getNode(I1->getParent());
 18041 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
 18042 DT->getNode(I2->getParent());
18043 assert(NodeI1 && "Should only process reachable instructions");
18044 assert(NodeI2 && "Should only process reachable instructions");
18045 assert((NodeI1 == NodeI2) ==
18046 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18047 "Different nodes should have different DFS numbers");
18048 if (NodeI1 != NodeI2)
18049 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18050 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18051 if (S.getOpcode())
18052 return false;
18053 return I1->getOpcode() < I2->getOpcode();
18054 }
18055 if (isa<Constant>(V->getValueOperand()) &&
18056 isa<Constant>(V2->getValueOperand()))
18057 return false;
18058 return V->getValueOperand()->getValueID() <
18059 V2->getValueOperand()->getValueID();
18060 };
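 // In effect, stores are ordered first by value-operand type ID, then by
 // pointer-operand type ID, then (for instruction value operands) by the DFS-in
 // number of the defining block and by opcode, and finally by value ID, so
 // compatible stores end up adjacent after sorting.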
18061
18062 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
18063 if (V1 == V2)
18064 return true;
18065 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
18066 return false;
18067 if (V1->getPointerOperandType() != V2->getPointerOperandType())
18068 return false;
18069 // Undefs are compatible with any other value.
18070 if (isa<UndefValue>(V1->getValueOperand()) ||
18071 isa<UndefValue>(V2->getValueOperand()))
18072 return true;
18073 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
18074 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18075 if (I1->getParent() != I2->getParent())
18076 return false;
18077 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18078 return S.getOpcode() > 0;
18079 }
18080 if (isa<Constant>(V1->getValueOperand()) &&
18081 isa<Constant>(V2->getValueOperand()))
18082 return true;
18083 return V1->getValueOperand()->getValueID() ==
18084 V2->getValueOperand()->getValueID();
18085 };
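 // Two stores are treated as compatible when their value and pointer operand
 // types match and their value operands either share an opcode within the same
 // block, are both constants, have the same value ID, or at least one of them
 // is undef.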
18086
18087 // Attempt to sort and vectorize each of the store-groups.
18088 for (auto &Pair : Stores) {
18089 if (Pair.second.size() < 2)
18090 continue;
18091
18092 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
18093 << Pair.second.size() << ".\n");
18094
18095 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
18096 continue;
18097
 18098 // Reverse the stores to do a bottom-to-top analysis. This is important if
 18099 // there are several stores to the same address: in that case we need to
 18100 // follow the store order (reversed to meet the memory dependencies).
18101 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
18102 Pair.second.rend());
18103 Changed |= tryToVectorizeSequence<StoreInst>(
18104 ReversedStores, StoreSorter, AreCompatibleStores,
18105 [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
18106 return vectorizeStores(Candidates, R);
18107 },
18108 /*MaxVFOnly=*/false, R);
18109 }
18110 return Changed;
18111}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:529
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1497
Loop::LoopBounds::Direction Direction
Definition: LoopInfo.cpp:230
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
#define T
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(VerifyEach)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleCostEstimator(TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:76
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:264
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:492
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:187
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:174
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:228
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:194
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:443
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:430
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:367
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:167
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
reverse_iterator rend()
Definition: BasicBlock.h:448
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:165
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1494
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2327
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:2222
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2464
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2321
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1600
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1678
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2318
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:601
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:983
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1362
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:1023
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:1017
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:1021
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:1019
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:1167
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:1129
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:1105
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2126
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:154
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1449
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:101
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
value_type & FindAndConstruct(const KeyT &Key)
Definition: DenseMap.h:348
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Type * getReturnType() const
Definition: DerivedTypes.h:124
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2257
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:511
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2460
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2265
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1110
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2535
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:311
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:220
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:848
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1753
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:486
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2366
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2249
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:471
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1666
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:169
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2161
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2196
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1826
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1587
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:630
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:260
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:732
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:257
const BasicBlock * getParent() const
Definition: Instruction.h:152
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:258
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
Value * getPointerOperand()
Definition: Instructions.h:280
bool isSimple() const
Definition: Instructions.h:272
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:236
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
Definition: MapVector.h:64
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
T & front() const
front - Get the first element.
Definition: ArrayRef.h:363
iterator end() const
Definition: ArrayRef.h:357
iterator begin() const
Definition: ArrayRef.h:356
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:376
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:155
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:366
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:981
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
Type * getPointerOperandType() const
Definition: Instructions.h:420
Value * getValueOperand()
Definition: Instructions.h:414
Value * getPointerOperand()
Definition: Instructions.h:417
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load/store operations.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
OperandValueKind
Additional information about an operand's possible values.
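The TargetTransformInfo hooks above drive every profitability decision in this pass. As a rough, hypothetical sketch of the kind of query involved (not the pass's actual cost code), compare VF independent scalar stores against one vector store:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

static bool vectorStoreLooksCheaper(const TargetTransformInfo &TTI,
                                    Type *ScalarTy, unsigned VF, Align A,
                                    unsigned AddrSpace) {
  TargetTransformInfo::TargetCostKind Kind =
      TargetTransformInfo::TCK_RecipThroughput;
  // Cost of VF independent scalar stores.
  InstructionCost ScalarCost =
      TTI.getMemoryOpCost(Instruction::Store, ScalarTy, A, AddrSpace, Kind) * VF;
  // Cost of a single wide store of <VF x ScalarTy>.
  auto *VecTy = FixedVectorType::get(ScalarTy, VF);
  InstructionCost VecCost =
      TTI.getMemoryOpCost(Instruction::Store, VecTy, A, AddrSpace, Kind);
  return VecCost < ScalarCost;
}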
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:160
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getStructNumElements() const
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:287
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:166
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
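A small sketch of the Type queries above (hypothetical predicate):

#include "llvm/IR/Type.h"
using namespace llvm;

// Inspect the element type of a (possibly vector) type.
static bool isNarrowIntOrFP(Type *Ty) {
  Type *EltTy = Ty->getScalarType();   // Element type for vectors, Ty otherwise.
  if (EltTy->isFloatingPointTy())
    return true;
  return EltTy->isIntegerTy() && EltTy->getIntegerBitWidth() <= 32;
}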
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Definition: User.h:73
op_iterator op_begin()
Definition: User.h:234
Value * getOperand(unsigned i) const
Definition: User.h:169
iterator_range< value_op_iterator > operand_values()
Definition: User.h:266
The Vector Function Database.
Definition: VectorUtils.h:29
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
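The Value/User accessors above form the def-use API that this pass walks constantly; a hypothetical sketch:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// True when every user of V is an instruction (i.e. no constant expressions, etc.).
static bool allUsersAreInstructions(const Value *V) {
  for (const User *U : V->users())   // Iterate the use list.
    if (!isa<Instruction>(U))
      return false;
  return true;
}

// hasOneUse()/user_begin() cover the common single-use case cheaply.
static const User *getSingleUser(const Value *V) {
  return V->hasOneUse() ? *V->user_begin() : nullptr;
}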
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
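VectorType::get above is how the pass materializes the vector types it costs and emits; a hypothetical sketch with a fixed VF:

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Build <4 x EltTy>; ElementCount distinguishes fixed from scalable widths.
static VectorType *makeVec4(Type *EltTy) {
  return VectorType::get(EltTy, ElementCount::getFixed(4));
}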
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:74
self_iterator getIterator()
Definition: ilist_node.h:109
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:690
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which the loads in the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register the given instruction as already analyzed as a possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed as a possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isLoadCombineCandidate() const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
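Taken together, the BoUpSLP entry points above are driven in roughly the order sketched below. This is a simplified outline assuming a prepared candidate list and the declarations already available in this file; the helper name, the CostThreshold parameter, and the omission of several intermediate steps are simplifications, not the pass's exact code.

static bool tryVectorizeCandidates(BoUpSLP &R, ArrayRef<Value *> Candidates,
                                   int CostThreshold) {
  SmallDenseSet<Value *> IgnoredUsers;   // No externally ignored users here.
  R.buildTree(Candidates, IgnoredUsers);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                        // Not worth costing.
  R.reorderTopToBottom();                // Improve operand orders from the root down.
  R.reorderBottomToTop();                // ...and from the leaves up.
  R.buildExternalUses();                 // Record scalars that escape the tree.
  R.computeMinimumValueSizes();          // Try to shrink element types.
  InstructionCost Cost = R.getTreeCost();
  if (Cost < -CostThreshold) {           // Negative cost means the vector form wins.
    R.vectorizeTree();                   // Emit the vector code.
    return true;
  }
  return false;
}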
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:103
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1461
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:765
specificval_ty m_Specific(const Value *V)
Match only the specified value.
Definition: PatternMatch.h:821
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:163
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:294
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:234
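The PatternMatch helpers above are used for structural matching throughout the pass; a hypothetical example recognizing (X << C) | Y:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Bind X, Y and the constant shift amount C when V has the expected shape.
static bool matchShiftOr(Value *V, Value *&X, Value *&Y, const APInt *&C) {
  return match(V, m_Or(m_Shl(m_Value(X), m_APInt(C)), m_Value(Y)));
}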
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:456
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1715
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:128
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:950
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:539
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
Definition: LoopUtils.cpp:1166
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:40
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7041
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:361
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2059
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1754
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:418
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1235
constexpr int PoisonMaskElem
@ Other
Any other memory.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Definition: STLExtras.h:1986
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
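The range helpers above (all_of, any_of, count_if, find_if, is_contained, ...) replace explicit begin/end iterator pairs; a small illustrative sketch with made-up data:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static bool rangeHelperDemo() {
  SmallVector<int, 8> Vals = {1, 2, 3, 4};
  bool AllPositive = all_of(Vals, [](int V) { return V > 0; });
  bool HasEven = any_of(Vals, [](int V) { return V % 2 == 0; });
  auto NumOdd = count_if(Vals, [](int V) { return V % 2 != 0; });
  return AllPositive && HasEven && NumOdd == 2 && is_contained(Vals, 3);
}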
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
InstructionCost Cost
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t that is less than or equal to Value and is congruent to Skew modulo Align.
Definition: MathExtras.h:428
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:613
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:45
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2485
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:220
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
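VFShape and VFDatabase above (together with getVectorizedFunction and getMappings listed earlier) let the pass ask whether a call has a vector variant; a hypothetical query at VF=4:

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Returns the vector variant of CI's callee at width 4, or nullptr if none exists.
static Function *findVectorVariantVF4(CallInst &CI) {
  VFShape Shape = VFShape::get(CI.getFunctionType(), ElementCount::getFixed(4),
                               /*HasGlobalPred=*/false);
  return VFDatabase(CI).getVectorizedFunction(Shape);
}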
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1459
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const