SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<int>
117 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
118 cl::desc("Only vectorize if you gain more than this "
119 "number "));
120
122 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
123 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
124 "heuristics and makes vectorization decision via cost modeling."));
125
126static cl::opt<bool>
127ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
128 cl::desc("Attempt to vectorize horizontal reductions"));
129
131 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
132 cl::desc(
133 "Attempt to vectorize horizontal reductions feeding into a store"));
134
135// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
136// even if we match a reduction but do not vectorize in the end.
138 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
139 cl::desc("Allow optimization of original scalar identity operations on "
140 "matched horizontal reductions."));
141
142static cl::opt<int>
143 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
144 cl::desc("Attempt to vectorize for this register size in bits"));
145
148 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
149
150/// Limits the size of scheduling regions in a block.
151/// It avoids long compile times for _very_ large blocks where vector
152/// instructions are spread over a wide range.
153/// This limit is way higher than needed by real-world functions.
154static cl::opt<int>
155ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
156 cl::desc("Limit the size of the SLP scheduling region per block"));
157
159 "slp-min-reg-size", cl::init(128), cl::Hidden,
160 cl::desc("Attempt to vectorize for this register size in bits"));
161
163 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
164 cl::desc("Limit the recursion depth when building a vectorizable tree"));
165
167 "slp-min-tree-size", cl::init(3), cl::Hidden,
168 cl::desc("Only vectorize small trees if they are fully vectorizable"));
169
170// The maximum depth that the look-ahead score heuristic will explore.
171// The higher this value, the higher the compilation time overhead.
173 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
174 cl::desc("The maximum look-ahead depth for operand reordering scores"));
175
176// The maximum depth that the look-ahead score heuristic will explore
177// when it is probing among candidates for vectorization tree roots.
178// The higher this value, the higher the compilation time overhead, but unlike
179// the similar limit for operand reordering this limit is used less frequently,
180// so the impact of a higher value is less noticeable.
182 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
183 cl::desc("The maximum look-ahead depth for searching best rooting option"));
184
186 "slp-min-strided-loads", cl::init(2), cl::Hidden,
187 cl::desc("The minimum number of loads, which should be considered strided, "
188 "if the stride is > 1 or is runtime value"));
189
191 "slp-max-stride", cl::init(8), cl::Hidden,
192 cl::desc("The maximum stride, considered to be profitable."));
193
194static cl::opt<bool>
195 ViewSLPTree("view-slp-tree", cl::Hidden,
196 cl::desc("Display the SLP trees with Graphviz"));
197
199 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
200 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
201
202// Limit the number of alias checks. The limit is chosen so that
203// it has no negative effect on the llvm benchmarks.
204static const unsigned AliasedCheckLimit = 10;
205
206// Limit of the number of uses for potentially transformed instructions/values,
207// used in checks to avoid compile-time explosion.
208static constexpr int UsesLimit = 8;
209
210// Another limit for the alias checks: The maximum distance between load/store
211// instructions where alias checks are done.
212// This limit is useful for very large basic blocks.
213static const unsigned MaxMemDepDistance = 160;
214
215/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
216/// regions to be handled.
217static const int MinScheduleRegionSize = 16;
218
219/// Maximum allowed number of operands in the PHI nodes.
220static const unsigned MaxPHINumOperands = 128;
221
222/// Predicate for the element types that the SLP vectorizer supports.
223///
224/// The most important thing to filter here are types which are invalid in LLVM
225/// vectors. We also filter target specific types which have absolutely no
226/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
227/// avoids spending time checking the cost model and realizing that they will
228/// be inevitably scalarized.
229static bool isValidElementType(Type *Ty) {
230 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
231 !Ty->isPPC_FP128Ty();
232}
233
234/// \returns True if the value is a constant (but not globals/constant
235/// expressions).
236static bool isConstant(Value *V) {
237 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
238}
239
240/// Checks if \p V is one of vector-like instructions, i.e. undef,
241/// insertelement/extractelement with constant indices for fixed vector type or
242/// extractvalue instruction.
243static bool isVectorLikeInstWithConstOps(Value *V) {
244 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
245 !isa<ExtractValueInst, UndefValue>(V))
246 return false;
247 auto *I = dyn_cast<Instruction>(V);
248 if (!I || isa<ExtractValueInst>(I))
249 return true;
250 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
251 return false;
252 if (isa<ExtractElementInst>(I))
253 return isConstant(I->getOperand(1));
254 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
255 return isConstant(I->getOperand(2));
256}
257
258#if !defined(NDEBUG)
259/// Print a short descriptor of the instruction bundle suitable for debug output.
260static std::string shortBundleName(ArrayRef<Value *> VL) {
261 std::string Result;
262 raw_string_ostream OS(Result);
263 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
264 OS.flush();
265 return Result;
266}
267#endif
268
269/// \returns true if all of the instructions in \p VL are in the same block or
270/// false otherwise.
271static bool allSameBlock(ArrayRef<Value *> VL) {
272 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
273 if (!I0)
274 return false;
275 if (all_of(VL, isVectorLikeInstWithConstOps))
276 return true;
277
278 BasicBlock *BB = I0->getParent();
279 for (int I = 1, E = VL.size(); I < E; I++) {
280 auto *II = dyn_cast<Instruction>(VL[I]);
281 if (!II)
282 return false;
283
284 if (BB != II->getParent())
285 return false;
286 }
287 return true;
288}
289
290/// \returns True if all of the values in \p VL are constants (but not
291/// globals/constant expressions).
292static bool allConstant(ArrayRef<Value *> VL) {
293 // Constant expressions and globals can't be vectorized like normal integer/FP
294 // constants.
295 return all_of(VL, isConstant);
296}
297
298/// \returns True if all of the values in \p VL are identical or some of them
299/// are UndefValue.
300static bool isSplat(ArrayRef<Value *> VL) {
301 Value *FirstNonUndef = nullptr;
302 for (Value *V : VL) {
303 if (isa<UndefValue>(V))
304 continue;
305 if (!FirstNonUndef) {
306 FirstNonUndef = V;
307 continue;
308 }
309 if (V != FirstNonUndef)
310 return false;
311 }
312 return FirstNonUndef != nullptr;
313}
314
315/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
316static bool isCommutative(Instruction *I) {
317 if (auto *Cmp = dyn_cast<CmpInst>(I))
318 return Cmp->isCommutative();
319 if (auto *BO = dyn_cast<BinaryOperator>(I))
320 return BO->isCommutative() ||
321 (BO->getOpcode() == Instruction::Sub &&
322 !BO->hasNUsesOrMore(UsesLimit) &&
323 all_of(
324 BO->uses(),
325 [](const Use &U) {
326 // Commutative, if icmp eq/ne sub, 0
327 ICmpInst::Predicate Pred;
328 if (match(U.getUser(),
329 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
330 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
331 return true;
332 // Commutative, if abs(sub nsw, true) or abs(sub, false).
333 ConstantInt *Flag;
334 return match(U.getUser(),
335 m_Intrinsic<Intrinsic::abs>(
336 m_Specific(U.get()), m_ConstantInt(Flag))) &&
337 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
338 Flag->isOne());
339 })) ||
340 (BO->getOpcode() == Instruction::FSub &&
341 !BO->hasNUsesOrMore(UsesLimit) &&
342 all_of(BO->uses(), [](const Use &U) {
343 return match(U.getUser(),
344 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
345 }));
346 return I->isCommutative();
347}
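// For example, in
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0
// the sub is treated as commutative: swapping %a and %b only negates %d, which
// an eq/ne-against-zero user cannot observe. Likewise |%a - %b| == |%b - %a|,
// so a sub/fsub that only feeds llvm.abs/llvm.fabs (subject to the nsw/int-min
// checks above) is treated as commutative as well.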
348
349/// \returns inserting index of InsertElement or InsertValue instruction,
350/// using Offset as base offset for index.
351static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
352 unsigned Offset = 0) {
353 int Index = Offset;
354 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
355 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
356 if (!VT)
357 return std::nullopt;
358 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
359 if (!CI)
360 return std::nullopt;
361 if (CI->getValue().uge(VT->getNumElements()))
362 return std::nullopt;
363 Index *= VT->getNumElements();
364 Index += CI->getZExtValue();
365 return Index;
366 }
367
368 const auto *IV = cast<InsertValueInst>(InsertInst);
369 Type *CurrentType = IV->getType();
370 for (unsigned I : IV->indices()) {
371 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
372 Index *= ST->getNumElements();
373 CurrentType = ST->getElementType(I);
374 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
375 Index *= AT->getNumElements();
376 CurrentType = AT->getElementType();
377 } else {
378 return std::nullopt;
379 }
380 Index += I;
381 }
382 return Index;
383}
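// For example, `insertelement <4 x float> %v, float %x, i32 2` yields index 2
// (or Offset*4 + 2 for a non-zero base offset), and for
// `insertvalue {[2 x i32], [2 x i32]} %agg, i32 %x, 1, 0` the nested indices
// are flattened row-major to 1*2 + 0 = 2.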
384
385namespace {
386/// Specifies the way the mask should be analyzed for undefs/poisonous elements
387/// in the shuffle mask.
388enum class UseMask {
389 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
390 ///< check for the mask elements for the first argument (mask
391 ///< indices are in range [0:VF)).
392 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
393 ///< for the mask elements for the second argument (mask indices
394 ///< are in range [VF:2*VF))
395 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
396 ///< future shuffle elements and mark them as ones as being used
397 ///< in future. Non-undef elements are considered as unused since
398 ///< they're already marked as used in the mask.
399};
400} // namespace
401
402/// Prepares a use bitset for the given mask either for the first argument or
403/// for the second.
404static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
405 UseMask MaskArg) {
406 SmallBitVector UseMask(VF, true);
407 for (auto [Idx, Value] : enumerate(Mask)) {
408 if (Value == PoisonMaskElem) {
409 if (MaskArg == UseMask::UndefsAsMask)
410 UseMask.reset(Idx);
411 continue;
412 }
413 if (MaskArg == UseMask::FirstArg && Value < VF)
414 UseMask.reset(Value);
415 else if (MaskArg == UseMask::SecondArg && Value >= VF)
416 UseMask.reset(Value - VF);
417 }
418 return UseMask;
419}
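// For example, with VF = 4 and Mask = {0, 5, poison, 2}: for UseMask::FirstArg
// bits 0 and 2 are cleared (those lanes of the first vector are consumed) and
// bits 1 and 3 stay set; for UseMask::SecondArg only bit 1 (= 5 - VF) is
// cleared; for UseMask::UndefsAsMask only bit 2 (the poison position) is
// cleared.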
420
421/// Checks if the given value is actually an undefined constant vector.
422/// Also, if the \p UseMask is not empty, tries to check if the non-masked
423/// elements actually mask the insertelement buildvector, if any.
424template <bool IsPoisonOnly = false>
425static SmallBitVector isUndefVector(const Value *V,
426 const SmallBitVector &UseMask = {}) {
427 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
428 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
429 if (isa<T>(V))
430 return Res;
431 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
432 if (!VecTy)
433 return Res.reset();
434 auto *C = dyn_cast<Constant>(V);
435 if (!C) {
436 if (!UseMask.empty()) {
437 const Value *Base = V;
438 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
439 Base = II->getOperand(0);
440 if (isa<T>(II->getOperand(1)))
441 continue;
442 std::optional<unsigned> Idx = getInsertIndex(II);
443 if (!Idx) {
444 Res.reset();
445 return Res;
446 }
447 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
448 Res.reset(*Idx);
449 }
450 // TODO: Add analysis for shuffles here too.
451 if (V == Base) {
452 Res.reset();
453 } else {
454 SmallBitVector SubMask(UseMask.size(), false);
455 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
456 }
457 } else {
458 Res.reset();
459 }
460 return Res;
461 }
462 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
463 if (Constant *Elem = C->getAggregateElement(I))
464 if (!isa<T>(Elem) &&
465 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
466 Res.reset(I);
467 }
468 return Res;
469}
470
471/// Checks if the vector of instructions can be represented as a shuffle, like:
472/// %x0 = extractelement <4 x i8> %x, i32 0
473/// %x3 = extractelement <4 x i8> %x, i32 3
474/// %y1 = extractelement <4 x i8> %y, i32 1
475/// %y2 = extractelement <4 x i8> %y, i32 2
476/// %x0x0 = mul i8 %x0, %x0
477/// %x3x3 = mul i8 %x3, %x3
478/// %y1y1 = mul i8 %y1, %y1
479/// %y2y2 = mul i8 %y2, %y2
480/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
481/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
482/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
483/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
484/// ret <4 x i8> %ins4
485/// can be transformed into:
486/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
487/// i32 6>
488/// %2 = mul <4 x i8> %1, %1
489/// ret <4 x i8> %2
490/// Mask will return the Shuffle Mask equivalent to the extracted elements.
491/// TODO: Can we split off and reuse the shuffle mask detection from
492/// ShuffleVectorInst/getShuffleCost?
493static std::optional<TargetTransformInfo::ShuffleKind>
494isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
495 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
496 if (It == VL.end())
497 return std::nullopt;
498 auto *EI0 = cast<ExtractElementInst>(*It);
499 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
500 return std::nullopt;
501 unsigned Size =
502 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
503 Value *Vec1 = nullptr;
504 Value *Vec2 = nullptr;
505 enum ShuffleMode { Unknown, Select, Permute };
506 ShuffleMode CommonShuffleMode = Unknown;
507 Mask.assign(VL.size(), PoisonMaskElem);
508 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
509 // Undef can be represented as an undef element in a vector.
510 if (isa<UndefValue>(VL[I]))
511 continue;
512 auto *EI = cast<ExtractElementInst>(VL[I]);
513 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
514 return std::nullopt;
515 auto *Vec = EI->getVectorOperand();
516 // We can extractelement from undef or poison vector.
517 if (isUndefVector(Vec).all())
518 continue;
519 // All vector operands must have the same number of vector elements.
520 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
521 return std::nullopt;
522 if (isa<UndefValue>(EI->getIndexOperand()))
523 continue;
524 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
525 if (!Idx)
526 return std::nullopt;
527 // Undefined behavior if Idx is negative or >= Size.
528 if (Idx->getValue().uge(Size))
529 continue;
530 unsigned IntIdx = Idx->getValue().getZExtValue();
531 Mask[I] = IntIdx;
532 // For correct shuffling we have to have at most 2 different vector operands
533 // in all extractelement instructions.
534 if (!Vec1 || Vec1 == Vec) {
535 Vec1 = Vec;
536 } else if (!Vec2 || Vec2 == Vec) {
537 Vec2 = Vec;
538 Mask[I] += Size;
539 } else {
540 return std::nullopt;
541 }
542 if (CommonShuffleMode == Permute)
543 continue;
544 // If the extract index is not the same as the operation number, it is a
545 // permutation.
546 if (IntIdx != I) {
547 CommonShuffleMode = Permute;
548 continue;
549 }
550 CommonShuffleMode = Select;
551 }
552 // If we're not crossing lanes in different vectors, consider it as blending.
553 if (CommonShuffleMode == Select && Vec2)
554 return TargetTransformInfo::SK_Select;
555 // If Vec2 was never used, we have a permutation of a single vector, otherwise
556 // we have a permutation of 2 vectors.
557 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
558 : TargetTransformInfo::SK_PermuteSingleSrc;
559}
560
561/// \returns True if Extract{Value,Element} instruction extracts element Idx.
562static std::optional<unsigned> getExtractIndex(Instruction *E) {
563 unsigned Opcode = E->getOpcode();
564 assert((Opcode == Instruction::ExtractElement ||
565 Opcode == Instruction::ExtractValue) &&
566 "Expected extractelement or extractvalue instruction.");
567 if (Opcode == Instruction::ExtractElement) {
568 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
569 if (!CI)
570 return std::nullopt;
571 return CI->getZExtValue();
572 }
573 auto *EI = cast<ExtractValueInst>(E);
574 if (EI->getNumIndices() != 1)
575 return std::nullopt;
576 return *EI->idx_begin();
577}
578
579namespace {
580
581/// Main data required for vectorization of instructions.
582struct InstructionsState {
583 /// The very first instruction in the list with the main opcode.
584 Value *OpValue = nullptr;
585
586 /// The main/alternate instruction.
587 Instruction *MainOp = nullptr;
588 Instruction *AltOp = nullptr;
589
590 /// The main/alternate opcodes for the list of instructions.
591 unsigned getOpcode() const {
592 return MainOp ? MainOp->getOpcode() : 0;
593 }
594
595 unsigned getAltOpcode() const {
596 return AltOp ? AltOp->getOpcode() : 0;
597 }
598
599 /// Some of the instructions in the list have alternate opcodes.
600 bool isAltShuffle() const { return AltOp != MainOp; }
601
602 bool isOpcodeOrAlt(Instruction *I) const {
603 unsigned CheckedOpcode = I->getOpcode();
604 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
605 }
606
607 InstructionsState() = delete;
608 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
609 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
610};
611
612} // end anonymous namespace
613
614/// Chooses the correct key for scheduling data. If \p Op has the same (or
615/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
616/// OpValue.
617static Value *isOneOf(const InstructionsState &S, Value *Op) {
618 auto *I = dyn_cast<Instruction>(Op);
619 if (I && S.isOpcodeOrAlt(I))
620 return Op;
621 return S.OpValue;
622}
623
624/// \returns true if \p Opcode is allowed as part of the main/alternate
625/// instruction for SLP vectorization.
626///
627/// Example of unsupported opcode is SDIV that can potentially cause UB if the
628/// "shuffled out" lane would result in division by zero.
629static bool isValidForAlternation(unsigned Opcode) {
630 if (Instruction::isIntDivRem(Opcode))
631 return false;
632
633 return true;
634}
635
636static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
637 const TargetLibraryInfo &TLI,
638 unsigned BaseIndex = 0);
639
640/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
641/// compatible instructions or constants, or just some other regular values.
642static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
643 Value *Op1, const TargetLibraryInfo &TLI) {
644 return (isConstant(BaseOp0) && isConstant(Op0)) ||
645 (isConstant(BaseOp1) && isConstant(Op1)) ||
646 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
647 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
648 BaseOp0 == Op0 || BaseOp1 == Op1 ||
649 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
650 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
651}
652
653/// \returns true if a compare instruction \p CI has similar "look" and
654/// same predicate as \p BaseCI, "as is" or with its operands and predicate
655/// swapped, false otherwise.
656static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
657 const TargetLibraryInfo &TLI) {
658 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
659 "Assessing comparisons of different types?");
660 CmpInst::Predicate BasePred = BaseCI->getPredicate();
661 CmpInst::Predicate Pred = CI->getPredicate();
662 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
663
664 Value *BaseOp0 = BaseCI->getOperand(0);
665 Value *BaseOp1 = BaseCI->getOperand(1);
666 Value *Op0 = CI->getOperand(0);
667 Value *Op1 = CI->getOperand(1);
668
669 return (BasePred == Pred &&
670 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
671 (BasePred == SwappedPred &&
672 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
673}
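// For example, `icmp sgt %a, %b` and `icmp slt %b, %a` compare equal here:
// the second predicate is the swapped form of the first and its operands are
// swapped accordingly, so both lanes can share one vectorized compare.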
674
675/// \returns analysis of the Instructions in \p VL described in
676/// InstructionsState, the Opcode that we suppose the whole list
677/// could be vectorized even if its structure is diverse.
678static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
679 const TargetLibraryInfo &TLI,
680 unsigned BaseIndex) {
681 // Make sure these are all Instructions.
682 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
683 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
684
685 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
686 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
687 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
688 CmpInst::Predicate BasePred =
689 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
690 : CmpInst::BAD_ICMP_PREDICATE;
691 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
692 unsigned AltOpcode = Opcode;
693 unsigned AltIndex = BaseIndex;
694
695 bool SwappedPredsCompatible = [&]() {
696 if (!IsCmpOp)
697 return false;
698 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
699 UniquePreds.insert(BasePred);
700 UniqueNonSwappedPreds.insert(BasePred);
701 for (Value *V : VL) {
702 auto *I = dyn_cast<CmpInst>(V);
703 if (!I)
704 return false;
705 CmpInst::Predicate CurrentPred = I->getPredicate();
706 CmpInst::Predicate SwappedCurrentPred =
707 CmpInst::getSwappedPredicate(CurrentPred);
708 UniqueNonSwappedPreds.insert(CurrentPred);
709 if (!UniquePreds.contains(CurrentPred) &&
710 !UniquePreds.contains(SwappedCurrentPred))
711 UniquePreds.insert(CurrentPred);
712 }
713 // If the total number of predicates is > 2, but only 2 remain once swapped
714 // predicates are treated as compatible, consider the swappable predicates
715 // as compatible opcodes, not as alternates.
716 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
717 }();
718 // Check for one alternate opcode from another BinaryOperator.
719 // TODO - generalize to support all operators (types, calls etc.).
720 auto *IBase = cast<Instruction>(VL[BaseIndex]);
721 Intrinsic::ID BaseID = 0;
722 SmallVector<VFInfo> BaseMappings;
723 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
724 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
725 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
726 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
727 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
728 }
729 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
730 auto *I = cast<Instruction>(VL[Cnt]);
731 unsigned InstOpcode = I->getOpcode();
732 if (IsBinOp && isa<BinaryOperator>(I)) {
733 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
734 continue;
735 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
736 isValidForAlternation(Opcode)) {
737 AltOpcode = InstOpcode;
738 AltIndex = Cnt;
739 continue;
740 }
741 } else if (IsCastOp && isa<CastInst>(I)) {
742 Value *Op0 = IBase->getOperand(0);
743 Type *Ty0 = Op0->getType();
744 Value *Op1 = I->getOperand(0);
745 Type *Ty1 = Op1->getType();
746 if (Ty0 == Ty1) {
747 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
748 continue;
749 if (Opcode == AltOpcode) {
750 assert(isValidForAlternation(Opcode) &&
751 isValidForAlternation(InstOpcode) &&
752 "Cast isn't safe for alternation, logic needs to be updated!");
753 AltOpcode = InstOpcode;
754 AltIndex = Cnt;
755 continue;
756 }
757 }
758 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
759 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
760 Type *Ty0 = BaseInst->getOperand(0)->getType();
761 Type *Ty1 = Inst->getOperand(0)->getType();
762 if (Ty0 == Ty1) {
763 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
764 // Check for compatible operands. If the corresponding operands are not
765 // compatible - need to perform alternate vectorization.
766 CmpInst::Predicate CurrentPred = Inst->getPredicate();
767 CmpInst::Predicate SwappedCurrentPred =
768 CmpInst::getSwappedPredicate(CurrentPred);
769
770 if ((E == 2 || SwappedPredsCompatible) &&
771 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
772 continue;
773
774 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
775 continue;
776 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
777 if (AltIndex != BaseIndex) {
778 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
779 continue;
780 } else if (BasePred != CurrentPred) {
781 assert(
782 isValidForAlternation(InstOpcode) &&
783 "CmpInst isn't safe for alternation, logic needs to be updated!");
784 AltIndex = Cnt;
785 continue;
786 }
787 CmpInst::Predicate AltPred = AltInst->getPredicate();
788 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
789 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
790 continue;
791 }
792 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
793 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
794 if (Gep->getNumOperands() != 2 ||
795 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
796 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
797 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
798 if (!isVectorLikeInstWithConstOps(EI))
799 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
800 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
801 auto *BaseLI = cast<LoadInst>(IBase);
802 if (!LI->isSimple() || !BaseLI->isSimple())
803 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
804 } else if (auto *Call = dyn_cast<CallInst>(I)) {
805 auto *CallBase = cast<CallInst>(IBase);
806 if (Call->getCalledFunction() != CallBase->getCalledFunction())
807 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
808 if (Call->hasOperandBundles() &&
809 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
810 Call->op_begin() + Call->getBundleOperandsEndIndex(),
811 CallBase->op_begin() +
812 CallBase->getBundleOperandsStartIndex()))
813 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
814 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
815 if (ID != BaseID)
816 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
817 if (!ID) {
818 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
819 if (Mappings.size() != BaseMappings.size() ||
820 Mappings.front().ISA != BaseMappings.front().ISA ||
821 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
822 Mappings.front().VectorName != BaseMappings.front().VectorName ||
823 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
824 Mappings.front().Shape.Parameters !=
825 BaseMappings.front().Shape.Parameters)
826 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
827 }
828 }
829 continue;
830 }
831 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
832 }
833
834 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
835 cast<Instruction>(VL[AltIndex]));
836}
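// For example, {add, add, sub, add} yields MainOp = add, AltOp = sub (an
// alternate shuffle), whereas {add, sub, mul} introduces a third opcode and
// yields a state with a null MainOp (getOpcode() == 0), i.e. the list cannot
// be vectorized as a single or alternate opcode node.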
837
838/// \returns true if all of the values in \p VL have the same type or false
839/// otherwise.
840static bool allSameType(ArrayRef<Value *> VL) {
841 Type *Ty = VL.front()->getType();
842 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
843}
844
845/// \returns True if in-tree use also needs extract. This refers to
846/// a possible scalar operand in a vectorized instruction.
847static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
848 TargetLibraryInfo *TLI) {
849 unsigned Opcode = UserInst->getOpcode();
850 switch (Opcode) {
851 case Instruction::Load: {
852 LoadInst *LI = cast<LoadInst>(UserInst);
853 return (LI->getPointerOperand() == Scalar);
854 }
855 case Instruction::Store: {
856 StoreInst *SI = cast<StoreInst>(UserInst);
857 return (SI->getPointerOperand() == Scalar);
858 }
859 case Instruction::Call: {
860 CallInst *CI = cast<CallInst>(UserInst);
861 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
862 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
863 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
864 Arg.value().get() == Scalar;
865 });
866 }
867 default:
868 return false;
869 }
870}
871
872/// \returns the AA location that is being accessed by the instruction.
873static MemoryLocation getLocation(Instruction *I) {
874 if (StoreInst *SI = dyn_cast<StoreInst>(I))
875 return MemoryLocation::get(SI);
876 if (LoadInst *LI = dyn_cast<LoadInst>(I))
877 return MemoryLocation::get(LI);
878 return MemoryLocation();
879}
880
881/// \returns True if the instruction is not a volatile or atomic load/store.
882static bool isSimple(Instruction *I) {
883 if (LoadInst *LI = dyn_cast<LoadInst>(I))
884 return LI->isSimple();
885 if (StoreInst *SI = dyn_cast<StoreInst>(I))
886 return SI->isSimple();
887 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
888 return !MI->isVolatile();
889 return true;
890}
891
892/// Shuffles \p Mask in accordance with the given \p SubMask.
893/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
894/// one but two input vectors.
895static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
896 bool ExtendingManyInputs = false) {
897 if (SubMask.empty())
898 return;
899 assert(
900 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
901 // Check if input scalars were extended to match the size of other node.
902 (SubMask.size() == Mask.size() &&
903 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
904 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
905 "SubMask with many inputs support must be larger than the mask.");
906 if (Mask.empty()) {
907 Mask.append(SubMask.begin(), SubMask.end());
908 return;
909 }
910 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
911 int TermValue = std::min(Mask.size(), SubMask.size());
912 for (int I = 0, E = SubMask.size(); I < E; ++I) {
913 if (SubMask[I] == PoisonMaskElem ||
914 (!ExtendingManyInputs &&
915 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
916 continue;
917 NewMask[I] = Mask[SubMask[I]];
918 }
919 Mask.swap(NewMask);
920}
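// For example, composing an existing Mask = {3, 2, 1, 0} with SubMask =
// {0, 0, 2, 2} produces {3, 3, 1, 1}: each new element I picks
// Mask[SubMask[I]], so the sub-shuffle is applied on top of the accumulated
// shuffle.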
921
922/// Order may have elements assigned special value (size) which is out of
923/// bounds. Such indices only appear on places which correspond to undef values
924/// (see canReuseExtract for details) and are used to avoid letting undef values
925/// affect operand ordering.
926/// The first loop below simply finds all unused indices and then the next loop
927/// nest assigns these indices for undef values positions.
928/// As an example below Order has two undef positions and they have assigned
929/// values 3 and 7 respectively:
930/// before: 6 9 5 4 9 2 1 0
931/// after: 6 3 5 4 7 2 1 0
932static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
933 const unsigned Sz = Order.size();
934 SmallBitVector UnusedIndices(Sz, /*t=*/true);
935 SmallBitVector MaskedIndices(Sz);
936 for (unsigned I = 0; I < Sz; ++I) {
937 if (Order[I] < Sz)
938 UnusedIndices.reset(Order[I]);
939 else
940 MaskedIndices.set(I);
941 }
942 if (MaskedIndices.none())
943 return;
944 assert(UnusedIndices.count() == MaskedIndices.count() &&
945 "Non-synced masked/available indices.");
946 int Idx = UnusedIndices.find_first();
947 int MIdx = MaskedIndices.find_first();
948 while (MIdx >= 0) {
949 assert(Idx >= 0 && "Indices must be synced.");
950 Order[MIdx] = Idx;
951 Idx = UnusedIndices.find_next(Idx);
952 MIdx = MaskedIndices.find_next(MIdx);
953 }
954}
955
956namespace llvm {
957
958static void inversePermutation(ArrayRef<unsigned> Indices,
959 SmallVectorImpl<int> &Mask) {
960 Mask.clear();
961 const unsigned E = Indices.size();
962 Mask.resize(E, PoisonMaskElem);
963 for (unsigned I = 0; I < E; ++I)
964 Mask[Indices[I]] = I;
965}
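// For example, Indices = {2, 0, 1} produces Mask = {1, 2, 0}: the scalar that
// the order places at position 2 came from lane 0, so Mask[2] = 0, and so on.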
966
967/// Reorders the list of scalars in accordance with the given \p Mask.
968static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
969 ArrayRef<int> Mask) {
970 assert(!Mask.empty() && "Expected non-empty mask.");
971 SmallVector<Value *> Prev(Scalars.size(),
972 UndefValue::get(Scalars.front()->getType()));
973 Prev.swap(Scalars);
974 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
975 if (Mask[I] != PoisonMaskElem)
976 Scalars[Mask[I]] = Prev[I];
977}
978
979/// Checks if the provided value does not require scheduling. It does not
980/// require scheduling if this is not an instruction or it is an instruction
981/// that does not read/write memory and all operands are either not instructions
982/// or phi nodes or instructions from different blocks.
983static bool areAllOperandsNonInsts(Value *V) {
984 auto *I = dyn_cast<Instruction>(V);
985 if (!I)
986 return true;
987 return !mayHaveNonDefUseDependency(*I) &&
988 all_of(I->operands(), [I](Value *V) {
989 auto *IO = dyn_cast<Instruction>(V);
990 if (!IO)
991 return true;
992 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
993 });
994}
995
996/// Checks if the provided value does not require scheduling. It does not
997/// require scheduling if this is not an instruction or it is an instruction
998/// that does not read/write memory and all users are phi nodes or instructions
999/// from different blocks.
1000static bool isUsedOutsideBlock(Value *V) {
1001 auto *I = dyn_cast<Instruction>(V);
1002 if (!I)
1003 return true;
1004 // Limits the number of uses to save compile time.
1005 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1006 all_of(I->users(), [I](User *U) {
1007 auto *IU = dyn_cast<Instruction>(U);
1008 if (!IU)
1009 return true;
1010 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1011 });
1012}
1013
1014/// Checks if the specified value does not require scheduling. It does not
1015/// require scheduling if all operands and all users do not need to be scheduled
1016/// in the current basic block.
1017static bool doesNotNeedToBeScheduled(Value *V) {
1018 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1019}
1020
1021/// Checks if the specified array of instructions does not require scheduling.
1022/// It is so if either all instructions have operands that do not require
1023/// scheduling or their users do not require scheduling since they are phis or
1024/// in other basic blocks.
1025static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1026 return !VL.empty() &&
1027 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1028}
1029
1030namespace slpvectorizer {
1031
1032/// Bottom Up SLP Vectorizer.
1033class BoUpSLP {
1034 struct TreeEntry;
1035 struct ScheduleData;
1038
1039public:
1040 /// Tracks the state we can represent the loads in the given sequence.
1041 enum class LoadsState {
1042 Gather,
1043 Vectorize,
1044 ScatterVectorize,
1045 StridedVectorize
1046 };
1047
1055
1056 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1057 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1058 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1059 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1060 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1061 AC(AC), DB(DB), DL(DL), ORE(ORE),
1062 Builder(Se->getContext(), TargetFolder(*DL)) {
1063 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1064 // Use the vector register size specified by the target unless overridden
1065 // by a command-line option.
1066 // TODO: It would be better to limit the vectorization factor based on
1067 // data type rather than just register size. For example, x86 AVX has
1068 // 256-bit registers, but it does not support integer operations
1069 // at that width (that requires AVX2).
1070 if (MaxVectorRegSizeOption.getNumOccurrences())
1071 MaxVecRegSize = MaxVectorRegSizeOption;
1072 else
1073 MaxVecRegSize =
1074 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1075 .getFixedValue();
1076
1077 if (MinVectorRegSizeOption.getNumOccurrences())
1078 MinVecRegSize = MinVectorRegSizeOption;
1079 else
1080 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1081 }
1082
1083 /// Vectorize the tree that starts with the elements in \p VL.
1084 /// Returns the vectorized root.
1085 Value *vectorizeTree();
1086
1087 /// Vectorize the tree but with the list of externally used values \p
1088 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1089 /// generated extractelement instructions.
1090 /// \param ReplacedExternals contains the list of replaced external values
1091 /// {scalar, replace} after emitting extractelement for external uses.
1092 Value *
1093 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1094 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1095 Instruction *ReductionRoot = nullptr);
1096
1097 /// \returns the cost incurred by unwanted spills and fills, caused by
1098 /// holding live values over call sites.
1099 InstructionCost getSpillCost() const;
1100
1101 /// \returns the vectorization cost of the subtree that starts at \p VL.
1102 /// A negative number means that this is profitable.
1103 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1104
1105 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1106 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1107 void buildTree(ArrayRef<Value *> Roots,
1108 const SmallDenseSet<Value *> &UserIgnoreLst);
1109
1110 /// Construct a vectorizable tree that starts at \p Roots.
1111 void buildTree(ArrayRef<Value *> Roots);
1112
1113 /// Returns whether the root node has in-tree uses.
1114 bool doesRootHaveInTreeUses() const {
1115 return !VectorizableTree.empty() &&
1116 !VectorizableTree.front()->UserTreeIndices.empty();
1117 }
1118
1119 /// Return the scalars of the root node.
1120 ArrayRef<Value *> getRootNodeScalars() const {
1121 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1122 return VectorizableTree.front()->Scalars;
1123 }
1124
1125 /// Builds external uses of the vectorized scalars, i.e. the list of
1126 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1127 /// ExternallyUsedValues contains additional list of external uses to handle
1128 /// vectorization of reductions.
1129 void
1130 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1131
1132 /// Transforms graph nodes to target specific representations, if profitable.
1133 void transformNodes();
1134
1135 /// Clear the internal data structures that are created by 'buildTree'.
1136 void deleteTree() {
1137 VectorizableTree.clear();
1138 ScalarToTreeEntry.clear();
1139 MultiNodeScalars.clear();
1140 MustGather.clear();
1141 NonScheduledFirst.clear();
1142 EntryToLastInstruction.clear();
1143 ExternalUses.clear();
1144 ExternalUsesAsGEPs.clear();
1145 for (auto &Iter : BlocksSchedules) {
1146 BlockScheduling *BS = Iter.second.get();
1147 BS->clear();
1148 }
1149 MinBWs.clear();
1150 ReductionBitWidth = 0;
1151 CastMaxMinBWSizes.reset();
1152 ExtraBitWidthNodes.clear();
1153 InstrElementSize.clear();
1154 UserIgnoreList = nullptr;
1155 PostponedGathers.clear();
1156 ValueToGatherNodes.clear();
1157 }
1158
1159 unsigned getTreeSize() const { return VectorizableTree.size(); }
1160
1161 /// Perform LICM and CSE on the newly generated gather sequences.
1163
1164 /// Checks if the specified gather tree entry \p TE can be represented as a
1165 /// shuffled vector entry + (possibly) permutation with other gathers. It
1166 /// implements the checks only for possibly ordered scalars (Loads,
1167 /// ExtractElement, ExtractValue), which can be part of the graph.
1168 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1169
1170 /// Sort loads into increasing pointers offsets to allow greater clustering.
1171 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1172
1173 /// Gets reordering data for the given tree entry. If the entry is vectorized
1174 /// - just return ReorderIndices, otherwise check if the scalars can be
1175 /// reordered and return the most optimal order.
1176 /// \return std::nullopt if ordering is not important, empty order, if
1177 /// identity order is important, or the actual order.
1178 /// \param TopToBottom If true, include the order of vectorized stores and
1179 /// insertelement nodes, otherwise skip them.
1180 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1181 bool TopToBottom);
1182
1183 /// Reorders the current graph to the most profitable order starting from the
1184 /// root node to the leaf nodes. The best order is chosen only from the nodes
1185 /// of the same size (vectorization factor). Smaller nodes are considered
1186 /// parts of a subgraph with a smaller VF and they are reordered independently. We
1187 /// can do this because we still need to extend smaller nodes to the wider VF
1188 /// and we can merge reordering shuffles with the widening shuffles.
1189 void reorderTopToBottom();
1190
1191 /// Reorders the current graph to the most profitable order starting from
1192 /// the leaves to the root. It allows rotating small subgraphs and reducing the
1193 /// number of reshuffles if the leaf nodes use the same order. In this case we
1194 /// can merge the orders and just shuffle the user node instead of shuffling its
1195 /// operands. Plus, even if the leaf nodes have different orders, it allows
1196 /// sinking the reordering in the graph closer to the root node and merging it later
1197 /// during analysis.
1198 void reorderBottomToTop(bool IgnoreReorder = false);
1199
1200 /// \return The vector element size in bits to use when vectorizing the
1201 /// expression tree ending at \p V. If V is a store, the size is the width of
1202 /// the stored value. Otherwise, the size is the width of the largest loaded
1203 /// value reaching V. This method is used by the vectorizer to calculate
1204 /// vectorization factors.
1205 unsigned getVectorElementSize(Value *V);
1206
1207 /// Compute the minimum type sizes required to represent the entries in a
1208 /// vectorizable tree.
1209 void computeMinimumValueSizes();
1210
1211 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1212 unsigned getMaxVecRegSize() const {
1213 return MaxVecRegSize;
1214 }
1215
1216 // \returns minimum vector register size as set by cl::opt.
1217 unsigned getMinVecRegSize() const {
1218 return MinVecRegSize;
1219 }
1220
1221 unsigned getMinVF(unsigned Sz) const {
1222 return std::max(2U, getMinVecRegSize() / Sz);
1223 }
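 // For example, with the default MinVecRegSize of 128 bits, 32-bit elements
 // give getMinVF(32) = max(2, 128/32) = 4, while 64-bit (or wider) elements
 // fall back to the floor of 2.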
1224
1225 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1226 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1227 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1228 return MaxVF ? MaxVF : UINT_MAX;
1229 }
1230
1231 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1232 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1233 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1234 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1235 ///
1236 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1237 unsigned canMapToVector(Type *T) const;
1238
1239 /// \returns True if the VectorizableTree is both tiny and not fully
1240 /// vectorizable. We do not vectorize such trees.
1241 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1242
1243 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1244 /// can be load combined in the backend. Load combining may not be allowed in
1245 /// the IR optimizer, so we do not want to alter the pattern. For example,
1246 /// partially transforming a scalar bswap() pattern into vector code is
1247 /// effectively impossible for the backend to undo.
1248 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1249 /// may not be necessary.
1250 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1251
1252 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1253 /// can be load combined in the backend. Load combining may not be allowed in
1254 /// the IR optimizer, so we do not want to alter the pattern. For example,
1255 /// partially transforming a scalar bswap() pattern into vector code is
1256 /// effectively impossible for the backend to undo.
1257 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1258 /// may not be necessary.
1259 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1260
1261 /// Checks if the given array of loads can be represented as a vectorized,
1262 /// scatter or just simple gather.
1263 /// \param VL list of loads.
1264 /// \param VL0 main load value.
1265 /// \param Order returned order of load instructions.
1266 /// \param PointerOps returned list of pointer operands.
1267 /// \param TryRecursiveCheck used to check if long masked gather can be
1268 /// represented as a series of loads/insert subvector, if profitable.
1269 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1270 SmallVectorImpl<unsigned> &Order,
1271 SmallVectorImpl<Value *> &PointerOps,
1272 bool TryRecursiveCheck = true) const;
1273
1275
1276 /// This structure holds any data we need about the edges being traversed
1277 /// during buildTree_rec(). We keep track of:
1278 /// (i) the user TreeEntry index, and
1279 /// (ii) the index of the edge.
1280 struct EdgeInfo {
1281 EdgeInfo() = default;
1282 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1283 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1284 /// The user TreeEntry.
1285 TreeEntry *UserTE = nullptr;
1286 /// The operand index of the use.
1287 unsigned EdgeIdx = UINT_MAX;
1288#ifndef NDEBUG
1289 friend inline raw_ostream &operator<<(raw_ostream &OS,
1290 const BoUpSLP::EdgeInfo &EI) {
1291 EI.dump(OS);
1292 return OS;
1293 }
1294 /// Debug print.
1295 void dump(raw_ostream &OS) const {
1296 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1297 << " EdgeIdx:" << EdgeIdx << "}";
1298 }
1299 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1300#endif
1301 bool operator == (const EdgeInfo &Other) const {
1302 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1303 }
1304 };
1305
1306 /// A helper class used for scoring candidates for two consecutive lanes.
1307 class LookAheadHeuristics {
1308 const TargetLibraryInfo &TLI;
1309 const DataLayout &DL;
1310 ScalarEvolution &SE;
1311 const BoUpSLP &R;
1312 int NumLanes; // Total number of lanes (aka vectorization factor).
1313 int MaxLevel; // The maximum recursion depth for accumulating score.
1314
1315 public:
1316 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1317 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1318 int MaxLevel)
1319 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1320 MaxLevel(MaxLevel) {}
1321
1322 // The hard-coded scores listed here are not very important, though it shall
1323 // be higher for better matches to improve the resulting cost. When
1324 // computing the scores of matching one sub-tree with another, we are
1325 // basically counting the number of values that are matching. So even if all
1326 // scores are set to 1, we would still get a decent matching result.
1327 // However, sometimes we have to break ties. For example we may have to
1328 // choose between matching loads vs matching opcodes. This is what these
1329 // scores are helping us with: they provide the order of preference. Also,
1330 // this is important if the scalar is externally used or used in another
1331 // tree entry node in the different lane.
1332
1333 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1334 static const int ScoreConsecutiveLoads = 4;
1335 /// The same load multiple times. This should have a better score than
1336 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1337 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
1338 /// a vector load plus 1.0 for a broadcast.
1339 static const int ScoreSplatLoads = 3;
1340 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1341 static const int ScoreReversedLoads = 3;
1342 /// A load candidate for masked gather.
1343 static const int ScoreMaskedGatherCandidate = 1;
1344 /// ExtractElementInst from same vector and consecutive indexes.
1345 static const int ScoreConsecutiveExtracts = 4;
1346 /// ExtractElementInst from same vector and reversed indices.
1347 static const int ScoreReversedExtracts = 3;
1348 /// Constants.
1349 static const int ScoreConstants = 2;
1350 /// Instructions with the same opcode.
1351 static const int ScoreSameOpcode = 2;
1352 /// Instructions with alt opcodes (e.g, add + sub).
1353 static const int ScoreAltOpcodes = 1;
1354 /// Identical instructions (a.k.a. splat or broadcast).
1355 static const int ScoreSplat = 1;
1356 /// Matching with an undef is preferable to failing.
1357 static const int ScoreUndef = 1;
1358 /// Score for failing to find a decent match.
1359 static const int ScoreFail = 0;
1360 /// Score if all users are vectorized.
1361 static const int ScoreAllUserVectorized = 1;
1362
1363 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1364 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1365 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1366 /// MainAltOps.
1367 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1368 ArrayRef<Value *> MainAltOps) const {
1369 if (!isValidElementType(V1->getType()) ||
1370 !isValidElementType(V2->getType()))
1371 return LookAheadHeuristics::ScoreFail;
1372
1373 if (V1 == V2) {
1374 if (isa<LoadInst>(V1)) {
1375 // Returns true if the users of V1 and V2 won't need to be extracted.
1376 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1377 // Bail out if we have too many uses to save compilation time.
1378 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1379 return false;
1380
1381 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1382 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1383 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1384 });
1385 };
1386 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1387 };
1388 // A broadcast of a load can be cheaper on some targets.
1389 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1390 ElementCount::getFixed(NumLanes)) &&
1391 ((int)V1->getNumUses() == NumLanes ||
1392 AllUsersAreInternal(V1, V2)))
1393 return LookAheadHeuristics::ScoreSplatLoads;
1394 }
1395 return LookAheadHeuristics::ScoreSplat;
1396 }
1397
1398 auto CheckSameEntryOrFail = [&]() {
1399 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1400 TE1 && TE1 == R.getTreeEntry(V2))
1401 return LookAheadHeuristics::ScoreSplatLoads;
1402 return LookAheadHeuristics::ScoreFail;
1403 };
1404
1405 auto *LI1 = dyn_cast<LoadInst>(V1);
1406 auto *LI2 = dyn_cast<LoadInst>(V2);
1407 if (LI1 && LI2) {
1408 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1409 !LI2->isSimple())
1410 return CheckSameEntryOrFail();
1411
1412 std::optional<int> Dist = getPointersDiff(
1413 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1414 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1415 if (!Dist || *Dist == 0) {
1416 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1417 getUnderlyingObject(LI2->getPointerOperand()) &&
1418 R.TTI->isLegalMaskedGather(
1419 FixedVectorType::get(LI1->getType(), NumLanes),
1420 LI1->getAlign()))
1421 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1422 return CheckSameEntryOrFail();
1423 }
1424 // The distance is too large - still may be profitable to use masked
1425 // loads/gathers.
1426 if (std::abs(*Dist) > NumLanes / 2)
1427 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1428 // This still will detect consecutive loads, but we might have "holes"
1429 // in some cases. It is ok for non-power-of-2 vectorization and may produce
1430 // better results. It should not affect current vectorization.
1431 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1432 : LookAheadHeuristics::ScoreReversedLoads;
1433 }
1434
1435 auto *C1 = dyn_cast<Constant>(V1);
1436 auto *C2 = dyn_cast<Constant>(V2);
1437 if (C1 && C2)
1438 return LookAheadHeuristics::ScoreConstants;
1439
1440 // Extracts from consecutive indexes of the same vector better score as
1441 // the extracts could be optimized away.
1442 Value *EV1;
1443 ConstantInt *Ex1Idx;
1444 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1445 // Undefs are always profitable for extractelements.
1446 // Compiler can easily combine poison and extractelement <non-poison> or
1447 // undef and extractelement <poison>. But combining undef +
1448 // extractelement <non-poison-but-may-produce-poison> requires some
1449 // extra operations.
1450 if (isa<UndefValue>(V2))
1451 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1452 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1453 : LookAheadHeuristics::ScoreSameOpcode;
1454 Value *EV2 = nullptr;
1455 ConstantInt *Ex2Idx = nullptr;
1456 if (match(V2,
1457 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1458 m_Undef())))) {
1459 // Undefs are always profitable for extractelements.
1460 if (!Ex2Idx)
1461 return LookAheadHeuristics::ScoreSameOpcode;
1462 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1463 return LookAheadHeuristics::ScoreSameOpcode;
1464 if (EV2 == EV1) {
1465 int Idx1 = Ex1Idx->getZExtValue();
1466 int Idx2 = Ex2Idx->getZExtValue();
1467 int Dist = Idx2 - Idx1;
1468 // The distance is too large - still may be profitable to use
1469 // shuffles.
1470 if (std::abs(Dist) == 0)
1471 return LookAheadHeuristics::ScoreSplat;
1472 if (std::abs(Dist) > NumLanes / 2)
1473 return LookAheadHeuristics::ScoreSameOpcode;
1474 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1475 : LookAheadHeuristics::ScoreReversedExtracts;
1476 }
1477 return LookAheadHeuristics::ScoreAltOpcodes;
1478 }
1479 return CheckSameEntryOrFail();
1480 }
1481
1482 auto *I1 = dyn_cast<Instruction>(V1);
1483 auto *I2 = dyn_cast<Instruction>(V2);
1484 if (I1 && I2) {
1485 if (I1->getParent() != I2->getParent())
1486 return CheckSameEntryOrFail();
1487 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1488 Ops.push_back(I1);
1489 Ops.push_back(I2);
1490 InstructionsState S = getSameOpcode(Ops, TLI);
1491 // Note: Only consider instructions with <= 2 operands to avoid
1492 // complexity explosion.
1493 if (S.getOpcode() &&
1494 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1495 !S.isAltShuffle()) &&
1496 all_of(Ops, [&S](Value *V) {
1497 return cast<Instruction>(V)->getNumOperands() ==
1498 S.MainOp->getNumOperands();
1499 }))
1500 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1501 : LookAheadHeuristics::ScoreSameOpcode;
1502 }
1503
1504 if (isa<UndefValue>(V2))
1505 return LookAheadHeuristics::ScoreUndef;
1506
1507 return CheckSameEntryOrFail();
1508 }
1509
1510 /// Go through the operands of \p LHS and \p RHS recursively until
1511 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1512 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1513 /// of \p U1 and \p U2), except at the beginning of the recursion where
1514 /// these are set to nullptr.
1515 ///
1516 /// For example:
1517 /// \verbatim
1518 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1519 /// \ / \ / \ / \ /
1520 /// + + + +
1521 /// G1 G2 G3 G4
1522 /// \endverbatim
1523 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1524 /// each level recursively, accumulating the score. It starts from matching
1525 /// the additions at level 0, then moves on to the loads (level 1). The
1526 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1527 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1528 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1529 /// Please note that the order of the operands does not matter, as we
1530 /// evaluate the score of all profitable combinations of operands. In
1531 /// other words the score of G1 and G4 is the same as G1 and G2. This
1532 /// heuristic is based on ideas described in:
1533 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1534 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1535 /// Luís F. W. Góes
1536 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1537 Instruction *U2, int CurrLevel,
1538 ArrayRef<Value *> MainAltOps) const {
1539
1540 // Get the shallow score of V1 and V2.
1541 int ShallowScoreAtThisLevel =
1542 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1543
1544 // If reached MaxLevel,
1545 // or if V1 and V2 are not instructions,
1546 // or if they are SPLAT,
1547 // or if they are not consecutive,
1548 // or if profitable to vectorize loads or extractelements, early return
1549 // the current cost.
1550 auto *I1 = dyn_cast<Instruction>(LHS);
1551 auto *I2 = dyn_cast<Instruction>(RHS);
1552 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1553 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1554 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1555 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1556 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1557 ShallowScoreAtThisLevel))
1558 return ShallowScoreAtThisLevel;
1559 assert(I1 && I2 && "Should have early exited.");
1560
1561 // Contains the I2 operand indexes that got matched with I1 operands.
1562 SmallSet<unsigned, 4> Op2Used;
1563
1564 // Recursion towards the operands of I1 and I2. We are trying all possible
1565 // operand pairs, and keeping track of the best score.
1566 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1567 OpIdx1 != NumOperands1; ++OpIdx1) {
1568 // Try to pair the OpIdx1 operand of I1 with the best operand of I2.
1569 int MaxTmpScore = 0;
1570 unsigned MaxOpIdx2 = 0;
1571 bool FoundBest = false;
1572 // If I2 is commutative try all combinations.
1573 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1574 unsigned ToIdx = isCommutative(I2)
1575 ? I2->getNumOperands()
1576 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1577 assert(FromIdx <= ToIdx && "Bad index");
1578 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1579 // Skip operands already paired with OpIdx1.
1580 if (Op2Used.count(OpIdx2))
1581 continue;
1582 // Recursively calculate the cost at each level
1583 int TmpScore =
1584 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1585 I1, I2, CurrLevel + 1, std::nullopt);
1586 // Look for the best score.
1587 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1588 TmpScore > MaxTmpScore) {
1589 MaxTmpScore = TmpScore;
1590 MaxOpIdx2 = OpIdx2;
1591 FoundBest = true;
1592 }
1593 }
1594 if (FoundBest) {
1595 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1596 Op2Used.insert(MaxOpIdx2);
1597 ShallowScoreAtThisLevel += MaxTmpScore;
1598 }
1599 }
1600 return ShallowScoreAtThisLevel;
1601 }
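// Illustrative walk-through for the G1/G2 example in the comment above (a
// sketch; the scalar code and array names are hypothetical):
//   int G1 = A[0] + B[0];
//   int G2 = A[1] + B[1];
// getScoreAtLevelRec(G1, G2) sees two adds, so the shallow score is
// ScoreSameOpcode; it then recurses and pairs {A[0], A[1]} and {B[0], B[1]},
// each contributing ScoreConsecutiveLoads. For G1/G3 the best operand pair
// {A[0], C[0]} scores ScoreFail and contributes nothing, so the accumulated
// score of G1/G2 is higher.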
1602 };
1603 /// A helper data structure to hold the operands of a vector of instructions.
1604 /// This supports a fixed vector length for all operand vectors.
1605 class VLOperands {
1606 /// For each operand we need (i) the value, and (ii) the opcode that it
1607 /// would be attached to if the expression was in a left-linearized form.
1608 /// This is required to avoid illegal operand reordering.
1609 /// For example:
1610 /// \verbatim
1611 /// 0 Op1
1612 /// |/
1613 /// Op1 Op2 Linearized + Op2
1614 /// \ / ----------> |/
1615 /// - -
1616 ///
1617 /// Op1 - Op2 (0 + Op1) - Op2
1618 /// \endverbatim
1619 ///
1620 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1621 ///
1622 /// Another way to think of this is to track all the operations across the
1623 /// path from the operand all the way to the root of the tree and to
1624 /// calculate the operation that corresponds to this path. For example, the
1625 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1626 /// corresponding operation is a '-' (which matches the one in the
1627 /// linearized tree, as shown above).
1628 ///
1629 /// For lack of a better term, we refer to this operation as Accumulated
1630 /// Path Operation (APO).
1631 struct OperandData {
1632 OperandData() = default;
1633 OperandData(Value *V, bool APO, bool IsUsed)
1634 : V(V), APO(APO), IsUsed(IsUsed) {}
1635 /// The operand value.
1636 Value *V = nullptr;
1637 /// TreeEntries only allow a single opcode, or an alternate sequence of
1638 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1639 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1640 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1641 /// (e.g., Add/Mul)
1642 bool APO = false;
1643 /// Helper data for the reordering function.
1644 bool IsUsed = false;
1645 };
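// A small concrete example of the APO flags (a sketch; the scalar code is
// hypothetical):
//   a[0] = x[0] + y[0]; // lane 0: operand 0 = x[0] (APO=false),
//                       //         operand 1 = y[0] (APO=false)
//   a[1] = x[1] - y[1]; // lane 1: operand 0 = x[1] (APO=false),
//                       //         operand 1 = y[1] (APO=true, RHS of a sub)
// During reordering, operands are only exchanged between positions whose APO
// bits match, otherwise the linearized semantics would change.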
1646
1647 /// During operand reordering, we are trying to select the operand at lane
1648 /// that matches best with the operand at the neighboring lane. Our
1649 /// selection is based on the type of value we are looking for. For example,
1650 /// if the neighboring lane has a load, we need to look for a load that is
1651 /// accessing a consecutive address. These strategies are summarized in the
1652 /// 'ReorderingMode' enumerator.
1653 enum class ReorderingMode {
1654 Load, ///< Matching loads to consecutive memory addresses
1655 Opcode, ///< Matching instructions based on opcode (same or alternate)
1656 Constant, ///< Matching constants
1657 Splat, ///< Matching the same instruction multiple times (broadcast)
1658 Failed, ///< We failed to create a vectorizable group
1659 };
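// For example (a sketch of how the modes are picked in reorder() below): if
// the operand of the first visited lane is
//   a LoadInst                       -> ReorderingMode::Load
//   an Instruction worth splatting   -> ReorderingMode::Splat
//   any other Instruction            -> ReorderingMode::Opcode
//   a Constant                       -> ReorderingMode::Constant
//   a function Argument              -> ReorderingMode::Splat (best hope)
// The chosen mode then drives what getBestOperand() searches for in the
// remaining lanes.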
1660
1661 using OperandDataVec = SmallVector<OperandData, 2>;
1662
1663 /// A vector of operand vectors.
1664 SmallVector<OperandDataVec, 4> OpsVec;
1665
1666 const TargetLibraryInfo &TLI;
1667 const DataLayout &DL;
1668 ScalarEvolution &SE;
1669 const BoUpSLP &R;
1670 const Loop *L = nullptr;
1671
1672 /// \returns the operand data at \p OpIdx and \p Lane.
1673 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1674 return OpsVec[OpIdx][Lane];
1675 }
1676
1677 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1678 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1679 return OpsVec[OpIdx][Lane];
1680 }
1681
1682 /// Clears the used flag for all entries.
1683 void clearUsed() {
1684 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1685 OpIdx != NumOperands; ++OpIdx)
1686 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1687 ++Lane)
1688 OpsVec[OpIdx][Lane].IsUsed = false;
1689 }
1690
1691 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1692 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1693 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1694 }
1695
1696 /// \param Lane lane of the operands under analysis.
1697 /// \param OpIdx operand index in lane \p Lane for which we are looking
1698 /// for the best candidate.
1699 /// \param Idx operand index of the current candidate value.
1700 /// \returns The additional score due to possible broadcasting of the
1701 /// elements in the lane. It is more profitable to have a power-of-2 number
1702 /// of unique elements in the lane, since such a lane is more likely to be
1703 /// vectorized after duplicates are removed. Currently the SLP vectorizer
1704 /// supports only vectorization of a power-of-2 number of unique scalars.
1705 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1706 Value *IdxLaneV = getData(Idx, Lane).V;
1707 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1708 return 0;
1709 SmallPtrSet<Value *, 4> Uniques;
1710 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1711 if (Ln == Lane)
1712 continue;
1713 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1714 if (!isa<Instruction>(OpIdxLnV))
1715 return 0;
1716 Uniques.insert(OpIdxLnV);
1717 }
1718 int UniquesCount = Uniques.size();
1719 int UniquesCntWithIdxLaneV =
1720 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1721 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1722 int UniquesCntWithOpIdxLaneV =
1723 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1724 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1725 return 0;
1726 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1727 UniquesCntWithOpIdxLaneV) -
1728 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1729 }
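// Worked example (hypothetical values): with 4 lanes, suppose the other
// lanes at OpIdx hold {A, A, B}, so UniquesCount == 2. If the candidate
// IdxLaneV == A (already counted) and the current OpIdxLaneV == C (a new
// value), then UniquesCntWithIdxLaneV == 2, UniquesCntWithOpIdxLaneV == 3,
// and the result is (PowerOf2Ceil(3) - 3) - (PowerOf2Ceil(2) - 2) == 1,
// i.e. a small bonus for keeping the number of unique scalars a power of 2.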
1730
1731 /// \param Lane lane of the operands under analysis.
1732 /// \param OpIdx operand index in lane \p Lane for which we are looking
1733 /// for the best candidate.
1734 /// \param Idx operand index of the current candidate value.
1735 /// \returns The additional score for the scalar which users are all
1736 /// vectorized.
1737 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1738 Value *IdxLaneV = getData(Idx, Lane).V;
1739 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1740 // Do not care about number of uses for vector-like instructions
1741 // (extractelement/extractvalue with constant indices), they are extracts
1742 // themselves and already externally used. Vectorization of such
1743 // instructions does not add extra extractelement instruction, just may
1744 // remove it.
1745 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1746 isVectorLikeInstWithConstOps(OpIdxLaneV))
1747 return LookAheadHeuristics::ScoreAllUserVectorized;
1748 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1749 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1750 return 0;
1751 return R.areAllUsersVectorized(IdxLaneI)
1752 ? LookAheadHeuristics::ScoreAllUserVectorized
1753 : 0;
1754 }
1755
1756 /// Score scaling factor for fully compatible instructions but with
1757 /// different number of external uses. Allows better selection of the
1758 /// instructions with less external uses.
1759 static const int ScoreScaleFactor = 10;
1760
1761 /// \Returns the look-ahead score, which tells us how much the sub-trees
1762 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1763 /// score. This helps break ties in an informed way when we cannot decide on
1764 /// the order of the operands by just considering the immediate
1765 /// predecessors.
1766 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1767 int Lane, unsigned OpIdx, unsigned Idx,
1768 bool &IsUsed) {
1769 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1770 LookAheadMaxDepth);
1771 // Keep track of the instruction stack as we recurse into the operands
1772 // during the look-ahead score exploration.
1773 int Score =
1774 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1775 /*CurrLevel=*/1, MainAltOps);
1776 if (Score) {
1777 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1778 if (Score <= -SplatScore) {
1779 // Set the minimum score for splat-like sequence to avoid setting
1780 // failed state.
1781 Score = 1;
1782 } else {
1783 Score += SplatScore;
1784 // Scale score to see the difference between different operands
1785 // and similar operands but all vectorized/not all vectorized
1786 // uses. It does not affect actual selection of the best
1787 // compatible operand in general, just allows to select the
1788 // operand with all vectorized uses.
1789 Score *= ScoreScaleFactor;
1790 Score += getExternalUseScore(Lane, OpIdx, Idx);
1791 IsUsed = true;
1792 }
1793 }
1794 return Score;
1795 }
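// Shape of the combined score (a sketch with hypothetical numbers): if the
// splat score P is negative and outweighs the raw look-ahead score S
// (S <= -P), the result is clamped to 1; otherwise it becomes
//   (S + P) * ScoreScaleFactor + getExternalUseScore(...)
// e.g. S == 4 and P == 1 with an external-use bonus of 1 yields
// (4 + 1) * 10 + 1 == 51.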
1796
1797 /// Best defined scores per lanes between the passes. Used to choose the
1798 /// best operand (with the highest score) between the passes.
1799 /// The key - {Operand Index, Lane}.
1800 /// The value - the best score between the passes for the lane and the
1801 /// operand.
1802 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1803 BestScoresPerLanes;
1804
1805 // Search all operands in Ops[*][Lane] for the one that best matches
1806 // Ops[OpIdx][LastLane] and return its operand index.
1807 // If no good match can be found, return std::nullopt.
1808 std::optional<unsigned>
1809 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1810 ArrayRef<ReorderingMode> ReorderingModes,
1811 ArrayRef<Value *> MainAltOps) {
1812 unsigned NumOperands = getNumOperands();
1813
1814 // The operand of the previous lane at OpIdx.
1815 Value *OpLastLane = getData(OpIdx, LastLane).V;
1816
1817 // Our strategy mode for OpIdx.
1818 ReorderingMode RMode = ReorderingModes[OpIdx];
1819 if (RMode == ReorderingMode::Failed)
1820 return std::nullopt;
1821
1822 // The linearized opcode of the operand at OpIdx, Lane.
1823 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1824
1825 // The best operand index and its score.
1826 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1827 // are using the score to differentiate between the two.
1828 struct BestOpData {
1829 std::optional<unsigned> Idx;
1830 unsigned Score = 0;
1831 } BestOp;
1832 BestOp.Score =
1833 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1834 .first->second;
1835
1836 // Track if the operand must be marked as used. If the operand is set to
1837 // Score 1 explicitly (because of a non-power-of-2 number of unique
1838 // scalars), we may want to re-estimate the operands on later iterations.
1839 bool IsUsed = RMode == ReorderingMode::Splat ||
1840 RMode == ReorderingMode::Constant ||
1841 RMode == ReorderingMode::Load;
1842 // Iterate through all unused operands and look for the best.
1843 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1844 // Get the operand at Idx and Lane.
1845 OperandData &OpData = getData(Idx, Lane);
1846 Value *Op = OpData.V;
1847 bool OpAPO = OpData.APO;
1848
1849 // Skip already selected operands.
1850 if (OpData.IsUsed)
1851 continue;
1852
1853 // Skip if we are trying to move the operand to a position with a
1854 // different opcode in the linearized tree form. This would break the
1855 // semantics.
1856 if (OpAPO != OpIdxAPO)
1857 continue;
1858
1859 // Look for an operand that matches the current mode.
1860 switch (RMode) {
1861 case ReorderingMode::Load:
1862 case ReorderingMode::Opcode: {
1863 bool LeftToRight = Lane > LastLane;
1864 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1865 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1866 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1867 OpIdx, Idx, IsUsed);
1868 if (Score > static_cast<int>(BestOp.Score) ||
1869 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
1870 Idx == OpIdx)) {
1871 BestOp.Idx = Idx;
1872 BestOp.Score = Score;
1873 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1874 }
1875 break;
1876 }
1877 case ReorderingMode::Constant:
1878 if (isa<Constant>(Op) ||
1879 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
1880 BestOp.Idx = Idx;
1881 if (isa<Constant>(Op)) {
1882 BestOp.Score = LookAheadHeuristics::ScoreConstants;
1883 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1884 LookAheadHeuristics::ScoreConstants;
1885 }
1886 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
1887 IsUsed = false;
1888 }
1889 break;
1890 case ReorderingMode::Splat:
1891 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
1892 IsUsed = Op == OpLastLane;
1893 if (Op == OpLastLane) {
1894 BestOp.Score = LookAheadHeuristics::ScoreSplat;
1895 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1896 LookAheadHeuristics::ScoreSplat;
1897 }
1898 BestOp.Idx = Idx;
1899 }
1900 break;
1901 case ReorderingMode::Failed:
1902 llvm_unreachable("Not expected Failed reordering mode.");
1903 }
1904 }
1905
1906 if (BestOp.Idx) {
1907 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1908 return BestOp.Idx;
1909 }
1910 // If we could not find a good match return std::nullopt.
1911 return std::nullopt;
1912 }
1913
1914 /// Helper for reorderOperandVecs.
1915 /// \returns the lane that we should start reordering from. This is the one
1916 /// with the fewest operands that can freely move about, or the one least
1917 /// profitable to reorder because it already has the most optimal operands.
1918 unsigned getBestLaneToStartReordering() const {
1919 unsigned Min = UINT_MAX;
1920 unsigned SameOpNumber = 0;
1921 // std::pair<unsigned, unsigned> is used to implement a simple voting
1922 // algorithm and choose the lane with the least number of operands that
1923 // can freely move about, or that is less profitable because it already
1924 // has the most optimal set of operands. The first unsigned is a counter for
1925 // voting, the second unsigned is the counter of lanes with instructions
1926 // with same/alternate opcodes and same parent basic block.
1928 // Try to be closer to the original results, if we have multiple lanes
1929 // with same cost. If 2 lanes have the same cost, use the one with the
1930 // lowest index.
1931 for (int I = getNumLanes(); I > 0; --I) {
1932 unsigned Lane = I - 1;
1933 OperandsOrderData NumFreeOpsHash =
1934 getMaxNumOperandsThatCanBeReordered(Lane);
1935 // Compare the number of operands that can move and choose the one with
1936 // the least number.
1937 if (NumFreeOpsHash.NumOfAPOs < Min) {
1938 Min = NumFreeOpsHash.NumOfAPOs;
1939 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1940 HashMap.clear();
1941 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1942 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1943 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1944 // Select the most optimal lane in terms of number of operands that
1945 // should be moved around.
1946 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1947 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1948 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1949 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1950 auto *It = HashMap.find(NumFreeOpsHash.Hash);
1951 if (It == HashMap.end())
1952 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1953 else
1954 ++It->second.first;
1955 }
1956 }
1957 // Select the lane with the minimum counter.
1958 unsigned BestLane = 0;
1959 unsigned CntMin = UINT_MAX;
1960 for (const auto &Data : reverse(HashMap)) {
1961 if (Data.second.first < CntMin) {
1962 CntMin = Data.second.first;
1963 BestLane = Data.second.second;
1964 }
1965 }
1966 return BestLane;
1967 }
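// Worked numbers for the example shown in reorder() below (a sketch): in an
// addition lane both operands have APO == false, so NumOfAPOs ==
// max(0, 2) == 2; in a subtraction lane one operand is true and one false,
// so NumOfAPOs == max(1, 1) == 1. The subtraction lanes therefore vote as
// "least movable" and one of them is picked as the starting lane.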
1968
1969 /// Data structure that helps to reorder operands.
1970 struct OperandsOrderData {
1971 /// The best number of operands with the same APOs, which can be
1972 /// reordered.
1973 unsigned NumOfAPOs = UINT_MAX;
1974 /// Number of operands with the same/alternate instruction opcode and
1975 /// parent.
1976 unsigned NumOpsWithSameOpcodeParent = 0;
1977 /// Hash for the actual operands ordering.
1978 /// Used to count operands, actually their position id and opcode
1979 /// value. It is used in the voting mechanism to find the lane with the
1980 /// fewest operands that can freely move about, or that is least profitable
1981 /// to reorder because it already has the most optimal set of operands. Can be
1982 /// replaced with SmallVector<unsigned> instead but hash code is faster
1983 /// and requires less memory.
1984 unsigned Hash = 0;
1985 };
1986 /// \returns the maximum number of operands that are allowed to be reordered
1987 /// for \p Lane and the number of compatible instructions (with the same
1988 /// parent/opcode). This is used as a heuristic for selecting the first lane
1989 /// to start operand reordering.
1990 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1991 unsigned CntTrue = 0;
1992 unsigned NumOperands = getNumOperands();
1993 // Operands with the same APO can be reordered. We therefore need to count
1994 // how many of them we have for each APO, like this: Cnt[APO] = x.
1995 // Since we only have two APOs, namely true and false, we can avoid using
1996 // a map. Instead we can simply count the number of operands that
1997 // correspond to one of them (in this case the 'true' APO), and calculate
1998 // the other by subtracting it from the total number of operands.
1999 // Operands with the same instruction opcode and parent are more
2000 // profitable since we don't need to move them in many cases, with a high
2001 // probability such lane already can be vectorized effectively.
2002 bool AllUndefs = true;
2003 unsigned NumOpsWithSameOpcodeParent = 0;
2004 Instruction *OpcodeI = nullptr;
2005 BasicBlock *Parent = nullptr;
2006 unsigned Hash = 0;
2007 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2008 const OperandData &OpData = getData(OpIdx, Lane);
2009 if (OpData.APO)
2010 ++CntTrue;
2011 // Use Boyer-Moore majority voting for finding the majority opcode and
2012 // the number of times it occurs.
2013 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2014 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2015 I->getParent() != Parent) {
2016 if (NumOpsWithSameOpcodeParent == 0) {
2017 NumOpsWithSameOpcodeParent = 1;
2018 OpcodeI = I;
2019 Parent = I->getParent();
2020 } else {
2021 --NumOpsWithSameOpcodeParent;
2022 }
2023 } else {
2024 ++NumOpsWithSameOpcodeParent;
2025 }
2026 }
2027 Hash = hash_combine(
2028 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2029 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2030 }
2031 if (AllUndefs)
2032 return {};
2033 OperandsOrderData Data;
2034 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2035 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2036 Data.Hash = Hash;
2037 return Data;
2038 }
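// Example of the majority vote (hypothetical operand values of one lane,
// scanned across the operand positions): load %p, load %q, add %x, load %r
//   load %p : counter 0 -> 1 (candidate opcode: load)
//   load %q : same opcode/parent -> 2
//   add  %x : different          -> 1
//   load %r : same opcode/parent -> 2
// so NumOpsWithSameOpcodeParent == 2 is reported for this lane.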
2039
2040 /// Go through the instructions in VL and append their operands.
2041 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2042 assert(!VL.empty() && "Bad VL");
2043 assert((empty() || VL.size() == getNumLanes()) &&
2044 "Expected same number of lanes");
2045 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2046 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2047 constexpr unsigned IntrinsicNumOperands = 2;
2048 if (isa<IntrinsicInst>(VL[0]))
2049 NumOperands = IntrinsicNumOperands;
2050 OpsVec.resize(NumOperands);
2051 unsigned NumLanes = VL.size();
2052 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2053 OpsVec[OpIdx].resize(NumLanes);
2054 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2055 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2056 // Our tree has just 3 nodes: the root and two operands.
2057 // It is therefore trivial to get the APO. We only need to check the
2058 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2059 // RHS operand. The LHS operand of both add and sub is never attached
2060 // to an inverse operation in the linearized form, therefore its APO
2061 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2062
2063 // Since operand reordering is performed on groups of commutative
2064 // operations or alternating sequences (e.g., +, -), we can safely
2065 // tell the inverse operations by checking commutativity.
2066 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2067 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2068 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2069 APO, false};
2070 }
2071 }
2072 }
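// Resulting layout for a 2-lane bundle (a sketch; the values are
// hypothetical):
//   VL        = { a0 = x0 + y0, a1 = x1 - y1 }
//   OpsVec[0] = { {x0, APO=0}, {x1, APO=0} }
//   OpsVec[1] = { {y0, APO=0}, {y1, APO=1} }
// i.e. OpsVec is indexed as OpsVec[operand index][lane], and only the RHS of
// the non-commutative subtraction gets APO == true.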
2073
2074 /// \returns the number of operands.
2075 unsigned getNumOperands() const { return OpsVec.size(); }
2076
2077 /// \returns the number of lanes.
2078 unsigned getNumLanes() const { return OpsVec[0].size(); }
2079
2080 /// \returns the operand value at \p OpIdx and \p Lane.
2081 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2082 return getData(OpIdx, Lane).V;
2083 }
2084
2085 /// \returns true if the data structure is empty.
2086 bool empty() const { return OpsVec.empty(); }
2087
2088 /// Clears the data.
2089 void clear() { OpsVec.clear(); }
2090
2091 /// \Returns true if there are enough operands identical to \p Op to fill
2092 /// the whole vector (it is mixed with constants or loop invariant values).
2093 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
2094 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2095 bool OpAPO = getData(OpIdx, Lane).APO;
2096 bool IsInvariant = L && L->isLoopInvariant(Op);
2097 unsigned Cnt = 0;
2098 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2099 if (Ln == Lane)
2100 continue;
2101 // This is set to true if we found a candidate for broadcast at Lane.
2102 bool FoundCandidate = false;
2103 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2104 OperandData &Data = getData(OpI, Ln);
2105 if (Data.APO != OpAPO || Data.IsUsed)
2106 continue;
2107 Value *OpILane = getValue(OpI, Lane);
2108 bool IsConstantOp = isa<Constant>(OpILane);
2109 // Consider the broadcast candidate if:
2110 // 1. Same value is found in one of the operands.
2111 if (Data.V == Op ||
2112 // 2. The operand in the given lane is not constant but there is a
2113 // constant operand in another lane (which can be moved to the
2114 // given lane). In this case we can represent it as a simple
2115 // permutation of constant and broadcast.
2116 (!IsConstantOp &&
2117 ((Lns > 2 && isa<Constant>(Data.V)) ||
2118 // 2.1. If we have only 2 lanes, need to check that value in the
2119 // next lane does not build same opcode sequence.
2120 (Lns == 2 &&
2121 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2122 .getOpcode() &&
2123 isa<Constant>(Data.V)))) ||
2124 // 3. The operand in the current lane is loop invariant (can be
2125 // hoisted out) and another operand is also a loop invariant
2126 // (though not a constant). In this case the whole vector can be
2127 // hoisted out.
2128 // FIXME: need to teach the cost model about this case for better
2129 // estimation.
2130 (IsInvariant && !isa<Constant>(Data.V) &&
2131 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2132 L->isLoopInvariant(Data.V))) {
2133 FoundCandidate = true;
2134 Data.IsUsed = Data.V == Op;
2135 if (Data.V == Op)
2136 ++Cnt;
2137 break;
2138 }
2139 }
2140 if (!FoundCandidate)
2141 return false;
2142 }
2143 return getNumLanes() == 2 || Cnt > 1;
2144 }
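// Rough example (hypothetical): with 4 lanes and Op == %v, if the other
// three lanes offer { %v, 7 /*constant*/, %v } as candidates for this
// operand position, every lane has a usable candidate and %v itself occurs
// more than once (Cnt > 1), so the function returns true: the bundle can be
// emitted as a broadcast of %v combined with a constant permutation.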
2145
2146 public:
2147 /// Initialize with all the operands of the instruction vector \p RootVL.
2148 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2149 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2150 L(R.LI->getLoopFor(
2151 (cast<Instruction>(RootVL.front())->getParent()))) {
2152 // Append all the operands of RootVL.
2153 appendOperandsOfVL(RootVL);
2154 }
2155
2156 /// \Returns a value vector with the operands across all lanes for the
2157 /// operand at \p OpIdx.
2158 ValueList getVL(unsigned OpIdx) const {
2159 ValueList OpVL(OpsVec[OpIdx].size());
2160 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2161 "Expected same num of lanes across all operands");
2162 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2163 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2164 return OpVL;
2165 }
2166
2167 // Performs operand reordering for 2 or more operands.
2168 // The original operands are in OrigOps[OpIdx][Lane].
2169 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2170 void reorder() {
2171 unsigned NumOperands = getNumOperands();
2172 unsigned NumLanes = getNumLanes();
2173 // Each operand has its own mode. We are using this mode to help us select
2174 // the instructions for each lane, so that they match best with the ones
2175 // we have selected so far.
2176 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2177
2178 // This is a greedy single-pass algorithm. We are going over each lane
2179 // once and deciding on the best order right away with no back-tracking.
2180 // However, in order to increase its effectiveness, we start with the lane
2181 // that has operands that can move the least. For example, given the
2182 // following lanes:
2183 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2184 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2185 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2186 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2187 // we will start at Lane 1, since the operands of the subtraction cannot
2188 // be reordered. Then we will visit the rest of the lanes in a circular
2189 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2190
2191 // Find the first lane that we will start our search from.
2192 unsigned FirstLane = getBestLaneToStartReordering();
2193
2194 // Initialize the modes.
2195 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2196 Value *OpLane0 = getValue(OpIdx, FirstLane);
2197 // Keep track if we have instructions with all the same opcode on one
2198 // side.
2199 if (isa<LoadInst>(OpLane0))
2200 ReorderingModes[OpIdx] = ReorderingMode::Load;
2201 else if (isa<Instruction>(OpLane0)) {
2202 // Check if OpLane0 should be broadcast.
2203 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2204 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2205 else
2206 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2207 }
2208 else if (isa<Constant>(OpLane0))
2209 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2210 else if (isa<Argument>(OpLane0))
2211 // Our best hope is a Splat. It may save some cost in some cases.
2212 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2213 else
2214 // NOTE: This should be unreachable.
2215 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2216 }
2217
2218 // Check that we don't have same operands. No need to reorder if operands
2219 // are just a perfect diamond or shuffled diamond match. The only exceptions
2220 // (for now) are possible broadcasts and a non-power-of-2 number of
2221 // scalars.
2222 auto &&SkipReordering = [this]() {
2223 SmallPtrSet<Value *, 4> UniqueValues;
2224 ArrayRef<OperandData> Op0 = OpsVec.front();
2225 for (const OperandData &Data : Op0)
2226 UniqueValues.insert(Data.V);
2227 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2228 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2229 return !UniqueValues.contains(Data.V);
2230 }))
2231 return false;
2232 }
2233 // TODO: Check if we can remove a check for non-power-2 number of
2234 // scalars after full support of non-power-2 vectorization.
2235 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2236 };
2237
2238 // If the initial strategy fails for any of the operand indexes, then we
2239 // perform reordering again in a second pass. This helps avoid assigning
2240 // high priority to the failed strategy, and should improve reordering for
2241 // the non-failed operand indexes.
2242 for (int Pass = 0; Pass != 2; ++Pass) {
2243 // Check if there is no need to reorder the operands because they already
2244 // form a perfect or shuffled diamond match.
2245 // Need to do it to avoid extra external use cost counting for
2246 // shuffled matches, which may cause regressions.
2247 if (SkipReordering())
2248 break;
2249 // Skip the second pass if the first pass did not fail.
2250 bool StrategyFailed = false;
2251 // Mark all operand data as free to use.
2252 clearUsed();
2253 // We keep the original operand order for the FirstLane, so reorder the
2254 // rest of the lanes. We are visiting the nodes in a circular fashion,
2255 // using FirstLane as the center point and increasing the radius
2256 // distance.
2257 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2258 for (unsigned I = 0; I < NumOperands; ++I)
2259 MainAltOps[I].push_back(getData(I, FirstLane).V);
2260
2261 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2262 // Visit the lane on the right and then the lane on the left.
2263 for (int Direction : {+1, -1}) {
2264 int Lane = FirstLane + Direction * Distance;
2265 if (Lane < 0 || Lane >= (int)NumLanes)
2266 continue;
2267 int LastLane = Lane - Direction;
2268 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2269 "Out of bounds");
2270 // Look for a good match for each operand.
2271 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2272 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2273 std::optional<unsigned> BestIdx = getBestOperand(
2274 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2275 // By not selecting a value, we allow the operands that follow to
2276 // select a better matching value. We will get a non-null value in
2277 // the next run of getBestOperand().
2278 if (BestIdx) {
2279 // Swap the current operand with the one returned by
2280 // getBestOperand().
2281 swap(OpIdx, *BestIdx, Lane);
2282 } else {
2283 // Enable the second pass.
2284 StrategyFailed = true;
2285 }
2286 // Try to get the alternate opcode and follow it during analysis.
2287 if (MainAltOps[OpIdx].size() != 2) {
2288 OperandData &AltOp = getData(OpIdx, Lane);
2289 InstructionsState OpS =
2290 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2291 if (OpS.getOpcode() && OpS.isAltShuffle())
2292 MainAltOps[OpIdx].push_back(AltOp.V);
2293 }
2294 }
2295 }
2296 }
2297 // Skip second pass if the strategy did not fail.
2298 if (!StrategyFailed)
2299 break;
2300 }
2301 }
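// Visiting order for the example above (a sketch): with FirstLane == 1 and
// 4 lanes, the Distance/Direction loops produce lanes 2 (1+1), 0 (1-1) and
// 3 (1+2); lane -1 and anything >= 4 are out of range and skipped, matching
// the "Visited 1st/2nd/3rd/4th" annotations in the comment.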
2302
2303#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2304 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2305 switch (RMode) {
2306 case ReorderingMode::Load:
2307 return "Load";
2308 case ReorderingMode::Opcode:
2309 return "Opcode";
2310 case ReorderingMode::Constant:
2311 return "Constant";
2312 case ReorderingMode::Splat:
2313 return "Splat";
2314 case ReorderingMode::Failed:
2315 return "Failed";
2316 }
2317 llvm_unreachable("Unimplemented Reordering Type");
2318 }
2319
2320 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2321 raw_ostream &OS) {
2322 return OS << getModeStr(RMode);
2323 }
2324
2325 /// Debug print.
2326 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2327 printMode(RMode, dbgs());
2328 }
2329
2330 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2331 return printMode(RMode, OS);
2332 }
2333
2334 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2335 const unsigned Indent = 2;
2336 unsigned Cnt = 0;
2337 for (const OperandDataVec &OpDataVec : OpsVec) {
2338 OS << "Operand " << Cnt++ << "\n";
2339 for (const OperandData &OpData : OpDataVec) {
2340 OS.indent(Indent) << "{";
2341 if (Value *V = OpData.V)
2342 OS << *V;
2343 else
2344 OS << "null";
2345 OS << ", APO:" << OpData.APO << "}\n";
2346 }
2347 OS << "\n";
2348 }
2349 return OS;
2350 }
2351
2352 /// Debug print.
2353 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2354#endif
2355 };
2356
2357 /// Evaluate each pair in \p Candidates and return the index into
2358 /// \p Candidates of the pair with the highest score, deemed to have the best
2359 /// chance to form the root of a profitable tree to vectorize. Return
2360 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2361 /// \param Limit Lower limit of the score considered to be good enough.
2362 std::optional<int>
2363 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2364 int Limit = LookAheadHeuristics::ScoreFail) const {
2365 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2366 RootLookAheadMaxDepth);
2367 int BestScore = Limit;
2368 std::optional<int> Index;
2369 for (int I : seq<int>(0, Candidates.size())) {
2370 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2371 Candidates[I].second,
2372 /*U1=*/nullptr, /*U2=*/nullptr,
2373 /*Level=*/1, std::nullopt);
2374 if (Score > BestScore) {
2375 BestScore = Score;
2376 Index = I;
2377 }
2378 }
2379 return Index;
2380 }
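// Typical use (a sketch; the candidate values are hypothetical): given
//   Candidates = { {A[0], A[1]}, {A[0], B[7]} }
// the first pair's sub-trees match (e.g. consecutive loads feeding the same
// opcodes) and score above Limit, so index 0 is returned; if nothing beats
// Limit, std::nullopt is returned instead.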
2381
2382 /// Checks if the instruction is marked for deletion.
2383 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2384
2385 /// Removes an instruction from its block and eventually deletes it.
2386 /// It's like Instruction::eraseFromParent() except that the actual deletion
2387 /// is delayed until BoUpSLP is destructed.
2388 void eraseInstruction(Instruction *I) {
2389 DeletedInstructions.insert(I);
2390 }
2391
2392 /// Checks if the instruction was already analyzed for being possible
2393 /// reduction root.
2395 return AnalyzedReductionsRoots.count(I);
2396 }
2397 /// Register given instruction as already analyzed for being possible
2398 /// reduction root.
2400 AnalyzedReductionsRoots.insert(I);
2401 }
2402 /// Checks if the provided list of reduced values was checked already for
2403 /// vectorization.
2405 return AnalyzedReductionVals.contains(hash_value(VL));
2406 }
2407 /// Adds the list of reduced values to list of already checked values for the
2408 /// vectorization.
2410 AnalyzedReductionVals.insert(hash_value(VL));
2411 }
2412 /// Clear the list of the analyzed reduction root instructions.
2414 AnalyzedReductionsRoots.clear();
2415 AnalyzedReductionVals.clear();
2416 AnalyzedMinBWVals.clear();
2417 }
2418 /// Checks if any of the given values is gathered in one of the nodes.
2419 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2420 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2421 }
2422 /// Checks if the given value is gathered in one of the nodes.
2423 bool isGathered(const Value *V) const {
2424 return MustGather.contains(V);
2425 }
2426 /// Checks if the specified value was not scheduled.
2427 bool isNotScheduled(const Value *V) const {
2428 return NonScheduledFirst.contains(V);
2429 }
2430
2431 /// Check if the value is vectorized in the tree.
2432 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2433
2434 ~BoUpSLP();
2435
2436private:
2437 /// Determine if a node \p E can be demoted to a smaller type with a
2438 /// truncation. We collect the entries that will be demoted in ToDemote.
2439 /// \param E Node for analysis
2440 /// \param ToDemote indices of the nodes to be demoted.
2441 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2442 unsigned &BitWidth,
2443 SmallVectorImpl<unsigned> &ToDemote,
2445 unsigned &MaxDepthLevel,
2446 bool &IsProfitableToDemote,
2447 bool IsTruncRoot) const;
2448
2449 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2450 /// reordering (i.e. the operands can be reordered because they have only one
2451 /// user and are reorderable).
2452 /// \param ReorderableGathers List of all gather nodes that require reordering
2453 /// (e.g., gather of extractelements or partially vectorizable loads).
2454 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2455 /// reordering, subset of \p NonVectorized.
2456 bool
2457 canReorderOperands(TreeEntry *UserTE,
2458 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2459 ArrayRef<TreeEntry *> ReorderableGathers,
2460 SmallVectorImpl<TreeEntry *> &GatherOps);
2461
2462 /// Checks if the given \p TE is a gather node with clustered reused scalars
2463 /// and reorders it per given \p Mask.
2464 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2465
2466 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2467 /// if any. If it is not vectorized (gather node), returns nullptr.
2468 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2469 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2470 TreeEntry *TE = nullptr;
2471 const auto *It = find_if(VL, [&](Value *V) {
2472 TE = getTreeEntry(V);
2473 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2474 return true;
2475 auto It = MultiNodeScalars.find(V);
2476 if (It != MultiNodeScalars.end()) {
2477 for (TreeEntry *E : It->second) {
2478 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2479 TE = E;
2480 return true;
2481 }
2482 }
2483 }
2484 return false;
2485 });
2486 if (It != VL.end()) {
2487 assert(TE->isSame(VL) && "Expected same scalars.");
2488 return TE;
2489 }
2490 return nullptr;
2491 }
2492
2493 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2494 /// if any. If it is not vectorized (gather node), returns nullptr.
2495 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2496 unsigned OpIdx) const {
2497 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2498 const_cast<TreeEntry *>(UserTE), OpIdx);
2499 }
2500
2501 /// Checks if all users of \p I are the part of the vectorization tree.
2502 bool areAllUsersVectorized(
2503 Instruction *I,
2504 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2505
2506 /// Return information about the vector formed for the specified index
2507 /// of a vector of (the same) instruction.
2509
2510 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2511 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2512
2513 /// \returns Cast context for the given graph node.
2514 TargetTransformInfo::CastContextHint
2515 getCastContextHint(const TreeEntry &TE) const;
2516
2517 /// \returns the cost of the vectorizable entry.
2518 InstructionCost getEntryCost(const TreeEntry *E,
2519 ArrayRef<Value *> VectorizedVals,
2520 SmallPtrSetImpl<Value *> &CheckedExtracts);
2521
2522 /// This is the recursive part of buildTree.
2523 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2524 const EdgeInfo &EI);
2525
2526 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2527 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2528 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2529 /// returns false, setting \p CurrentOrder to either an empty vector or a
2530 /// non-identity permutation that allows to reuse extract instructions.
2531 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2532 /// extract order.
2533 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2534 SmallVectorImpl<unsigned> &CurrentOrder,
2535 bool ResizeAllowed = false) const;
2536
2537 /// Vectorize a single entry in the tree.
2538 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2539 /// avoid issues with def-use order.
2540 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2541
2542 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2543 /// \p E.
2544 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2545 /// avoid issues with def-use order.
2546 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2547
2548 /// Create a new vector from a list of scalar values. Produces a sequence
2549 /// which exploits values reused across lanes, and arranges the inserts
2550 /// for ease of later optimization.
2551 template <typename BVTy, typename ResTy, typename... Args>
2552 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2553
2554 /// Create a new vector from a list of scalar values. Produces a sequence
2555 /// which exploits values reused across lanes, and arranges the inserts
2556 /// for ease of later optimization.
2557 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2558
2559 /// Returns the instruction in the bundle, which can be used as a base point
2560 /// for scheduling. Usually it is the last instruction in the bundle, except
2561 /// for the case when all operands are external (in this case, it is the first
2562 /// instruction in the list).
2563 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2564
2565 /// Tries to find extractelement instructions with constant indices from fixed
2566 /// vector type and gather such instructions into a bunch, which highly likely
2567 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2568 /// was successful, the matched scalars are replaced by poison values in \p VL
2569 /// for future analysis.
2570 std::optional<TargetTransformInfo::ShuffleKind>
2571 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2572 SmallVectorImpl<int> &Mask) const;
2573
2574 /// Tries to find extractelement instructions with constant indices from fixed
2575 /// vector type and gather such instructions into a bunch, which highly likely
2576 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2577 /// was successful, the matched scalars are replaced by poison values in \p VL
2578 /// for future analysis.
2580 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2582 unsigned NumParts) const;
2583
2584 /// Checks if the gathered \p VL can be represented as a single register
2585 /// shuffle(s) of previous tree entries.
2586 /// \param TE Tree entry checked for permutation.
2587 /// \param VL List of scalars (a subset of the TE scalar), checked for
2588 /// permutations. Must form single-register vector.
2589 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2590 /// commands to build the mask using the original vector value, without
2591 /// relying on the potential reordering.
2592 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2593 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2594 std::optional<TargetTransformInfo::ShuffleKind>
2595 isGatherShuffledSingleRegisterEntry(
2596 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2597 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2598 bool ForOrder);
2599
2600 /// Checks if the gathered \p VL can be represented as multi-register
2601 /// shuffle(s) of previous tree entries.
2602 /// \param TE Tree entry checked for permutation.
2603 /// \param VL List of scalars (a subset of the TE scalar), checked for
2604 /// permutations.
2605 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2606 /// commands to build the mask using the original vector value, without
2607 /// relying on the potential reordering.
2608 /// \returns per-register series of ShuffleKind, if gathered values can be
2609 /// represented as shuffles of previous tree entries. \p Mask is filled with
2610 /// the shuffle mask (also on per-register base).
2612 isGatherShuffledEntry(
2613 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2615 unsigned NumParts, bool ForOrder = false);
2616
2617 /// \returns the scalarization cost for this list of values. Assuming that
2618 /// this subtree gets vectorized, we may need to extract the values from the
2619 /// roots. This method calculates the cost of extracting the values.
2620 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2621 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2622 Type *ScalarTy) const;
2623
2624 /// Set the Builder insert point to one after the last instruction in
2625 /// the bundle
2626 void setInsertPointAfterBundle(const TreeEntry *E);
2627
2628 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2629 /// specified, the starting vector value is poison.
2630 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2631
2632 /// \returns whether the VectorizableTree is fully vectorizable and will
2633 /// be beneficial even if the tree height is tiny.
2634 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2635
2636 /// Reorder commutative or alt operands to get better probability of
2637 /// generating vectorized code.
2638 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2639 SmallVectorImpl<Value *> &Left,
2640 SmallVectorImpl<Value *> &Right,
2641 const BoUpSLP &R);
2642
2643 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2644 /// users of \p TE and collects the stores. It returns the map from the store
2645 /// pointers to the collected stores.
2647 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2648
2649 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2650 /// stores in \p StoresVec can form a vector instruction. If so it returns
2651 /// true and populates \p ReorderIndices with the shuffle indices of the
2652 /// stores when compared to the sorted vector.
2653 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2654 OrdersType &ReorderIndices) const;
2655
2656 /// Iterates through the users of \p TE, looking for scalar stores that can be
2657 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2658 /// their order and builds an order index vector for each store bundle. It
2659 /// returns all these order vectors found.
2660 /// We run this after the tree has formed, otherwise we may come across user
2661 /// instructions that are not yet in the tree.
2663 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2664
2665 struct TreeEntry {
2666 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2667 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2668
2669 /// \returns Common mask for reorder indices and reused scalars.
2670 SmallVector<int> getCommonMask() const {
2671 SmallVector<int> Mask;
2672 inversePermutation(ReorderIndices, Mask);
2673 ::addMask(Mask, ReuseShuffleIndices);
2674 return Mask;
2675 }
2676
2677 /// \returns true if the scalars in VL are equal to this entry.
2678 bool isSame(ArrayRef<Value *> VL) const {
2679 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2680 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2681 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2682 return VL.size() == Mask.size() &&
2683 std::equal(VL.begin(), VL.end(), Mask.begin(),
2684 [Scalars](Value *V, int Idx) {
2685 return (isa<UndefValue>(V) &&
2686 Idx == PoisonMaskElem) ||
2687 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2688 });
2689 };
2690 if (!ReorderIndices.empty()) {
2691 // TODO: implement matching if the nodes are just reordered, still can
2692 // treat the vector as the same if the list of scalars matches VL
2693 // directly, without reordering.
2694 SmallVector<int> Mask;
2695 inversePermutation(ReorderIndices, Mask);
2696 if (VL.size() == Scalars.size())
2697 return IsSame(Scalars, Mask);
2698 if (VL.size() == ReuseShuffleIndices.size()) {
2699 ::addMask(Mask, ReuseShuffleIndices);
2700 return IsSame(Scalars, Mask);
2701 }
2702 return false;
2703 }
2704 return IsSame(Scalars, ReuseShuffleIndices);
2705 }
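// Example (hypothetical): with Scalars == {a, b}, ReuseShuffleIndices ==
// {0, 1, 0, 1} and empty ReorderIndices, isSame({a, b, a, b}) is true since
// every VL[i] equals Scalars[ReuseShuffleIndices[i]], while
// isSame({a, b, b, a}) is false.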
2706
2707 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2708 return State == TreeEntry::NeedToGather &&
2709 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2710 UserTreeIndices.front().UserTE == UserEI.UserTE;
2711 }
2712
2713 /// \returns true if current entry has same operands as \p TE.
2714 bool hasEqualOperands(const TreeEntry &TE) const {
2715 if (TE.getNumOperands() != getNumOperands())
2716 return false;
2717 SmallBitVector Used(getNumOperands());
2718 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2719 unsigned PrevCount = Used.count();
2720 for (unsigned K = 0; K < E; ++K) {
2721 if (Used.test(K))
2722 continue;
2723 if (getOperand(K) == TE.getOperand(I)) {
2724 Used.set(K);
2725 break;
2726 }
2727 }
2728 // Check if we actually found the matching operand.
2729 if (PrevCount == Used.count())
2730 return false;
2731 }
2732 return true;
2733 }
2734
2735 /// \return Final vectorization factor for the node. Defined by the total
2736 /// number of vectorized scalars, including those used several times in the
2737 /// entry and counted in the \a ReuseShuffleIndices, if any.
2738 unsigned getVectorFactor() const {
2739 if (!ReuseShuffleIndices.empty())
2740 return ReuseShuffleIndices.size();
2741 return Scalars.size();
2742 };
2743
2744 /// A vector of scalars.
2745 ValueList Scalars;
2746
2747 /// The Scalars are vectorized into this value. It is initialized to Null.
2748 WeakTrackingVH VectorizedValue = nullptr;
2749
2750 /// New vector phi instructions emitted for the vectorized phi nodes.
2751 PHINode *PHI = nullptr;
2752
2753 /// Do we need to gather this sequence or vectorize it
2754 /// (either with vector instruction or with scatter/gather
2755 /// intrinsics for store/load)?
2756 enum EntryState {
2757 Vectorize,
2758 ScatterVectorize,
2759 StridedVectorize,
2760 NeedToGather
2761 };
2762 EntryState State;
2763
2764 /// Does this sequence require some shuffling?
2765 SmallVector<int, 4> ReuseShuffleIndices;
2766
2767 /// Does this entry require reordering?
2768 SmallVector<unsigned, 4> ReorderIndices;
2769
2770 /// Points back to the VectorizableTree.
2771 ///
2772 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2773 /// to be a pointer and needs to be able to initialize the child iterator.
2774 /// Thus we need a reference back to the container to translate the indices
2775 /// to entries.
2776 VecTreeTy &Container;
2777
2778 /// The TreeEntry index containing the user of this entry. We can actually
2779 /// have multiple users so the data structure is not truly a tree.
2780 SmallVector<EdgeInfo, 1> UserTreeIndices;
2781
2782 /// The index of this treeEntry in VectorizableTree.
2783 int Idx = -1;
2784
2785 private:
2786 /// The operands of each instruction in each lane Operands[op_index][lane].
2787 /// Note: This helps avoid the replication of the code that performs the
2788 /// reordering of operands during buildTree_rec() and vectorizeTree().
2789 SmallVector<ValueList, 2> Operands;
2790
2791 /// The main/alternate instruction.
2792 Instruction *MainOp = nullptr;
2793 Instruction *AltOp = nullptr;
2794
2795 public:
2796 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2797 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2798 if (Operands.size() < OpIdx + 1)
2799 Operands.resize(OpIdx + 1);
2800 assert(Operands[OpIdx].empty() && "Already resized?");
2801 assert(OpVL.size() <= Scalars.size() &&
2802 "Number of operands is greater than the number of scalars.");
2803 Operands[OpIdx].resize(OpVL.size());
2804 copy(OpVL, Operands[OpIdx].begin());
2805 }
2806
2807 /// Set the operands of this bundle in their original order.
2808 void setOperandsInOrder() {
2809 assert(Operands.empty() && "Already initialized?");
2810 auto *I0 = cast<Instruction>(Scalars[0]);
2811 Operands.resize(I0->getNumOperands());
2812 unsigned NumLanes = Scalars.size();
2813 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2814 OpIdx != NumOperands; ++OpIdx) {
2815 Operands[OpIdx].resize(NumLanes);
2816 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2817 auto *I = cast<Instruction>(Scalars[Lane]);
2818 assert(I->getNumOperands() == NumOperands &&
2819 "Expected same number of operands");
2820 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2821 }
2822 }
2823 }
2824
2825 /// Reorders operands of the node to the given mask \p Mask.
2826 void reorderOperands(ArrayRef<int> Mask) {
2827 for (ValueList &Operand : Operands)
2828 reorderScalars(Operand, Mask);
2829 }
2830
2831 /// \returns the \p OpIdx operand of this TreeEntry.
2832 ValueList &getOperand(unsigned OpIdx) {
2833 assert(OpIdx < Operands.size() && "Off bounds");
2834 return Operands[OpIdx];
2835 }
2836
2837 /// \returns the \p OpIdx operand of this TreeEntry.
2838 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2839 assert(OpIdx < Operands.size() && "Off bounds");
2840 return Operands[OpIdx];
2841 }
2842
2843 /// \returns the number of operands.
2844 unsigned getNumOperands() const { return Operands.size(); }
2845
2846 /// \return the single \p OpIdx operand.
2847 Value *getSingleOperand(unsigned OpIdx) const {
2848 assert(OpIdx < Operands.size() && "Off bounds");
2849 assert(!Operands[OpIdx].empty() && "No operand available");
2850 return Operands[OpIdx][0];
2851 }
2852
2853 /// Some of the instructions in the list have alternate opcodes.
2854 bool isAltShuffle() const { return MainOp != AltOp; }
2855
2856 bool isOpcodeOrAlt(Instruction *I) const {
2857 unsigned CheckedOpcode = I->getOpcode();
2858 return (getOpcode() == CheckedOpcode ||
2859 getAltOpcode() == CheckedOpcode);
2860 }
2861
2862 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2863 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2864 /// \p OpValue.
2865 Value *isOneOf(Value *Op) const {
2866 auto *I = dyn_cast<Instruction>(Op);
2867 if (I && isOpcodeOrAlt(I))
2868 return Op;
2869 return MainOp;
2870 }
2871
2872 void setOperations(const InstructionsState &S) {
2873 MainOp = S.MainOp;
2874 AltOp = S.AltOp;
2875 }
2876
2877 Instruction *getMainOp() const {
2878 return MainOp;
2879 }
2880
2881 Instruction *getAltOp() const {
2882 return AltOp;
2883 }
2884
2885 /// The main/alternate opcodes for the list of instructions.
2886 unsigned getOpcode() const {
2887 return MainOp ? MainOp->getOpcode() : 0;
2888 }
2889
2890 unsigned getAltOpcode() const {
2891 return AltOp ? AltOp->getOpcode() : 0;
2892 }
2893
2894 /// When ReuseShuffleIndices/ReorderIndices are empty it just returns the
2895 /// position of \p V within Scalars. Otherwise, remaps it through those indices.
2896 int findLaneForValue(Value *V) const {
2897 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2898 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2899 if (!ReorderIndices.empty())
2900 FoundLane = ReorderIndices[FoundLane];
2901 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2902 if (!ReuseShuffleIndices.empty()) {
2903 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2904 find(ReuseShuffleIndices, FoundLane));
2905 }
2906 return FoundLane;
2907 }
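// Example (hypothetical): with Scalars == {a, b, c, d}, ReorderIndices ==
// {1, 2, 3, 0} and empty ReuseShuffleIndices, findLaneForValue(c) finds c at
// position 2 in Scalars and remaps it through ReorderIndices[2] == 3, so 3
// is returned.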
2908
2909 /// Build a shuffle mask for graph entry which represents a merge of main
2910 /// and alternate operations.
2911 void
2912 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2913 SmallVectorImpl<int> &Mask,
2914 SmallVectorImpl<Value *> *OpScalars = nullptr,
2915 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2916
2917 /// Return true if this is a non-power-of-2 node.
2918 bool isNonPowOf2Vec() const {
2919 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
2920 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
2921 "Reshuffling not supported with non-power-of-2 vectors yet.");
2922 return IsNonPowerOf2;
2923 }
2924
2925#ifndef NDEBUG
2926 /// Debug printer.
2927 LLVM_DUMP_METHOD void dump() const {
2928 dbgs() << Idx << ".\n";
2929 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2930 dbgs() << "Operand " << OpI << ":\n";
2931 for (const Value *V : Operands[OpI])
2932 dbgs().indent(2) << *V << "\n";
2933 }
2934 dbgs() << "Scalars: \n";
2935 for (Value *V : Scalars)
2936 dbgs().indent(2) << *V << "\n";
2937 dbgs() << "State: ";
2938 switch (State) {
2939 case Vectorize:
2940 dbgs() << "Vectorize\n";
2941 break;
2942 case ScatterVectorize:
2943 dbgs() << "ScatterVectorize\n";
2944 break;
2945 case StridedVectorize:
2946 dbgs() << "StridedVectorize\n";
2947 break;
2948 case NeedToGather:
2949 dbgs() << "NeedToGather\n";
2950 break;
2951 }
2952 dbgs() << "MainOp: ";
2953 if (MainOp)
2954 dbgs() << *MainOp << "\n";
2955 else
2956 dbgs() << "NULL\n";
2957 dbgs() << "AltOp: ";
2958 if (AltOp)
2959 dbgs() << *AltOp << "\n";
2960 else
2961 dbgs() << "NULL\n";
2962 dbgs() << "VectorizedValue: ";
2963 if (VectorizedValue)
2964 dbgs() << *VectorizedValue << "\n";
2965 else
2966 dbgs() << "NULL\n";
2967 dbgs() << "ReuseShuffleIndices: ";
2968 if (ReuseShuffleIndices.empty())
2969 dbgs() << "Empty";
2970 else
2971 for (int ReuseIdx : ReuseShuffleIndices)
2972 dbgs() << ReuseIdx << ", ";
2973 dbgs() << "\n";
2974 dbgs() << "ReorderIndices: ";
2975 for (unsigned ReorderIdx : ReorderIndices)
2976 dbgs() << ReorderIdx << ", ";
2977 dbgs() << "\n";
2978 dbgs() << "UserTreeIndices: ";
2979 for (const auto &EInfo : UserTreeIndices)
2980 dbgs() << EInfo << ", ";
2981 dbgs() << "\n";
2982 }
2983#endif
2984 };
2985
2986#ifndef NDEBUG
2987 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2988 InstructionCost VecCost, InstructionCost ScalarCost,
2989 StringRef Banner) const {
2990 dbgs() << "SLP: " << Banner << ":\n";
2991 E->dump();
2992 dbgs() << "SLP: Costs:\n";
2993 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2994 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2995 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2996 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2997 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2998 }
2999#endif
3000
3001 /// Create a new VectorizableTree entry.
3002 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3003 std::optional<ScheduleData *> Bundle,
3004 const InstructionsState &S,
3005 const EdgeInfo &UserTreeIdx,
3006 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3007 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3008 TreeEntry::EntryState EntryState =
3009 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3010 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3011 ReuseShuffleIndices, ReorderIndices);
3012 }
3013
3014 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3015 TreeEntry::EntryState EntryState,
3016 std::optional<ScheduleData *> Bundle,
3017 const InstructionsState &S,
3018 const EdgeInfo &UserTreeIdx,
3019 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3020 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3021 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3022 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3023 "Need to vectorize gather entry?");
3024 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3025 TreeEntry *Last = VectorizableTree.back().get();
3026 Last->Idx = VectorizableTree.size() - 1;
3027 Last->State = EntryState;
3028 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3029 ReuseShuffleIndices.end());
3030 if (ReorderIndices.empty()) {
3031 Last->Scalars.assign(VL.begin(), VL.end());
3032 Last->setOperations(S);
3033 } else {
3034 // Reorder scalars and build final mask.
3035 Last->Scalars.assign(VL.size(), nullptr);
3036 transform(ReorderIndices, Last->Scalars.begin(),
3037 [VL](unsigned Idx) -> Value * {
3038 if (Idx >= VL.size())
3039 return UndefValue::get(VL.front()->getType());
3040 return VL[Idx];
3041 });
3042 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3043 Last->setOperations(S);
3044 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3045 }
3046 if (Last->State != TreeEntry::NeedToGather) {
3047 for (Value *V : VL) {
3048 const TreeEntry *TE = getTreeEntry(V);
3049 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3050 "Scalar already in tree!");
3051 if (TE) {
3052 if (TE != Last)
3053 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3054 continue;
3055 }
3056 ScalarToTreeEntry[V] = Last;
3057 }
3058 // Update the scheduler bundle to point to this TreeEntry.
3059 ScheduleData *BundleMember = *Bundle;
3060 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3061 isVectorLikeInstWithConstOps(S.MainOp) ||
3062 doesNotNeedToSchedule(VL)) &&
3063 "Bundle and VL out of sync");
3064 if (BundleMember) {
 3065 for (Value *V : VL) {
 3066 if (doesNotNeedToBeScheduled(V))
 3067 continue;
3068 if (!BundleMember)
3069 continue;
3070 BundleMember->TE = Last;
3071 BundleMember = BundleMember->NextInBundle;
3072 }
3073 }
3074 assert(!BundleMember && "Bundle and VL out of sync");
3075 } else {
3076 // Build a map for gathered scalars to the nodes where they are used.
3077 bool AllConstsOrCasts = true;
3078 for (Value *V : VL)
3079 if (!isConstant(V)) {
3080 auto *I = dyn_cast<CastInst>(V);
3081 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3082 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3083 }
3084 if (AllConstsOrCasts)
3085 CastMaxMinBWSizes =
3086 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3087 MustGather.insert(VL.begin(), VL.end());
3088 }
3089
3090 if (UserTreeIdx.UserTE) {
3091 Last->UserTreeIndices.push_back(UserTreeIdx);
3092 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3093 "Reordering isn't implemented for non-power-of-2 nodes yet");
3094 }
3095 return Last;
3096 }
3097
3098 /// -- Vectorization State --
3099 /// Holds all of the tree entries.
3100 TreeEntry::VecTreeTy VectorizableTree;
3101
3102#ifndef NDEBUG
3103 /// Debug printer.
3104 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3105 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3106 VectorizableTree[Id]->dump();
3107 dbgs() << "\n";
3108 }
3109 }
3110#endif
3111
3112 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3113
3114 const TreeEntry *getTreeEntry(Value *V) const {
3115 return ScalarToTreeEntry.lookup(V);
3116 }
3117
 3118 /// Checks that the operand node of an alternate node does not generate a
 3119 /// buildvector sequence. If it does, building the alternate shuffle is
 3120 /// probably not worth it when the number of buildvector operands plus the
 3121 /// alternate instruction exceeds the number of buildvector instructions.
3122 /// \param S the instructions state of the analyzed values.
3123 /// \param VL list of the instructions with alternate opcodes.
3124 bool areAltOperandsProfitable(const InstructionsState &S,
3125 ArrayRef<Value *> VL) const;
3126
3127 /// Checks if the specified list of the instructions/values can be vectorized
3128 /// and fills required data before actual scheduling of the instructions.
3129 TreeEntry::EntryState getScalarsVectorizationState(
3130 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3131 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3132
3133 /// Maps a specific scalar to its tree entry.
3134 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3135
3136 /// List of scalars, used in several vectorize nodes, and the list of the
3137 /// nodes.
 3138 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
 3139
3140 /// Maps a value to the proposed vectorizable size.
3141 SmallDenseMap<Value *, unsigned> InstrElementSize;
3142
3143 /// A list of scalars that we found that we need to keep as scalars.
3144 ValueSet MustGather;
3145
3146 /// A set of first non-schedulable values.
3147 ValueSet NonScheduledFirst;
3148
3149 /// A map between the vectorized entries and the last instructions in the
3150 /// bundles. The bundles are built in use order, not in the def order of the
3151 /// instructions. So, we cannot rely directly on the last instruction in the
 3152 /// bundle being the last instruction in program order during the
 3153 /// vectorization process, since the basic blocks are affected; we need to
 3154 /// pre-gather them beforehand.
3155 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3156
3157 /// List of gather nodes, depending on other gather/vector nodes, which should
3158 /// be emitted after the vector instruction emission process to correctly
3159 /// handle order of the vector instructions and shuffles.
3160 SetVector<const TreeEntry *> PostponedGathers;
3161
3162 using ValueToGatherNodesMap =
 3163 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
 3164 ValueToGatherNodesMap ValueToGatherNodes;
3165
3166 /// This POD struct describes one external user in the vectorized tree.
3167 struct ExternalUser {
3168 ExternalUser(Value *S, llvm::User *U, int L)
3169 : Scalar(S), User(U), Lane(L) {}
3170
3171 // Which scalar in our function.
3172 Value *Scalar;
3173
3174 // Which user that uses the scalar.
 3175 llvm::User *User;
 3176
3177 // Which lane does the scalar belong to.
3178 int Lane;
3179 };
3180 using UserList = SmallVector<ExternalUser, 16>;
3181
3182 /// Checks if two instructions may access the same memory.
3183 ///
3184 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3185 /// is invariant in the calling loop.
3186 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3187 Instruction *Inst2) {
3188 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3189 return true;
3190 // First check if the result is already in the cache.
3191 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3192 auto It = AliasCache.find(Key);
3193 if (It != AliasCache.end())
3194 return It->second;
3195 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3196 // Store the result in the cache.
3197 AliasCache.try_emplace(Key, Aliased);
3198 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3199 return Aliased;
3200 }
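  // Note (illustrative): the result is cached under both (Inst1, Inst2) and
  // (Inst2, Inst1), so a later query for the swapped pair is answered from
  // AliasCache without issuing another BatchAA query.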
3201
3202 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3203
3204 /// Cache for alias results.
3205 /// TODO: consider moving this to the AliasAnalysis itself.
 3206 DenseMap<AliasCacheKey, bool> AliasCache;
 3207
3208 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3209 // globally through SLP because we don't perform any action which
3210 // invalidates capture results.
3211 BatchAAResults BatchAA;
3212
3213 /// Temporary store for deleted instructions. Instructions will be deleted
3214 /// eventually when the BoUpSLP is destructed. The deferral is required to
3215 /// ensure that there are no incorrect collisions in the AliasCache, which
3216 /// can happen if a new instruction is allocated at the same address as a
3217 /// previously deleted instruction.
3218 DenseSet<Instruction *> DeletedInstructions;
3219
 3220 /// Set of the instructions already analyzed for reductions.
3221 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3222
3223 /// Set of hashes for the list of reduction values already being analyzed.
3224 DenseSet<size_t> AnalyzedReductionVals;
3225
 3226 /// Values already analyzed for minimal bitwidth and found to be
3227 /// non-profitable.
3228 DenseSet<Value *> AnalyzedMinBWVals;
3229
 3230 /// A list of values that need to be extracted out of the tree.
3231 /// This list holds pairs of (Internal Scalar : External User). External User
3232 /// can be nullptr, it means that this Internal Scalar will be used later,
3233 /// after vectorization.
3234 UserList ExternalUses;
3235
 3236 /// A list of GEPs which can be replaced by scalar GEPs instead of
3237 /// extractelement instructions.
3238 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3239
3240 /// Values used only by @llvm.assume calls.
 3241 SmallPtrSet<const Value *, 32> EphValues;
 3242
3243 /// Holds all of the instructions that we gathered, shuffle instructions and
3244 /// extractelements.
3245 SetVector<Instruction *> GatherShuffleExtractSeq;
3246
3247 /// A list of blocks that we are going to CSE.
3248 DenseSet<BasicBlock *> CSEBlocks;
3249
3250 /// Contains all scheduling relevant data for an instruction.
3251 /// A ScheduleData either represents a single instruction or a member of an
3252 /// instruction bundle (= a group of instructions which is combined into a
3253 /// vector instruction).
3254 struct ScheduleData {
3255 // The initial value for the dependency counters. It means that the
3256 // dependencies are not calculated yet.
3257 enum { InvalidDeps = -1 };
3258
3259 ScheduleData() = default;
3260
3261 void init(int BlockSchedulingRegionID, Value *OpVal) {
3262 FirstInBundle = this;
3263 NextInBundle = nullptr;
3264 NextLoadStore = nullptr;
3265 IsScheduled = false;
3266 SchedulingRegionID = BlockSchedulingRegionID;
3267 clearDependencies();
3268 OpValue = OpVal;
3269 TE = nullptr;
3270 }
3271
3272 /// Verify basic self consistency properties
3273 void verify() {
3274 if (hasValidDependencies()) {
3275 assert(UnscheduledDeps <= Dependencies && "invariant");
3276 } else {
3277 assert(UnscheduledDeps == Dependencies && "invariant");
3278 }
3279
3280 if (IsScheduled) {
3281 assert(isSchedulingEntity() &&
3282 "unexpected scheduled state");
3283 for (const ScheduleData *BundleMember = this; BundleMember;
3284 BundleMember = BundleMember->NextInBundle) {
3285 assert(BundleMember->hasValidDependencies() &&
3286 BundleMember->UnscheduledDeps == 0 &&
3287 "unexpected scheduled state");
3288 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3289 "only bundle is marked scheduled");
3290 }
3291 }
3292
3293 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3294 "all bundle members must be in same basic block");
3295 }
3296
3297 /// Returns true if the dependency information has been calculated.
 3298 /// Note that dependency validity can vary between instructions within
3299 /// a single bundle.
3300 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3301
3302 /// Returns true for single instructions and for bundle representatives
3303 /// (= the head of a bundle).
3304 bool isSchedulingEntity() const { return FirstInBundle == this; }
3305
3306 /// Returns true if it represents an instruction bundle and not only a
3307 /// single instruction.
3308 bool isPartOfBundle() const {
3309 return NextInBundle != nullptr || FirstInBundle != this || TE;
3310 }
3311
3312 /// Returns true if it is ready for scheduling, i.e. it has no more
 3313 /// unscheduled dependent instructions/bundles.
3314 bool isReady() const {
3315 assert(isSchedulingEntity() &&
3316 "can't consider non-scheduling entity for ready list");
3317 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3318 }
3319
3320 /// Modifies the number of unscheduled dependencies for this instruction,
3321 /// and returns the number of remaining dependencies for the containing
3322 /// bundle.
3323 int incrementUnscheduledDeps(int Incr) {
3324 assert(hasValidDependencies() &&
3325 "increment of unscheduled deps would be meaningless");
3326 UnscheduledDeps += Incr;
3327 return FirstInBundle->unscheduledDepsInBundle();
3328 }
3329
3330 /// Sets the number of unscheduled dependencies to the number of
3331 /// dependencies.
3332 void resetUnscheduledDeps() {
3333 UnscheduledDeps = Dependencies;
3334 }
3335
3336 /// Clears all dependency information.
3337 void clearDependencies() {
3338 Dependencies = InvalidDeps;
3339 resetUnscheduledDeps();
3340 MemoryDependencies.clear();
3341 ControlDependencies.clear();
3342 }
3343
3344 int unscheduledDepsInBundle() const {
3345 assert(isSchedulingEntity() && "only meaningful on the bundle");
3346 int Sum = 0;
3347 for (const ScheduleData *BundleMember = this; BundleMember;
3348 BundleMember = BundleMember->NextInBundle) {
3349 if (BundleMember->UnscheduledDeps == InvalidDeps)
3350 return InvalidDeps;
3351 Sum += BundleMember->UnscheduledDeps;
3352 }
3353 return Sum;
3354 }
3355
3356 void dump(raw_ostream &os) const {
3357 if (!isSchedulingEntity()) {
3358 os << "/ " << *Inst;
3359 } else if (NextInBundle) {
3360 os << '[' << *Inst;
3361 ScheduleData *SD = NextInBundle;
3362 while (SD) {
3363 os << ';' << *SD->Inst;
3364 SD = SD->NextInBundle;
3365 }
3366 os << ']';
3367 } else {
3368 os << *Inst;
3369 }
3370 }
3371
3372 Instruction *Inst = nullptr;
3373
3374 /// Opcode of the current instruction in the schedule data.
3375 Value *OpValue = nullptr;
3376
3377 /// The TreeEntry that this instruction corresponds to.
3378 TreeEntry *TE = nullptr;
3379
3380 /// Points to the head in an instruction bundle (and always to this for
3381 /// single instructions).
3382 ScheduleData *FirstInBundle = nullptr;
3383
 3384 /// Singly linked list of all instructions in a bundle. Null if it is a
3385 /// single instruction.
3386 ScheduleData *NextInBundle = nullptr;
3387
 3388 /// Singly linked list of all memory instructions (e.g. load, store, call)
3389 /// in the block - until the end of the scheduling region.
3390 ScheduleData *NextLoadStore = nullptr;
3391
3392 /// The dependent memory instructions.
3393 /// This list is derived on demand in calculateDependencies().
3394 SmallVector<ScheduleData *, 4> MemoryDependencies;
3395
3396 /// List of instructions which this instruction could be control dependent
3397 /// on. Allowing such nodes to be scheduled below this one could introduce
3398 /// a runtime fault which didn't exist in the original program.
3399 /// ex: this is a load or udiv following a readonly call which inf loops
3400 SmallVector<ScheduleData *, 4> ControlDependencies;
3401
3402 /// This ScheduleData is in the current scheduling region if this matches
3403 /// the current SchedulingRegionID of BlockScheduling.
3404 int SchedulingRegionID = 0;
3405
3406 /// Used for getting a "good" final ordering of instructions.
3407 int SchedulingPriority = 0;
3408
 3409 /// The number of dependencies. Consists of the number of users of the
3410 /// instruction plus the number of dependent memory instructions (if any).
3411 /// This value is calculated on demand.
3412 /// If InvalidDeps, the number of dependencies is not calculated yet.
3413 int Dependencies = InvalidDeps;
3414
3415 /// The number of dependencies minus the number of dependencies of scheduled
3416 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3417 /// for scheduling.
3418 /// Note that this is negative as long as Dependencies is not calculated.
3419 int UnscheduledDeps = InvalidDeps;
3420
3421 /// True if this instruction is scheduled (or considered as scheduled in the
3422 /// dry-run).
3423 bool IsScheduled = false;
3424 };
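  // Illustrative walk-through (hypothetical counts): a two-instruction bundle
  // whose members start with Dependencies = UnscheduledDeps = 3 and 2 has
  // unscheduledDepsInBundle() == 5. Each time one of its dependencies is
  // scheduled, incrementUnscheduledDeps(-1) runs on the affected member; once
  // the bundle-wide sum reaches 0 the head of the bundle becomes ready. While
  // dependencies are still uncalculated, the counters stay at InvalidDeps, so
  // unscheduledDepsInBundle() never reports the bundle as ready prematurely.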
3425
3426#ifndef NDEBUG
 3427 friend inline raw_ostream &operator<<(raw_ostream &os,
 3428 const BoUpSLP::ScheduleData &SD) {
3429 SD.dump(os);
3430 return os;
3431 }
3432#endif
3433
3434 friend struct GraphTraits<BoUpSLP *>;
3435 friend struct DOTGraphTraits<BoUpSLP *>;
3436
3437 /// Contains all scheduling data for a basic block.
 3438 /// It does not schedule instructions which are not memory read/write
 3439 /// instructions and whose operands are either constants, arguments, phis,
 3440 /// or instructions from other blocks, or whose users are phis or belong to
 3441 /// other blocks. The resulting vector instructions can be placed at the
 3442 /// beginning of the basic block without scheduling (if the operands do not
 3443 /// need to be scheduled) or at the end of the block (if the users are
 3444 /// outside of the block). This saves some compile time and memory used by
 3445 /// the compiler.
 3446 /// ScheduleData is assigned for each instruction between the boundaries of
 3447 /// the tree entry, even for those which are not part of the graph. It is
 3448 /// required to correctly follow the dependencies between the instructions
 3449 /// and to schedule them correctly. ScheduleData is not allocated for
 3450 /// instructions which do not require scheduling, like phis, nodes with only
 3451 /// extractelements/insertelements, or nodes whose instructions have
 3452 /// uses/operands outside of the block.
3453 struct BlockScheduling {
3454 BlockScheduling(BasicBlock *BB)
3455 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3456
3457 void clear() {
3458 ReadyInsts.clear();
3459 ScheduleStart = nullptr;
3460 ScheduleEnd = nullptr;
3461 FirstLoadStoreInRegion = nullptr;
3462 LastLoadStoreInRegion = nullptr;
3463 RegionHasStackSave = false;
3464
3465 // Reduce the maximum schedule region size by the size of the
3466 // previous scheduling run.
3467 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3468 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3469 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3470 ScheduleRegionSize = 0;
3471
3472 // Make a new scheduling region, i.e. all existing ScheduleData is not
3473 // in the new region yet.
3474 ++SchedulingRegionID;
3475 }
3476
3477 ScheduleData *getScheduleData(Instruction *I) {
3478 if (BB != I->getParent())
3479 // Avoid lookup if can't possibly be in map.
3480 return nullptr;
3481 ScheduleData *SD = ScheduleDataMap.lookup(I);
3482 if (SD && isInSchedulingRegion(SD))
3483 return SD;
3484 return nullptr;
3485 }
3486
3487 ScheduleData *getScheduleData(Value *V) {
3488 if (auto *I = dyn_cast<Instruction>(V))
3489 return getScheduleData(I);
3490 return nullptr;
3491 }
3492
3493 ScheduleData *getScheduleData(Value *V, Value *Key) {
3494 if (V == Key)
3495 return getScheduleData(V);
3496 auto I = ExtraScheduleDataMap.find(V);
3497 if (I != ExtraScheduleDataMap.end()) {
3498 ScheduleData *SD = I->second.lookup(Key);
3499 if (SD && isInSchedulingRegion(SD))
3500 return SD;
3501 }
3502 return nullptr;
3503 }
3504
3505 bool isInSchedulingRegion(ScheduleData *SD) const {
3506 return SD->SchedulingRegionID == SchedulingRegionID;
3507 }
3508
3509 /// Marks an instruction as scheduled and puts all dependent ready
3510 /// instructions into the ready-list.
3511 template <typename ReadyListType>
3512 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3513 SD->IsScheduled = true;
3514 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3515
3516 for (ScheduleData *BundleMember = SD; BundleMember;
3517 BundleMember = BundleMember->NextInBundle) {
3518 if (BundleMember->Inst != BundleMember->OpValue)
3519 continue;
3520
3521 // Handle the def-use chain dependencies.
3522
3523 // Decrement the unscheduled counter and insert to ready list if ready.
3524 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3525 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3526 if (OpDef && OpDef->hasValidDependencies() &&
3527 OpDef->incrementUnscheduledDeps(-1) == 0) {
3528 // There are no more unscheduled dependencies after
3529 // decrementing, so we can put the dependent instruction
3530 // into the ready list.
3531 ScheduleData *DepBundle = OpDef->FirstInBundle;
3532 assert(!DepBundle->IsScheduled &&
3533 "already scheduled bundle gets ready");
3534 ReadyList.insert(DepBundle);
3535 LLVM_DEBUG(dbgs()
3536 << "SLP: gets ready (def): " << *DepBundle << "\n");
3537 }
3538 });
3539 };
3540
3541 // If BundleMember is a vector bundle, its operands may have been
3542 // reordered during buildTree(). We therefore need to get its operands
3543 // through the TreeEntry.
3544 if (TreeEntry *TE = BundleMember->TE) {
3545 // Need to search for the lane since the tree entry can be reordered.
3546 int Lane = std::distance(TE->Scalars.begin(),
3547 find(TE->Scalars, BundleMember->Inst));
3548 assert(Lane >= 0 && "Lane not set");
3549
3550 // Since vectorization tree is being built recursively this assertion
3551 // ensures that the tree entry has all operands set before reaching
3552 // this code. Couple of exceptions known at the moment are extracts
3553 // where their second (immediate) operand is not added. Since
3554 // immediates do not affect scheduler behavior this is considered
3555 // okay.
3556 auto *In = BundleMember->Inst;
3557 assert(
3558 In &&
3559 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3560 In->getNumOperands() == TE->getNumOperands()) &&
3561 "Missed TreeEntry operands?");
3562 (void)In; // fake use to avoid build failure when assertions disabled
3563
3564 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3565 OpIdx != NumOperands; ++OpIdx)
3566 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3567 DecrUnsched(I);
3568 } else {
3569 // If BundleMember is a stand-alone instruction, no operand reordering
3570 // has taken place, so we directly access its operands.
3571 for (Use &U : BundleMember->Inst->operands())
3572 if (auto *I = dyn_cast<Instruction>(U.get()))
3573 DecrUnsched(I);
3574 }
3575 // Handle the memory dependencies.
3576 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3577 if (MemoryDepSD->hasValidDependencies() &&
3578 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3579 // There are no more unscheduled dependencies after decrementing,
3580 // so we can put the dependent instruction into the ready list.
3581 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3582 assert(!DepBundle->IsScheduled &&
3583 "already scheduled bundle gets ready");
3584 ReadyList.insert(DepBundle);
3586 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3587 }
3588 }
3589 // Handle the control dependencies.
3590 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3591 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3592 // There are no more unscheduled dependencies after decrementing,
3593 // so we can put the dependent instruction into the ready list.
3594 ScheduleData *DepBundle = DepSD->FirstInBundle;
3595 assert(!DepBundle->IsScheduled &&
3596 "already scheduled bundle gets ready");
3597 ReadyList.insert(DepBundle);
3599 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3600 }
3601 }
3602 }
3603 }
3604
3605 /// Verify basic self consistency properties of the data structure.
3606 void verify() {
3607 if (!ScheduleStart)
3608 return;
3609
3610 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3611 ScheduleStart->comesBefore(ScheduleEnd) &&
3612 "Not a valid scheduling region?");
3613
3614 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3615 auto *SD = getScheduleData(I);
3616 if (!SD)
3617 continue;
3618 assert(isInSchedulingRegion(SD) &&
3619 "primary schedule data not in window?");
3620 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3621 "entire bundle in window!");
3622 (void)SD;
3623 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3624 }
3625
3626 for (auto *SD : ReadyInsts) {
3627 assert(SD->isSchedulingEntity() && SD->isReady() &&
3628 "item in ready list not ready?");
3629 (void)SD;
3630 }
3631 }
3632
3633 void doForAllOpcodes(Value *V,
3634 function_ref<void(ScheduleData *SD)> Action) {
3635 if (ScheduleData *SD = getScheduleData(V))
3636 Action(SD);
3637 auto I = ExtraScheduleDataMap.find(V);
3638 if (I != ExtraScheduleDataMap.end())
3639 for (auto &P : I->second)
3640 if (isInSchedulingRegion(P.second))
3641 Action(P.second);
3642 }
3643
3644 /// Put all instructions into the ReadyList which are ready for scheduling.
3645 template <typename ReadyListType>
3646 void initialFillReadyList(ReadyListType &ReadyList) {
3647 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3648 doForAllOpcodes(I, [&](ScheduleData *SD) {
3649 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3650 SD->isReady()) {
3651 ReadyList.insert(SD);
3652 LLVM_DEBUG(dbgs()
3653 << "SLP: initially in ready list: " << *SD << "\n");
3654 }
3655 });
3656 }
3657 }
3658
3659 /// Build a bundle from the ScheduleData nodes corresponding to the
3660 /// scalar instruction for each lane.
3661 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3662
3663 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3664 /// cyclic dependencies. This is only a dry-run, no instructions are
3665 /// actually moved at this stage.
3666 /// \returns the scheduling bundle. The returned Optional value is not
3667 /// std::nullopt if \p VL is allowed to be scheduled.
3668 std::optional<ScheduleData *>
3669 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3670 const InstructionsState &S);
3671
3672 /// Un-bundles a group of instructions.
3673 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3674
3675 /// Allocates schedule data chunk.
3676 ScheduleData *allocateScheduleDataChunks();
3677
3678 /// Extends the scheduling region so that V is inside the region.
3679 /// \returns true if the region size is within the limit.
3680 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3681
3682 /// Initialize the ScheduleData structures for new instructions in the
3683 /// scheduling region.
3684 void initScheduleData(Instruction *FromI, Instruction *ToI,
3685 ScheduleData *PrevLoadStore,
3686 ScheduleData *NextLoadStore);
3687
3688 /// Updates the dependency information of a bundle and of all instructions/
3689 /// bundles which depend on the original bundle.
3690 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3691 BoUpSLP *SLP);
3692
 3693 /// Sets all instructions in the scheduling region to un-scheduled.
3694 void resetSchedule();
3695
3696 BasicBlock *BB;
3697
3698 /// Simple memory allocation for ScheduleData.
 3699 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
 3700
3701 /// The size of a ScheduleData array in ScheduleDataChunks.
3702 int ChunkSize;
3703
3704 /// The allocator position in the current chunk, which is the last entry
3705 /// of ScheduleDataChunks.
3706 int ChunkPos;
3707
3708 /// Attaches ScheduleData to Instruction.
3709 /// Note that the mapping survives during all vectorization iterations, i.e.
3710 /// ScheduleData structures are recycled.
 3711 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
 3712
3713 /// Attaches ScheduleData to Instruction with the leading key.
 3714 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *, 2>>
 3715 ExtraScheduleDataMap;
3716
3717 /// The ready-list for scheduling (only used for the dry-run).
3718 SetVector<ScheduleData *> ReadyInsts;
3719
3720 /// The first instruction of the scheduling region.
3721 Instruction *ScheduleStart = nullptr;
3722
3723 /// The first instruction _after_ the scheduling region.
3724 Instruction *ScheduleEnd = nullptr;
3725
3726 /// The first memory accessing instruction in the scheduling region
3727 /// (can be null).
3728 ScheduleData *FirstLoadStoreInRegion = nullptr;
3729
3730 /// The last memory accessing instruction in the scheduling region
3731 /// (can be null).
3732 ScheduleData *LastLoadStoreInRegion = nullptr;
3733
3734 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3735 /// region? Used to optimize the dependence calculation for the
3736 /// common case where there isn't.
3737 bool RegionHasStackSave = false;
3738
3739 /// The current size of the scheduling region.
3740 int ScheduleRegionSize = 0;
3741
3742 /// The maximum size allowed for the scheduling region.
3743 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3744
3745 /// The ID of the scheduling region. For a new vectorization iteration this
3746 /// is incremented which "removes" all ScheduleData from the region.
3747 /// Make sure that the initial SchedulingRegionID is greater than the
3748 /// initial SchedulingRegionID in ScheduleData (which is 0).
3749 int SchedulingRegionID = 1;
3750 };
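  // Note (illustrative): clear() retires a scheduling region simply by
  // incrementing SchedulingRegionID; every ScheduleData still carrying the old
  // ID then fails isInSchedulingRegion() and is treated as if it had been
  // erased, without walking and resetting the individual entries.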
3751
3752 /// Attaches the BlockScheduling structures to basic blocks.
 3753 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
 3754
3755 /// Performs the "real" scheduling. Done before vectorization is actually
3756 /// performed in a basic block.
3757 void scheduleBlock(BlockScheduling *BS);
3758
3759 /// List of users to ignore during scheduling and that don't need extracting.
3760 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3761
3762 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3763 /// sorted SmallVectors of unsigned.
3764 struct OrdersTypeDenseMapInfo {
3765 static OrdersType getEmptyKey() {
3766 OrdersType V;
3767 V.push_back(~1U);
3768 return V;
3769 }
3770
3771 static OrdersType getTombstoneKey() {
3772 OrdersType V;
3773 V.push_back(~2U);
3774 return V;
3775 }
3776
3777 static unsigned getHashValue(const OrdersType &V) {
3778 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3779 }
3780
3781 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3782 return LHS == RHS;
3783 }
3784 };
3785
3786 // Analysis and block reference.
3787 Function *F;
3788 ScalarEvolution *SE;
 3789 TargetTransformInfo *TTI;
 3790 TargetLibraryInfo *TLI;
3791 LoopInfo *LI;
3792 DominatorTree *DT;
3793 AssumptionCache *AC;
3794 DemandedBits *DB;
3795 const DataLayout *DL;
 3796 OptimizationRemarkEmitter *ORE;
 3797
3798 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3799 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3800
3801 /// Instruction builder to construct the vectorized tree.
 3802 IRBuilder<TargetFolder> Builder;
 3803
3804 /// A map of scalar integer values to the smallest bit width with which they
3805 /// can legally be represented. The values map to (width, signed) pairs,
3806 /// where "width" indicates the minimum bit width and "signed" is True if the
3807 /// value must be signed-extended, rather than zero-extended, back to its
3808 /// original width.
 3809 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
 3810
3811 /// Final size of the reduced vector, if the current graph represents the
3812 /// input for the reduction and it was possible to narrow the size of the
3813 /// reduction.
3814 unsigned ReductionBitWidth = 0;
3815
3816 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
3817 /// type sizes, used in the tree.
3818 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3819
3820 /// Indices of the vectorized nodes, which supposed to be the roots of the new
3821 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
3822 DenseSet<unsigned> ExtraBitWidthNodes;
3823};
3824
3825} // end namespace slpvectorizer
3826
3827template <> struct GraphTraits<BoUpSLP *> {
3828 using TreeEntry = BoUpSLP::TreeEntry;
3829
3830 /// NodeRef has to be a pointer per the GraphWriter.
 3831 using NodeRef = TreeEntry *;
 3832
 3833 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
 3834
3835 /// Add the VectorizableTree to the index iterator to be able to return
3836 /// TreeEntry pointers.
3837 struct ChildIteratorType
3838 : public iterator_adaptor_base<
3839 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
 3840 ContainerTy &VectorizableTree;
 3841
 3842 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
 3843 ContainerTy &VT)
3844 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3845
3846 NodeRef operator*() { return I->UserTE; }
3847 };
3848
 3849 static NodeRef getEntryNode(BoUpSLP &R) {
 3850 return R.VectorizableTree[0].get();
3851 }
3852
3853 static ChildIteratorType child_begin(NodeRef N) {
3854 return {N->UserTreeIndices.begin(), N->Container};
3855 }
3856
3857 static ChildIteratorType child_end(NodeRef N) {
3858 return {N->UserTreeIndices.end(), N->Container};
3859 }
3860
3861 /// For the node iterator we just need to turn the TreeEntry iterator into a
3862 /// TreeEntry* iterator so that it dereferences to NodeRef.
3863 class nodes_iterator {
 3864 using ItTy = ContainerTy::iterator;
 3865 ItTy It;
3866
3867 public:
3868 nodes_iterator(const ItTy &It2) : It(It2) {}
3869 NodeRef operator*() { return It->get(); }
3870 nodes_iterator operator++() {
3871 ++It;
3872 return *this;
3873 }
3874 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3875 };
3876
3877 static nodes_iterator nodes_begin(BoUpSLP *R) {
3878 return nodes_iterator(R->VectorizableTree.begin());
3879 }
3880
3881 static nodes_iterator nodes_end(BoUpSLP *R) {
3882 return nodes_iterator(R->VectorizableTree.end());
3883 }
3884
3885 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3886};
3887
3888template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3889 using TreeEntry = BoUpSLP::TreeEntry;
3890
3891 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3892
3893 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3894 std::string Str;
 3895 raw_string_ostream OS(Str);
 3896 OS << Entry->Idx << ".\n";
3897 if (isSplat(Entry->Scalars))
3898 OS << "<splat> ";
3899 for (auto *V : Entry->Scalars) {
3900 OS << *V;
3901 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3902 return EU.Scalar == V;
3903 }))
3904 OS << " <extract>";
3905 OS << "\n";
3906 }
3907 return Str;
3908 }
3909
3910 static std::string getNodeAttributes(const TreeEntry *Entry,
3911 const BoUpSLP *) {
3912 if (Entry->State == TreeEntry::NeedToGather)
3913 return "color=red";
3914 if (Entry->State == TreeEntry::ScatterVectorize ||
3915 Entry->State == TreeEntry::StridedVectorize)
3916 return "color=blue";
3917 return "";
3918 }
3919};
3920
3921} // end namespace llvm
3922
 3923BoUpSLP::~BoUpSLP() {
 3924 SmallVector<WeakTrackingVH> DeadInsts;
 3925 for (auto *I : DeletedInstructions) {
3926 for (Use &U : I->operands()) {
3927 auto *Op = dyn_cast<Instruction>(U.get());
3928 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
 3929 wouldInstructionBeTriviallyDead(Op, TLI))
 3930 DeadInsts.emplace_back(Op);
3931 }
3932 I->dropAllReferences();
3933 }
3934 for (auto *I : DeletedInstructions) {
3935 assert(I->use_empty() &&
3936 "trying to erase instruction with users.");
3937 I->eraseFromParent();
3938 }
3939
3940 // Cleanup any dead scalar code feeding the vectorized instructions
 3941 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
 3942
3943#ifdef EXPENSIVE_CHECKS
3944 // If we could guarantee that this call is not extremely slow, we could
3945 // remove the ifdef limitation (see PR47712).
3946 assert(!verifyFunction(*F, &dbgs()));
3947#endif
3948}
3949
3950/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
 3951/// contains the original mask for the scalars reused in the node. The
 3952/// procedure transforms this mask in accordance with the given \p Mask.
 3953static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
 3954 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3955 "Expected non-empty mask.");
3956 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3957 Prev.swap(Reuses);
3958 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3959 if (Mask[I] != PoisonMaskElem)
3960 Reuses[Mask[I]] = Prev[I];
3961}
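// Illustrative example (hypothetical masks): with Reuses = {0, 1, 1, 3} and
// Mask = {3, 2, 1, 0}, each previous entry at position I moves to position
// Mask[I], giving Reuses = {3, 1, 1, 0}.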
3962
 3963/// Reorders the given \p Order according to the given \p Mask. \p Order is
 3964/// the original order of the scalars. The procedure transforms the provided
 3965/// order in accordance with the given \p Mask. If the resulting \p Order is
 3966/// just an identity order, \p Order is cleared.
 3967static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
 3968 bool BottomOrder = false) {
3969 assert(!Mask.empty() && "Expected non-empty mask.");
3970 unsigned Sz = Mask.size();
3971 if (BottomOrder) {
3972 SmallVector<unsigned> PrevOrder;
3973 if (Order.empty()) {
3974 PrevOrder.resize(Sz);
3975 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
3976 } else {
3977 PrevOrder.swap(Order);
3978 }
3979 Order.assign(Sz, Sz);
3980 for (unsigned I = 0; I < Sz; ++I)
3981 if (Mask[I] != PoisonMaskElem)
3982 Order[I] = PrevOrder[Mask[I]];
3983 if (all_of(enumerate(Order), [&](const auto &Data) {
3984 return Data.value() == Sz || Data.index() == Data.value();
3985 })) {
3986 Order.clear();
3987 return;
3988 }
3989 fixupOrderingIndices(Order);
3990 return;
3991 }
3992 SmallVector<int> MaskOrder;
3993 if (Order.empty()) {
3994 MaskOrder.resize(Sz);
3995 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3996 } else {
3997 inversePermutation(Order, MaskOrder);
3998 }
3999 reorderReuses(MaskOrder, Mask);
4000 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4001 Order.clear();
4002 return;
4003 }
4004 Order.assign(Sz, Sz);
4005 for (unsigned I = 0; I < Sz; ++I)
4006 if (MaskOrder[I] != PoisonMaskElem)
4007 Order[MaskOrder[I]] = I;
4008 fixupOrderingIndices(Order);
4009}
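// Illustrative example (hypothetical masks): starting from an empty (identity)
// Order of size 4 and Mask = {1, 0, 3, 2}, the top-down path builds
// MaskOrder = {0, 1, 2, 3}, reorders it to {1, 0, 3, 2} via reorderReuses, and
// ends with Order = {1, 0, 3, 2}; a mask that composes back to the identity
// would leave Order cleared instead.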
4010
4011std::optional<BoUpSLP::OrdersType>
4012BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4013 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4014 // Try to find subvector extract/insert patterns and reorder only such
4015 // patterns.
4016 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4017 Type *ScalarTy = GatheredScalars.front()->getType();
4018 int NumScalars = GatheredScalars.size();
4019 if (!isValidElementType(ScalarTy))
4020 return std::nullopt;
4021 auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
4022 int NumParts = TTI->getNumberOfParts(VecTy);
4023 if (NumParts == 0 || NumParts >= NumScalars)
4024 NumParts = 1;
4025 SmallVector<int> ExtractMask;
4026 SmallVector<int> Mask;
 4027 SmallVector<SmallVector<const TreeEntry *>> Entries;
 4028 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
 4029 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
 4030 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
 4031 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4032 /*ForOrder=*/true);
4033 // No shuffled operands - ignore.
4034 if (GatherShuffles.empty() && ExtractShuffles.empty())
4035 return std::nullopt;
4036 OrdersType CurrentOrder(NumScalars, NumScalars);
4037 if (GatherShuffles.size() == 1 &&
4038 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4039 Entries.front().front()->isSame(TE.Scalars)) {
4040 // Perfect match in the graph, will reuse the previously vectorized
4041 // node. Cost is 0.
4042 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4043 return CurrentOrder;
4044 }
4045 auto IsSplatMask = [](ArrayRef<int> Mask) {
4046 int SingleElt = PoisonMaskElem;
4047 return all_of(Mask, [&](int I) {
4048 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4049 SingleElt = I;
4050 return I == PoisonMaskElem || I == SingleElt;
4051 });
4052 };
4053 // Exclusive broadcast mask - ignore.
4054 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4055 (Entries.size() != 1 ||
4056 Entries.front().front()->ReorderIndices.empty())) ||
4057 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4058 return std::nullopt;
4059 SmallBitVector ShuffledSubMasks(NumParts);
4060 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4061 ArrayRef<int> Mask, int PartSz, int NumParts,
4062 function_ref<unsigned(unsigned)> GetVF) {
4063 for (int I : seq<int>(0, NumParts)) {
4064 if (ShuffledSubMasks.test(I))
4065 continue;
4066 const int VF = GetVF(I);
4067 if (VF == 0)
4068 continue;
4069 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
4070 // Shuffle of at least 2 vectors - ignore.
4071 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4072 std::fill(Slice.begin(), Slice.end(), NumScalars);
4073 ShuffledSubMasks.set(I);
4074 continue;
4075 }
4076 // Try to include as much elements from the mask as possible.
4077 int FirstMin = INT_MAX;
4078 int SecondVecFound = false;
4079 for (int K : seq<int>(0, PartSz)) {
4080 int Idx = Mask[I * PartSz + K];
4081 if (Idx == PoisonMaskElem) {
4082 Value *V = GatheredScalars[I * PartSz + K];
4083 if (isConstant(V) && !isa<PoisonValue>(V)) {
4084 SecondVecFound = true;
4085 break;
4086 }
4087 continue;
4088 }
4089 if (Idx < VF) {
4090 if (FirstMin > Idx)
4091 FirstMin = Idx;
4092 } else {
4093 SecondVecFound = true;
4094 break;
4095 }
4096 }
4097 FirstMin = (FirstMin / PartSz) * PartSz;
4098 // Shuffle of at least 2 vectors - ignore.
4099 if (SecondVecFound) {
4100 std::fill(Slice.begin(), Slice.end(), NumScalars);
4101 ShuffledSubMasks.set(I);
4102 continue;
4103 }
4104 for (int K : seq<int>(0, PartSz)) {
4105 int Idx = Mask[I * PartSz + K];
4106 if (Idx == PoisonMaskElem)
4107 continue;
4108 Idx -= FirstMin;
4109 if (Idx >= PartSz) {
4110 SecondVecFound = true;
4111 break;
4112 }
4113 if (CurrentOrder[I * PartSz + Idx] >
4114 static_cast<unsigned>(I * PartSz + K) &&
4115 CurrentOrder[I * PartSz + Idx] !=
4116 static_cast<unsigned>(I * PartSz + Idx))
4117 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4118 }
4119 // Shuffle of at least 2 vectors - ignore.
4120 if (SecondVecFound) {
4121 std::fill(Slice.begin(), Slice.end(), NumScalars);
4122 ShuffledSubMasks.set(I);
4123 continue;
4124 }
4125 }
4126 };
4127 int PartSz = NumScalars / NumParts;
4128 if (!ExtractShuffles.empty())
4129 TransformMaskToOrder(
4130 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4131 if (!ExtractShuffles[I])
4132 return 0U;
4133 unsigned VF = 0;
4134 for (unsigned Idx : seq<unsigned>(0, PartSz)) {
4135 int K = I * PartSz + Idx;
4136 if (ExtractMask[K] == PoisonMaskElem)
4137 continue;
4138 if (!TE.ReuseShuffleIndices.empty())
4139 K = TE.ReuseShuffleIndices[K];
4140 if (!TE.ReorderIndices.empty())
4141 K = std::distance(TE.ReorderIndices.begin(),
4142 find(TE.ReorderIndices, K));
4143 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4144 if (!EI)
4145 continue;
4146 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4147 ->getElementCount()
4148 .getKnownMinValue());
4149 }
4150 return VF;
4151 });
4152 // Check special corner case - single shuffle of the same entry.
4153 if (GatherShuffles.size() == 1 && NumParts != 1) {
4154 if (ShuffledSubMasks.any())
4155 return std::nullopt;
4156 PartSz = NumScalars;
4157 NumParts = 1;
4158 }
4159 if (!Entries.empty())
4160 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4161 if (!GatherShuffles[I])
4162 return 0U;
4163 return std::max(Entries[I].front()->getVectorFactor(),
4164 Entries[I].back()->getVectorFactor());
4165 });
4166 int NumUndefs =
4167 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4168 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4169 return std::nullopt;
4170 return std::move(CurrentOrder);
4171}
4172
4173static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4174 const TargetLibraryInfo &TLI,
4175 bool CompareOpcodes = true) {
4176 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4177 return false;
4178 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4179 if (!GEP1)
4180 return false;
4181 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4182 if (!GEP2)
4183 return false;
4184 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4185 ((isConstant(GEP1->getOperand(1)) &&
4186 isConstant(GEP2->getOperand(1))) ||
4187 !CompareOpcodes ||
4188 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4189 .getOpcode());
4190}
4191
4192/// Calculates minimal alignment as a common alignment.
4193template <typename T>
 4194static Align computeCommonAlignment(ArrayRef<Value *> VL) {
 4195 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4196 for (Value *V : VL.drop_front())
4197 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4198 return CommonAlignment;
4199}
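// Illustrative example (hypothetical alignments): for loads aligned to 16, 8
// and 4 bytes, the common alignment computed above is Align(4).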
4200
4201/// Check if \p Order represents reverse order.
 4202static bool isReverseOrder(ArrayRef<unsigned> Order) {
 4203 unsigned Sz = Order.size();
4204 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4205 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4206 });
4207}
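// Illustrative example (hypothetical orders): for Sz = 4, {3, 2, 1, 0} is
// recognized as a reverse order, and so is {3, 4, 1, 0}, because entries equal
// to Sz are treated as "don't care" slots.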
4208
 4209/// Checks if the provided list of pointers \p PointerOps represents strided
 4210/// pointers for the type \p ElemTy. If they do not, std::nullopt is returned.
 4211/// Otherwise, if \p Inst is not specified, an engaged optional value is
 4212/// returned to show that the pointers represent strided pointers. If \p Inst
 4213/// is specified, the runtime stride is materialized before the given \p Inst.
 4214/// \returns std::nullopt if the pointers do not have a runtime stride;
 4215/// otherwise nullptr or the actual stride value.
4216static std::optional<Value *>
 4217calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
 4218 const DataLayout &DL, ScalarEvolution &SE,
4219 SmallVectorImpl<unsigned> &SortedIndices,
4220 Instruction *Inst = nullptr) {
 4221 SmallVector<const SCEV *> SCEVs;
 4222 const SCEV *PtrSCEVLowest = nullptr;
4223 const SCEV *PtrSCEVHighest = nullptr;
4224 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4225 // addresses).
4226 for (Value *Ptr : PointerOps) {
4227 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4228 if (!PtrSCEV)
4229 return std::nullopt;
4230 SCEVs.push_back(PtrSCEV);
4231 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4232 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4233 continue;
4234 }
4235 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4236 if (isa<SCEVCouldNotCompute>(Diff))
4237 return std::nullopt;
4238 if (Diff->isNonConstantNegative()) {
4239 PtrSCEVLowest = PtrSCEV;
4240 continue;
4241 }
4242 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4243 if (isa<SCEVCouldNotCompute>(Diff1))
4244 return std::nullopt;
4245 if (Diff1->isNonConstantNegative()) {
4246 PtrSCEVHighest = PtrSCEV;
4247 continue;
4248 }
4249 }
4250 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4251 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4252 if (isa<SCEVCouldNotCompute>(Dist))
4253 return std::nullopt;
4254 int Size = DL.getTypeStoreSize(ElemTy);
4255 auto TryGetStride = [&](const SCEV *Dist,
4256 const SCEV *Multiplier) -> const SCEV * {
4257 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4258 if (M->getOperand(0) == Multiplier)
4259 return M->getOperand(1);
4260 if (M->getOperand(1) == Multiplier)
4261 return M->getOperand(0);
4262 return nullptr;
4263 }
4264 if (Multiplier == Dist)
4265 return SE.getConstant(Dist->getType(), 1);
4266 return SE.getUDivExactExpr(Dist, Multiplier);
4267 };
 4268 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4269 const SCEV *Stride = nullptr;
4270 if (Size != 1 || SCEVs.size() > 2) {
4271 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4272 Stride = TryGetStride(Dist, Sz);
4273 if (!Stride)
4274 return std::nullopt;
4275 }
4276 if (!Stride || isa<SCEVConstant>(Stride))
4277 return std::nullopt;
4278 // Iterate through all pointers and check if all distances are
4279 // unique multiple of Stride.
4280 using DistOrdPair = std::pair<int64_t, int>;
4281 auto Compare = llvm::less_first();
4282 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4283 int Cnt = 0;
4284 bool IsConsecutive = true;
4285 for (const SCEV *PtrSCEV : SCEVs) {
4286 unsigned Dist = 0;
4287 if (PtrSCEV != PtrSCEVLowest) {
4288 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4289 const SCEV *Coeff = TryGetStride(Diff, Stride);
4290 if (!Coeff)
4291 return std::nullopt;
4292 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4293 if (!SC || isa<SCEVCouldNotCompute>(SC))
4294 return std::nullopt;
4295 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4296 SE.getMulExpr(Stride, SC)))
4297 ->isZero())
4298 return std::nullopt;
4299 Dist = SC->getAPInt().getZExtValue();
4300 }
4301 // If the strides are not the same or repeated, we can't vectorize.
4302 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4303 return std::nullopt;
4304 auto Res = Offsets.emplace(Dist, Cnt);
4305 if (!Res.second)
4306 return std::nullopt;
4307 // Consecutive order if the inserted element is the last one.
4308 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4309 ++Cnt;
4310 }
4311 if (Offsets.size() != SCEVs.size())
4312 return std::nullopt;
4313 SortedIndices.clear();
4314 if (!IsConsecutive) {
4315 // Fill SortedIndices array only if it is non-consecutive.
4316 SortedIndices.resize(PointerOps.size());
4317 Cnt = 0;
4318 for (const std::pair<int64_t, int> &Pair : Offsets) {
4319 SortedIndices[Cnt] = Pair.second;
4320 ++Cnt;
4321 }
4322 }
4323 if (!Inst)
4324 return nullptr;
4325 SCEVExpander Expander(SE, DL, "strided-load-vec");
4326 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4327}
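// Illustrative example (hypothetical IR): for four i32 pointers %p, %p+4*%s,
// %p+8*%s and %p+12*%s (byte offsets with a runtime value %s), the distance
// between the lowest and highest pointer is 12*%s, so with element size 4 and
// 4 pointers the helper recovers %s as the stride SCEV, checks that every
// pointer sits at a distinct multiple of it, and, if an insertion point is
// given, expands %s to an IR value with SCEVExpander.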
4328
4329static std::pair<InstructionCost, InstructionCost>
 4330getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 4331 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4332 Type *ScalarTy, VectorType *VecTy);
4333
 4334BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
 4335 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4336 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4337 // Check that a vectorized load would load the same memory as a scalar
4338 // load. For example, we don't want to vectorize loads that are smaller
4339 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4340 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4341 // from such a struct, we read/write packed bits disagreeing with the
4342 // unvectorized version.
4343 Type *ScalarTy = VL0->getType();
4344
4345 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4346 return LoadsState::Gather;
4347
4348 // Make sure all loads in the bundle are simple - we can't vectorize
4349 // atomic or volatile loads.
4350 PointerOps.clear();
4351 const unsigned Sz = VL.size();
4352 PointerOps.resize(Sz);
4353 auto *POIter = PointerOps.begin();
4354 for (Value *V : VL) {
4355 auto *L = cast<LoadInst>(V);
4356 if (!L->isSimple())
4357 return LoadsState::Gather;
4358 *POIter = L->getPointerOperand();
4359 ++POIter;
4360 }
4361
4362 Order.clear();
4363 auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
4364 // Check the order of pointer operands or that all pointers are the same.
4365 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4366 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4367 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4368 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4369 "supported with VectorizeNonPowerOf2");
4370 return LoadsState::Gather;
4371 }
4372
4373 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4374 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4375 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4376 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
 4377 return LoadsState::StridedVectorize;
 4378 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4379 return arePointersCompatible(P, PointerOps.front(), *TLI);
4380 })) {
4381 if (IsSorted) {
4382 Value *Ptr0;
4383 Value *PtrN;
4384 if (Order.empty()) {
4385 Ptr0 = PointerOps.front();
4386 PtrN = PointerOps.back();
4387 } else {
4388 Ptr0 = PointerOps[Order.front()];
4389 PtrN = PointerOps[Order.back()];
4390 }
4391 std::optional<int> Diff =
4392 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4393 // Check that the sorted loads are consecutive.
4394 if (static_cast<unsigned>(*Diff) == Sz - 1)
4395 return LoadsState::Vectorize;
4396 // Simple check if not a strided access - clear order.
4397 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4398 // Try to generate strided load node if:
4399 // 1. Target with strided load support is detected.
4400 // 2. The number of loads is greater than MinProfitableStridedLoads,
4401 // or the potential stride <= MaxProfitableLoadStride and the
4402 // potential stride is power-of-2 (to avoid perf regressions for the very
4403 // small number of loads) and max distance > number of loads, or potential
4404 // stride is -1.
4405 // 3. The loads are ordered, or number of unordered loads <=
4406 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4407 // (this check is to avoid extra costs for very expensive shuffles).
4408 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4409 (static_cast<unsigned>(std::abs(*Diff)) <=
 4410 MaxProfitableLoadStride * Sz &&
 4411 isPowerOf2_32(std::abs(*Diff)))) &&
4412 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4413 *Diff == -(static_cast<int>(Sz) - 1))) {
4414 int Stride = *Diff / static_cast<int>(Sz - 1);
4415 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4416 Align Alignment =
4417 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4418 ->getAlign();
4419 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4420 // Iterate through all pointers and check if all distances are
4421 // unique multiple of Dist.
4422 SmallSet<int, 4> Dists;
4423 for (Value *Ptr : PointerOps) {
4424 int Dist = 0;
4425 if (Ptr == PtrN)
4426 Dist = *Diff;
4427 else if (Ptr != Ptr0)
4428 Dist =
4429 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4430 // If the strides are not the same or repeated, we can't
4431 // vectorize.
4432 if (((Dist / Stride) * Stride) != Dist ||
4433 !Dists.insert(Dist).second)
4434 break;
4435 }
4436 if (Dists.size() == Sz)
 4437 return LoadsState::StridedVectorize;
 4438 }
4439 }
4440 }
4441 }
4442 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4443 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4444 unsigned MinVF = getMinVF(Sz);
4445 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4446 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4447 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4448 unsigned VectorizedCnt = 0;
 4449 SmallVector<LoadsState> States;
 4450 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4451 Cnt += VF, ++VectorizedCnt) {
4452 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
 4453 SmallVector<unsigned> Order;
 4454 SmallVector<Value *> PointerOps;
4455 LoadsState LS =
4456 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4457 /*TryRecursiveCheck=*/false);
4458 // Check that the sorted loads are consecutive.
4459 if (LS == LoadsState::Gather)
4460 break;
4461 // If need the reorder - consider as high-cost masked gather for now.
4462 if ((LS == LoadsState::Vectorize ||
 4463 LS == LoadsState::StridedVectorize) &&
 4464 !Order.empty() && !isReverseOrder(Order))
 4465 LS = LoadsState::ScatterVectorize;
 4466 States.push_back(LS);
4467 }
 4468 // Can be vectorized later as a series of loads/insertelements.
4469 if (VectorizedCnt == VL.size() / VF) {
 4470 // Compare masked gather cost and loads + insertsubvector costs.
 4471 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 4472 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4473 TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
4474 CostKind, ScalarTy, VecTy);
4475 InstructionCost MaskedGatherCost =
 4476 TTI.getGatherScatterOpCost(
 4477 Instruction::Load, VecTy,
4478 cast<LoadInst>(VL0)->getPointerOperand(),
4479 /*VariableMask=*/false, CommonAlignment, CostKind) +
4480 VectorGEPCost - ScalarGEPCost;
4481 InstructionCost VecLdCost = 0;
4482 auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
4483 for (auto [I, LS] : enumerate(States)) {
4484 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4485 switch (LS) {
4486 case LoadsState::Vectorize: {
4487 auto [ScalarGEPCost, VectorGEPCost] =
4488 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4489 LI0->getPointerOperand(), Instruction::Load,
4490 CostKind, ScalarTy, SubVecTy);
4491 VecLdCost += TTI.getMemoryOpCost(
4492 Instruction::Load, SubVecTy, LI0->getAlign(),
4493 LI0->getPointerAddressSpace(), CostKind,
 4494 TTI::OperandValueInfo()) +
 4495 VectorGEPCost - ScalarGEPCost;
4496 break;
4497 }
 4498 case LoadsState::StridedVectorize: {
 4499 auto [ScalarGEPCost, VectorGEPCost] =
4500 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4501 LI0->getPointerOperand(), Instruction::Load,
4502 CostKind, ScalarTy, SubVecTy);
4503 VecLdCost +=
 4504 TTI.getStridedMemoryOpCost(
 4505 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4506 /*VariableMask=*/false, CommonAlignment, CostKind) +
4507 VectorGEPCost - ScalarGEPCost;
4508 break;
4509 }
 4510 case LoadsState::ScatterVectorize: {
 4511 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4512 TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4513 LI0->getPointerOperand(), Instruction::GetElementPtr,
4514 CostKind, ScalarTy, SubVecTy);
4515 VecLdCost +=
 4516 TTI.getGatherScatterOpCost(
 4517 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4518 /*VariableMask=*/false, CommonAlignment, CostKind) +
4519 VectorGEPCost - ScalarGEPCost;
4520 break;
4521 }
4522 case LoadsState::Gather:
4524 "Expected only consecutive, strided or masked gather loads.");
4525 }
4526 SmallVector<int> ShuffleMask(VL.size());
4527 for (int Idx : seq<int>(0, VL.size()))
4528 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4529 VecLdCost +=
4530 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
4531 CostKind, I * VF, SubVecTy);
4532 }
4533 // If masked gather cost is higher - better to vectorize, so
4534 // consider it as a gather node. It will be better estimated
4535 // later.
4536 if (MaskedGatherCost >= VecLdCost)
4537 return true;
4538 }
4539 }
4540 return false;
4541 };
4542 // TODO: need to improve analysis of the pointers, if not all of them are
4543 // GEPs or have > 2 operands, we end up with a gather node, which just
4544 // increases the cost.
4545 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4546 bool ProfitableGatherPointers =
4547 L && Sz > 2 &&
4548 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4549 return L->isLoopInvariant(V);
4550 })) <= Sz / 2;
4551 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4552 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4553 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4554 (GEP && GEP->getNumOperands() == 2 &&
4555 isa<Constant, Instruction>(GEP->getOperand(1)));
4556 })) {
4557 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4558 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4559 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4560 // Check if potential masked gather can be represented as series
4561 // of loads + insertsubvectors.
4562 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4563 // If masked gather cost is higher - better to vectorize, so
4564 // consider it as a gather node. It will be better estimated
4565 // later.
4566 return LoadsState::Gather;
4567        }
4568        return LoadsState::ScatterVectorize;
4569 }
4570 }
4571 }
4572
4573 return LoadsState::Gather;
4574}
4575
4576static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4577                                   const DataLayout &DL, ScalarEvolution &SE,
4578 SmallVectorImpl<unsigned> &SortedIndices) {
4579  assert(all_of(
4580      VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4581 "Expected list of pointer operands.");
4582 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4583 // Ptr into, sort and return the sorted indices with values next to one
4584 // another.
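  // Illustrative example: for pointers {A, A+1, B, A+2, B+1} with unrelated bases
  // A and B, the map ends up with Bases[A] = {(A,0,0), (A+1,1,1), (A+2,2,3)} and
  // Bases[B] = {(B,0,2), (B+1,1,4)}; the returned SortedIndices then list each
  // base's pointers contiguously in offset order, e.g. {0, 1, 3, 2, 4} (the order
  // of the bases themselves depends on map iteration order).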
4586 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4587
4588 unsigned Cnt = 1;
4589 for (Value *Ptr : VL.drop_front()) {
4590 bool Found = any_of(Bases, [&](auto &Base) {
4591 std::optional<int> Diff =
4592 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4593 /*StrictCheck=*/true);
4594 if (!Diff)
4595 return false;
4596
4597 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4598 return true;
4599 });
4600
4601 if (!Found) {
4602 // If we haven't found enough to usefully cluster, return early.
4603 if (Bases.size() > VL.size() / 2 - 1)
4604 return false;
4605
4606 // Not found already - add a new Base
4607 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4608 }
4609 }
4610
4611  // For each of the bases sort the pointers by Offset and check if any of the
4612  // bases become consecutively allocated.
4613 bool AnyConsecutive = false;
4614 for (auto &Base : Bases) {
4615 auto &Vec = Base.second;
4616 if (Vec.size() > 1) {
4617 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4618 const std::tuple<Value *, int, unsigned> &Y) {
4619 return std::get<1>(X) < std::get<1>(Y);
4620 });
4621 int InitialOffset = std::get<1>(Vec[0]);
4622 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4623 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4624 });
4625 }
4626 }
4627
4628  // Fill SortedIndices array only if it looks worthwhile to sort the ptrs.
4629 SortedIndices.clear();
4630 if (!AnyConsecutive)
4631 return false;
4632
4633 for (auto &Base : Bases) {
4634 for (auto &T : Base.second)
4635 SortedIndices.push_back(std::get<2>(T));
4636 }
4637
4638 assert(SortedIndices.size() == VL.size() &&
4639 "Expected SortedIndices to be the size of VL");
4640 return true;
4641}
4642
4643std::optional<BoUpSLP::OrdersType>
4644BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4645 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4646 Type *ScalarTy = TE.Scalars[0]->getType();
4647
4648  SmallVector<Value *> Ptrs;
4649  Ptrs.reserve(TE.Scalars.size());
4650 for (Value *V : TE.Scalars) {
4651 auto *L = dyn_cast<LoadInst>(V);
4652 if (!L || !L->isSimple())
4653 return std::nullopt;
4654 Ptrs.push_back(L->getPointerOperand());
4655 }
4656
4657 BoUpSLP::OrdersType Order;
4658 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4659 return std::move(Order);
4660 return std::nullopt;
4661}
4662
4663/// Check if two insertelement instructions are from the same buildvector.
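/// A buildvector is a chain of insertelement instructions such as (illustrative):
///   %v0 = insertelement <4 x i32> poison, i32 %a, i32 0
///   %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
///   %v2 = insertelement <4 x i32> %v1, i32 %c, i32 2
/// where any two of %v0, %v1 and %v2 belong to the same buildvector sequence.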
4664static bool areTwoInsertFromSameBuildVector(
4665    InsertElementInst *VU, InsertElementInst *V,
4666    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4667 // Instructions must be from the same basic blocks.
4668 if (VU->getParent() != V->getParent())
4669 return false;
4670 // Checks if 2 insertelements are from the same buildvector.
4671 if (VU->getType() != V->getType())
4672 return false;
4673 // Multiple used inserts are separate nodes.
4674 if (!VU->hasOneUse() && !V->hasOneUse())
4675 return false;
4676 auto *IE1 = VU;
4677 auto *IE2 = V;
4678 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4679 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4680 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4681 return false;
4682 // Go through the vector operand of insertelement instructions trying to find
4683 // either VU as the original vector for IE2 or V as the original vector for
4684 // IE1.
4685 SmallBitVector ReusedIdx(
4686 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4687 bool IsReusedIdx = false;
4688 do {
4689 if (IE2 == VU && !IE1)
4690 return VU->hasOneUse();
4691 if (IE1 == V && !IE2)
4692 return V->hasOneUse();
4693 if (IE1 && IE1 != V) {
4694 unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4695 IsReusedIdx |= ReusedIdx.test(Idx1);
4696 ReusedIdx.set(Idx1);
4697 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4698 IE1 = nullptr;
4699 else
4700 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4701 }
4702 if (IE2 && IE2 != VU) {
4703 unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4704 IsReusedIdx |= ReusedIdx.test(Idx2);
4705 ReusedIdx.set(Idx2);
4706 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4707 IE2 = nullptr;
4708 else
4709 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4710 }
4711 } while (!IsReusedIdx && (IE1 || IE2));
4712 return false;
4713}
4714
4715std::optional<BoUpSLP::OrdersType>
4716BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4717 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4718 if (TE.isNonPowOf2Vec())
4719 return std::nullopt;
4720
4721 // No need to reorder if need to shuffle reuses, still need to shuffle the
4722 // node.
4723 if (!TE.ReuseShuffleIndices.empty()) {
4724 if (isSplat(TE.Scalars))
4725 return std::nullopt;
4726 // Check if reuse shuffle indices can be improved by reordering.
4727    // For this, check that reuse mask is "clustered", i.e. each scalar value
4728    // is used once in each submask of size <number_of_scalars>.
4729 // Example: 4 scalar values.
4730 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4731 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4732 // element 3 is used twice in the second submask.
4733 unsigned Sz = TE.Scalars.size();
4734 if (TE.State == TreeEntry::NeedToGather) {
4735      if (std::optional<OrdersType> CurrentOrder =
4736              findReusedOrderedScalars(TE)) {
4737 SmallVector<int> Mask;
4738 fixupOrderingIndices(*CurrentOrder);
4739 inversePermutation(*CurrentOrder, Mask);
4740 ::addMask(Mask, TE.ReuseShuffleIndices);
4741 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4742 unsigned Sz = TE.Scalars.size();
4743 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4744 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4745 if (Idx != PoisonMaskElem)
4746 Res[Idx + K * Sz] = I + K * Sz;
4747 }
4748 return std::move(Res);
4749 }
4750 }
4751    if (Sz == 2 && TE.getVectorFactor() == 4 &&
4752        TTI->getNumberOfParts(FixedVectorType::get(
4753 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4754 return std::nullopt;
4755 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4756 Sz)) {
4757 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4758 if (TE.ReorderIndices.empty())
4759 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4760 else
4761 inversePermutation(TE.ReorderIndices, ReorderMask);
4762 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4763 unsigned VF = ReorderMask.size();
4764 OrdersType ResOrder(VF, VF);
4765 unsigned NumParts = VF / Sz;
4766 SmallBitVector UsedVals(NumParts);
4767 for (unsigned I = 0; I < VF; I += Sz) {
4768 int Val = PoisonMaskElem;
4769 unsigned UndefCnt = 0;
4770 if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
4771 [&](int Idx) {
4772 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4773 Val = Idx;
4774 if (Idx == PoisonMaskElem)
4775 ++UndefCnt;
4776 return Idx != PoisonMaskElem && Idx != Val;
4777 }) ||
4778 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4779 UndefCnt > Sz / 2)
4780 return std::nullopt;
4781 UsedVals.set(Val);
4782 for (unsigned K = 0; K < NumParts; ++K)
4783 ResOrder[Val + Sz * K] = I + K;
4784 }
4785 return std::move(ResOrder);
4786 }
4787 unsigned VF = TE.getVectorFactor();
4788    // Try to build the correct order for extractelement instructions.
4789 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4790 TE.ReuseShuffleIndices.end());
4791 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4792 all_of(TE.Scalars, [Sz](Value *V) {
4793 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4794 return Idx && *Idx < Sz;
4795 })) {
4796 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4797 if (TE.ReorderIndices.empty())
4798 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4799 else
4800 inversePermutation(TE.ReorderIndices, ReorderMask);
4801 for (unsigned I = 0; I < VF; ++I) {
4802 int &Idx = ReusedMask[I];
4803 if (Idx == PoisonMaskElem)
4804 continue;
4805 Value *V = TE.Scalars[ReorderMask[Idx]];
4806 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4807 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4808 }
4809 }
4810 // Build the order of the VF size, need to reorder reuses shuffles, they are
4811 // always of VF size.
4812 OrdersType ResOrder(VF);
4813 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4814 auto *It = ResOrder.begin();
4815 for (unsigned K = 0; K < VF; K += Sz) {
4816 OrdersType CurrentOrder(TE.ReorderIndices);
4817 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4818 if (SubMask.front() == PoisonMaskElem)
4819 std::iota(SubMask.begin(), SubMask.end(), 0);
4820 reorderOrder(CurrentOrder, SubMask);
4821 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4822 std::advance(It, Sz);
4823 }
4824 if (TE.State == TreeEntry::NeedToGather &&
4825 all_of(enumerate(ResOrder),
4826 [](const auto &Data) { return Data.index() == Data.value(); }))
4827 return std::nullopt; // No need to reorder.
4828 return std::move(ResOrder);
4829 }
4830 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4831 any_of(TE.UserTreeIndices,
4832 [](const EdgeInfo &EI) {
4833 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4834 }) &&
4835 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
4836 return std::nullopt;
4837 if ((TE.State == TreeEntry::Vectorize ||
4838 TE.State == TreeEntry::StridedVectorize) &&
4839 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4840 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4841 !TE.isAltShuffle())
4842 return TE.ReorderIndices;
4843 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4844 auto PHICompare = [&](unsigned I1, unsigned I2) {
4845 Value *V1 = TE.Scalars[I1];
4846 Value *V2 = TE.Scalars[I2];
4847 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4848 return false;
4849 if (V1->getNumUses() < V2->getNumUses())
4850 return true;
4851 if (V1->getNumUses() > V2->getNumUses())
4852 return false;
4853 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4854 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4855 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4856        if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4857          if (!areTwoInsertFromSameBuildVector(
4858 IE1, IE2,
4859 [](InsertElementInst *II) { return II->getOperand(0); }))
4860 return I1 < I2;
4861 return getInsertIndex(IE1) < getInsertIndex(IE2);
4862 }
4863 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4864 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4865 if (EE1->getOperand(0) != EE2->getOperand(0))
4866 return I1 < I2;
4867 return getInsertIndex(EE1) < getInsertIndex(EE2);
4868 }
4869 return I1 < I2;
4870 };
4871 auto IsIdentityOrder = [](const OrdersType &Order) {
4872 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4873 if (Idx != Order[Idx])
4874 return false;
4875 return true;
4876 };
4877 if (!TE.ReorderIndices.empty())
4878 return TE.ReorderIndices;
4880 SmallVector<unsigned> Phis(TE.Scalars.size());
4881 std::iota(Phis.begin(), Phis.end(), 0);
4882 OrdersType ResOrder(TE.Scalars.size());
4883 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4884 PhiToId[Id] = Id;
4885 stable_sort(Phis, PHICompare);
4886 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4887 ResOrder[Id] = PhiToId[Phis[Id]];
4888 if (IsIdentityOrder(ResOrder))
4889 return std::nullopt; // No need to reorder.
4890 return std::move(ResOrder);
4891 }
4892 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4893 allSameType(TE.Scalars)) {
4894 // TODO: add analysis of other gather nodes with extractelement
4895 // instructions and other values/instructions, not only undefs.
4896 if ((TE.getOpcode() == Instruction::ExtractElement ||
4897 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4898 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4899 all_of(TE.Scalars, [](Value *V) {
4900 auto *EE = dyn_cast<ExtractElementInst>(V);
4901 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4902 })) {
4903 // Check that gather of extractelements can be represented as
4904 // just a shuffle of a single vector.
4905 OrdersType CurrentOrder;
4906 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4907 /*ResizeAllowed=*/true);
4908 if (Reuse || !CurrentOrder.empty())
4909 return std::move(CurrentOrder);
4910 }
4911 // If the gather node is <undef, v, .., poison> and
4912 // insertelement poison, v, 0 [+ permute]
4913 // is cheaper than
4914 // insertelement poison, v, n - try to reorder.
4915 // If rotating the whole graph, exclude the permute cost, the whole graph
4916 // might be transformed.
4917 int Sz = TE.Scalars.size();
4918 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4919 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4920 const auto *It =
4921 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4922 if (It == TE.Scalars.begin())
4923 return OrdersType();
4924 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4925 if (It != TE.Scalars.end()) {
4926 OrdersType Order(Sz, Sz);
4927 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4928 Order[Idx] = 0;
4929 fixupOrderingIndices(Order);
4930 SmallVector<int> Mask;
4931 inversePermutation(Order, Mask);
4932 InstructionCost PermuteCost =
4933 TopToBottom
4934                ? 0
4935                : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
4936 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4937 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4938 PoisonValue::get(Ty), *It);
4939 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4940 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4941 PoisonValue::get(Ty), *It);
4942 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4943 OrdersType Order(Sz, Sz);
4944 Order[Idx] = 0;
4945 return std::move(Order);
4946 }
4947 }
4948 }
4949 if (isSplat(TE.Scalars))
4950 return std::nullopt;
4951 if (TE.Scalars.size() >= 4)
4952 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4953 return Order;
4954 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4955 return CurrentOrder;
4956 }
4957 return std::nullopt;
4958}
4959
4960/// Checks if the given mask is a "clustered" mask with the same clusters of
4961/// size \p Sz, which are not identity submasks.
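/// For example (illustrative), with \p Sz == 4 the mask {1,0,3,2, 1,0,3,2} is a
/// repeated non-identity clustered mask, while {0,1,2,3, 0,1,2,3} (identity
/// submask) and {1,0,3,2, 2,3,0,1} (clusters differ) are not.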
4962static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4963                                               unsigned Sz) {
4964 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4965 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
4966 return false;
4967 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4968 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4969 if (Cluster != FirstCluster)
4970 return false;
4971 }
4972 return true;
4973}
4974
4975void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4976 // Reorder reuses mask.
4977 reorderReuses(TE.ReuseShuffleIndices, Mask);
4978 const unsigned Sz = TE.Scalars.size();
4979  // For vectorized nodes and non-clustered reuses, no need to do anything else.
4980 if (TE.State != TreeEntry::NeedToGather ||
4982 Sz) ||
4983 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4984 return;
4985 SmallVector<int> NewMask;
4986 inversePermutation(TE.ReorderIndices, NewMask);
4987 addMask(NewMask, TE.ReuseShuffleIndices);
4988 // Clear reorder since it is going to be applied to the new mask.
4989 TE.ReorderIndices.clear();
4990 // Try to improve gathered nodes with clustered reuses, if possible.
4991 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4992 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4993 inversePermutation(NewOrder, NewMask);
4994 reorderScalars(TE.Scalars, NewMask);
4995 // Fill the reuses mask with the identity submasks.
4996 for (auto *It = TE.ReuseShuffleIndices.begin(),
4997 *End = TE.ReuseShuffleIndices.end();
4998 It != End; std::advance(It, Sz))
4999 std::iota(It, std::next(It, Sz), 0);
5000}
5001
5002static void combineOrders(MutableArrayRef<unsigned> Order,
5003                          ArrayRef<unsigned> SecondaryOrder) {
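  // Illustrative example: with Sz == 4, Order == {2, 4, 0, 4} ("4" marks an unset
  // slot) and an empty SecondaryOrder, indices 0 and 2 are already used, so the
  // unset slots 1 and 3 are filled with their own positions, giving {2, 1, 0, 3}.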
5004 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5005 "Expected same size of orders");
5006 unsigned Sz = Order.size();
5007 SmallBitVector UsedIndices(Sz);
5008 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5009 if (Order[Idx] != Sz)
5010 UsedIndices.set(Order[Idx]);
5011 }
5012 if (SecondaryOrder.empty()) {
5013 for (unsigned Idx : seq<unsigned>(0, Sz))
5014 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5015 Order[Idx] = Idx;
5016 } else {
5017 for (unsigned Idx : seq<unsigned>(0, Sz))
5018 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5019 !UsedIndices.test(SecondaryOrder[Idx]))
5020 Order[Idx] = SecondaryOrder[Idx];
5021 }
5022}
5023
5024void BoUpSLP::reorderTopToBottom() {
5025  // Maps VF to the graph nodes.
5027 // ExtractElement gather nodes which can be vectorized and need to handle
5028 // their ordering.
5030
5031 // Phi nodes can have preferred ordering based on their result users
5033
5034 // AltShuffles can also have a preferred ordering that leads to fewer
5035 // instructions, e.g., the addsub instruction in x86.
5036 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5037
5038 // Maps a TreeEntry to the reorder indices of external users.
5040 ExternalUserReorderMap;
5041 // Find all reorderable nodes with the given VF.
5042  // Currently these are vectorized stores, loads, extracts + some gathering of
5043  // extracts.
5044 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5045 const std::unique_ptr<TreeEntry> &TE) {
5046 // Look for external users that will probably be vectorized.
5047 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5048 findExternalStoreUsersReorderIndices(TE.get());
5049 if (!ExternalUserReorderIndices.empty()) {
5050 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5051 ExternalUserReorderMap.try_emplace(TE.get(),
5052 std::move(ExternalUserReorderIndices));
5053 }
5054
5055 // Patterns like [fadd,fsub] can be combined into a single instruction in
5056 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5057 // to take into account their order when looking for the most used order.
5058 if (TE->isAltShuffle()) {
5059 VectorType *VecTy =
5060 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
5061 unsigned Opcode0 = TE->getOpcode();
5062 unsigned Opcode1 = TE->getAltOpcode();
5063 // The opcode mask selects between the two opcodes.
5064 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
5065 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
5066 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
5067 OpcodeMask.set(Lane);
5068 // If this pattern is supported by the target then we consider the order.
5069 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5070 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5071 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5072 }
5073 // TODO: Check the reverse order too.
5074 }
5075
5076 if (std::optional<OrdersType> CurrentOrder =
5077 getReorderingData(*TE, /*TopToBottom=*/true)) {
5078 // Do not include ordering for nodes used in the alt opcode vectorization,
5079      // better to reorder them during bottom-to-top stage. If we follow the order
5080      // here, it causes reordering of the whole graph, though actually it is
5081      // profitable just to reorder the subgraph that starts from the alternate
5082      // opcode vectorization node. Such nodes already end up with the shuffle
5083 // instruction and it is just enough to change this shuffle rather than
5084 // rotate the scalars for the whole graph.
5085 unsigned Cnt = 0;
5086 const TreeEntry *UserTE = TE.get();
5087 while (UserTE && Cnt < RecursionMaxDepth) {
5088 if (UserTE->UserTreeIndices.size() != 1)
5089 break;
5090 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5091 return EI.UserTE->State == TreeEntry::Vectorize &&
5092 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5093 }))
5094 return;
5095 UserTE = UserTE->UserTreeIndices.back().UserTE;
5096 ++Cnt;
5097 }
5098 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5099 if (!(TE->State == TreeEntry::Vectorize ||
5100 TE->State == TreeEntry::StridedVectorize) ||
5101 !TE->ReuseShuffleIndices.empty())
5102 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5103 if (TE->State == TreeEntry::Vectorize &&
5104 TE->getOpcode() == Instruction::PHI)
5105 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5106 }
5107 });
5108
5109 // Reorder the graph nodes according to their vectorization factor.
5110 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5111 VF /= 2) {
5112 auto It = VFToOrderedEntries.find(VF);
5113 if (It == VFToOrderedEntries.end())
5114 continue;
5115    // Try to find the most profitable order. We are just looking for the most
5116    // used order and reorder scalar elements in the nodes according to this
5117    // most used order.
5118 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5119 // All operands are reordered and used only in this node - propagate the
5120 // most used order to the user node.
5123 OrdersUses;
5125 for (const TreeEntry *OpTE : OrderedEntries) {
5126      // No need to reorder these nodes; we still need to extend and use a shuffle,
5127      // just merge the reordering shuffle and the reuse shuffle.
5128 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5129 continue;
5130 // Count number of orders uses.
5131 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5132 &PhisToOrders]() -> const OrdersType & {
5133 if (OpTE->State == TreeEntry::NeedToGather ||
5134 !OpTE->ReuseShuffleIndices.empty()) {
5135 auto It = GathersToOrders.find(OpTE);
5136 if (It != GathersToOrders.end())
5137 return It->second;
5138 }
5139 if (OpTE->isAltShuffle()) {
5140 auto It = AltShufflesToOrders.find(OpTE);
5141 if (It != AltShufflesToOrders.end())
5142 return It->second;
5143 }
5144 if (OpTE->State == TreeEntry::Vectorize &&
5145 OpTE->getOpcode() == Instruction::PHI) {
5146 auto It = PhisToOrders.find(OpTE);
5147 if (It != PhisToOrders.end())
5148 return It->second;
5149 }
5150 return OpTE->ReorderIndices;
5151 }();
5152 // First consider the order of the external scalar users.
5153 auto It = ExternalUserReorderMap.find(OpTE);
5154 if (It != ExternalUserReorderMap.end()) {
5155 const auto &ExternalUserReorderIndices = It->second;
5156 // If the OpTE vector factor != number of scalars - use natural order,
5157 // it is an attempt to reorder node with reused scalars but with
5158 // external uses.
5159 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5160 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5161 ExternalUserReorderIndices.size();
5162 } else {
5163 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5164 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5165 }
5166 // No other useful reorder data in this entry.
5167 if (Order.empty())
5168 continue;
5169 }
5170 // Stores actually store the mask, not the order, need to invert.
5171 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5172 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5173 SmallVector<int> Mask;
5174 inversePermutation(Order, Mask);
5175 unsigned E = Order.size();
5176 OrdersType CurrentOrder(E, E);
5177 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5178 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5179 });
5180 fixupOrderingIndices(CurrentOrder);
5181 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5182 } else {
5183 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5184 }
5185 }
5186 if (OrdersUses.empty())
5187 continue;
5188 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5189 const unsigned Sz = Order.size();
5190 for (unsigned Idx : seq<unsigned>(0, Sz))
5191 if (Idx != Order[Idx] && Order[Idx] != Sz)
5192 return false;
5193 return true;
5194 };
5195 // Choose the most used order.
5196 unsigned IdentityCnt = 0;
5197 unsigned FilledIdentityCnt = 0;
5198 OrdersType IdentityOrder(VF, VF);
5199 for (auto &Pair : OrdersUses) {
5200 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5201 if (!Pair.first.empty())
5202 FilledIdentityCnt += Pair.second;
5203 IdentityCnt += Pair.second;
5204 combineOrders(IdentityOrder, Pair.first);
5205 }
5206 }
5207 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5208 unsigned Cnt = IdentityCnt;
5209 for (auto &Pair : OrdersUses) {
5210      // Prefer the identity order. But if a filled identity (non-empty order) was
5211      // found with the same number of uses as the new candidate order, we can
5212      // choose this candidate order.
5213 if (Cnt < Pair.second ||
5214 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5215 Cnt == Pair.second && !BestOrder.empty() &&
5216 IsIdentityOrder(BestOrder))) {
5217 combineOrders(Pair.first, BestOrder);
5218 BestOrder = Pair.first;
5219 Cnt = Pair.second;
5220 } else {
5221 combineOrders(BestOrder, Pair.first);
5222 }
5223 }
5224 // Set order of the user node.
5225 if (IsIdentityOrder(BestOrder))
5226 continue;
5227 fixupOrderingIndices(BestOrder);
5228 SmallVector<int> Mask;
5229 inversePermutation(BestOrder, Mask);
5230 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5231 unsigned E = BestOrder.size();
5232 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5233 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5234 });
5235 // Do an actual reordering, if profitable.
5236 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5237 // Just do the reordering for the nodes with the given VF.
5238 if (TE->Scalars.size() != VF) {
5239 if (TE->ReuseShuffleIndices.size() == VF) {
5240 // Need to reorder the reuses masks of the operands with smaller VF to
5241 // be able to find the match between the graph nodes and scalar
5242 // operands of the given node during vectorization/cost estimation.
5243 assert(all_of(TE->UserTreeIndices,
5244 [VF, &TE](const EdgeInfo &EI) {
5245 return EI.UserTE->Scalars.size() == VF ||
5246 EI.UserTE->Scalars.size() ==
5247 TE->Scalars.size();
5248 }) &&
5249 "All users must be of VF size.");
5250 // Update ordering of the operands with the smaller VF than the given
5251 // one.
5252 reorderNodeWithReuses(*TE, Mask);
5253 }
5254 continue;
5255 }
5256 if ((TE->State == TreeEntry::Vectorize ||
5257 TE->State == TreeEntry::StridedVectorize) &&
5259 InsertElementInst>(TE->getMainOp()) &&
5260 !TE->isAltShuffle()) {
5261 // Build correct orders for extract{element,value}, loads and
5262 // stores.
5263 reorderOrder(TE->ReorderIndices, Mask);
5264 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5265 TE->reorderOperands(Mask);
5266 } else {
5267 // Reorder the node and its operands.
5268 TE->reorderOperands(Mask);
5269 assert(TE->ReorderIndices.empty() &&
5270 "Expected empty reorder sequence.");
5271 reorderScalars(TE->Scalars, Mask);
5272 }
5273 if (!TE->ReuseShuffleIndices.empty()) {
5274 // Apply reversed order to keep the original ordering of the reused
5275 // elements to avoid extra reorder indices shuffling.
5276 OrdersType CurrentOrder;
5277 reorderOrder(CurrentOrder, MaskOrder);
5278 SmallVector<int> NewReuses;
5279 inversePermutation(CurrentOrder, NewReuses);
5280 addMask(NewReuses, TE->ReuseShuffleIndices);
5281 TE->ReuseShuffleIndices.swap(NewReuses);
5282 }
5283 }
5284 }
5285}
5286
5287bool BoUpSLP::canReorderOperands(
5288 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5289 ArrayRef<TreeEntry *> ReorderableGathers,
5290 SmallVectorImpl<TreeEntry *> &GatherOps) {
5291 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5292 if (UserTE->isNonPowOf2Vec())
5293 return false;
5294
5295 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5296 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5297 return OpData.first == I &&
5298 (OpData.second->State == TreeEntry::Vectorize ||
5299 OpData.second->State == TreeEntry::StridedVectorize);
5300 }))
5301 continue;
5302 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5303 // Do not reorder if operand node is used by many user nodes.
5304 if (any_of(TE->UserTreeIndices,
5305 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5306 return false;
5307 // Add the node to the list of the ordered nodes with the identity
5308 // order.
5309 Edges.emplace_back(I, TE);
5310 // Add ScatterVectorize nodes to the list of operands, where just
5311 // reordering of the scalars is required. Similar to the gathers, so
5312 // simply add to the list of gathered ops.
5313 // If there are reused scalars, process this node as a regular vectorize
5314 // node, just reorder reuses mask.
5315 if (TE->State != TreeEntry::Vectorize &&
5316 TE->State != TreeEntry::StridedVectorize &&
5317 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5318 GatherOps.push_back(TE);
5319 continue;
5320 }
5321 TreeEntry *Gather = nullptr;
5322 if (count_if(ReorderableGathers,
5323 [&Gather, UserTE, I](TreeEntry *TE) {
5324 assert(TE->State != TreeEntry::Vectorize &&
5325 TE->State != TreeEntry::StridedVectorize &&
5326 "Only non-vectorized nodes are expected.");
5327 if (any_of(TE->UserTreeIndices,
5328 [UserTE, I](const EdgeInfo &EI) {
5329 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5330 })) {
5331 assert(TE->isSame(UserTE->getOperand(I)) &&
5332 "Operand entry does not match operands.");
5333 Gather = TE;
5334 return true;
5335 }
5336 return false;
5337 }) > 1 &&
5338 !allConstant(UserTE->getOperand(I)))
5339 return false;
5340 if (Gather)
5341 GatherOps.push_back(Gather);
5342 }
5343 return true;
5344}
5345
5346void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5347 SetVector<TreeEntry *> OrderedEntries;
5348 DenseSet<const TreeEntry *> GathersToOrders;
5349 // Find all reorderable leaf nodes with the given VF.
5350  // Currently these are vectorized loads, extracts without alternate operands +
5351  // some gathering of extracts.
5352 SmallVector<TreeEntry *> NonVectorized;
5353 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5354 if (TE->State != TreeEntry::Vectorize &&
5355 TE->State != TreeEntry::StridedVectorize)
5356 NonVectorized.push_back(TE.get());
5357 if (std::optional<OrdersType> CurrentOrder =
5358 getReorderingData(*TE, /*TopToBottom=*/false)) {
5359 OrderedEntries.insert(TE.get());
5360 if (!(TE->State == TreeEntry::Vectorize ||
5361 TE->State == TreeEntry::StridedVectorize) ||
5362 !TE->ReuseShuffleIndices.empty())
5363 GathersToOrders.insert(TE.get());
5364 }
5365 }
5366
5367 // 1. Propagate order to the graph nodes, which use only reordered nodes.
5368 // I.e., if the node has operands, that are reordered, try to make at least
5369 // one operand order in the natural order and reorder others + reorder the
5370 // user node itself.
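  // Illustrative sketch: if most operands of a user node prefer the order {1, 0},
  // that order wins; those operands are brought back to the natural order while
  // the user node itself (its scalars and/or reorder indices) takes the {1, 0}
  // order, so the remaining shuffle is pushed closer to the root of the graph.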
5372 while (!OrderedEntries.empty()) {
5373 // 1. Filter out only reordered nodes.
5374 // 2. If the entry has multiple uses - skip it and jump to the next node.
5376 SmallVector<TreeEntry *> Filtered;
5377 for (TreeEntry *TE : OrderedEntries) {
5378 if (!(TE->State == TreeEntry::Vectorize ||
5379 TE->State == TreeEntry::StridedVectorize ||
5380 (TE->State == TreeEntry::NeedToGather &&
5381 GathersToOrders.contains(TE))) ||
5382 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5383 !all_of(drop_begin(TE->UserTreeIndices),
5384 [TE](const EdgeInfo &EI) {
5385 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5386 }) ||
5387 !Visited.insert(TE).second) {
5388 Filtered.push_back(TE);
5389 continue;
5390 }
5391 // Build a map between user nodes and their operands order to speedup
5392 // search. The graph currently does not provide this dependency directly.
5393 for (EdgeInfo &EI : TE->UserTreeIndices) {
5394 TreeEntry *UserTE = EI.UserTE;
5395 auto It = Users.find(UserTE);
5396 if (It == Users.end())
5397 It = Users.insert({UserTE, {}}).first;
5398 It->second.emplace_back(EI.EdgeIdx, TE);
5399 }
5400 }
5401 // Erase filtered entries.
5402 for (TreeEntry *TE : Filtered)
5403 OrderedEntries.remove(TE);
5405 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5406 UsersVec(Users.begin(), Users.end());
5407 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5408 return Data1.first->Idx > Data2.first->Idx;
5409 });
5410 for (auto &Data : UsersVec) {
5411 // Check that operands are used only in the User node.
5412 SmallVector<TreeEntry *> GatherOps;
5413 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5414 GatherOps)) {
5415 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5416 OrderedEntries.remove(Op.second);
5417 continue;
5418 }
5419 // All operands are reordered and used only in this node - propagate the
5420 // most used order to the user node.
5423 OrdersUses;
5424      // Do the analysis for each tree entry only once, otherwise the order of
5425      // the same node may be considered several times, though it might not be
5426      // profitable.
5429 for (const auto &Op : Data.second) {
5430 TreeEntry *OpTE = Op.second;
5431 if (!VisitedOps.insert(OpTE).second)
5432 continue;
5433 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5434 continue;
5435 const auto Order = [&]() -> const OrdersType {
5436 if (OpTE->State == TreeEntry::NeedToGather ||
5437 !OpTE->ReuseShuffleIndices.empty())
5438 return getReorderingData(*OpTE, /*TopToBottom=*/false)
5439 .value_or(OrdersType(1));
5440 return OpTE->ReorderIndices;
5441 }();
5442 // The order is partially ordered, skip it in favor of fully non-ordered
5443 // orders.
5444 if (Order.size() == 1)
5445 continue;
5446 unsigned NumOps = count_if(
5447 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5448 return P.second == OpTE;
5449 });
5450 // Stores actually store the mask, not the order, need to invert.
5451 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5452 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5453 SmallVector<int> Mask;
5454 inversePermutation(Order, Mask);
5455 unsigned E = Order.size();
5456 OrdersType CurrentOrder(E, E);
5457 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5458 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5459 });
5460 fixupOrderingIndices(CurrentOrder);
5461 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5462 NumOps;
5463 } else {
5464 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5465 }
5466 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5467 const auto AllowsReordering = [&](const TreeEntry *TE) {
5468 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5469 if (TE->isNonPowOf2Vec())
5470 return false;
5471 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5472 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5473 (IgnoreReorder && TE->Idx == 0))
5474 return true;
5475 if (TE->State == TreeEntry::NeedToGather) {
5476 if (GathersToOrders.contains(TE))
5477 return !getReorderingData(*TE, /*TopToBottom=*/false)
5478 .value_or(OrdersType(1))
5479 .empty();
5480 return true;
5481 }
5482 return false;
5483 };
5484 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5485 TreeEntry *UserTE = EI.UserTE;
5486 if (!VisitedUsers.insert(UserTE).second)
5487 continue;
5488 // May reorder user node if it requires reordering, has reused
5489 // scalars, is an alternate op vectorize node or its op nodes require
5490 // reordering.
5491 if (AllowsReordering(UserTE))
5492 continue;
5493 // Check if users allow reordering.
5494 // Currently look up just 1 level of operands to avoid increase of
5495 // the compile time.
5496 // Profitable to reorder if definitely more operands allow
5497 // reordering rather than those with natural order.
5499 if (static_cast<unsigned>(count_if(
5500 Ops, [UserTE, &AllowsReordering](
5501 const std::pair<unsigned, TreeEntry *> &Op) {
5502 return AllowsReordering(Op.second) &&
5503 all_of(Op.second->UserTreeIndices,
5504 [UserTE](const EdgeInfo &EI) {
5505 return EI.UserTE == UserTE;
5506 });
5507 })) <= Ops.size() / 2)
5508 ++Res.first->second;
5509 }
5510 }
5511 if (OrdersUses.empty()) {
5512 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5513 OrderedEntries.remove(Op.second);
5514 continue;
5515 }
5516 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5517 const unsigned Sz = Order.size();
5518 for (unsigned Idx : seq<unsigned>(0, Sz))
5519 if (Idx != Order[Idx] && Order[Idx] != Sz)
5520 return false;
5521 return true;
5522 };
5523 // Choose the most used order.
5524 unsigned IdentityCnt = 0;
5525 unsigned VF = Data.second.front().second->getVectorFactor();
5526 OrdersType IdentityOrder(VF, VF);
5527 for (auto &Pair : OrdersUses) {
5528 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5529 IdentityCnt += Pair.second;
5530 combineOrders(IdentityOrder, Pair.first);
5531 }
5532 }
5533 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5534 unsigned Cnt = IdentityCnt;
5535 for (auto &Pair : OrdersUses) {
5536        // Prefer the identity order. But if a filled identity (non-empty
5537        // order) was found with the same number of uses as the new candidate
5538        // order, we can choose this candidate order.
5539 if (Cnt < Pair.second) {
5540 combineOrders(Pair.first, BestOrder);
5541 BestOrder = Pair.first;
5542 Cnt = Pair.second;
5543 } else {
5544 combineOrders(BestOrder, Pair.first);
5545 }
5546 }
5547 // Set order of the user node.
5548 if (IsIdentityOrder(BestOrder)) {
5549 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5550 OrderedEntries.remove(Op.second);
5551 continue;
5552 }
5553 fixupOrderingIndices(BestOrder);
5554 // Erase operands from OrderedEntries list and adjust their orders.
5555 VisitedOps.clear();
5556 SmallVector<int> Mask;
5557 inversePermutation(BestOrder, Mask);
5558 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5559 unsigned E = BestOrder.size();
5560 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5561 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5562 });
5563 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5564 TreeEntry *TE = Op.second;
5565 OrderedEntries.remove(TE);
5566 if (!VisitedOps.insert(TE).second)
5567 continue;
5568 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5569 reorderNodeWithReuses(*TE, Mask);
5570 continue;
5571 }
5572 // Gathers are processed separately.
5573 if (TE->State != TreeEntry::Vectorize &&
5574 TE->State != TreeEntry::StridedVectorize &&
5575 (TE->State != TreeEntry::ScatterVectorize ||
5576 TE->ReorderIndices.empty()))
5577 continue;
5578 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5579 TE->ReorderIndices.empty()) &&
5580 "Non-matching sizes of user/operand entries.");
5581 reorderOrder(TE->ReorderIndices, Mask);
5582 if (IgnoreReorder && TE == VectorizableTree.front().get())
5583 IgnoreReorder = false;
5584 }
5585 // For gathers just need to reorder its scalars.
5586 for (TreeEntry *Gather : GatherOps) {
5587 assert(Gather->ReorderIndices.empty() &&
5588 "Unexpected reordering of gathers.");
5589 if (!Gather->ReuseShuffleIndices.empty()) {
5590 // Just reorder reuses indices.
5591 reorderReuses(Gather->ReuseShuffleIndices, Mask);
5592 continue;
5593 }
5594 reorderScalars(Gather->Scalars, Mask);
5595 OrderedEntries.remove(Gather);
5596 }
5597 // Reorder operands of the user node and set the ordering for the user
5598 // node itself.
5599 if (Data.first->State != TreeEntry::Vectorize ||
5600 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5601 Data.first->getMainOp()) ||
5602 Data.first->isAltShuffle())
5603 Data.first->reorderOperands(Mask);
5604 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5605 Data.first->isAltShuffle() ||
5606 Data.first->State == TreeEntry::StridedVectorize) {
5607 reorderScalars(Data.first->Scalars, Mask);
5608 reorderOrder(Data.first->ReorderIndices, MaskOrder,
5609 /*BottomOrder=*/true);
5610 if (Data.first->ReuseShuffleIndices.empty() &&
5611 !Data.first->ReorderIndices.empty() &&
5612 !Data.first->isAltShuffle()) {
5613 // Insert user node to the list to try to sink reordering deeper in
5614 // the graph.
5615 OrderedEntries.insert(Data.first);
5616 }
5617 } else {
5618 reorderOrder(Data.first->ReorderIndices, Mask);
5619 }
5620 }
5621 }
5622 // If the reordering is unnecessary, just remove the reorder.
5623 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5624 VectorizableTree.front()->ReuseShuffleIndices.empty())
5625 VectorizableTree.front()->ReorderIndices.clear();
5626}
5627
5629 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5630 DenseMap<Value *, unsigned> ScalarToExtUses;
5631 // Collect the values that we need to extract from the tree.
5632 for (auto &TEPtr : VectorizableTree) {
5633 TreeEntry *Entry = TEPtr.get();
5634
5635 // No need to handle users of gathered values.
5636 if (Entry->State == TreeEntry::NeedToGather)
5637 continue;
5638
5639 // For each lane:
5640 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5641 Value *Scalar = Entry->Scalars[Lane];
5642 if (!isa<Instruction>(Scalar))
5643 continue;
5644 // All uses must be replaced already? No need to do it again.
5645 auto It = ScalarToExtUses.find(Scalar);
5646 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5647 continue;
5648
5649 // Check if the scalar is externally used as an extra arg.
5650 const auto *ExtI = ExternallyUsedValues.find(Scalar);
5651 if (ExtI != ExternallyUsedValues.end()) {
5652 int FoundLane = Entry->findLaneForValue(Scalar);
5653 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5654 << FoundLane << " from " << *Scalar << ".\n");
5655 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
5656 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5657 continue;
5658 }
5659 for (User *U : Scalar->users()) {
5660 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5661
5662 Instruction *UserInst = dyn_cast<Instruction>(U);
5663 if (!UserInst || isDeleted(UserInst))
5664 continue;
5665
5666 // Ignore users in the user ignore list.
5667 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5668 continue;
5669
5670 // Skip in-tree scalars that become vectors
5671 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5672 // Some in-tree scalars will remain as scalar in vectorized
5673 // instructions. If that is the case, the one in FoundLane will
5674 // be used.
5675          if (UseEntry->State == TreeEntry::ScatterVectorize ||
5676              !doesInTreeUserNeedToExtract(
5677 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5678 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5679 << ".\n");
5680 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
5681 continue;
5682 }
5683 U = nullptr;
5684 if (It != ScalarToExtUses.end()) {
5685 ExternalUses[It->second].User = nullptr;
5686 break;
5687 }
5688 }
5689
5690 int FoundLane = Entry->findLaneForValue(Scalar);
5691 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5692 << " from lane " << FoundLane << " from " << *Scalar
5693 << ".\n");
5694 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
5695 ExternalUses.emplace_back(Scalar, U, FoundLane);
5696 if (!U)
5697 break;
5698 }
5699 }
5700 }
5701}
5702
5704BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5706 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5707 Value *V = TE->Scalars[Lane];
5708 // To save compilation time we don't visit if we have too many users.
5709 if (V->hasNUsesOrMore(UsesLimit))
5710 break;
5711
5712 // Collect stores per pointer object.
5713 for (User *U : V->users()) {
5714 auto *SI = dyn_cast<StoreInst>(U);
5715 if (SI == nullptr || !SI->isSimple() ||
5716 !isValidElementType(SI->getValueOperand()->getType()))
5717 continue;
5718      // Skip the entry if the user store is already part of the tree.
5719 if (getTreeEntry(U))
5720 continue;
5721
5722 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5723 auto &StoresVec = PtrToStoresMap[Ptr];
5724 // For now just keep one store per pointer object per lane.
5725 // TODO: Extend this to support multiple stores per pointer per lane
5726 if (StoresVec.size() > Lane)
5727 continue;
5728 // Skip if in different BBs.
5729 if (!StoresVec.empty() &&
5730 SI->getParent() != StoresVec.back()->getParent())
5731 continue;
5732 // Make sure that the stores are of the same type.
5733 if (!StoresVec.empty() &&
5734 SI->getValueOperand()->getType() !=
5735 StoresVec.back()->getValueOperand()->getType())
5736 continue;
5737 StoresVec.push_back(SI);
5738 }
5739 }
5740 return PtrToStoresMap;
5741}
5742
5743bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5744 OrdersType &ReorderIndices) const {
5745  // We check whether the stores in StoresVec can form a vector by sorting them
5746 // and checking whether they are consecutive.
5747
5748 // To avoid calling getPointersDiff() while sorting we create a vector of
5749 // pairs {store, offset from first} and sort this instead.
5750 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5751 StoreInst *S0 = StoresVec[0];
5752 StoreOffsetVec[0] = {S0, 0};
5753 Type *S0Ty = S0->getValueOperand()->getType();
5754 Value *S0Ptr = S0->getPointerOperand();
5755 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5756 StoreInst *SI = StoresVec[Idx];
5757 std::optional<int> Diff =
5758 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5759 SI->getPointerOperand(), *DL, *SE,
5760 /*StrictCheck=*/true);
5761 // We failed to compare the pointers so just abandon this StoresVec.
5762 if (!Diff)
5763 return false;
5764 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5765 }
5766
5767 // Sort the vector based on the pointers. We create a copy because we may
5768 // need the original later for calculating the reorder (shuffle) indices.
5769 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5770 const std::pair<StoreInst *, int> &Pair2) {
5771 int Offset1 = Pair1.second;
5772 int Offset2 = Pair2.second;
5773 return Offset1 < Offset2;
5774 });
5775
5776 // Check if the stores are consecutive by checking if their difference is 1.
5777 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5778 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5779 return false;
5780
5781 // Calculate the shuffle indices according to their offset against the sorted
5782 // StoreOffsetVec.
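  // Illustrative example: stores whose offsets appear as {0, 2, 1, 3} in StoresVec
  // sort to offsets {0, 1, 2, 3}; the original stores then land at positions
  // {0, 2, 1, 3} of the sorted vector, which becomes ReorderIndices.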
5783 ReorderIndices.reserve(StoresVec.size());
5784 for (StoreInst *SI : StoresVec) {
5785 unsigned Idx = find_if(StoreOffsetVec,
5786 [SI](const std::pair<StoreInst *, int> &Pair) {
5787 return Pair.first == SI;
5788 }) -
5789 StoreOffsetVec.begin();
5790 ReorderIndices.push_back(Idx);
5791 }
5792 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
5793 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
5794 // same convention here.
5795 auto IsIdentityOrder = [](const OrdersType &Order) {
5796 for (unsigned Idx : seq<unsigned>(0, Order.size()))
5797 if (Idx != Order[Idx])
5798 return false;
5799 return true;
5800 };
5801 if (IsIdentityOrder(ReorderIndices))
5802 ReorderIndices.clear();
5803
5804 return true;
5805}
5806
5807#ifndef NDEBUG
5809 for (unsigned Idx : Order)
5810 dbgs() << Idx << ", ";
5811 dbgs() << "\n";
5812}
5813#endif
5814
5816BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
5817 unsigned NumLanes = TE->Scalars.size();
5818
5820 collectUserStores(TE);
5821
5822 // Holds the reorder indices for each candidate store vector that is a user of
5823 // the current TreeEntry.
5824 SmallVector<OrdersType, 1> ExternalReorderIndices;
5825
5826 // Now inspect the stores collected per pointer and look for vectorization
5827 // candidates. For each candidate calculate the reorder index vector and push
5828 // it into `ExternalReorderIndices`
5829 for (const auto &Pair : PtrToStoresMap) {
5830 auto &StoresVec = Pair.second;
5831 // If we have fewer than NumLanes stores, then we can't form a vector.
5832 if (StoresVec.size() != NumLanes)
5833 continue;
5834
5835 // If the stores are not consecutive then abandon this StoresVec.
5836 OrdersType ReorderIndices;
5837 if (!canFormVector(StoresVec, ReorderIndices))
5838 continue;
5839
5840 // We now know that the scalars in StoresVec can form a vector instruction,
5841 // so set the reorder indices.
5842 ExternalReorderIndices.push_back(ReorderIndices);
5843 }
5844 return ExternalReorderIndices;
5845}
5846
5848 const SmallDenseSet<Value *> &UserIgnoreLst) {
5849 deleteTree();
5850 UserIgnoreList = &UserIgnoreLst;
5851 if (!allSameType(Roots))
5852 return;
5853 buildTree_rec(Roots, 0, EdgeInfo());
5854}
5855
5857 deleteTree();
5858 if (!allSameType(Roots))
5859 return;
5860 buildTree_rec(Roots, 0, EdgeInfo());
5861}
5862
5863/// \return true if the specified list of values has only one instruction that
5864/// requires scheduling, false otherwise.
5865#ifndef NDEBUG
5867 Value *NeedsScheduling = nullptr;
5868  for (Value *V : VL) {
5869    if (doesNotNeedToBeScheduled(V))
5870 continue;
5871 if (!NeedsScheduling) {
5872 NeedsScheduling = V;
5873 continue;
5874 }
5875 return false;
5876 }
5877 return NeedsScheduling;
5878}
5879#endif
5880
5881/// Generates a key/subkey pair for the given value to provide effective sorting
5882/// of the values and better detection of vectorizable value sequences. The
5883/// keys/subkeys can be used for better sorting of the values themselves (keys)
5884/// and within value subgroups (subkeys).
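/// For instance (illustrative), simple loads share a key based on their type and
/// the Load opcode and are further split by a subkey provided by
/// \p LoadsSubkeyGenerator (e.g. based on pointer distance), while calls to the
/// same trivially vectorizable intrinsic share a subkey based on the intrinsic ID.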
5885static std::pair<size_t, size_t> generateKeySubkey(
5886 Value *V, const TargetLibraryInfo *TLI,
5887 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
5888 bool AllowAlternate) {
5889 hash_code Key = hash_value(V->getValueID() + 2);
5890 hash_code SubKey = hash_value(0);
5891 // Sort the loads by the distance between the pointers.
5892 if (auto *LI = dyn_cast<LoadInst>(V)) {
5893 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
5894 if (LI->isSimple())
5895 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
5896 else
5897 Key = SubKey = hash_value(LI);
5898 } else if (isVectorLikeInstWithConstOps(V)) {
5899 // Sort extracts by the vector operands.
5900 if (isa<ExtractElementInst, UndefValue>(V))
5901 Key = hash_value(Value::UndefValueVal + 1);
5902 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
5903 if (!isUndefVector(EI->getVectorOperand()).all() &&
5904 !isa<UndefValue>(EI->getIndexOperand()))
5905 SubKey = hash_value(EI->getVectorOperand());
5906 }
5907 } else if (auto *I = dyn_cast<Instruction>(V)) {
5908 // Sort other instructions just by the opcodes except for CMPInst.
5909 // For CMP also sort by the predicate kind.
5910 if ((isa<BinaryOperator, CastInst>(I)) &&
5911 isValidForAlternation(I->getOpcode())) {
5912 if (AllowAlternate)
5913 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
5914 else
5915 Key = hash_combine(hash_value(I->getOpcode()), Key);
5916 SubKey = hash_combine(
5917 hash_value(I->getOpcode()), hash_value(I->getType()),
5918 hash_value(isa<BinaryOperator>(I)
5919 ? I->getType()
5920 : cast<CastInst>(I)->getOperand(0)->getType()));
5921 // For casts, look through the only operand to improve compile time.
5922 if (isa<CastInst>(I)) {
5923 std::pair<size_t, size_t> OpVals =
5924 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
5925 /*AllowAlternate=*/true);
5926 Key = hash_combine(OpVals.first, Key);
5927 SubKey = hash_combine(OpVals.first, SubKey);
5928 }
5929 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
5930 CmpInst::Predicate Pred = CI->getPredicate();
5931 if (CI->isCommutative())
5932        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
5933      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
5934 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
5935 hash_value(SwapPred),
5936 hash_value(CI->getOperand(0)->getType()));
5937    } else if (auto *Call = dyn_cast<CallInst>(I)) {
5938      Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
5939      if (isTriviallyVectorizable(ID)) {
5940 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
5941 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
5942 SubKey = hash_combine(hash_value(I->getOpcode()),
5943 hash_value(Call->getCalledFunction()));
5944 } else {
5945 Key = hash_combine(hash_value(Call), Key);
5946 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
5947 }
5948 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
5949 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
5950 hash_value(Op.Tag), SubKey);
5951 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
5952 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5953 SubKey = hash_value(Gep->getPointerOperand());
5954 else
5955 SubKey = hash_value(Gep);
5956 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
5957 !isa<ConstantInt>(I->getOperand(1))) {
5958 // Do not try to vectorize instructions with potentially high cost.
5959 SubKey = hash_value(I);
5960 } else {
5961 SubKey = hash_value(I->getOpcode());
5962 }
5963 Key = hash_combine(hash_value(I->getParent()), Key);
5964 }
5965 return std::make_pair(Key, SubKey);
5966}
5967
5968/// Checks if the specified instruction \p I is an alternate operation for
5969/// the given \p MainOp and \p AltOp instructions.
5970static bool isAlternateInstruction(const Instruction *I,
5971 const Instruction *MainOp,
5972 const Instruction *AltOp,
5973 const TargetLibraryInfo &TLI);
5974
5975bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
5976 ArrayRef<Value *> VL) const {
5977 unsigned Opcode0 = S.getOpcode();
5978 unsigned Opcode1 = S.getAltOpcode();
5979 // The opcode mask selects between the two opcodes.
5980 SmallBitVector OpcodeMask(VL.size(), false);
5981 for (unsigned Lane : seq<unsigned>(0, VL.size()))
5982 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
5983 OpcodeMask.set(Lane);
5984 // If this pattern is supported by the target then consider it profitable.
5985 if (TTI->isLegalAltInstr(FixedVectorType::get(S.MainOp->getType(), VL.size()),
5986 Opcode0, Opcode1, OpcodeMask))
5987 return true;
5989 for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
5990 Operands.emplace_back();
5991 // Prepare the operand vector.
5992 for (Value *V : VL)
5993 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
5994 }
5995 if (Operands.size() == 2) {
5996    // Try to find the best operand candidates.
5997 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
5999 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
6000 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
6001 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
6002 std::optional<int> Res = findBestRootPair(Candidates);
6003 switch (Res.value_or(0)) {
6004 case 0:
6005 break;
6006 case 1:
6007 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
6008 break;
6009 case 2:
6010 std::swap(Operands[0][I], Operands[1][I]);
6011 break;
6012 default:
6013 llvm_unreachable("Unexpected index.");
6014 }
6015 }
6016 }
6017 DenseSet<unsigned> UniqueOpcodes;
6018 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
6019 unsigned NonInstCnt = 0;
6020 // Estimate the number of instructions required for the vectorized node and
6021 // for the buildvector node.
6022 unsigned UndefCnt = 0;
6023 // Count the number of extra shuffles required for the vector nodes.
6024 unsigned ExtraShuffleInsts = 0;
6025 // Check that the operands do not contain the same values and create either a
6026 // perfect diamond match or a shuffled match.
6027 if (Operands.size() == 2) {
6028 // Do not count same operands twice.
6029 if (Operands.front() == Operands.back()) {
6030 Operands.erase(Operands.begin());
6031 } else if (!allConstant(Operands.front()) &&
6032 all_of(Operands.front(), [&](Value *V) {
6033 return is_contained(Operands.back(), V);
6034 })) {
6035 Operands.erase(Operands.begin());
6036 ++ExtraShuffleInsts;
6037 }
6038 }
6039 const Loop *L = LI->getLoopFor(S.MainOp->getParent());
6040 // Vectorize the node if:
6041 // 1. At least a single operand is constant or a splat.
6042 // 2. Operands have many loop invariants (the instructions themselves are not
6043 // loop invariant).
6044 // 3. At least a single unique operand is expected to be vectorized.
6045 return none_of(Operands,
6046 [&](ArrayRef<Value *> Op) {
6047 if (allConstant(Op) ||
6048 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
6049 getSameOpcode(Op, *TLI).MainOp))
6050 return false;
6051 DenseMap<Value *, unsigned> Uniques;
6052 for (Value *V : Op) {
6053 if (isa<Constant, ExtractElementInst>(V) ||
6054 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
6055 if (isa<UndefValue>(V))
6056 ++UndefCnt;
6057 continue;
6058 }
6059 auto Res = Uniques.try_emplace(V, 0);
6060 // Found first duplicate - need to add shuffle.
6061 if (!Res.second && Res.first->second == 1)
6062 ++ExtraShuffleInsts;
6063 ++Res.first->getSecond();
6064 if (auto *I = dyn_cast<Instruction>(V))
6065 UniqueOpcodes.insert(I->getOpcode());
6066 else if (Res.second)
6067 ++NonInstCnt;
6068 }
6069 return none_of(Uniques, [&](const auto &P) {
6070 return P.first->hasNUsesOrMore(P.second + 1) &&
6071 none_of(P.first->users(), [&](User *U) {
6072 return getTreeEntry(U) || Uniques.contains(U);
6073 });
6074 });
6075 }) ||
6076 // Do not vectorize the node if the estimated number of vector instructions
6077 // is greater than the estimated number of buildvector instructions. The
6078 // number of vector operands is the number of vector instructions + the
6079 // number of vector instructions for the operands (buildvectors). The number
6080 // of buildvector instructions is just number_of_operands * number_of_scalars.
6081 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6082 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
6083 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6084}
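// Illustrative note (added for exposition, not from the original source):
// for an alternate-opcode bundle such as
//   %a0 = fadd float %x0, %y0
//   %a1 = fsub float %x1, %y1
// the vectorized form costs roughly NumAltInsts = 3 instructions (the main
// opcode, the alternate opcode and a shuffle to blend them). The final check
// above therefore only accepts the node when that estimate, plus the extra
// shuffles and unique operand opcodes, stays below the buildvector cost of
// number_of_operands * number_of_scalars.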
6085
6086BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6087 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
6088 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
6089 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
6090
6091 unsigned ShuffleOrOp =
6092 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
6093 auto *VL0 = cast<Instruction>(S.OpValue);
6094 switch (ShuffleOrOp) {
6095 case Instruction::PHI: {
6096 // Too many operands - gather, most probably won't be vectorized.
6097 if (VL0->getNumOperands() > MaxPHINumOperands)
6098 return TreeEntry::NeedToGather;
6099 // Check for terminator values (e.g. invoke).
6100 for (Value *V : VL)
6101 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
6102 Instruction *Term = dyn_cast<Instruction>(Incoming);
6103 if (Term && Term->isTerminator()) {
6104 LLVM_DEBUG(dbgs()
6105 << "SLP: Need to swizzle PHINodes (terminator use).\n");
6106 return TreeEntry::NeedToGather;
6107 }
6108 }
6109
6110 return TreeEntry::Vectorize;
6111 }
6112 case Instruction::ExtractValue:
6113 case Instruction::ExtractElement: {
6114 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6115 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6116 if (!isPowerOf2_32(VL.size()))
6117 return TreeEntry::NeedToGather;
6118 if (Reuse || !CurrentOrder.empty())
6119 return TreeEntry::Vectorize;
6120 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6121 return TreeEntry::NeedToGather;
6122 }
6123 case Instruction::InsertElement: {
6124 // Check that we have a buildvector and not a shuffle of 2 or more
6125 // different vectors.
6126 ValueSet SourceVectors;
6127 for (Value *V : VL) {
6128 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
6129 assert(getInsertIndex(V) != std::nullopt &&
6130 "Non-constant or undef index?");
6131 }
6132
6133 if (count_if(VL, [&SourceVectors](Value *V) {
6134 return !SourceVectors.contains(V);
6135 }) >= 2) {
6136 // Found 2nd source vector - cancel.
6137 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6138 "different source vectors.\n");
6139 return TreeEntry::NeedToGather;
6140 }
6141
6142 return TreeEntry::Vectorize;
6143 }
6144 case Instruction::Load: {
6145 // Check that a vectorized load would load the same memory as a scalar
6146 // load. For example, we don't want to vectorize loads that are smaller
6147 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6148 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6149 // from such a struct, we read/write packed bits disagreeing with the
6150 // unvectorized version.
6151 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
6152 case LoadsState::Vectorize:
6153 return TreeEntry::Vectorize;
6154 case LoadsState::ScatterVectorize:
6155 return TreeEntry::ScatterVectorize;
6156 case LoadsState::StridedVectorize:
6157 return TreeEntry::StridedVectorize;
6158 case LoadsState::Gather:
6159#ifndef NDEBUG
6160 Type *ScalarTy = VL0->getType();
6161 if (DL->getTypeSizeInBits(ScalarTy) !=
6162 DL->getTypeAllocSizeInBits(ScalarTy))
6163 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6164 else if (any_of(VL,
6165 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6166 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6167 else
6168 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6169#endif // NDEBUG
6170 return TreeEntry::NeedToGather;
6171 }
6172 llvm_unreachable("Unexpected state of loads");
6173 }
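// Note (added for exposition): canVectorizeLoads() classifies the bundle as
// Vectorize (consecutive loads -> one wide load), StridedVectorize
// (constant-stride accesses), ScatterVectorize (arbitrary pointers, lowered
// via masked gather) or Gather (fall back to building the vector from
// scalars); the TreeEntry state returned above mirrors that classification.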
6174 case Instruction::ZExt:
6175 case Instruction::SExt:
6176 case Instruction::FPToUI:
6177 case Instruction::FPToSI:
6178 case Instruction::FPExt:
6179 case Instruction::PtrToInt:
6180 case Instruction::IntToPtr:
6181 case Instruction::SIToFP:
6182 case Instruction::UIToFP:
6183 case Instruction::Trunc:
6184 case Instruction::FPTrunc:
6185 case Instruction::BitCast: {
6186 Type *SrcTy = VL0->getOperand(0)->getType();
6187 for (Value *V : VL) {
6188 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6189 if (Ty != SrcTy || !isValidElementType(Ty)) {
6190 LLVM_DEBUG(
6191 dbgs() << "SLP: Gathering casts with different src types.\n");
6192 return TreeEntry::NeedToGather;
6193 }
6194 }
6195 return TreeEntry::Vectorize;
6196 }
6197 case Instruction::ICmp:
6198 case Instruction::FCmp: {
6199 // Check that all of the compares have the same predicate.
6200 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6201 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
6202 Type *ComparedTy = VL0->getOperand(0)->getType();
6203 for (Value *V : VL) {
6204 CmpInst *Cmp = cast<CmpInst>(V);
6205 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6206 Cmp->getOperand(0)->getType() != ComparedTy) {
6207 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6208 return TreeEntry::NeedToGather;
6209 }
6210 }
6211 return TreeEntry::Vectorize;
6212 }
6213 case Instruction::Select:
6214 case Instruction::FNeg:
6215 case Instruction::Add:
6216 case Instruction::FAdd:
6217 case Instruction::Sub:
6218 case Instruction::FSub:
6219 case Instruction::Mul:
6220 case Instruction::FMul:
6221 case Instruction::UDiv:
6222 case Instruction::SDiv:
6223 case Instruction::FDiv:
6224 case Instruction::URem:
6225 case Instruction::SRem:
6226 case Instruction::FRem:
6227 case Instruction::Shl:
6228 case Instruction::LShr:
6229 case Instruction::AShr:
6230 case Instruction::And:
6231 case Instruction::Or:
6232 case Instruction::Xor:
6233 return TreeEntry::Vectorize;
6234 case Instruction::GetElementPtr: {
6235 // We don't combine GEPs with complicated (nested) indexing.
6236 for (Value *V : VL) {
6237 auto *I = dyn_cast<GetElementPtrInst>(V);
6238 if (!I)
6239 continue;
6240 if (I->getNumOperands() != 2) {
6241 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6242 return TreeEntry::NeedToGather;
6243 }
6244 }
6245
6246 // We can't combine several GEPs into one vector if they operate on
6247 // different types.
6248 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6249 for (Value *V : VL) {
6250 auto *GEP = dyn_cast<GEPOperator>(V);
6251 if (!GEP)
6252 continue;
6253 Type *CurTy = GEP->getSourceElementType();
6254 if (Ty0 != CurTy) {
6255 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6256 return TreeEntry::NeedToGather;
6257 }
6258 }
6259
6260 // We don't combine GEPs with non-constant indexes.
6261 Type *Ty1 = VL0->getOperand(1)->getType();
6262 for (Value *V : VL) {
6263 auto *I = dyn_cast<GetElementPtrInst>(V);
6264 if (!I)
6265 continue;
6266 auto *Op = I->getOperand(1);
6267 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6268 (Op->getType() != Ty1 &&
6269 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6270 Op->getType()->getScalarSizeInBits() >
6271 DL->getIndexSizeInBits(
6272 V->getType()->getPointerAddressSpace())))) {
6273 LLVM_DEBUG(
6274 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6275 return TreeEntry::NeedToGather;
6276 }
6277 }
6278
6279 return TreeEntry::Vectorize;
6280 }
6281 case Instruction::Store: {
6282 // Check if the stores are consecutive or if we need to swizzle them.
6283 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6284 // Avoid types that are padded when being allocated as scalars, while
6285 // being packed together in a vector (such as i1).
6286 if (DL->getTypeSizeInBits(ScalarTy) !=
6287 DL->getTypeAllocSizeInBits(ScalarTy)) {
6288 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6289 return TreeEntry::NeedToGather;
6290 }
6291 // Make sure all stores in the bundle are simple - we can't vectorize
6292 // atomic or volatile stores.
6293 for (Value *V : VL) {
6294 auto *SI = cast<StoreInst>(V);
6295 if (!SI->isSimple()) {
6296 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6297 return TreeEntry::NeedToGather;
6298 }
6299 PointerOps.push_back(SI->getPointerOperand());
6300 }
6301
6302 // Check the order of pointer operands.
6303 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
6304 Value *Ptr0;
6305 Value *PtrN;
6306 if (CurrentOrder.empty()) {
6307 Ptr0 = PointerOps.front();
6308 PtrN = PointerOps.back();
6309 } else {
6310 Ptr0 = PointerOps[CurrentOrder.front()];
6311 PtrN = PointerOps[CurrentOrder.back()];
6312 }
6313 std::optional<int> Dist =
6314 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6315 // Check that the sorted pointer operands are consecutive.
6316 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6317 return TreeEntry::Vectorize;
6318 }
6319
6320 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6321 return TreeEntry::NeedToGather;
6322 }
6323 case Instruction::Call: {
6324 // Check if the calls are all to the same vectorizable intrinsic or
6325 // library function.
6326 CallInst *CI = cast<CallInst>(VL0);
6327 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6328
6329 VFShape Shape = VFShape::get(
6330 CI->getFunctionType(),
6331 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
6332 false /*HasGlobalPred*/);
6333 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6334
6335 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6336 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6337 return TreeEntry::NeedToGather;
6338 }
6339 Function *F = CI->getCalledFunction();
6340 unsigned NumArgs = CI->arg_size();
6341 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6342 for (unsigned J = 0; J != NumArgs; ++J)
6343 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
6344 ScalarArgs[J] = CI->getArgOperand(J);
6345 for (Value *V : VL) {
6346 CallInst *CI2 = dyn_cast<CallInst>(V);
6347 if (!CI2 || CI2->getCalledFunction() != F ||
6348 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
6349 (VecFunc &&
6350 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6351 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
6352 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6353 << "\n");
6354 return TreeEntry::NeedToGather;
6355 }
6356 // Some intrinsics have scalar arguments, and these arguments must be the
6357 // same across the bundle for the calls to be vectorized.
6358 for (unsigned J = 0; J != NumArgs; ++J) {
6359 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
6360 Value *A1J = CI2->getArgOperand(J);
6361 if (ScalarArgs[J] != A1J) {
6362 LLVM_DEBUG(dbgs()
6363 << "SLP: mismatched arguments in call:" << *CI
6364 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6365 return TreeEntry::NeedToGather;
6366 }
6367 }
6368 }
6369 // Verify that the bundle operands are identical between the two calls.
6370 if (CI->hasOperandBundles() &&
6371 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6372 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6373 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6374 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6375 << "!=" << *V << '\n');
6376 return TreeEntry::NeedToGather;
6377 }
6378 }
6379
6380 return TreeEntry::Vectorize;
6381 }
6382 case Instruction::ShuffleVector: {
6383 // If this is not an alternate sequence of opcode like add-sub
6384 // then do not vectorize this instruction.
6385 if (!S.isAltShuffle()) {
6386 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6387 return TreeEntry::NeedToGather;
6388 }
6389 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6390 LLVM_DEBUG(
6391 dbgs()
6392 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6393 "the whole alt sequence is not profitable.\n");
6394 return TreeEntry::NeedToGather;
6395 }
6396
6397 return TreeEntry::Vectorize;
6398 }
6399 default:
6400 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6401 return TreeEntry::NeedToGather;
6402 }
6403}
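// Note (added for exposition): this function only performs the per-opcode
// legality checks; the returned EntryState (Vectorize, ScatterVectorize,
// StridedVectorize or NeedToGather) is consumed by buildTree_rec() below,
// which creates the matching tree entry and recurses into the operands.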
6404
6405namespace {
6406/// Correctly handles the operands of PHI nodes based on the order of the
6407/// incoming basic blocks/values of the \p Main PHINode.
6408class PHIHandler {
6409 DominatorTree &DT;
6410 PHINode *Main = nullptr;
6411 SmallVector<Value *> Phis;
6412 SmallVector<SmallVector<Value *>> Operands;
6413
6414public:
6415 PHIHandler() = delete;
6416 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
6417 : DT(DT), Main(Main), Phis(Phis),
6418 Operands(Main->getNumIncomingValues(),
6419 SmallVector<Value *>(Phis.size(), nullptr)) {}
6420 void buildOperands() {
6421 constexpr unsigned FastLimit = 4;
6422 if (Main->getNumIncomingValues() <= FastLimit) {
6423 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6424 BasicBlock *InBB = Main->getIncomingBlock(I);
6425 if (!DT.isReachableFromEntry(InBB)) {
6426 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6427 continue;
6428 }
6429 // Prepare the operand vector.
6430 for (auto [Idx, V] : enumerate(Phis)) {
6431 auto *P = cast<PHINode>(V);
6432 if (P->getIncomingBlock(I) == InBB)
6433 Operands[I][Idx] = P->getIncomingValue(I);
6434 else
6435 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
6436 }
6437 }
6438 return;
6439 }
6440 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
6441 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6442 BasicBlock *InBB = Main->getIncomingBlock(I);
6443 if (!DT.isReachableFromEntry(InBB)) {
6444 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6445 continue;
6446 }
6447 Blocks.try_emplace(InBB).first->second.push_back(I);
6448 }
6449 for (auto [Idx, V] : enumerate(Phis)) {
6450 auto *P = cast<PHINode>(V);
6451 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
6452 BasicBlock *InBB = P->getIncomingBlock(I);
6453 if (InBB == Main->getIncomingBlock(I)) {
6454 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
6455 continue;
6456 Operands[I][Idx] = P->getIncomingValue(I);
6457 continue;
6458 }
6459 auto It = Blocks.find(InBB);
6460 if (It == Blocks.end())
6461 continue;
6462 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
6463 }
6464 }
6465 for (const auto &P : Blocks) {
6466 if (P.getSecond().size() <= 1)
6467 continue;
6468 unsigned BasicI = P.getSecond().front();
6469 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
6470 assert(all_of(enumerate(Operands[I]),
6471 [&](const auto &Data) {
6472 return !Data.value() ||
6473 Data.value() == Operands[BasicI][Data.index()];
6474 }) &&
6475 "Expected empty operands list.");
6476 Operands[I] = Operands[BasicI];
6477 }
6478 }
6479 }
6480 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
6481};
6482} // namespace
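// Illustrative example (added for exposition): given a bundle of two PHIs
//   %p0 = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
//   %p1 = phi i32 [ %d, %bb2 ], [ %c, %bb1 ]
// PHIHandler groups the incoming values by the incoming-block order of the
// main PHI (%p0), producing the operand bundles {%a, %c} for %bb1 and
// {%b, %d} for %bb2, so each operand bundle can be vectorized consistently.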
6483
6484void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6485 const EdgeInfo &UserTreeIdx) {
6486 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6487
6488 SmallVector<int> ReuseShuffleIndicies;
6489 SmallVector<Value *> UniqueValues;
6490 SmallVector<Value *> NonUniqueValueVL;
6491 auto TryToFindDuplicates = [&](const InstructionsState &S,
6492 bool DoNotFail = false) {
6493 // Check that every instruction appears once in this bundle.
6494 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6495 for (Value *V : VL) {
6496 if (isConstant(V)) {
6497 ReuseShuffleIndicies.emplace_back(
6498 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6499 UniqueValues.emplace_back(V);
6500 continue;
6501 }
6502 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6503 ReuseShuffleIndicies.emplace_back(Res.first->second);
6504 if (Res.second)
6505 UniqueValues.emplace_back(V);
6506 }
6507 size_t NumUniqueScalarValues = UniqueValues.size();
6508 if (NumUniqueScalarValues == VL.size()) {
6509 ReuseShuffleIndicies.clear();
6510 } else {
6511 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6512 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6513 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6514 "for nodes with padding.\n");
6515 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6516 return false;
6517 }
6518 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6519 if (NumUniqueScalarValues <= 1 ||
6520 (UniquePositions.size() == 1 && all_of(UniqueValues,
6521 [](Value *V) {
6522 return isa<UndefValue>(V) ||
6523 !isConstant(V);
6524 })) ||
6525 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6526 if (DoNotFail && UniquePositions.size() > 1 &&
6527 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6528 all_of(UniqueValues, [=](Value *V) {
6529 return isa<ExtractElementInst>(V) ||
6530 areAllUsersVectorized(cast<Instruction>(V),
6531 UserIgnoreList);
6532 })) {
6533 unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6534 if (PWSz == VL.size()) {
6535 ReuseShuffleIndicies.clear();
6536 } else {
6537 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6538 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6539 UniqueValues.back());
6540 VL = NonUniqueValueVL;
6541 }
6542 return true;
6543 }
6544 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6545 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6546 return false;
6547 }
6548 VL = UniqueValues;
6549 }
6550 return true;
6551 };
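// Illustrative example (added for exposition): for VL = {%a, %b, %a, %b} the
// lambda above collects UniqueValues = {%a, %b} and
// ReuseShuffleIndicies = {0, 1, 0, 1}, so the node is built on the unique
// scalars and the duplicated lanes are recreated later with a reuse shuffle.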
6552
6553 InstructionsState S = getSameOpcode(VL, *TLI);
6554
6555 // Don't vectorize ephemeral values.
6556 if (!EphValues.empty()) {
6557 for (Value *V : VL) {
6558 if (EphValues.count(V)) {
6559 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6560 << ") is ephemeral.\n");
6561 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6562 return;
6563 }
6564 }
6565 }
6566
6567 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6568 // a load), in which case peek through to include it in the tree, without
6569 // ballooning over-budget.
6570 if (Depth >= RecursionMaxDepth &&
6571 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6572 VL.size() >= 4 &&
6573 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6574 return match(I,
6575 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
6576 cast<Instruction>(I)->getOpcode() ==
6577 cast<Instruction>(S.MainOp)->getOpcode();
6578 })))) {
6579 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6580 if (TryToFindDuplicates(S))
6581 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6582 ReuseShuffleIndicies);
6583 return;
6584 }
6585
6586 // Don't handle scalable vectors
6587 if (S.getOpcode() == Instruction::ExtractElement &&
6588 isa<ScalableVectorType>(
6589 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6590 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6591 if (TryToFindDuplicates(S))
6592 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6593 ReuseShuffleIndicies);
6594 return;
6595 }
6596
6597 // Don't handle vectors.
6598 if (S.OpValue->getType()->isVectorTy() &&
6599 !isa<InsertElementInst>(S.OpValue)) {
6600 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6601 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6602 return;
6603 }
6604
6605 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6606 if (SI->getValueOperand()->getType()->isVectorTy()) {
6607 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6608 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6609 return;
6610 }
6611
6612 // If all of the operands are identical or constant we have a simple solution.
6613 // If we deal with insert/extract instructions, they all must have constant
6614 // indices, otherwise we should gather them, not try to vectorize.
6615 // If this is an alternate-opcode node with 2 elements and its operands would
6616 // be gathered - do not vectorize.
6617 auto &&NotProfitableForVectorization = [&S, this,
6618 Depth](ArrayRef<Value *> VL) {
6619 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6620 return false;
6621 if (VectorizableTree.size() < MinTreeSize)
6622 return false;
6623 if (Depth >= RecursionMaxDepth - 1)
6624 return true;
6625 // Check if all operands are extracts, part of a vector node, or can build a
6626 // regular vectorizable node.
6627 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6628 for (Value *V : VL) {
6629 auto *I = cast<Instruction>(V);
6630 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6631 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6632 }));
6633 }
6634 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6635 if ((IsCommutative &&
6636 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6637 (!IsCommutative &&
6638 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6639 return true;
6640 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6641 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6642 auto *I1 = cast<Instruction>(VL.front());
6643 auto *I2 = cast<Instruction>(VL.back());
6644 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6645 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6646 I2->getOperand(Op));
6647 if (static_cast<unsigned>(count_if(
6648 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6650 })) >= S.MainOp->getNumOperands() / 2)
6651 return false;
6652 if (S.MainOp->getNumOperands() > 2)
6653 return true;
6654 if (IsCommutative) {
6655 // Check permuted operands.
6656 Candidates.clear();
6657 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6658 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6659 I2->getOperand((Op + 1) % E));
6660 if (any_of(
6661 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6663 }))
6664 return false;
6665 }
6666 return true;
6667 };
6668 SmallVector<unsigned> SortedIndices;
6669 BasicBlock *BB = nullptr;
6670 bool IsScatterVectorizeUserTE =
6671 UserTreeIdx.UserTE &&
6672 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6673 bool AreAllSameInsts =
6674 (S.getOpcode() && allSameBlock(VL)) ||
6675 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6676 VL.size() > 2 &&
6677 all_of(VL,
6678 [&BB](Value *V) {
6679 auto *I = dyn_cast<GetElementPtrInst>(V);
6680 if (!I)
6681 return doesNotNeedToBeScheduled(V);
6682 if (!BB)
6683 BB = I->getParent();
6684 return BB == I->getParent() && I->getNumOperands() == 2;
6685 }) &&
6686 BB &&
6687 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6688 SortedIndices));
6689 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6690 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6691 S.OpValue) &&
6692 !all_of(VL, isVectorLikeInstWithConstOps)) ||
6693 NotProfitableForVectorization(VL)) {
6694 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6695 if (TryToFindDuplicates(S))
6696 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6697 ReuseShuffleIndicies);
6698 return;
6699 }
6700
6701 // We now know that this is a vector of instructions of the same type from
6702 // the same block.
6703
6704 // Check if this is a duplicate of another entry.
6705 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6706 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6707 if (!E->isSame(VL)) {
6708 auto It = MultiNodeScalars.find(S.OpValue);
6709 if (It != MultiNodeScalars.end()) {
6710 auto *TEIt = find_if(It->getSecond(),
6711 [&](TreeEntry *ME) { return ME->isSame(VL); });
6712 if (TEIt != It->getSecond().end())
6713 E = *TEIt;
6714 else
6715 E = nullptr;
6716 } else {
6717 E = nullptr;
6718 }
6719 }
6720 if (!E) {
6721 if (!doesNotNeedToBeScheduled(S.OpValue)) {
6722 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6723 if (TryToFindDuplicates(S))
6724 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6725 ReuseShuffleIndicies);
6726 return;
6727 }
6728 } else {
6729 // Record the reuse of the tree node. FIXME, currently this is only used
6730 // to properly draw the graph rather than for the actual vectorization.
6731 E->UserTreeIndices.push_back(UserTreeIdx);
6732 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6733 << ".\n");
6734 return;
6735 }
6736 }
6737
6738 // Check that none of the instructions in the bundle are already in the tree.
6739 for (Value *V : VL) {
6740 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6741 doesNotNeedToBeScheduled(V))
6742 continue;
6743 if (getTreeEntry(V)) {
6744 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6745 << ") is already in tree.\n");
6746 if (TryToFindDuplicates(S))
6747 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6748 ReuseShuffleIndicies);
6749 return;
6750 }
6751 }
6752
6753 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6754 if (UserIgnoreList && !UserIgnoreList->empty()) {
6755 for (Value *V : VL) {
6756 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6757 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6758 if (TryToFindDuplicates(S))
6759 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6760 ReuseShuffleIndicies);
6761 return;
6762 }
6763 }
6764 }
6765
6766 // Special processing for sorted pointers for ScatterVectorize node with
6767 // constant indices only.
6768 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6769 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6770 !(S.getOpcode() && allSameBlock(VL))) {
6771 assert(S.OpValue->getType()->isPointerTy() &&
6772 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6773 "Expected pointers only.");
6774 // Reset S to make it GetElementPtr kind of node.
6775 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
6776 assert(It != VL.end() && "Expected at least one GEP.");
6777 S = getSameOpcode(*It, *TLI);
6778 }
6779
6780 // Check that all of the users of the scalars that we want to vectorize are
6781 // schedulable.
6782 auto *VL0 = cast<Instruction>(S.OpValue);
6783 BB = VL0->getParent();
6784
6785 if (!DT->isReachableFromEntry(BB)) {
6786 // Don't go into unreachable blocks. They may contain instructions with
6787 // dependency cycles which confuse the final scheduling.
6788 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6789 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6790 return;
6791 }
6792
6793 // Don't go into catchswitch blocks, which can happen with PHIs.
6794 // Such blocks can only have PHIs and the catchswitch. There is no
6795 // place to insert a shuffle if we need to, so just avoid that issue.
6796 if (isa<CatchSwitchInst>(BB->getTerminator())) {
6797 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
6798 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6799 return;
6800 }
6801
6802 // Check that every instruction appears once in this bundle.
6803 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
6804 return;
6805
6806 // Perform specific checks for each particular instruction kind.
6807 OrdersType CurrentOrder;
6808 SmallVector<Value *> PointerOps;
6809 TreeEntry::EntryState State = getScalarsVectorizationState(
6810 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6811 if (State == TreeEntry::NeedToGather) {
6812 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6813 ReuseShuffleIndicies);
6814 return;
6815 }
6816
6817 auto &BSRef = BlocksSchedules[BB];
6818 if (!BSRef)
6819 BSRef = std::make_unique<BlockScheduling>(BB);
6820
6821 BlockScheduling &BS = *BSRef;
6822
6823 std::optional<ScheduleData *> Bundle =
6824 BS.tryScheduleBundle(UniqueValues, this, S);
6825#ifdef EXPENSIVE_CHECKS
6826 // Make sure we didn't break any internal invariants
6827 BS.verify();
6828#endif
6829 if (!Bundle) {
6830 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
6831 assert((!BS.getScheduleData(VL0) ||
6832 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6833 "tryScheduleBundle should cancelScheduling on failure");
6834 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6835 ReuseShuffleIndicies);
6836 NonScheduledFirst.insert(VL.front());
6837 return;
6838 }
6839 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
6840
6841 unsigned ShuffleOrOp = S.isAltShuffle() ?
6842 (unsigned) Instruction::ShuffleVector : S.getOpcode();
6843 switch (ShuffleOrOp) {
6844 case Instruction::PHI: {
6845 auto *PH = cast<PHINode>(VL0);
6846
6847 TreeEntry *TE =
6848 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
6849 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
6850
6851 // Keeps the reordered operands to avoid code duplication.
6852 PHIHandler Handler(*DT, PH, VL);
6853 Handler.buildOperands();
6854 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
6855 TE->setOperand(I, Handler.getOperands(I));
6856 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
6857 buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I});
6858 return;
6859 }
6860 case Instruction::ExtractValue:
6861 case Instruction::ExtractElement: {
6862 if (CurrentOrder.empty()) {
6863 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
6864 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6865 ReuseShuffleIndicies);
6866 // This is a special case, as it does not gather, but at the same time
6867 // we are not extending buildTree_rec() towards the operands.
6868 ValueList Op0;
6869 Op0.assign(VL.size(), VL0->getOperand(0));
6870 VectorizableTree.back()->setOperand(0, Op0);
6871 return;
6872 }
6873 LLVM_DEBUG({
6874 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
6875 "with order";
6876 for (unsigned Idx : CurrentOrder)
6877 dbgs() << " " << Idx;
6878 dbgs() << "\n";
6879 });
6880 fixupOrderingIndices(CurrentOrder);
6881 // Insert new order with initial value 0, if it does not exist,
6882 // otherwise return the iterator to the existing one.
6883 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6884 ReuseShuffleIndicies, CurrentOrder);
6885 // This is a special case, as it does not gather, but at the same time
6886 // we are not extending buildTree_rec() towards the operands.
6887 ValueList Op0;
6888 Op0.assign(VL.size(), VL0->getOperand(0));
6889 VectorizableTree.back()->setOperand(0, Op0);
6890 return;
6891 }
6892 case Instruction::InsertElement: {
6893 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
6894
6895 auto OrdCompare = [](const std::pair<int, int> &P1,
6896 const std::pair<int, int> &P2) {
6897 return P1.first > P2.first;
6898 };
6899 std::priority_queue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
6900 decltype(OrdCompare)>
6901 Indices(OrdCompare);
6902 for (int I = 0, E = VL.size(); I < E; ++I) {
6903 unsigned Idx = *getInsertIndex(VL[I]);
6904 Indices.emplace(Idx, I);
6905 }
6906 OrdersType CurrentOrder(VL.size(), VL.size());
6907 bool IsIdentity = true;
6908 for (int I = 0, E = VL.size(); I < E; ++I) {
6909 CurrentOrder[Indices.top().second] = I;
6910 IsIdentity &= Indices.top().second == I;
6911 Indices.pop();
6912 }
6913 if (IsIdentity)
6914 CurrentOrder.clear();
6915 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6916 std::nullopt, CurrentOrder);
6917 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
6918
6919 constexpr int NumOps = 2;
6920 ValueList VectorOperands[NumOps];
6921 for (int I = 0; I < NumOps; ++I) {
6922 for (Value *V : VL)
6923 VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
6924
6925 TE->setOperand(I, VectorOperands[I]);
6926 }
6927 buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
6928 return;
6929 }
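// Note (added for exposition): the ordering above is derived from the
// constant insert indices, e.g. a bundle inserting at indices {2, 0, 1}
// yields a non-identity CurrentOrder so the buildvector lanes end up in
// index order; only the inserted scalars (operand 1 of each insertelement)
// are recursed into.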
6930 case Instruction::Load: {
6931 // Check that a vectorized load would load the same memory as a scalar
6932 // load. For example, we don't want to vectorize loads that are smaller
6933 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6934 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6935 // from such a struct, we read/write packed bits disagreeing with the
6936 // unvectorized version.
6937 TreeEntry *TE = nullptr;
6938 fixupOrderingIndices(CurrentOrder);
6939 switch (State) {
6940 case TreeEntry::Vectorize:
6941 if (CurrentOrder.empty()) {
6942 // Original loads are consecutive and do not require reordering.
6943 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6944 ReuseShuffleIndicies);
6945 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
6946 } else {
6947 // Need to reorder.
6948 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6949 ReuseShuffleIndicies, CurrentOrder);
6950 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
6951 }
6952 TE->setOperandsInOrder();
6953 break;
6954 case TreeEntry::StridedVectorize:
6955 // Vectorizing non-consecutive loads with a constant stride (strided loads).
6956 if (CurrentOrder.empty()) {
6957 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6958 UserTreeIdx, ReuseShuffleIndicies);
6959 } else {
6960 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6961 UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
6962 }
6963 TE->setOperandsInOrder();
6964 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
6965 break;
6966 case TreeEntry::ScatterVectorize:
6967 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
6968 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6969 UserTreeIdx, ReuseShuffleIndicies);
6970 TE->setOperandsInOrder();
6971 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
6972 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
6973 break;
6974 case TreeEntry::NeedToGather:
6975 llvm_unreachable("Unexpected loads state.");
6976 }
6977 return;
6978 }
6979 case Instruction::ZExt:
6980 case Instruction::SExt:
6981 case Instruction::FPToUI:
6982 case Instruction::FPToSI:
6983 case Instruction::FPExt:
6984 case Instruction::PtrToInt:
6985 case Instruction::IntToPtr:
6986 case Instruction::SIToFP:
6987 case Instruction::UIToFP:
6988 case Instruction::Trunc:
6989 case Instruction::FPTrunc:
6990 case Instruction::BitCast: {
6991 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
6992 std::make_pair(std::numeric_limits<unsigned>::min(),
6993 std::numeric_limits<unsigned>::max()));
6994 if (ShuffleOrOp == Instruction::ZExt ||
6995 ShuffleOrOp == Instruction::SExt) {
6996 CastMaxMinBWSizes = std::make_pair(
6997 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
6998 PrevMaxBW),
6999 std::min<unsigned>(
7000 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7001 PrevMinBW));
7002 } else if (ShuffleOrOp == Instruction::Trunc) {
7003 CastMaxMinBWSizes = std::make_pair(
7004 std::max<unsigned>(
7005 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7006 PrevMaxBW),
7007 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7008 PrevMinBW));
7009 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7010 } else if (ShuffleOrOp == Instruction::SIToFP ||
7011 ShuffleOrOp == Instruction::UIToFP) {
7012 unsigned NumSignBits =
7013 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7014 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7015 APInt Mask = DB->getDemandedBits(OpI);
7016 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
7017 }
7018 if (NumSignBits * 2 >=
7019 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7020 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7021 }
7022 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7023 ReuseShuffleIndicies);
7024 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
7025
7026 TE->setOperandsInOrder();
7027 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7028 ValueList Operands;
7029 // Prepare the operand vector.
7030 for (Value *V : VL)
7031 Operands.push_back(cast<Instruction>(V)->getOperand(I));
7032
7033 buildTree_rec(Operands, Depth + 1, {TE, I});
7034 }
7035 return;
7036 }
7037 case Instruction::ICmp:
7038 case Instruction::FCmp: {
7039 // Check that all of the compares have the same predicate.
7040 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7041 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7042 ReuseShuffleIndicies);
7043 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
7044
7045 ValueList Left, Right;
7046 if (cast<CmpInst>(VL0)->isCommutative()) {
7047 // Commutative predicate - collect + sort operands of the instructions
7048 // so that each side is more likely to have the same opcode.
7049 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
7050 "Commutative Predicate mismatch");
7051 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7052 } else {
7053 // Collect operands - commute if it uses the swapped predicate.
7054 for (Value *V : VL) {
7055 auto *Cmp = cast<CmpInst>(V);
7056 Value *LHS = Cmp->getOperand(0);
7057 Value *RHS = Cmp->getOperand(1);
7058 if (Cmp->getPredicate() != P0)
7059 std::swap(LHS, RHS);
7060 Left.push_back(LHS);
7061 Right.push_back(RHS);
7062 }
7063 }
7064 TE->setOperand(0, Left);
7065 TE->setOperand(1, Right);
7066 buildTree_rec(Left, Depth + 1, {TE, 0});
7067 buildTree_rec(Right, Depth + 1, {TE, 1});
7068 if (ShuffleOrOp == Instruction::ICmp) {
7069 unsigned NumSignBits0 =
7070 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7071 if (NumSignBits0 * 2 >=
7072 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7073 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
7074 unsigned NumSignBits1 =
7075 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
7076 if (NumSignBits1 * 2 >=
7077 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
7078 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
7079 }
7080 return;
7081 }
7082 case Instruction::Select:
7083 case Instruction::FNeg:
7084 case Instruction::Add:
7085 case Instruction::FAdd:
7086 case Instruction::Sub:
7087 case Instruction::FSub:
7088 case Instruction::Mul:
7089 case Instruction::FMul:
7090 case Instruction::UDiv:
7091 case Instruction::SDiv:
7092 case Instruction::FDiv:
7093 case Instruction::URem:
7094 case Instruction::SRem:
7095 case Instruction::FRem:
7096 case Instruction::Shl:
7097 case Instruction::LShr:
7098 case Instruction::AShr:
7099 case Instruction::And:
7100 case Instruction::Or:
7101 case Instruction::Xor: {
7102 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7103 ReuseShuffleIndicies);
7104 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
7105
7106 // Sort operands of the instructions so that each side is more likely to
7107 // have the same opcode.
7108 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
7109 ValueList Left, Right;
7110 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7111 TE->setOperand(0, Left);
7112 TE->setOperand(1, Right);
7113 buildTree_rec(Left, Depth + 1, {TE, 0});
7114 buildTree_rec(Right, Depth + 1, {TE, 1});
7115 return;
7116 }
7117
7118 TE->setOperandsInOrder();
7119 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7120 ValueList Operands;
7121 // Prepare the operand vector.
7122 for (Value *V : VL)
7123 Operands.push_back(cast<Instruction>(V)->getOperand(I));
7124
7125 buildTree_rec(Operands, Depth + 1, {TE, I});
7126 }
7127 return;
7128 }
7129 case Instruction::GetElementPtr: {
7130 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7131 ReuseShuffleIndicies);
7132 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
7133 SmallVector<ValueList, 2> Operands(2);
7134 // Prepare the operand vector for pointer operands.
7135 for (Value *V : VL) {
7136 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7137 if (!GEP) {
7138 Operands.front().push_back(V);
7139 continue;
7140 }
7141 Operands.front().push_back(GEP->getPointerOperand());
7142 }
7143 TE->setOperand(0, Operands.front());
7144 // Need to cast all indices to the same type before vectorization to
7145 // avoid crash.
7146 // Required to be able to find correct matches between different gather
7147 // nodes and reuse the vectorized values rather than trying to gather them
7148 // again.
7149 int IndexIdx = 1;
7150 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7151 Type *Ty = all_of(VL,
7152 [VL0Ty, IndexIdx](Value *V) {
7153 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7154 if (!GEP)
7155 return true;
7156 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
7157 })
7158 ? VL0Ty
7159 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
7160 ->getPointerOperandType()
7161 ->getScalarType());
7162 // Prepare the operand vector.
7163 for (Value *V : VL) {
7164 auto *I = dyn_cast<GetElementPtrInst>(V);
7165 if (!I) {
7166 Operands.back().push_back(
7167 ConstantInt::get(Ty, 0, /*isSigned=*/false));
7168 continue;
7169 }
7170 auto *Op = I->getOperand(IndexIdx);
7171 auto *CI = dyn_cast<ConstantInt>(Op);
7172 if (!CI)
7173 Operands.back().push_back(Op);
7174 else
7175 Operands.back().push_back(ConstantFoldIntegerCast(
7176 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7177 }
7178 TE->setOperand(IndexIdx, Operands.back());
7179
7180 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7181 buildTree_rec(Operands[I], Depth + 1, {TE, I});
7182 return;
7183 }
7184 case Instruction::Store: {
7185 // Check if the stores are consecutive or if we need to swizzle them.
7186 ValueList Operands(VL.size());
7187 auto *OIter = Operands.begin();
7188 for (Value *V : VL) {
7189 auto *SI = cast<StoreInst>(V);
7190 *OIter = SI->getValueOperand();
7191 ++OIter;
7192 }
7193 // Check that the sorted pointer operands are consecutive.
7194 if (CurrentOrder.empty()) {
7195 // Original stores are consecutive and do not require reordering.
7196 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7197 ReuseShuffleIndicies);
7198 TE->setOperandsInOrder();
7199 buildTree_rec(Operands, Depth + 1, {TE, 0});
7200 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7201 } else {
7202 fixupOrderingIndices(CurrentOrder);
7203 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7204 ReuseShuffleIndicies, CurrentOrder);
7205 TE->setOperandsInOrder();
7206 buildTree_rec(Operands, Depth + 1, {TE, 0});
7207 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7208 }
7209 return;
7210 }
7211 case Instruction::Call: {
7212 // Check if the calls are all to the same vectorizable intrinsic or
7213 // library function.
7214 CallInst *CI = cast<CallInst>(VL0);
7215 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7216
7217 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7218 ReuseShuffleIndicies);
7219 // Sort operands of the instructions so that each side is more likely to
7220 // have the same opcode.
7221 if (isCommutative(VL0)) {
7222 ValueList Left, Right;
7223 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7224 TE->setOperand(0, Left);
7225 TE->setOperand(1, Right);
7226 SmallVector<ValueList> Operands;
7227 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7228 Operands.emplace_back();
7229 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7230 continue;
7231 for (Value *V : VL) {
7232 auto *CI2 = cast<CallInst>(V);
7233 Operands.back().push_back(CI2->getArgOperand(I));
7234 }
7235 TE->setOperand(I, Operands.back());
7236 }
7237 buildTree_rec(Left, Depth + 1, {TE, 0});
7238 buildTree_rec(Right, Depth + 1, {TE, 1});
7239 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7240 if (Operands[I - 2].empty())
7241 continue;
7242 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7243 }
7244 return;
7245 }
7246 TE->setOperandsInOrder();
7247 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7248 // For scalar operands there is no need to create an entry, since there is
7249 // nothing to vectorize.
7250 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7251 continue;
7252 ValueList Operands;
7253 // Prepare the operand vector.
7254 for (Value *V : VL) {
7255 auto *CI2 = cast<CallInst>(V);
7256 Operands.push_back(CI2->getArgOperand(I));
7257 }
7258 buildTree_rec(Operands, Depth + 1, {TE, I});
7259 }
7260 return;
7261 }
7262 case Instruction::ShuffleVector: {
7263 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7264 ReuseShuffleIndicies);
7265 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7266
7267 // Reorder operands if reordering would enable vectorization.
7268 auto *CI = dyn_cast<CmpInst>(VL0);
7269 if (isa<BinaryOperator>(VL0) || CI) {
7270 ValueList Left, Right;
7271 if (!CI || all_of(VL, [](Value *V) {
7272 return cast<CmpInst>(V)->isCommutative();
7273 })) {
7274 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7275 } else {
7276 auto *MainCI = cast<CmpInst>(S.MainOp);
7277 auto *AltCI = cast<CmpInst>(S.AltOp);
7278 CmpInst::Predicate MainP = MainCI->getPredicate();
7279 CmpInst::Predicate AltP = AltCI->getPredicate();
7280 assert(MainP != AltP &&
7281 "Expected different main/alternate predicates.");
7282 // Collect operands - commute if it uses the swapped predicate or
7283 // alternate operation.
7284 for (Value *V : VL) {
7285 auto *Cmp = cast<CmpInst>(V);
7286 Value *LHS = Cmp->getOperand(0);
7287 Value *RHS = Cmp->getOperand(1);
7288
7289 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7290 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7291 std::swap(LHS, RHS);
7292 } else {
7293 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7294 std::swap(LHS, RHS);
7295 }
7296 Left.push_back(LHS);
7297 Right.push_back(RHS);
7298 }
7299 }
7300 TE->setOperand(0, Left);
7301 TE->setOperand(1, Right);
7302 buildTree_rec(Left, Depth + 1, {TE, 0});
7303 buildTree_rec(Right, Depth + 1, {TE, 1});
7304 return;
7305 }
7306
7307 TE->setOperandsInOrder();
7308 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7309 ValueList Operands;
7310 // Prepare the operand vector.
7311 for (Value *V : VL)
7312 Operands.push_back(cast<Instruction>(V)->getOperand(I));
7313
7314 buildTree_rec(Operands, Depth + 1, {TE, I});
7315 }
7316 return;
7317 }
7318 default:
7319 break;
7320 }
7321 llvm_unreachable("Unexpected vectorization of the instructions.");
7322}
7323
7324unsigned BoUpSLP::canMapToVector(Type *T) const {
7325 unsigned N = 1;
7326 Type *EltTy = T;
7327
7328 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7329 if (auto *ST = dyn_cast<StructType>(EltTy)) {
7330 // Check that struct is homogeneous.
7331 for (const auto *Ty : ST->elements())
7332 if (Ty != *ST->element_begin())
7333 return 0;
7334 N *= ST->getNumElements();
7335 EltTy = *ST->element_begin();
7336 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7337 N *= AT->getNumElements();
7338 EltTy = AT->getElementType();
7339 } else {
7340 auto *VT = cast<FixedVectorType>(EltTy);
7341 N *= VT->getNumElements();
7342 EltTy = VT->getElementType();
7343 }
7344 }
7345
7346 if (!isValidElementType(EltTy))
7347 return 0;
7348 uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
7349 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7350 VTSize != DL->getTypeStoreSizeInBits(T))
7351 return 0;
7352 return N;
7353}
7354
7355bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7356 SmallVectorImpl<unsigned> &CurrentOrder,
7357 bool ResizeAllowed) const {
7358 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7359 assert(It != VL.end() && "Expected at least one extract instruction.");
7360 auto *E0 = cast<Instruction>(*It);
7361 assert(
7362 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7363 "Invalid opcode");
7364 // Check if all of the extracts come from the same vector and from the
7365 // correct offset.
7366 Value *Vec = E0->getOperand(0);
7367
7368 CurrentOrder.clear();
7369
7370 // We have to extract from a vector/aggregate with the same number of elements.
7371 unsigned NElts;
7372 if (E0->getOpcode() == Instruction::ExtractValue) {
7373 NElts = canMapToVector(Vec->getType());
7374 if (!NElts)
7375 return false;
7376 // Check if load can be rewritten as load of vector.
7377 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7378 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7379 return false;
7380 } else {
7381 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7382 }
7383
7384 unsigned E = VL.size();
7385 if (!ResizeAllowed && NElts != E)
7386 return false;
7387 SmallVector<int> Indices(E, PoisonMaskElem);
7388 unsigned MinIdx = NElts, MaxIdx = 0;
7389 for (auto [I, V] : enumerate(VL)) {
7390 auto *Inst = dyn_cast<Instruction>(V);
7391 if (!Inst)
7392 continue;
7393 if (Inst->getOperand(0) != Vec)
7394 return false;
7395 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7396 if (isa<UndefValue>(EE->getIndexOperand()))
7397 continue;
7398 std::optional<unsigned> Idx = getExtractIndex(Inst);
7399 if (!Idx)
7400 return false;
7401 const unsigned ExtIdx = *Idx;
7402 if (ExtIdx >= NElts)
7403 continue;
7404 Indices[I] = ExtIdx;
7405 if (MinIdx > ExtIdx)
7406 MinIdx = ExtIdx;
7407 if (MaxIdx < ExtIdx)
7408 MaxIdx = ExtIdx;
7409 }
7410 if (MaxIdx - MinIdx + 1 > E)
7411 return false;
7412 if (MaxIdx + 1 <= E)
7413 MinIdx = 0;
7414
7415 // Check that all of the indices extract from the correct offset.
7416 bool ShouldKeepOrder = true;
7417 // Assign to all items the initial value E so we can check if the extract
7418 // instruction index was used already.
7419 // Also, later we can check that all the indices are used and we have a
7420 // consecutive access in the extract instructions, by checking that no
7421 // element of CurrentOrder still has the value E.
7422 CurrentOrder.assign(E, E);
7423 for (unsigned I = 0; I < E; ++I) {
7424 if (Indices[I] == PoisonMaskElem)
7425 continue;
7426 const unsigned ExtIdx = Indices[I] - MinIdx;
7427 if (CurrentOrder[ExtIdx] != E) {
7428 CurrentOrder.clear();
7429 return false;
7430 }
7431 ShouldKeepOrder &= ExtIdx == I;
7432 CurrentOrder[ExtIdx] = I;
7433 }
7434 if (ShouldKeepOrder)
7435 CurrentOrder.clear();
7436
7437 return ShouldKeepOrder;
7438}
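// Illustrative example (added for exposition): for a bundle
//   {extractelement %v, 1; extractelement %v, 0;
//    extractelement %v, 3; extractelement %v, 2}
// all extracts read the same source %v, so the function returns false (the
// original order is not kept) but fills CurrentOrder = {1, 0, 3, 2}, which
// lets the caller reuse %v with a single permutation instead of gathering
// the scalars.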
7439
7440bool BoUpSLP::areAllUsersVectorized(
7441 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7442 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7443 all_of(I->users(), [this](User *U) {
7444 return ScalarToTreeEntry.contains(U) ||
7445 isVectorLikeInstWithConstOps(U) ||
7446 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7447 });
7448}
7449
7450static std::pair<InstructionCost, InstructionCost>
7451getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7452 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7453 ArrayRef<Type *> ArgTys) {
7454 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7455
7456 // Calculate the cost of the scalar and vector calls.
7457 FastMathFlags FMF;
7458 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7459 FMF = FPCI->getFastMathFlags();
7460 SmallVector<const Value *> Arguments(CI->args());
7461 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7462 dyn_cast<IntrinsicInst>(CI));
7463 auto IntrinsicCost =
7464 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
7465
7466 auto Shape = VFShape::get(CI->getFunctionType(),
7467 ElementCount::getFixed(VecTy->getNumElements()),
7468 false /*HasGlobalPred*/);
7469 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7470 auto LibCost = IntrinsicCost;
7471 if (!CI->isNoBuiltin() && VecFunc) {
7472 // Calculate the cost of the vector library call.
7473 // If the corresponding vector call is cheaper, return its cost.
7474 LibCost =
7475 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7476 }
7477 return {IntrinsicCost, LibCost};
7478}
7479
7480void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7481 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7482 SmallVectorImpl<Value *> *OpScalars,
7483 SmallVectorImpl<Value *> *AltScalars) const {
7484 unsigned Sz = Scalars.size();
7485 Mask.assign(Sz, PoisonMaskElem);
7486 SmallVector<int> OrderMask;
7487 if (!ReorderIndices.empty())
7488 inversePermutation(ReorderIndices, OrderMask);
7489 for (unsigned I = 0; I < Sz; ++I) {
7490 unsigned Idx = I;
7491 if (!ReorderIndices.empty())
7492 Idx = OrderMask[I];
7493 auto *OpInst = cast<Instruction>(Scalars[Idx]);
7494 if (IsAltOp(OpInst)) {
7495 Mask[I] = Sz + Idx;
7496 if (AltScalars)
7497 AltScalars->push_back(OpInst);
7498 } else {
7499 Mask[I] = Idx;
7500 if (OpScalars)
7501 OpScalars->push_back(OpInst);
7502 }
7503 }
7504 if (!ReuseShuffleIndices.empty()) {
7505 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7506 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7507 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7508 });
7509 Mask.swap(NewMask);
7510 }
7511}
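// Illustrative example (added for exposition): for Scalars = {add, sub, add,
// sub} with IsAltOp matching the subs, the resulting mask is
// {0, Sz + 1, 2, Sz + 3} = {0, 5, 2, 7}: even lanes are taken from the
// vectorized main-opcode instruction and odd lanes from the vectorized
// alternate-opcode instruction.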
7512
7513static bool isAlternateInstruction(const Instruction *I,
7514 const Instruction *MainOp,
7515 const Instruction *AltOp,
7516 const TargetLibraryInfo &TLI) {
7517 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7518 auto *AltCI = cast<CmpInst>(AltOp);
7519 CmpInst::Predicate MainP = MainCI->getPredicate();
7520 CmpInst::Predicate AltP = AltCI->getPredicate();
7521 assert(MainP != AltP && "Expected different main/alternate predicates.");
7522 auto *CI = cast<CmpInst>(I);
7523 if (isCmpSameOrSwapped(MainCI, CI, TLI))
7524 return false;
7525 if (isCmpSameOrSwapped(AltCI, CI, TLI))
7526 return true;
7527 CmpInst::Predicate P = CI->getPredicate();
7528 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7529
7530 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7531 "CmpInst expected to match either main or alternate predicate or "
7532 "their swap.");
7533 (void)AltP;
7534 return MainP != P && MainP != SwappedP;
7535 }
7536 return I->getOpcode() == AltOp->getOpcode();
7537}
7538
7539TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7540 assert(!Ops.empty());
7541 const auto *Op0 = Ops.front();
7542
7543 const bool IsConstant = all_of(Ops, [](Value *V) {
7544 // TODO: We should allow undef elements here
7545 return isConstant(V) && !isa<UndefValue>(V);
7546 });
7547 const bool IsUniform = all_of(Ops, [=](Value *V) {
7548 // TODO: We should allow undef elements here
7549 return V == Op0;
7550 });
7551 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7552 // TODO: We should allow undef elements here
7553 if (auto *CI = dyn_cast<ConstantInt>(V))
7554 return CI->getValue().isPowerOf2();
7555 return false;
7556 });
7557 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7558 // TODO: We should allow undef elements here
7559 if (auto *CI = dyn_cast<ConstantInt>(V))
7560 return CI->getValue().isNegatedPowerOf2();
7561 return false;
7562 });
7563
7564 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7565 if (IsConstant && IsUniform)
7566 VK = TTI::OK_UniformConstantValue;
7567 else if (IsConstant)
7568 VK = TTI::OK_NonUniformConstantValue;
7569 else if (IsUniform)
7570 VK = TTI::OK_UniformValue;
7571
7572 TTI::OperandValueProperties VP = TTI::OP_None;
7573 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7574 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7575
7576 return {VK, VP};
7577}
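// Illustrative example (added for exposition): if every value in Ops is the
// constant 8, the result is {OK_UniformConstantValue, OP_PowerOf2}; for
// distinct non-constant operands it degrades to {OK_AnyValue, OP_None}.
// TTI uses this to cost the vector instruction more precisely, e.g. shifts
// or divisions by a uniform power-of-two constant are cheaper on most
// targets.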
7578
7579namespace {
7580/// The base class for shuffle instruction emission and shuffle cost estimation.
7581class BaseShuffleAnalysis {
7582protected:
7583 /// Checks if the mask is an identity mask.
7584 /// \param IsStrict if it is true, the function returns false if the mask size
7585 /// does not match the vector size.
7586 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7587 bool IsStrict) {
7588 int Limit = Mask.size();
7589 int VF = VecTy->getNumElements();
7590 int Index = -1;
7591 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7592 return true;
7593 if (!IsStrict) {
7594 // Consider extract subvector starting from index 0.
7595 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7596 Index == 0)
7597 return true;
7598 // All VF-size submasks are identity (e.g.
7599 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7600 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7601 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7602 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7603 ShuffleVectorInst::isIdentityMask(Slice, VF);
7604 }))
7605 return true;
7606 }
7607 return false;
7608 }
7609
7610 /// Tries to combine 2 different masks into single one.
7611 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7612 /// change the size of the vector, \p LocalVF is the original size of the
7613 /// shuffled vector.
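 /// For example (illustrative), with \p LocalVF 4, \p Mask <1,0,3,2> and
 /// \p ExtMask <0,2,poison,3>, the combined mask is <1,3,poison,2>: each
 /// defined element I of \p ExtMask selects element Mask[ExtMask[I]].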
7614 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7615 ArrayRef<int> ExtMask) {
7616 unsigned VF = Mask.size();
7617 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7618 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7619 if (ExtMask[I] == PoisonMaskElem)
7620 continue;
7621 int MaskedIdx = Mask[ExtMask[I] % VF];
7622 NewMask[I] =
7623 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7624 }
7625 Mask.swap(NewMask);
7626 }
7627
7628 /// Looks through shuffles trying to reduce final number of shuffles in the
7629 /// code. The function looks through the previously emitted shuffle
7630 /// instructions and properly marks indices in the mask as undef.
7631 /// For example, given the code
7632 /// \code
7633 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7634 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7635 /// \endcode
7636 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7637 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7638 /// <0, 1, 2, 3> for the shuffle.
7639 /// If 2 operands are of different size, the smallest one will be resized and
7640 /// the mask recalculated properly.
7641 /// For example, given the code
7642 /// \code
7643 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7644 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7645 /// \endcode
7646 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
7647 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7648 /// <0, 1, 2, 3> for the shuffle.
7649 /// So, it tries to transform permutations into a simple vector merge, if
7650 /// possible.
7651 /// \param V The input vector which must be shuffled using the given \p Mask.
7652 /// If the better candidate is found, \p V is set to this best candidate
7653 /// vector.
7654 /// \param Mask The input mask for the shuffle. If the best candidate is found
7655 /// during looking-through-shuffles attempt, it is updated accordingly.
7656 /// \param SinglePermute true if the shuffle operation is originally a
7657 /// single-value-permutation. In this case the look-through-shuffles procedure
7658 /// may look for resizing shuffles as the best candidates.
7659 /// \return true if the shuffle results in the non-resizing identity shuffle
7660 /// (and thus can be ignored), false otherwise.
7661 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7662 bool SinglePermute) {
7663 Value *Op = V;
7664 ShuffleVectorInst *IdentityOp = nullptr;
7665 SmallVector<int> IdentityMask;
7666 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7667 // Exit if not a fixed vector type or a size-changing shuffle.
7668 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7669 if (!SVTy)
7670 break;
7671 // Remember the identity or broadcast mask, if it is not a resizing
7672 // shuffle. If no better candidates are found, this Op and Mask will be
7673 // used in the final shuffle.
7674 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7675 if (!IdentityOp || !SinglePermute ||
7676 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7677 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7678 IdentityMask.size()))) {
7679 IdentityOp = SV;
7680 // Store the current mask in IdentityMask so that we do not lose this
7681 // info later if IdentityOp is selected as the best candidate for the
7682 // permutation.
7683 IdentityMask.assign(Mask);
7684 }
7685 }
7686 // Remember the broadcast mask. If no better candidates are found, this Op
7687 // and Mask will be used in the final shuffle.
7688 // Zero splat can be used as identity too, since it might be used with
7689 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7690 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
7691 // expensive, and the analysis finds out that the source vector is just a
7692 // broadcast, the original mask can be transformed to the identity mask <0,
7693 // 1, 2, 3>.
7694 // \code
7695 // %0 = shuffle %v, poison, zeroinitalizer
7696 // %res = shuffle %0, poison, <3, 1, 2, 0>
7697 // \endcode
7698 // may be transformed to
7699 // \code
7700 // %0 = shuffle %v, poison, zeroinitalizer
7701 // %res = shuffle %0, poison, <0, 1, 2, 3>
7702 // \endcode
7703 if (SV->isZeroEltSplat()) {
7704 IdentityOp = SV;
7705 IdentityMask.assign(Mask);
7706 }
7707 int LocalVF = Mask.size();
7708 if (auto *SVOpTy =
7709 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7710 LocalVF = SVOpTy->getNumElements();
7711 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7712 for (auto [Idx, I] : enumerate(Mask)) {
7713 if (I == PoisonMaskElem ||
7714 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7715 continue;
7716 ExtMask[Idx] = SV->getMaskValue(I);
7717 }
7718 bool IsOp1Undef =
7719 isUndefVector(SV->getOperand(0),
7720 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7721 .all();
7722 bool IsOp2Undef =
7723 isUndefVector(SV->getOperand(1),
7724 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7725 .all();
7726 if (!IsOp1Undef && !IsOp2Undef) {
7727 // Update mask and mark undef elems.
7728 for (int &I : Mask) {
7729 if (I == PoisonMaskElem)
7730 continue;
7731 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
7732 PoisonMaskElem)
7733 I = PoisonMaskElem;
7734 }
7735 break;
7736 }
7737 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7738 SV->getShuffleMask().end());
7739 combineMasks(LocalVF, ShuffleMask, Mask);
7740 Mask.swap(ShuffleMask);
7741 if (IsOp2Undef)
7742 Op = SV->getOperand(0);
7743 else
7744 Op = SV->getOperand(1);
7745 }
7746 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
7747 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7748 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
7749 if (IdentityOp) {
7750 V = IdentityOp;
7751 assert(Mask.size() == IdentityMask.size() &&
7752 "Expected masks of same sizes.");
7753 // Clear known poison elements.
7754 for (auto [I, Idx] : enumerate(Mask))
7755 if (Idx == PoisonMaskElem)
7756 IdentityMask[I] = PoisonMaskElem;
7757 Mask.swap(IdentityMask);
7758 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7759 return SinglePermute &&
7760 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
7761 /*IsStrict=*/true) ||
7762 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7763 Shuffle->isZeroEltSplat() &&
7764 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
7765 }
7766 V = Op;
7767 return false;
7768 }
7769 V = Op;
7770 return true;
7771 }
7772
7773 /// Smart shuffle instruction emission, walks through shuffles trees and
7774 /// tries to find the best matching vector for the actual shuffle
7775 /// instruction.
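 /// For example (illustrative), shuffling two 4-element vectors with mask
 /// <1,0,5,4> is first split into the per-operand masks <1,0,poison,poison> and
 /// <poison,poison,1,0>, and each operand is then peeked through independently.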
7776 template <typename T, typename ShuffleBuilderTy>
7777 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7778 ShuffleBuilderTy &Builder) {
7779 assert(V1 && "Expected at least one vector value.");
7780 if (V2)
7781 Builder.resizeToMatch(V1, V2);
7782 int VF = Mask.size();
7783 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
7784 VF = FTy->getNumElements();
7785 if (V2 &&
7786 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
7787 // Peek through shuffles.
7788 Value *Op1 = V1;
7789 Value *Op2 = V2;
7790 int VF =
7791 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7792 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7793 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7794 for (int I = 0, E = Mask.size(); I < E; ++I) {
7795 if (Mask[I] < VF)
7796 CombinedMask1[I] = Mask[I];
7797 else
7798 CombinedMask2[I] = Mask[I] - VF;
7799 }
7800 Value *PrevOp1;
7801 Value *PrevOp2;
7802 do {
7803 PrevOp1 = Op1;
7804 PrevOp2 = Op2;
7805 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
7806 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
7807 // Check if we have 2 resizing shuffles - need to peek through operands
7808 // again.
7809 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7810 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7811 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7812 for (auto [Idx, I] : enumerate(CombinedMask1)) {
7813 if (I == PoisonMaskElem)
7814 continue;
7815 ExtMask1[Idx] = SV1->getMaskValue(I);
7816 }
7817 SmallBitVector UseMask1 = buildUseMask(
7818 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7819 ->getNumElements(),
7820 ExtMask1, UseMask::SecondArg);
7821 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7822 for (auto [Idx, I] : enumerate(CombinedMask2)) {
7823 if (I == PoisonMaskElem)
7824 continue;
7825 ExtMask2[Idx] = SV2->getMaskValue(I);
7826 }
7827 SmallBitVector UseMask2 = buildUseMask(
7828 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7829 ->getNumElements(),
7830 ExtMask2, UseMask::SecondArg);
7831 if (SV1->getOperand(0)->getType() ==
7832 SV2->getOperand(0)->getType() &&
7833 SV1->getOperand(0)->getType() != SV1->getType() &&
7834 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
7835 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
7836 Op1 = SV1->getOperand(0);
7837 Op2 = SV2->getOperand(0);
7838 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7839 SV1->getShuffleMask().end());
7840 int LocalVF = ShuffleMask1.size();
7841 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
7842 LocalVF = FTy->getNumElements();
7843 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7844 CombinedMask1.swap(ShuffleMask1);
7845 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7846 SV2->getShuffleMask().end());
7847 LocalVF = ShuffleMask2.size();
7848 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
7849 LocalVF = FTy->getNumElements();
7850 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7851 CombinedMask2.swap(ShuffleMask2);
7852 }
7853 }
7854 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
7855 Builder.resizeToMatch(Op1, Op2);
7856 VF = std::max(cast<VectorType>(Op1->getType())
7857 ->getElementCount()
7858 .getKnownMinValue(),
7859 cast<VectorType>(Op2->getType())
7860 ->getElementCount()
7861 .getKnownMinValue());
7862 for (int I = 0, E = Mask.size(); I < E; ++I) {
7863 if (CombinedMask2[I] != PoisonMaskElem) {
7864 assert(CombinedMask1[I] == PoisonMaskElem &&
7865 "Expected undefined mask element");
7866 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
7867 }
7868 }
7869 if (Op1 == Op2 &&
7870 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
7871 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
7872 isa<ShuffleVectorInst>(Op1) &&
7873 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7874 ArrayRef(CombinedMask1))))
7875 return Builder.createIdentity(Op1);
7876 return Builder.createShuffleVector(
7877 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
7878 CombinedMask1);
7879 }
7880 if (isa<PoisonValue>(V1))
7881 return Builder.createPoison(
7882 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
7883 SmallVector<int> NewMask(Mask.begin(), Mask.end());
7884 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
7885 assert(V1 && "Expected non-null value after looking through shuffles.");
7886
7887 if (!IsIdentity)
7888 return Builder.createShuffleVector(V1, NewMask);
7889 return Builder.createIdentity(V1);
7890 }
7891};
7892} // namespace
7893
7894/// Returns the cost of the shuffle instructions with the given \p Kind, vector
7895/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
7896/// subvector pattern.
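/// For example (illustrative), over 8-element sources the two-source mask
/// <0,1,8,9,4,5,6,7> is an insert-subvector mask: it keeps the first source and
/// inserts a 2-element slice of the second source at index 2.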
7897static InstructionCost
7898getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
7899 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
7900 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
7901 int Index = 0, VectorType *SubTp = nullptr,
7902 ArrayRef<const Value *> Args = std::nullopt) {
7903 if (Kind != TTI::SK_PermuteTwoSrc)
7904 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7905 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7906 int NumSubElts;
7907 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
7908 Mask, NumSrcElts, NumSubElts, Index)) {
7909 if (Index + NumSubElts > NumSrcElts &&
7910 Index + NumSrcElts <= static_cast<int>(Mask.size()))
7911 return TTI.getShuffleCost(
7912 TTI::SK_InsertSubvector,
7913 FixedVectorType::get(Tp->getElementType(), Mask.size()), Mask,
7914 TTI::TCK_RecipThroughput, Index, Tp);
7915 }
7916 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7917}
7918
7919/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
7920static std::pair<InstructionCost, InstructionCost>
7921getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
7922 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
7923 Type *ScalarTy, VectorType *VecTy) {
7924 InstructionCost ScalarCost = 0;
7925 InstructionCost VecCost = 0;
7926 // Here we differentiate two cases: (1) when Ptrs represent a regular
7927 // vectorization tree node (as they are pointer arguments of scattered
7928 // loads) or (2) when Ptrs are the arguments of loads or stores being
7929 // vectorized as a plain wide unit-stride load/store since all the
7930 // loads/stores are known to be from/to adjacent locations.
7931 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7932 // Case 2: estimate costs for pointer related costs when vectorizing to
7933 // a wide load/store.
7934 // Scalar cost is estimated as a set of pointers with known relationship
7935 // between them.
7936 // For vector code we will use BasePtr as argument for the wide load/store
7937 // but we also need to account for all the instructions which are going to
7938 // stay in vectorized code due to uses outside of these scalar
7939 // loads/stores.
7940 ScalarCost = TTI.getPointersChainCost(
7941 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7942 CostKind);
7943
7944 SmallVector<const Value *> PtrsRetainedInVecCode;
7945 for (Value *V : Ptrs) {
7946 if (V == BasePtr) {
7947 PtrsRetainedInVecCode.push_back(V);
7948 continue;
7949 }
7950 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7951 // For simplicity, assume Ptr stays in vectorized code if it's not a
7952 // GEP instruction. We don't care since its cost is considered free.
7953 // TODO: We should check for any uses outside of vectorizable tree
7954 // rather than just single use.
7955 if (!Ptr || !Ptr->hasOneUse())
7956 PtrsRetainedInVecCode.push_back(V);
7957 }
7958
7959 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
7960 // If all pointers stay in vectorized code then we don't have
7961 // any savings on that.
7962 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
7963 }
7964 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
7965 TTI::PointersChainInfo::getKnownStride(),
7966 VecTy, CostKind);
7967 } else {
7968 // Case 1: Ptrs are the arguments of loads that we are going to transform
7969 // into masked gather load intrinsic.
7970 // All the scalar GEPs will be removed as a result of vectorization.
7971 // For any external uses of some lanes extract element instructions will
7972 // be generated (which cost is estimated separately).
7973 TTI::PointersChainInfo PtrsInfo =
7974 all_of(Ptrs,
7975 [](const Value *V) {
7976 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7977 return Ptr && !Ptr->hasAllConstantIndices();
7978 })
7979 ? TTI::PointersChainInfo::getUnknownStride()
7980 : TTI::PointersChainInfo::getKnownStride();
7981
7982 ScalarCost =
7983 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
7984 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
7985 if (!BaseGEP) {
7986 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
7987 if (It != Ptrs.end())
7988 BaseGEP = cast<GEPOperator>(*It);
7989 }
7990 if (BaseGEP) {
7991 SmallVector<const Value *> Indices(BaseGEP->indices());
7992 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
7993 BaseGEP->getPointerOperand(), Indices, VecTy,
7994 CostKind);
7995 }
7996 }
7997
7998 return std::make_pair(ScalarCost, VecCost);
7999}
8000
8001void BoUpSLP::transformNodes() {
8002 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8003 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8004 TreeEntry &E = *TE.get();
8005 switch (E.getOpcode()) {
8006 case Instruction::Load: {
8007 Type *ScalarTy = E.getMainOp()->getType();
8008 auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
8009 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
8010 // Check if profitable to represent consecutive load + reverse as strided
8011 // load with stride -1.
8012 if (isReverseOrder(E.ReorderIndices) &&
8013 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8014 SmallVector<int> Mask;
8015 inversePermutation(E.ReorderIndices, Mask);
8016 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
8017 InstructionCost OriginalVecCost =
8018 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
8019 BaseLI->getPointerAddressSpace(), CostKind,
8020 TTI::OperandValueInfo()) +
8021 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8022 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8023 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
8024 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
8025 if (StridedCost < OriginalVecCost)
8026 // Strided load is more profitable than consecutive load + reverse -
8027 // transform the node to strided load.
8028 E.State = TreeEntry::StridedVectorize;
8029 }
8030 break;
8031 }
8032 case Instruction::Store: {
8033 Type *ScalarTy =
8034 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
8035 auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
8036 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
8037 // Check if profitable to represent reverse + consecutive store as a
8038 // strided store with stride -1.
8039 if (isReverseOrder(E.ReorderIndices) &&
8040 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8041 SmallVector<int> Mask;
8042 inversePermutation(E.ReorderIndices, Mask);
8043 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
8044 InstructionCost OriginalVecCost =
8045 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
8046 BaseSI->getPointerAddressSpace(), CostKind,
8047 TTI::OperandValueInfo()) +
8048 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8049 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8050 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
8051 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
8052 if (StridedCost < OriginalVecCost)
8053 // Strided store is more profitable than reverse + consecutive store -
8054 // transform the node to strided store.
8055 E.State = TreeEntry::StridedVectorize;
8056 }
8057 break;
8058 }
8059 default:
8060 break;
8061 }
8062 }
8063}
8064
8065/// Merges shuffle masks and emits final shuffle instruction, if required. It
8066/// supports shuffling of 2 input vectors. It implements lazy shuffle emission,
8067/// where the actual shuffle instruction is generated only if it is actually
8068/// required. Otherwise, the shuffle instruction emission is delayed till the
8069/// end of the process, to reduce the number of emitted instructions and further
8070/// analysis/transformations.
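/// Illustrative note: this estimator mirrors the shuffle emission logic but
/// produces TTI-based costs instead of IR, so a chain of add()/finalize() calls
/// can be priced without creating any instructions.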
8071class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
8072 bool IsFinalized = false;
8073 SmallVector<int> CommonMask;
8074 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
8075 Type *ScalarTy = nullptr;
8076 const TargetTransformInfo &TTI;
8077 InstructionCost Cost = 0;
8078 SmallDenseSet<Value *> VectorizedVals;
8079 BoUpSLP &R;
8080 SmallPtrSetImpl<Value *> &CheckedExtracts;
8081 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8082 /// While set, we are still trying to estimate the cost for the same nodes and
8083 /// can delay the actual cost estimation (virtual shuffle instruction emission).
8084 /// This may help to better estimate the cost if the same nodes must be permuted,
8085 /// and it allows moving most of the long shuffle cost estimation to TTI.
8086 bool SameNodesEstimated = true;
8087
8088 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
8089 if (Ty->getScalarType()->isPointerTy()) {
8090 Constant *Res = ConstantExpr::getIntToPtr(
8091 Constant::getAllOnesValue(
8092 IntegerType::get(Ty->getContext(),
8093 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
8094 Ty->getScalarType());
8095 if (auto *VTy = dyn_cast<VectorType>(Ty))
8096 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
8097 return Res;
8098 }
8099 return Constant::getAllOnesValue(Ty);
8100 }
8101
8102 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
8103 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
8104 return TTI::TCC_Free;
8105 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
8106 InstructionCost GatherCost = 0;
8107 SmallVector<Value *> Gathers(VL.begin(), VL.end());
8108 // Improve gather cost for gather of loads, if we can group some of the
8109 // loads into vector loads.
8110 InstructionsState S = getSameOpcode(VL, *R.TLI);
8111 const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
8112 unsigned MinVF = R.getMinVF(2 * Sz);
8113 if (VL.size() > 2 &&
8114 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8115 (InVectors.empty() &&
8116 any_of(seq<unsigned>(0, VL.size() / MinVF),
8117 [&](unsigned Idx) {
8118 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
8119 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
8120 return S.getOpcode() == Instruction::Load &&
8121 !S.isAltShuffle();
8122 }))) &&
8123 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
8124 !isSplat(Gathers)) {
8125 InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
8126 SetVector<Value *> VectorizedLoads;
8127 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
8128 SmallVector<unsigned> ScatterVectorized;
8129 unsigned StartIdx = 0;
8130 unsigned VF = VL.size() / 2;
8131 for (; VF >= MinVF; VF /= 2) {
8132 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
8133 Cnt += VF) {
8134 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
8135 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8136 InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
8137 if (SliceS.getOpcode() != Instruction::Load ||
8138 SliceS.isAltShuffle())
8139 continue;
8140 }
8141 if (!VectorizedLoads.count(Slice.front()) &&
8142 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
8143 SmallVector<Value *> PointerOps;
8144 OrdersType CurrentOrder;
8145 LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
8146 CurrentOrder, PointerOps);
8147 switch (LS) {
8148 case LoadsState::Vectorize:
8149 case LoadsState::ScatterVectorize:
8150 case LoadsState::StridedVectorize:
8151 // Mark the vectorized loads so that we don't vectorize them
8152 // again.
8153 // TODO: better handling of loads with reorders.
8154 if (((LS == LoadsState::Vectorize ||
8155 LS == LoadsState::StridedVectorize) &&
8156 CurrentOrder.empty()) ||
8157 (LS == LoadsState::StridedVectorize &&
8158 isReverseOrder(CurrentOrder)))
8159 VectorizedStarts.emplace_back(Cnt, LS);
8160 else
8161 ScatterVectorized.push_back(Cnt);
8162 VectorizedLoads.insert(Slice.begin(), Slice.end());
8163 // If we vectorized initial block, no need to try to vectorize
8164 // it again.
8165 if (Cnt == StartIdx)
8166 StartIdx += VF;
8167 break;
8168 case LoadsState::Gather:
8169 break;
8170 }
8171 }
8172 }
8173 // Check if the whole array was vectorized already - exit.
8174 if (StartIdx >= VL.size())
8175 break;
8176 // Found vectorizable parts - exit.
8177 if (!VectorizedLoads.empty())
8178 break;
8179 }
8180 if (!VectorizedLoads.empty()) {
8181 unsigned NumParts = TTI.getNumberOfParts(VecTy);
8182 bool NeedInsertSubvectorAnalysis =
8183 !NumParts || (VL.size() / VF) > NumParts;
8184 // Get the cost for gathered loads.
8185 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
8186 if (VectorizedLoads.contains(VL[I]))
8187 continue;
8188 GatherCost +=
8189 getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
8190 }
8191 // Exclude potentially vectorized loads from list of gathered
8192 // scalars.
8193 Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
8194 // The cost for vectorized loads.
8195 InstructionCost ScalarsCost = 0;
8196 for (Value *V : VectorizedLoads) {
8197 auto *LI = cast<LoadInst>(V);
8198 ScalarsCost +=
8199 TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
8200 LI->getAlign(), LI->getPointerAddressSpace(),
8201 CostKind, TTI::OperandValueInfo(), LI);
8202 }
8203 auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
8204 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
8205 auto *LI = cast<LoadInst>(VL[P.first]);
8206 Align Alignment = LI->getAlign();
8207 GatherCost +=
8208 P.second == LoadsState::Vectorize
8209 ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
8210 LI->getPointerAddressSpace(), CostKind,
8211 TTI::OperandValueInfo(), LI)
8212 : TTI.getStridedMemoryOpCost(
8213 Instruction::Load, LoadTy, LI->getPointerOperand(),
8214 /*VariableMask=*/false, Alignment, CostKind, LI);
8215 // Estimate GEP cost.
8216 SmallVector<Value *> PointerOps(VF);
8217 for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
8218 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8219 auto [ScalarGEPCost, VectorGEPCost] =
8220 getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
8221 Instruction::Load, CostKind, LI->getType(), LoadTy);
8222 GatherCost += VectorGEPCost - ScalarGEPCost;
8223 }
8224 for (unsigned P : ScatterVectorized) {
8225 auto *LI0 = cast<LoadInst>(VL[P]);
8226 ArrayRef<Value *> Slice = VL.slice(P, VF);
8227 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8228 GatherCost += TTI.getGatherScatterOpCost(
8229 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8230 /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
8231 // Estimate GEP cost.
8232 SmallVector<Value *> PointerOps(VF);
8233 for (auto [I, V] : enumerate(Slice))
8234 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8235 OrdersType Order;
8236 if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
8237 Order)) {
8238 // TODO: improve checks if GEPs can be vectorized.
8239 Value *Ptr0 = PointerOps.front();
8240 Type *ScalarTy = Ptr0->getType();
8241 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
8242 auto [ScalarGEPCost, VectorGEPCost] =
8243 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
8244 CostKind, ScalarTy, VecTy);
8245 GatherCost += VectorGEPCost - ScalarGEPCost;
8246 if (!Order.empty()) {
8247 SmallVector<int> Mask;
8248 inversePermutation(Order, Mask);
8249 GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8250 VecTy, Mask, CostKind);
8251 }
8252 } else {
8253 GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
8254 PointerOps.front()->getType());
8255 }
8256 }
8257 if (NeedInsertSubvectorAnalysis) {
8258 // Add the cost for the subvectors insert.
8259 SmallVector<int> ShuffleMask(VL.size());
8260 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8261 for (unsigned Idx : seq<unsigned>(0, E))
8262 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8263 GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
8264 ShuffleMask, CostKind, I, LoadTy);
8265 }
8266 }
8267 GatherCost -= ScalarsCost;
8268 }
8269 GatherCost = std::min(BaseCost, GatherCost);
8270 } else if (!Root && isSplat(VL)) {
8271 // Found the broadcasting of the single scalar, calculate the cost as
8272 // the broadcast.
8273 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8274 assert(It != VL.end() && "Expected at least one non-undef value.");
8275 // Add broadcast for non-identity shuffle only.
8276 bool NeedShuffle =
8277 count(VL, *It) > 1 &&
8278 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
8279 if (!NeedShuffle)
8280 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
8281 CostKind, std::distance(VL.begin(), It),
8282 PoisonValue::get(VecTy), *It);
8283
8284 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8285 transform(VL, ShuffleMask.begin(), [](Value *V) {
8286 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8287 });
8288 InstructionCost InsertCost =
8289 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
8290 PoisonValue::get(VecTy), *It);
8291 return InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast,
8292 VecTy, ShuffleMask, CostKind,
8293 /*Index=*/0, /*SubTp=*/nullptr,
8294 /*Args=*/*It);
8295 }
8296 return GatherCost +
8297 (all_of(Gathers, IsaPred<UndefValue>)
8298 ? TTI::TCC_Free
8299 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
8300 ScalarTy));
8301 };
8302
8303 /// Compute the cost of creating a vector containing the extracted values from
8304 /// \p VL.
8305 InstructionCost
8306 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8307 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8308 unsigned NumParts) {
8309 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8310 unsigned NumElts =
8311 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8312 auto *EE = dyn_cast<ExtractElementInst>(V);
8313 if (!EE)
8314 return Sz;
8315 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8316 if (!VecTy)
8317 return Sz;
8318 return std::max(Sz, VecTy->getNumElements());
8319 });
8320 unsigned NumSrcRegs =
8321 TTI.getNumberOfParts(FixedVectorType::get(ScalarTy, NumElts));
8322 if (NumSrcRegs == 0)
8323 NumSrcRegs = 1;
8324 // FIXME: this must be moved to TTI for better estimation.
8325 unsigned EltsPerVector = PowerOf2Ceil(std::max(
8326 divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
8327 auto CheckPerRegistersShuffle =
8328 [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
8329 DenseSet<int> RegIndices;
8330 // Check if we are trying to permute the same single or 2 input vectors.
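 // Illustrative note: RegId below packs (source vector, register) into one
 // number; e.g. with NumElts 8, NumParts 2 and EltsPerVector 4, mask element
 // 10 lands in RegId (10 / 8) * 2 + (10 % 8) / 4 = 2.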
8331 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8332 int FirstRegId = -1;
8333 for (int &I : Mask) {
8334 if (I == PoisonMaskElem)
8335 continue;
8336 int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
8337 if (FirstRegId < 0)
8338 FirstRegId = RegId;
8339 RegIndices.insert(RegId);
8340 if (RegIndices.size() > 2)
8341 return std::nullopt;
8342 if (RegIndices.size() == 2)
8343 ShuffleKind = TTI::SK_PermuteTwoSrc;
8344 I = (I % NumElts) % EltsPerVector +
8345 (RegId == FirstRegId ? 0 : EltsPerVector);
8346 }
8347 return ShuffleKind;
8348 };
8349 InstructionCost Cost = 0;
8350
8351 // Process extracts in blocks of EltsPerVector to check if the source vector
8352 // operand can be re-used directly. If not, add the cost of creating a
8353 // shuffle to extract the values into a vector register.
8354 for (unsigned Part = 0; Part < NumParts; ++Part) {
8355 if (!ShuffleKinds[Part])
8356 continue;
8357 ArrayRef<int> MaskSlice =
8358 Mask.slice(Part * EltsPerVector,
8359 (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8360 ? Mask.size() % EltsPerVector
8361 : EltsPerVector);
8362 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8363 copy(MaskSlice, SubMask.begin());
8364 std::optional<TTI::ShuffleKind> RegShuffleKind =
8365 CheckPerRegistersShuffle(SubMask);
8366 if (!RegShuffleKind) {
8367 Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
8368 FixedVectorType::get(ScalarTy, NumElts),
8369 MaskSlice);
8370 continue;
8371 }
8372 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8373 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
8374 Cost += ::getShuffleCost(TTI, *RegShuffleKind,
8375 FixedVectorType::get(ScalarTy, EltsPerVector),
8376 SubMask);
8377 }
8378 }
8379 return Cost;
8380 }
8381 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
8382 /// shuffle emission.
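 /// For example (illustrative), given \p Mask <2,poison,0,1>, elements 0, 2 and
 /// 3 of \p CommonMask become 0, 2 and 3 (their positions in the just-emitted
 /// shuffle result), and element 1 is left untouched.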
8383 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8384 ArrayRef<int> Mask) {
8385 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8386 if (Mask[Idx] != PoisonMaskElem)
8387 CommonMask[Idx] = Idx;
8388 }
8389 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
8390 /// mask \p Mask, register number \p Part, that includes \p SliceSize
8391 /// elements.
8392 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8393 ArrayRef<int> Mask, unsigned Part,
8394 unsigned SliceSize) {
8395 if (SameNodesEstimated) {
8396 // Delay the cost estimation if the same nodes are reshuffling.
8397 // If we already requested the cost of reshuffling of E1 and E2 before, no
8398 // need to estimate another cost with the sub-Mask, instead include this
8399 // sub-Mask into the CommonMask to estimate it later and avoid double cost
8400 // estimation.
8401 if ((InVectors.size() == 2 &&
8402 InVectors.front().get<const TreeEntry *>() == &E1 &&
8403 InVectors.back().get<const TreeEntry *>() == E2) ||
8404 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8405 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
8406 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8407 "Expected all poisoned elements.");
8408 ArrayRef<int> SubMask =
8409 ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
8410 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8411 return;
8412 }
8413 // Found non-matching nodes - need to estimate the cost for the matched
8414 // and transform mask.
8415 Cost += createShuffle(InVectors.front(),
8416 InVectors.size() == 1 ? nullptr : InVectors.back(),
8417 CommonMask);
8418 transformMaskAfterShuffle(CommonMask, CommonMask);
8419 }
8420 SameNodesEstimated = false;
8421 if (!E2 && InVectors.size() == 1) {
8422 unsigned VF = E1.getVectorFactor();
8423 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8424 VF = std::max(VF,
8425 cast<FixedVectorType>(V1->getType())->getNumElements());
8426 } else {
8427 const auto *E = InVectors.front().get<const TreeEntry *>();
8428 VF = std::max(VF, E->getVectorFactor());
8429 }
8430 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8431 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8432 CommonMask[Idx] = Mask[Idx] + VF;
8433 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8434 transformMaskAfterShuffle(CommonMask, CommonMask);
8435 } else {
8436 Cost += createShuffle(&E1, E2, Mask);
8437 transformMaskAfterShuffle(CommonMask, Mask);
8438 }
8439 }
8440
8441 class ShuffleCostBuilder {
8442 const TargetTransformInfo &TTI;
8443
8444 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8445 int Index = -1;
8446 return Mask.empty() ||
8447 (VF == Mask.size() &&
8448 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
8449 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
8450 Index == 0);
8451 }
8452
8453 public:
8454 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8455 ~ShuffleCostBuilder() = default;
8456 InstructionCost createShuffleVector(Value *V1, Value *,
8457 ArrayRef<int> Mask) const {
8458 // Empty mask or identity mask are free.
8459 unsigned VF =
8460 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8461 if (isEmptyOrIdentity(Mask, VF))
8462 return TTI::TCC_Free;
8463 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8464 cast<VectorType>(V1->getType()), Mask);
8465 }
8466 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8467 // Empty mask or identity mask are free.
8468 unsigned VF =
8469 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8470 if (isEmptyOrIdentity(Mask, VF))
8471 return TTI::TCC_Free;
8472 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8473 cast<VectorType>(V1->getType()), Mask);
8474 }
8475 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8476 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8477 return TTI::TCC_Free;
8478 }
8479 void resizeToMatch(Value *&, Value *&) const {}
8480 };
8481
8482 /// Smart shuffle instruction emission, walks through shuffles trees and
8483 /// tries to find the best matching vector for the actual shuffle
8484 /// instruction.
8485 InstructionCost
8486 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8487 const PointerUnion<Value *, const TreeEntry *> &P2,
8488 ArrayRef<int> Mask) {
8489 ShuffleCostBuilder Builder(TTI);
8490 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8491 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8492 unsigned CommonVF = Mask.size();
8493 InstructionCost ExtraCost = 0;
8494 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
8495 unsigned VF) -> InstructionCost {
8496 if (E.State == TreeEntry::NeedToGather && allConstant(E.Scalars))
8497 return TTI::TCC_Free;
8498 Type *EScalarTy = E.Scalars.front()->getType();
8499 bool IsSigned = true;
8500 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8501 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
8502 IsSigned = It->second.second;
8503 }
8504 if (EScalarTy != ScalarTy) {
8505 unsigned CastOpcode = Instruction::Trunc;
8506 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8507 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8508 if (DstSz > SrcSz)
8509 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8510 return TTI.getCastInstrCost(CastOpcode,
8511 FixedVectorType::get(ScalarTy, VF),
8512 FixedVectorType::get(EScalarTy, VF),
8513 TTI::CastContextHint::None, CostKind);
8514 }
8515 return TTI::TCC_Free;
8516 };
8517 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
8518 if (isa<Constant>(V))
8519 return TTI::TCC_Free;
8520 auto *VecTy = cast<VectorType>(V->getType());
8521 Type *EScalarTy = VecTy->getElementType();
8522 if (EScalarTy != ScalarTy) {
8523 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
8524 unsigned CastOpcode = Instruction::Trunc;
8525 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8526 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8527 if (DstSz > SrcSz)
8528 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8529 return TTI.getCastInstrCost(
8530 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
8531 VecTy, TTI::CastContextHint::None, CostKind);
8532 }
8533 return TTI::TCC_Free;
8534 };
8535 if (!V1 && !V2 && !P2.isNull()) {
8536 // Shuffle 2 entry nodes.
8537 const TreeEntry *E = P1.get<const TreeEntry *>();
8538 unsigned VF = E->getVectorFactor();
8539 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8540 CommonVF = std::max(VF, E2->getVectorFactor());
8541 assert(all_of(Mask,
8542 [=](int Idx) {
8543 return Idx < 2 * static_cast<int>(CommonVF);
8544 }) &&
8545 "All elements in mask must be less than 2 * CommonVF.");
8546 if (E->Scalars.size() == E2->Scalars.size()) {
8547 SmallVector<int> EMask = E->getCommonMask();
8548 SmallVector<int> E2Mask = E2->getCommonMask();
8549 if (!EMask.empty() || !E2Mask.empty()) {
8550 for (int &Idx : CommonMask) {
8551 if (Idx == PoisonMaskElem)
8552 continue;
8553 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8554 Idx = EMask[Idx];
8555 else if (Idx >= static_cast<int>(CommonVF))
8556 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8557 E->Scalars.size();
8558 }
8559 }
8560 CommonVF = E->Scalars.size();
8561 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8562 GetNodeMinBWAffectedCost(*E2, CommonVF);
8563 } else {
8564 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8565 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8566 }
8567 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8568 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8569 } else if (!V1 && P2.isNull()) {
8570 // Shuffle single entry node.
8571 const TreeEntry *E = P1.get<const TreeEntry *>();
8572 unsigned VF = E->getVectorFactor();
8573 CommonVF = VF;
8574 assert(
8575 all_of(Mask,
8576 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8577 "All elements in mask must be less than CommonVF.");
8578 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8579 SmallVector<int> EMask = E->getCommonMask();
8580 assert(!EMask.empty() && "Expected non-empty common mask.");
8581 for (int &Idx : CommonMask) {
8582 if (Idx != PoisonMaskElem)
8583 Idx = EMask[Idx];
8584 }
8585 CommonVF = E->Scalars.size();
8586 }
8587 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8588 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8589 // Not identity/broadcast? Try to see if the original vector is better.
8590 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8591 CommonVF == CommonMask.size() &&
8592 any_of(enumerate(CommonMask),
8593 [](const auto &&P) {
8594 return P.value() != PoisonMaskElem &&
8595 static_cast<unsigned>(P.value()) != P.index();
8596 }) &&
8597 any_of(CommonMask,
8598 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8599 SmallVector<int> ReorderMask;
8600 inversePermutation(E->ReorderIndices, ReorderMask);
8601 ::addMask(CommonMask, ReorderMask);
8602 }
8603 } else if (V1 && P2.isNull()) {
8604 // Shuffle single vector.
8605 ExtraCost += GetValueMinBWAffectedCost(V1);
8606 CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
8607 assert(
8608 all_of(Mask,
8609 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8610 "All elements in mask must be less than CommonVF.");
8611 } else if (V1 && !V2) {
8612 // Shuffle vector and tree node.
8613 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8614 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8615 CommonVF = std::max(VF, E2->getVectorFactor());
8616 assert(all_of(Mask,
8617 [=](int Idx) {
8618 return Idx < 2 * static_cast<int>(CommonVF);
8619 }) &&
8620 "All elements in mask must be less than 2 * CommonVF.");
8621 if (E2->Scalars.size() == VF && VF != CommonVF) {
8622 SmallVector<int> E2Mask = E2->getCommonMask();
8623 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8624 for (int &Idx : CommonMask) {
8625 if (Idx == PoisonMaskElem)
8626 continue;
8627 if (Idx >= static_cast<int>(CommonVF))
8628 Idx = E2Mask[Idx - CommonVF] + VF;
8629 }
8630 CommonVF = VF;
8631 }
8632 ExtraCost += GetValueMinBWAffectedCost(V1);
8633 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8634 ExtraCost += GetNodeMinBWAffectedCost(
8635 *E2, std::min(CommonVF, E2->getVectorFactor()));
8636 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8637 } else if (!V1 && V2) {
8638 // Shuffle vector and tree node.
8639 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8640 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8641 CommonVF = std::max(VF, E1->getVectorFactor());
8642 assert(all_of(Mask,
8643 [=](int Idx) {
8644 return Idx < 2 * static_cast<int>(CommonVF);
8645 }) &&
8646 "All elements in mask must be less than 2 * CommonVF.");
8647 if (E1->Scalars.size() == VF && VF != CommonVF) {
8648 SmallVector<int> E1Mask = E1->getCommonMask();
8649 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8650 for (int &Idx : CommonMask) {
8651 if (Idx == PoisonMaskElem)
8652 continue;
8653 if (Idx >= static_cast<int>(CommonVF))
8654 Idx = E1Mask[Idx - CommonVF] + VF;
8655 else
8656 Idx = E1Mask[Idx];
8657 }
8658 CommonVF = VF;
8659 }
8660 ExtraCost += GetNodeMinBWAffectedCost(
8661 *E1, std::min(CommonVF, E1->getVectorFactor()));
8662 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8663 ExtraCost += GetValueMinBWAffectedCost(V2);
8664 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8665 } else {
8666 assert(V1 && V2 && "Expected both vectors.");
8667 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8668 CommonVF =
8669 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8670 assert(all_of(Mask,
8671 [=](int Idx) {
8672 return Idx < 2 * static_cast<int>(CommonVF);
8673 }) &&
8674 "All elements in mask must be less than 2 * CommonVF.");
8675 ExtraCost +=
8676 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8677 if (V1->getType() != V2->getType()) {
8678 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8679 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8680 } else {
8681 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
8682 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8683 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
8684 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8685 }
8686 }
8687 InVectors.front() = Constant::getNullValue(
8688 FixedVectorType::get(ScalarTy, CommonMask.size()));
8689 if (InVectors.size() == 2)
8690 InVectors.pop_back();
8691 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8692 V1, V2, CommonMask, Builder);
8693 }
8694
8695public:
8696 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
8697 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8698 SmallPtrSetImpl<Value *> &CheckedExtracts)
8699 : ScalarTy(ScalarTy), TTI(TTI),
8700 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8701 CheckedExtracts(CheckedExtracts) {}
8702 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8703 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8704 unsigned NumParts, bool &UseVecBaseAsInput) {
8705 UseVecBaseAsInput = false;
8706 if (Mask.empty())
8707 return nullptr;
8708 Value *VecBase = nullptr;
8709 ArrayRef<Value *> VL = E->Scalars;
8710 // If the resulting type is scalarized, do not adjust the cost.
8711 if (NumParts == VL.size())
8712 return nullptr;
8713 // Check if the extracts can be considered reused, i.e. if the same
8714 // extractelements were vectorized already.
8715 bool PrevNodeFound = any_of(
8716 ArrayRef(R.VectorizableTree).take_front(E->Idx),
8717 [&](const std::unique_ptr<TreeEntry> &TE) {
8718 return ((!TE->isAltShuffle() &&
8719 TE->getOpcode() == Instruction::ExtractElement) ||
8720 TE->State == TreeEntry::NeedToGather) &&
8721 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8722 return VL.size() > Data.index() &&
8723 (Mask[Data.index()] == PoisonMaskElem ||
8724 isa<UndefValue>(VL[Data.index()]) ||
8725 Data.value() == VL[Data.index()]);
8726 });
8727 });
8728 SmallPtrSet<Value *, 4> UniqueBases;
8729 unsigned SliceSize = VL.size() / NumParts;
8730 for (unsigned Part = 0; Part < NumParts; ++Part) {
8731 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8732 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
8733 // Ignore non-extractelement scalars.
8734 if (isa<UndefValue>(V) ||
8735 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8736 continue;
8737 // If all users of instruction are going to be vectorized and this
8738 // instruction itself is not going to be vectorized, consider this
8739 // instruction as dead and remove its cost from the final cost of the
8740 // vectorized tree.
8741 // Also, avoid adjusting the cost for extractelements with multiple uses
8742 // in different graph entries.
8743 auto *EE = cast<ExtractElementInst>(V);
8744 VecBase = EE->getVectorOperand();
8745 UniqueBases.insert(VecBase);
8746 const TreeEntry *VE = R.getTreeEntry(V);
8747 if (!CheckedExtracts.insert(V).second ||
8748 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8749 any_of(EE->users(),
8750 [&](User *U) {
8751 return isa<GetElementPtrInst>(U) &&
8752 !R.areAllUsersVectorized(cast<Instruction>(U),
8753 &VectorizedVals);
8754 }) ||
8755 (VE && VE != E))
8756 continue;
8757 std::optional<unsigned> EEIdx = getExtractIndex(EE);
8758 if (!EEIdx)
8759 continue;
8760 unsigned Idx = *EEIdx;
8761 // Take credit for instruction that will become dead.
8762 if (EE->hasOneUse() || !PrevNodeFound) {
8763 Instruction *Ext = EE->user_back();
8764 if (isa<SExtInst, ZExtInst>(Ext) &&
8765 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8766 // Use getExtractWithExtendCost() to calculate the cost of
8767 // extractelement/ext pair.
8768 Cost -=
8769 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
8770 EE->getVectorOperandType(), Idx);
8771 // Add back the cost of s|zext which is subtracted separately.
8772 Cost += TTI.getCastInstrCost(
8773 Ext->getOpcode(), Ext->getType(), EE->getType(),
8774 TTI::getCastContextHint(Ext), CostKind, Ext);
8775 continue;
8776 }
8777 }
8778 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
8779 CostKind, Idx);
8780 }
8781 }
8782 // Check that the gather of extractelements can be represented as just a
8783 // shuffle of the single/two vectors the scalars are extracted from.
8784 // We found a bunch of extractelement instructions that must be gathered
8785 // into a vector and can be represented as a permutation of elements in a
8786 // single input vector or in 2 input vectors.
8787 // Already accounted for if the same extractelements were vectorized earlier (reused).
8788 if (!PrevNodeFound)
8789 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8790 InVectors.assign(1, E);
8791 CommonMask.assign(Mask.begin(), Mask.end());
8792 transformMaskAfterShuffle(CommonMask, CommonMask);
8793 SameNodesEstimated = false;
8794 if (NumParts != 1 && UniqueBases.size() != 1) {
8795 UseVecBaseAsInput = true;
8796 VecBase = Constant::getNullValue(
8797 FixedVectorType::get(ScalarTy, CommonMask.size()));
8798 }
8799 return VecBase;
8800 }
8801 /// Checks if the specified entry \p E needs to be delayed because of its
8802 /// dependency nodes.
8803 std::optional<InstructionCost>
8804 needToDelay(const TreeEntry *,
8805 ArrayRef<SmallVector<const TreeEntry *>>) const {
8806 // No need to delay the cost estimation during analysis.
8807 return std::nullopt;
8808 }
8809 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
8810 if (&E1 == &E2) {
8811 assert(all_of(Mask,
8812 [&](int Idx) {
8813 return Idx < static_cast<int>(E1.getVectorFactor());
8814 }) &&
8815 "Expected single vector shuffle mask.");
8816 add(E1, Mask);
8817 return;
8818 }
8819 if (InVectors.empty()) {
8820 CommonMask.assign(Mask.begin(), Mask.end());
8821 InVectors.assign({&E1, &E2});
8822 return;
8823 }
8824 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8825 auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
8826 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8827 if (NumParts == 0 || NumParts >= Mask.size())
8828 NumParts = 1;
8829 unsigned SliceSize = Mask.size() / NumParts;
8830 const auto *It =
8831 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8832 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8833 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8834 }
8835 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
8836 if (InVectors.empty()) {
8837 CommonMask.assign(Mask.begin(), Mask.end());
8838 InVectors.assign(1, &E1);
8839 return;
8840 }
8841 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8842 auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
8843 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8844 if (NumParts == 0 || NumParts >= Mask.size())
8845 NumParts = 1;
8846 unsigned SliceSize = Mask.size() / NumParts;
8847 const auto *It =
8848 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8849 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8850 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
8851 if (!SameNodesEstimated && InVectors.size() == 1)
8852 InVectors.emplace_back(&E1);
8853 }
8854 /// Adds 2 input vectors and the mask for their shuffling.
8855 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
8856 // May come only for shuffling of 2 vectors with extractelements, already
8857 // handled in adjustExtracts.
8858 assert(InVectors.size() == 1 &&
8859 all_of(enumerate(CommonMask),
8860 [&](auto P) {
8861 if (P.value() == PoisonMaskElem)
8862 return Mask[P.index()] == PoisonMaskElem;
8863 auto *EI =
8864 cast<ExtractElementInst>(InVectors.front()
8865 .get<const TreeEntry *>()
8866 ->Scalars[P.index()]);
8867 return EI->getVectorOperand() == V1 ||
8868 EI->getVectorOperand() == V2;
8869 }) &&
8870 "Expected extractelement vectors.");
8871 }
8872 /// Adds one more input vector and the mask for the shuffling.
8873 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
8874 if (InVectors.empty()) {
8875 assert(CommonMask.empty() && !ForExtracts &&
8876 "Expected empty input mask/vectors.");
8877 CommonMask.assign(Mask.begin(), Mask.end());
8878 InVectors.assign(1, V1);
8879 return;
8880 }
8881 if (ForExtracts) {
8882 // No need to add vectors here, already handled them in adjustExtracts.
8883 assert(InVectors.size() == 1 &&
8884 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
8885 all_of(enumerate(CommonMask),
8886 [&](auto P) {
8887 Value *Scalar = InVectors.front()
8888 .get<const TreeEntry *>()
8889 ->Scalars[P.index()];
8890 if (P.value() == PoisonMaskElem)
8891 return P.value() == Mask[P.index()] ||
8892 isa<UndefValue>(Scalar);
8893 if (isa<Constant>(V1))
8894 return true;
8895 auto *EI = cast<ExtractElementInst>(Scalar);
8896 return EI->getVectorOperand() == V1;
8897 }) &&
8898 "Expected only tree entry for extractelement vectors.");
8899 return;
8900 }
8901 assert(!InVectors.empty() && !CommonMask.empty() &&
8902 "Expected only tree entries from extracts/reused buildvectors.");
8903 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8904 if (InVectors.size() == 2) {
8905 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
8906 transformMaskAfterShuffle(CommonMask, CommonMask);
8907 VF = std::max<unsigned>(VF, CommonMask.size());
8908 } else if (const auto *InTE =
8909 InVectors.front().dyn_cast<const TreeEntry *>()) {
8910 VF = std::max(VF, InTE->getVectorFactor());
8911 } else {
8912 VF = std::max(
8913 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
8914 ->getNumElements());
8915 }
8916 InVectors.push_back(V1);
8917 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8918 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8919 CommonMask[Idx] = Mask[Idx] + VF;
8920 }
8921 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
8922 Value *Root = nullptr) {
8923 Cost += getBuildVectorCost(VL, Root);
8924 if (!Root) {
8925 // FIXME: Need to find a way to avoid use of getNullValue here.
8926 SmallVector<Constant *> Vals;
8927 unsigned VF = VL.size();
8928 if (MaskVF != 0)
8929 VF = std::min(VF, MaskVF);
8930 for (Value *V : VL.take_front(VF)) {
8931 if (isa<UndefValue>(V)) {
8932 Vals.push_back(cast<Constant>(V));
8933 continue;
8934 }
8935 Vals.push_back(Constant::getNullValue(V->getType()));
8936 }
8937 return ConstantVector::get(Vals);
8938 }
8939 return ConstantVector::getSplat(
8940 ElementCount::getFixed(
8941 cast<FixedVectorType>(Root->getType())->getNumElements()),
8942 getAllOnesValue(*R.DL, ScalarTy));
8943 }
8945 /// Finalize emission of the shuffles.
8946 InstructionCost
8947 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
8948 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
8949 IsFinalized = true;
8950 if (Action) {
8951 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
8952 if (InVectors.size() == 2)
8953 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
8954 else
8955 Cost += createShuffle(Vec, nullptr, CommonMask);
8956 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8957 if (CommonMask[Idx] != PoisonMaskElem)
8958 CommonMask[Idx] = Idx;
8959 assert(VF > 0 &&
8960 "Expected vector length for the final value before action.");
8961 Value *V = Vec.get<Value *>();
8962 Action(V, CommonMask);
8963 InVectors.front() = V;
8964 }
8965 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
8966 if (CommonMask.empty()) {
8967 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
8968 return Cost;
8969 }
8970 return Cost +
8971 createShuffle(InVectors.front(),
8972 InVectors.size() == 2 ? InVectors.back() : nullptr,
8973 CommonMask);
8974 }
8975
8976 ~ShuffleCostEstimator() {
8977 assert((IsFinalized || CommonMask.empty()) &&
8978 "Shuffle construction must be finalized.");
8979 }
8980};
8981
8982const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
8983 unsigned Idx) const {
8984 Value *Op = E->getOperand(Idx).front();
8985 if (const TreeEntry *TE = getTreeEntry(Op)) {
8986 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8987 return EI.EdgeIdx == Idx && EI.UserTE == E;
8988 }) != TE->UserTreeIndices.end())
8989 return TE;
8990 auto MIt = MultiNodeScalars.find(Op);
8991 if (MIt != MultiNodeScalars.end()) {
8992 for (const TreeEntry *TE : MIt->second) {
8993 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8994 return EI.EdgeIdx == Idx && EI.UserTE == E;
8995 }) != TE->UserTreeIndices.end())
8996 return TE;
8997 }
8998 }
8999 }
9000 const auto *It =
9001 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9002 return TE->State == TreeEntry::NeedToGather &&
9003 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9004 return EI.EdgeIdx == Idx && EI.UserTE == E;
9005 }) != TE->UserTreeIndices.end();
9006 });
9007 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
9008 return It->get();
9009}
9010
9011TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
9012 if (TE.State == TreeEntry::ScatterVectorize ||
9013 TE.State == TreeEntry::StridedVectorize)
9014 return TTI::CastContextHint::GatherScatter;
9015 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
9016 !TE.isAltShuffle()) {
9017 if (TE.ReorderIndices.empty())
9018 return TTI::CastContextHint::Normal;
9019 SmallVector<int> Mask;
9020 inversePermutation(TE.ReorderIndices, Mask);
9021 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
9022 return TTI::CastContextHint::Reversed;
9023 }
9024 return TTI::CastContextHint::None;
9025}
9026
9027/// Builds the arguments types vector for the given call instruction with the
9028/// given \p ID for the specified vector factor.
9029static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
9030 const Intrinsic::ID ID,
9031 const unsigned VF,
9032 unsigned MinBW) {
9033 SmallVector<Type *> ArgTys;
9034 for (auto [Idx, Arg] : enumerate(CI->args())) {
9035 if (ID != Intrinsic::not_intrinsic) {
9036 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
9037 ArgTys.push_back(Arg->getType());
9038 continue;
9039 }
9040 if (MinBW > 0) {
9041 ArgTys.push_back(FixedVectorType::get(
9042 IntegerType::get(CI->getContext(), MinBW), VF));
9043 continue;
9044 }
9045 }
9046 ArgTys.push_back(FixedVectorType::get(Arg->getType(), VF));
9047 }
9048 return ArgTys;
9049}
9050
9051InstructionCost
9052BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9053 SmallPtrSetImpl<Value *> &CheckedExtracts) {
9054 ArrayRef<Value *> VL = E->Scalars;
9055
9056 Type *ScalarTy = VL[0]->getType();
9057 if (E->State != TreeEntry::NeedToGather) {
9058 if (auto *SI = dyn_cast<StoreInst>(VL[0]))
9059 ScalarTy = SI->getValueOperand()->getType();
9060 else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
9061 ScalarTy = CI->getOperand(0)->getType();
9062 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
9063 ScalarTy = IE->getOperand(1)->getType();
9064 }
9065 if (!isValidElementType(ScalarTy))
9066 return InstructionCost::getInvalid();
9067 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
9068 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9069
9070 // If we have computed a smaller type for the expression, update VecTy so
9071 // that the costs will be accurate.
9072 auto It = MinBWs.find(E);
9073 Type *OrigScalarTy = ScalarTy;
9074 if (It != MinBWs.end()) {
9075 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
9076 VecTy = FixedVectorType::get(ScalarTy, VL.size());
9077 }
9078 unsigned EntryVF = E->getVectorFactor();
9079 auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
9080
9081 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9082 if (E->State == TreeEntry::NeedToGather) {
9083 if (allConstant(VL))
9084 return 0;
9085 if (isa<InsertElementInst>(VL[0]))
9086 return InstructionCost::getInvalid();
9087 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9088 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
9089 }
9090 InstructionCost CommonCost = 0;
9091 SmallVector<int> Mask;
9092 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
9093 if (!E->ReorderIndices.empty() &&
9094 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9095 SmallVector<int> NewMask;
9096 if (E->getOpcode() == Instruction::Store) {
9097 // For stores the order is actually a mask.
9098 NewMask.resize(E->ReorderIndices.size());
9099 copy(E->ReorderIndices, NewMask.begin());
9100 } else {
9101 inversePermutation(E->ReorderIndices, NewMask);
9102 }
9103 ::addMask(Mask, NewMask);
9104 }
9105 if (NeedToShuffleReuses)
9106 ::addMask(Mask, E->ReuseShuffleIndices);
9107 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
9108 CommonCost =
9109 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
9110 assert((E->State == TreeEntry::Vectorize ||
9111 E->State == TreeEntry::ScatterVectorize ||
9112 E->State == TreeEntry::StridedVectorize) &&
9113 "Unhandled state");
9114 assert(E->getOpcode() &&
9115 ((allSameType(VL) && allSameBlock(VL)) ||
9116 (E->getOpcode() == Instruction::GetElementPtr &&
9117 E->getMainOp()->getType()->isPointerTy())) &&
9118 "Invalid VL");
9119 Instruction *VL0 = E->getMainOp();
9120 unsigned ShuffleOrOp =
9121 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
9122 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
9123 const unsigned Sz = UniqueValues.size();
9124 SmallBitVector UsedScalars(Sz, false);
9125 for (unsigned I = 0; I < Sz; ++I) {
9126 if (getTreeEntry(UniqueValues[I]) == E)
9127 continue;
9128 UsedScalars.set(I);
9129 }
9130 auto GetCastContextHint = [&](Value *V) {
9131 if (const TreeEntry *OpTE = getTreeEntry(V))
9132 return getCastContextHint(*OpTE);
9133 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
9134 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9135 return TTI::CastContextHint::GatherScatter;
9136 return TTI::CastContextHint::None;
9137 };
9138 auto GetCostDiff =
9139 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
9140 function_ref<InstructionCost(InstructionCost)> VectorCost) {
9141 // Calculate the cost of this instruction.
9142 InstructionCost ScalarCost = 0;
9143 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
9144 // For some of the instructions there is no need to calculate the cost for
9145 // each particular instruction; we can use the cost of a single instruction
9146 // x the total number of scalar instructions.
9147 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9148 } else {
9149 for (unsigned I = 0; I < Sz; ++I) {
9150 if (UsedScalars.test(I))
9151 continue;
9152 ScalarCost += ScalarEltCost(I);
9153 }
9154 }
9155
9156 InstructionCost VecCost = VectorCost(CommonCost);
9157 // Check if the current node must be resized, if the parent node is not
9158 // resized.
9159 if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
9160 const EdgeInfo &EI = E->UserTreeIndices.front();
9161 if ((EI.UserTE->getOpcode() != Instruction::Select ||
9162 EI.EdgeIdx != 0) &&
9163 It != MinBWs.end()) {
9164 auto UserBWIt = MinBWs.find(EI.UserTE);
9165 Type *UserScalarTy =
9166 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
9167 if (UserBWIt != MinBWs.end())
9168 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
9169 UserBWIt->second.first);
9170 if (ScalarTy != UserScalarTy) {
9171 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9172 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
9173 unsigned VecOpcode;
9174 auto *UserVecTy =
9175 FixedVectorType::get(UserScalarTy, E->getVectorFactor());
9176 if (BWSz > SrcBWSz)
9177 VecOpcode = Instruction::Trunc;
9178 else
9179 VecOpcode =
9180 It->second.second ? Instruction::SExt : Instruction::ZExt;
9181 TTI::CastContextHint CCH = GetCastContextHint(VL0);
9182 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
9183 CostKind);
9184 }
9185 }
9186 }
9187 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9188 ScalarCost, "Calculated costs for Tree"));
9189 return VecCost - ScalarCost;
9190 };
9191 // Calculate cost difference from vectorizing set of GEPs.
9192 // Negative value means vectorizing is profitable.
9193 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
9194 assert((E->State == TreeEntry::Vectorize ||
9195 E->State == TreeEntry::StridedVectorize) &&
9196 "Entry state expected to be Vectorize or StridedVectorize here.");
9197 InstructionCost ScalarCost = 0;
9198 InstructionCost VecCost = 0;
9199 std::tie(ScalarCost, VecCost) = getGEPCosts(
9200 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
9201 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9202 "Calculated GEPs cost for Tree"));
9203
9204 return VecCost - ScalarCost;
9205 };
9206
9207 switch (ShuffleOrOp) {
9208 case Instruction::PHI: {
9209 // Count reused scalars.
9210 InstructionCost ScalarCost = 0;
9211 SmallPtrSet<const TreeEntry *, 4> CountedOps;
9212 for (Value *V : UniqueValues) {
9213 auto *PHI = dyn_cast<PHINode>(V);
9214 if (!PHI)
9215 continue;
9216
9217 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
9218 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
9219 Value *Op = PHI->getIncomingValue(I);
9220 Operands[I] = Op;
9221 }
9222 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
9223 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
9224 if (!OpTE->ReuseShuffleIndices.empty())
9225 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9226 OpTE->Scalars.size());
9227 }
9228
9229 return CommonCost - ScalarCost;
9230 }
9231 case Instruction::ExtractValue:
9232 case Instruction::ExtractElement: {
9233 auto GetScalarCost = [&](unsigned Idx) {
9234 auto *I = cast<Instruction>(UniqueValues[Idx]);
9235 VectorType *SrcVecTy;
9236 if (ShuffleOrOp == Instruction::ExtractElement) {
9237 auto *EE = cast<ExtractElementInst>(I);
9238 SrcVecTy = EE->getVectorOperandType();
9239 } else {
9240 auto *EV = cast<ExtractValueInst>(I);
9241 Type *AggregateTy = EV->getAggregateOperand()->getType();
9242 unsigned NumElts;
9243 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9244 NumElts = ATy->getNumElements();
9245 else
9246 NumElts = AggregateTy->getStructNumElements();
9247 SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts);
9248 }
9249 if (I->hasOneUse()) {
9250 Instruction *Ext = I->user_back();
9251 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9252 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9253 // Use getExtractWithExtendCost() to calculate the cost of
9254 // extractelement/ext pair.
9255 InstructionCost Cost = TTI->getExtractWithExtendCost(
9256 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
9257 // Subtract the cost of s|zext which is subtracted separately.
9258 Cost -= TTI->getCastInstrCost(
9259 Ext->getOpcode(), Ext->getType(), I->getType(),
9260 TTI::CastContextHint::None, CostKind, Ext);
9261 return Cost;
9262 }
9263 }
9264 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
9265 CostKind, *getExtractIndex(I));
9266 };
9267 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9268 return GetCostDiff(GetScalarCost, GetVectorCost);
9269 }
9270 case Instruction::InsertElement: {
9271 assert(E->ReuseShuffleIndices.empty() &&
9272 "Unique insertelements only are expected.");
9273 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
9274 unsigned const NumElts = SrcVecTy->getNumElements();
9275 unsigned const NumScalars = VL.size();
9276
9277 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
9278
9279 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9280 unsigned OffsetBeg = *getInsertIndex(VL.front());
9281 unsigned OffsetEnd = OffsetBeg;
9282 InsertMask[OffsetBeg] = 0;
9283 for (auto [I, V] : enumerate(VL.drop_front())) {
9284 unsigned Idx = *getInsertIndex(V);
9285 if (OffsetBeg > Idx)
9286 OffsetBeg = Idx;
9287 else if (OffsetEnd < Idx)
9288 OffsetEnd = Idx;
9289 InsertMask[Idx] = I + 1;
9290 }
9291 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
9292 if (NumOfParts > 0)
9293 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9294 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9295 VecScalarsSz;
9296 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9297 unsigned InsertVecSz = std::min<unsigned>(
9298 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
9299 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9300 bool IsWholeSubvector =
9301 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9302 // Check if we can safely insert a subvector. If it is not possible, just
9303 // generate a whole-sized vector and shuffle the source vector and the new
9304 // subvector.
9305 if (OffsetBeg + InsertVecSz > VecSz) {
9306 // Align OffsetBeg to generate correct mask.
9307 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
9308 InsertVecSz = VecSz;
9309 }
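// Illustrative example (assumed numbers, not from the original file): with
// NumElts = 8 and NumOfParts = 2 the per-register chunk is VecScalarsSz = 4;
// inserts at indices 2..5 then give OffsetBeg = 2, OffsetEnd = 5, VecSz = 8,
// Offset = 0 and InsertVecSz = 4, so the buildvector may be costed as
// inserting a 4-wide subvector into the 8-wide destination.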
9310
9311 APInt DemandedElts = APInt::getZero(NumElts);
9312 // TODO: Add support for Instruction::InsertValue.
9313 SmallVector<int> Mask;
9314 if (!E->ReorderIndices.empty()) {
9315 inversePermutation(E->ReorderIndices, Mask);
9316 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
9317 } else {
9318 Mask.assign(VecSz, PoisonMaskElem);
9319 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9320 }
9321 bool IsIdentity = true;
9322 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9323 Mask.swap(PrevMask);
9324 for (unsigned I = 0; I < NumScalars; ++I) {
9325 unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
9326 DemandedElts.setBit(InsertIdx);
9327 IsIdentity &= InsertIdx - OffsetBeg == I;
9328 Mask[InsertIdx - OffsetBeg] = I;
9329 }
9330 assert(Offset < NumElts && "Failed to find vector index offset");
9331
9332 InstructionCost Cost = 0;
9333 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
9334 /*Insert*/ true, /*Extract*/ false,
9335 CostKind);
9336
9337 // First cost - resize to actual vector size if not identity shuffle or
9338 // need to shift the vector.
9339 // Do not calculate the cost if the actual size is the register size and
9340 // we can merge this shuffle with the following SK_Select.
9341 auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz);
9342 if (!IsIdentity)
9343 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
9344 InsertVecTy, Mask);
9345 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9346 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9347 }));
9348 // Second cost - permutation with subvector, if some elements are from the
9349 // initial vector or inserting a subvector.
9350 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9351 // subvector of ActualVecTy.
9352 SmallBitVector InMask =
9353 isUndefVector(FirstInsert->getOperand(0),
9354 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9355 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9356 if (InsertVecSz != VecSz) {
9357 auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz);
9358 Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
9359 std::nullopt, CostKind, OffsetBeg - Offset,
9360 InsertVecTy);
9361 } else {
9362 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9363 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
9364 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9365 I <= End; ++I)
9366 if (Mask[I] != PoisonMaskElem)
9367 Mask[I] = I + VecSz;
9368 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9369 Mask[I] =
9370 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
9371 Cost +=
9372 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
9373 }
9374 }
9375 return Cost;
9376 }
9377 case Instruction::ZExt:
9378 case Instruction::SExt:
9379 case Instruction::FPToUI:
9380 case Instruction::FPToSI:
9381 case Instruction::FPExt:
9382 case Instruction::PtrToInt:
9383 case Instruction::IntToPtr:
9384 case Instruction::SIToFP:
9385 case Instruction::UIToFP:
9386 case Instruction::Trunc:
9387 case Instruction::FPTrunc:
9388 case Instruction::BitCast: {
9389 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9390 Type *SrcScalarTy = VL0->getOperand(0)->getType();
9391 auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9392 unsigned Opcode = ShuffleOrOp;
9393 unsigned VecOpcode = Opcode;
9394 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9395 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9396 // Check if the values are candidates to demote.
9397 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
9398 if (SrcIt != MinBWs.end()) {
9399 SrcBWSz = SrcIt->second.first;
9400 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
9401 SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9402 }
9403 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9404 if (BWSz == SrcBWSz) {
9405 VecOpcode = Instruction::BitCast;
9406 } else if (BWSz < SrcBWSz) {
9407 VecOpcode = Instruction::Trunc;
9408 } else if (It != MinBWs.end()) {
9409 assert(BWSz > SrcBWSz && "Invalid cast!");
9410 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9411 } else if (SrcIt != MinBWs.end()) {
9412 assert(BWSz > SrcBWSz && "Invalid cast!");
9413 VecOpcode =
9414 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9415 }
9416 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9417 !SrcIt->second.second) {
9418 VecOpcode = Instruction::UIToFP;
9419 }
9420 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9421 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9422 return TTI->getCastInstrCost(Opcode, VL0->getType(),
9423 VL0->getOperand(0)->getType(),
9424 TTI::getCastContextHint(VI), CostKind, VI);
9425 };
9426 auto GetVectorCost = [=](InstructionCost CommonCost) {
9427 // Do not count cost here if minimum bitwidth is in effect and it is just
9428 // a bitcast (here it is just a noop).
9429 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9430 return CommonCost;
9431 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9432 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
9433 return CommonCost +
9434 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
9435 VecOpcode == Opcode ? VI : nullptr);
9436 };
9437 return GetCostDiff(GetScalarCost, GetVectorCost);
9438 }
9439 case Instruction::FCmp:
9440 case Instruction::ICmp:
9441 case Instruction::Select: {
9442 CmpInst::Predicate VecPred, SwappedVecPred;
9443 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
9444 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
9445 match(VL0, MatchCmp))
9446 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
9447 else
9448 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9449 ? CmpInst::BAD_FCMP_PREDICATE
9450 : CmpInst::BAD_ICMP_PREDICATE;
9451 auto GetScalarCost = [&](unsigned Idx) {
9452 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9453 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9454 ? CmpInst::BAD_FCMP_PREDICATE
9455 : CmpInst::BAD_ICMP_PREDICATE;
9456 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
9457 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
9458 !match(VI, MatchCmp)) ||
9459 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9460 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9461 ? CmpInst::BAD_FCMP_PREDICATE
9462 : CmpInst::BAD_ICMP_PREDICATE;
9463
9464 return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy,
9465 Builder.getInt1Ty(), CurrentPred, CostKind,
9466 VI);
9467 };
9468 auto GetVectorCost = [&](InstructionCost CommonCost) {
9469 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9470
9471 InstructionCost VecCost = TTI->getCmpSelInstrCost(
9472 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9473 // Check if it is possible and profitable to use min/max for selects
9474 // in VL.
9475 //
9476 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
9477 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
9478 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
9479 {VecTy, VecTy});
9480 InstructionCost IntrinsicCost =
9481 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9482 // If the selects are the only uses of the compares, they will be
9483 // dead and we can adjust the cost by removing their cost.
9484 if (IntrinsicAndUse.second)
9485 IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
9486 MaskTy, VecPred, CostKind);
9487 VecCost = std::min(VecCost, IntrinsicCost);
9488 }
9489 return VecCost + CommonCost;
9490 };
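// Illustrative note (not from the original file): the min/max conversion
// above covers patterns such as select(icmp sgt %a, %b), %a, %b, which can be
// priced as a single llvm.smax(%a, %b); when the compares feed only those
// selects, the compare cost is subtracted because the compares become dead.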
9491 return GetCostDiff(GetScalarCost, GetVectorCost);
9492 }
9493 case Instruction::FNeg:
9494 case Instruction::Add:
9495 case Instruction::FAdd:
9496 case Instruction::Sub:
9497 case Instruction::FSub:
9498 case Instruction::Mul:
9499 case Instruction::FMul:
9500 case Instruction::UDiv:
9501 case Instruction::SDiv:
9502 case Instruction::FDiv:
9503 case Instruction::URem:
9504 case Instruction::SRem:
9505 case Instruction::FRem:
9506 case Instruction::Shl:
9507 case Instruction::LShr:
9508 case Instruction::AShr:
9509 case Instruction::And:
9510 case Instruction::Or:
9511 case Instruction::Xor: {
9512 auto GetScalarCost = [&](unsigned Idx) {
9513 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9514 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9515 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
9516 TTI::OperandValueInfo Op2Info =
9517 TTI::getOperandInfo(VI->getOperand(OpIdx));
9518 SmallVector<const Value *> Operands(VI->operand_values());
9519 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
9520 Op1Info, Op2Info, Operands, VI);
9521 };
9522 auto GetVectorCost = [=](InstructionCost CommonCost) {
9523 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
9524 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
9525 ArrayRef<Value *> Ops = E->getOperand(I);
9526 if (all_of(Ops, [&](Value *Op) {
9527 auto *CI = dyn_cast<ConstantInt>(Op);
9528 return CI && CI->getValue().countr_one() >= It->second.first;
9529 }))
9530 return CommonCost;
9531 }
9532 }
9533 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9534 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
9535 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
9536 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
9537 Op2Info, std::nullopt, nullptr, TLI) +
9538 CommonCost;
9539 };
9540 return GetCostDiff(GetScalarCost, GetVectorCost);
9541 }
9542 case Instruction::GetElementPtr: {
9543 return CommonCost + GetGEPCostDiff(VL, VL0);
9544 }
9545 case Instruction::Load: {
9546 auto GetScalarCost = [&](unsigned Idx) {
9547 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
9548 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
9549 VI->getAlign(), VI->getPointerAddressSpace(),
9550 CostKind, TTI::OperandValueInfo(), VI);
9551 };
9552 auto *LI0 = cast<LoadInst>(VL0);
9553 auto GetVectorCost = [&](InstructionCost CommonCost) {
9554 InstructionCost VecLdCost;
9555 if (E->State == TreeEntry::Vectorize) {
9556 VecLdCost = TTI->getMemoryOpCost(
9557 Instruction::Load, VecTy, LI0->getAlign(),
9558 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
9559 } else if (E->State == TreeEntry::StridedVectorize) {
9560 Align CommonAlignment =
9561 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9562 VecLdCost = TTI->getStridedMemoryOpCost(
9563 Instruction::Load, VecTy, LI0->getPointerOperand(),
9564 /*VariableMask=*/false, CommonAlignment, CostKind);
9565 } else {
9566 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9567 Align CommonAlignment =
9568 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9569 VecLdCost = TTI->getGatherScatterOpCost(
9570 Instruction::Load, VecTy, LI0->getPointerOperand(),
9571 /*VariableMask=*/false, CommonAlignment, CostKind);
9572 }
9573 return VecLdCost + CommonCost;
9574 };
9575
9576 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9577 // If this node generates a masked gather load then it is not a terminal
9578 // node. Hence the address operand cost is estimated separately.
9579 if (E->State == TreeEntry::ScatterVectorize)
9580 return Cost;
9581
9582 // Estimate cost of GEPs since this tree node is a terminator.
9583 SmallVector<Value *> PointerOps(VL.size());
9584 for (auto [I, V] : enumerate(VL))
9585 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
9586 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9587 }
9588 case Instruction::Store: {
9589 bool IsReorder = !E->ReorderIndices.empty();
9590 auto GetScalarCost = [=](unsigned Idx) {
9591 auto *VI = cast<StoreInst>(VL[Idx]);
9592 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
9593 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
9594 VI->getAlign(), VI->getPointerAddressSpace(),
9595 CostKind, OpInfo, VI);
9596 };
9597 auto *BaseSI =
9598 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9599 auto GetVectorCost = [=](InstructionCost CommonCost) {
9600 // We know that we can merge the stores. Calculate the cost.
9601 InstructionCost VecStCost;
9602 if (E->State == TreeEntry::StridedVectorize) {
9603 Align CommonAlignment =
9604 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
9605 VecStCost = TTI->getStridedMemoryOpCost(
9606 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9607 /*VariableMask=*/false, CommonAlignment, CostKind);
9608 } else {
9609 assert(E->State == TreeEntry::Vectorize &&
9610 "Expected either strided or consecutive stores.");
9611 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
9612 VecStCost = TTI->getMemoryOpCost(
9613 Instruction::Store, VecTy, BaseSI->getAlign(),
9614 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
9615 }
9616 return VecStCost + CommonCost;
9617 };
9618 SmallVector<Value *> PointerOps(VL.size());
9619 for (auto [I, V] : enumerate(VL)) {
9620 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9621 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
9622 }
9623
9624 return GetCostDiff(GetScalarCost, GetVectorCost) +
9625 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9626 }
9627 case Instruction::Call: {
9628 auto GetScalarCost = [&](unsigned Idx) {
9629 auto *CI = cast<CallInst>(UniqueValues[Idx]);
9630 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9631 if (ID != Intrinsic::not_intrinsic) {
9632 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9633 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9634 }
9635 return TTI->getCallInstrCost(CI->getCalledFunction(),
9636 CI->getFunctionType()->getReturnType(),
9637 CI->getFunctionType()->params(), CostKind);
9638 };
9639 auto GetVectorCost = [=](InstructionCost CommonCost) {
9640 auto *CI = cast<CallInst>(VL0);
9641 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9642 SmallVector<Type *> ArgTys =
9643 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
9644 It != MinBWs.end() ? It->second.first : 0);
9645 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9646 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9647 };
9648 return GetCostDiff(GetScalarCost, GetVectorCost);
9649 }
9650 case Instruction::ShuffleVector: {
9651 assert(E->isAltShuffle() &&
9652 ((Instruction::isBinaryOp(E->getOpcode()) &&
9653 Instruction::isBinaryOp(E->getAltOpcode())) ||
9654 (Instruction::isCast(E->getOpcode()) &&
9655 Instruction::isCast(E->getAltOpcode())) ||
9656 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9657 "Invalid Shuffle Vector Operand");
9658 // Try to find the previous shuffle node with the same operands and same
9659 // main/alternate ops.
9660 auto TryFindNodeWithEqualOperands = [=]() {
9661 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9662 if (TE.get() == E)
9663 break;
9664 if (TE->isAltShuffle() &&
9665 ((TE->getOpcode() == E->getOpcode() &&
9666 TE->getAltOpcode() == E->getAltOpcode()) ||
9667 (TE->getOpcode() == E->getAltOpcode() &&
9668 TE->getAltOpcode() == E->getOpcode())) &&
9669 TE->hasEqualOperands(*E))
9670 return true;
9671 }
9672 return false;
9673 };
9674 auto GetScalarCost = [&](unsigned Idx) {
9675 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9676 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9677 (void)E;
9678 return TTI->getInstructionCost(VI, CostKind);
9679 };
9680 // Need to clear CommonCost since the final shuffle cost is included into
9681 // vector cost.
9682 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9683 // VecCost is equal to sum of the cost of creating 2 vectors
9684 // and the cost of creating shuffle.
9685 InstructionCost VecCost = 0;
9686 if (TryFindNodeWithEqualOperands()) {
9687 LLVM_DEBUG({
9688 dbgs() << "SLP: diamond match for alternate node found.\n";
9689 E->dump();
9690 });
9691 // No need to add new vector costs here since we're going to reuse
9692 // same main/alternate vector ops, just do different shuffling.
9693 } else if (Instruction::isBinaryOp(E->getOpcode())) {
9694 VecCost =
9695 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
9696 VecCost +=
9697 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
9698 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9699 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9700 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9701 CI0->getPredicate(), CostKind, VL0);
9702 VecCost += TTIRef.getCmpSelInstrCost(
9703 E->getOpcode(), VecTy, MaskTy,
9704 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
9705 E->getAltOp());
9706 } else {
9707 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9708 auto *SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9709 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9710 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9711 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9712 unsigned SrcBWSz =
9713 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9714 if (SrcIt != MinBWs.end()) {
9715 SrcBWSz = SrcIt->second.first;
9716 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
9717 SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9718 }
9719 if (BWSz <= SrcBWSz) {
9720 if (BWSz < SrcBWSz)
9721 VecCost =
9722 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9723 TTI::CastContextHint::None, CostKind);
9724 LLVM_DEBUG({
9725 dbgs()
9726 << "SLP: alternate extension, which should be truncated.\n";
9727 E->dump();
9728 });
9729 return VecCost;
9730 }
9731 }
9732 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9733 TTI::CastContextHint::None, CostKind);
9734 VecCost +=
9735 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9736 TTI::CastContextHint::None, CostKind);
9737 }
9738 SmallVector<int> Mask;
9739 E->buildAltOpShuffleMask(
9740 [E](Instruction *I) {
9741 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9742 return I->getOpcode() == E->getAltOpcode();
9743 },
9744 Mask);
9745 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
9746 FinalVecTy, Mask);
9747 // Patterns like [fadd,fsub] can be combined into a single instruction
9748 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9749 // need to take into account their order when looking for the most used
9750 // order.
9751 unsigned Opcode0 = E->getOpcode();
9752 unsigned Opcode1 = E->getAltOpcode();
9753 // The opcode mask selects between the two opcodes.
9754 SmallBitVector OpcodeMask(E->Scalars.size(), false);
9755 for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
9756 if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
9757 OpcodeMask.set(Lane);
9758 // If this pattern is supported by the target then we consider the
9759 // order.
9760 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9761 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9762 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9763 return AltVecCost < VecCost ? AltVecCost : VecCost;
9764 }
9765 // TODO: Check the reverse order too.
9766 return VecCost;
9767 };
9768 return GetCostDiff(GetScalarCost, GetVectorCost);
9769 }
9770 default:
9771 llvm_unreachable("Unknown instruction");
9772 }
9773}
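// Illustrative note (not from the original file): getEntryCost returns
// VecCost - ScalarCost for each tree entry, so a negative result means the
// vectorized form of the bundle is expected to be cheaper than the original
// scalar instructions.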
9774
9775bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9776 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
9777 << VectorizableTree.size() << " is fully vectorizable.\n");
9778
9779 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
9780 SmallVector<int> Mask;
9781 return TE->State == TreeEntry::NeedToGather &&
9782 !any_of(TE->Scalars,
9783 [this](Value *V) { return EphValues.contains(V); }) &&
9784 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
9785 TE->Scalars.size() < Limit ||
9786 ((TE->getOpcode() == Instruction::ExtractElement ||
9787 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
9788 isFixedVectorShuffle(TE->Scalars, Mask)) ||
9789 (TE->State == TreeEntry::NeedToGather &&
9790 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
9791 };
9792
9793 // We only handle trees of heights 1 and 2.
9794 if (VectorizableTree.size() == 1 &&
9795 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9796 (ForReduction &&
9797 AreVectorizableGathers(VectorizableTree[0].get(),
9798 VectorizableTree[0]->Scalars.size()) &&
9799 VectorizableTree[0]->getVectorFactor() > 2)))
9800 return true;
9801
9802 if (VectorizableTree.size() != 2)
9803 return false;
9804
9805 // Handle splat and all-constants stores. Also try to vectorize tiny trees
9806 // with the second gather node if it has fewer scalar operands than the
9807 // initial tree element (it may be profitable to shuffle the second gather),
9808 // or if its scalars are extractelements, which form a shuffle.
9809 SmallVector<int> Mask;
9810 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9811 AreVectorizableGathers(VectorizableTree[1].get(),
9812 VectorizableTree[0]->Scalars.size()))
9813 return true;
9814
9815 // Gathering cost would be too much for tiny trees.
9816 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9817 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9818 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9819 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9820 return false;
9821
9822 return true;
9823}
9824
9825static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
9826 TargetTransformInfo *TTI,
9827 bool MustMatchOrInst) {
9828 // Look past the root to find a source value. Arbitrarily follow the
9829 // path through operand 0 of any 'or'. Also, peek through optional
9830 // shift-left-by-multiple-of-8-bits.
9831 Value *ZextLoad = Root;
9832 const APInt *ShAmtC;
9833 bool FoundOr = false;
9834 while (!isa<ConstantExpr>(ZextLoad) &&
9835 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
9836 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
9837 ShAmtC->urem(8) == 0))) {
9838 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9839 ZextLoad = BinOp->getOperand(0);
9840 if (BinOp->getOpcode() == Instruction::Or)
9841 FoundOr = true;
9842 }
9843 // Check if the input is an extended load of the required or/shift expression.
9844 Value *Load;
9845 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9846 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
9847 return false;
9848
9849 // Require that the total load bit width is a legal integer type.
9850 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
9851 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
9852 Type *SrcTy = Load->getType();
9853 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
9854 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
9855 return false;
9856
9857 // Everything matched - assume that we can fold the whole sequence using
9858 // load combining.
9859 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
9860 << *(cast<Instruction>(Root)) << "\n");
9861
9862 return true;
9863}
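// Illustrative example (assumed IR, not from the original file): the walk
// above matches chains rooted at a zext'ed load such as
//   %z = zext i8 %byte to i32
//   %s = shl i32 %z, 8        ; shift amount is a multiple of 8
//   %o = or i32 %s, %acc
// which the backend can typically fold into one wider load, so the SLP
// vectorizer prefers to leave such trees to load combining.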
9864
9865bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
9866 if (RdxKind != RecurKind::Or)
9867 return false;
9868
9869 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9870 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9871 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
9872 /* MatchOr */ false);
9873}
9874
9875bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
9876 // Peek through a final sequence of stores and check if all operations are
9877 // likely to be load-combined.
9878 unsigned NumElts = Stores.size();
9879 for (Value *Scalar : Stores) {
9880 Value *X;
9881 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
9882 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
9883 return false;
9884 }
9885 return true;
9886}
9887
9888bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
9889 // No need to vectorize inserts of gathered values.
9890 if (VectorizableTree.size() == 2 &&
9891 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9892 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9893 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9894 !(isSplat(VectorizableTree[1]->Scalars) ||
9895 allConstant(VectorizableTree[1]->Scalars))))
9896 return true;
9897
9898 // If the graph includes only PHI nodes and gathers, it is definitely not
9899 // profitable for the vectorization; we can skip it if the cost threshold is
9900 // the default. The cost of vectorized PHI nodes is almost always 0 + the
9901 // cost of gathers/buildvectors.
9902 constexpr int Limit = 4;
9903 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
9904 !VectorizableTree.empty() &&
9905 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9906 return (TE->State == TreeEntry::NeedToGather &&
9907 TE->getOpcode() != Instruction::ExtractElement &&
9908 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9909 TE->getOpcode() == Instruction::PHI;
9910 }))
9911 return true;
9912
9913 // We can vectorize the tree if its size is greater than or equal to the
9914 // minimum size specified by the MinTreeSize command line option.
9915 if (VectorizableTree.size() >= MinTreeSize)
9916 return false;
9917
9918 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
9919 // can vectorize it if we can prove it fully vectorizable.
9920 if (isFullyVectorizableTinyTree(ForReduction))
9921 return false;
9922
9923 // Check if any of the gather node forms an insertelement buildvector
9924 // somewhere.
9925 bool IsAllowedSingleBVNode =
9926 VectorizableTree.size() > 1 ||
9927 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9928 !VectorizableTree.front()->isAltShuffle() &&
9929 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9930 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9931 allSameBlock(VectorizableTree.front()->Scalars));
9932 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9933 return TE->State == TreeEntry::NeedToGather &&
9934 all_of(TE->Scalars, [&](Value *V) {
9935 return isa<ExtractElementInst, UndefValue>(V) ||
9936 (IsAllowedSingleBVNode &&
9937 !V->hasNUsesOrMore(UsesLimit) &&
9938 any_of(V->users(), IsaPred<InsertElementInst>));
9939 });
9940 }))
9941 return false;
9942
9943 assert(VectorizableTree.empty()
9944 ? ExternalUses.empty()
9945 : true && "We shouldn't have any external users");
9946
9947 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
9948 // vectorizable.
9949 return true;
9950}
9951
9952InstructionCost BoUpSLP::getSpillCost() const {
9953 // Walk from the bottom of the tree to the top, tracking which values are
9954 // live. When we see a call instruction that is not part of our tree,
9955 // query TTI to see if there is a cost to keeping values live over it
9956 // (for example, if spills and fills are required).
9957 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9958 InstructionCost Cost = 0;
9959
9960 SmallPtrSet<Instruction *, 4> LiveValues;
9961 Instruction *PrevInst = nullptr;
9962
9963 // The entries in VectorizableTree are not necessarily ordered by their
9964 // position in basic blocks. Collect them and order them by dominance so later
9965 // instructions are guaranteed to be visited first. For instructions in
9966 // different basic blocks, we only scan to the beginning of the block, so
9967 // their order does not matter, as long as all instructions in a basic block
9968 // are grouped together. Using dominance ensures a deterministic order.
9969 SmallVector<Instruction *, 16> OrderedScalars;
9970 for (const auto &TEPtr : VectorizableTree) {
9971 if (TEPtr->State != TreeEntry::Vectorize)
9972 continue;
9973 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9974 if (!Inst)
9975 continue;
9976 OrderedScalars.push_back(Inst);
9977 }
9978 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
9979 auto *NodeA = DT->getNode(A->getParent());
9980 auto *NodeB = DT->getNode(B->getParent());
9981 assert(NodeA && "Should only process reachable instructions");
9982 assert(NodeB && "Should only process reachable instructions");
9983 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9984 "Different nodes should have different DFS numbers");
9985 if (NodeA != NodeB)
9986 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9987 return B->comesBefore(A);
9988 });
9989
9990 for (Instruction *Inst : OrderedScalars) {
9991 if (!PrevInst) {
9992 PrevInst = Inst;
9993 continue;
9994 }
9995
9996 // Update LiveValues.
9997 LiveValues.erase(PrevInst);
9998 for (auto &J : PrevInst->operands()) {
9999 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
10000 LiveValues.insert(cast<Instruction>(&*J));
10001 }
10002
10003 LLVM_DEBUG({
10004 dbgs() << "SLP: #LV: " << LiveValues.size();
10005 for (auto *X : LiveValues)
10006 dbgs() << " " << X->getName();
10007 dbgs() << ", Looking at ";
10008 Inst->dump();
10009 });
10010
10011 // Now find the sequence of instructions between PrevInst and Inst.
10012 unsigned NumCalls = 0;
10013 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
10014 PrevInstIt =
10015 PrevInst->getIterator().getReverse();
10016 while (InstIt != PrevInstIt) {
10017 if (PrevInstIt == PrevInst->getParent()->rend()) {
10018 PrevInstIt = Inst->getParent()->rbegin();
10019 continue;
10020 }
10021
10022 auto NoCallIntrinsic = [this](Instruction *I) {
10023 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
10024 if (II->isAssumeLikeIntrinsic())
10025 return true;
10026 FastMathFlags FMF;
10027 SmallVector<Type *, 4> Tys;
10028 for (auto &ArgOp : II->args())
10029 Tys.push_back(ArgOp->getType());
10030 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
10031 FMF = FPMO->getFastMathFlags();
10032 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
10033 FMF);
10034 InstructionCost IntrCost =
10035 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
10036 InstructionCost CallCost = TTI->getCallInstrCost(
10037 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
10038 if (IntrCost < CallCost)
10039 return true;
10040 }
10041 return false;
10042 };
10043
10044 // Debug information does not impact spill cost.
10045 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10046 &*PrevInstIt != PrevInst)
10047 NumCalls++;
10048
10049 ++PrevInstIt;
10050 }
10051
10052 if (NumCalls) {
10053 SmallVector<Type *, 4> V;
10054 for (auto *II : LiveValues) {
10055 auto *ScalarTy = II->getType();
10056 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
10057 ScalarTy = VectorTy->getElementType();
10058 V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
10059 }
10060 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
10061 }
10062
10063 PrevInst = Inst;
10064 }
10065
10066 return Cost;
10067}
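// Illustrative note (not from the original file): the returned value is
// roughly NumCalls * getCostOfKeepingLiveOverCall(LiveVectorTypes) summed
// over adjacent vectorized scalars, i.e. a proxy for the spill/fill traffic
// of keeping would-be vector values live across calls.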
10068
10069/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
10070/// the buildvector sequence.
10071static bool isFirstInsertElement(const InsertElementInst *IE1,
10072 const InsertElementInst *IE2) {
10073 if (IE1 == IE2)
10074 return false;
10075 const auto *I1 = IE1;
10076 const auto *I2 = IE2;
10077 const InsertElementInst *PrevI1;
10078 const InsertElementInst *PrevI2;
10079 unsigned Idx1 = *getInsertIndex(IE1);
10080 unsigned Idx2 = *getInsertIndex(IE2);
10081 do {
10082 if (I2 == IE1)
10083 return true;
10084 if (I1 == IE2)
10085 return false;
10086 PrevI1 = I1;
10087 PrevI2 = I2;
10088 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10089 getInsertIndex(I1).value_or(Idx2) != Idx2)
10090 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
10091 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
10092 getInsertIndex(I2).value_or(Idx1) != Idx1)
10093 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
10094 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10095 llvm_unreachable("Two different buildvectors not expected.");
10096}
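// Illustrative example (assumed IR, not from the original file): for the
// buildvector chain
//   %v0 = insertelement <4 x i32> poison, i32 %a, i32 0
//   %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
// isFirstInsertElement(%v0, %v1) walks the operand-0 chains and returns true,
// because %v0 appears earlier in the sequence than %v1.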
10097
10098namespace {
10099/// Returns the incoming Value * if the requested type is Value * too, or a
10100/// default-constructed value otherwise.
10101struct ValueSelect {
10102 template <typename U>
10103 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
10104 return V;
10105 }
10106 template <typename U>
10107 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
10108 return U();
10109 }
10110};
10111} // namespace
10112
10113/// Does the analysis of the provided shuffle masks and performs the requested
10114/// actions on the vectors with the given shuffle masks. It tries to do it in
10115/// several steps.
10116/// 1. If the Base vector is not an undef vector, resize the very first mask to
10117/// have a common VF and perform the action for 2 input vectors (including the
10118/// non-undef Base). Other shuffle masks are combined with the result of the
10119/// first stage and processed as a shuffle of 2 vectors.
10120/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
10121/// the action only for 1 vector with the given mask, if it is not the identity
10122/// mask.
10123/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
10124/// vectors, combining the masks properly between the steps.
10125template <typename T>
10126static T *performExtractsShuffleAction(
10127 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
10128 function_ref<unsigned(T *)> GetVF,
10129 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
10130 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
10131 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
10132 SmallVector<int> Mask(ShuffleMask.begin()->second);
10133 auto VMIt = std::next(ShuffleMask.begin());
10134 T *Prev = nullptr;
10135 SmallBitVector UseMask =
10136 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
10137 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
10138 if (!IsBaseUndef.all()) {
10139 // Base is not undef, need to combine it with the next subvectors.
10140 std::pair<T *, bool> Res =
10141 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
10142 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
10143 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
10144 if (Mask[Idx] == PoisonMaskElem)
10145 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
10146 else
10147 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
10148 }
10149 auto *V = ValueSelect::get<T *>(Base);
10150 (void)V;
10151 assert((!V || GetVF(V) == Mask.size()) &&
10152 "Expected base vector of VF number of elements.");
10153 Prev = Action(Mask, {nullptr, Res.first});
10154 } else if (ShuffleMask.size() == 1) {
10155 // Base is undef and only 1 vector is shuffled - perform the action only for
10156 // single vector, if the mask is not the identity mask.
10157 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10158 /*ForSingleMask=*/true);
10159 if (Res.second)
10160 // Identity mask is found.
10161 Prev = Res.first;
10162 else
10163 Prev = Action(Mask, {ShuffleMask.begin()->first});
10164 } else {
10165 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
10166 // shuffles step by step, combining shuffle between the steps.
10167 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10168 unsigned Vec2VF = GetVF(VMIt->first);
10169 if (Vec1VF == Vec2VF) {
10170 // No need to resize the input vectors since they are of the same size, we
10171 // can shuffle them directly.
10172 ArrayRef<int> SecMask = VMIt->second;
10173 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10174 if (SecMask[I] != PoisonMaskElem) {
10175 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10176 Mask[I] = SecMask[I] + Vec1VF;
10177 }
10178 }
10179 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10180 } else {
10181 // Vectors of different sizes - resize and reshuffle.
10182 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10183 /*ForSingleMask=*/false);
10184 std::pair<T *, bool> Res2 =
10185 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10186 ArrayRef<int> SecMask = VMIt->second;
10187 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10188 if (Mask[I] != PoisonMaskElem) {
10189 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10190 if (Res1.second)
10191 Mask[I] = I;
10192 } else if (SecMask[I] != PoisonMaskElem) {
10193 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10194 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
10195 }
10196 }
10197 Prev = Action(Mask, {Res1.first, Res2.first});
10198 }
10199 VMIt = std::next(VMIt);
10200 }
10201 bool IsBaseNotUndef = !IsBaseUndef.all();
10202 (void)IsBaseNotUndef;
10203 // Perform requested actions for the remaining masks/vectors.
10204 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10205 // Shuffle other input vectors, if any.
10206 std::pair<T *, bool> Res =
10207 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10208 ArrayRef<int> SecMask = VMIt->second;
10209 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10210 if (SecMask[I] != PoisonMaskElem) {
10211 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
10212 "Multiple uses of scalars.");
10213 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
10214 } else if (Mask[I] != PoisonMaskElem) {
10215 Mask[I] = I;
10216 }
10217 }
10218 Prev = Action(Mask, {Prev, Res.first});
10219 }
10220 return Prev;
10221}
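// Illustrative example (assumed masks, not from the original file): with an
// undef Base and two 4-element inputs carrying masks {0,1,P,P} and {P,P,2,3}
// (P = poison), the masks are merged into {0,1,6,7} and a single two-source
// Action is emitted instead of two separate shuffles.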
10222
10223InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
10224 InstructionCost Cost = 0;
10225 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
10226 << VectorizableTree.size() << ".\n");
10227
10228 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10229
10230 SmallPtrSet<Value *, 4> CheckedExtracts;
10231 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10232 TreeEntry &TE = *VectorizableTree[I];
10233 if (TE.State == TreeEntry::NeedToGather) {
10234 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
10235 E && E->getVectorFactor() == TE.getVectorFactor() &&
10236 E->isSame(TE.Scalars)) {
10237 // Some gather nodes might be absolutely the same as some vectorizable
10238 // nodes after reordering, need to handle it.
10239 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
10240 << shortBundleName(TE.Scalars) << ".\n"
10241 << "SLP: Current total cost = " << Cost << "\n");
10242 continue;
10243 }
10244 }
10245
10246 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
10247 Cost += C;
10248 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
10249 << shortBundleName(TE.Scalars) << ".\n"
10250 << "SLP: Current total cost = " << Cost << "\n");
10251 }
10252
10253 SmallPtrSet<Value *, 16> ExtractCostCalculated;
10254 InstructionCost ExtractCost = 0;
10255 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
10256 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
10257 SmallVector<APInt> DemandedElts;
10258 SmallDenseSet<Value *, 4> UsedInserts;
10259 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
10260 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10261 for (ExternalUser &EU : ExternalUses) {
10262 // We only add extract cost once for the same scalar.
10263 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10264 !ExtractCostCalculated.insert(EU.Scalar).second)
10265 continue;
10266
10267 // Uses by ephemeral values are free (because the ephemeral value will be
10268 // removed prior to code generation, and so the extraction will be
10269 // removed as well).
10270 if (EphValues.count(EU.User))
10271 continue;
10272
10273 // No extract cost for vector "scalar"
10274 if (isa<FixedVectorType>(EU.Scalar->getType()))
10275 continue;
10276
10277 // If found user is an insertelement, do not calculate extract cost but try
10278 // to detect it as a final shuffled/identity match.
10279 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
10280 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
10281 if (!UsedInserts.insert(VU).second)
10282 continue;
10283 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
10284 if (InsertIdx) {
10285 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10286 auto *It = find_if(
10287 FirstUsers,
10288 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10289 return areTwoInsertFromSameBuildVector(
10290 VU, cast<InsertElementInst>(Pair.first),
10291 [this](InsertElementInst *II) -> Value * {
10292 Value *Op0 = II->getOperand(0);
10293 if (getTreeEntry(II) && !getTreeEntry(Op0))
10294 return nullptr;
10295 return Op0;
10296 });
10297 });
10298 int VecId = -1;
10299 if (It == FirstUsers.end()) {
10300 (void)ShuffleMasks.emplace_back();
10301 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10302 if (Mask.empty())
10303 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10304 // Find the insertvector, vectorized in tree, if any.
10305 Value *Base = VU;
10306 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10307 if (IEBase != EU.User &&
10308 (!IEBase->hasOneUse() ||
10309 getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10310 break;
10311 // Build the mask for the vectorized insertelement instructions.
10312 if (const TreeEntry *E = getTreeEntry(IEBase)) {
10313 VU = IEBase;
10314 do {
10315 IEBase = cast<InsertElementInst>(Base);
10316 int Idx = *getInsertIndex(IEBase);
10317 assert(Mask[Idx] == PoisonMaskElem &&
10318 "InsertElementInstruction used already.");
10319 Mask[Idx] = Idx;
10320 Base = IEBase->getOperand(0);
10321 } while (E == getTreeEntry(Base));
10322 break;
10323 }
10324 Base = cast<InsertElementInst>(Base)->getOperand(0);
10325 }
10326 FirstUsers.emplace_back(VU, ScalarTE);
10327 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10328 VecId = FirstUsers.size() - 1;
10329 auto It = MinBWs.find(ScalarTE);
10330 if (It != MinBWs.end() &&
10331 VectorCasts
10332 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10333 .second) {
10334 unsigned BWSz = It->second.first;
10335 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10336 unsigned VecOpcode;
10337 if (DstBWSz < BWSz)
10338 VecOpcode = Instruction::Trunc;
10339 else
10340 VecOpcode =
10341 It->second.second ? Instruction::SExt : Instruction::ZExt;
10342 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10343 InstructionCost C = TTI->getCastInstrCost(
10344 VecOpcode, FTy,
10345 FixedVectorType::get(
10346 IntegerType::get(FTy->getContext(), BWSz),
10347 FTy->getNumElements()),
10348 TTI::CastContextHint::None, CostKind);
10349 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10350 << " for extending externally used vector with "
10351 "non-equal minimum bitwidth.\n");
10352 Cost += C;
10353 }
10354 } else {
10355 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10356 It->first = VU;
10357 VecId = std::distance(FirstUsers.begin(), It);
10358 }
10359 int InIdx = *InsertIdx;
10360 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10361 if (Mask.empty())
10362 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10363 Mask[InIdx] = EU.Lane;
10364 DemandedElts[VecId].setBit(InIdx);
10365 continue;
10366 }
10367 }
10368 }
10369 // Leave the GEPs as is, they are free in most cases and better to keep them
10370 // as GEPs.
10371 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10372 if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10373 if (!ValueToExtUses) {
10374 ValueToExtUses.emplace();
10375 for_each(enumerate(ExternalUses), [&](const auto &P) {
10376 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10377 });
10378 }
10379 // Can use original GEP, if no operands vectorized or they are marked as
10380 // externally used already.
10381 bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10382 if (!getTreeEntry(V))
10383 return true;
10384 auto It = ValueToExtUses->find(V);
10385 if (It != ValueToExtUses->end()) {
10386 // Replace all uses to avoid compiler crash.
10387 ExternalUses[It->second].User = nullptr;
10388 return true;
10389 }
10390 return false;
10391 });
10392 if (CanBeUsedAsGEP) {
10393 ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10394 ExternalUsesAsGEPs.insert(EU.Scalar);
10395 continue;
10396 }
10397 }
10398
10399 // If we plan to rewrite the tree in a smaller type, we will need to sign
10400 // extend the extracted value back to the original type. Here, we account
10401 // for the extract and the added cost of the sign extend if needed.
10402 auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
10403 auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10404 if (It != MinBWs.end()) {
10405 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10406 unsigned Extend =
10407 It->second.second ? Instruction::SExt : Instruction::ZExt;
10408 VecTy = FixedVectorType::get(MinTy, BundleWidth);
10409 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10410 VecTy, EU.Lane);
10411 } else {
10412 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10413 CostKind, EU.Lane);
10414 }
10415 }
10416 // Add reduced value cost, if resized.
10417 if (!VectorizedVals.empty()) {
10418 const TreeEntry &Root = *VectorizableTree.front().get();
10419 auto BWIt = MinBWs.find(&Root);
10420 if (BWIt != MinBWs.end()) {
10421 Type *DstTy = Root.Scalars.front()->getType();
10422 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10423 unsigned SrcSz =
10424 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10425 if (OriginalSz != SrcSz) {
10426 unsigned Opcode = Instruction::Trunc;
10427 if (OriginalSz > SrcSz)
10428 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10429 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10430 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10431 TTI::CastContextHint::None,
10432 TTI::TCK_RecipThroughput);
10433 }
10434 }
10435 }
10436
10437 InstructionCost SpillCost = getSpillCost();
10438 Cost += SpillCost + ExtractCost;
10439 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10440 bool) {
10441 InstructionCost C = 0;
10442 unsigned VF = Mask.size();
10443 unsigned VecVF = TE->getVectorFactor();
10444 if (VF != VecVF &&
10445 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10446 !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
10447 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10448 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10449 OrigMask.begin());
10450 C = TTI->getShuffleCost(
10451 TTI::SK_PermuteSingleSrc,
10452 FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
10453 LLVM_DEBUG(
10454 dbgs() << "SLP: Adding cost " << C
10455 << " for final shuffle of insertelement external users.\n";
10456 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10457 Cost += C;
10458 return std::make_pair(TE, true);
10459 }
10460 return std::make_pair(TE, false);
10461 };
10462 // Calculate the cost of the reshuffled vectors, if any.
10463 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10464 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10465 auto Vector = ShuffleMasks[I].takeVector();
10466 unsigned VF = 0;
10467 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10468 ArrayRef<const TreeEntry *> TEs) {
10469 assert((TEs.size() == 1 || TEs.size() == 2) &&
10470 "Expected exactly 1 or 2 tree entries.");
10471 if (TEs.size() == 1) {
10472 if (VF == 0)
10473 VF = TEs.front()->getVectorFactor();
10474 auto *FTy =
10475 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10476 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
10477 !all_of(enumerate(Mask), [=](const auto &Data) {
10478 return Data.value() == PoisonMaskElem ||
10479 (Data.index() < VF &&
10480 static_cast<int>(Data.index()) == Data.value());
10481 })) {
10482 InstructionCost C =
10483 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
10484 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10485 << " for final shuffle of insertelement "
10486 "external users.\n";
10487 TEs.front()->dump();
10488 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10489 Cost += C;
10490 }
10491 } else {
10492 if (VF == 0) {
10493 if (TEs.front() &&
10494 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10495 VF = TEs.front()->getVectorFactor();
10496 else
10497 VF = Mask.size();
10498 }
10499 auto *FTy =
10500 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10501 InstructionCost C =
10502 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
10503 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10504 << " for final shuffle of vector node and external "
10505 "insertelement users.\n";
10506 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10507 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10508 Cost += C;
10509 }
10510 VF = Mask.size();
10511 return TEs.back();
10512 };
10513 (void)performExtractsShuffleAction<const TreeEntry>(
10514 MutableArrayRef(Vector.data(), Vector.size()), Base,
10515 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10516 EstimateShufflesCost);
10517 InstructionCost InsertCost = TTI->getScalarizationOverhead(
10518 cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10519 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
10520 Cost -= InsertCost;
10521 }
10522
10523 // Add the cost for reduced value resize (if required).
10524 if (ReductionBitWidth != 0) {
10525 assert(UserIgnoreList && "Expected reduction tree.");
10526 const TreeEntry &E = *VectorizableTree.front().get();
10527 auto It = MinBWs.find(&E);
10528 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10529 unsigned SrcSize = It->second.first;
10530 unsigned DstSize = ReductionBitWidth;
10531 unsigned Opcode = Instruction::Trunc;
10532 if (SrcSize < DstSize)
10533 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10534 auto *SrcVecTy =
10535 FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor());
10536 auto *DstVecTy =
10537 FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor());
10538 TTI::CastContextHint CCH = getCastContextHint(E);
10539 InstructionCost CastCost;
10540 switch (E.getOpcode()) {
10541 case Instruction::SExt:
10542 case Instruction::ZExt:
10543 case Instruction::Trunc: {
10544 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10545 CCH = getCastContextHint(*OpTE);
10546 break;
10547 }
10548 default:
10549 break;
10550 }
10551 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
10552 TTI::TCK_RecipThroughput);
10553 Cost += CastCost;
10554 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10555 << " for final resize for reduction from " << SrcVecTy
10556 << " to " << DstVecTy << "\n";
10557 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10558 }
10559 }
10560
10561#ifndef NDEBUG
10562 SmallString<256> Str;
10563 {
10564 raw_svector_ostream OS(Str);
10565 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10566 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10567 << "SLP: Total Cost = " << Cost << ".\n";
10568 }
10569 LLVM_DEBUG(dbgs() << Str);
10570 if (ViewSLPTree)
10571 ViewGraph(this, "SLP" + F->getName(), false, Str);
10572#endif
10573
10574 return Cost;
10575}
10576
10577/// Tries to find extractelement instructions with constant indices from fixed
10578/// vector type and gathers such instructions into a bunch, which can most
10579/// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt was
10580/// successful, the matched scalars are replaced by poison values in \p VL for
10581/// future analysis.
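/// For illustration only: a minimal standalone sketch (plain C++, no LLVM
/// APIs) of the bookkeeping this routine performs, assuming a hypothetical
/// list of 4 scalars where lanes 0 and 1 are extracts from the same source
/// vector at constant indices 1 and 0:
/// \code
///   #include <vector>
///   // -1 plays the role of a poison mask element here.
///   std::vector<int> Mask(4, -1);
///   std::vector<int> ExtractLanes = {0, 1};   // lanes that are extracts
///   std::vector<int> ExtractIndices = {1, 0}; // constant extract indices
///   for (size_t I = 0; I < ExtractLanes.size(); ++I)
///     Mask[ExtractLanes[I]] = ExtractIndices[I];
///   // Mask is now {1, 0, -1, -1}; the matched lanes in VL would be replaced
///   // by poison so the remaining scalars can be analyzed separately.
/// \endcode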
10582std::optional<TTI::ShuffleKind>
10583BoUpSLP::tryToGatherSingleRegisterExtractElements(
10584 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10585 // Scan list of gathered scalars for extractelements that can be represented
10586 // as shuffles.
10587 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10588 SmallVector<int> UndefVectorExtracts;
10589 for (int I = 0, E = VL.size(); I < E; ++I) {
10590 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10591 if (!EI) {
10592 if (isa<UndefValue>(VL[I]))
10593 UndefVectorExtracts.push_back(I);
10594 continue;
10595 }
10596 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10597 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10598 continue;
10599 std::optional<unsigned> Idx = getExtractIndex(EI);
10600 // Undefined index.
10601 if (!Idx) {
10602 UndefVectorExtracts.push_back(I);
10603 continue;
10604 }
10605 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10606 ExtractMask.reset(*Idx);
10607 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
10608 UndefVectorExtracts.push_back(I);
10609 continue;
10610 }
10611 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10612 }
10613 // Sort the vector operands by the maximum number of uses in extractelements.
10614 DenseMap<unsigned, SmallVector<Value *>> VFToVector;
10615 for (const auto &Data : VectorOpToIdx)
10616 VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
10617 .push_back(Data.first);
10618 for (auto &Data : VFToVector) {
10619 stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
10620 return VectorOpToIdx.find(V1)->second.size() >
10621 VectorOpToIdx.find(V2)->second.size();
10622 });
10623 }
10624 // Find the best pair of the vectors with the same number of elements or a
10625 // single vector.
10626 const int UndefSz = UndefVectorExtracts.size();
10627 unsigned SingleMax = 0;
10628 Value *SingleVec = nullptr;
10629 unsigned PairMax = 0;
10630 std::pair<Value *, Value *> PairVec(nullptr, nullptr);
10631 for (auto &Data : VFToVector) {
10632 Value *V1 = Data.second.front();
10633 if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
10634 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10635 SingleVec = V1;
10636 }
10637 Value *V2 = nullptr;
10638 if (Data.second.size() > 1)
10639 V2 = *std::next(Data.second.begin());
10640 if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
10641 UndefSz) {
10642 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
10643 PairVec = std::make_pair(V1, V2);
10644 }
10645 }
10646 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10647 return std::nullopt;
10648 // Check if it is better to perform a shuffle of 2 vectors or just of a single
10649 // vector.
10650 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10651 SmallVector<Value *> GatheredExtracts(
10652 VL.size(), PoisonValue::get(VL.front()->getType()));
10653 if (SingleMax >= PairMax && SingleMax) {
10654 for (int Idx : VectorOpToIdx[SingleVec])
10655 std::swap(GatheredExtracts[Idx], VL[Idx]);
10656 } else {
10657 for (Value *V : {PairVec.first, PairVec.second})
10658 for (int Idx : VectorOpToIdx[V])
10659 std::swap(GatheredExtracts[Idx], VL[Idx]);
10660 }
10661 // Add extracts from undefs too.
10662 for (int Idx : UndefVectorExtracts)
10663 std::swap(GatheredExtracts[Idx], VL[Idx]);
10664 // Check that gather of extractelements can be represented as just a
10665 // shuffle of a single/two vectors the scalars are extracted from.
10666 std::optional<TTI::ShuffleKind> Res =
10667 isFixedVectorShuffle(GatheredExtracts, Mask);
10668 if (!Res) {
10669 // TODO: try to check other subsets if possible.
10670 // Restore the original VL if attempt was not successful.
10671 copy(SavedVL, VL.begin());
10672 return std::nullopt;
10673 }
10674 // Restore unused scalars from mask, if some of the extractelements were not
10675 // selected for shuffle.
10676 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10677 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
10678 isa<UndefValue>(GatheredExtracts[I])) {
10679 std::swap(VL[I], GatheredExtracts[I]);
10680 continue;
10681 }
10682 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10683 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10684 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10685 is_contained(UndefVectorExtracts, I))
10686 continue;
10687 }
10688 return Res;
10689}
10690
10691/// Tries to find extractelement instructions with constant indices from fixed
10692/// vector type and gathers such instructions into a bunch, which can most
10693/// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt was
10694/// successful, the matched scalars are replaced by poison values in \p VL for
10695/// future analysis.
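/// For illustration only (hypothetical numbers, plain C++ rather than LLVM
/// types): with VL.size() == 8 and NumParts == 2, the scalars are processed as
/// two slices of 4, and each per-slice mask is copied into the combined \p Mask
/// at the slice offset. A minimal sketch of that index arithmetic:
/// \code
///   #include <algorithm>
///   #include <iterator>
///   #include <vector>
///   unsigned NumParts = 2, SliceSize = 8 / NumParts;
///   std::vector<int> Mask(8, -1);            // -1 ~ poison mask element
///   std::vector<int> SubMask = {3, 2, 1, 0}; // mask computed for one slice
///   unsigned Part = 1;
///   std::copy(SubMask.begin(), SubMask.end(),
///             std::next(Mask.begin(), Part * SliceSize));
///   // Mask is now {-1, -1, -1, -1, 3, 2, 1, 0}.
/// \endcode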
10696SmallVector<std::optional<TTI::ShuffleKind>>
10697BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10698 SmallVectorImpl<int> &Mask,
10699 unsigned NumParts) const {
10700 assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
10701 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10702 Mask.assign(VL.size(), PoisonMaskElem);
10703 unsigned SliceSize = VL.size() / NumParts;
10704 for (unsigned Part = 0; Part < NumParts; ++Part) {
10705 // Scan list of gathered scalars for extractelements that can be represented
10706 // as shuffles.
10707 MutableArrayRef<Value *> SubVL =
10708 MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
10709 SmallVector<int> SubMask;
10710 std::optional<TTI::ShuffleKind> Res =
10711 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10712 ShufflesRes[Part] = Res;
10713 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
10714 }
10715 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
10716 return Res.has_value();
10717 }))
10718 ShufflesRes.clear();
10719 return ShufflesRes;
10720}
10721
10722std::optional<TargetTransformInfo::ShuffleKind>
10723BoUpSLP::isGatherShuffledSingleRegisterEntry(
10724 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10725 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10726 Entries.clear();
10727 // TODO: currently checking only for Scalars in the tree entry, need to count
10728 // reused elements too for better cost estimation.
10729 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10730 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10731 const BasicBlock *TEInsertBlock = nullptr;
10732 // Main node of PHI entries keeps the correct order of operands/incoming
10733 // blocks.
10734 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10735 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10736 TEInsertPt = TEInsertBlock->getTerminator();
10737 } else {
10738 TEInsertBlock = TEInsertPt->getParent();
10739 }
10740 if (!DT->isReachableFromEntry(TEInsertBlock))
10741 return std::nullopt;
10742 auto *NodeUI = DT->getNode(TEInsertBlock);
10743 assert(NodeUI && "Should only process reachable instructions");
10744 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10745 auto CheckOrdering = [&](const Instruction *InsertPt) {
10746 // Argument InsertPt is an instruction where vector code for some other
10747 // tree entry (one that shares one or more scalars with TE) is going to be
10748 // generated. This lambda returns true if insertion point of vector code
10749 // for the TE dominates that point (otherwise dependency is the other way
10750 // around). The other node is not limited to be of a gather kind. Gather
10751 // nodes are not scheduled and their vector code is inserted before their
10752 // first user. If user is PHI, that is supposed to be at the end of a
10753 // predecessor block. Otherwise it is the last instruction among scalars of
10754 // the user node. So, instead of checking dependency between instructions
10755 // themselves, we check dependency between their insertion points for vector
10756 // code (since each scalar instruction ends up as a lane of a vector
10757 // instruction).
10758 const BasicBlock *InsertBlock = InsertPt->getParent();
10759 auto *NodeEUI = DT->getNode(InsertBlock);
10760 if (!NodeEUI)
10761 return false;
10762 assert((NodeUI == NodeEUI) ==
10763 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10764 "Different nodes should have different DFS numbers");
10765 // Check the order of the gather nodes users.
10766 if (TEInsertPt->getParent() != InsertBlock &&
10767 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
10768 return false;
10769 if (TEInsertPt->getParent() == InsertBlock &&
10770 TEInsertPt->comesBefore(InsertPt))
10771 return false;
10772 return true;
10773 };
10774 // Find all tree entries used by the gathered values. If no common entries
10775 // found - not a shuffle.
10776 // Here we build a set of tree nodes for each gathered value and try to
10777 // find the intersection between these sets. If we have at least one common
10778 // tree node for each gathered value - we have just a permutation of the
10779 // single vector. If we have 2 different sets, we're in situation where we
10780 // have a permutation of 2 input vectors.
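 // For illustration only, a minimal plain-C++ sketch (no LLVM types) of the
 // intersection idea above, assuming hypothetical per-value sets of node ids:
 // \code
 //   #include <set>
 //   #include <vector>
 //   std::vector<std::set<int>> PerValueNodes = {{1, 2}, {2, 3}, {2}};
 //   std::set<int> Common = PerValueNodes.front();
 //   for (const std::set<int> &S : PerValueNodes) {
 //     std::set<int> Tmp;
 //     for (int Id : Common)
 //       if (S.count(Id))
 //         Tmp.insert(Id);
 //     Common.swap(Tmp);
 //   }
 //   // Common == {2}: every gathered value is available in node 2, so the
 //   // gather is just a permutation of a single vectorized node.
 // \endcode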
10781 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10782 DenseMap<Value *, int> UsedValuesEntry;
10783 for (Value *V : VL) {
10784 if (isConstant(V))
10785 continue;
10786 // Build a list of tree entries where V is used.
10787 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10788 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10789 if (TEPtr == TE)
10790 continue;
10791 assert(any_of(TEPtr->Scalars,
10792 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10793 "Must contain at least single gathered value.");
10794 assert(TEPtr->UserTreeIndices.size() == 1 &&
10795 "Expected only single user of a gather node.");
10796 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10797
10798 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10799 const Instruction *InsertPt =
10800 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
10801 : &getLastInstructionInBundle(UseEI.UserTE);
10802 if (TEInsertPt == InsertPt) {
10803 // If 2 gathers are operands of the same entry (regardless of whether
10804 // user is PHI or else), compare operands indices, use the earlier one
10805 // as the base.
10806 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10807 continue;
10808 // If the user instruction is used for some reason in different
10809 // vectorized nodes - make it depend on index.
10810 if (TEUseEI.UserTE != UseEI.UserTE &&
10811 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10812 continue;
10813 }
10814
10815 // Check if the user node of the TE comes after user node of TEPtr,
10816 // otherwise TEPtr depends on TE.
10817 if ((TEInsertBlock != InsertPt->getParent() ||
10818 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10819 !CheckOrdering(InsertPt))
10820 continue;
10821 VToTEs.insert(TEPtr);
10822 }
10823 if (const TreeEntry *VTE = getTreeEntry(V)) {
10824 if (ForOrder) {
10825 if (VTE->State != TreeEntry::Vectorize) {
10826 auto It = MultiNodeScalars.find(V);
10827 if (It == MultiNodeScalars.end())
10828 continue;
10829 VTE = *It->getSecond().begin();
10830 // Iterate through all vectorized nodes.
10831 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
10832 return MTE->State == TreeEntry::Vectorize;
10833 });
10834 if (MIt == It->getSecond().end())
10835 continue;
10836 VTE = *MIt;
10837 }
10838 }
10839 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10840 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10841 continue;
10842 VToTEs.insert(VTE);
10843 }
10844 if (VToTEs.empty())
10845 continue;
10846 if (UsedTEs.empty()) {
10847 // The first iteration, just insert the list of nodes to vector.
10848 UsedTEs.push_back(VToTEs);
10849 UsedValuesEntry.try_emplace(V, 0);
10850 } else {
10851 // Need to check if there are any previously used tree nodes which use V.
10852 // If there are no such nodes, consider that we have one more input
10853 // vector.
10854 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
10855 unsigned Idx = 0;
10856 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
10857 // Do we have a non-empty intersection of previously listed tree entries
10858 // and tree entries using current V?
10859 set_intersect(VToTEs, Set);
10860 if (!VToTEs.empty()) {
10861 // Yes, write the new subset and continue analysis for the next
10862 // scalar.
10863 Set.swap(VToTEs);
10864 break;
10865 }
10866 VToTEs = SavedVToTEs;
10867 ++Idx;
10868 }
10869 // No non-empty intersection found - need to add a second set of possible
10870 // source vectors.
10871 if (Idx == UsedTEs.size()) {
10872 // If the number of input vectors is greater than 2 - not a permutation,
10873 // fallback to the regular gather.
10874 // TODO: support multiple reshuffled nodes.
10875 if (UsedTEs.size() == 2)
10876 continue;
10877 UsedTEs.push_back(SavedVToTEs);
10878 Idx = UsedTEs.size() - 1;
10879 }
10880 UsedValuesEntry.try_emplace(V, Idx);
10881 }
10882 }
10883
10884 if (UsedTEs.empty()) {
10885 Entries.clear();
10886 return std::nullopt;
10887 }
10888
10889 unsigned VF = 0;
10890 if (UsedTEs.size() == 1) {
10891 // Keep the order to avoid non-determinism.
10892 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
10893 UsedTEs.front().end());
10894 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10895 return TE1->Idx < TE2->Idx;
10896 });
10897 // Try to find the perfect match in another gather node at first.
10898 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
10899 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
10900 });
10901 if (It != FirstEntries.end() &&
10902 ((*It)->getVectorFactor() == VL.size() ||
10903 ((*It)->getVectorFactor() == TE->Scalars.size() &&
10904 TE->ReuseShuffleIndices.size() == VL.size() &&
10905 (*It)->isSame(TE->Scalars)))) {
10906 Entries.push_back(*It);
10907 if ((*It)->getVectorFactor() == VL.size()) {
10908 std::iota(std::next(Mask.begin(), Part * VL.size()),
10909 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
10910 } else {
10911 SmallVector<int> CommonMask = TE->getCommonMask();
10912 copy(CommonMask, Mask.begin());
10913 }
10914 // Clear undef scalars.
10915 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10916 if (isa<PoisonValue>(VL[I]))
10917 Mask[I + Part * VL.size()] = PoisonMaskElem;
10918 return TargetTransformInfo::SK_PermuteSingleSrc;
10919 }
10920 // No perfect match, just shuffle, so choose the first tree node from the
10921 // tree.
10922 Entries.push_back(FirstEntries.front());
10923 } else {
10924 // Try to find nodes with the same vector factor.
10925 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
10926 // Keep the order of tree nodes to avoid non-determinism.
10927 DenseMap<unsigned, const TreeEntry *> VFToTE;
10928 for (const TreeEntry *TE : UsedTEs.front()) {
10929 unsigned VF = TE->getVectorFactor();
10930 auto It = VFToTE.find(VF);
10931 if (It != VFToTE.end()) {
10932 if (It->second->Idx > TE->Idx)
10933 It->getSecond() = TE;
10934 continue;
10935 }
10936 VFToTE.try_emplace(VF, TE);
10937 }
10938 // Same, keep the order to avoid non-determinism.
10939 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
10940 UsedTEs.back().end());
10941 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10942 return TE1->Idx < TE2->Idx;
10943 });
10944 for (const TreeEntry *TE : SecondEntries) {
10945 auto It = VFToTE.find(TE->getVectorFactor());
10946 if (It != VFToTE.end()) {
10947 VF = It->first;
10948 Entries.push_back(It->second);
10949 Entries.push_back(TE);
10950 break;
10951 }
10952 }
10953 // No 2 source vectors with the same vector factor - just choose 2 with max
10954 // index.
10955 if (Entries.empty()) {
10956 Entries.push_back(*llvm::max_element(
10957 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
10958 return TE1->Idx < TE2->Idx;
10959 }));
10960 Entries.push_back(SecondEntries.front());
10961 VF = std::max(Entries.front()->getVectorFactor(),
10962 Entries.back()->getVectorFactor());
10963 }
10964 }
10965
10966 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
10967 // Checks if the 2 PHIs are compatible in terms of high possibility to be
10968 // vectorized.
10969 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
10970 auto *PHI = cast<PHINode>(V);
10971 auto *PHI1 = cast<PHINode>(V1);
10972 // Check that all incoming values are compatible/from same parent (if they
10973 // are instructions).
10974 // The incoming values are compatible if they all are constants, or
10975 // instruction with the same/alternate opcodes from the same basic block.
10976 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
10977 Value *In = PHI->getIncomingValue(I);
10978 Value *In1 = PHI1->getIncomingValue(I);
10979 if (isConstant(In) && isConstant(In1))
10980 continue;
10981 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
10982 return false;
10983 if (cast<Instruction>(In)->getParent() !=
10984 cast<Instruction>(In1)->getParent())
10985 return false;
10986 }
10987 return true;
10988 };
10989 // Check if the value can be ignored during analysis for shuffled gathers.
10990 // We suppose it is better to ignore instructions which do not form splats,
10991 // are not vectorized/not extractelements (these instructions will be handled
10992 // by extractelements processing) or may form vector node in future.
10993 auto MightBeIgnored = [=](Value *V) {
10994 auto *I = dyn_cast<Instruction>(V);
10995 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
10996 !isVectorLikeInstWithConstOps(I) &&
10997 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
10998 };
10999 // Check that the neighbor instruction may form a full vector node with the
11000 // current instruction V. It is possible, if they have same/alternate opcode
11001 // and same parent basic block.
11002 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
11003 Value *V1 = VL[Idx];
11004 bool UsedInSameVTE = false;
11005 auto It = UsedValuesEntry.find(V1);
11006 if (It != UsedValuesEntry.end())
11007 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
11008 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11009 getSameOpcode({V, V1}, *TLI).getOpcode() &&
11010 cast<Instruction>(V)->getParent() ==
11011 cast<Instruction>(V1)->getParent() &&
11012 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
11013 };
11014 // Build a shuffle mask for better cost estimation and vector emission.
11015 SmallBitVector UsedIdxs(Entries.size());
11016 SmallVector<std::pair<unsigned, int>> EntryLanes;
11017 for (int I = 0, E = VL.size(); I < E; ++I) {
11018 Value *V = VL[I];
11019 auto It = UsedValuesEntry.find(V);
11020 if (It == UsedValuesEntry.end())
11021 continue;
11022 // Do not try to shuffle scalars, if they are constants, or instructions
11023 // that can be vectorized as a result of the following vector build
11024 // vectorization.
11025 if (isConstant(V) || (MightBeIgnored(V) &&
11026 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
11027 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
11028 continue;
11029 unsigned Idx = It->second;
11030 EntryLanes.emplace_back(Idx, I);
11031 UsedIdxs.set(Idx);
11032 }
11033 // Iterate through all shuffled scalars and select entries, which can be used
11034 // for final shuffle.
11035 SmallVector<const TreeEntry *> TempEntries;
11036 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
11037 if (!UsedIdxs.test(I))
11038 continue;
11039 // Fix the entry number for the given scalar. If it is the first entry, set
11040 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
11041 // These indices are used when calculating final shuffle mask as the vector
11042 // offset.
11043 for (std::pair<unsigned, int> &Pair : EntryLanes)
11044 if (Pair.first == I)
11045 Pair.first = TempEntries.size();
11046 TempEntries.push_back(Entries[I]);
11047 }
11048 Entries.swap(TempEntries);
11049 if (EntryLanes.size() == Entries.size() &&
11050 !VL.equals(ArrayRef(TE->Scalars)
11051 .slice(Part * VL.size(),
11052 std::min<int>(VL.size(), TE->Scalars.size())))) {
11053 // We may have here 1 or 2 entries only. If the number of scalars is equal
11054 // to the number of entries, no need to do the analysis, it is not very
11055 // profitable. Since VL is not the same as TE->Scalars, it means we already
11056 // have some shuffles before. Cut off not profitable case.
11057 Entries.clear();
11058 return std::nullopt;
11059 }
11060 // Build the final mask, check for the identity shuffle, if possible.
11061 bool IsIdentity = Entries.size() == 1;
11062 // Pair.first is the offset to the vector, while Pair.second is the index of
11063 // scalar in the list.
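 // For illustration only (hypothetical values): with two selected entries and
 // VF == 4, a scalar taken from entry 1 and found at lane 2 of that entry gets
 // mask value 1 * 4 + 2 == 6, i.e. it addresses the second source vector of
 // the final two-source shuffle. A minimal sketch of the arithmetic:
 // \code
 //   unsigned VF = 4;        // vector factor of each selected entry
 //   unsigned EntryIdx = 1;  // Pair.first: which of the (at most 2) entries
 //   int Lane = 2;           // lane of the scalar inside that entry
 //   int MaskValue = EntryIdx * VF + Lane; // == 6
 // \endcode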
11064 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
11065 unsigned Idx = Part * VL.size() + Pair.second;
11066 Mask[Idx] =
11067 Pair.first * VF +
11068 (ForOrder ? std::distance(
11069 Entries[Pair.first]->Scalars.begin(),
11070 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
11071 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
11072 IsIdentity &= Mask[Idx] == Pair.second;
11073 }
11074 switch (Entries.size()) {
11075 case 1:
11076 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11077 return TargetTransformInfo::SK_PermuteSingleSrc;
11078 break;
11079 case 2:
11080 if (EntryLanes.size() > 2 || VL.size() <= 2)
11081 return TargetTransformInfo::SK_PermuteTwoSrc;
11082 break;
11083 default:
11084 break;
11085 }
11086 Entries.clear();
11087 // Clear the corresponding mask elements.
11088 std::fill(std::next(Mask.begin(), Part * VL.size()),
11089 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
11090 return std::nullopt;
11091}
11092
11093SmallVector<std::optional<TTI::ShuffleKind>>
11094BoUpSLP::isGatherShuffledEntry(
11095 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
11096 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
11097 bool ForOrder) {
11098 assert(NumParts > 0 && NumParts < VL.size() &&
11099 "Expected positive number of registers.");
11100 Entries.clear();
11101 // No need to check for the topmost gather node.
11102 if (TE == VectorizableTree.front().get())
11103 return {};
11104 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11105 if (TE->isNonPowOf2Vec())
11106 return {};
11107 Mask.assign(VL.size(), PoisonMaskElem);
11108 assert(TE->UserTreeIndices.size() == 1 &&
11109 "Expected only single user of the gather node.");
11110 assert(VL.size() % NumParts == 0 &&
11111 "Number of scalars must be divisible by NumParts.");
11112 unsigned SliceSize = VL.size() / NumParts;
11113 SmallVector<std::optional<TTI::ShuffleKind>> Res;
11114 for (unsigned Part = 0; Part < NumParts; ++Part) {
11115 ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
11116 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
11117 std::optional<TTI::ShuffleKind> SubRes =
11118 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
11119 ForOrder);
11120 if (!SubRes)
11121 SubEntries.clear();
11122 Res.push_back(SubRes);
11123 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
11124 SubEntries.front()->getVectorFactor() == VL.size() &&
11125 (SubEntries.front()->isSame(TE->Scalars) ||
11126 SubEntries.front()->isSame(VL))) {
11127 SmallVector<const TreeEntry *> LocalSubEntries;
11128 LocalSubEntries.swap(SubEntries);
11129 Entries.clear();
11130 Res.clear();
11131 std::iota(Mask.begin(), Mask.end(), 0);
11132 // Clear undef scalars.
11133 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11134 if (isa<PoisonValue>(VL[I]))
11135 Mask[I] = PoisonMaskElem;
11136 Entries.emplace_back(1, LocalSubEntries.front());
11137 Res.push_back(TTI::SK_PermuteSingleSrc);
11138 return Res;
11139 }
11140 }
11141 if (all_of(Res,
11142 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
11143 Entries.clear();
11144 return {};
11145 }
11146 return Res;
11147}
11148
11149InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
11150 Type *ScalarTy) const {
11151 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
11152 bool DuplicateNonConst = false;
11153 // Find the cost of inserting/extracting values from the vector.
11154 // Check if the same elements are inserted several times and count them as
11155 // shuffle candidates.
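 // For illustration only (hypothetical scalars, plain C++): if VL is
 // {a, b, a, c}, lane 2 repeats lane 0, so it is recorded as a shuffle
 // candidate instead of paying for a second insertelement. A minimal sketch
 // of the bookkeeping:
 // \code
 //   #include <map>
 //   #include <string>
 //   #include <vector>
 //   std::vector<std::string> VL = {"a", "b", "a", "c"};
 //   std::map<std::string, unsigned> FirstLane;
 //   std::vector<int> ShuffleMask(VL.size(), -1);
 //   for (unsigned I = 0; I < VL.size(); ++I) {
 //     auto Res = FirstLane.emplace(VL[I], I);
 //     ShuffleMask[I] = Res.first->second; // ends up as {0, 1, 0, 3}
 //   }
 // \endcode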
11156 APInt ShuffledElements = APInt::getZero(VL.size());
11157 DenseMap<Value *, unsigned> UniqueElements;
11158 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11159 InstructionCost Cost;
11160 auto EstimateInsertCost = [&](unsigned I, Value *V) {
11161 if (V->getType() != ScalarTy) {
11162 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
11163 TTI::CastContextHint::None, CostKind);
11164 V = nullptr;
11165 }
11166 if (!ForPoisonSrc)
11167 Cost +=
11168 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
11169 I, Constant::getNullValue(VecTy), V);
11170 };
11171 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
11172 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
11173 Value *V = VL[I];
11174 // No need to shuffle duplicates for constants.
11175 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
11176 ShuffledElements.setBit(I);
11177 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
11178 continue;
11179 }
11180
11181 auto Res = UniqueElements.try_emplace(V, I);
11182 if (Res.second) {
11183 EstimateInsertCost(I, V);
11184 ShuffleMask[I] = I;
11185 continue;
11186 }
11187
11188 DuplicateNonConst = true;
11189 ShuffledElements.setBit(I);
11190 ShuffleMask[I] = Res.first->second;
11191 }
11192 if (ForPoisonSrc)
11193 Cost =
11194 TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
11195 /*Extract*/ false, CostKind);
11196 if (DuplicateNonConst)
11197 Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
11198 VecTy, ShuffleMask);
11199 return Cost;
11200}
11201
11202// Perform operand reordering on the instructions in VL and return the reordered
11203// operands in Left and Right.
11204void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
11205 SmallVectorImpl<Value *> &Left,
11206 SmallVectorImpl<Value *> &Right,
11207 const BoUpSLP &R) {
11208 if (VL.empty())
11209 return;
11210 VLOperands Ops(VL, R);
11211 // Reorder the operands in place.
11212 Ops.reorder();
11213 Left = Ops.getVL(0);
11214 Right = Ops.getVL(1);
11215}
11216
11217Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
11218 auto &Res = EntryToLastInstruction.FindAndConstruct(E);
11219 if (Res.second)
11220 return *Res.second;
11221 // Get the basic block this bundle is in. All instructions in the bundle
11222 // should be in this block (except for extractelement-like instructions with
11223 // constant indices).
11224 auto *Front = E->getMainOp();
11225 auto *BB = Front->getParent();
11226 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
11227 if (E->getOpcode() == Instruction::GetElementPtr &&
11228 !isa<GetElementPtrInst>(V))
11229 return true;
11230 auto *I = cast<Instruction>(V);
11231 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11232 isVectorLikeInstWithConstOps(I);
11233 }));
11234
11235 auto FindLastInst = [&]() {
11236 Instruction *LastInst = Front;
11237 for (Value *V : E->Scalars) {
11238 auto *I = dyn_cast<Instruction>(V);
11239 if (!I)
11240 continue;
11241 if (LastInst->getParent() == I->getParent()) {
11242 if (LastInst->comesBefore(I))
11243 LastInst = I;
11244 continue;
11245 }
11246 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11247 !isa<GetElementPtrInst>(I)) ||
11248 (isVectorLikeInstWithConstOps(LastInst) &&
11250 "Expected vector-like or non-GEP in GEP node insts only.");
11251 if (!DT->isReachableFromEntry(LastInst->getParent())) {
11252 LastInst = I;
11253 continue;
11254 }
11255 if (!DT->isReachableFromEntry(I->getParent()))
11256 continue;
11257 auto *NodeA = DT->getNode(LastInst->getParent());
11258 auto *NodeB = DT->getNode(I->getParent());
11259 assert(NodeA && "Should only process reachable instructions");
11260 assert(NodeB && "Should only process reachable instructions");
11261 assert((NodeA == NodeB) ==
11262 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11263 "Different nodes should have different DFS numbers");
11264 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11265 LastInst = I;
11266 }
11267 BB = LastInst->getParent();
11268 return LastInst;
11269 };
11270
11271 auto FindFirstInst = [&]() {
11272 Instruction *FirstInst = Front;
11273 for (Value *V : E->Scalars) {
11274 auto *I = dyn_cast<Instruction>(V);
11275 if (!I)
11276 continue;
11277 if (FirstInst->getParent() == I->getParent()) {
11278 if (I->comesBefore(FirstInst))
11279 FirstInst = I;
11280 continue;
11281 }
11282 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11283 !isa<GetElementPtrInst>(I)) ||
11284 (isVectorLikeInstWithConstOps(FirstInst) &&
11286 "Expected vector-like or non-GEP in GEP node insts only.");
11287 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
11288 FirstInst = I;
11289 continue;
11290 }
11291 if (!DT->isReachableFromEntry(I->getParent()))
11292 continue;
11293 auto *NodeA = DT->getNode(FirstInst->getParent());
11294 auto *NodeB = DT->getNode(I->getParent());
11295 assert(NodeA && "Should only process reachable instructions");
11296 assert(NodeB && "Should only process reachable instructions");
11297 assert((NodeA == NodeB) ==
11298 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11299 "Different nodes should have different DFS numbers");
11300 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11301 FirstInst = I;
11302 }
11303 return FirstInst;
11304 };
11305
11306 // Set the insert point to the beginning of the basic block if the entry
11307 // should not be scheduled.
11308 if (doesNotNeedToSchedule(E->Scalars) ||
11309 (E->State != TreeEntry::NeedToGather &&
11310 all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
11311 if ((E->getOpcode() == Instruction::GetElementPtr &&
11312 any_of(E->Scalars,
11313 [](Value *V) {
11314 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11315 })) ||
11316 all_of(E->Scalars,
11317 [](Value *V) {
11318 return !isVectorLikeInstWithConstOps(V) &&
11319 isUsedOutsideBlock(V);
11320 }) ||
11321 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
11322 all_of(E->Scalars, [](Value *V) {
11323 return isa<ExtractElementInst, UndefValue>(V) ||
11324 areAllOperandsNonInsts(V);
11325 })))
11326 Res.second = FindLastInst();
11327 else
11328 Res.second = FindFirstInst();
11329 return *Res.second;
11330 }
11331
11332 // Find the last instruction. The common case should be that BB has been
11333 // scheduled, and the last instruction is VL.back(). So we start with
11334 // VL.back() and iterate over schedule data until we reach the end of the
11335 // bundle. The end of the bundle is marked by null ScheduleData.
11336 if (BlocksSchedules.count(BB)) {
11337 Value *V = E->isOneOf(E->Scalars.back());
11338 if (doesNotNeedToBeScheduled(V))
11339 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11340 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11341 if (Bundle && Bundle->isPartOfBundle())
11342 for (; Bundle; Bundle = Bundle->NextInBundle)
11343 if (Bundle->OpValue == Bundle->Inst)
11344 Res.second = Bundle->Inst;
11345 }
11346
11347 // LastInst can still be null at this point if there's either not an entry
11348 // for BB in BlocksSchedules or there's no ScheduleData available for
11349 // VL.back(). This can be the case if buildTree_rec aborts for various
11350 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11351 // size is reached, etc.). ScheduleData is initialized in the scheduling
11352 // "dry-run".
11353 //
11354 // If this happens, we can still find the last instruction by brute force. We
11355 // iterate forwards from Front (inclusive) until we either see all
11356 // instructions in the bundle or reach the end of the block. If Front is the
11357 // last instruction in program order, LastInst will be set to Front, and we
11358 // will visit all the remaining instructions in the block.
11359 //
11360 // One of the reasons we exit early from buildTree_rec is to place an upper
11361 // bound on compile-time. Thus, taking an additional compile-time hit here is
11362 // not ideal. However, this should be exceedingly rare since it requires that
11363 // we both exit early from buildTree_rec and that the bundle be out-of-order
11364 // (causing us to iterate all the way to the end of the block).
11365 if (!Res.second)
11366 Res.second = FindLastInst();
11367 assert(Res.second && "Failed to find last instruction in bundle");
11368 return *Res.second;
11369}
11370
11371void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11372 auto *Front = E->getMainOp();
11373 Instruction *LastInst = &getLastInstructionInBundle(E);
11374 assert(LastInst && "Failed to find last instruction in bundle");
11375 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11376 // If the instruction is PHI, set the insert point after all the PHIs.
11377 bool IsPHI = isa<PHINode>(LastInst);
11378 if (IsPHI)
11379 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11380 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
11381 doesNotNeedToSchedule(E->Scalars))) {
11382 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11383 } else {
11384 // Set the insertion point after the last instruction in the bundle. Set the
11385 // debug location to Front.
11386 Builder.SetInsertPoint(
11387 LastInst->getParent(),
11388 std::next(LastInst->getIterator()));
11389 }
11390 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11391}
11392
11393Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
11394 // List of instructions/lanes from current block and/or the blocks which are
11395 // part of the current loop. These instructions will be inserted at the end to
11396 // make it possible to optimize loops and hoist invariant instructions out of
11397 // the loop's body with better chances for success.
11398 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
11399 SmallSet<int, 4> PostponedIndices;
11400 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11401 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11402 SmallPtrSet<BasicBlock *, 4> Visited;
11403 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11404 InsertBB = InsertBB->getSinglePredecessor();
11405 return InsertBB && InsertBB == InstBB;
11406 };
11407 for (int I = 0, E = VL.size(); I < E; ++I) {
11408 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11409 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11410 getTreeEntry(Inst) ||
11411 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11412 PostponedIndices.insert(I).second)
11413 PostponedInsts.emplace_back(Inst, I);
11414 }
11415
11416 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11417 Type *Ty) {
11418 Value *Scalar = V;
11419 if (Scalar->getType() != Ty) {
11420 assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11421 "Expected integer types only.");
11422 Scalar = Builder.CreateIntCast(
11423 Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
11424 }
11425
11426 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11427 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11428 if (!InsElt)
11429 return Vec;
11430 GatherShuffleExtractSeq.insert(InsElt);
11431 CSEBlocks.insert(InsElt->getParent());
11432 // Add to our 'need-to-extract' list.
11433 if (isa<Instruction>(V)) {
11434 if (TreeEntry *Entry = getTreeEntry(V)) {
11435 // Find which lane we need to extract.
11436 User *UserOp = nullptr;
11437 if (Scalar != V) {
11438 if (auto *SI = dyn_cast<Instruction>(Scalar))
11439 UserOp = SI;
11440 } else {
11441 UserOp = InsElt;
11442 }
11443 if (UserOp) {
11444 unsigned FoundLane = Entry->findLaneForValue(V);
11445 ExternalUses.emplace_back(V, UserOp, FoundLane);
11446 }
11447 }
11448 }
11449 return Vec;
11450 };
11451 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
11452 Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11453 SmallVector<int> NonConsts;
11454 // Insert constant values at first.
11455 for (int I = 0, E = VL.size(); I < E; ++I) {
11456 if (PostponedIndices.contains(I))
11457 continue;
11458 if (!isConstant(VL[I])) {
11459 NonConsts.push_back(I);
11460 continue;
11461 }
11462 if (Root) {
11463 if (!isa<UndefValue>(VL[I])) {
11464 NonConsts.push_back(I);
11465 continue;
11466 }
11467 if (isa<PoisonValue>(VL[I]))
11468 continue;
11469 if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11470 if (SV->getMaskValue(I) == PoisonMaskElem)
11471 continue;
11472 }
11473 }
11474 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11475 }
11476 // Insert non-constant values.
11477 for (int I : NonConsts)
11478 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11479 // Append instructions, which are/may be part of the loop, in the end to make
11480 // it possible to hoist non-loop-based instructions.
11481 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11482 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11483
11484 return Vec;
11485}
11486
11487/// Merges shuffle masks and emits final shuffle instruction, if required. It
11488/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
11489/// when the actual shuffle instruction is generated only if this is actually
11490/// required. Otherwise, the shuffle instruction emission is delayed till the
11491/// end of the process, to reduce the number of emitted instructions and further
11492/// analysis/transformations.
11493/// The class also will look through the previously emitted shuffle instructions
11494/// and properly mark indices in mask as undef.
11495/// For example, given the code
11496/// \code
11497/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11498/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11499/// \endcode
11500/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
11501/// look through %s1 and %s2 and emit
11502/// \code
11503/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11504/// \endcode
11505/// instead.
11506/// If 2 operands are of different size, the smallest one will be resized and
11507/// the mask recalculated properly.
11508/// For example, given the code
11509/// \code
11510/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11511/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11512/// \endcode
11513/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
11514/// look through %s1 and %s2 and emit
11515/// \code
11516/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11517/// \endcode
11518/// instead.
11519class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11520 bool IsFinalized = false;
11521 /// Combined mask for all applied operands and masks. It is built during
11522 /// analysis and actual emission of shuffle vector instructions.
11523 SmallVector<int> CommonMask;
11524 /// List of operands for the shuffle vector instruction. It holds at most 2
11525 /// operands; if a 3rd one is going to be added, the first 2 are combined into
11526 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
11527 /// resulting shuffle and the second operand is set to be the newly added
11528 /// operand. The \p CommonMask is transformed in the proper way after that.
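 /// For illustration only (hypothetical masks, plain C++): when a third
 /// operand arrives, the two vectors already recorded here are combined first
 /// and the new operand's lanes are then addressed past the combined width:
 /// \code
 ///   #include <vector>
 ///   std::vector<int> CommonMask = {0, 1, -1, -1}; // -1 ~ poison element
 ///   std::vector<int> NewMask    = {-1, -1, 2, 3}; // lanes of the new operand
 ///   unsigned Sz = CommonMask.size();
 ///   for (unsigned Idx = 0; Idx < Sz; ++Idx)
 ///     if (NewMask[Idx] != -1)
 ///       CommonMask[Idx] = Idx + Sz; // refer to the second shuffle operand
 ///   // CommonMask == {0, 1, 6, 7}
 /// \endcode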
11529 SmallVector<Value *, 2> InVectors;
11530 Type *ScalarTy = nullptr;
11531 IRBuilderBase &Builder;
11532 BoUpSLP &R;
11533
11534 class ShuffleIRBuilder {
11535 IRBuilderBase &Builder;
11536 /// Holds all of the instructions that we gathered.
11537 SetVector<Instruction *> &GatherShuffleExtractSeq;
11538 /// A list of blocks that we are going to CSE.
11539 DenseSet<BasicBlock *> &CSEBlocks;
11540 /// Data layout.
11541 const DataLayout &DL;
11542
11543 public:
11544 ShuffleIRBuilder(IRBuilderBase &Builder,
11545 SetVector<Instruction *> &GatherShuffleExtractSeq,
11546 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11547 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11548 CSEBlocks(CSEBlocks), DL(DL) {}
11549 ~ShuffleIRBuilder() = default;
11550 /// Creates shufflevector for the 2 operands with the given mask.
11551 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11552 if (V1->getType() != V2->getType()) {
11553 assert(V2->getType()->isIntOrIntVectorTy() &&
11554 V1->getType()->isIntOrIntVectorTy() &&
11555 "Expected integer vector types only.");
11556 if (V1->getType() != V2->getType()) {
11557 if (cast<VectorType>(V2->getType())
11558 ->getElementType()
11559 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
11560 ->getElementType()
11561 ->getIntegerBitWidth())
11562 V2 = Builder.CreateIntCast(
11563 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
11564 else
11565 V1 = Builder.CreateIntCast(
11566 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
11567 }
11568 }
11569 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11570 if (auto *I = dyn_cast<Instruction>(Vec)) {
11571 GatherShuffleExtractSeq.insert(I);
11572 CSEBlocks.insert(I->getParent());
11573 }
11574 return Vec;
11575 }
11576 /// Creates permutation of the single vector operand with the given mask, if
11577 /// it is not identity mask.
11578 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11579 if (Mask.empty())
11580 return V1;
11581 unsigned VF = Mask.size();
11582 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
11583 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
11584 return V1;
11585 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
11586 if (auto *I = dyn_cast<Instruction>(Vec)) {
11587 GatherShuffleExtractSeq.insert(I);
11588 CSEBlocks.insert(I->getParent());
11589 }
11590 return Vec;
11591 }
11592 Value *createIdentity(Value *V) { return V; }
11593 Value *createPoison(Type *Ty, unsigned VF) {
11594 return PoisonValue::get(FixedVectorType::get(Ty, VF));
11595 }
11596 /// Resizes 2 input vectors to match their sizes, if they are not equal
11597 /// yet. The smallest vector is resized to the size of the larger vector.
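 /// For illustration only (hypothetical widths, plain C++): resizing a
 /// 2-element operand to match a 4-element one uses an identity prefix padded
 /// with poison. A sketch of the mask construction:
 /// \code
 ///   #include <iterator>
 ///   #include <numeric>
 ///   #include <vector>
 ///   int MinVF = 2, MaxVF = 4;                 // hypothetical operand widths
 ///   std::vector<int> IdentityMask(MaxVF, -1); // -1 ~ poison element
 ///   std::iota(IdentityMask.begin(),
 ///             std::next(IdentityMask.begin(), MinVF), 0);
 ///   // IdentityMask == {0, 1, -1, -1}; shuffling the narrower operand with
 ///   // this mask widens it to MaxVF elements.
 /// \endcode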
11598 void resizeToMatch(Value *&V1, Value *&V2) {
11599 if (V1->getType() == V2->getType())
11600 return;
11601 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
11602 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11603 int VF = std::max(V1VF, V2VF);
11604 int MinVF = std::min(V1VF, V2VF);
11605 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11606 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
11607 0);
11608 Value *&Op = MinVF == V1VF ? V1 : V2;
11609 Op = Builder.CreateShuffleVector(Op, IdentityMask);
11610 if (auto *I = dyn_cast<Instruction>(Op)) {
11611 GatherShuffleExtractSeq.insert(I);
11612 CSEBlocks.insert(I->getParent());
11613 }
11614 if (MinVF == V1VF)
11615 V1 = Op;
11616 else
11617 V2 = Op;
11618 }
11619 };
11620
11621 /// Smart shuffle instruction emission, walks through shuffles trees and
11622 /// tries to find the best matching vector for the actual shuffle
11623 /// instruction.
11624 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11625 assert(V1 && "Expected at least one vector value.");
11626 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11627 R.CSEBlocks, *R.DL);
11628 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11629 ShuffleBuilder);
11630 }
11631
11632 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
11633 /// shuffle emission.
11634 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11635 ArrayRef<int> Mask) {
11636 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11637 if (Mask[Idx] != PoisonMaskElem)
11638 CommonMask[Idx] = Idx;
11639 }
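 // For illustration only (hypothetical masks, plain C++): after a shuffle has
 // been emitted for CommonMask, every lane produced by that shuffle must now
 // refer to itself in the combined mask. A minimal sketch of the
 // transformation performed above:
 // \code
 //   #include <vector>
 //   std::vector<int> CommonMask = {0, 0, -1, -1}; // -1 ~ poison element
 //   std::vector<int> Mask       = {0, 0, -1, -1};
 //   for (unsigned Idx = 0; Idx < CommonMask.size(); ++Idx)
 //     if (Mask[Idx] != -1)
 //       CommonMask[Idx] = Idx;
 //   // CommonMask == {0, 1, -1, -1}
 // \endcode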
11640
11641 /// Cast value \p V to the vector type with the same number of elements, but
11642 /// the base type \p ScalarTy.
11643 Value *castToScalarTyElem(Value *V,
11644 std::optional<bool> IsSigned = std::nullopt) {
11645 auto *VecTy = cast<VectorType>(V->getType());
11646 if (VecTy->getElementType() == ScalarTy)
11647 return V;
11648 return Builder.CreateIntCast(
11649 V, VectorType::get(ScalarTy, VecTy->getElementCount()),
11650 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
11651 }
11652
11653public:
11654 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
11655 : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11656
11657 /// Adjusts extractelements after reusing them.
11658 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11659 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11660 unsigned NumParts, bool &UseVecBaseAsInput) {
11661 UseVecBaseAsInput = false;
11662 SmallPtrSet<Value *, 4> UniqueBases;
11663 Value *VecBase = nullptr;
11664 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11665 int Idx = Mask[I];
11666 if (Idx == PoisonMaskElem)
11667 continue;
11668 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
11669 VecBase = EI->getVectorOperand();
11670 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
11671 VecBase = TE->VectorizedValue;
11672 assert(VecBase && "Expected vectorized value.");
11673 UniqueBases.insert(VecBase);
11674 // If the only one use is vectorized - can delete the extractelement
11675 // itself.
11676 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
11677 any_of(EI->users(), [&](User *U) {
11678 const TreeEntry *UTE = R.getTreeEntry(U);
11679 return !UTE || R.MultiNodeScalars.contains(U) ||
11680 (isa<GetElementPtrInst>(U) &&
11681 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
11682 count_if(R.VectorizableTree,
11683 [&](const std::unique_ptr<TreeEntry> &TE) {
11684 return any_of(TE->UserTreeIndices,
11685 [&](const EdgeInfo &Edge) {
11686 return Edge.UserTE == UTE;
11687 }) &&
11688 is_contained(TE->Scalars, EI);
11689 }) != 1;
11690 }))
11691 continue;
11692 R.eraseInstruction(EI);
11693 }
11694 if (NumParts == 1 || UniqueBases.size() == 1) {
11695 VecBase = castToScalarTyElem(VecBase);
11696 return VecBase;
11697 }
11698 UseVecBaseAsInput = true;
11699 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11700 for (auto [I, Idx] : enumerate(Mask))
11701 if (Idx != PoisonMaskElem)
11702 Idx = I;
11703 };
11704 // Perform multi-register vector shuffle, joining them into a single virtual
11705 // long vector.
11706 // Need to shuffle each part independently and then insert all these parts
11707 // into a long virtual vector register, forming the original vector.
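 // For illustration only (hypothetical part masks, plain C++): when the second
 // part is appended, its mask indices are shifted past the width of the vector
 // accumulated so far, so the two parts can be joined by one two-source
 // shuffle. A minimal sketch of that adjustment:
 // \code
 //   #include <vector>
 //   unsigned VF = 4;                           // width accumulated so far
 //   std::vector<int> SubMask = {1, 0, -1, -1}; // -1 ~ poison element
 //   for (int &Idx : SubMask)
 //     if (Idx != -1)
 //       Idx += VF;
 //   // SubMask == {5, 4, -1, -1}: lanes now address the newly added part.
 // \endcode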
11708 Value *Vec = nullptr;
11709 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11710 unsigned SliceSize = E->Scalars.size() / NumParts;
11711 for (unsigned Part = 0; Part < NumParts; ++Part) {
11712 ArrayRef<Value *> VL =
11713 ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
11714 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
11715 constexpr int MaxBases = 2;
11716 SmallVector<Value *, MaxBases> Bases(MaxBases);
11717#ifndef NDEBUG
11718 int PrevSize = 0;
11719#endif // NDEBUG
11720 for (const auto [I, V]: enumerate(VL)) {
11721 if (SubMask[I] == PoisonMaskElem)
11722 continue;
11723 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11724 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11725 VecOp = TE->VectorizedValue;
11726 assert(VecOp && "Expected vectorized value.");
11727 const int Size =
11728 cast<FixedVectorType>(VecOp->getType())->getNumElements();
11729#ifndef NDEBUG
11730 assert((PrevSize == Size || PrevSize == 0) &&
11731 "Expected vectors of the same size.");
11732 PrevSize = Size;
11733#endif // NDEBUG
11734 VecOp = castToScalarTyElem(VecOp);
11735 Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
11736 }
11737 if (!Bases.front())
11738 continue;
11739 Value *SubVec;
11740 if (Bases.back()) {
11741 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11742 TransformToIdentity(SubMask);
11743 } else {
11744 SubVec = Bases.front();
11745 }
11746 if (!Vec) {
11747 Vec = SubVec;
11748 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11749 [&](unsigned P) {
11750 ArrayRef<int> SubMask =
11751 Mask.slice(P * SliceSize, SliceSize);
11752 return all_of(SubMask, [](int Idx) {
11753 return Idx == PoisonMaskElem;
11754 });
11755 })) &&
11756 "Expected first part or all previous parts masked.");
11757 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11758 } else {
11759 unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11760 if (Vec->getType() != SubVec->getType()) {
11761 unsigned SubVecVF =
11762 cast<FixedVectorType>(SubVec->getType())->getNumElements();
11763 VF = std::max(VF, SubVecVF);
11764 }
11765 // Adjust SubMask.
11766 for (int &Idx : SubMask)
11767 if (Idx != PoisonMaskElem)
11768 Idx += VF;
11769 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11770 Vec = createShuffle(Vec, SubVec, VecMask);
11771 TransformToIdentity(VecMask);
11772 }
11773 }
11774 copy(VecMask, Mask.begin());
11775 return Vec;
11776 }
11777 /// Checks if the specified entry \p E needs to be delayed because of its
11778 /// dependency nodes.
11779 std::optional<Value *>
11780 needToDelay(const TreeEntry *E,
11781 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
11782 // No need to delay emission if all deps are ready.
11783 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
11784 return all_of(
11785 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
11786 }))
11787 return std::nullopt;
11788 // Postpone gather emission, will be emitted after the end of the
11789 // process to keep correct order.
11790 auto *ResVecTy = FixedVectorType::get(ScalarTy, E->getVectorFactor());
11791 return Builder.CreateAlignedLoad(
11792 ResVecTy,
11793 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
11794 MaybeAlign());
11795 }
11796 /// Adds 2 input vectors (in form of tree entries) and the mask for their
11797 /// shuffling.
11798 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
11799 Value *V1 = E1.VectorizedValue;
11800 if (V1->getType()->isIntOrIntVectorTy())
11801 V1 = castToScalarTyElem(V1, all_of(E1.Scalars, [&](Value *V) {
11802 return !isKnownNonNegative(
11803 V, SimplifyQuery(*R.DL));
11804 }));
11805 Value *V2 = E2.VectorizedValue;
11806 if (V2->getType()->isIntOrIntVectorTy())
11807 V2 = castToScalarTyElem(V2, all_of(E2.Scalars, [&](Value *V) {
11808 return !isKnownNonNegative(
11809 V, SimplifyQuery(*R.DL));
11810 }));
11811 add(V1, V2, Mask);
11812 }
11813 /// Adds single input vector (in form of tree entry) and the mask for its
11814 /// shuffling.
11815 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
11816 Value *V1 = E1.VectorizedValue;
11817 if (V1->getType()->isIntOrIntVectorTy())
11818 V1 = castToScalarTyElem(V1, all_of(E1.Scalars, [&](Value *V) {
11819 return !isKnownNonNegative(
11820 V, SimplifyQuery(*R.DL));
11821 }));
11822 add(V1, Mask);
11823 }
11824 /// Adds 2 input vectors and the mask for their shuffling.
11825 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
11826 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
11827 V1 = castToScalarTyElem(V1);
11828 V2 = castToScalarTyElem(V2);
11829 if (InVectors.empty()) {
11830 InVectors.push_back(V1);
11831 InVectors.push_back(V2);
11832 CommonMask.assign(Mask.begin(), Mask.end());
11833 return;
11834 }
11835 Value *Vec = InVectors.front();
11836 if (InVectors.size() == 2) {
11837 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11838 transformMaskAfterShuffle(CommonMask, CommonMask);
11839 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
11840 Mask.size()) {
11841 Vec = createShuffle(Vec, nullptr, CommonMask);
11842 transformMaskAfterShuffle(CommonMask, CommonMask);
11843 }
11844 V1 = createShuffle(V1, V2, Mask);
11845 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11846 if (Mask[Idx] != PoisonMaskElem)
11847 CommonMask[Idx] = Idx + Sz;
11848 InVectors.front() = Vec;
11849 if (InVectors.size() == 2)
11850 InVectors.back() = V1;
11851 else
11852 InVectors.push_back(V1);
11853 }
11854 /// Adds one more input vector and the mask for its shuffling.
11855 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
11856 V1 = castToScalarTyElem(V1);
11857 if (InVectors.empty()) {
11858 if (!isa<FixedVectorType>(V1->getType())) {
11859 V1 = createShuffle(V1, nullptr, CommonMask);
11860 CommonMask.assign(Mask.size(), PoisonMaskElem);
11861 transformMaskAfterShuffle(CommonMask, Mask);
11862 }
11863 InVectors.push_back(V1);
11864 CommonMask.assign(Mask.begin(), Mask.end());
11865 return;
11866 }
11867 const auto *It = find(InVectors, V1);
11868 if (It == InVectors.end()) {
11869 if (InVectors.size() == 2 ||
11870 InVectors.front()->getType() != V1->getType() ||
11871 !isa<FixedVectorType>(V1->getType())) {
11872 Value *V = InVectors.front();
11873 if (InVectors.size() == 2) {
11874 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11875 transformMaskAfterShuffle(CommonMask, CommonMask);
11876 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11877 CommonMask.size()) {
11878 V = createShuffle(InVectors.front(), nullptr, CommonMask);
11879 transformMaskAfterShuffle(CommonMask, CommonMask);
11880 }
11881 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11882 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
11883 CommonMask[Idx] =
11884 V->getType() != V1->getType()
11885 ? Idx + Sz
11886 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
11887 ->getNumElements();
11888 if (V->getType() != V1->getType())
11889 V1 = createShuffle(V1, nullptr, Mask);
11890 InVectors.front() = V;
11891 if (InVectors.size() == 2)
11892 InVectors.back() = V1;
11893 else
11894 InVectors.push_back(V1);
11895 return;
11896 }
11897 // Check if second vector is required if the used elements are already
11898 // used from the first one.
11899 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11900 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
11901 InVectors.push_back(V1);
11902 break;
11903 }
11904 }
11905 int VF = CommonMask.size();
11906 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
11907 VF = FTy->getNumElements();
11908 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11909 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
11910 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
11911 }
11912 /// Adds one more input vector and the lane order used to build its shuffle mask.
11913 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
11914 SmallVector<int> NewMask;
11915 inversePermutation(Order, NewMask);
11916 add(V1, NewMask);
11917 }
11918 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
11919 Value *Root = nullptr) {
11920 return R.gather(VL, Root, ScalarTy);
11921 }
11922 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
11923 /// Finalize emission of the shuffles.
11924 /// \param Action the action (if any) to be performed before final applying of
11925 /// the \p ExtMask mask.
11926 Value *
11927 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
11928 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
11929 IsFinalized = true;
11930 if (Action) {
11931 Value *Vec = InVectors.front();
11932 if (InVectors.size() == 2) {
11933 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11934 InVectors.pop_back();
11935 } else {
11936 Vec = createShuffle(Vec, nullptr, CommonMask);
11937 }
11938 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11939 if (CommonMask[Idx] != PoisonMaskElem)
11940 CommonMask[Idx] = Idx;
11941 assert(VF > 0 &&
11942 "Expected vector length for the final value before action.");
11943 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11944 if (VecVF < VF) {
11945 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
11946 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11947 Vec = createShuffle(Vec, nullptr, ResizeMask);
11948 }
11949 Action(Vec, CommonMask);
11950 InVectors.front() = Vec;
11951 }
11952 if (!ExtMask.empty()) {
11953 if (CommonMask.empty()) {
11954 CommonMask.assign(ExtMask.begin(), ExtMask.end());
11955 } else {
11956 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11957 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11958 if (ExtMask[I] == PoisonMaskElem)
11959 continue;
11960 NewMask[I] = CommonMask[ExtMask[I]];
11961 }
11962 CommonMask.swap(NewMask);
11963 }
11964 }
11965 if (CommonMask.empty()) {
11966 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11967 return InVectors.front();
11968 }
11969 if (InVectors.size() == 2)
11970 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11971 return createShuffle(InVectors.front(), nullptr, CommonMask);
11972 }
11973
11974 ~ShuffleInstructionBuilder() {
11975 assert((IsFinalized || CommonMask.empty()) &&
11976 "Shuffle construction must be finalized.");
11977 }
11978};
11979
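/// Vectorizes the \p NodeIdx-th operand of the entry \p E. If the operand is
/// already covered by a matching vectorized tree entry, that entry's vector
/// value is reused (and reshuffled when its vectorization factor differs from
/// the number of requested scalars); otherwise the corresponding operand
/// gather node is located in the graph and vectorized instead.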
11980Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
11981 bool PostponedPHIs) {
11982 ValueList &VL = E->getOperand(NodeIdx);
11983 const unsigned VF = VL.size();
11984 InstructionsState S = getSameOpcode(VL, *TLI);
11985 // Special processing for a GEP bundle, which may include non-GEP values.
11986 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11987 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11988 if (It != VL.end())
11989 S = getSameOpcode(*It, *TLI);
11990 }
11991 if (S.getOpcode()) {
11992 auto CheckSameVE = [&](const TreeEntry *VE) {
11993 return VE->isSame(VL) &&
11994 (any_of(VE->UserTreeIndices,
11995 [E, NodeIdx](const EdgeInfo &EI) {
11996 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11997 }) ||
11998 any_of(VectorizableTree,
11999 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12000 return TE->isOperandGatherNode({E, NodeIdx}) &&
12001 VE->isSame(TE->Scalars);
12002 }));
12003 };
12004 TreeEntry *VE = getTreeEntry(S.OpValue);
12005 bool IsSameVE = VE && CheckSameVE(VE);
12006 if (!IsSameVE) {
12007 auto It = MultiNodeScalars.find(S.OpValue);
12008 if (It != MultiNodeScalars.end()) {
12009 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12010 return TE != VE && CheckSameVE(TE);
12011 });
12012 if (I != It->getSecond().end()) {
12013 VE = *I;
12014 IsSameVE = true;
12015 }
12016 }
12017 }
12018 if (IsSameVE) {
12019 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12020 ShuffleInstructionBuilder ShuffleBuilder(
12021 cast<VectorType>(V->getType())->getElementType(), Builder, *this);
12022 ShuffleBuilder.add(V, Mask);
12023 return ShuffleBuilder.finalize(std::nullopt);
12024 };
12025 Value *V = vectorizeTree(VE, PostponedPHIs);
12026 if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
12027 if (!VE->ReuseShuffleIndices.empty()) {
12028 // Reshuffle to get only unique values.
12029 // If some of the scalars are duplicated in the vectorization
12030 // tree entry, we do not vectorize them but instead generate a
12031 // mask for the reuses. But if there are several users of the
12032 // same entry, they may have different vectorization factors.
12033 // This is especially important for PHI nodes. In this case, we
12034 // need to adapt the resulting instruction for the user
12035 // vectorization factor and have to reshuffle it again to take
12036 // only unique elements of the vector. Without this code the
12037 // function incorrectly returns a reduced vector instruction with
12038 // the same elements, not with the unique ones.
12039
12040 // block:
12041 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12042 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12043 // ... (use %2)
12044 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12045 // br %block
12046 SmallVector<int> Mask(VF, PoisonMaskElem);
12047 for (auto [I, V] : enumerate(VL)) {
12048 if (isa<PoisonValue>(V))
12049 continue;
12050 Mask[I] = VE->findLaneForValue(V);
12051 }
12052 V = FinalShuffle(V, Mask);
12053 } else {
12054 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12055 "Expected vectorization factor less "
12056 "than original vector size.");
12057 SmallVector<int> UniformMask(VF, 0);
12058 std::iota(UniformMask.begin(), UniformMask.end(), 0);
12059 V = FinalShuffle(V, UniformMask);
12060 }
12061 }
12062 // Need to update the operand gather node, if the operand actually is not a
12063 // vectorized node but a buildvector/gather node, which matches one of the
12064 // vectorized nodes.
12065 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12066 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12067 }) == VE->UserTreeIndices.end()) {
12068 auto *It = find_if(
12069 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12070 return TE->State == TreeEntry::NeedToGather &&
12071 TE->UserTreeIndices.front().UserTE == E &&
12072 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12073 });
12074 assert(It != VectorizableTree.end() && "Expected gather node operand.");
12075 (*It)->VectorizedValue = V;
12076 }
12077 return V;
12078 }
12079 }
12080
12081 // Find the corresponding gather entry and vectorize it.
12082 // Allows to be more accurate with tree/graph transformations, checks for the
12083 // correctness of the transformations in many cases.
12084 auto *I = find_if(VectorizableTree,
12085 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
12086 return TE->isOperandGatherNode({E, NodeIdx});
12087 });
12088 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
12089 assert(I->get()->UserTreeIndices.size() == 1 &&
12090 "Expected only single user for the gather node.");
12091 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
12092 return vectorizeTree(I->get(), PostponedPHIs);
12093}
12094
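/// Emits the code (or the cost-model equivalent) for a gather/buildvector
/// node \p E: it tries to reuse the vectors the scalars are extracted from
/// and previously vectorized tree entries via shuffles, and only gathers the
/// remaining scalars, applying the reorder/reuse masks through the \p BVTy
/// builder passed as the template parameter.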
12095template <typename BVTy, typename ResTy, typename... Args>
12096ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
12097 Args &...Params) {
12098 assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
12099 unsigned VF = E->getVectorFactor();
12100
12101 bool NeedFreeze = false;
12102 SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
12103 E->ReuseShuffleIndices.end());
12104 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
12105 // Build a mask out of the reorder indices and reorder scalars per this
12106 // mask.
12107 SmallVector<int> ReorderMask;
12108 inversePermutation(E->ReorderIndices, ReorderMask);
12109 if (!ReorderMask.empty())
12110 reorderScalars(GatheredScalars, ReorderMask);
12111 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
12112 unsigned I, unsigned SliceSize) {
12113 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
12114 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12115 }))
12116 return false;
12117 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12118 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12119 if (UserTE->getNumOperands() != 2)
12120 return false;
12121 auto *It =
12122 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
12123 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
12124 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12125 }) != TE->UserTreeIndices.end();
12126 });
12127 if (It == VectorizableTree.end())
12128 return false;
12129 int Idx;
12130 if ((Mask.size() < InputVF &&
12131 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
12132 Idx == 0) ||
12133 (Mask.size() == InputVF &&
12134 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
12135 std::iota(std::next(Mask.begin(), I * SliceSize),
12136 std::next(Mask.begin(), (I + 1) * SliceSize), 0);
12137 } else {
12138 unsigned IVal =
12139 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
12140 std::fill(std::next(Mask.begin(), I * SliceSize),
12141 std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
12142 }
12143 return true;
12144 };
12145 BVTy ShuffleBuilder(ScalarTy, Params...);
12146 ResTy Res = ResTy();
12147 SmallVector<int> Mask;
12148 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
12149 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
12150 Value *ExtractVecBase = nullptr;
12151 bool UseVecBaseAsInput = false;
12152 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
12153 SmallVector<SmallVector<const TreeEntry *>> Entries;
12154 Type *OrigScalarTy = GatheredScalars.front()->getType();
12155 auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
12156 unsigned NumParts = TTI->getNumberOfParts(VecTy);
12157 if (NumParts == 0 || NumParts >= GatheredScalars.size())
12158 NumParts = 1;
12159 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
12160 // Check for gathered extracts.
12161 bool Resized = false;
12162 ExtractShuffles =
12163 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
12164 if (!ExtractShuffles.empty()) {
12165 SmallVector<const TreeEntry *> ExtractEntries;
12166 for (auto [Idx, I] : enumerate(ExtractMask)) {
12167 if (I == PoisonMaskElem)
12168 continue;
12169 if (const auto *TE = getTreeEntry(
12170 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
12171 ExtractEntries.push_back(TE);
12172 }
12173 if (std::optional<ResTy> Delayed =
12174 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12175 // Delay emission of gathers which are not ready yet.
12176 PostponedGathers.insert(E);
12177 // Postpone gather emission, will be emitted after the end of the
12178 // process to keep correct order.
12179 return *Delayed;
12180 }
12181 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
12182 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12183 ExtractVecBase = VecBase;
12184 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
12185 if (VF == VecBaseTy->getNumElements() &&
12186 GatheredScalars.size() != VF) {
12187 Resized = true;
12188 GatheredScalars.append(VF - GatheredScalars.size(),
12189 PoisonValue::get(OrigScalarTy));
12190 }
12191 }
12192 }
12193 // Gather extracts after we check for full matched gathers only.
12194 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
12195 E->isAltShuffle() ||
12196 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
12197 isSplat(E->Scalars) ||
12198 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12199 GatherShuffles =
12200 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
12201 }
12202 if (!GatherShuffles.empty()) {
12203 if (std::optional<ResTy> Delayed =
12204 ShuffleBuilder.needToDelay(E, Entries)) {
12205 // Delay emission of gathers which are not ready yet.
12206 PostponedGathers.insert(E);
12207 // Postpone gather emission, will be emitted after the end of the
12208 // process to keep correct order.
12209 return *Delayed;
12210 }
12211 if (GatherShuffles.size() == 1 &&
12212 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
12213 Entries.front().front()->isSame(E->Scalars)) {
12214 // Perfect match in the graph, will reuse the previously vectorized
12215 // node. Cost is 0.
12216 LLVM_DEBUG(
12217 dbgs()
12218 << "SLP: perfect diamond match for gather bundle "
12219 << shortBundleName(E->Scalars) << ".\n");
12220 // Restore the mask for previous partially matched values.
12221 Mask.resize(E->Scalars.size());
12222 const TreeEntry *FrontTE = Entries.front().front();
12223 if (FrontTE->ReorderIndices.empty() &&
12224 ((FrontTE->ReuseShuffleIndices.empty() &&
12225 E->Scalars.size() == FrontTE->Scalars.size()) ||
12226 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12227 std::iota(Mask.begin(), Mask.end(), 0);
12228 } else {
12229 for (auto [I, V] : enumerate(E->Scalars)) {
12230 if (isa<PoisonValue>(V)) {
12231 Mask[I] = PoisonMaskElem;
12232 continue;
12233 }
12234 Mask[I] = FrontTE->findLaneForValue(V);
12235 }
12236 }
12237 ShuffleBuilder.add(*FrontTE, Mask);
12238 Res = ShuffleBuilder.finalize(E->getCommonMask());
12239 return Res;
12240 }
12241 if (!Resized) {
12242 if (GatheredScalars.size() != VF &&
12243 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
12244 return any_of(TEs, [&](const TreeEntry *TE) {
12245 return TE->getVectorFactor() == VF;
12246 });
12247 }))
12248 GatheredScalars.append(VF - GatheredScalars.size(),
12249 PoisonValue::get(OrigScalarTy));
12250 }
12251 // Remove shuffled elements from list of gathers.
12252 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12253 if (Mask[I] != PoisonMaskElem)
12254 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12255 }
12256 }
12257 }
12258 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
12259 SmallVectorImpl<int> &ReuseMask,
12260 bool IsRootPoison) {
12261 // For splats we can emit broadcasts instead of gathers, so try to find
12262 // such sequences.
12263 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
12264 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
12265 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
12266 SmallVector<int> UndefPos;
12267 DenseMap<Value *, unsigned> UniquePositions;
12268 // Gather unique non-const values and all constant values.
12269 // For repeated values, just shuffle them.
12270 int NumNonConsts = 0;
12271 int SinglePos = 0;
12272 for (auto [I, V] : enumerate(Scalars)) {
12273 if (isa<UndefValue>(V)) {
12274 if (!isa<PoisonValue>(V)) {
12275 ReuseMask[I] = I;
12276 UndefPos.push_back(I);
12277 }
12278 continue;
12279 }
12280 if (isConstant(V)) {
12281 ReuseMask[I] = I;
12282 continue;
12283 }
12284 ++NumNonConsts;
12285 SinglePos = I;
12286 Value *OrigV = V;
12287 Scalars[I] = PoisonValue::get(OrigScalarTy);
12288 if (IsSplat) {
12289 Scalars.front() = OrigV;
12290 ReuseMask[I] = 0;
12291 } else {
12292 const auto Res = UniquePositions.try_emplace(OrigV, I);
12293 Scalars[Res.first->second] = OrigV;
12294 ReuseMask[I] = Res.first->second;
12295 }
12296 }
12297 if (NumNonConsts == 1) {
12298 // Restore single insert element.
12299 if (IsSplat) {
12300 ReuseMask.assign(VF, PoisonMaskElem);
12301 std::swap(Scalars.front(), Scalars[SinglePos]);
12302 if (!UndefPos.empty() && UndefPos.front() == 0)
12303 Scalars.front() = UndefValue::get(OrigScalarTy);
12304 }
12305 ReuseMask[SinglePos] = SinglePos;
12306 } else if (!UndefPos.empty() && IsSplat) {
12307 // For undef values, try to replace them with the simple broadcast.
12308 // We can do it if the broadcasted value is guaranteed to be
12309 // non-poisonous, or by freezing the incoming scalar value first.
12310 auto *It = find_if(Scalars, [this, E](Value *V) {
12311 return !isa<UndefValue>(V) &&
12312 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12313 (E->UserTreeIndices.size() == 1 &&
12314 any_of(V->uses(), [E](const Use &U) {
12315 // Check if the value is already used in the same operation in
12316 // one of the nodes.
12317 return E->UserTreeIndices.front().EdgeIdx !=
12318 U.getOperandNo() &&
12319 is_contained(
12320 E->UserTreeIndices.front().UserTE->Scalars,
12321 U.getUser());
12322 })));
12323 });
12324 if (It != Scalars.end()) {
12325 // Replace undefs by the non-poisoned scalars and emit broadcast.
12326 int Pos = std::distance(Scalars.begin(), It);
12327 for (int I : UndefPos) {
12328 // Set the undef position to the non-poisoned scalar.
12329 ReuseMask[I] = Pos;
12330 // Replace the undef with poison; in the mask it is already replaced
12331 // by the non-poisoned scalar.
12332 if (I != Pos)
12333 Scalars[I] = PoisonValue::get(OrigScalarTy);
12334 }
12335 } else {
12336 // Replace undefs by the poisons, emit broadcast and then emit
12337 // freeze.
12338 for (int I : UndefPos) {
12339 ReuseMask[I] = PoisonMaskElem;
12340 if (isa<UndefValue>(Scalars[I]))
12341 Scalars[I] = PoisonValue::get(OrigScalarTy);
12342 }
12343 NeedFreeze = true;
12344 }
12345 }
12346 };
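  // Roughly three emission strategies follow: (1) reuse the detected
  // extract/gather shuffles and add the remaining constants/non-constants as
  // extra build vectors, (2) gather the unique scalars together with all
  // constants into a single build vector, or (3) build a pure constant
  // vector.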
12347 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12348 bool IsNonPoisoned = true;
12349 bool IsUsedInExpr = true;
12350 Value *Vec1 = nullptr;
12351 if (!ExtractShuffles.empty()) {
12352 // A gather of extractelements can be represented as just a shuffle of
12353 // one or two vectors the scalars are extracted from.
12354 // Find input vectors.
12355 Value *Vec2 = nullptr;
12356 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12357 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12358 ExtractMask[I] = PoisonMaskElem;
12359 }
12360 if (UseVecBaseAsInput) {
12361 Vec1 = ExtractVecBase;
12362 } else {
12363 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12364 if (ExtractMask[I] == PoisonMaskElem)
12365 continue;
12366 if (isa<UndefValue>(E->Scalars[I]))
12367 continue;
12368 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12369 Value *VecOp = EI->getVectorOperand();
12370 if (const auto *TE = getTreeEntry(VecOp))
12371 if (TE->VectorizedValue)
12372 VecOp = TE->VectorizedValue;
12373 if (!Vec1) {
12374 Vec1 = VecOp;
12375 } else if (Vec1 != VecOp) {
12376 assert((!Vec2 || Vec2 == VecOp) &&
12377 "Expected only 1 or 2 vectors shuffle.");
12378 Vec2 = VecOp;
12379 }
12380 }
12381 }
12382 if (Vec2) {
12383 IsUsedInExpr = false;
12384 IsNonPoisoned &=
12385 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12386 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12387 } else if (Vec1) {
12388 IsUsedInExpr &= FindReusedSplat(
12389 ExtractMask,
12390 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12391 ExtractMask.size());
12392 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12393 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12394 } else {
12395 IsUsedInExpr = false;
12396 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
12397 /*ForExtracts=*/true);
12398 }
12399 }
12400 if (!GatherShuffles.empty()) {
12401 unsigned SliceSize = E->Scalars.size() / NumParts;
12402 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12403 for (const auto [I, TEs] : enumerate(Entries)) {
12404 if (TEs.empty()) {
12405 assert(!GatherShuffles[I] &&
12406 "No shuffles with empty entries list expected.");
12407 continue;
12408 }
12409 assert((TEs.size() == 1 || TEs.size() == 2) &&
12410 "Expected shuffle of 1 or 2 entries.");
12411 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
12412 VecMask.assign(VecMask.size(), PoisonMaskElem);
12413 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12414 if (TEs.size() == 1) {
12415 IsUsedInExpr &= FindReusedSplat(
12416 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12417 ShuffleBuilder.add(*TEs.front(), VecMask);
12418 if (TEs.front()->VectorizedValue)
12419 IsNonPoisoned &=
12420 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12421 } else {
12422 IsUsedInExpr = false;
12423 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12424 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12425 IsNonPoisoned &=
12426 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12427 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12428 }
12429 }
12430 }
12431 // Try to figure out the best way to combine the values: build a shuffle
12432 // and insert elements, or just build several shuffles.
12433 // Insert non-constant scalars.
12434 SmallVector<Value *> NonConstants(GatheredScalars);
12435 int EMSz = ExtractMask.size();
12436 int MSz = Mask.size();
12437 // Try to build a constant vector and shuffle with it only if we currently
12438 // have a single permutation and more than one scalar constant.
12439 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12440 bool IsIdentityShuffle =
12441 ((UseVecBaseAsInput ||
12442 all_of(ExtractShuffles,
12443 [](const std::optional<TTI::ShuffleKind> &SK) {
12444 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12445 TTI::SK_PermuteSingleSrc;
12446 })) &&
12447 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12448 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
12449 (!GatherShuffles.empty() &&
12450 all_of(GatherShuffles,
12451 [](const std::optional<TTI::ShuffleKind> &SK) {
12452 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12453 TTI::SK_PermuteSingleSrc;
12454 }) &&
12455 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12456 ShuffleVectorInst::isIdentityMask(Mask, MSz));
12457 bool EnoughConstsForShuffle =
12458 IsSingleShuffle &&
12459 (none_of(GatheredScalars,
12460 [](Value *V) {
12461 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12462 }) ||
12463 any_of(GatheredScalars,
12464 [](Value *V) {
12465 return isa<Constant>(V) && !isa<UndefValue>(V);
12466 })) &&
12467 (!IsIdentityShuffle ||
12468 (GatheredScalars.size() == 2 &&
12469 any_of(GatheredScalars,
12470 [](Value *V) { return !isa<UndefValue>(V); })) ||
12471 count_if(GatheredScalars, [](Value *V) {
12472 return isa<Constant>(V) && !isa<PoisonValue>(V);
12473 }) > 1);
12474 // The NonConstants array contains just the non-constant values, while
12475 // GatheredScalars contains only the constants for the final vector and shuffle.
12476 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12477 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
12478 NonConstants[I] = PoisonValue::get(OrigScalarTy);
12479 else
12480 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12481 }
12482 // Generate constants for final shuffle and build a mask for them.
12483 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12484 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12485 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12486 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12487 ShuffleBuilder.add(BV, BVMask);
12488 }
12489 if (all_of(NonConstants, [=](Value *V) {
12490 return isa<PoisonValue>(V) ||
12491 (IsSingleShuffle && ((IsIdentityShuffle &&
12492 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12493 }))
12494 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12495 else
12496 Res = ShuffleBuilder.finalize(
12497 E->ReuseShuffleIndices, E->Scalars.size(),
12498 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12499 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12500 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12501 });
12502 } else if (!allConstant(GatheredScalars)) {
12503 // Gather unique scalars and all constants.
12504 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12505 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12506 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12507 ShuffleBuilder.add(BV, ReuseMask);
12508 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12509 } else {
12510 // Gather all constants.
12511 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12512 for (auto [I, V] : enumerate(E->Scalars)) {
12513 if (!isa<PoisonValue>(V))
12514 Mask[I] = I;
12515 }
12516 Value *BV = ShuffleBuilder.gather(E->Scalars);
12517 ShuffleBuilder.add(BV, Mask);
12518 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12519 }
12520
12521 if (NeedFreeze)
12522 Res = ShuffleBuilder.createFreeze(Res);
12523 return Res;
12524}
12525
12526Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
12527 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12528 Builder, *this);
12529}
12530
12531Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12532 IRBuilderBase::InsertPointGuard Guard(Builder);
12533
12534 if (E->VectorizedValue &&
12535 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12536 E->isAltShuffle())) {
12537 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12538 return E->VectorizedValue;
12539 }
12540
12541 Value *V = E->Scalars.front();
12542 Type *ScalarTy = V->getType();
12543 if (auto *Store = dyn_cast<StoreInst>(V))
12544 ScalarTy = Store->getValueOperand()->getType();
12545 else if (auto *IE = dyn_cast<InsertElementInst>(V))
12546 ScalarTy = IE->getOperand(1)->getType();
12547 auto It = MinBWs.find(E);
12548 if (It != MinBWs.end())
12549 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
12550 auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
12551 if (E->State == TreeEntry::NeedToGather) {
12552 // Set insert point for non-reduction initial nodes.
12553 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12554 setInsertPointAfterBundle(E);
12555 Value *Vec = createBuildVector(E, ScalarTy);
12556 E->VectorizedValue = Vec;
12557 return Vec;
12558 }
12559
12560 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
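  // FinalShuffle applies the reorder indices and the reuse mask of the entry
  // to a freshly created vector value before it is recorded as the
  // vectorized value of the entry.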
12561 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12562 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
12563 if (E->getOpcode() == Instruction::Store &&
12564 E->State == TreeEntry::Vectorize) {
12565 ArrayRef<int> Mask =
12566 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12567 E->ReorderIndices.size());
12568 ShuffleBuilder.add(V, Mask);
12569 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12570 ShuffleBuilder.addOrdered(V, std::nullopt);
12571 } else {
12572 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12573 }
12574 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12575 };
12576
12577 assert((E->State == TreeEntry::Vectorize ||
12578 E->State == TreeEntry::ScatterVectorize ||
12579 E->State == TreeEntry::StridedVectorize) &&
12580 "Unhandled state");
12581 unsigned ShuffleOrOp =
12582 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12583 Instruction *VL0 = E->getMainOp();
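  // GetOperandSignedness returns true when the (possibly bit-width minimized)
  // operand Idx has to be treated as signed: either MinBWs recorded it as
  // signed, or one of its scalars is not known to be non-negative.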
12584 auto GetOperandSignedness = [&](unsigned Idx) {
12585 const TreeEntry *OpE = getOperandEntry(E, Idx);
12586 bool IsSigned = false;
12587 auto It = MinBWs.find(OpE);
12588 if (It != MinBWs.end())
12589 IsSigned = It->second.second;
12590 else
12591 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
12592 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12593 });
12594 return IsSigned;
12595 };
12596 switch (ShuffleOrOp) {
12597 case Instruction::PHI: {
12598 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12599 E != VectorizableTree.front().get() ||
12600 !E->UserTreeIndices.empty()) &&
12601 "PHI reordering is free.");
12602 if (PostponedPHIs && E->VectorizedValue)
12603 return E->VectorizedValue;
12604 auto *PH = cast<PHINode>(VL0);
12605 Builder.SetInsertPoint(PH->getParent(),
12606 PH->getParent()->getFirstNonPHIIt());
12607 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12608 if (PostponedPHIs || !E->VectorizedValue) {
12609 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
12610 E->PHI = NewPhi;
12611 Value *V = NewPhi;
12612
12613 // Adjust the insertion point once all PHIs have been generated.
12614 Builder.SetInsertPoint(PH->getParent(),
12615 PH->getParent()->getFirstInsertionPt());
12616 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12617
12618 V = FinalShuffle(V, E, VecTy);
12619
12620 E->VectorizedValue = V;
12621 if (PostponedPHIs)
12622 return V;
12623 }
12624 PHINode *NewPhi = cast<PHINode>(E->PHI);
12625 // If the phi node is fully emitted, exit.
12626 if (NewPhi->getNumIncomingValues() != 0)
12627 return NewPhi;
12628
12629 // PHINodes may have multiple entries from the same block. We want to
12630 // visit every block once.
12631 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12632
12633 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12635 BasicBlock *IBB = PH->getIncomingBlock(I);
12636
12637 // Stop emission if all incoming values are generated.
12638 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12639 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12640 return NewPhi;
12641 }
12642
12643 if (!VisitedBBs.insert(IBB).second) {
12644 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
12645 continue;
12646 }
12647
12648 Builder.SetInsertPoint(IBB->getTerminator());
12649 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12650 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
12651 if (VecTy != Vec->getType()) {
12652 assert((It != MinBWs.end() ||
12653 getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
12654 MinBWs.contains(getOperandEntry(E, I))) &&
12655 "Expected item in MinBWs.");
12656 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
12657 }
12658 NewPhi->addIncoming(Vec, IBB);
12659 }
12660
12661 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12662 "Invalid number of incoming values");
12663 return NewPhi;
12664 }
12665
12666 case Instruction::ExtractElement: {
12667 Value *V = E->getSingleOperand(0);
12668 if (const TreeEntry *TE = getTreeEntry(V))
12669 V = TE->VectorizedValue;
12670 setInsertPointAfterBundle(E);
12671 V = FinalShuffle(V, E, VecTy);
12672 E->VectorizedValue = V;
12673 return V;
12674 }
12675 case Instruction::ExtractValue: {
12676 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12677 Builder.SetInsertPoint(LI);
12678 Value *Ptr = LI->getPointerOperand();
12679 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
12680 Value *NewV = propagateMetadata(V, E->Scalars);
12681 NewV = FinalShuffle(NewV, E, VecTy);
12682 E->VectorizedValue = NewV;
12683 return NewV;
12684 }
12685 case Instruction::InsertElement: {
12686 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12687 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
12688 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
12689 ArrayRef<Value *> Op = E->getOperand(1);
12690 Type *ScalarTy = Op.front()->getType();
12691 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
12692 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12693 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
12694 assert(Res.first > 0 && "Expected item in MinBWs.");
12695 V = Builder.CreateIntCast(
12696 V,
12697 FixedVectorType::get(
12698 ScalarTy,
12699 cast<FixedVectorType>(V->getType())->getNumElements()),
12700 Res.second);
12701 }
12702
12703 // Create InsertVector shuffle if necessary
12704 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
12705 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12706 }));
12707 const unsigned NumElts =
12708 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12709 const unsigned NumScalars = E->Scalars.size();
12710
12711 unsigned Offset = *getInsertIndex(VL0);
12712 assert(Offset < NumElts && "Failed to find vector index offset");
12713
12714 // Create shuffle to resize vector
12715 SmallVector<int> Mask;
12716 if (!E->ReorderIndices.empty()) {
12717 inversePermutation(E->ReorderIndices, Mask);
12718 Mask.append(NumElts - NumScalars, PoisonMaskElem);
12719 } else {
12720 Mask.assign(NumElts, PoisonMaskElem);
12721 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
12722 }
12723 // Create InsertVector shuffle if necessary
12724 bool IsIdentity = true;
12725 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12726 Mask.swap(PrevMask);
12727 for (unsigned I = 0; I < NumScalars; ++I) {
12728 Value *Scalar = E->Scalars[PrevMask[I]];
12729 unsigned InsertIdx = *getInsertIndex(Scalar);
12730 IsIdentity &= InsertIdx - Offset == I;
12731 Mask[InsertIdx - Offset] = I;
12732 }
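      // At this point Mask maps every scalar to its destination lane relative
      // to the insert offset; a non-identity mapping or a wider destination
      // vector requires the explicit shuffle emitted below.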
12733 if (!IsIdentity || NumElts != NumScalars) {
12734 Value *V2 = nullptr;
12735 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12736 SmallVector<int> InsertMask(Mask);
12737 if (NumElts != NumScalars && Offset == 0) {
12738 // Follow all insert element instructions from the current buildvector
12739 // sequence.
12740 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
12741 do {
12742 std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
12743 if (!InsertIdx)
12744 break;
12745 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12746 InsertMask[*InsertIdx] = *InsertIdx;
12747 if (!Ins->hasOneUse())
12748 break;
12749 Ins = dyn_cast_or_null<InsertElementInst>(
12750 Ins->getUniqueUndroppableUser());
12751 } while (Ins);
12752 SmallBitVector UseMask =
12753 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12754 SmallBitVector IsFirstPoison =
12755 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12756 SmallBitVector IsFirstUndef =
12757 isUndefVector(FirstInsert->getOperand(0), UseMask);
12758 if (!IsFirstPoison.all()) {
12759 unsigned Idx = 0;
12760 for (unsigned I = 0; I < NumElts; I++) {
12761 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
12762 IsFirstUndef.test(I)) {
12763 if (IsVNonPoisonous) {
12764 InsertMask[I] = I < NumScalars ? I : 0;
12765 continue;
12766 }
12767 if (!V2)
12768 V2 = UndefValue::get(V->getType());
12769 if (Idx >= NumScalars)
12770 Idx = NumScalars - 1;
12771 InsertMask[I] = NumScalars + Idx;
12772 ++Idx;
12773 } else if (InsertMask[I] != PoisonMaskElem &&
12774 Mask[I] == PoisonMaskElem) {
12775 InsertMask[I] = PoisonMaskElem;
12776 }
12777 }
12778 } else {
12779 InsertMask = Mask;
12780 }
12781 }
12782 if (!V2)
12783 V2 = PoisonValue::get(V->getType());
12784 V = Builder.CreateShuffleVector(V, V2, InsertMask);
12785 if (auto *I = dyn_cast<Instruction>(V)) {
12786 GatherShuffleExtractSeq.insert(I);
12787 CSEBlocks.insert(I->getParent());
12788 }
12789 }
12790
12791 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
12792 for (unsigned I = 0; I < NumElts; I++) {
12793 if (Mask[I] != PoisonMaskElem)
12794 InsertMask[Offset + I] = I;
12795 }
12796 SmallBitVector UseMask =
12797 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12798 SmallBitVector IsFirstUndef =
12799 isUndefVector(FirstInsert->getOperand(0), UseMask);
12800 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
12801 NumElts != NumScalars) {
12802 if (IsFirstUndef.all()) {
12803 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
12804 SmallBitVector IsFirstPoison =
12805 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12806 if (!IsFirstPoison.all()) {
12807 for (unsigned I = 0; I < NumElts; I++) {
12808 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
12809 InsertMask[I] = I + NumElts;
12810 }
12811 }
12812 V = Builder.CreateShuffleVector(
12813 V,
12814 IsFirstPoison.all() ? PoisonValue::get(V->getType())
12815 : FirstInsert->getOperand(0),
12816 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
12817 if (auto *I = dyn_cast<Instruction>(V)) {
12818 GatherShuffleExtractSeq.insert(I);
12819 CSEBlocks.insert(I->getParent());
12820 }
12821 }
12822 } else {
12823 SmallBitVector IsFirstPoison =
12824 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12825 for (unsigned I = 0; I < NumElts; I++) {
12826 if (InsertMask[I] == PoisonMaskElem)
12827 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
12828 else
12829 InsertMask[I] += NumElts;
12830 }
12831 V = Builder.CreateShuffleVector(
12832 FirstInsert->getOperand(0), V, InsertMask,
12833 cast<Instruction>(E->Scalars.back())->getName());
12834 if (auto *I = dyn_cast<Instruction>(V)) {
12835 GatherShuffleExtractSeq.insert(I);
12836 CSEBlocks.insert(I->getParent());
12837 }
12838 }
12839 }
12840
12841 ++NumVectorInstructions;
12842 E->VectorizedValue = V;
12843 return V;
12844 }
12845 case Instruction::ZExt:
12846 case Instruction::SExt:
12847 case Instruction::FPToUI:
12848 case Instruction::FPToSI:
12849 case Instruction::FPExt:
12850 case Instruction::PtrToInt:
12851 case Instruction::IntToPtr:
12852 case Instruction::SIToFP:
12853 case Instruction::UIToFP:
12854 case Instruction::Trunc:
12855 case Instruction::FPTrunc:
12856 case Instruction::BitCast: {
12857 setInsertPointAfterBundle(E);
12858
12859 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
12860 if (E->VectorizedValue) {
12861 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12862 return E->VectorizedValue;
12863 }
12864
12865 auto *CI = cast<CastInst>(VL0);
12866 Instruction::CastOps VecOpcode = CI->getOpcode();
12867 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
12868 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
12869 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
12870 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
12871 SrcScalarTy != CI->getOperand(0)->getType())) {
12872 // Check if the values are candidates to demote.
12873 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
12874 if (SrcIt != MinBWs.end())
12875 SrcBWSz = SrcIt->second.first;
12876 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
12877 if (BWSz == SrcBWSz) {
12878 VecOpcode = Instruction::BitCast;
12879 } else if (BWSz < SrcBWSz) {
12880 VecOpcode = Instruction::Trunc;
12881 } else if (It != MinBWs.end()) {
12882 assert(BWSz > SrcBWSz && "Invalid cast!");
12883 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12884 } else if (SrcIt != MinBWs.end()) {
12885 assert(BWSz > SrcBWSz && "Invalid cast!");
12886 VecOpcode =
12887 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12888 }
12889 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
12890 !SrcIt->second.second) {
12891 VecOpcode = Instruction::UIToFP;
12892 }
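    // E.g. if the result of an original 'zext i8 -> i32' was minimized down
    // to 8 bits, BWSz == SrcBWSz and the cast degenerates into a no-op
    // bitcast, in which case the operand vector is reused directly below.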
12893 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12894 ? InVec
12895 : Builder.CreateCast(VecOpcode, InVec, VecTy);
12896 V = FinalShuffle(V, E, VecTy);
12897
12898 E->VectorizedValue = V;
12899 ++NumVectorInstructions;
12900 return V;
12901 }
12902 case Instruction::FCmp:
12903 case Instruction::ICmp: {
12904 setInsertPointAfterBundle(E);
12905
12906 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
12907 if (E->VectorizedValue) {
12908 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12909 return E->VectorizedValue;
12910 }
12911 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
12912 if (E->VectorizedValue) {
12913 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12914 return E->VectorizedValue;
12915 }
12916 if (L->getType() != R->getType()) {
12917 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12918 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12919 MinBWs.contains(getOperandEntry(E, 0)) ||
12920 MinBWs.contains(getOperandEntry(E, 1))) &&
12921 "Expected item in MinBWs.");
12922 if (cast<VectorType>(L->getType())
12923 ->getElementType()
12924 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
12925 ->getElementType()
12926 ->getIntegerBitWidth()) {
12927 Type *CastTy = R->getType();
12928 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
12929 } else {
12930 Type *CastTy = L->getType();
12931 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
12932 }
12933 }
12934
12935 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
12936 Value *V = Builder.CreateCmp(P0, L, R);
12937 propagateIRFlags(V, E->Scalars, VL0);
12938 // Do not cast for cmps.
12939 VecTy = cast<FixedVectorType>(V->getType());
12940 V = FinalShuffle(V, E, VecTy);
12941
12942 E->VectorizedValue = V;
12943 ++NumVectorInstructions;
12944 return V;
12945 }
12946 case Instruction::Select: {
12947 setInsertPointAfterBundle(E);
12948
12949 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
12950 if (E->VectorizedValue) {
12951 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12952 return E->VectorizedValue;
12953 }
12954 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
12955 if (E->VectorizedValue) {
12956 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12957 return E->VectorizedValue;
12958 }
12959 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
12960 if (E->VectorizedValue) {
12961 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12962 return E->VectorizedValue;
12963 }
12964 if (True->getType() != VecTy || False->getType() != VecTy) {
12965 assert((It != MinBWs.end() ||
12966 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12967 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
12968 MinBWs.contains(getOperandEntry(E, 1)) ||
12969 MinBWs.contains(getOperandEntry(E, 2))) &&
12970 "Expected item in MinBWs.");
12971 if (True->getType() != VecTy)
12972 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
12973 if (False->getType() != VecTy)
12974 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
12975 }
12976
12977 Value *V = Builder.CreateSelect(Cond, True, False);
12978 V = FinalShuffle(V, E, VecTy);
12979
12980 E->VectorizedValue = V;
12981 ++NumVectorInstructions;
12982 return V;
12983 }
12984 case Instruction::FNeg: {
12985 setInsertPointAfterBundle(E);
12986
12987 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
12988
12989 if (E->VectorizedValue) {
12990 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12991 return E->VectorizedValue;
12992 }
12993
12994 Value *V = Builder.CreateUnOp(
12995 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
12996 propagateIRFlags(V, E->Scalars, VL0);
12997 if (auto *I = dyn_cast<Instruction>(V))
12998 V = propagateMetadata(I, E->Scalars);
12999
13000 V = FinalShuffle(V, E, VecTy);
13001
13002 E->VectorizedValue = V;
13003 ++NumVectorInstructions;
13004
13005 return V;
13006 }
13007 case Instruction::Add:
13008 case Instruction::FAdd:
13009 case Instruction::Sub:
13010 case Instruction::FSub:
13011 case Instruction::Mul:
13012 case Instruction::FMul:
13013 case Instruction::UDiv:
13014 case Instruction::SDiv:
13015 case Instruction::FDiv:
13016 case Instruction::URem:
13017 case Instruction::SRem:
13018 case Instruction::FRem:
13019 case Instruction::Shl:
13020 case Instruction::LShr:
13021 case Instruction::AShr:
13022 case Instruction::And:
13023 case Instruction::Or:
13024 case Instruction::Xor: {
13025 setInsertPointAfterBundle(E);
13026
13027 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
13028 if (E->VectorizedValue) {
13029 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13030 return E->VectorizedValue;
13031 }
13032 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
13033 if (E->VectorizedValue) {
13034 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13035 return E->VectorizedValue;
13036 }
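    // If one operand of the 'and' consists of constants with at least
    // It->second.first trailing ones, the masking is a no-op after bit-width
    // minimization (e.g. 'and i16 %x, 255' minimized to 8 bits), so the
    // other, shuffled operand is returned directly.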
13037 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13038 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
13039 ArrayRef<Value *> Ops = E->getOperand(I);
13040 if (all_of(Ops, [&](Value *Op) {
13041 auto *CI = dyn_cast<ConstantInt>(Op);
13042 return CI && CI->getValue().countr_one() >= It->second.first;
13043 })) {
13044 V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
13045 E->VectorizedValue = V;
13046 ++NumVectorInstructions;
13047 return V;
13048 }
13049 }
13050 }
13051 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
13052 assert((It != MinBWs.end() ||
13053 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13054 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13055 MinBWs.contains(getOperandEntry(E, 0)) ||
13056 MinBWs.contains(getOperandEntry(E, 1))) &&
13057 "Expected item in MinBWs.");
13058 if (LHS->getType() != VecTy)
13059 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
13060 if (RHS->getType() != VecTy)
13061 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
13062 }
13063
13064 Value *V = Builder.CreateBinOp(
13065 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
13066 RHS);
13067 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
13068 if (auto *I = dyn_cast<Instruction>(V)) {
13069 V = propagateMetadata(I, E->Scalars);
13070 // Drop nuw flags for abs(sub(commutative), true).
13071 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
13072 any_of(E->Scalars, [](Value *V) {
13073 return isCommutative(cast<Instruction>(V));
13074 }))
13075 I->setHasNoUnsignedWrap(/*b=*/false);
13076 }
13077
13078 V = FinalShuffle(V, E, VecTy);
13079
13080 E->VectorizedValue = V;
13081 ++NumVectorInstructions;
13082
13083 return V;
13084 }
13085 case Instruction::Load: {
13086 // Loads are inserted at the head of the tree because we don't want to
13087 // sink them all the way down past store instructions.
13088 setInsertPointAfterBundle(E);
13089
13090 LoadInst *LI = cast<LoadInst>(VL0);
13091 Instruction *NewLI;
13092 Value *PO = LI->getPointerOperand();
13093 if (E->State == TreeEntry::Vectorize) {
13094 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
13095 } else if (E->State == TreeEntry::StridedVectorize) {
13096 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
13097 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
13098 PO = IsReverseOrder ? PtrN : Ptr0;
13099 std::optional<int> Diff = getPointersDiff(
13100 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
13101 Type *StrideTy = DL->getIndexType(PO->getType());
13102 Value *StrideVal;
13103 if (Diff) {
13104 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
13105 StrideVal =
13106 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
13107 DL->getTypeAllocSize(ScalarTy));
13108 } else {
13109 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
13110 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
13111 return cast<LoadInst>(V)->getPointerOperand();
13112 });
13113 OrdersType Order;
13114 std::optional<Value *> Stride =
13115 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
13116 &*Builder.GetInsertPoint());
13117 Value *NewStride =
13118 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
13119 StrideVal = Builder.CreateMul(
13120 NewStride,
13121 ConstantInt::get(
13122 StrideTy,
13123 (IsReverseOrder ? -1 : 1) *
13124 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
13125 }
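      // The access is emitted as an llvm.experimental.vp.strided.load call;
      // e.g. a bundle of four i32 loads with a constant stride of 16 bytes
      // becomes roughly
      //   %l = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
      //            ptr align 4 %base, i64 16,
      //            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)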
13126 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13127 auto *Inst = Builder.CreateIntrinsic(
13128 Intrinsic::experimental_vp_strided_load,
13129 {VecTy, PO->getType(), StrideTy},
13130 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
13131 Builder.getInt32(E->Scalars.size())});
13132 Inst->addParamAttr(
13133 /*ArgNo=*/0,
13134 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13135 NewLI = Inst;
13136 } else {
13137 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
13138 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
13139 if (E->VectorizedValue) {
13140 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13141 return E->VectorizedValue;
13142 }
13143 // Use the minimum alignment of the gathered loads.
13144 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13145 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
13146 }
13147 Value *V = propagateMetadata(NewLI, E->Scalars);
13148
13149 V = FinalShuffle(V, E, VecTy);
13150 E->VectorizedValue = V;
13151 ++NumVectorInstructions;
13152 return V;
13153 }
13154 case Instruction::Store: {
13155 auto *SI = cast<StoreInst>(VL0);
13156
13157 setInsertPointAfterBundle(E);
13158
13159 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
13160 if (VecValue->getType() != VecTy)
13161 VecValue =
13162 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
13163 VecValue = FinalShuffle(VecValue, E, VecTy);
13164
13165 Value *Ptr = SI->getPointerOperand();
13166 Instruction *ST;
13167 if (E->State == TreeEntry::Vectorize) {
13168 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
13169 } else {
13170 assert(E->State == TreeEntry::StridedVectorize &&
13171 "Expected either strided or conseutive stores.");
13172 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
13173 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
13174 auto *Inst = Builder.CreateIntrinsic(
13175 Intrinsic::experimental_vp_strided_store,
13176 {VecTy, Ptr->getType(), StrideTy},
13177 {VecValue, Ptr,
13178 ConstantInt::get(
13179 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
13180 Builder.getAllOnesMask(VecTy->getElementCount()),
13181 Builder.getInt32(E->Scalars.size())});
13182 Inst->addParamAttr(
13183 /*ArgNo=*/1,
13184 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13185 ST = Inst;
13186 }
13187
13188 Value *V = propagateMetadata(ST, E->Scalars);
13189
13190 E->VectorizedValue = V;
13191 ++NumVectorInstructions;
13192 return V;
13193 }
13194 case Instruction::GetElementPtr: {
13195 auto *GEP0 = cast<GetElementPtrInst>(VL0);
13196 setInsertPointAfterBundle(E);
13197
13198 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
13199 if (E->VectorizedValue) {
13200 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13201 return E->VectorizedValue;
13202 }
13203
13204 SmallVector<Value *> OpVecs;
13205 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
13206 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
13207 if (E->VectorizedValue) {
13208 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13209 return E->VectorizedValue;
13210 }
13211 OpVecs.push_back(OpVec);
13212 }
13213
13214 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
13215 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
13216 SmallVector<Value *> GEPs;
13217 for (Value *V : E->Scalars) {
13218 if (isa<GetElementPtrInst>(V))
13219 GEPs.push_back(V);
13220 }
13221 V = propagateMetadata(I, GEPs);
13222 }
13223
13224 V = FinalShuffle(V, E, VecTy);
13225
13226 E->VectorizedValue = V;
13227 ++NumVectorInstructions;
13228
13229 return V;
13230 }
13231 case Instruction::Call: {
13232 CallInst *CI = cast<CallInst>(VL0);
13233 setInsertPointAfterBundle(E);
13234
13235 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13236
13237 SmallVector<Type *> ArgTys =
13238 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
13239 It != MinBWs.end() ? It->second.first : 0);
13240 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13241 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
13242 VecCallCosts.first <= VecCallCosts.second;
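    // Prefer the intrinsic form when it is not more expensive than a vector
    // library call; otherwise a matching vector function is looked up in the
    // VFDatabase below.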
13243
13244 Value *ScalarArg = nullptr;
13245 SmallVector<Value *> OpVecs;
13246 SmallVector<Type *, 2> TysForDecl;
13247 // Add return type if intrinsic is overloaded on it.
13248 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
13249 TysForDecl.push_back(VecTy);
13250 auto *CEI = cast<CallInst>(VL0);
13251 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
13252 ValueList OpVL;
13253 // Some intrinsics have scalar arguments. This argument should not be
13254 // vectorized.
13255 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
13256 ScalarArg = CEI->getArgOperand(I);
13257 // If it was decided to reduce the bitwidth of the abs intrinsic, its second
13258 // argument must be set to false (do not return poison if the value is signed min).
13259 if (ID == Intrinsic::abs && It != MinBWs.end() &&
13260 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
13261 ScalarArg = Builder.getFalse();
13262 OpVecs.push_back(ScalarArg);
13263 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13264 TysForDecl.push_back(ScalarArg->getType());
13265 continue;
13266 }
13267
13268 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
13269 if (E->VectorizedValue) {
13270 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13271 return E->VectorizedValue;
13272 }
13273 ScalarArg = CEI->getArgOperand(I);
13274 if (cast<VectorType>(OpVec->getType())->getElementType() !=
13275 ScalarArg->getType() &&
13276 It == MinBWs.end()) {
13277 auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
13278 VecTy->getNumElements());
13279 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
13280 } else if (It != MinBWs.end()) {
13281 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
13282 }
13283 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
13284 OpVecs.push_back(OpVec);
13285 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13286 TysForDecl.push_back(OpVec->getType());
13287 }
13288
13289 Function *CF;
13290 if (!UseIntrinsic) {
13291 VFShape Shape =
13292 VFShape::get(CI->getFunctionType(),
13293 ElementCount::getFixed(
13294 static_cast<unsigned>(VecTy->getNumElements())),
13295 false /*HasGlobalPred*/);
13296 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
13297 } else {
13298 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
13299 }
13300
13301 SmallVector<OperandBundleDef, 1> OpBundles;
13302 CI->getOperandBundlesAsDefs(OpBundles);
13303 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
13304
13305 propagateIRFlags(V, E->Scalars, VL0);
13306 V = FinalShuffle(V, E, VecTy);
13307
13308 E->VectorizedValue = V;
13309 ++NumVectorInstructions;
13310 return V;
13311 }
13312 case Instruction::ShuffleVector: {
13313 assert(E->isAltShuffle() &&
13314 ((Instruction::isBinaryOp(E->getOpcode()) &&
13315 Instruction::isBinaryOp(E->getAltOpcode())) ||
13316 (Instruction::isCast(E->getOpcode()) &&
13317 Instruction::isCast(E->getAltOpcode())) ||
13318 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13319 "Invalid Shuffle Vector Operand");
13320
13321 Value *LHS = nullptr, *RHS = nullptr;
13322 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
13323 setInsertPointAfterBundle(E);
13324 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13325 if (E->VectorizedValue) {
13326 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13327 return E->VectorizedValue;
13328 }
13329 RHS = vectorizeOperand(E, 1, PostponedPHIs);
13330 } else {
13331 setInsertPointAfterBundle(E);
13332 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13333 }
13334 if (E->VectorizedValue) {
13335 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13336 return E->VectorizedValue;
13337 }
13338 if (LHS && RHS &&
13339 ((Instruction::isBinaryOp(E->getOpcode()) &&
13340 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13341 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
13342 assert((It != MinBWs.end() ||
13343 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13344 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13345 MinBWs.contains(getOperandEntry(E, 0)) ||
13346 MinBWs.contains(getOperandEntry(E, 1))) &&
13347 "Expected item in MinBWs.");
13348 Type *CastTy = VecTy;
13349 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
13350 if (cast<VectorType>(LHS->getType())
13351 ->getElementType()
13352 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
13353 ->getElementType()
13354 ->getIntegerBitWidth())
13355 CastTy = RHS->getType();
13356 else
13357 CastTy = LHS->getType();
13358 }
13359 if (LHS->getType() != CastTy)
13360 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
13361 if (RHS->getType() != CastTy)
13362 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
13363 }
13364
13365 Value *V0, *V1;
13366 if (Instruction::isBinaryOp(E->getOpcode())) {
13367 V0 = Builder.CreateBinOp(
13368 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13369 V1 = Builder.CreateBinOp(
13370 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13371 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13372 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
13373 auto *AltCI = cast<CmpInst>(E->getAltOp());
13374 CmpInst::Predicate AltPred = AltCI->getPredicate();
13375 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
13376 } else {
13377 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13378 unsigned SrcBWSz = DL->getTypeSizeInBits(
13379 cast<VectorType>(LHS->getType())->getElementType());
13380 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13381 if (BWSz <= SrcBWSz) {
13382 if (BWSz < SrcBWSz)
13383 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
13384 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13385 if (auto *I = dyn_cast<Instruction>(LHS))
13386 LHS = propagateMetadata(I, E->Scalars);
13387 E->VectorizedValue = LHS;
13388 ++NumVectorInstructions;
13389 return LHS;
13390 }
13391 }
13392 V0 = Builder.CreateCast(
13393 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
13394 V1 = Builder.CreateCast(
13395 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
13396 }
13397 // Add V0 and V1 to later analysis to try to find and remove matching
13398 // instructions, if any.
13399 for (Value *V : {V0, V1}) {
13400 if (auto *I = dyn_cast<Instruction>(V)) {
13401 GatherShuffleExtractSeq.insert(I);
13402 CSEBlocks.insert(I->getParent());
13403 }
13404 }
13405
13406 // Create shuffle to take alternate operations from the vector.
13407 // Also, gather up main and alt scalar ops to propagate IR flags to
13408 // each vector operation.
13409 ValueList OpScalars, AltScalars;
13410 SmallVector<int> Mask;
13411 E->buildAltOpShuffleMask(
13412 [E, this](Instruction *I) {
13413 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13414 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
13415 *TLI);
13416 },
13417 Mask, &OpScalars, &AltScalars);
13418
13419 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
13420 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
13421 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13422 // Drop nuw flags for abs(sub(commutative), true).
13423 if (auto *I = dyn_cast<Instruction>(Vec);
13424 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
13425 any_of(E->Scalars, [](Value *V) {
13426 auto *IV = cast<Instruction>(V);
13427 return IV->getOpcode() == Instruction::Sub &&
13428 isCommutative(cast<Instruction>(IV));
13429 }))
13430 I->setHasNoUnsignedWrap(/*b=*/false);
13431 };
13432 DropNuwFlag(V0, E->getOpcode());
13433 DropNuwFlag(V1, E->getAltOpcode());
13434
13435 Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
13436 if (auto *I = dyn_cast<Instruction>(V)) {
13437 V = propagateMetadata(I, E->Scalars);
13438 GatherShuffleExtractSeq.insert(I);
13439 CSEBlocks.insert(I->getParent());
13440 }
13441
13442 E->VectorizedValue = V;
13443 ++NumVectorInstructions;
13444
13445 return V;
13446 }
13447 default:
13448 llvm_unreachable("unknown inst");
13449 }
13450 return nullptr;
13451}
13452
13453 Value *BoUpSLP::vectorizeTree() {
13454 ExtraValueToDebugLocsMap ExternallyUsedValues;
13455 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13456 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13457}
13458
13459namespace {
13460/// Data type for handling buildvector sequences with the reused scalars from
13461/// other tree entries.
13462struct ShuffledInsertData {
13463 /// List of insertelements to be replaced by shuffles.
13464 SmallVector<InsertElementInst *> InsertElements;
13465 /// The parent vectors and shuffle mask for the given list of inserts.
13467};
13468} // namespace
13469
13470 Value *BoUpSLP::vectorizeTree(
13471 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13472 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13473 Instruction *ReductionRoot) {
13474 // All blocks must be scheduled before any instructions are inserted.
13475 for (auto &BSIter : BlocksSchedules) {
13476 scheduleBlock(BSIter.second.get());
13477 }
13478 // Clear the Entry-to-LastInstruction table; it can be invalidated by
13479 // scheduling and needs to be rebuilt.
13480 EntryToLastInstruction.clear();
13481
13482 if (ReductionRoot)
13483 Builder.SetInsertPoint(ReductionRoot->getParent(),
13484 ReductionRoot->getIterator());
13485 else
13486 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13487
13488 // Postpone emission of PHI operands to avoid cyclic dependency issues.
13489 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13490 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13491 if (TE->State == TreeEntry::Vectorize &&
13492 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13493 TE->VectorizedValue)
13494 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
13495 // Run through the list of postponed gathers and emit them, replacing the
13496 // temporarily emitted allocas with actual vector instructions.
13497 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13498 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13499 for (const TreeEntry *E : PostponedNodes) {
13500 auto *TE = const_cast<TreeEntry *>(E);
13501 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
13502 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13503 TE->UserTreeIndices.front().EdgeIdx)) &&
13504 VecTE->isSame(TE->Scalars))
13505 // Found gather node which is absolutely the same as one of the
13506 // vectorized nodes. It may happen after reordering.
13507 continue;
13508 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13509 TE->VectorizedValue = nullptr;
13510 auto *UserI =
13511 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13512 // If the user is a PHI node, its vector code has to be inserted right before
13513 // the block terminator. Since the node was delayed, there were some unresolved
13514 // dependencies at the moment the stub instruction was emitted. If any of
13515 // these dependencies turns out to be an operand of another PHI coming from
13516 // this same block, the position of the stub instruction becomes invalid.
13517 // This is because the source vector that is supposed to feed this gather
13518 // node was inserted at the end of the block [after the stub instruction].
13519 // So we need to adjust the insertion point again to the end of the block.
13520 if (isa<PHINode>(UserI)) {
13521 // Insert before all users.
13522 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13523 for (User *U : PrevVec->users()) {
13524 if (U == UserI)
13525 continue;
13526 auto *UI = dyn_cast<Instruction>(U);
13527 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
13528 continue;
13529 if (UI->comesBefore(InsertPt))
13530 InsertPt = UI;
13531 }
13532 Builder.SetInsertPoint(InsertPt);
13533 } else {
13534 Builder.SetInsertPoint(PrevVec);
13535 }
13536 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13537 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
13538 if (Vec->getType() != PrevVec->getType()) {
13539 assert(Vec->getType()->isIntOrIntVectorTy() &&
13540 PrevVec->getType()->isIntOrIntVectorTy() &&
13541 "Expected integer vector types only.");
13542 std::optional<bool> IsSigned;
13543 for (Value *V : TE->Scalars) {
13544 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13545 auto It = MinBWs.find(BaseTE);
13546 if (It != MinBWs.end()) {
13547 IsSigned = IsSigned.value_or(false) || It->second.second;
13548 if (*IsSigned)
13549 break;
13550 }
13551 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
13552 auto It = MinBWs.find(MNTE);
13553 if (It != MinBWs.end()) {
13554 IsSigned = IsSigned.value_or(false) || It->second.second;
13555 if (*IsSigned)
13556 break;
13557 }
13558 }
13559 if (IsSigned.value_or(false))
13560 break;
13561 // Scan through gather nodes.
13562 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13563 auto It = MinBWs.find(BVE);
13564 if (It != MinBWs.end()) {
13565 IsSigned = IsSigned.value_or(false) || It->second.second;
13566 if (*IsSigned)
13567 break;
13568 }
13569 }
13570 if (IsSigned.value_or(false))
13571 break;
13572 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
13573 IsSigned =
13574 IsSigned.value_or(false) ||
13575 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
13576 continue;
13577 }
13578 if (IsSigned.value_or(false))
13579 break;
13580 }
13581 }
13582 if (IsSigned.value_or(false)) {
13583 // Final attempt - check user node.
13584 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
13585 if (It != MinBWs.end())
13586 IsSigned = It->second.second;
13587 }
13588 assert(IsSigned &&
13589 "Expected user node or perfect diamond match in MinBWs.");
13590 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
13591 }
13592 PrevVec->replaceAllUsesWith(Vec);
13593 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
13594 // Replace the stub vector node, if it was used before for one of the
13595 // buildvector nodes already.
13596 auto It = PostponedValues.find(PrevVec);
13597 if (It != PostponedValues.end()) {
13598 for (TreeEntry *VTE : It->getSecond())
13599 VTE->VectorizedValue = Vec;
13600 }
13601 eraseInstruction(PrevVec);
13602 }
13603
13604 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13605 << " values .\n");
13606
13607 SmallVector<ShuffledInsertData> ShuffledInserts;
13608 // Maps vector instruction to original insertelement instruction
13609 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13610 // Maps extract Scalar to the corresponding extractelement instruction in the
13611 // basic block. Only one extractelement per block should be emitted.
13612 DenseMap<Value *,
13613 SmallDenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13614 ScalarToEEs;
13615 SmallDenseSet<Value *, 4> UsedInserts;
13616 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13617 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13618 // Extract all of the elements with the external uses.
13619 for (const auto &ExternalUse : ExternalUses) {
13620 Value *Scalar = ExternalUse.Scalar;
13621 llvm::User *User = ExternalUse.User;
13622
13623 // Skip users that we already RAUW. This happens when one instruction
13624 // has multiple uses of the same value.
13625 if (User && !is_contained(Scalar->users(), User))
13626 continue;
13627 TreeEntry *E = getTreeEntry(Scalar);
13628 assert(E && "Invalid scalar");
13629 assert(E->State != TreeEntry::NeedToGather &&
13630 "Extracting from a gather list");
13631 // Non-instruction pointers are not deleted, just skip them.
13632 if (E->getOpcode() == Instruction::GetElementPtr &&
13633 !isa<GetElementPtrInst>(Scalar))
13634 continue;
13635
13636 Value *Vec = E->VectorizedValue;
13637 assert(Vec && "Can't find vectorizable value");
13638
13639 Value *Lane = Builder.getInt32(ExternalUse.Lane);
13640 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13641 if (Scalar->getType() != Vec->getType()) {
13642 Value *Ex = nullptr;
13643 Value *ExV = nullptr;
13644 auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
13645 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
13646 auto It = ScalarToEEs.find(Scalar);
13647 if (It != ScalarToEEs.end()) {
13648 // No need to emit many extracts, just move the only one in the
13649 // current block.
13650 auto EEIt = It->second.find(Builder.GetInsertBlock());
13651 if (EEIt != It->second.end()) {
13652 Instruction *I = EEIt->second.first;
13653 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13654 Builder.GetInsertPoint()->comesBefore(I)) {
13655 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
13656 Builder.GetInsertPoint());
13657 if (auto *CI = EEIt->second.second)
13658 CI->moveAfter(I);
13659 }
13660 Ex = I;
13661 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13662 }
13663 }
13664 if (!Ex) {
13665 // "Reuse" the existing extract to improve final codegen.
13666 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13667 Value *V = ES->getVectorOperand();
13668 if (const TreeEntry *ETE = getTreeEntry(V))
13669 V = ETE->VectorizedValue;
13670 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
13671 } else if (ReplaceGEP) {
13672 // Leave the GEPs as is, they are free in most cases and better to
13673 // keep them as GEPs.
13674 auto *CloneGEP = GEP->clone();
13675 if (isa<Instruction>(Vec))
13676 CloneGEP->insertBefore(*Builder.GetInsertBlock(),
13677 Builder.GetInsertPoint());
13678 else
13679 CloneGEP->insertBefore(GEP);
13680 if (GEP->hasName())
13681 CloneGEP->takeName(GEP);
13682 Ex = CloneGEP;
13683 } else {
13684 Ex = Builder.CreateExtractElement(Vec, Lane);
13685 }
13686 // If necessary, sign-extend or zero-extend ScalarRoot
13687 // to the larger type.
13688 ExV = Ex;
13689 if (Scalar->getType() != Ex->getType())
13690 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
13691 MinBWs.find(E)->second.second);
13692 if (auto *I = dyn_cast<Instruction>(Ex))
13693 ScalarToEEs[Scalar].try_emplace(
13694 Builder.GetInsertBlock(),
13695 std::make_pair(I, cast<Instruction>(ExV)));
13696 }
13697 // The then branch of the previous if may produce constants, since 0
13698 // operand might be a constant.
13699 if (auto *ExI = dyn_cast<Instruction>(Ex)) {
13700 GatherShuffleExtractSeq.insert(ExI);
13701 CSEBlocks.insert(ExI->getParent());
13702 }
13703 return ExV;
13704 }
13705 assert(isa<FixedVectorType>(Scalar->getType()) &&
13706 isa<InsertElementInst>(Scalar) &&
13707 "In-tree scalar of vector type is not insertelement?");
13708 auto *IE = cast<InsertElementInst>(Scalar);
13709 VectorToInsertElement.try_emplace(Vec, IE);
13710 return Vec;
13711 };
13712 // If User == nullptr, the Scalar remains as scalar in vectorized
13713 // instructions or is used as extra arg. Generate ExtractElement instruction
13714 // and update the record for this scalar in ExternallyUsedValues.
13715 if (!User) {
13716 if (!ScalarsWithNullptrUser.insert(Scalar).second)
13717 continue;
13718 assert((ExternallyUsedValues.count(Scalar) ||
13719 any_of(Scalar->users(),
13720 [&](llvm::User *U) {
13721 if (ExternalUsesAsGEPs.contains(U))
13722 return true;
13723 TreeEntry *UseEntry = getTreeEntry(U);
13724 return UseEntry &&
13725 (UseEntry->State == TreeEntry::Vectorize ||
13726 UseEntry->State ==
13727 TreeEntry::StridedVectorize) &&
13728 (E->State == TreeEntry::Vectorize ||
13729 E->State == TreeEntry::StridedVectorize) &&
13730 doesInTreeUserNeedToExtract(
13731 Scalar,
13732 cast<Instruction>(UseEntry->Scalars.front()),
13733 TLI);
13734 })) &&
13735 "Scalar with nullptr User must be registered in "
13736 "ExternallyUsedValues map or remain as scalar in vectorized "
13737 "instructions");
13738 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13739 if (auto *PHI = dyn_cast<PHINode>(VecI))
13740 Builder.SetInsertPoint(PHI->getParent(),
13741 PHI->getParent()->getFirstNonPHIIt());
13742 else
13743 Builder.SetInsertPoint(VecI->getParent(),
13744 std::next(VecI->getIterator()));
13745 } else {
13746 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13747 }
13748 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13749 // Required to update internally referenced instructions.
13750 Scalar->replaceAllUsesWith(NewInst);
13751 ReplacedExternals.emplace_back(Scalar, NewInst);
13752 continue;
13753 }
13754
13755 if (auto *VU = dyn_cast<InsertElementInst>(User)) {
13756 // Skip if the scalar is another vector op or Vec is not an instruction.
13757 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13758 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
13759 if (!UsedInserts.insert(VU).second)
13760 continue;
13761 // Need to use original vector, if the root is truncated.
13762 auto BWIt = MinBWs.find(E);
13763 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13764 auto *ScalarTy = FTy->getElementType();
13765 auto Key = std::make_pair(Vec, ScalarTy);
13766 auto VecIt = VectorCasts.find(Key);
13767 if (VecIt == VectorCasts.end()) {
13768 IRBuilderBase::InsertPointGuard Guard(Builder);
13769 if (auto *IVec = dyn_cast<PHINode>(Vec))
13770 Builder.SetInsertPoint(
13771 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
13772 else if (auto *IVec = dyn_cast<Instruction>(Vec))
13773 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
13774 Vec = Builder.CreateIntCast(
13775 Vec,
13776 FixedVectorType::get(
13777 ScalarTy,
13778 cast<FixedVectorType>(Vec->getType())->getNumElements()),
13779 BWIt->second.second);
13780 VectorCasts.try_emplace(Key, Vec);
13781 } else {
13782 Vec = VecIt->second;
13783 }
13784 }
13785
13786 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
13787 if (InsertIdx) {
13788 auto *It =
13789 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
13790 // Checks if 2 insertelements are from the same buildvector.
13791 InsertElementInst *VecInsert = Data.InsertElements.front();
13792 return areTwoInsertFromSameBuildVector(
13793 VU, VecInsert,
13794 [](InsertElementInst *II) { return II->getOperand(0); });
13795 });
13796 unsigned Idx = *InsertIdx;
13797 if (It == ShuffledInserts.end()) {
13798 (void)ShuffledInserts.emplace_back();
13799 It = std::next(ShuffledInserts.begin(),
13800 ShuffledInserts.size() - 1);
13801 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13802 if (Mask.empty())
13803 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13804 // Find the insertvector, vectorized in tree, if any.
13805 Value *Base = VU;
13806 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
13807 if (IEBase != User &&
13808 (!IEBase->hasOneUse() ||
13809 getInsertIndex(IEBase).value_or(Idx) == Idx))
13810 break;
13811 // Build the mask for the vectorized insertelement instructions.
13812 if (const TreeEntry *E = getTreeEntry(IEBase)) {
13813 do {
13814 IEBase = cast<InsertElementInst>(Base);
13815 int IEIdx = *getInsertIndex(IEBase);
13816 assert(Mask[IEIdx] == PoisonMaskElem &&
13817 "InsertElementInstruction used already.");
13818 Mask[IEIdx] = IEIdx;
13819 Base = IEBase->getOperand(0);
13820 } while (E == getTreeEntry(Base));
13821 break;
13822 }
13823 Base = cast<InsertElementInst>(Base)->getOperand(0);
13824 // After the vectorization the def-use chain has changed, so we need
13825 // to look through the original insertelement instructions if they
13826 // got replaced by vector instructions.
13827 auto It = VectorToInsertElement.find(Base);
13828 if (It != VectorToInsertElement.end())
13829 Base = It->second;
13830 }
13831 }
13832 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13833 if (Mask.empty())
13834 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13835 Mask[Idx] = ExternalUse.Lane;
13836 It->InsertElements.push_back(cast<InsertElementInst>(User));
13837 continue;
13838 }
13839 }
13840 }
13841 }
13842
13843 // Generate extracts for out-of-tree users.
13844 // Find the insertion point for the extractelement lane.
13845 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13846 if (PHINode *PH = dyn_cast<PHINode>(User)) {
13847 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13848 if (PH->getIncomingValue(I) == Scalar) {
13849 Instruction *IncomingTerminator =
13850 PH->getIncomingBlock(I)->getTerminator();
13851 if (isa<CatchSwitchInst>(IncomingTerminator)) {
13852 Builder.SetInsertPoint(VecI->getParent(),
13853 std::next(VecI->getIterator()));
13854 } else {
13855 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
13856 }
13857 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13858 PH->setOperand(I, NewInst);
13859 }
13860 }
13861 } else {
13862 Builder.SetInsertPoint(cast<Instruction>(User));
13863 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13864 User->replaceUsesOfWith(Scalar, NewInst);
13865 }
13866 } else {
13867 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13868 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13869 User->replaceUsesOfWith(Scalar, NewInst);
13870 }
13871
13872 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
13873 }
13874
13875 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
13876 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
13877 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
13878 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
13879 for (int I = 0, E = Mask.size(); I < E; ++I) {
13880 if (Mask[I] < VF)
13881 CombinedMask1[I] = Mask[I];
13882 else
13883 CombinedMask2[I] = Mask[I] - VF;
13884 }
13885 ShuffleInstructionBuilder ShuffleBuilder(
13886 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
13887 ShuffleBuilder.add(V1, CombinedMask1);
13888 if (V2)
13889 ShuffleBuilder.add(V2, CombinedMask2);
13890 return ShuffleBuilder.finalize(std::nullopt);
13891 };
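// Worked example for CreateShuffle (made-up values): with VF = 4 and
// Mask = {0, 5, 2, 7}, indices below VF go to CombinedMask1 = {0, -1, 2, -1}
// (lanes taken from V1) and the remaining indices, rebased by VF, go to
// CombinedMask2 = {-1, 1, -1, 3} (lanes taken from V2); both masks are then
// handed to the ShuffleInstructionBuilder to form the final shuffle.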
13892
13893 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
13894 bool ForSingleMask) {
13895 unsigned VF = Mask.size();
13896 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
13897 if (VF != VecVF) {
13898 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
13899 Vec = CreateShuffle(Vec, nullptr, Mask);
13900 return std::make_pair(Vec, true);
13901 }
13902 if (!ForSingleMask) {
13903 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
13904 for (unsigned I = 0; I < VF; ++I) {
13905 if (Mask[I] != PoisonMaskElem)
13906 ResizeMask[Mask[I]] = Mask[I];
13907 }
13908 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
13909 }
13910 }
13911
13912 return std::make_pair(Vec, false);
13913 };
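// Worked example for ResizeToVF (made-up values): if Vec has 4 elements but
// Mask = {0, 1, 2, 3, -1, -1, -1, -1} has size 8 and ForSingleMask is false,
// ResizeMask becomes {0, 1, 2, 3, -1, -1, -1, -1} and Vec is widened to 8
// lanes by a single shuffle; the second member of the returned pair stays
// false because no lane of Mask referenced an element past VF.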
13914 // Perform shuffling of the vectorized tree entries for better handling of
13915 // external extracts.
13916 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
13917 // Find the first and the last instruction in the list of insertelements.
13918 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
13919 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
13920 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
13921 Builder.SetInsertPoint(LastInsert);
13922 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
13923 Value *NewInst = performExtractsShuffleAction<Value>(
13924 MutableArrayRef(Vector.data(), Vector.size()),
13925 FirstInsert->getOperand(0),
13926 [](Value *Vec) {
13927 return cast<VectorType>(Vec->getType())
13928 ->getElementCount()
13929 .getKnownMinValue();
13930 },
13931 ResizeToVF,
13932 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
13933 ArrayRef<Value *> Vals) {
13934 assert((Vals.size() == 1 || Vals.size() == 2) &&
13935 "Expected exactly 1 or 2 input values.");
13936 if (Vals.size() == 1) {
13937 // Do not create shuffle if the mask is a simple identity
13938 // non-resizing mask.
13939 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
13940 ->getNumElements() ||
13941 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
13942 return CreateShuffle(Vals.front(), nullptr, Mask);
13943 return Vals.front();
13944 }
13945 return CreateShuffle(Vals.front() ? Vals.front()
13946 : FirstInsert->getOperand(0),
13947 Vals.back(), Mask);
13948 });
13949 auto It = ShuffledInserts[I].InsertElements.rbegin();
13950 // Rebuild buildvector chain.
13951 InsertElementInst *II = nullptr;
13952 if (It != ShuffledInserts[I].InsertElements.rend())
13953 II = *It;
13954 SmallVector<Instruction *> Inserts;
13955 while (It != ShuffledInserts[I].InsertElements.rend()) {
13956 assert(II && "Must be an insertelement instruction.");
13957 if (*It == II)
13958 ++It;
13959 else
13960 Inserts.push_back(cast<Instruction>(II));
13961 II = dyn_cast<InsertElementInst>(II->getOperand(0));
13962 }
13963 for (Instruction *II : reverse(Inserts)) {
13964 II->replaceUsesOfWith(II->getOperand(0), NewInst);
13965 if (auto *NewI = dyn_cast<Instruction>(NewInst))
13966 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
13967 II->moveAfter(NewI);
13968 NewInst = II;
13969 }
13970 LastInsert->replaceAllUsesWith(NewInst);
13971 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
13972 IE->replaceUsesOfWith(IE->getOperand(0),
13973 PoisonValue::get(IE->getOperand(0)->getType()));
13974 IE->replaceUsesOfWith(IE->getOperand(1),
13975 PoisonValue::get(IE->getOperand(1)->getType()));
13976 eraseInstruction(IE);
13977 }
13978 CSEBlocks.insert(LastInsert->getParent());
13979 }
13980
13981 SmallVector<Instruction *> RemovedInsts;
13982 // For each vectorized value:
13983 for (auto &TEPtr : VectorizableTree) {
13984 TreeEntry *Entry = TEPtr.get();
13985
13986 // No need to handle users of gathered values.
13987 if (Entry->State == TreeEntry::NeedToGather)
13988 continue;
13989
13990 assert(Entry->VectorizedValue && "Can't find vectorizable value");
13991
13992 // For each lane:
13993 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
13994 Value *Scalar = Entry->Scalars[Lane];
13995
13996 if (Entry->getOpcode() == Instruction::GetElementPtr &&
13997 !isa<GetElementPtrInst>(Scalar))
13998 continue;
13999#ifndef NDEBUG
14000 Type *Ty = Scalar->getType();
14001 if (!Ty->isVoidTy()) {
14002 for (User *U : Scalar->users()) {
14003 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
14004
14005 // It is legal to delete users in the ignorelist.
14006 assert((getTreeEntry(U) ||
14007 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14008 (isa_and_nonnull<Instruction>(U) &&
14009 isDeleted(cast<Instruction>(U)))) &&
14010 "Deleting out-of-tree value");
14011 }
14012 }
14013#endif
14014 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
14015 eraseInstruction(cast<Instruction>(Scalar));
14016 // Retain to-be-deleted instructions for some debug-info
14017 // bookkeeping. NOTE: eraseInstruction only marks the instruction for
14018 // deletion - instructions are not deleted until later.
14019 RemovedInsts.push_back(cast<Instruction>(Scalar));
14020 }
14021 }
14022
14023 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
14024 // new vector instruction.
14025 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
14026 V->mergeDIAssignID(RemovedInsts);
14027
14028 Builder.ClearInsertionPoint();
14029 InstrElementSize.clear();
14030
14031 const TreeEntry &RootTE = *VectorizableTree.front().get();
14032 Value *Vec = RootTE.VectorizedValue;
14033 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
14034 It != MinBWs.end() &&
14035 ReductionBitWidth != It->second.first) {
14036 IRBuilder<>::InsertPointGuard Guard(Builder);
14037 Builder.SetInsertPoint(ReductionRoot->getParent(),
14038 ReductionRoot->getIterator());
14039 Vec = Builder.CreateIntCast(
14040 Vec,
14041 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
14042 cast<VectorType>(Vec->getType())->getElementCount()),
14043 It->second.second);
14044 }
14045 return Vec;
14046}
14047
14048 void BoUpSLP::optimizeGatherSequence() {
14049 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
14050 << " gather sequence instructions.\n");
14051 // LICM InsertElementInst sequences.
14052 for (Instruction *I : GatherShuffleExtractSeq) {
14053 if (isDeleted(I))
14054 continue;
14055
14056 // Check if this block is inside a loop.
14057 Loop *L = LI->getLoopFor(I->getParent());
14058 if (!L)
14059 continue;
14060
14061 // Check if it has a preheader.
14062 BasicBlock *PreHeader = L->getLoopPreheader();
14063 if (!PreHeader)
14064 continue;
14065
14066 // If the vector or the element that we insert into it are
14067 // instructions that are defined in this basic block then we can't
14068 // hoist this instruction.
14069 if (any_of(I->operands(), [L](Value *V) {
14070 auto *OpI = dyn_cast<Instruction>(V);
14071 return OpI && L->contains(OpI);
14072 }))
14073 continue;
14074
14075 // We can hoist this instruction. Move it to the pre-header.
14076 I->moveBefore(PreHeader->getTerminator());
14077 CSEBlocks.insert(PreHeader);
14078 }
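// Sketch of the effect (hypothetical IR): an insertelement/shufflevector
// gather sequence whose operands are all defined outside the loop, e.g.
//   loop:
//     %g = insertelement <4 x i32> poison, i32 %x, i32 0
// is moved in front of the preheader terminator, so the gather is materialized
// once instead of on every iteration.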
14079
14080 // Make a list of all reachable blocks in our CSE queue.
14081 SmallVector<const DomTreeNode *, 8> CSEWorkList;
14082 CSEWorkList.reserve(CSEBlocks.size());
14083 for (BasicBlock *BB : CSEBlocks)
14084 if (DomTreeNode *N = DT->getNode(BB)) {
14086 CSEWorkList.push_back(N);
14087 }
14088
14089 // Sort blocks by domination. This ensures we visit a block after all blocks
14090 // dominating it are visited.
14091 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
14092 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
14093 "Different nodes should have different DFS numbers");
14094 return A->getDFSNumIn() < B->getDFSNumIn();
14095 });
14096
14097 // Less defined shuffles can be replaced by the more defined copies.
14098 // Between two shuffles, one is less defined if it has the same vector
14099 // operands and its mask indices are either the same as in the other one or
14100 // undef. E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined than
14101 // shuffle %0, poison, <0, 0, 0, 0>.
14102 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
14103 SmallVectorImpl<int> &NewMask) {
14104 if (I1->getType() != I2->getType())
14105 return false;
14106 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
14107 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
14108 if (!SI1 || !SI2)
14109 return I1->isIdenticalTo(I2);
14110 if (SI1->isIdenticalTo(SI2))
14111 return true;
14112 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
14113 if (SI1->getOperand(I) != SI2->getOperand(I))
14114 return false;
14115 // Check if the second instruction is more defined than the first one.
14116 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
14117 ArrayRef<int> SM1 = SI1->getShuffleMask();
14118 // Count trailing undefs in the mask to check the final number of used
14119 // registers.
14120 unsigned LastUndefsCnt = 0;
14121 for (int I = 0, E = NewMask.size(); I < E; ++I) {
14122 if (SM1[I] == PoisonMaskElem)
14123 ++LastUndefsCnt;
14124 else
14125 LastUndefsCnt = 0;
14126 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
14127 NewMask[I] != SM1[I])
14128 return false;
14129 if (NewMask[I] == PoisonMaskElem)
14130 NewMask[I] = SM1[I];
14131 }
14132 // Check if the last undefs actually change the final number of used vector
14133 // registers.
14134 return SM1.size() - LastUndefsCnt > 1 &&
14135 TTI->getNumberOfParts(SI1->getType()) ==
14136 TTI->getNumberOfParts(
14137 FixedVectorType::get(SI1->getType()->getElementType(),
14138 SM1.size() - LastUndefsCnt));
14139 };
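// Worked example (made-up masks): for SI1 with mask <0, -1, 2, -1> and SI2
// with mask <0, 1, 2, 3> over the same operands, every defined index of SI1
// matches SI2, so SI1 is identical-or-less-defined and NewMask ends up as
// <0, 1, 2, 3>; the trailing-undef check only rejects the replacement when
// dropping the trailing undefs of SI1 would already shrink the number of
// vector registers in use.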
14140 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
14141 // instructions. TODO: We can further optimize this scan if we split the
14142 // instructions into different buckets based on the insert lane.
14143 SmallVector<Instruction *, 16> Visited;
14144 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
14145 assert(*I &&
14146 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
14147 "Worklist not sorted properly!");
14148 BasicBlock *BB = (*I)->getBlock();
14149 // For all instructions in blocks containing gather sequences:
14150 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
14151 if (isDeleted(&In))
14152 continue;
14153 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
14154 !GatherShuffleExtractSeq.contains(&In))
14155 continue;
14156
14157 // Check if we can replace this instruction with any of the
14158 // visited instructions.
14159 bool Replaced = false;
14160 for (Instruction *&V : Visited) {
14161 SmallVector<int> NewMask;
14162 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14163 DT->dominates(V->getParent(), In.getParent())) {
14164 In.replaceAllUsesWith(V);
14165 eraseInstruction(&In);
14166 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
14167 if (!NewMask.empty())
14168 SI->setShuffleMask(NewMask);
14169 Replaced = true;
14170 break;
14171 }
14172 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
14173 GatherShuffleExtractSeq.contains(V) &&
14174 IsIdenticalOrLessDefined(V, &In, NewMask) &&
14175 DT->dominates(In.getParent(), V->getParent())) {
14176 In.moveAfter(V);
14177 V->replaceAllUsesWith(&In);
14178 eraseInstruction(V);
14179 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
14180 if (!NewMask.empty())
14181 SI->setShuffleMask(NewMask);
14182 V = &In;
14183 Replaced = true;
14184 break;
14185 }
14186 }
14187 if (!Replaced) {
14188 assert(!is_contained(Visited, &In));
14189 Visited.push_back(&In);
14190 }
14191 }
14192 }
14193 CSEBlocks.clear();
14194 GatherShuffleExtractSeq.clear();
14195}
14196
14197BoUpSLP::ScheduleData *
14198BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
14199 ScheduleData *Bundle = nullptr;
14200 ScheduleData *PrevInBundle = nullptr;
14201 for (Value *V : VL) {
14202 if (doesNotNeedToBeScheduled(V))
14203 continue;
14204 ScheduleData *BundleMember = getScheduleData(V);
14205 assert(BundleMember &&
14206 "no ScheduleData for bundle member "
14207 "(maybe not in same basic block)");
14208 assert(BundleMember->isSchedulingEntity() &&
14209 "bundle member already part of other bundle");
14210 if (PrevInBundle) {
14211 PrevInBundle->NextInBundle = BundleMember;
14212 } else {
14213 Bundle = BundleMember;
14214 }
14215
14216 // Group the instructions into a bundle.
14217 BundleMember->FirstInBundle = Bundle;
14218 PrevInBundle = BundleMember;
14219 }
14220 assert(Bundle && "Failed to find schedule bundle");
14221 return Bundle;
14222}
14223
14224 // Groups the instructions into a bundle (which is then a single scheduling
14225 // entity) and schedules instructions until the bundle gets ready.
14226std::optional<BoUpSLP::ScheduleData *>
14227BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
14228 const InstructionsState &S) {
14229 // No need to schedule PHIs, insertelement, extractelement and extractvalue
14230 // instructions.
14231 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
14232 doesNotNeedToSchedule(VL))
14233 return nullptr;
14234
14235 // Initialize the instruction bundle.
14236 Instruction *OldScheduleEnd = ScheduleEnd;
14237 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
14238
14239 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
14240 ScheduleData *Bundle) {
14241 // The scheduling region got new instructions at the lower end (or it is a
14242 // new region for the first bundle). This makes it necessary to
14243 // recalculate all dependencies.
14244 // It is seldom that this needs to be done a second time after adding the
14245 // initial bundle to the region.
14246 if (ScheduleEnd != OldScheduleEnd) {
14247 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
14248 doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
14249 ReSchedule = true;
14250 }
14251 if (Bundle) {
14252 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
14253 << " in block " << BB->getName() << "\n");
14254 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
14255 }
14256
14257 if (ReSchedule) {
14258 resetSchedule();
14259 initialFillReadyList(ReadyInsts);
14260 }
14261
14262 // Now try to schedule the new bundle or (if no bundle) just calculate
14263 // dependencies. As soon as the bundle is "ready" it means that there are no
14264 // cyclic dependencies and we can schedule it. Note that it's important that
14265 // we don't "schedule" the bundle yet (see cancelScheduling).
14266 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14267 !ReadyInsts.empty()) {
14268 ScheduleData *Picked = ReadyInsts.pop_back_val();
14269 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14270 "must be ready to schedule");
14271 schedule(Picked, ReadyInsts);
14272 }
14273 };
14274
14275 // Make sure that the scheduling region contains all
14276 // instructions of the bundle.
14277 for (Value *V : VL) {
14278 if (doesNotNeedToBeScheduled(V))
14279 continue;
14280 if (!extendSchedulingRegion(V, S)) {
14281 // The scheduling region got new instructions at the lower end (or it is a
14282 // new region for the first bundle), which makes it necessary to
14283 // recalculate all dependencies.
14284 // Otherwise the compiler may crash trying to incorrectly calculate
14285 // dependencies and may emit instructions in the wrong order during the
14286 // actual scheduling.
14287 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
14288 return std::nullopt;
14289 }
14290 }
14291
14292 bool ReSchedule = false;
14293 for (Value *V : VL) {
14294 if (doesNotNeedToBeScheduled(V))
14295 continue;
14296 ScheduleData *BundleMember = getScheduleData(V);
14297 assert(BundleMember &&
14298 "no ScheduleData for bundle member (maybe not in same basic block)");
14299
14300 // Make sure we don't leave the pieces of the bundle in the ready list when
14301 // the whole bundle might not be ready.
14302 ReadyInsts.remove(BundleMember);
14303
14304 if (!BundleMember->IsScheduled)
14305 continue;
14306 // A bundle member was scheduled as a single instruction before and now
14307 // needs to be scheduled as part of the bundle. We just get rid of the
14308 // existing schedule.
14309 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
14310 << " was already scheduled\n");
14311 ReSchedule = true;
14312 }
14313
14314 auto *Bundle = buildBundle(VL);
14315 TryScheduleBundleImpl(ReSchedule, Bundle);
14316 if (!Bundle->isReady()) {
14317 cancelScheduling(VL, S.OpValue);
14318 return std::nullopt;
14319 }
14320 return Bundle;
14321}
14322
14323void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
14324 Value *OpValue) {
14325 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
14326 doesNotNeedToSchedule(VL))
14327 return;
14328
14329 if (doesNotNeedToBeScheduled(OpValue))
14330 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
14331 ScheduleData *Bundle = getScheduleData(OpValue);
14332 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
14333 assert(!Bundle->IsScheduled &&
14334 "Can't cancel bundle which is already scheduled");
14335 assert(Bundle->isSchedulingEntity() &&
14336 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
14337 "tried to unbundle something which is not a bundle");
14338
14339 // Remove the bundle from the ready list.
14340 if (Bundle->isReady())
14341 ReadyInsts.remove(Bundle);
14342
14343 // Un-bundle: make single instructions out of the bundle.
14344 ScheduleData *BundleMember = Bundle;
14345 while (BundleMember) {
14346 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
14347 BundleMember->FirstInBundle = BundleMember;
14348 ScheduleData *Next = BundleMember->NextInBundle;
14349 BundleMember->NextInBundle = nullptr;
14350 BundleMember->TE = nullptr;
14351 if (BundleMember->unscheduledDepsInBundle() == 0) {
14352 ReadyInsts.insert(BundleMember);
14353 }
14354 BundleMember = Next;
14355 }
14356}
14357
14358BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14359 // Allocate a new ScheduleData for the instruction.
14360 if (ChunkPos >= ChunkSize) {
14361 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14362 ChunkPos = 0;
14363 }
14364 return &(ScheduleDataChunks.back()[ChunkPos++]);
14365}
14366
14367bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14368 const InstructionsState &S) {
14369 if (getScheduleData(V, isOneOf(S, V)))
14370 return true;
14371 Instruction *I = dyn_cast<Instruction>(V);
14372 assert(I && "bundle member must be an instruction");
14373 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14375 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14376 "be scheduled");
14377 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14378 ScheduleData *ISD = getScheduleData(I);
14379 if (!ISD)
14380 return false;
14381 assert(isInSchedulingRegion(ISD) &&
14382 "ScheduleData not in scheduling region");
14383 ScheduleData *SD = allocateScheduleDataChunks();
14384 SD->Inst = I;
14385 SD->init(SchedulingRegionID, S.OpValue);
14386 ExtraScheduleDataMap[I][S.OpValue] = SD;
14387 return true;
14388 };
14389 if (CheckScheduleForI(I))
14390 return true;
14391 if (!ScheduleStart) {
14392 // It's the first instruction in the new region.
14393 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
14394 ScheduleStart = I;
14395 ScheduleEnd = I->getNextNode();
14396 if (isOneOf(S, I) != I)
14397 CheckScheduleForI(I);
14398 assert(ScheduleEnd && "tried to vectorize a terminator?");
14399 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
14400 return true;
14401 }
14402 // Search up and down at the same time, because we don't know if the new
14403 // instruction is above or below the existing scheduling region.
14404 // Ignore debug info (and other "AssumeLike" intrinsics) so it is not counted
14405 // against the budget. Otherwise debug info could affect codegen.
14406 BasicBlock::reverse_iterator UpIter =
14407 ++ScheduleStart->getIterator().getReverse();
14408 BasicBlock::reverse_iterator UpperEnd = BB->rend();
14409 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14410 BasicBlock::iterator LowerEnd = BB->end();
14411 auto IsAssumeLikeIntr = [](const Instruction &I) {
14412 if (auto *II = dyn_cast<IntrinsicInst>(&I))
14413 return II->isAssumeLikeIntrinsic();
14414 return false;
14415 };
14416 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14417 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14418 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14419 &*DownIter != I) {
14420 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14421 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
14422 return false;
14423 }
14424
14425 ++UpIter;
14426 ++DownIter;
14427
14428 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14429 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14430 }
14431 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14432 assert(I->getParent() == ScheduleStart->getParent() &&
14433 "Instruction is in wrong basic block.");
14434 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
14435 ScheduleStart = I;
14436 if (isOneOf(S, I) != I)
14437 CheckScheduleForI(I);
14438 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
14439 << "\n");
14440 return true;
14441 }
14442 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14443 "Expected to reach top of the basic block or instruction down the "
14444 "lower end.");
14445 assert(I->getParent() == ScheduleEnd->getParent() &&
14446 "Instruction is in wrong basic block.");
14447 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
14448 nullptr);
14449 ScheduleEnd = I->getNextNode();
14450 if (isOneOf(S, I) != I)
14451 CheckScheduleForI(I);
14452 assert(ScheduleEnd && "tried to vectorize a terminator?");
14453 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14454 return true;
14455}
14456
14457void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14458 Instruction *ToI,
14459 ScheduleData *PrevLoadStore,
14460 ScheduleData *NextLoadStore) {
14461 ScheduleData *CurrentLoadStore = PrevLoadStore;
14462 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14463 // No need to allocate data for non-schedulable instructions.
14464 if (doesNotNeedToBeScheduled(I))
14465 continue;
14466 ScheduleData *SD = ScheduleDataMap.lookup(I);
14467 if (!SD) {
14468 SD = allocateScheduleDataChunks();
14469 ScheduleDataMap[I] = SD;
14470 SD->Inst = I;
14471 }
14472 assert(!isInSchedulingRegion(SD) &&
14473 "new ScheduleData already in scheduling region");
14474 SD->init(SchedulingRegionID, I);
14475
14476 if (I->mayReadOrWriteMemory() &&
14477 (!isa<IntrinsicInst>(I) ||
14478 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14479 cast<IntrinsicInst>(I)->getIntrinsicID() !=
14480 Intrinsic::pseudoprobe))) {
14481 // Update the linked list of memory accessing instructions.
14482 if (CurrentLoadStore) {
14483 CurrentLoadStore->NextLoadStore = SD;
14484 } else {
14485 FirstLoadStoreInRegion = SD;
14486 }
14487 CurrentLoadStore = SD;
14488 }
14489
14490 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14491 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14492 RegionHasStackSave = true;
14493 }
14494 if (NextLoadStore) {
14495 if (CurrentLoadStore)
14496 CurrentLoadStore->NextLoadStore = NextLoadStore;
14497 } else {
14498 LastLoadStoreInRegion = CurrentLoadStore;
14499 }
14500}
14501
14502void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14503 bool InsertInReadyList,
14504 BoUpSLP *SLP) {
14505 assert(SD->isSchedulingEntity());
14506
14507 SmallVector<ScheduleData *, 10> WorkList;
14508 WorkList.push_back(SD);
14509
14510 while (!WorkList.empty()) {
14511 ScheduleData *SD = WorkList.pop_back_val();
14512 for (ScheduleData *BundleMember = SD; BundleMember;
14513 BundleMember = BundleMember->NextInBundle) {
14514 assert(isInSchedulingRegion(BundleMember));
14515 if (BundleMember->hasValidDependencies())
14516 continue;
14517
14518 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14519 << "\n");
14520 BundleMember->Dependencies = 0;
14521 BundleMember->resetUnscheduledDeps();
14522
14523 // Handle def-use chain dependencies.
14524 if (BundleMember->OpValue != BundleMember->Inst) {
14525 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14526 BundleMember->Dependencies++;
14527 ScheduleData *DestBundle = UseSD->FirstInBundle;
14528 if (!DestBundle->IsScheduled)
14529 BundleMember->incrementUnscheduledDeps(1);
14530 if (!DestBundle->hasValidDependencies())
14531 WorkList.push_back(DestBundle);
14532 }
14533 } else {
14534 for (User *U : BundleMember->Inst->users()) {
14535 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14536 BundleMember->Dependencies++;
14537 ScheduleData *DestBundle = UseSD->FirstInBundle;
14538 if (!DestBundle->IsScheduled)
14539 BundleMember->incrementUnscheduledDeps(1);
14540 if (!DestBundle->hasValidDependencies())
14541 WorkList.push_back(DestBundle);
14542 }
14543 }
14544 }
14545
14546 auto MakeControlDependent = [&](Instruction *I) {
14547 auto *DepDest = getScheduleData(I);
14548 assert(DepDest && "must be in schedule window");
14549 DepDest->ControlDependencies.push_back(BundleMember);
14550 BundleMember->Dependencies++;
14551 ScheduleData *DestBundle = DepDest->FirstInBundle;
14552 if (!DestBundle->IsScheduled)
14553 BundleMember->incrementUnscheduledDeps(1);
14554 if (!DestBundle->hasValidDependencies())
14555 WorkList.push_back(DestBundle);
14556 };
14557
14558 // Any instruction which isn't safe to speculate at the beginning of the
14559 // block is control dependent on any early exit or non-willreturn call
14560 // which precedes it.
14561 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
14562 for (Instruction *I = BundleMember->Inst->getNextNode();
14563 I != ScheduleEnd; I = I->getNextNode()) {
14564 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
14565 continue;
14566
14567 // Add the dependency
14568 MakeControlDependent(I);
14569
14571 // Everything past here must be control dependent on I.
14572 break;
14573 }
14574 }
14575
14576 if (RegionHasStackSave) {
14577 // If we have an inalloca alloca instruction, it needs to be scheduled
14578 // after any preceding stacksave. We also need to prevent any alloca
14579 // from reordering above a preceding stackrestore.
14580 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14581 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14582 for (Instruction *I = BundleMember->Inst->getNextNode();
14583 I != ScheduleEnd; I = I->getNextNode()) {
14584 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14585 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14586 // Any allocas past here must be control dependent on I, and I
14587 // must be memory dependent on BundleMember->Inst.
14588 break;
14589
14590 if (!isa<AllocaInst>(I))
14591 continue;
14592
14593 // Add the dependency
14594 MakeControlDependent(I);
14595 }
14596 }
14597
14598 // In addition to the cases handled just above, we need to prevent
14599 // allocas and loads/stores from moving below a stacksave or a
14600 // stackrestore. Avoiding moving allocas below a stackrestore is currently
14601 // believed to be merely conservative. Moving loads/stores below a
14602 // stackrestore can lead to incorrect code.
14603 if (isa<AllocaInst>(BundleMember->Inst) ||
14604 BundleMember->Inst->mayReadOrWriteMemory()) {
14605 for (Instruction *I = BundleMember->Inst->getNextNode();
14606 I != ScheduleEnd; I = I->getNextNode()) {
14607 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14608 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14609 continue;
14610
14611 // Add the dependency
14612 MakeControlDependent(I);
14613 break;
14614 }
14615 }
14616 }
14617
14618 // Handle the memory dependencies (if any).
14619 ScheduleData *DepDest = BundleMember->NextLoadStore;
14620 if (!DepDest)
14621 continue;
14622 Instruction *SrcInst = BundleMember->Inst;
14623 assert(SrcInst->mayReadOrWriteMemory() &&
14624 "NextLoadStore list for non memory effecting bundle?");
14625 MemoryLocation SrcLoc = getLocation(SrcInst);
14626 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14627 unsigned NumAliased = 0;
14628 unsigned DistToSrc = 1;
14629
14630 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14631 assert(isInSchedulingRegion(DepDest));
14632
14633 // We have two limits to reduce the complexity:
14634 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14635 // SLP->isAliased (which is the expensive part in this loop).
14636 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14637 // the whole loop (even if the loop is fast, it's quadratic).
14638 // It's important for the loop break condition (see below) to
14639 // check this limit even between two read-only instructions.
14640 if (DistToSrc >= MaxMemDepDistance ||
14641 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14642 (NumAliased >= AliasedCheckLimit ||
14643 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14644
14645 // We increment the counter only if the locations are aliased
14646 // (instead of counting all alias checks). This gives a better
14647 // balance between reduced runtime and accurate dependencies.
14648 NumAliased++;
14649
14650 DepDest->MemoryDependencies.push_back(BundleMember);
14651 BundleMember->Dependencies++;
14652 ScheduleData *DestBundle = DepDest->FirstInBundle;
14653 if (!DestBundle->IsScheduled) {
14654 BundleMember->incrementUnscheduledDeps(1);
14655 }
14656 if (!DestBundle->hasValidDependencies()) {
14657 WorkList.push_back(DestBundle);
14658 }
14659 }
14660
14661 // Example, explaining the loop break condition: Let's assume our
14662 // starting instruction is i0 and MaxMemDepDistance = 3.
14663 //
14664 // +--------v--v--v
14665 // i0,i1,i2,i3,i4,i5,i6,i7,i8
14666 // +--------^--^--^
14667 //
14668 // MaxMemDepDistance let us stop alias-checking at i3 and we add
14669 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14670 // Previously we already added dependencies from i3 to i6,i7,i8
14671 // (because of MaxMemDepDistance). As we added a dependency from
14672 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14673 // and we can abort this loop at i6.
14674 if (DistToSrc >= 2 * MaxMemDepDistance)
14675 break;
14676 DistToSrc++;
14677 }
14678 }
14679 if (InsertInReadyList && SD->isReady()) {
14680 ReadyInsts.insert(SD);
14681 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
14682 << "\n");
14683 }
14684 }
14685}
14686
14687void BoUpSLP::BlockScheduling::resetSchedule() {
14688 assert(ScheduleStart &&
14689 "tried to reset schedule on block which has not been scheduled");
14690 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14691 doForAllOpcodes(I, [&](ScheduleData *SD) {
14692 assert(isInSchedulingRegion(SD) &&
14693 "ScheduleData not in scheduling region");
14694 SD->IsScheduled = false;
14695 SD->resetUnscheduledDeps();
14696 });
14697 }
14698 ReadyInsts.clear();
14699}
14700
14701void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14702 if (!BS->ScheduleStart)
14703 return;
14704
14705 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14706
14707 // A key point - if we got here, pre-scheduling was able to find a valid
14708 // scheduling of the sub-graph of the scheduling window which consists
14709 // of all vector bundles and their transitive users. As such, we do not
14710 // need to reschedule anything *outside of* that subgraph.
14711
14712 BS->resetSchedule();
14713
14714 // For the real scheduling we use a more sophisticated ready-list: it is
14715 // sorted by the original instruction location. This lets the final schedule
14716 // be as close as possible to the original instruction order.
14717 // WARNING: If changing this order causes a correctness issue, that means
14718 // there is some missing dependence edge in the schedule data graph.
14719 struct ScheduleDataCompare {
14720 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14721 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14722 }
14723 };
14724 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14725
14726 // Ensure that all dependency data is updated (for nodes in the sub-graph)
14727 // and fill the ready-list with initial instructions.
14728 int Idx = 0;
14729 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14730 I = I->getNextNode()) {
14731 BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
14732 TreeEntry *SDTE = getTreeEntry(SD->Inst);
14733 (void)SDTE;
14734 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14735 SD->isPartOfBundle() ==
14736 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14737 "scheduler and vectorizer bundle mismatch");
14738 SD->FirstInBundle->SchedulingPriority = Idx++;
14739
14740 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14741 BS->calculateDependencies(SD, false, this);
14742 });
14743 }
14744 BS->initialFillReadyList(ReadyInsts);
14745
14746 Instruction *LastScheduledInst = BS->ScheduleEnd;
14747
14748 // Do the "real" scheduling.
14749 while (!ReadyInsts.empty()) {
14750 ScheduleData *Picked = *ReadyInsts.begin();
14751 ReadyInsts.erase(ReadyInsts.begin());
14752
14753 // Move the scheduled instruction(s) to their dedicated places, if not
14754 // there yet.
14755 for (ScheduleData *BundleMember = Picked; BundleMember;
14756 BundleMember = BundleMember->NextInBundle) {
14757 Instruction *PickedInst = BundleMember->Inst;
14758 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
14759 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
14760 LastScheduledInst = PickedInst;
14761 }
14762
14763 BS->schedule(Picked, ReadyInsts);
14764 }
14765
14766 // Check that we didn't break any of our invariants.
14767#ifdef EXPENSIVE_CHECKS
14768 BS->verify();
14769#endif
14770
14771#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
14772 // Check that all schedulable entities got scheduled
14773 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
14774 BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
14775 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
14776 assert(SD->IsScheduled && "must be scheduled at this point");
14777 }
14778 });
14779 }
14780#endif
14781
14782 // Avoid duplicate scheduling of the block.
14783 BS->ScheduleStart = nullptr;
14784}
14785
14786 unsigned BoUpSLP::getVectorElementSize(Value *V) {
14787 // If V is a store, just return the width of the stored value (or value
14788 // truncated just before storing) without traversing the expression tree.
14789 // This is the common case.
14790 if (auto *Store = dyn_cast<StoreInst>(V))
14791 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
14792
14793 if (auto *IEI = dyn_cast<InsertElementInst>(V))
14794 return getVectorElementSize(IEI->getOperand(1));
14795
14796 auto E = InstrElementSize.find(V);
14797 if (E != InstrElementSize.end())
14798 return E->second;
14799
14800 // If V is not a store, we can traverse the expression tree to find loads
14801 // that feed it. The type of the loaded value may indicate a more suitable
14802 // width than V's type. We want to base the vector element size on the width
14803 // of memory operations where possible.
14804 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
14805 SmallPtrSet<Instruction *, 16> Visited;
14806 if (auto *I = dyn_cast<Instruction>(V)) {
14807 Worklist.emplace_back(I, I->getParent(), 0);
14808 Visited.insert(I);
14809 }
14810
14811 // Traverse the expression tree in bottom-up order looking for loads. If we
14812 // encounter an instruction we don't yet handle, we give up.
14813 auto Width = 0u;
14814 Value *FirstNonBool = nullptr;
14815 while (!Worklist.empty()) {
14816 auto [I, Parent, Level] = Worklist.pop_back_val();
14817
14818 // We should only be looking at scalar instructions here. If the current
14819 // instruction has a vector type, skip.
14820 auto *Ty = I->getType();
14821 if (isa<VectorType>(Ty))
14822 continue;
14823 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
14824 FirstNonBool = I;
14825 if (Level > RecursionMaxDepth)
14826 continue;
14827
14828 // If the current instruction is a load, update Width to reflect the
14829 // width of the loaded value.
14830 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
14831 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
14832
14833 // Otherwise, we need to visit the operands of the instruction. We only
14834 // handle the interesting cases from buildTree here. If an operand is an
14835 // instruction we haven't yet visited and from the same basic block as the
14836 // user or the use is a PHI node, we add it to the worklist.
14837 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
14838 BinaryOperator, UnaryOperator>(I)) {
14839 for (Use &U : I->operands()) {
14840 if (auto *J = dyn_cast<Instruction>(U.get()))
14841 if (Visited.insert(J).second &&
14842 (isa<PHINode>(I) || J->getParent() == Parent)) {
14843 Worklist.emplace_back(J, J->getParent(), Level + 1);
14844 continue;
14845 }
14846 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
14847 FirstNonBool = U.get();
14848 }
14849 } else {
14850 break;
14851 }
14852 }
14853
14854 // If we didn't encounter a memory access in the expression tree, or if we
14855 // gave up for some reason, just return the width of V. Otherwise, return the
14856 // maximum width we found.
14857 if (!Width) {
14858 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
14859 V = FirstNonBool;
14860 Width = DL->getTypeSizeInBits(V->getType());
14861 }
14862
14863 for (Instruction *I : Visited)
14864 InstrElementSize[I] = Width;
14865
14866 return Width;
14867}
14868
14869bool BoUpSLP::collectValuesToDemote(
14870 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
14871 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
14872 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
14873 bool IsTruncRoot) const {
14874 // We can always demote constants.
14875 if (all_of(E.Scalars, IsaPred<Constant>))
14876 return true;
14877
14878 unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
14879 if (OrigBitWidth == BitWidth) {
14880 MaxDepthLevel = 1;
14881 return true;
14882 }
14883
14884 // If the value is not a vectorized instruction in the expression and not used
14885 // by the insertelement instruction and not used in multiple vector nodes, it
14886 // cannot be demoted.
14887 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
14888 if (MultiNodeScalars.contains(V))
14889 return false;
14890 if (OrigBitWidth > BitWidth) {
14891 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14892 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14893 return true;
14894 }
14895 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
14896 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14897 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*DL));
14898 if (IsSigned)
14899 ++BitWidth1;
14900 if (auto *I = dyn_cast<Instruction>(V)) {
14901 APInt Mask = DB->getDemandedBits(I);
14902 unsigned BitWidth2 =
14903 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
14904 while (!IsSigned && BitWidth2 < OrigBitWidth) {
14905 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
14906 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14907 break;
14908 BitWidth2 *= 2;
14909 }
14910 BitWidth1 = std::min(BitWidth1, BitWidth2);
14911 }
14912 BitWidth = std::max(BitWidth, BitWidth1);
14913 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
14914 };
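// Illustration of the check above (informal, not from the original source):
// for an i32 value known to be non-negative and to fit into its low 8 bits,
// BitWidth1 is at most 8; if the demanded bits also fit into 8 bits, the
// candidate width stays at 8 and the value is accepted because the original
// 32-bit width is at least twice the candidate width.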
14915 using namespace std::placeholders;
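// Final profitability check: every scalar in the entry must be potentially
// truncatable; gather nodes are additionally recorded for demotion when they
// have at most two extractelement bases or when demotion does not change the
// number of vector parts required.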
14916 auto FinalAnalysis = [&]() {
14917 if (!IsProfitableToDemote)
14918 return false;
14919 bool Res = all_of(
14920 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
14921 // Demote gathers.
14922 if (Res && E.State == TreeEntry::NeedToGather) {
14923 // Check possible extractelement instructions bases and final vector
14924 // length.
14925 SmallPtrSet<Value *, 4> UniqueBases;
14926 for (Value *V : E.Scalars) {
14927 auto *EE = dyn_cast<ExtractElementInst>(V);
14928 if (!EE)
14929 continue;
14930 UniqueBases.insert(EE->getVectorOperand());
14931 }
14932 const unsigned VF = E.Scalars.size();
14933 Type *OrigScalarTy = E.Scalars.front()->getType();
14934 if (UniqueBases.size() <= 2 ||
14935 TTI->getNumberOfParts(FixedVectorType::get(OrigScalarTy, VF)) ==
14936 TTI->getNumberOfParts(FixedVectorType::get(
14937 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
14938 ToDemote.push_back(E.Idx);
14939 }
14940 return Res;
14941 };
14942 if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
14943 any_of(E.Scalars, [&](Value *V) {
14944 return all_of(V->users(), [&](User *U) {
14945 return isa<InsertElementInst>(U) && !getTreeEntry(U);
14946 });
14947 }))
14948 return FinalAnalysis();
14949
14950 if (any_of(E.Scalars, [&](Value *V) {
14951 return !all_of(V->users(), [=](User *U) {
14952 return getTreeEntry(U) ||
14953 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14954 (!isa<CmpInst>(U) && U->getType()->isSized() &&
14955 !U->getType()->isScalableTy() &&
14956 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
14957 }) && !IsPotentiallyTruncated(V, BitWidth);
14958 }))
14959 return false;
14960
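// Recursively analyzes the given operand entries, propagating the maximum
// recursion depth that was reached; NeedToExit is set when a child entry
// cannot be demoted but the demotions collected so far are still usable.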
14961 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
14962 bool &NeedToExit) {
14963 NeedToExit = false;
14964 unsigned InitLevel = MaxDepthLevel;
14965 for (const TreeEntry *Op : Operands) {
14966 unsigned Level = InitLevel;
14967 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
14968 ToDemote, Visited, Level, IsProfitableToDemote,
14969 IsTruncRoot)) {
14970 if (!IsProfitableToDemote)
14971 return false;
14972 NeedToExit = true;
14973 if (!FinalAnalysis())
14974 return false;
14975 continue;
14976 }
14977 MaxDepthLevel = std::max(MaxDepthLevel, Level);
14978 }
14979 return true;
14980 };
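// Tries successively doubled bit widths, starting from the current BitWidth,
// until Checker accepts one; if none is accepted, falls back to the smallest
// width for which the final analysis succeeded, or restores OrigBitWidth.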
14981 auto AttemptCheckBitwidth =
14982 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
14983 // Try all bitwidth < OrigBitWidth.
14984 NeedToExit = false;
14985 unsigned BestFailBitwidth = 0;
14986 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
14987 if (Checker(BitWidth, OrigBitWidth))
14988 return true;
14989 if (BestFailBitwidth == 0 && FinalAnalysis())
14990 BestFailBitwidth = BitWidth;
14991 }
14992 if (BitWidth >= OrigBitWidth) {
14993 if (BestFailBitwidth == 0) {
14994 BitWidth = OrigBitWidth;
14995 return false;
14996 }
14997 MaxDepthLevel = 1;
14998 BitWidth = BestFailBitwidth;
14999 NeedToExit = true;
15000 return true;
15001 }
15002 return false;
15003 };
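// Common driver for the opcode cases below: marks the scalars of leaf entries
// as potentially truncated, verifies entries with several vectorized uses,
// runs the optional bit-width checker and recurses into the operand entries
// before recording the entry in ToDemote.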
15004 auto TryProcessInstruction =
15005 [&](unsigned &BitWidth,
15006 ArrayRef<const TreeEntry *> Operands = std::nullopt,
15007 function_ref<bool(unsigned, unsigned)> Checker = {}) {
15008 if (Operands.empty()) {
15009 if (!IsTruncRoot)
15010 MaxDepthLevel = 1;
15011 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15012 std::ref(BitWidth)));
15013 } else {
15014 // Several vectorized uses? Check if we can truncate it, otherwise -
15015 // exit.
15016 if (E.UserTreeIndices.size() > 1 &&
15017 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15018 std::ref(BitWidth))))
15019 return false;
15020 bool NeedToExit = false;
15021 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15022 return false;
15023 if (NeedToExit)
15024 return true;
15025 if (!ProcessOperands(Operands, NeedToExit))
15026 return false;
15027 if (NeedToExit)
15028 return true;
15029 }
15030
15031 ++MaxDepthLevel;
15032 // Record the entry that we can demote.
15033 ToDemote.push_back(E.Idx);
15034 return IsProfitableToDemote;
15035 };
15036 switch (E.getOpcode()) {
15037
15038 // We can always demote truncations and extensions. Since truncations can
15039 // seed additional demotion, we save the truncated value.
15040 case Instruction::Trunc:
15041 if (IsProfitableToDemoteRoot)
15042 IsProfitableToDemote = true;
15043 return TryProcessInstruction(BitWidth);
15044 case Instruction::ZExt:
15045 case Instruction::SExt:
15046 IsProfitableToDemote = true;
15047 return TryProcessInstruction(BitWidth);
15048
15049 // We can demote certain binary operations if we can demote both of their
15050 // operands.
15051 case Instruction::Add:
15052 case Instruction::Sub:
15053 case Instruction::Mul:
15054 case Instruction::And:
15055 case Instruction::Or:
15056 case Instruction::Xor: {
15057 return TryProcessInstruction(
15058 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
15059 }
15060 case Instruction::Shl: {
15061 // If we are truncating the result of this SHL, and if it's a shift of an
15062 // in-range amount, we can always perform a SHL in a smaller type.
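// For example, a (hypothetical) shl i32 %x, 3 whose result is only needed in
// 16 bits can be performed as a 16-bit shl, because the shift amount is known
// to be below the narrower width.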
15063 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
15064 return all_of(E.Scalars, [&](Value *V) {
15065 auto *I = cast<Instruction>(V);
15066 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15067 return AmtKnownBits.getMaxValue().ult(BitWidth);
15068 });
15069 };
15070 return TryProcessInstruction(
15071 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
15072 }
15073 case Instruction::LShr: {
15074 // If this is a truncate of a logical shr, we can truncate it to a smaller
15075 // lshr iff we know that the bits we would otherwise be shifting in are
15076 // already zeros.
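// For example, a (hypothetical) lshr i32 %x, 1 can be performed as a 16-bit
// lshr when bits 16..31 of %x are known to be zero.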
15077 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15078 return all_of(E.Scalars, [&](Value *V) {
15079 auto *I = cast<Instruction>(V);
15080 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15081 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15082 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15083 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
15084 SimplifyQuery(*DL));
15085 });
15086 };
15087 return TryProcessInstruction(
15088 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15089 LShrChecker);
15090 }
15091 case Instruction::AShr: {
15092 // If this is a truncate of an arithmetic shr, we can truncate it to a
15093 // smaller ashr iff we know that all the bits from the sign bit of the
15094 // original type down to the sign bit of the truncated type are the same.
15095 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15096 return all_of(E.Scalars, [&](Value *V) {
15097 auto *I = cast<Instruction>(V);
15098 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15099 unsigned ShiftedBits = OrigBitWidth - BitWidth;
15100 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15101 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15102 nullptr, DT);
15103 });
15104 };
15105 return TryProcessInstruction(
15106 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15107 AShrChecker);
15108 }
15109 case Instruction::UDiv:
15110 case Instruction::URem: {
15111 // UDiv and URem can be truncated if all the truncated bits are zero.
15112 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15113 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15114 return all_of(E.Scalars, [&](Value *V) {
15115 auto *I = cast<Instruction>(V);
15116 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15117 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
15118 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15119 });
15120 };
15121 return TryProcessInstruction(
15122 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
15123 }
15124
15125 // We can demote selects if we can demote their true and false values.
15126 case Instruction::Select: {
15127 return TryProcessInstruction(
15128 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
15129 }
15130
15131 // We can demote phis if we can demote all their incoming operands. Note that
15132 // we don't need to worry about cycles since we ensure single use above.
15133 case Instruction::PHI: {
15134 const unsigned NumOps = E.getNumOperands();
15135 SmallVector<const TreeEntry *> Ops(NumOps);
15136 transform(seq<unsigned>(0, NumOps), Ops.begin(),
15137 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
15138
15139 return TryProcessInstruction(BitWidth, Ops);
15140 }
15141
15142 case Instruction::Call: {
15143 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
15144 if (!IC)
15145 break;
15146 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
15147 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
15148 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
15149 break;
15150 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
15151 function_ref<bool(unsigned, unsigned)> CallChecker;
15152 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15153 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15154 return all_of(E.Scalars, [&](Value *V) {
15155 auto *I = cast<Instruction>(V);
15156 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15157 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15158 return MaskedValueIsZero(I->getOperand(0), Mask,
15159 SimplifyQuery(*DL)) &&
15160 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15161 }
15162 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
15163 "Expected min/max intrinsics only.");
15164 unsigned SignBits = OrigBitWidth - BitWidth;
15165 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
15166 return SignBits <= ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15167 nullptr, DT) &&
15168 (!isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL)) ||
15169 MaskedValueIsZero(I->getOperand(0), Mask,
15170 SimplifyQuery(*DL))) &&
15171 SignBits <= ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
15172 nullptr, DT) &&
15173 (!isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL)) ||
15174 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
15175 });
15176 };
15177 if (ID != Intrinsic::abs) {
15178 Operands.push_back(getOperandEntry(&E, 1));
15179 CallChecker = CompChecker;
15180 }
15181 InstructionCost BestCost =
15182 std::numeric_limits<InstructionCost::CostType>::max();
15183 unsigned BestBitWidth = BitWidth;
15184 unsigned VF = E.Scalars.size();
15185 // Choose the best bitwidth based on cost estimations.
15186 auto Checker = [&](unsigned BitWidth, unsigned) {
15187 unsigned MinBW = PowerOf2Ceil(BitWidth);
15188 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
15189 auto VecCallCosts = getVectorCallCosts(
15190 IC,
15191 FixedVectorType::get(IntegerType::get(IC->getContext(), MinBW), VF),
15192 TTI, TLI, ArgTys);
15193 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
15194 if (Cost < BestCost) {
15195 BestCost = Cost;
15196 BestBitWidth = BitWidth;
15197 }
15198 return false;
15199 };
15200 [[maybe_unused]] bool NeedToExit;
15201 (void)AttemptCheckBitwidth(Checker, NeedToExit);
15202 BitWidth = BestBitWidth;
15203 return TryProcessInstruction(BitWidth, Operands, CallChecker);
15204 }
15205
15206 // Otherwise, conservatively give up.
15207 default:
15208 break;
15209 }
15210 MaxDepthLevel = 1;
15211 return FinalAnalysis();
15212}
15213
15214static RecurKind getRdxKind(Value *V);
15215
15216 void BoUpSLP::computeMinimumValueSizes() {
15217 // We only attempt to truncate integer expressions.
15218 bool IsStoreOrInsertElt =
15219 VectorizableTree.front()->getOpcode() == Instruction::Store ||
15220 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15221 if ((IsStoreOrInsertElt || UserIgnoreList) &&
15222 ExtraBitWidthNodes.size() <= 1 &&
15223 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15224 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15225 return;
15226
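// For a vectorized store/insertelement root, the root node itself produces no
// integer value that could be demoted, so the analysis starts from its operand
// node.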
15227 unsigned NodeIdx = 0;
15228 if (IsStoreOrInsertElt &&
15229 VectorizableTree.front()->State != TreeEntry::NeedToGather)
15230 NodeIdx = 1;
15231
15232 // Ensure the roots of the vectorizable tree don't form a cycle.
15233 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
15234 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
15235 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15236 [NodeIdx](const EdgeInfo &EI) {
15237 return EI.UserTE->Idx >
15238 static_cast<int>(NodeIdx);
15239 })))
15240 return;
15241
15242 // If the first value node for the store/insertelement is a sext/zext/trunc,
15243 // skip it and resize to the final type.
15244 bool IsTruncRoot = false;
15245 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15246 SmallVector<unsigned> RootDemotes;
15247 if (NodeIdx != 0 &&
15248 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15249 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15250 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
15251 IsTruncRoot = true;
15252 RootDemotes.push_back(NodeIdx);
15253 IsProfitableToDemoteRoot = true;
15254 ++NodeIdx;
15255 }
15256
15257 // The reduction was already analyzed and found not profitable - exit.
15258 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
15259 return;
15260
15261 SmallVector<unsigned> ToDemote;
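// Computes the minimal bit width the sub-graph rooted at E can be narrowed
// to, or 0 if narrowing is impossible or not expected to be profitable.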
15262 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
15263 bool IsProfitableToDemoteRoot, unsigned Opcode,
15264 unsigned Limit, bool IsTruncRoot,
15265 bool IsSignedCmp) {
15266 ToDemote.clear();
15267 unsigned VF = E.getVectorFactor();
15268 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
15269 if (!TreeRootIT || !Opcode)
15270 return 0u;
15271
15272 if (any_of(E.Scalars,
15273 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
15274 return 0u;
15275
15276 unsigned NumParts =
15277 TTI->getNumberOfParts(FixedVectorType::get(TreeRootIT, VF));
15278
15279 // The maximum bit width required to represent all the values that can be
15280 // demoted without loss of precision. It would be safe to truncate the roots
15281 // of the expression to this width.
15282 unsigned MaxBitWidth = 1u;
15283
15284 // True if the roots can be zero-extended back to their original type,
15285 // rather than sign-extended. We know that if the leading bits are not
15286 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
15287 // True.
15288 // Determine if the sign bit of all the roots is known to be zero. If not,
15289 // IsKnownPositive is set to False.
15290 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
15291 KnownBits Known = computeKnownBits(R, *DL);
15292 return Known.isNonNegative();
15293 });
15294
15295 // We first check if all the bits of the roots are demanded. If they're not,
15296 // we can truncate the roots to this narrower type.
15297 for (Value *Root : E.Scalars) {
15298 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
15299 TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
15300 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15301 // If we can't prove that the sign bit is zero, we must add one to the
15302 // maximum bit width to account for the unknown sign bit. This preserves
15303 // the existing sign bit so we can safely sign-extend the root back to the
15304 // original type. Otherwise, if we know the sign bit is zero, we will
15305 // zero-extend the root instead.
15306 //
15307 // FIXME: This is somewhat suboptimal, as there will be cases where adding
15308 // one to the maximum bit width will yield a larger-than-necessary
15309 // type. In general, we need to add an extra bit only if we can't
15310 // prove that the upper bit of the original type is equal to the
15311 // upper bit of the proposed smaller type. If these two bits are
15312 // the same (either zero or one) we know that sign-extending from
15313 // the smaller type will result in the same value. Here, since we
15314 // can't yet prove this, we are just making the proposed smaller
15315 // type larger to ensure correctness.
15316 if (!IsKnownPositive)
15317 ++BitWidth1;
15318
15319 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
15320 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15321 MaxBitWidth =
15322 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
15323 }
15324
15325 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15326 MaxBitWidth = 8;
15327
15328 // If the original type is large, but the reduced type does not improve
15329 // register usage - ignore it.
15330 if (NumParts > 1 &&
15331 NumParts ==
15332 TTI->getNumberOfParts(FixedVectorType::get(
15333 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
15334 return 0u;
15335
15336 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15337 Opcode == Instruction::SExt ||
15338 Opcode == Instruction::ZExt || NumParts > 1;
15339 // Conservatively determine if we can actually truncate the roots of the
15340 // expression. Collect the values that can be demoted in ToDemote and
15341 // additional roots that require investigating in Roots.
15342 DenseSet<const TreeEntry *> Visited;
15343 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15344 bool NeedToDemote = IsProfitableToDemote;
15345
15346 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
15347 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
15348 IsTruncRoot) ||
15349 (MaxDepthLevel <= Limit &&
15350 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15351 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15352 DL->getTypeSizeInBits(TreeRootIT) /
15353 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
15354 ->getOperand(0)
15355 ->getType()) >
15356 2)))))
15357 return 0u;
15358 // Round MaxBitWidth up to the next power-of-two.
15359 MaxBitWidth = bit_ceil(MaxBitWidth);
15360
15361 return MaxBitWidth;
15362 };
15363
15364 // If we can truncate the root, we must collect additional values that might
15365 // be demoted as a result. That is, those seeded by truncations we will
15366 // modify.
15367 // Add reduction ops sizes, if any.
15368 if (UserIgnoreList &&
15369 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
15370 for (Value *V : *UserIgnoreList) {
15371 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15372 auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
15373 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15374 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
15375 ++BitWidth1;
15376 unsigned BitWidth2 = BitWidth1;
15377 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
15378 auto Mask = DB->getDemandedBits(cast<Instruction>(V));
15379 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15380 }
15381 ReductionBitWidth =
15382 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15383 }
15384 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15385 ReductionBitWidth = 8;
15386
15387 ReductionBitWidth = bit_ceil(ReductionBitWidth);
15388 }
15389 bool IsTopRoot = NodeIdx == 0;
15390 while (NodeIdx < VectorizableTree.size() &&
15391 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15392 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15393 RootDemotes.push_back(NodeIdx);
15394 ++NodeIdx;
15395 IsTruncRoot = true;
15396 }
15397 bool IsSignedCmp = false;
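// Walk the candidate root nodes (the top root plus any nodes recorded in
// ExtraBitWidthNodes), computing the minimal bit width for each sub-graph and
// recording the demotable entries in MinBWs.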
15398 while (NodeIdx < VectorizableTree.size()) {
15399 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15400 unsigned Limit = 2;
15401 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15402 if (IsTopRoot &&
15403 ReductionBitWidth ==
15404 DL->getTypeSizeInBits(
15405 VectorizableTree.front()->Scalars.front()->getType()))
15406 Limit = 3;
15407 unsigned MaxBitWidth = ComputeMaxBitWidth(
15408 *VectorizableTree[NodeIdx].get(), IsTopRoot, IsProfitableToDemoteRoot,
15409 Opcode, Limit, IsTruncRoot, IsSignedCmp);
15410 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15411 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15412 ReductionBitWidth = bit_ceil(MaxBitWidth);
15413 else if (MaxBitWidth == 0)
15414 ReductionBitWidth = 0;
15415 }
15416
15417 for (unsigned Idx : RootDemotes) {
15418 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
15419 uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
15420 if (OrigBitWidth > MaxBitWidth) {
15421 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
15422 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
15423 }
15424 return false;
15425 }))
15426 ToDemote.push_back(Idx);
15427 }
15428 RootDemotes.clear();
15429 IsTopRoot = false;
15430 IsProfitableToDemoteRoot = true;
15431
15432 if (ExtraBitWidthNodes.empty()) {
15433 NodeIdx = VectorizableTree.size();
15434 } else {
15435 unsigned NewIdx = 0;
15436 do {
15437 NewIdx = *ExtraBitWidthNodes.begin();
15438 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
15439 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15440 NodeIdx = NewIdx;
15441 IsTruncRoot =
15442 NodeIdx < VectorizableTree.size() &&
15443 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15444 [](const EdgeInfo &EI) {
15445 return EI.EdgeIdx == 0 &&
15446 EI.UserTE->getOpcode() == Instruction::Trunc &&
15447 !EI.UserTE->isAltShuffle();
15448 });
15449 IsSignedCmp =
15450 NodeIdx < VectorizableTree.size() &&
15451 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15452 [&](const EdgeInfo &EI) {
15453 return EI.UserTE->getOpcode() == Instruction::ICmp &&
15454 any_of(EI.UserTE->Scalars, [&](Value *V) {
15455 auto *IC = dyn_cast<ICmpInst>(V);
15456 return IC &&
15457 (IC->isSigned() ||
15458 !isKnownNonNegative(IC->getOperand(0),
15459 SimplifyQuery(*DL)) ||
15460 !isKnownNonNegative(IC->getOperand(1),
15461 SimplifyQuery(*DL)));
15462 });
15463 });
15464 }
15465
15466 // If the maximum bit width we compute is less than the width of the roots'
15467 // type, we can proceed with the narrowing. Otherwise, do nothing.
15468 if (MaxBitWidth == 0 ||
15469 MaxBitWidth >=
15470 cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
15471 if (UserIgnoreList)
15472 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
15473 continue;
15474 }
15475
15476 // Finally, map the values we can demote to the maximum bit width we
15477 // computed.
15478 for (unsigned Idx : ToDemote) {
15479 TreeEntry *TE = VectorizableTree[Idx].get();
15480 if (MinBWs.contains(TE))
15481 continue;
15482 bool IsSigned = TE->getOpcode() == Instruction::SExt ||
15483 any_of(TE->Scalars, [&](Value *R) {
15484 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15485 });
15486 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
15487 }
15488 }
15489}
15490
15491 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15492 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
15493 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
15494 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
15495 auto *AA = &AM.getResult<AAManager>(F);
15496 auto *LI = &AM.getResult<LoopAnalysis>(F);
15497 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
15498 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
15499 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
15500 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
15501
15502 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
15503 if (!Changed)
15504 return PreservedAnalyses::all();
15505
15506 PreservedAnalyses PA;
15507 PA.preserveSet<CFGAnalyses>();
15508 return PA;
15509}
15510
15511 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15512 TargetTransformInfo *TTI_,
15513 TargetLibraryInfo *TLI_, AAResults *AA_,
15514 LoopInfo *LI_, DominatorTree *DT_,
15515 AssumptionCache *AC_, DemandedBits *DB_,
15516 OptimizationRemarkEmitter *ORE_) {
15517 if (!RunSLPVectorization)
15518 return false;
15519 SE = SE_;
15520 TTI = TTI_;
15521 TLI = TLI_;
15522 AA = AA_;
15523 LI = LI_;
15524 DT = DT_;
15525 AC = AC_;
15526 DB = DB_;
15527 DL = &F.getParent()->getDataLayout();
15528
15529 Stores.clear();
15530 GEPs.clear();
15531 bool Changed = false;
15532
15533 // If the target claims to have no vector registers don't attempt
15534 // vectorization.
15535 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
15536 LLVM_DEBUG(
15537 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15538 return false;
15539 }
15540
15541 // Don't vectorize when the attribute NoImplicitFloat is used.
15542 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
15543 return false;
15544
15545 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15546
15547 // Use the bottom up slp vectorizer to construct chains that start with
15548 // store instructions.
15549 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15550
15551 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15552 // delete instructions.
15553
15554 // Update DFS numbers now so that we can use them for ordering.
15555 DT->updateDFSNumbers();
15556
15557 // Scan the blocks in the function in post order.
15558 for (auto *BB : post_order(&F.getEntryBlock())) {
15559 // Start new block - clear the list of reduction roots.
15560 R.clearReductionData();
15561 collectSeedInstructions(BB);
15562
15563 // Vectorize trees that end at stores.
15564 if (!Stores.empty()) {
15565 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15566 << " underlying objects.\n");
15567 Changed |= vectorizeStoreChains(R);
15568 }
15569
15570 // Vectorize trees that end at reductions.
15571 Changed |= vectorizeChainsInBlock(BB, R);
15572
15573 // Vectorize the index computations of getelementptr instructions. This
15574 // is primarily intended to catch gather-like idioms ending at
15575 // non-consecutive loads.
15576 if (!GEPs.empty()) {
15577 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15578 << " underlying objects.\n");
15579 Changed |= vectorizeGEPIndices(BB, R);
15580 }
15581 }
15582
15583 if (Changed) {
15584 R.optimizeGatherSequence();
15585 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15586 }
15587 return Changed;
15588}
15589
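// Tries to vectorize the given chain of stores. Returns true if the chain was
// vectorized, false if it was analyzed and rejected on cost or shape, and
// std::nullopt when the attempt should not be repeated at this size (e.g. the
// values could not be scheduled). Size receives the resulting tree size used
// by the caller's heuristics.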
15590std::optional<bool>
15591SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15592 unsigned Idx, unsigned MinVF,
15593 unsigned &Size) {
15594 Size = 0;
15595 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15596 << "\n");
15597 const unsigned Sz = R.getVectorElementSize(Chain[0]);
15598 unsigned VF = Chain.size();
15599
15600 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
15601 // Check if vectorizing with a non-power-of-2 VF should be considered. At
15602 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15603 // all vector lanes are used.
15604 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15605 return false;
15606 }
15607
15608 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15609 << "\n");
15610
15611 SetVector<Value *> ValOps;
15612 for (Value *V : Chain)
15613 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
15614 // Exit if the operands do not share the same/alternate opcodes or the number of unique values is not a power of two.
15615 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
15616 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
15617 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
15618 bool IsPowerOf2 =
15619 isPowerOf2_32(ValOps.size()) ||
15620 (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
15621 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15622 (!S.MainOp->isSafeToRemove() ||
15623 any_of(ValOps.getArrayRef(),
15624 [&](Value *V) {
15625 return !isa<ExtractElementInst>(V) &&
15626 (V->getNumUses() > Chain.size() ||
15627 any_of(V->users(), [&](User *U) {
15628 return !Stores.contains(U);
15629 }));
15630 }))) ||
15631 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
15632 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15633 return false;
15634 }
15635 }
15636 if (R.isLoadCombineCandidate(Chain))
15637 return true;
15638 R.buildTree(Chain);
15639 // Check if the tree is tiny and the store itself or its value was not vectorized.
15640 if (R.isTreeTinyAndNotFullyVectorizable()) {
15641 if (R.isGathered(Chain.front()) ||
15642 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
15643 return std::nullopt;
15644 Size = R.getTreeSize();
15645 return false;
15646 }
15647 R.reorderTopToBottom();
15648 R.reorderBottomToTop();
15649 R.buildExternalUses();
15650
15651 R.computeMinimumValueSizes();
15652 R.transformNodes();
15653
15654 Size = R.getTreeSize();
15655 if (S.getOpcode() == Instruction::Load)
15656 Size = 2; // cut off masked gather small trees
15657 InstructionCost Cost = R.getTreeCost();
15658
15659 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15660 if (Cost < -SLPCostThreshold) {
15661 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15662
15663 using namespace ore;
15664
15665 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
15666 cast<StoreInst>(Chain[0]))
15667 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15668 << " and with tree size "
15669 << NV("TreeSize", R.getTreeSize()));
15670
15671 R.vectorizeTree();
15672 return true;
15673 }
15674
15675 return false;
15676}
15677
15678 /// Checks that the recorded tree sizes are nearly uniform: their variance must stay below (Mean/9)^2, ignoring entries equal to 1.
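/// For example (informal): sizes {2, 8} (mean 5, variance 9) fail the check,
/// while nearly uniform sizes such as {4, 4, 5} pass it.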
15679static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
15680 bool First) {
15681 unsigned Num = 0;
15682 uint64_t Sum = std::accumulate(
15683 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
15684 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15685 unsigned Size = First ? Val.first : Val.second;
15686 if (Size == 1)
15687 return V;
15688 ++Num;
15689 return V + Size;
15690 });
15691 if (Num == 0)
15692 return true;
15693 uint64_t Mean = Sum / Num;
15694 if (Mean == 0)
15695 return true;
15696 uint64_t Dev = std::accumulate(
15697 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
15698 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15699 unsigned P = First ? Val.first : Val.second;
15700 if (P == 1)
15701 return V;
15702 return V + (P - Mean) * (P - Mean);
15703 }) /
15704 Num;
15705 return Dev * 81 / (Mean * Mean) == 0;
15706}
15707
15708bool SLPVectorizerPass::vectorizeStores(
15709 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
15710 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
15711 &Visited) {
15712 // We may run into multiple chains that merge into a single chain. We mark the
15713 // stores that we vectorized so that we don't visit the same store twice.
15714 BoUpSLP::ValueSet VectorizedStores;
15715 bool Changed = false;
15716
15717 struct StoreDistCompare {
15718 bool operator()(const std::pair<unsigned, int> &Op1,
15719 const std::pair<unsigned, int> &Op2) const {
15720 return Op1.second < Op2.second;
15721 }
15722 };
15723 // A set of pairs (index of the store in the Stores array ref, distance of the
15724 // store address relative to the base store address in units).
15725 using StoreIndexToDistSet =
15726 std::set<std::pair<unsigned, int>, StoreDistCompare>;
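// Tries to vectorize each maximal run of stores with consecutive distances in
// the sorted set, probing a range of vector factors per run and remembering
// per-store results so the same sub-chains are not reanalyzed.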
15727 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
15728 int PrevDist = -1;
15729 BoUpSLP::ValueList Operands;
15730 // Collect the chain into a list.
15731 for (auto [Idx, Data] : enumerate(Set)) {
15732 if (Operands.empty() || Data.second - PrevDist == 1) {
15733 Operands.push_back(Stores[Data.first]);
15734 PrevDist = Data.second;
15735 if (Idx != Set.size() - 1)
15736 continue;
15737 }
15738 auto E = make_scope_exit([&, &DataVar = Data]() {
15739 Operands.clear();
15740 Operands.push_back(Stores[DataVar.first]);
15741 PrevDist = DataVar.second;
15742 });
15743
15744 if (Operands.size() <= 1 ||
15745 !Visited
15746 .insert({Operands.front(),
15747 cast<StoreInst>(Operands.front())->getValueOperand(),
15748 Operands.back(),
15749 cast<StoreInst>(Operands.back())->getValueOperand(),
15750 Operands.size()})
15751 .second)
15752 continue;
15753
15754 unsigned MaxVecRegSize = R.getMaxVecRegSize();
15755 unsigned EltSize = R.getVectorElementSize(Operands[0]);
15756 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
15757
15758 unsigned MaxVF =
15759 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
15760 unsigned MaxRegVF = MaxVF;
15761 auto *Store = cast<StoreInst>(Operands[0]);
15762 Type *StoreTy = Store->getValueOperand()->getType();
15763 Type *ValueTy = StoreTy;
15764 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
15765 ValueTy = Trunc->getSrcTy();
15766 if (ValueTy == StoreTy &&
15767 R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
15768 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
15769 unsigned MinVF = std::max<unsigned>(
15770 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
15771 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
15772 ValueTy)));
15773
15774 if (MaxVF < MinVF) {
15775 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
15776 << ") < "
15777 << "MinVF (" << MinVF << ")\n");
15778 continue;
15779 }
15780
15781 unsigned NonPowerOf2VF = 0;
15782 if (VectorizeNonPowerOf2) {
15783 // First try vectorizing with a non-power-of-2 VF. At the moment, only
15784 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
15785 // lanes are used.
15786 unsigned CandVF = Operands.size();
15787 if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
15788 NonPowerOf2VF = CandVF;
15789 }
15790
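// The candidate vector factors run from the largest (including the
// almost-power-of-2 factor, when available) down to MinVF in powers of two.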
15791 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
15792 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
15793 unsigned Size = MinVF;
15794 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
15795 VF = Size > MaxVF ? NonPowerOf2VF : Size;
15796 Size *= 2;
15797 });
15798 unsigned End = Operands.size();
15799 unsigned Repeat = 0;
15800 constexpr unsigned MaxAttempts = 4;
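// For every store in the run, RangeSizes tracks the best tree size seen so
// far for register-sized and for larger-than-register vector factors; a value
// of 0 marks the store as already vectorized and 1 as not yet attempted.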
15801 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
15802 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
15803 P.first = P.second = 1;
15804 });
15805 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
15806 auto IsNotVectorized = [](bool First,
15807 const std::pair<unsigned, unsigned> &P) {
15808 return First ? P.first > 0 : P.second > 0;
15809 };
15810 auto IsVectorized = [](bool First,
15811 const std::pair<unsigned, unsigned> &P) {
15812 return First ? P.first == 0 : P.second == 0;
15813 };
15814 auto VFIsProfitable = [](bool First, unsigned Size,
15815 const std::pair<unsigned, unsigned> &P) {
15816 return First ? Size >= P.first : Size >= P.second;
15817 };
15818 auto FirstSizeSame = [](unsigned Size,
15819 const std::pair<unsigned, unsigned> &P) {
15820 return Size == P.first;
15821 };
15822 while (true) {
15823 ++Repeat;
15824 bool RepeatChanged = false;
15825 bool AnyProfitableGraph;
15826 for (unsigned Size : CandidateVFs) {
15827 AnyProfitableGraph = false;
15828 unsigned StartIdx = std::distance(
15829 RangeSizes.begin(),
15830 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
15831 std::placeholders::_1)));
15832 while (StartIdx < End) {
15833 unsigned EndIdx =
15834 std::distance(RangeSizes.begin(),
15835 find_if(RangeSizes.drop_front(StartIdx),
15836 std::bind(IsVectorized, Size >= MaxRegVF,
15837 std::placeholders::_1)));
15838 unsigned Sz = EndIdx >= End ? End : EndIdx;
15839 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
15840 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
15841 Size >= MaxRegVF)) {
15842 ++Cnt;
15843 continue;
15844 }
15845 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
15846 assert(all_of(Slice,
15847 [&](Value *V) {
15848 return cast<StoreInst>(V)
15849 ->getValueOperand()
15850 ->getType() ==
15851 cast<StoreInst>(Slice.front())
15852 ->getValueOperand()
15853 ->getType();
15854 }) &&
15855 "Expected all operands of same type.");
15856 if (!NonSchedulable.empty()) {
15857 auto [NonSchedSizeMax, NonSchedSizeMin] =
15858 NonSchedulable.lookup(Slice.front());
15859 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
15860 Cnt += NonSchedSizeMax;
15861 continue;
15862 }
15863 }
15864 unsigned TreeSize;
15865 std::optional<bool> Res =
15866 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
15867 if (!Res) {
15868 NonSchedulable
15869 .try_emplace(Slice.front(), std::make_pair(Size, Size))
15870 .first->getSecond()
15871 .second = Size;
15872 } else if (*Res) {
15873 // Mark the vectorized stores so that we don't vectorize them
15874 // again.
15875 VectorizedStores.insert(Slice.begin(), Slice.end());
15878 AnyProfitableGraph = RepeatChanged = Changed = true;
15879 // If we vectorized initial block, no need to try to vectorize
15880 // it again.
15881 for_each(RangeSizes.slice(Cnt, Size),
15882 [](std::pair<unsigned, unsigned> &P) {
15883 P.first = P.second = 0;
15884 });
15885 if (Cnt < StartIdx + MinVF) {
15886 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
15887 [](std::pair<unsigned, unsigned> &P) {
15888 P.first = P.second = 0;
15889 });
15890 StartIdx = Cnt + Size;
15891 }
15892 if (Cnt > Sz - Size - MinVF) {
15893 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
15894 [](std::pair<unsigned, unsigned> &P) {
15895 P.first = P.second = 0;
15896 });
15897 if (Sz == End)
15898 End = Cnt;
15899 Sz = Cnt;
15900 }
15901 Cnt += Size;
15902 continue;
15903 }
15904 if (Size > 2 && Res &&
15905 !all_of(RangeSizes.slice(Cnt, Size),
15906 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
15907 std::placeholders::_1))) {
15908 Cnt += Size;
15909 continue;
15910 }
15911 // For very big VFs, check that we are not rebuilding the same trees, just
15912 // with a larger number of elements.
15913 if (Size > MaxRegVF && TreeSize > 1 &&
15914 all_of(RangeSizes.slice(Cnt, Size),
15915 std::bind(FirstSizeSame, TreeSize,
15916 std::placeholders::_1))) {
15917 Cnt += Size;
15918 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
15919 ++Cnt;
15920 continue;
15921 }
15922 if (TreeSize > 1)
15923 for_each(RangeSizes.slice(Cnt, Size),
15924 [&](std::pair<unsigned, unsigned> &P) {
15925 if (Size >= MaxRegVF)
15926 P.second = std::max(P.second, TreeSize);
15927 else
15928 P.first = std::max(P.first, TreeSize);
15929 });
15930 ++Cnt;
15931 AnyProfitableGraph = true;
15932 }
15933 if (StartIdx >= End)
15934 break;
15935 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
15936 AnyProfitableGraph = true;
15937 StartIdx = std::distance(
15938 RangeSizes.begin(),
15939 find_if(RangeSizes.drop_front(Sz),
15940 std::bind(IsNotVectorized, Size >= MaxRegVF,
15941 std::placeholders::_1)));
15942 }
15943 if (!AnyProfitableGraph && Size >= MaxRegVF)
15944 break;
15945 }
15946 // All values vectorized - exit.
15947 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
15948 return P.first == 0 && P.second == 0;
15949 }))
15950 break;
15951 // Stop if we have exhausted all attempts or if the last attempts are not needed at all.
15952 if (Repeat >= MaxAttempts ||
15953 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
15954 break;
15955 constexpr unsigned StoresLimit = 64;
15956 const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
15957 Operands.size(),
15958 static_cast<unsigned>(
15959 End -
15960 std::distance(
15961 RangeSizes.begin(),
15962 find_if(RangeSizes, std::bind(IsNotVectorized, true,
15963 std::placeholders::_1))) +
15964 1)));
15965 unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
15966 if (VF > MaxTotalNum || VF >= StoresLimit)
15967 break;
15968 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
15969 if (P.first != 0)
15970 P.first = std::max(P.second, P.first);
15971 });
15972 // As a last attempt, try to vectorize the maximum number of elements, in case
15973 // all previous attempts were unsuccessful because of cost.
15974 CandidateVFs.clear();
15975 CandidateVFs.push_back(VF);
15976 }
15977 }
15978 };
15979
15980 // Each entry is a pair (first: index of the store in the Stores array ref
15981 // whose address is taken as the base, second: a sorted set of {index, dist}
15982 // pairs, which are the indices of stores in the set and their store location
15983 // distances relative to the base address).
15984
15985 // The index of the very first store is kept separately, since the set may be
15986 // reordered after an insertion and the first store may be moved. This
15987 // container allows us to reduce the number of calls to the getPointersDiff() function.
15988 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
15989 // Inserts the specified store SI with the given index Idx into the set of
15990 // stores. If a store with the same distance is already in the set, stop the
15991 // insertion and try to vectorize the stores found so far. If some stores from
15992 // this sequence were not vectorized, try to vectorize them together with the
15993 // new store later. This logic is applied only to the stores that come before
15994 // the previous store with the same distance.
15995 // Example:
15996 // 1. store x, %p
15997 // 2. store y, %p+1
15998 // 3. store z, %p+2
15999 // 4. store a, %p
16000 // 5. store b, %p+3
16001 // - Scan this from the last to first store. The very first bunch of stores is
16002 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
16003 // vector).
16004 // - The next store in the list - #1 - has the same distance from store #5 as
16005 // the store #4.
16006 // - Try to vectorize sequence of stores 4,2,3,5.
16007 // - If all these stores are vectorized - just drop them.
16008 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
16009 // - Start new stores sequence.
16010 // The new bunch of stores is {1, {1, 0}}.
16011 // - Add the stores from previous sequence, that were not vectorized.
16012 // Here we consider the stores in reverse order relative to how they appear in
16013 // the IR (Stores is reversed already, see the vectorizeStoreChains() function).
16014 // Store #3 can be added -> comes after store #4 with the same distance as
16015 // store #1.
16016 // Store #5 cannot be added - comes before store #4.
16017 // This logic improves compile time: we assume that the stores after the
16018 // previous store with the same distance most likely have memory dependencies,
16019 // so there is no need to waste compile time trying to vectorize them.
16020 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
16021 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
16022 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16023 std::optional<int> Diff = getPointersDiff(
16024 Stores[Set.first]->getValueOperand()->getType(),
16025 Stores[Set.first]->getPointerOperand(),
16026 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
16027 /*StrictCheck=*/true);
16028 if (!Diff)
16029 continue;
16030 auto It = Set.second.find(std::make_pair(Idx, *Diff));
16031 if (It == Set.second.end()) {
16032 Set.second.emplace(Idx, *Diff);
16033 return;
16034 }
16035 // Try to vectorize the first found set to avoid duplicate analysis.
16036 TryToVectorize(Set.second);
16037 StoreIndexToDistSet PrevSet;
16038 PrevSet.swap(Set.second);
16039 Set.first = Idx;
16040 Set.second.emplace(Idx, 0);
16041 // Insert stores that followed previous match to try to vectorize them
16042 // with this store.
16043 unsigned StartIdx = It->first + 1;
16044 SmallBitVector UsedStores(Idx - StartIdx);
16045 // Distances to previously found dup store (or this store, since they
16046 // store to the same addresses).
16047 SmallVector<int> Dists(Idx - StartIdx, 0);
16048 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
16049 // Do not try to vectorize sequences, we already tried.
16050 if (Pair.first <= It->first ||
16051 VectorizedStores.contains(Stores[Pair.first]))
16052 break;
16053 unsigned BI = Pair.first - StartIdx;
16054 UsedStores.set(BI);
16055 Dists[BI] = Pair.second - It->second;
16056 }
16057 for (unsigned I = StartIdx; I < Idx; ++I) {
16058 unsigned BI = I - StartIdx;
16059 if (UsedStores.test(BI))
16060 Set.second.emplace(I, Dists[BI]);
16061 }
16062 return;
16063 }
16064 auto &Res = SortedStores.emplace_back();
16065 Res.first = Idx;
16066 Res.second.emplace(Idx, 0);
16067 };
16068 StoreInst *PrevStore = Stores.front();
16069 for (auto [I, SI] : enumerate(Stores)) {
16070 // Check that we do not try to vectorize stores of different types.
16071 if (PrevStore->getValueOperand()->getType() !=
16072 SI->getValueOperand()->getType()) {
16073 for (auto &Set : SortedStores)
16074 TryToVectorize(Set.second);
16075 SortedStores.clear();
16076 PrevStore = SI;
16077 }
16078 FillStoresSet(I, SI);
16079 }
16080
16081 // Final vectorization attempt.
16082 for (auto &Set : SortedStores)
16083 TryToVectorize(Set.second);
16084
16085 return Changed;
16086}
16087
16088void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
16089 // Initialize the collections. We will make a single pass over the block.
16090 Stores.clear();
16091 GEPs.clear();
16092
16093 // Visit the store and getelementptr instructions in BB and organize them in
16094 // Stores and GEPs according to the underlying objects of their pointer
16095 // operands.
16096 for (Instruction &I : *BB) {
16097 // Ignore store instructions that are volatile or have a pointer operand
16098 // that doesn't point to a scalar type.
16099 if (auto *SI = dyn_cast<StoreInst>(&I)) {
16100 if (!SI->isSimple())
16101 continue;
16102 if (!isValidElementType(SI->getValueOperand()->getType()))
16103 continue;
16104 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
16105 }
16106
16107 // Ignore getelementptr instructions that have more than one index, a
16108 // constant index, or a pointer operand that doesn't point to a scalar
16109 // type.
16110 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
16111 if (GEP->getNumIndices() != 1)
16112 continue;
16113 Value *Idx = GEP->idx_begin()->get();
16114 if (isa<Constant>(Idx))
16115 continue;
16116 if (!isValidElementType(Idx->getType()))
16117 continue;
16118 if (GEP->getType()->isVectorTy())
16119 continue;
16120 GEPs[GEP->getPointerOperand()].push_back(GEP);
16121 }
16122 }
16123}
16124
16125bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
16126 bool MaxVFOnly) {
16127 if (VL.size() < 2)
16128 return false;
16129
16130 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
16131 << VL.size() << ".\n");
16132
16133 // Check that all of the parts are instructions of the same type,
16134 // we permit an alternate opcode via InstructionsState.
16135 InstructionsState S = getSameOpcode(VL, *TLI);
16136 if (!S.getOpcode())
16137 return false;
16138
16139 Instruction *I0 = cast<Instruction>(S.OpValue);
16140 // Make sure invalid types (including vector type) are rejected before
16141 // determining vectorization factor for scalar instructions.
16142 for (Value *V : VL) {
16143 Type *Ty = V->getType();
16144 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
16145 // NOTE: the following will print the internal LLVM type name to the user,
16146 // which may not be useful.
16147 R.getORE()->emit([&]() {
16148 std::string TypeStr;
16149 llvm::raw_string_ostream rso(TypeStr);
16150 Ty->print(rso);
16151 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
16152 << "Cannot SLP vectorize list: type "
16153 << rso.str() + " is unsupported by vectorizer";
16154 });
16155 return false;
16156 }
16157 }
16158
16159 unsigned Sz = R.getVectorElementSize(I0);
16160 unsigned MinVF = R.getMinVF(Sz);
16161 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
16162 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
16163 if (MaxVF < 2) {
16164 R.getORE()->emit([&]() {
16165 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
16166 << "Cannot SLP vectorize list: vectorization factor "
16167 << "less than 2 is not supported";
16168 });
16169 return false;
16170 }
16171
16172 bool Changed = false;
16173 bool CandidateFound = false;
16174 InstructionCost MinCost = SLPCostThreshold.getValue();
16175 Type *ScalarTy = VL[0]->getType();
16176 if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
16177 ScalarTy = IE->getOperand(1)->getType();
16178
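// Try vector factors from MaxVF down to MinVF, halving each time; for each
// factor, slide over the list and attempt to vectorize consecutive slices.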
16179 unsigned NextInst = 0, MaxInst = VL.size();
16180 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
16181 // No actual vectorization should happen if the number of parts is the same as
16182 // the provided vectorization factor (i.e. the scalar type would be used for
16183 // the vector code during codegen).
16184 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
16185 if (TTI->getNumberOfParts(VecTy) == VF)
16186 continue;
16187 for (unsigned I = NextInst; I < MaxInst; ++I) {
16188 unsigned ActualVF = std::min(MaxInst - I, VF);
16189
16190 if (!isPowerOf2_32(ActualVF))
16191 continue;
16192
16193 if (MaxVFOnly && ActualVF < MaxVF)
16194 break;
16195 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16196 break;
16197
16198 ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
16199 // Check that a previous iteration of this loop did not delete the Value.
16200 if (llvm::any_of(Ops, [&R](Value *V) {
16201 auto *I = dyn_cast<Instruction>(V);
16202 return I && R.isDeleted(I);
16203 }))
16204 continue;
16205
16206 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
16207 << "\n");
16208
16209 R.buildTree(Ops);
16210 if (R.isTreeTinyAndNotFullyVectorizable())
16211 continue;
16212 R.reorderTopToBottom();
16213 R.reorderBottomToTop(
16214 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
16215 !R.doesRootHaveInTreeUses());
16216 R.buildExternalUses();
16217
16218 R.computeMinimumValueSizes();
16219 R.transformNodes();
16220 InstructionCost Cost = R.getTreeCost();
16221 CandidateFound = true;
16222 MinCost = std::min(MinCost, Cost);
16223
16224 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16225 << " for VF=" << ActualVF << "\n");
16226 if (Cost < -SLPCostThreshold) {
16227 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
16228 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
16229 cast<Instruction>(Ops[0]))
16230 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
16231 << " and with tree size "
16232 << ore::NV("TreeSize", R.getTreeSize()));
16233
16234 R.vectorizeTree();
16235 // Move to the next bundle.
16236 I += VF - 1;
16237 NextInst = I + 1;
16238 Changed = true;
16239 }
16240 }
16241 }
16242
16243 if (!Changed && CandidateFound) {
16244 R.getORE()->emit([&]() {
16245 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
16246 << "List vectorization was possible but not beneficial with cost "
16247 << ore::NV("Cost", MinCost) << " >= "
16248 << ore::NV("Threshold", -SLPCostThreshold);
16249 });
16250 } else if (!Changed) {
16251 R.getORE()->emit([&]() {
16252 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
16253 << "Cannot SLP vectorize list: vectorization was impossible"
16254 << " with available vectorization factors";
16255 });
16256 }
16257 return Changed;
16258}
16259
16260bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
16261 if (!I)
16262 return false;
16263
16264 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
16265 return false;
16266
16267 Value *P = I->getParent();
16268
16269 // Vectorize in current basic block only.
16270 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
16271 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
16272 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
16273 return false;
16274
16275 // First collect all possible candidates
16276 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
16277 Candidates.emplace_back(Op0, Op1);
16278
16279 auto *A = dyn_cast<BinaryOperator>(Op0);
16280 auto *B = dyn_cast<BinaryOperator>(Op1);
16281 // Try to skip B.
16282 if (A && B && B->hasOneUse()) {
16283 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
16284 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
16285 if (B0 && B0->getParent() == P)
16286 Candidates.emplace_back(A, B0);
16287 if (B1 && B1->getParent() == P)
16288 Candidates.emplace_back(A, B1);
16289 }
16290 // Try to skip A.
16291 if (B && A && A->hasOneUse()) {
16292 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
16293 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
16294 if (A0 && A0->getParent() == P)
16295 Candidates.emplace_back(A0, B);
16296 if (A1 && A1->getParent() == P)
16297 Candidates.emplace_back(A1, B);
16298 }
16299
16300 if (Candidates.size() == 1)
16301 return tryToVectorizeList({Op0, Op1}, R);
16302
16303 // We have multiple options. Try to pick the single best.
16304 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
16305 if (!BestCandidate)
16306 return false;
16307 return tryToVectorizeList(
16308 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
16309}
16310
16311namespace {
16312
16313/// Model horizontal reductions.
16314///
16315/// A horizontal reduction is a tree of reduction instructions that has values
16316/// that can be put into a vector as its leaves. For example:
16317///
16318/// mul mul mul mul
16319/// \ / \ /
16320/// + +
16321/// \ /
16322/// +
16323/// This tree has "mul" as its leaf values and "+" as its reduction
16324/// instructions. A reduction can feed into a store or a binary operation
16325/// feeding a phi.
16326/// ...
16327/// \ /
16328/// +
16329/// |
16330/// phi +=
16331///
16332/// Or:
16333/// ...
16334/// \ /
16335/// +
16336/// |
16337/// *p =
16338///
16339class HorizontalReduction {
16340 using ReductionOpsType = SmallVector<Value *, 16>;
16341 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
16342 ReductionOpsListType ReductionOps;
16343 /// List of possibly reduced values.
16345 /// Maps reduced value to the corresponding reduction operation.
16347 // Use map vector to make stable output.
16349 WeakTrackingVH ReductionRoot;
16350 /// The type of reduction operation.
16351 RecurKind RdxKind;
16352 /// Checks if the optimization of original scalar identity operations on
16353 /// matched horizontal reductions is enabled and allowed.
16354 bool IsSupportedHorRdxIdentityOp = false;
16355
16356 static bool isCmpSelMinMax(Instruction *I) {
16357 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
16358 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
16359 }
16360
16361 // And/or are potentially poison-safe logical patterns like:
16362 // select x, y, false
16363 // select x, true, y
16364 static bool isBoolLogicOp(Instruction *I) {
16365 return isa<SelectInst>(I) &&
16366 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
16367 }
16368
16369 /// Checks if instruction is associative and can be vectorized.
16370 static bool isVectorizable(RecurKind Kind, Instruction *I) {
16371 if (Kind == RecurKind::None)
16372 return false;
16373
16374 // Integer ops that map to select instructions or intrinsics are fine.
16375 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
16376 isBoolLogicOp(I))
16377 return true;
16378
16379 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16380 // FP min/max are associative except for NaN and -0.0. We do not
16381 // have to rule out -0.0 here because the intrinsic semantics do not
16382 // specify a fixed result for it.
16383 return I->getFastMathFlags().noNaNs();
16384 }
16385
16386 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16387 return true;
16388
16389 return I->isAssociative();
16390 }
16391
16392 static Value *getRdxOperand(Instruction *I, unsigned Index) {
16393 // Poison-safe 'or' takes the form: select X, true, Y
16394 // To make that work with the normal operand processing, we skip the
16395 // true value operand.
16396 // TODO: Change the code and data structures to handle this without a hack.
16397 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
16398 return I->getOperand(2);
16399 return I->getOperand(Index);
16400 }
16401
16402 /// Creates reduction operation with the current opcode.
16403 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
16404 Value *RHS, const Twine &Name, bool UseSelect) {
16405 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
16406 switch (Kind) {
16407 case RecurKind::Or:
16408 if (UseSelect &&
16409 LHS->getType()->isIntOrIntVectorTy(1))
16410 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
16411 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16412 Name);
16413 case RecurKind::And:
16414 if (UseSelect &&
16415 LHS->getType()->isIntOrIntVectorTy(1))
16416 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
16417 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16418 Name);
16419 case RecurKind::Add:
16420 case RecurKind::Mul:
16421 case RecurKind::Xor:
16422 case RecurKind::FAdd:
16423 case RecurKind::FMul:
16424 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16425 Name);
16426 case RecurKind::FMax:
16427 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
16428 case RecurKind::FMin:
16429 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
16430 case RecurKind::FMaximum:
16431 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
16432 case RecurKind::FMinimum:
16433 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
16434 case RecurKind::SMax:
16435 if (UseSelect) {
16436 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
16437 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16438 }
16439 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
16440 case RecurKind::SMin:
16441 if (UseSelect) {
16442 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
16443 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16444 }
16445 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
16446 case RecurKind::UMax:
16447 if (UseSelect) {
16448 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
16449 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16450 }
16451 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
16452 case RecurKind::UMin:
16453 if (UseSelect) {
16454 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
16455 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16456 }
16457 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
16458 default:
16459 llvm_unreachable("Unknown reduction operation.");
16460 }
16461 }
16462
16463 /// Creates reduction operation with the current opcode with the IR flags
16464 /// from \p ReductionOps, dropping nuw/nsw flags.
16465 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
16466 Value *RHS, const Twine &Name,
16467 const ReductionOpsListType &ReductionOps) {
16468 bool UseSelect = ReductionOps.size() == 2 ||
16469 // Logical or/and.
16470 (ReductionOps.size() == 1 &&
16471 any_of(ReductionOps.front(), IsaPred<SelectInst>));
16472 assert((!UseSelect || ReductionOps.size() != 2 ||
16473 isa<SelectInst>(ReductionOps[1][0])) &&
16474 "Expected cmp + select pairs for reduction");
16475 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
16476 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
16477 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
16478 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
16479 /*IncludeWrapFlags=*/false);
16480 propagateIRFlags(Op, ReductionOps[1], nullptr,
16481 /*IncludeWrapFlags=*/false);
16482 return Op;
16483 }
16484 }
16485 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
16486 return Op;
16487 }
16488
16489public:
16490 static RecurKind getRdxKind(Value *V) {
16491 auto *I = dyn_cast<Instruction>(V);
16492 if (!I)
16493 return RecurKind::None;
16494 if (match(I, m_Add(m_Value(), m_Value())))
16495 return RecurKind::Add;
16496 if (match(I, m_Mul(m_Value(), m_Value())))
16497 return RecurKind::Mul;
16498 if (match(I, m_And(m_Value(), m_Value())) ||
16499 match(I, m_LogicalAnd(m_Value(), m_Value())))
16500 return RecurKind::And;
16501 if (match(I, m_Or(m_Value(), m_Value())) ||
16502 match(I, m_LogicalOr(m_Value(), m_Value())))
16503 return RecurKind::Or;
16504 if (match(I, m_Xor(m_Value(), m_Value())))
16505 return RecurKind::Xor;
16506 if (match(I, m_FAdd(m_Value(), m_Value())))
16507 return RecurKind::FAdd;
16508 if (match(I, m_FMul(m_Value(), m_Value())))
16509 return RecurKind::FMul;
16510
16511 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
16512 return RecurKind::FMax;
16513 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
16514 return RecurKind::FMin;
16515
16516 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
16517 return RecurKind::FMaximum;
16518 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
16519 return RecurKind::FMinimum;
16520 // This matches either cmp+select or intrinsics. SLP is expected to handle
16521 // either form.
16522 // TODO: If we are canonicalizing to intrinsics, we can remove several
16523 // special-case paths that deal with selects.
16524 if (match(I, m_SMax(m_Value(), m_Value())))
16525 return RecurKind::SMax;
16526 if (match(I, m_SMin(m_Value(), m_Value())))
16527 return RecurKind::SMin;
16528 if (match(I, m_UMax(m_Value(), m_Value())))
16529 return RecurKind::UMax;
16530 if (match(I, m_UMin(m_Value(), m_Value())))
16531 return RecurKind::UMin;
16532
16533 if (auto *Select = dyn_cast<SelectInst>(I)) {
16534 // Try harder: look for min/max pattern based on instructions producing
16535 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
16536 // During the intermediate stages of SLP, it's very common to have
16537 // pattern like this (since optimizeGatherSequence is run only once
16538 // at the end):
16539 // %1 = extractelement <2 x i32> %a, i32 0
16540 // %2 = extractelement <2 x i32> %a, i32 1
16541 // %cond = icmp sgt i32 %1, %2
16542 // %3 = extractelement <2 x i32> %a, i32 0
16543 // %4 = extractelement <2 x i32> %a, i32 1
16544 // %select = select i1 %cond, i32 %3, i32 %4
16545 CmpInst::Predicate Pred;
16546 Instruction *L1;
16547 Instruction *L2;
16548
16549 Value *LHS = Select->getTrueValue();
16550 Value *RHS = Select->getFalseValue();
16551 Value *Cond = Select->getCondition();
16552
16553 // TODO: Support inverse predicates.
16554 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
16555 if (!isa<ExtractElementInst>(RHS) ||
16556 !L2->isIdenticalTo(cast<Instruction>(RHS)))
16557 return RecurKind::None;
16558 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
16559 if (!isa<ExtractElementInst>(LHS) ||
16560 !L1->isIdenticalTo(cast<Instruction>(LHS)))
16561 return RecurKind::None;
16562 } else {
16563 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
16564 return RecurKind::None;
16565 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
16566 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
16567 !L2->isIdenticalTo(cast<Instruction>(RHS)))
16568 return RecurKind::None;
16569 }
16570
16571 switch (Pred) {
16572 default:
16573 return RecurKind::None;
16574 case CmpInst::ICMP_SGT:
16575 case CmpInst::ICMP_SGE:
16576 return RecurKind::SMax;
16577 case CmpInst::ICMP_SLT:
16578 case CmpInst::ICMP_SLE:
16579 return RecurKind::SMin;
16580 case CmpInst::ICMP_UGT:
16581 case CmpInst::ICMP_UGE:
16582 return RecurKind::UMax;
16583 case CmpInst::ICMP_ULT:
16584 case CmpInst::ICMP_ULE:
16585 return RecurKind::UMin;
16586 }
16587 }
16588 return RecurKind::None;
16589 }
16590
16591 /// Get the index of the first operand.
16592 static unsigned getFirstOperandIndex(Instruction *I) {
16593 return isCmpSelMinMax(I) ? 1 : 0;
16594 }
16595
16596private:
16597 /// Total number of operands in the reduction operation.
16598 static unsigned getNumberOfOperands(Instruction *I) {
16599 return isCmpSelMinMax(I) ? 3 : 2;
16600 }
16601
16602 /// Checks if the instruction is in basic block \p BB.
16603 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16604 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16605 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16606 auto *Sel = cast<SelectInst>(I);
16607 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
16608 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16609 }
16610 return I->getParent() == BB;
16611 }
16612
16613 /// Expected number of uses for reduction operations/reduced values.
16614 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16615 if (IsCmpSelMinMax) {
16616 // SelectInst must be used twice while the condition op must have a single
16617 // use only.
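// For illustration (hypothetical IR): in a chained smax reduction
//   %c0 = icmp sgt i32 %a, %b
//   %m0 = select i1 %c0, i32 %a, i32 %b
//   %c1 = icmp sgt i32 %m0, %d
//   %m1 = select i1 %c1, i32 %m0, i32 %d
// %m0 feeds both %c1 and %m1 (two uses), while %c0 feeds only %m0.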
16618 if (auto *Sel = dyn_cast<SelectInst>(I))
16619 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
16620 return I->hasNUses(2);
16621 }
16622
16623 // Arithmetic reduction operation must be used once only.
16624 return I->hasOneUse();
16625 }
16626
16627 /// Initializes the list of reduction operations.
16628 void initReductionOps(Instruction *I) {
16629 if (isCmpSelMinMax(I))
16630 ReductionOps.assign(2, ReductionOpsType());
16631 else
16632 ReductionOps.assign(1, ReductionOpsType());
16633 }
16634
16635 /// Add all reduction operations for the reduction instruction \p I.
16636 void addReductionOps(Instruction *I) {
16637 if (isCmpSelMinMax(I)) {
16638 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
16639 ReductionOps[1].emplace_back(I);
16640 } else {
16641 ReductionOps[0].emplace_back(I);
16642 }
16643 }
16644
16645 static bool isGoodForReduction(ArrayRef<Value *> Data) {
16646 int Sz = Data.size();
16647 auto *I = dyn_cast<Instruction>(Data.front());
16648 return Sz > 1 || isConstant(Data.front()) ||
16649 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
16650 }
16651
16652public:
16653 HorizontalReduction() = default;
16654
16655 /// Try to find a reduction tree.
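 /// For illustration (hypothetical IR), a chain such as
 ///   %a0 = add i32 %x0, %x1
 ///   %a1 = add i32 %a0, %x2
 ///   %a2 = add i32 %a1, %x3
 /// is matched with %a2 as the reduction root and %x0..%x3 as the reduced
 /// values.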
16656 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16657 ScalarEvolution &SE, const DataLayout &DL,
16658 const TargetLibraryInfo &TLI) {
16659 RdxKind = HorizontalReduction::getRdxKind(Root);
16660 if (!isVectorizable(RdxKind, Root))
16661 return false;
16662
16663 // Analyze "regular" integer/FP types for reductions - no target-specific
16664 // types or pointers.
16665 Type *Ty = Root->getType();
16666 if (!isValidElementType(Ty) || Ty->isPointerTy())
16667 return false;
16668
16669 // Though the ultimate reduction may have multiple uses, its condition must
16670 // have only a single use.
16671 if (auto *Sel = dyn_cast<SelectInst>(Root))
16672 if (!Sel->getCondition()->hasOneUse())
16673 return false;
16674
16675 ReductionRoot = Root;
16676
16677 // Iterate through all the operands of the possible reduction tree and
16678 // gather all the reduced values, sorting them by their value id.
16679 BasicBlock *BB = Root->getParent();
16680 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
16681 SmallVector<Instruction *> Worklist(1, Root);
16682 // Checks if the operands of the \p TreeN instruction are also reduction
16683 // operations or should be treated as reduced values or an extra argument,
16684 // which is not part of the reduction.
16685 auto CheckOperands = [&](Instruction *TreeN,
16686 SmallVectorImpl<Value *> &ExtraArgs,
16687 SmallVectorImpl<Value *> &PossibleReducedVals,
16688 SmallVectorImpl<Instruction *> &ReductionOps) {
16689 for (int I = getFirstOperandIndex(TreeN),
16690 End = getNumberOfOperands(TreeN);
16691 I < End; ++I) {
16692 Value *EdgeVal = getRdxOperand(TreeN, I);
16693 ReducedValsToOps[EdgeVal].push_back(TreeN);
16694 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
16695 // Edge has wrong parent - mark as an extra argument.
16696 if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
16697 !hasSameParent(EdgeInst, BB)) {
16698 ExtraArgs.push_back(EdgeVal);
16699 continue;
16700 }
16701 // If the edge is not an instruction, or it is different from the main
16702 // reduction opcode or has too many uses - possible reduced value.
16703 // Also, do not try to reduce const values, if the operation is not
16704 // foldable.
16705 if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
16706 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
16707 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
16708 !isVectorizable(RdxKind, EdgeInst) ||
16709 (R.isAnalyzedReductionRoot(EdgeInst) &&
16710 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
16711 PossibleReducedVals.push_back(EdgeVal);
16712 continue;
16713 }
16714 ReductionOps.push_back(EdgeInst);
16715 }
16716 };
16717 // Try to regroup reduced values so that it gets more profitable to try to
16718 // reduce them. Values are grouped by their value ids, instructions - by
16719 // instruction op id and/or alternate op id, plus do extra analysis for
16720 // loads (grouping them by the distance between pointers) and cmp
16721 // instructions (grouping them by the predicate).
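// For illustration: loads are keyed by the underlying object of their pointer
// operand, so loads from a[0] and a[1] tend to land in the same group, while
// loads from an unrelated base form a separate one.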
16722 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
16723 PossibleReducedVals;
16724 initReductionOps(Root);
16725 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
16726 SmallSet<size_t, 2> LoadKeyUsed;
16727 SmallPtrSet<Value *, 4> DoNotReverseVals;
16728
16729 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
16730 Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
16731 if (LoadKeyUsed.contains(Key)) {
16732 auto LIt = LoadsMap.find(Ptr);
16733 if (LIt != LoadsMap.end()) {
16734 for (LoadInst *RLI : LIt->second) {
16735 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
16736 LI->getType(), LI->getPointerOperand(), DL, SE,
16737 /*StrictCheck=*/true))
16738 return hash_value(RLI->getPointerOperand());
16739 }
16740 for (LoadInst *RLI : LIt->second) {
16741 if (arePointersCompatible(RLI->getPointerOperand(),
16742 LI->getPointerOperand(), TLI)) {
16743 hash_code SubKey = hash_value(RLI->getPointerOperand());
16744 DoNotReverseVals.insert(RLI);
16745 return SubKey;
16746 }
16747 }
16748 if (LIt->second.size() > 2) {
16749 hash_code SubKey =
16750 hash_value(LIt->second.back()->getPointerOperand());
16751 DoNotReverseVals.insert(LIt->second.back());
16752 return SubKey;
16753 }
16754 }
16755 }
16756 LoadKeyUsed.insert(Key);
16757 LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
16758 return hash_value(LI->getPointerOperand());
16759 };
16760
16761 while (!Worklist.empty()) {
16762 Instruction *TreeN = Worklist.pop_back_val();
16763 SmallVector<Value *> Args;
16764 SmallVector<Value *> PossibleRedVals;
16765 SmallVector<Instruction *> PossibleReductionOps;
16766 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16767 // If too many extra args - mark the instruction itself as a reduction
16768 // value, not a reduction operation.
16769 if (Args.size() < 2) {
16770 addReductionOps(TreeN);
16771 // Add extra args.
16772 if (!Args.empty()) {
16773 assert(Args.size() == 1 && "Expected only single argument.");
16774 ExtraArgs[TreeN] = Args.front();
16775 }
16776 // Add reduction values. The values are sorted for better vectorization
16777 // results.
16778 for (Value *V : PossibleRedVals) {
16779 size_t Key, Idx;
16780 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
16781 /*AllowAlternate=*/false);
16782 ++PossibleReducedVals[Key][Idx]
16783 .insert(std::make_pair(V, 0))
16784 .first->second;
16785 }
16786 Worklist.append(PossibleReductionOps.rbegin(),
16787 PossibleReductionOps.rend());
16788 } else {
16789 size_t Key, Idx;
16790 std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
16791 /*AllowAlternate=*/false);
16792 ++PossibleReducedVals[Key][Idx]
16793 .insert(std::make_pair(TreeN, 0))
16794 .first->second;
16795 }
16796 }
16797 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
16798 // Sort values by the total number of value kinds to start the reduction
16799 // from the longest possible reduced-value sequences.
16800 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
16801 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
16802 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
16803 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
16804 It != E; ++It) {
16805 PossibleRedValsVect.emplace_back();
16806 auto RedValsVect = It->second.takeVector();
16807 stable_sort(RedValsVect, llvm::less_second());
16808 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
16809 PossibleRedValsVect.back().append(Data.second, Data.first);
16810 }
16811 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
16812 return P1.size() > P2.size();
16813 });
16814 int NewIdx = -1;
16815 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
16816 if (isGoodForReduction(Data) ||
16817 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
16818 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
16819 getUnderlyingObject(
16820 cast<LoadInst>(Data.front())->getPointerOperand()) ==
16821 getUnderlyingObject(cast<LoadInst>(ReducedVals[NewIdx].front())
16822 ->getPointerOperand()))) {
16823 if (NewIdx < 0) {
16824 NewIdx = ReducedVals.size();
16825 ReducedVals.emplace_back();
16826 }
16827 if (DoNotReverseVals.contains(Data.front()))
16828 ReducedVals[NewIdx].append(Data.begin(), Data.end());
16829 else
16830 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
16831 } else {
16832 ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
16833 }
16834 }
16835 }
16836 // Sort the reduced values by number of same/alternate opcode and/or pointer
16837 // operand.
16838 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
16839 return P1.size() > P2.size();
16840 });
16841 return true;
16842 }
16843
16844 /// Attempt to vectorize the tree found by matchAssociativeReduction.
16845 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
16846 const TargetLibraryInfo &TLI) {
16847 constexpr int ReductionLimit = 4;
16848 constexpr unsigned RegMaxNumber = 4;
16849 constexpr unsigned RedValsMaxNumber = 128;
16850 // If there are a sufficient number of reduction values, reduce
16851 // to a nearby power-of-2. We can safely generate oversized
16852 // vectors and rely on the backend to split them to legal sizes.
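// For illustration: with 7 reduced values the first attempt uses a vector
// factor of 4 (bit_floor(7)); values that do not fit a vectorized step are
// folded into the result with scalar reduction ops afterwards.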
16853 unsigned NumReducedVals =
16854 std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
16855 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
16856 if (!isGoodForReduction(Vals))
16857 return Num;
16858 return Num + Vals.size();
16859 });
16860 if (NumReducedVals < ReductionLimit &&
16861 (!AllowHorRdxIdenityOptimization ||
16862 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
16863 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
16864 }))) {
16865 for (ReductionOpsType &RdxOps : ReductionOps)
16866 for (Value *RdxOp : RdxOps)
16867 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16868 return nullptr;
16869 }
16870
16871 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
16872 TargetFolder(DL));
16873 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
16874
16875 // Track the reduced values in case they are replaced by extractelement
16876 // because of the vectorization.
16877 DenseMap<Value *, Value *> TrackedVals(
16878 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
16879 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
16880 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
16881 ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
16882 // The same extra argument may be used several times, so log each attempt
16883 // to use it.
16884 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16885 assert(Pair.first && "DebugLoc must be set.");
16886 ExternallyUsedValues[Pair.second].push_back(Pair.first);
16887 TrackedVals.try_emplace(Pair.second, Pair.second);
16888 }
16889
16890 // The compare instruction of a min/max is the insertion point for new
16891 // instructions and may be replaced with a new compare instruction.
16892 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
16893 assert(isa<SelectInst>(RdxRootInst) &&
16894 "Expected min/max reduction to have select root instruction");
16895 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
16896 assert(isa<Instruction>(ScalarCond) &&
16897 "Expected min/max reduction to have compare condition");
16898 return cast<Instruction>(ScalarCond);
16899 };
16900
16901 // Return new VectorizedTree, based on previous value.
16902 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
16903 if (VectorizedTree) {
16904 // Update the final value in the reduction.
16905 Builder.SetCurrentDebugLocation(
16906 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
16907 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
16908 (isGuaranteedNotToBePoison(Res) &&
16909 !isGuaranteedNotToBePoison(VectorizedTree))) {
16910 auto It = ReducedValsToOps.find(Res);
16911 if (It != ReducedValsToOps.end() &&
16912 any_of(It->getSecond(),
16913 [](Instruction *I) { return isBoolLogicOp(I); }))
16914 std::swap(VectorizedTree, Res);
16915 }
16916
16917 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
16918 ReductionOps);
16919 }
16920 // Initialize the final value in the reduction.
16921 return Res;
16922 };
16923 bool AnyBoolLogicOp =
16924 any_of(ReductionOps.back(), [](Value *V) {
16925 return isBoolLogicOp(cast<Instruction>(V));
16926 });
16927 // The reduction root is used as the insertion point for new instructions,
16928 // so set it as externally used to prevent it from being deleted.
16929 ExternallyUsedValues[ReductionRoot];
16930 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
16931 ReductionOps.front().size());
16932 for (ReductionOpsType &RdxOps : ReductionOps)
16933 for (Value *RdxOp : RdxOps) {
16934 if (!RdxOp)
16935 continue;
16936 IgnoreList.insert(RdxOp);
16937 }
16938 // Intersect the fast-math-flags from all reduction operations.
16939 FastMathFlags RdxFMF;
16940 RdxFMF.set();
16941 for (Value *U : IgnoreList)
16942 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
16943 RdxFMF &= FPMO->getFastMathFlags();
16944 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
16945
16946 // Need to track reduced vals, they may be changed during vectorization of
16947 // subvectors.
16948 for (ArrayRef<Value *> Candidates : ReducedVals)
16949 for (Value *V : Candidates)
16950 TrackedVals.try_emplace(V, V);
16951
16952 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
16953 // List of the values that were reduced in other trees as part of gather
16954 // nodes and thus requiring extract if fully vectorized in other trees.
16955 SmallPtrSet<Value *, 4> RequiredExtract;
16956 Value *VectorizedTree = nullptr;
16957 bool CheckForReusedReductionOps = false;
16958 // Try to vectorize elements based on their type.
16959 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
16960 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
16961 InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
16962 SmallVector<Value *> Candidates;
16963 Candidates.reserve(2 * OrigReducedVals.size());
16964 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
16965 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
16966 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
16967 // Check if the reduction value was not overridden by the extractelement
16968 // instruction because of the vectorization and exclude it, if it is not
16969 // compatible with other values.
16970 // Also check if the instruction was folded to constant/other value.
16971 auto *Inst = dyn_cast<Instruction>(RdxVal);
16972 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
16973 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
16974 (S.getOpcode() && !Inst))
16975 continue;
16976 Candidates.push_back(RdxVal);
16977 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
16978 }
16979 bool ShuffledExtracts = false;
16980 // Try to handle shuffled extractelements.
16981 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
16982 I + 1 < E) {
16983 InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
16984 if (NextS.getOpcode() == Instruction::ExtractElement &&
16985 !NextS.isAltShuffle()) {
16986 SmallVector<Value *> CommonCandidates(Candidates);
16987 for (Value *RV : ReducedVals[I + 1]) {
16988 Value *RdxVal = TrackedVals.find(RV)->second;
16989 // Check if the reduction value was not overridden by the
16990 // extractelement instruction because of the vectorization and
16991 // exclude it, if it is not compatible with other values.
16992 if (auto *Inst = dyn_cast<Instruction>(RdxVal))
16993 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
16994 continue;
16995 CommonCandidates.push_back(RdxVal);
16996 TrackedToOrig.try_emplace(RdxVal, RV);
16997 }
16998 SmallVector<int> Mask;
16999 if (isFixedVectorShuffle(CommonCandidates, Mask)) {
17000 ++I;
17001 Candidates.swap(CommonCandidates);
17002 ShuffledExtracts = true;
17003 }
17004 }
17005 }
17006
17007 // Emit code for constant values.
17008 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
17009 allConstant(Candidates)) {
17010 Value *Res = Candidates.front();
17011 ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
17012 for (Value *VC : ArrayRef(Candidates).drop_front()) {
17013 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
17014 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
17015 if (auto *ResI = dyn_cast<Instruction>(Res))
17016 V.analyzedReductionRoot(ResI);
17017 }
17018 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17019 continue;
17020 }
17021
17022 unsigned NumReducedVals = Candidates.size();
17023 if (NumReducedVals < ReductionLimit &&
17024 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
17025 !isSplat(Candidates)))
17026 continue;
17027
17028 // Check if we support repeated scalar values processing (optimization of
17029 // original scalar identity operations on matched horizontal reductions).
17030 IsSupportedHorRdxIdentityOp =
17031 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
17032 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17033 // Gather same values.
17034 MapVector<Value *, unsigned> SameValuesCounter;
17035 if (IsSupportedHorRdxIdentityOp)
17036 for (Value *V : Candidates)
17037 ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
17038 // Used to check if the reduced values are used the same number of times. In this
17039 // case the compiler may produce better code. E.g. if reduced values are
17040 // aabbccdd (8 x values), then the first node of the tree will have a node
17041 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
17042 // Plus, the final reduction will be performed on <8 x aabbccdd>.
17043 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
17044 // x abcd) * 2.
17045 // Currently it only handles add/fadd/xor. and/or/min/max do not require
17046 // this analysis, other operations may require an extra estimation of
17047 // the profitability.
17048 bool SameScaleFactor = false;
17049 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17050 SameValuesCounter.size() != Candidates.size();
17051 if (OptReusedScalars) {
17052 SameScaleFactor =
17053 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17054 RdxKind == RecurKind::Xor) &&
17055 all_of(drop_begin(SameValuesCounter),
17056 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
17057 return P.second == SameValuesCounter.front().second;
17058 });
17059 Candidates.resize(SameValuesCounter.size());
17060 transform(SameValuesCounter, Candidates.begin(),
17061 [](const auto &P) { return P.first; });
17062 NumReducedVals = Candidates.size();
17063 // Have a reduction of the same element.
17064 if (NumReducedVals == 1) {
17065 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
17066 unsigned Cnt = SameValuesCounter.lookup(OrigV);
17067 Value *RedVal =
17068 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
17069 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17070 VectorizedVals.try_emplace(OrigV, Cnt);
17071 continue;
17072 }
17073 }
17074
17075 unsigned MaxVecRegSize = V.getMaxVecRegSize();
17076 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
17077 unsigned MaxElts =
17078 RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
17079
17080 unsigned ReduxWidth = std::min<unsigned>(
17081 llvm::bit_floor(NumReducedVals),
17082 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
17083 RegMaxNumber * RedValsMaxNumber));
17084 unsigned Start = 0;
17085 unsigned Pos = Start;
17086 // Restarts vectorization attempt with lower vector factor.
17087 unsigned PrevReduxWidth = ReduxWidth;
17088 bool CheckForReusedReductionOpsLocal = false;
17089 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17090 &CheckForReusedReductionOpsLocal,
17091 &PrevReduxWidth, &V,
17092 &IgnoreList](bool IgnoreVL = false) {
17093 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
17094 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
17095 // Check if any of the reduction ops are gathered. If so, worth
17096 // trying again with a smaller number of reduction ops.
17097 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17098 }
17099 ++Pos;
17100 if (Pos < NumReducedVals - ReduxWidth + 1)
17101 return IsAnyRedOpGathered;
17102 Pos = Start;
17103 ReduxWidth /= 2;
17104 return IsAnyRedOpGathered;
17105 };
17106 bool AnyVectorized = false;
17107 while (Pos < NumReducedVals - ReduxWidth + 1 &&
17108 ReduxWidth >= ReductionLimit) {
17109 // Dependency in tree of the reduction ops - drop this attempt, try
17110 // later.
17111 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17112 Start == 0) {
17113 CheckForReusedReductionOps = true;
17114 break;
17115 }
17116 PrevReduxWidth = ReduxWidth;
17117 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
17118 // Being analyzed already - skip.
17119 if (V.areAnalyzedReductionVals(VL)) {
17120 (void)AdjustReducedVals(/*IgnoreVL=*/true);
17121 continue;
17122 }
17123 // Early exit if any of the reduction values were deleted during
17124 // previous vectorization attempts.
17125 if (any_of(VL, [&V](Value *RedVal) {
17126 auto *RedValI = dyn_cast<Instruction>(RedVal);
17127 if (!RedValI)
17128 return false;
17129 return V.isDeleted(RedValI);
17130 }))
17131 break;
17132 V.buildTree(VL, IgnoreList);
17133 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
17134 if (!AdjustReducedVals())
17135 V.analyzedReductionVals(VL);
17136 continue;
17137 }
17138 if (V.isLoadCombineReductionCandidate(RdxKind)) {
17139 if (!AdjustReducedVals())
17140 V.analyzedReductionVals(VL);
17141 continue;
17142 }
17143 V.reorderTopToBottom();
17144 // No need to reorder the root node at all.
17145 V.reorderBottomToTop(/*IgnoreReorder=*/true);
17146 // Keep extracted other reduction values, if they are used in the
17147 // vectorization trees.
17148 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
17149 ExternallyUsedValues);
17150 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
17151 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
17152 continue;
17153 for (Value *V : ReducedVals[Cnt])
17154 if (isa<Instruction>(V))
17155 LocalExternallyUsedValues[TrackedVals[V]];
17156 }
17157 if (!IsSupportedHorRdxIdentityOp) {
17158 // Number of uses of the candidates in the vector of values.
17159 assert(SameValuesCounter.empty() &&
17160 "Reused values counter map is not empty");
17161 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17162 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17163 continue;
17164 Value *V = Candidates[Cnt];
17165 Value *OrigV = TrackedToOrig.find(V)->second;
17166 ++SameValuesCounter[OrigV];
17167 }
17168 }
17169 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
17170 // Gather externally used values.
17171 SmallPtrSet<Value *, 4> Visited;
17172 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17173 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17174 continue;
17175 Value *RdxVal = Candidates[Cnt];
17176 if (!Visited.insert(RdxVal).second)
17177 continue;
17178 // Check if the scalar was vectorized as part of the vectorization
17179 // tree but not the top node.
17180 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
17181 LocalExternallyUsedValues[RdxVal];
17182 continue;
17183 }
17184 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17185 unsigned NumOps =
17186 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
17187 if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
17188 LocalExternallyUsedValues[RdxVal];
17189 }
17190 // Do not need the list of reused scalars in regular mode anymore.
17191 if (!IsSupportedHorRdxIdentityOp)
17192 SameValuesCounter.clear();
17193 for (Value *RdxVal : VL)
17194 if (RequiredExtract.contains(RdxVal))
17195 LocalExternallyUsedValues[RdxVal];
17196 // Update LocalExternallyUsedValues for the scalar, replaced by
17197 // extractelement instructions.
17198 DenseMap<Value *, Value *> ReplacementToExternal;
17199 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17200 ReplacementToExternal.try_emplace(Pair.second, Pair.first);
17201 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17202 Value *Ext = Pair.first;
17203 auto RIt = ReplacementToExternal.find(Ext);
17204 while (RIt != ReplacementToExternal.end()) {
17205 Ext = RIt->second;
17206 RIt = ReplacementToExternal.find(Ext);
17207 }
17208 auto *It = ExternallyUsedValues.find(Ext);
17209 if (It == ExternallyUsedValues.end())
17210 continue;
17211 LocalExternallyUsedValues[Pair.second].append(It->second);
17212 }
17213 V.buildExternalUses(LocalExternallyUsedValues);
17214
17215 V.computeMinimumValueSizes();
17216 V.transformNodes();
17217
17218 // Estimate cost.
17219 InstructionCost TreeCost = V.getTreeCost(VL);
17220 InstructionCost ReductionCost =
17221 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
17222 InstructionCost Cost = TreeCost + ReductionCost;
17223 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
17224 << " for reduction\n");
17225 if (!Cost.isValid())
17226 break;
17227 if (Cost >= -SLPCostThreshold) {
17228 V.getORE()->emit([&]() {
17229 return OptimizationRemarkMissed(
17230 SV_NAME, "HorSLPNotBeneficial",
17231 ReducedValsToOps.find(VL[0])->second.front())
17232 << "Vectorizing horizontal reduction is possible "
17233 << "but not beneficial with cost " << ore::NV("Cost", Cost)
17234 << " and threshold "
17235 << ore::NV("Threshold", -SLPCostThreshold);
17236 });
17237 if (!AdjustReducedVals())
17238 V.analyzedReductionVals(VL);
17239 continue;
17240 }
17241
17242 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
17243 << Cost << ". (HorRdx)\n");
17244 V.getORE()->emit([&]() {
17245 return OptimizationRemark(
17246 SV_NAME, "VectorizedHorizontalReduction",
17247 ReducedValsToOps.find(VL[0])->second.front())
17248 << "Vectorized horizontal reduction with cost "
17249 << ore::NV("Cost", Cost) << " and with tree size "
17250 << ore::NV("TreeSize", V.getTreeSize());
17251 });
17252
17253 Builder.setFastMathFlags(RdxFMF);
17254
17255 // Emit a reduction. If the root is a select (min/max idiom), the insert
17256 // point is the compare condition of that select.
17257 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
17258 Instruction *InsertPt = RdxRootInst;
17259 if (IsCmpSelMinMax)
17260 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17261
17262 // Vectorize a tree.
17263 Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
17264 ReplacedExternals, InsertPt);
17265
17266 Builder.SetInsertPoint(InsertPt);
17267
17268 // To prevent poison from leaking across what used to be sequential,
17269 // safe, scalar boolean logic operations, the reduction operand must be
17270 // frozen.
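// For illustration: scalar 'select %a, %b, false' never propagates a poison
// %b when %a is false, but a plain vector 'and' reduction over all lanes
// would, hence the freeze below.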
17271 if ((isBoolLogicOp(RdxRootInst) ||
17272 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17273 !isGuaranteedNotToBePoison(VectorizedRoot))
17274 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
17275
17276 // Emit code to correctly handle reused reduced values, if required.
17277 if (OptReusedScalars && !SameScaleFactor) {
17278 VectorizedRoot =
17279 emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
17280 SameValuesCounter, TrackedToOrig);
17281 }
17282
17283 Value *ReducedSubTree =
17284 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
17285 if (ReducedSubTree->getType() != VL.front()->getType()) {
17286 ReducedSubTree = Builder.CreateIntCast(
17287 ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
17288 KnownBits Known = computeKnownBits(
17289 R, cast<Instruction>(ReductionOps.front().front())
17290 ->getModule()
17291 ->getDataLayout());
17292 return !Known.isNonNegative();
17293 }));
17294 }
17295
17296 // Improved analysis for add/fadd/xor reductions with same scale factor
17297 // for all operands of reductions. We can emit scalar ops for them
17298 // instead.
17299 if (OptReusedScalars && SameScaleFactor)
17300 ReducedSubTree = emitScaleForReusedOps(
17301 ReducedSubTree, Builder, SameValuesCounter.front().second);
17302
17303 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17304 // Count vectorized reduced values to exclude them from final reduction.
17305 for (Value *RdxVal : VL) {
17306 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17307 if (IsSupportedHorRdxIdentityOp) {
17308 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17309 continue;
17310 }
17311 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17312 if (!V.isVectorized(RdxVal))
17313 RequiredExtract.insert(RdxVal);
17314 }
17315 Pos += ReduxWidth;
17316 Start = Pos;
17317 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
17318 AnyVectorized = true;
17319 }
17320 if (OptReusedScalars && !AnyVectorized) {
17321 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
17322 Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
17323 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17324 Value *OrigV = TrackedToOrig.find(P.first)->second;
17325 VectorizedVals.try_emplace(OrigV, P.second);
17326 }
17327 continue;
17328 }
17329 }
17330 if (VectorizedTree) {
17331 // Reorder operands of bool logical op in the natural order to avoid
17332 // possible problem with poison propagation. If not possible to reorder
17333 // (both operands are originally RHS), emit an extra freeze instruction
17334 // for the LHS operand.
17335 // I.e., if we have original code like this:
17336 // RedOp1 = select i1 ?, i1 LHS, i1 false
17337 // RedOp2 = select i1 RHS, i1 ?, i1 false
17338
17339 // Then, we swap LHS/RHS to create a new op that matches the poison
17340 // semantics of the original code.
17341
17342 // If we have original code like this and both values could be poison:
17343 // RedOp1 = select i1 ?, i1 LHS, i1 false
17344 // RedOp2 = select i1 ?, i1 RHS, i1 false
17345
17346 // Then, we must freeze LHS in the new op.
17347 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
17348 Instruction *RedOp1,
17349 Instruction *RedOp2,
17350 bool InitStep) {
17351 if (!AnyBoolLogicOp)
17352 return;
17353 if (isBoolLogicOp(RedOp1) &&
17354 ((!InitStep && LHS == VectorizedTree) ||
17355 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
17356 return;
17357 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
17358 getRdxOperand(RedOp2, 0) == RHS ||
17359 isGuaranteedNotToBePoison(RHS))) {
17360 std::swap(LHS, RHS);
17361 return;
17362 }
17363 if (LHS != VectorizedTree)
17364 LHS = Builder.CreateFreeze(LHS);
17365 };
17366 // Finish the reduction.
17367 // Need to add extra arguments and not vectorized possible reduction
17368 // values.
17369 // Try to avoid dependencies between the scalar remainders after
17370 // reductions.
17371 auto FinalGen =
17372 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
17373 bool InitStep) {
17374 unsigned Sz = InstVals.size();
17375 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
17376 Sz % 2);
17377 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
17378 Instruction *RedOp = InstVals[I + 1].first;
17379 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
17380 Value *RdxVal1 = InstVals[I].second;
17381 Value *StableRdxVal1 = RdxVal1;
17382 auto It1 = TrackedVals.find(RdxVal1);
17383 if (It1 != TrackedVals.end())
17384 StableRdxVal1 = It1->second;
17385 Value *RdxVal2 = InstVals[I + 1].second;
17386 Value *StableRdxVal2 = RdxVal2;
17387 auto It2 = TrackedVals.find(RdxVal2);
17388 if (It2 != TrackedVals.end())
17389 StableRdxVal2 = It2->second;
17390 // To prevent poison from leaking across what used to be
17391 // sequential, safe, scalar boolean logic operations, the
17392 // reduction operand must be frozen.
17393 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
17394 RedOp, InitStep);
17395 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17396 StableRdxVal2, "op.rdx", ReductionOps);
17397 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
17398 }
17399 if (Sz % 2 == 1)
17400 ExtraReds[Sz / 2] = InstVals.back();
17401 return ExtraReds;
17402 };
17403 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
17404 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
17405 VectorizedTree);
17406 SmallPtrSet<Value *, 8> Visited;
17407 for (ArrayRef<Value *> Candidates : ReducedVals) {
17408 for (Value *RdxVal : Candidates) {
17409 if (!Visited.insert(RdxVal).second)
17410 continue;
17411 unsigned NumOps = VectorizedVals.lookup(RdxVal);
17412 for (Instruction *RedOp :
17413 ArrayRef(ReducedValsToOps.find(RdxVal)->second)
17414 .drop_back(NumOps))
17415 ExtraReductions.emplace_back(RedOp, RdxVal);
17416 }
17417 }
17418 for (auto &Pair : ExternallyUsedValues) {
17419 // Add each externally used value to the final reduction.
17420 for (auto *I : Pair.second)
17421 ExtraReductions.emplace_back(I, Pair.first);
17422 }
17423 // Iterate through all not-vectorized reduction values/extra arguments.
17424 bool InitStep = true;
17425 while (ExtraReductions.size() > 1) {
17426 VectorizedTree = ExtraReductions.front().second;
17427 SmallVector<std::pair<Instruction *, Value *>> NewReds =
17428 FinalGen(ExtraReductions, InitStep);
17429 ExtraReductions.swap(NewReds);
17430 InitStep = false;
17431 }
17432 VectorizedTree = ExtraReductions.front().second;
17433
17434 ReductionRoot->replaceAllUsesWith(VectorizedTree);
17435
17436 // The original scalar reduction is expected to have no remaining
17437 // uses outside the reduction tree itself. Assert that we got this
17438 // correct, replace internal uses with undef, and mark for eventual
17439 // deletion.
17440#ifndef NDEBUG
17441 SmallSet<Value *, 4> IgnoreSet;
17442 for (ArrayRef<Value *> RdxOps : ReductionOps)
17443 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
17444#endif
17445 for (ArrayRef<Value *> RdxOps : ReductionOps) {
17446 for (Value *Ignore : RdxOps) {
17447 if (!Ignore)
17448 continue;
17449#ifndef NDEBUG
17450 for (auto *U : Ignore->users()) {
17451 assert(IgnoreSet.count(U) &&
17452 "All users must be either in the reduction ops list.");
17453 }
17454#endif
17455 if (!Ignore->use_empty()) {
17456 Value *Undef = UndefValue::get(Ignore->getType());
17457 Ignore->replaceAllUsesWith(Undef);
17458 }
17459 V.eraseInstruction(cast<Instruction>(Ignore));
17460 }
17461 }
17462 } else if (!CheckForReusedReductionOps) {
17463 for (ReductionOpsType &RdxOps : ReductionOps)
17464 for (Value *RdxOp : RdxOps)
17465 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17466 }
17467 return VectorizedTree;
17468 }
17469
17470private:
17471 /// Calculate the cost of a reduction.
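 /// The returned cost is the vector cost minus the scalar cost, so a negative
 /// value means the vectorized reduction is expected to be cheaper.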
17472 InstructionCost getReductionCost(TargetTransformInfo *TTI,
17473 ArrayRef<Value *> ReducedVals,
17474 bool IsCmpSelMinMax, unsigned ReduxWidth,
17475 FastMathFlags FMF) {
17476 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17477 Type *ScalarTy = ReducedVals.front()->getType();
17478 FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
17479 InstructionCost VectorCost = 0, ScalarCost;
17480 // If all of the reduced values are constant, the vector cost is 0, since
17481 // the reduction value can be calculated at compile time.
17482 bool AllConsts = allConstant(ReducedVals);
17483 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
17484 InstructionCost Cost = 0;
17485 // Scalar cost is repeated for N-1 elements.
17486 int Cnt = ReducedVals.size();
17487 for (Value *RdxVal : ReducedVals) {
17488 if (Cnt == 1)
17489 break;
17490 --Cnt;
17491 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
17492 Cost += GenCostFn();
17493 continue;
17494 }
17495 InstructionCost ScalarCost = 0;
17496 for (User *U : RdxVal->users()) {
17497 auto *RdxOp = cast<Instruction>(U);
17498 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17499 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
17500 continue;
17501 }
17502 ScalarCost = InstructionCost::getInvalid();
17503 break;
17504 }
17505 if (ScalarCost.isValid())
17506 Cost += ScalarCost;
17507 else
17508 Cost += GenCostFn();
17509 }
17510 return Cost;
17511 };
17512 switch (RdxKind) {
17513 case RecurKind::Add:
17514 case RecurKind::Mul:
17515 case RecurKind::Or:
17516 case RecurKind::And:
17517 case RecurKind::Xor:
17518 case RecurKind::FAdd:
17519 case RecurKind::FMul: {
17520 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
17521 if (!AllConsts)
17522 VectorCost =
17523 TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
17524 ScalarCost = EvaluateScalarCost([&]() {
17525 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
17526 });
17527 break;
17528 }
17529 case RecurKind::FMax:
17530 case RecurKind::FMin:
17531 case RecurKind::FMaximum:
17532 case RecurKind::FMinimum:
17533 case RecurKind::SMax:
17534 case RecurKind::SMin:
17535 case RecurKind::UMax:
17536 case RecurKind::UMin: {
17537 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
17538 if (!AllConsts)
17539 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
17540 ScalarCost = EvaluateScalarCost([&]() {
17541 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
17542 return TTI->getIntrinsicInstrCost(ICA, CostKind);
17543 });
17544 break;
17545 }
17546 default:
17547 llvm_unreachable("Expected arithmetic or min/max reduction operation");
17548 }
17549
17550 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
17551 << " for reduction of " << shortBundleName(ReducedVals)
17552 << " (It is a splitting reduction)\n");
17553 return VectorCost - ScalarCost;
17554 }
17555
17556 /// Emit a horizontal reduction of the vectorized value.
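 /// For illustration (hypothetical IR), a 4-wide integer add reduction is
 /// emitted as a single intrinsic call such as
 ///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)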
17557 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
17558 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
17559 assert(VectorizedValue && "Need to have a vectorized tree node");
17560 assert(isPowerOf2_32(ReduxWidth) &&
17561 "We only handle power-of-two reductions for now");
17562 assert(RdxKind != RecurKind::FMulAdd &&
17563 "A call to the llvm.fmuladd intrinsic is not handled yet");
17564
17565 ++NumVectorInstructions;
17566 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
17567 }
17568
17569 /// Emits optimized code for unique scalar value reused \p Cnt times.
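 /// For illustration: an add reduction of the same scalar repeated Cnt times,
 /// x + x + ... + x, is emitted as 'mul x, Cnt'; for xor the result is x when
 /// Cnt is odd and 0 when it is even.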
17570 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17571 unsigned Cnt) {
17572 assert(IsSupportedHorRdxIdentityOp &&
17573 "The optimization of matched scalar identity horizontal reductions "
17574 "must be supported.");
17575 switch (RdxKind) {
17576 case RecurKind::Add: {
17577 // res = mul vv, n
17578 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
17579 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
17580 << VectorizedValue << ". (HorRdx)\n");
17581 return Builder.CreateMul(VectorizedValue, Scale);
17582 }
17583 case RecurKind::Xor: {
17584 // res = n % 2 ? 0 : vv
17585 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
17586 << ". (HorRdx)\n");
17587 if (Cnt % 2 == 0)
17588 return Constant::getNullValue(VectorizedValue->getType());
17589 return VectorizedValue;
17590 }
17591 case RecurKind::FAdd: {
17592 // res = fmul v, n
17593 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
17594 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
17595 << VectorizedValue << ". (HorRdx)\n");
17596 return Builder.CreateFMul(VectorizedValue, Scale);
17597 }
17598 case RecurKind::And:
17599 case RecurKind::Or:
17600 case RecurKind::SMax:
17601 case RecurKind::SMin:
17602 case RecurKind::UMax:
17603 case RecurKind::UMin:
17604 case RecurKind::FMax:
17605 case RecurKind::FMin:
17606 case RecurKind::FMaximum:
17607 case RecurKind::FMinimum:
17608 // res = vv
17609 return VectorizedValue;
17610 case RecurKind::Mul:
17611 case RecurKind::FMul:
17612 case RecurKind::FMulAdd:
17613 case RecurKind::IAnyOf:
17614 case RecurKind::FAnyOf:
17615 case RecurKind::None:
17616 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17617 }
17618 return nullptr;
17619 }
17620
17621 /// Emits actual operation for the scalar identity values, found during
17622 /// horizontal reduction analysis.
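 /// For illustration: for reduced values <a, a, b, b> of an add reduction,
 /// the vectorized <a, b> is multiplied element-wise by <2, 2> before the
 /// final reduction.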
17623 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17624 ArrayRef<Value *> VL,
17625 const MapVector<Value *, unsigned> &SameValuesCounter,
17626 const DenseMap<Value *, Value *> &TrackedToOrig) {
17627 assert(IsSupportedHorRdxIdentityOp &&
17628 "The optimization of matched scalar identity horizontal reductions "
17629 "must be supported.");
17630 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
17631 if (VTy->getElementType() != VL.front()->getType()) {
17632 VectorizedValue = Builder.CreateIntCast(
17633 VectorizedValue,
17634 FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
17635 any_of(VL, [&](Value *R) {
17637 R, cast<Instruction>(ReductionOps.front().front())
17638 ->getModule()
17639 ->getDataLayout());
17640 return !Known.isNonNegative();
17641 }));
17642 }
17643 switch (RdxKind) {
17644 case RecurKind::Add: {
17645 // root = mul prev_root, <1, 1, n, 1>
17646 SmallVector<Constant *> Vals;
17647 for (Value *V : VL) {
17648 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17649 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
17650 }
17651 auto *Scale = ConstantVector::get(Vals);
17652 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
17653 << VectorizedValue << ". (HorRdx)\n");
17654 return Builder.CreateMul(VectorizedValue, Scale);
17655 }
17656 case RecurKind::And:
17657 case RecurKind::Or:
17658 // No need for multiple or/and(s).
17659 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17660 << ". (HorRdx)\n");
17661 return VectorizedValue;
17662 case RecurKind::SMax:
17663 case RecurKind::SMin:
17664 case RecurKind::UMax:
17665 case RecurKind::UMin:
17666 case RecurKind::FMax:
17667 case RecurKind::FMin:
17668 case RecurKind::FMaximum:
17669 case RecurKind::FMinimum:
17670 // No need for multiple min/max(s) of the same value.
17671 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17672 << ". (HorRdx)\n");
17673 return VectorizedValue;
17674 case RecurKind::Xor: {
17675 // Replace values with even number of repeats with 0, since
17676 // x xor x = 0.
17677 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
17678 // 7>, if the 4th and 6th elements have an even number of repeats.
17679 SmallVector<int> Mask(
17680 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
17681 PoisonMaskElem);
17682 std::iota(Mask.begin(), Mask.end(), 0);
17683 bool NeedShuffle = false;
17684 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17685 Value *V = VL[I];
17686 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17687 if (Cnt % 2 == 0) {
17688 Mask[I] = VF;
17689 NeedShuffle = true;
17690 }
17691 }
17692 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
17693 : Mask) dbgs()
17694 << I << " ";
17695 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17696 if (NeedShuffle)
17697 VectorizedValue = Builder.CreateShuffleVector(
17698 VectorizedValue,
17699 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
17700 return VectorizedValue;
17701 }
17702 case RecurKind::FAdd: {
17703 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
17704 SmallVector<Constant *> Vals;
17705 for (Value *V : VL) {
17706 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17707 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
17708 }
17709 auto *Scale = ConstantVector::get(Vals);
17710 return Builder.CreateFMul(VectorizedValue, Scale);
17711 }
17712 case RecurKind::Mul:
17713 case RecurKind::FMul:
17714 case RecurKind::FMulAdd:
17715 case RecurKind::IAnyOf:
17716 case RecurKind::FAnyOf:
17717 case RecurKind::None:
17718 llvm_unreachable("Unexpected reduction kind for reused scalars.");
17719 }
17720 return nullptr;
17721 }
17722};
17723} // end anonymous namespace
17724
17725/// Gets recurrence kind from the specified value.
17726 RecurKind llvm::getRdxKind(Value *V) {
17727 return HorizontalReduction::getRdxKind(V);
17728}
17729static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
17730 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
17731 return cast<FixedVectorType>(IE->getType())->getNumElements();
17732
17733 unsigned AggregateSize = 1;
17734 auto *IV = cast<InsertValueInst>(InsertInst);
17735 Type *CurrentType = IV->getType();
17736 do {
17737 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
17738 for (auto *Elt : ST->elements())
17739 if (Elt != ST->getElementType(0)) // check homogeneity
17740 return std::nullopt;
17741 AggregateSize *= ST->getNumElements();
17742 CurrentType = ST->getElementType(0);
17743 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
17744 AggregateSize *= AT->getNumElements();
17745 CurrentType = AT->getElementType();
17746 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
17747 AggregateSize *= VT->getNumElements();
17748 return AggregateSize;
17749 } else if (CurrentType->isSingleValueType()) {
17750 return AggregateSize;
17751 } else {
17752 return std::nullopt;
17753 }
17754 } while (true);
17755}
17756
17757static void findBuildAggregate_rec(Instruction *LastInsertInst,
17758 TargetTransformInfo *TTI,
17759 SmallVectorImpl<Value *> &BuildVectorOpds,
17760 SmallVectorImpl<Value *> &InsertElts,
17761 unsigned OperandOffset) {
17762 do {
17763 Value *InsertedOperand = LastInsertInst->getOperand(1);
17764 std::optional<unsigned> OperandIndex =
17765 getInsertIndex(LastInsertInst, OperandOffset);
17766 if (!OperandIndex)
17767 return;
17768 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
17769 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
17770 BuildVectorOpds, InsertElts, *OperandIndex);
17771
17772 } else {
17773 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17774 InsertElts[*OperandIndex] = LastInsertInst;
17775 }
17776 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
17777 } while (LastInsertInst != nullptr &&
17778 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
17779 LastInsertInst->hasOneUse());
17780}
17781
17782/// Recognize construction of vectors like
17783/// %ra = insertelement <4 x float> poison, float %s0, i32 0
17784/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
17785/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
17786/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
17787/// starting from the last insertelement or insertvalue instruction.
17788///
17789/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
17790/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
17791/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
17792///
17793/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
17794///
17795/// \return true if it matches.
17796static bool findBuildAggregate(Instruction *LastInsertInst,
17797 TargetTransformInfo *TTI,
17798 SmallVectorImpl<Value *> &BuildVectorOpds,
17799 SmallVectorImpl<Value *> &InsertElts) {
17800
17801 assert((isa<InsertElementInst>(LastInsertInst) ||
17802 isa<InsertValueInst>(LastInsertInst)) &&
17803 "Expected insertelement or insertvalue instruction!");
17804
17805 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
17806 "Expected empty result vectors!");
17807
17808 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
17809 if (!AggregateSize)
17810 return false;
17811 BuildVectorOpds.resize(*AggregateSize);
17812 InsertElts.resize(*AggregateSize);
17813
17814 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
17815 llvm::erase(BuildVectorOpds, nullptr);
17816 llvm::erase(InsertElts, nullptr);
17817 if (BuildVectorOpds.size() >= 2)
17818 return true;
17819
17820 return false;
17821}
17822
17823/// Try and get a reduction instruction from a phi node.
17824///
17825/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
17826/// if they come from either \p ParentBB or a containing loop latch.
17827///
17828/// \returns A candidate reduction value if possible, or \code nullptr \endcode
17829/// if not possible.
17830 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
17831 BasicBlock *ParentBB, LoopInfo *LI) {
17832 // There are situations where the reduction value is not dominated by the
17833 // reduction phi. Vectorizing such cases has been reported to cause
17834 // miscompiles. See PR25787.
17835 auto DominatedReduxValue = [&](Value *R) {
17836 return isa<Instruction>(R) &&
17837 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
17838 };
17839
17840 Instruction *Rdx = nullptr;
17841
17842 // Return the incoming value if it comes from the same BB as the phi node.
17843 if (P->getIncomingBlock(0) == ParentBB) {
17844 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17845 } else if (P->getIncomingBlock(1) == ParentBB) {
17846 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17847 }
17848
17849 if (Rdx && DominatedReduxValue(Rdx))
17850 return Rdx;
17851
17852 // Otherwise, check whether we have a loop latch to look at.
17853 Loop *BBL = LI->getLoopFor(ParentBB);
17854 if (!BBL)
17855 return nullptr;
17856 BasicBlock *BBLatch = BBL->getLoopLatch();
17857 if (!BBLatch)
17858 return nullptr;
17859
17860 // There is a loop latch, return the incoming value if it comes from
17861 // that. This reduction pattern occasionally turns up.
17862 if (P->getIncomingBlock(0) == BBLatch) {
17863 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17864 } else if (P->getIncomingBlock(1) == BBLatch) {
17865 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17866 }
17867
17868 if (Rdx && DominatedReduxValue(Rdx))
17869 return Rdx;
17870
17871 return nullptr;
17872}
17873
17874static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
17875 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
17876 return true;
17877 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
17878 return true;
17879 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
17880 return true;
17881 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
17882 return true;
17883 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
17884 return true;
17885 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
17886 return true;
17887 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
17888 return true;
17889 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
17890 return true;
17891 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
17892 return true;
17893 return false;
17894}
17895
17896/// We could have an initial reduction that is not an add.
17897/// r *= v1 + v2 + v3 + v4
17898/// In such a case start looking for a tree rooted in the first '+'.
17899/// \Returns the new root if found, which may be nullptr if not an instruction.
17900static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
17901 Instruction *Root) {
17902 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17903 isa<IntrinsicInst>(Root)) &&
17904 "Expected binop, select, or intrinsic for reduction matching");
17905 Value *LHS =
17906 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
17907 Value *RHS =
17908 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
17909 if (LHS == Phi)
17910 return dyn_cast<Instruction>(RHS);
17911 if (RHS == Phi)
17912 return dyn_cast<Instruction>(LHS);
17913 return nullptr;
17914}
17915
17916/// \p Returns the first operand of \p I that does not match \p Phi. If
17917/// the operand is not an instruction, it returns nullptr.
17918static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
17919 Value *Op0 = nullptr;
17920 Value *Op1 = nullptr;
17921 if (!matchRdxBop(I, Op0, Op1))
17922 return nullptr;
17923 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
17924}
17925
17926/// \Returns true if \p I is a candidate instruction for reduction vectorization.
17927static bool isReductionCandidate(Instruction *I) {
17928 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
17929 Value *B0 = nullptr, *B1 = nullptr;
17930 bool IsBinop = matchRdxBop(I, B0, B1);
17931 return IsBinop || IsSelect;
17932}
17933
17934bool SLPVectorizerPass::vectorizeHorReduction(
17935 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
17936 TargetTransformInfo *TTI, SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
17937 if (!ShouldVectorizeHor)
17938 return false;
17939 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
17940
17941 if (Root->getParent() != BB || isa<PHINode>(Root))
17942 return false;
17943
17944 // If we can find a secondary reduction root, use that instead.
17945 auto SelectRoot = [&]() {
17946 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
17947 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
17948 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
17949 return NewRoot;
17950 return Root;
17951 };
17952
17953 // Start the analysis from the Root instruction. If a horizontal reduction is
17954 // found, try to vectorize it. If it is not a horizontal reduction or
17955 // vectorization is not possible or not effective, and the currently analyzed
17956 // instruction is a binary operation, try to vectorize the operands, using
17957 // pre-order DFS traversal order. If the operands were not vectorized, repeat
17958 // the same procedure considering each operand as a possible root of the
17959 // horizontal reduction.
17960 // Interrupt the process if the Root instruction itself was vectorized or all
17961 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
17962 // If a horizontal reduction was not matched or vectorized, we collect
17963 // instructions for possible later attempts for vectorization.
17964 std::queue<std::pair<Instruction *, unsigned>> Stack;
17965 Stack.emplace(SelectRoot(), 0);
17966 SmallPtrSet<Value *, 8> VisitedInstrs;
17967 bool Res = false;
17968 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
17969 if (R.isAnalyzedReductionRoot(Inst))
17970 return nullptr;
17971 if (!isReductionCandidate(Inst))
17972 return nullptr;
17973 HorizontalReduction HorRdx;
17974 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
17975 return nullptr;
17976 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
17977 };
17978 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
17979 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
17980 FutureSeed = getNonPhiOperand(Root, P);
17981 if (!FutureSeed)
17982 return false;
17983 }
17984 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
17985 // analysis is done separately.
17986 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
17987 PostponedInsts.push_back(FutureSeed);
17988 return true;
17989 };
17990
17991 while (!Stack.empty()) {
17992 Instruction *Inst;
17993 unsigned Level;
17994 std::tie(Inst, Level) = Stack.front();
17995 Stack.pop();
17996 // Do not try to analyze instruction that has already been vectorized.
17997 // This may happen when we vectorize instruction operands on a previous
17998 // iteration while the stack was populated before that happened.
17999 if (R.isDeleted(Inst))
18000 continue;
18001 if (Value *VectorizedV = TryToReduce(Inst)) {
18002 Res = true;
18003 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
18004 // Try to find another reduction.
18005 Stack.emplace(I, Level);
18006 continue;
18007 }
18008 } else {
18009 // We could not vectorize `Inst` so try to use it as a future seed.
18010 if (!TryAppendToPostponedInsts(Inst)) {
18011 assert(Stack.empty() && "Expected empty stack");
18012 break;
18013 }
18014 }
18015
18016 // Try to vectorize operands.
18017 // Continue analysis for the instruction from the same basic block only to
18018 // save compile time.
18019 if (++Level < RecursionMaxDepth)
18020 for (auto *Op : Inst->operand_values())
18021 if (VisitedInstrs.insert(Op).second)
18022 if (auto *I = dyn_cast<Instruction>(Op))
18023 // Do not try to vectorize CmpInst operands, this is done
18024 // separately.
18025 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
18026 !R.isDeleted(I) && I->getParent() == BB)
18027 Stack.emplace(I, Level);
18028 }
18029 return Res;
18030}
18031
18032bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
18033 BasicBlock *BB, BoUpSLP &R,
18034 TargetTransformInfo *TTI) {
18035 SmallVector<WeakTrackingVH> PostponedInsts;
18036 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
18037 Res |= tryToVectorize(PostponedInsts, R);
18038 return Res;
18039}
18040
18041bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
18042 BoUpSLP &R) {
18043 bool Res = false;
18044 for (Value *V : Insts)
18045 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
18046 Res |= tryToVectorize(Inst, R);
18047 return Res;
18048}
18049
18050bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18051 BasicBlock *BB, BoUpSLP &R) {
18052 if (!R.canMapToVector(IVI->getType()))
18053 return false;
18054
18055 SmallVector<Value *, 16> BuildVectorOpds;
18056 SmallVector<Value *, 16> BuildVectorInsts;
18057 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
18058 return false;
18059
18060 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
18061 // Aggregate value is unlikely to be processed in vector register.
18062 return tryToVectorizeList(BuildVectorOpds, R);
18063}
18064
18065bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
18066 BasicBlock *BB, BoUpSLP &R) {
18067 SmallVector<Value *, 16> BuildVectorInsts;
18068 SmallVector<Value *, 16> BuildVectorOpds;
18069 SmallVector<int> Mask;
18070 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
18071 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18072 isFixedVectorShuffle(BuildVectorOpds, Mask)))
18073 return false;
18074
18075 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18076 return tryToVectorizeList(BuildVectorInsts, R);
18077}
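// Illustrative sketch (hypothetical IR, not from this file): a buildvector
// whose operands are all extractelements that already form a single fixed
// vector shuffle, e.g.
//   %e0 = extractelement <4 x float> %v, i32 2
//   %e1 = extractelement <4 x float> %v, i32 3
//   %b0 = insertelement <2 x float> poison, float %e0, i32 0
//   %b1 = insertelement <2 x float> %b0, float %e1, i32 1
// is rejected by the early-exit above, presumably because such a sequence is
// better expressed as a shufflevector than re-vectorized here.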
18078
18079template <typename T>
18080static bool tryToVectorizeSequence(
18081 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
18082 function_ref<bool(T *, T *)> AreCompatible,
18083 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
18084 bool MaxVFOnly, BoUpSLP &R) {
18085 bool Changed = false;
18086 // Sort by type, parent, operands.
18087 stable_sort(Incoming, Comparator);
18088
18089 // Try to vectorize elements based on their type.
18090 SmallVector<T *> Candidates;
18091 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
18092 // Look for the next elements with the same type, parent and operand
18093 // kinds.
18094 auto *SameTypeIt = IncIt;
18095 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
18096 ++SameTypeIt;
18097
18098 // Try to vectorize them.
18099 unsigned NumElts = (SameTypeIt - IncIt);
18100 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
18101 << NumElts << ")\n");
18102 // The vectorization is a 3-state attempt:
18103 // 1. Try to vectorize instructions with the same/alternate opcodes with the
18104 // size of maximal register at first.
18105 // 2. Try to vectorize remaining instructions with the same type, if
18106 // possible. This may produce better vectorization results than trying
18107 // to vectorize only instructions with the same/alternate opcodes.
18108 // 3. Final attempt to try to vectorize all instructions with the
18109 // same/alternate ops only, this may result in some extra final
18110 // vectorization.
18111 if (NumElts > 1 &&
18112 TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
18113 // Success: start over because instructions might have been changed.
18114 Changed = true;
18115 } else {
18116 /// \Returns the minimum number of elements that we will attempt to
18117 /// vectorize.
18118 auto GetMinNumElements = [&R](Value *V) {
18119 unsigned EltSize = R.getVectorElementSize(V);
18120 return std::max(2U, R.getMaxVecRegSize() / EltSize);
18121 };
18122 if (NumElts < GetMinNumElements(*IncIt) &&
18123 (Candidates.empty() ||
18124 Candidates.front()->getType() == (*IncIt)->getType())) {
18125 Candidates.append(IncIt, std::next(IncIt, NumElts));
18126 }
18127 }
18128 // Final attempt to vectorize instructions with the same types.
18129 if (Candidates.size() > 1 &&
18130 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18131 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
18132 // Success: start over because instructions might have been changed.
18133 Changed = true;
18134 } else if (MaxVFOnly) {
18135 // Try to vectorize using small vectors.
18136 for (auto *It = Candidates.begin(), *End = Candidates.end();
18137 It != End;) {
18138 auto *SameTypeIt = It;
18139 while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
18140 ++SameTypeIt;
18141 unsigned NumElts = (SameTypeIt - It);
18142 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
18143 /*MaxVFOnly=*/false))
18144 Changed = true;
18145 It = SameTypeIt;
18146 }
18147 }
18148 Candidates.clear();
18149 }
18150
18151 // Start over at the next instruction of a different type (or the end).
18152 IncIt = SameTypeIt;
18153 }
18154 return Changed;
18155}
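// Usage sketch (mirrors the calls made later in this file): callers
// instantiate the template with a sorter, a compatibility predicate, and a
// vectorization callback, e.g.
//   tryToVectorizeSequence<Value>(
//       Vals, CompareSorter, AreCompatibleCompares,
//       [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
//         return tryToVectorizeList(Candidates, R, MaxVFOnly);
//       },
//       /*MaxVFOnly=*/true, R);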
18156
18157/// Compare two cmp instructions. If IsCompatibility is true, function returns
18158/// true if 2 cmps have same/swapped predicates and compatible corresponding
18159/// operands. If IsCompatibility is false, function implements strict weak
18160/// ordering relation between two cmp instructions, returning true if the first
18161/// instruction is "less" than the second, i.e. its predicate is less than the
18162/// predicate of the second or the operands IDs are less than the operands IDs
18163/// of the second cmp instruction.
18164template <bool IsCompatibility>
18165static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
18166 const DominatorTree &DT) {
18167 assert(isValidElementType(V->getType()) &&
18168 isValidElementType(V2->getType()) &&
18169 "Expected valid element types only.");
18170 if (V == V2)
18171 return IsCompatibility;
18172 auto *CI1 = cast<CmpInst>(V);
18173 auto *CI2 = cast<CmpInst>(V2);
18174 if (CI1->getOperand(0)->getType()->getTypeID() <
18175 CI2->getOperand(0)->getType()->getTypeID())
18176 return !IsCompatibility;
18177 if (CI1->getOperand(0)->getType()->getTypeID() >
18178 CI2->getOperand(0)->getType()->getTypeID())
18179 return false;
18180 CmpInst::Predicate Pred1 = CI1->getPredicate();
18181 CmpInst::Predicate Pred2 = CI2->getPredicate();
18182 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
18183 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
18184 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
18185 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
18186 if (BasePred1 < BasePred2)
18187 return !IsCompatibility;
18188 if (BasePred1 > BasePred2)
18189 return false;
18190 // Compare operands.
18191 bool CI1Preds = Pred1 == BasePred1;
18192 bool CI2Preds = Pred2 == BasePred1;
18193 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
18194 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
18195 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
18196 if (Op1 == Op2)
18197 continue;
18198 if (Op1->getValueID() < Op2->getValueID())
18199 return !IsCompatibility;
18200 if (Op1->getValueID() > Op2->getValueID())
18201 return false;
18202 if (auto *I1 = dyn_cast<Instruction>(Op1))
18203 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
18204 if (IsCompatibility) {
18205 if (I1->getParent() != I2->getParent())
18206 return false;
18207 } else {
18208 // Try to compare nodes with the same parent.
18209 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
18210 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
18211 if (!NodeI1)
18212 return NodeI2 != nullptr;
18213 if (!NodeI2)
18214 return false;
18215 assert((NodeI1 == NodeI2) ==
18216 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18217 "Different nodes should have different DFS numbers");
18218 if (NodeI1 != NodeI2)
18219 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18220 }
18221 InstructionsState S = getSameOpcode({I1, I2}, TLI);
18222 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18223 continue;
18224 if (IsCompatibility)
18225 return false;
18226 if (I1->getOpcode() != I2->getOpcode())
18227 return I1->getOpcode() < I2->getOpcode();
18228 }
18229 }
18230 return IsCompatibility;
18231}
18232
18233template <typename ItT>
18234bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
18235 BasicBlock *BB, BoUpSLP &R) {
18236 bool Changed = false;
18237 // Try to find reductions first.
18238 for (CmpInst *I : CmpInsts) {
18239 if (R.isDeleted(I))
18240 continue;
18241 for (Value *Op : I->operands())
18242 if (auto *RootOp = dyn_cast<Instruction>(Op))
18243 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
18244 }
18245 // Try to vectorize operands as vector bundles.
18246 for (CmpInst *I : CmpInsts) {
18247 if (R.isDeleted(I))
18248 continue;
18249 Changed |= tryToVectorize(I, R);
18250 }
18251 // Try to vectorize list of compares.
18252 // Sort by type, compare predicate, etc.
18253 auto CompareSorter = [&](Value *V, Value *V2) {
18254 if (V == V2)
18255 return false;
18256 return compareCmp<false>(V, V2, *TLI, *DT);
18257 };
18258
18259 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
18260 if (V1 == V2)
18261 return true;
18262 return compareCmp<true>(V1, V2, *TLI, *DT);
18263 };
18264
18265 SmallVector<Value *> Vals;
18266 for (Instruction *V : CmpInsts)
18267 if (!R.isDeleted(V) && isValidElementType(V->getType()))
18268 Vals.push_back(V);
18269 if (Vals.size() <= 1)
18270 return Changed;
18271 Changed |= tryToVectorizeSequence<Value>(
18272 Vals, CompareSorter, AreCompatibleCompares,
18273 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18274 // Exclude possible reductions from other blocks.
18275 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
18276 return any_of(V->users(), [V](User *U) {
18277 auto *Select = dyn_cast<SelectInst>(U);
18278 return Select &&
18279 Select->getParent() != cast<Instruction>(V)->getParent();
18280 });
18281 });
18282 if (ArePossiblyReducedInOtherBlock)
18283 return false;
18284 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18285 },
18286 /*MaxVFOnly=*/true, R);
18287 return Changed;
18288}
18289
18290bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18291 BasicBlock *BB, BoUpSLP &R) {
18292 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18293 "This function only accepts Insert instructions");
18294 bool OpsChanged = false;
18295 SmallVector<WeakTrackingVH> PostponedInsts;
18296 // pass1 - try to vectorize reductions only
18297 for (auto *I : reverse(Instructions)) {
18298 if (R.isDeleted(I))
18299 continue;
18300 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
18301 }
18302 // pass2 - try to match and vectorize a buildvector sequence.
18303 for (auto *I : reverse(Instructions)) {
18304 if (R.isDeleted(I) || isa<CmpInst>(I))
18305 continue;
18306 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18307 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
18308 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18309 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
18310 }
18311 }
18312 // Now try to vectorize postponed instructions.
18313 OpsChanged |= tryToVectorize(PostponedInsts, R);
18314
18315 Instructions.clear();
18316 return OpsChanged;
18317}
18318
18319bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
18320 bool Changed = false;
18321 SmallVector<Value *, 4> Incoming;
18322 SmallPtrSet<Value *, 16> VisitedInstrs;
18323 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
18324 // node. This allows us to better identify the chains that can be
18325 // vectorized.
18326 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
18327 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
18328 assert(isValidElementType(V1->getType()) &&
18329 isValidElementType(V2->getType()) &&
18330 "Expected vectorizable types only.");
18331 // It is fine to compare type IDs here, since we expect only vectorizable
18332 // types, like ints, floats and pointers; we don't care about other types.
18333 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
18334 return true;
18335 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
18336 return false;
18337 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18338 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18339 if (Opcodes1.size() < Opcodes2.size())
18340 return true;
18341 if (Opcodes1.size() > Opcodes2.size())
18342 return false;
18343 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18344 {
18345 // Instructions come first.
18346 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
18347 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
18348 if (I1 && I2) {
18349 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
18350 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
18351 if (!NodeI1)
18352 return NodeI2 != nullptr;
18353 if (!NodeI2)
18354 return false;
18355 assert((NodeI1 == NodeI2) ==
18356 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18357 "Different nodes should have different DFS numbers");
18358 if (NodeI1 != NodeI2)
18359 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18360 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18361 if (S.getOpcode() && !S.isAltShuffle())
18362 continue;
18363 return I1->getOpcode() < I2->getOpcode();
18364 }
18365 if (I1)
18366 return true;
18367 if (I2)
18368 return false;
18369 }
18370 {
18371 // Non-undef constants come next.
18372 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
18373 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
18374 if (C1 && C2)
18375 continue;
18376 if (C1)
18377 return true;
18378 if (C2)
18379 return false;
18380 }
18381 bool U1 = isa<UndefValue>(Opcodes1[I]);
18382 bool U2 = isa<UndefValue>(Opcodes2[I]);
18383 {
18384 // Non-constant non-instructions come next.
18385 if (!U1 && !U2) {
18386 auto ValID1 = Opcodes1[I]->getValueID();
18387 auto ValID2 = Opcodes2[I]->getValueID();
18388 if (ValID1 == ValID2)
18389 continue;
18390 if (ValID1 < ValID2)
18391 return true;
18392 if (ValID1 > ValID2)
18393 return false;
18394 }
18395 if (!U1)
18396 return true;
18397 if (!U2)
18398 return false;
18399 }
18400 // Undefs come last.
18401 assert(U1 && U2 && "The only thing left should be undef & undef.");
18402 continue;
18403 }
18404 return false;
18405 };
18406 auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
18407 if (V1 == V2)
18408 return true;
18409 if (V1->getType() != V2->getType())
18410 return false;
18411 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18412 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18413 if (Opcodes1.size() != Opcodes2.size())
18414 return false;
18415 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18416 // Undefs are compatible with any other value.
18417 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
18418 continue;
18419 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
18420 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
18421 if (I1->getParent() != I2->getParent())
18422 return false;
18423 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18424 if (S.getOpcode())
18425 continue;
18426 return false;
18427 }
18428 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
18429 continue;
18430 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
18431 return false;
18432 }
18433 return true;
18434 };
18435
18436 bool HaveVectorizedPhiNodes = false;
18437 do {
18438 // Collect the incoming values from the PHIs.
18439 Incoming.clear();
18440 for (Instruction &I : *BB) {
18441 auto *P = dyn_cast<PHINode>(&I);
18442 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
18443 break;
18444
18445 // No need to analyze deleted, vectorized and non-vectorizable
18446 // instructions.
18447 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
18448 isValidElementType(P->getType()))
18449 Incoming.push_back(P);
18450 }
18451
18452 if (Incoming.size() <= 1)
18453 break;
18454
18455 // Find the corresponding non-phi nodes for better matching when trying to
18456 // build the tree.
18457 for (Value *V : Incoming) {
18458 SmallVectorImpl<Value *> &Opcodes =
18459 PHIToOpcodes.try_emplace(V).first->getSecond();
18460 if (!Opcodes.empty())
18461 continue;
18462 SmallVector<Value *, 4> Nodes(1, V);
18463 SmallPtrSet<Value *, 4> Visited;
18464 while (!Nodes.empty()) {
18465 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
18466 if (!Visited.insert(PHI).second)
18467 continue;
18468 for (Value *V : PHI->incoming_values()) {
18469 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
18470 Nodes.push_back(PHI1);
18471 continue;
18472 }
18473 Opcodes.emplace_back(V);
18474 }
18475 }
18476 }
18477
18478 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18479 Incoming, PHICompare, AreCompatiblePHIs,
18480 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18481 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18482 },
18483 /*MaxVFOnly=*/true, R);
18484 Changed |= HaveVectorizedPhiNodes;
18485 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
18486 } while (HaveVectorizedPhiNodes);
18487
18488 VisitedInstrs.clear();
18489
18490 InstSetVector PostProcessInserts;
18491 SmallSetVector<CmpInst *, 8> PostProcessCmps;
18492 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
18493 // also vectorizes `PostProcessCmps`.
18494 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
18495 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
18496 if (VectorizeCmps) {
18497 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
18498 PostProcessCmps.clear();
18499 }
18500 PostProcessInserts.clear();
18501 return Changed;
18502 };
18503 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
18504 auto IsInPostProcessInstrs = [&](Instruction *I) {
18505 if (auto *Cmp = dyn_cast<CmpInst>(I))
18506 return PostProcessCmps.contains(Cmp);
18507 return isa<InsertElementInst, InsertValueInst>(I) &&
18508 PostProcessInserts.contains(I);
18509 };
18510 // Returns true if `I` is an instruction without users, such as a terminator,
18511 // a store, or a call with an ignored return value. Unused instructions are
18512 // identified by their void type, except for CallInst and InvokeInst.
18513 auto HasNoUsers = [](Instruction *I) {
18514 return I->use_empty() &&
18515 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
18516 };
18517 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
18518 // Skip instructions with scalable types. The number of elements is unknown
18519 // at compile time for scalable types.
18520 if (isa<ScalableVectorType>(It->getType()))
18521 continue;
18522
18523 // Skip instructions marked for deletion.
18524 if (R.isDeleted(&*It))
18525 continue;
18526 // We may go through BB multiple times, so skip the ones we have already checked.
18527 if (!VisitedInstrs.insert(&*It).second) {
18528 if (HasNoUsers(&*It) &&
18529 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
18530 // We would like to start over since some instructions are deleted
18531 // and the iterator may become invalid.
18532 Changed = true;
18533 It = BB->begin();
18534 E = BB->end();
18535 }
18536 continue;
18537 }
18538
18539 if (isa<DbgInfoIntrinsic>(It))
18540 continue;
18541
18542 // Try to vectorize reductions that use PHINodes.
18543 if (PHINode *P = dyn_cast<PHINode>(It)) {
18544 // Check that the PHI is a reduction PHI.
18545 if (P->getNumIncomingValues() == 2) {
18546 // Try to match and vectorize a horizontal reduction.
18547 Instruction *Root = getReductionInstr(DT, P, BB, LI);
18548 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
18549 Changed = true;
18550 It = BB->begin();
18551 E = BB->end();
18552 continue;
18553 }
18554 }
18555 // Try to vectorize the incoming values of the PHI, to catch reductions
18556 // that feed into PHIs.
18557 for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
18558 // Skip if the incoming block is the current BB for now. Also, bypass
18559 // unreachable IR for efficiency and to avoid crashing.
18560 // TODO: Collect the skipped incoming values and try to vectorize them
18561 // after processing BB.
18562 if (BB == P->getIncomingBlock(I) ||
18563 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
18564 continue;
18565
18566 // Postponed instructions should not be vectorized here, delay their
18567 // vectorization.
18568 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
18569 PI && !IsInPostProcessInstrs(PI))
18570 Changed |= vectorizeRootInstruction(nullptr, PI,
18571 P->getIncomingBlock(I), R, TTI);
18572 }
18573 continue;
18574 }
18575
18576 if (HasNoUsers(&*It)) {
18577 bool OpsChanged = false;
18578 auto *SI = dyn_cast<StoreInst>(It);
18579 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
18580 if (SI) {
18581 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
18582 // Try to vectorize chain in store, if this is the only store to the
18583 // address in the block.
18584 // TODO: This is just a temporary solution to save compile time. Need
18585 // to investigate if we can safely turn on slp-vectorize-hor-store
18586 // instead to allow lookup for reduction chains in all non-vectorized
18587 // stores (need to check side effects and compile time).
18588 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
18589 SI->getValueOperand()->hasOneUse();
18590 }
18591 if (TryToVectorizeRoot) {
18592 for (auto *V : It->operand_values()) {
18593 // Postponed instructions should not be vectorized here, delay their
18594 // vectorization.
18595 if (auto *VI = dyn_cast<Instruction>(V);
18596 VI && !IsInPostProcessInstrs(VI))
18597 // Try to match and vectorize a horizontal reduction.
18598 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
18599 }
18600 }
18601 // Start vectorization of post-process list of instructions from the
18602 // top-tree instructions to try to vectorize as many instructions as
18603 // possible.
18604 OpsChanged |=
18605 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
18606 if (OpsChanged) {
18607 // We would like to start over since some instructions are deleted
18608 // and the iterator may become invalid.
18609 Changed = true;
18610 It = BB->begin();
18611 E = BB->end();
18612 continue;
18613 }
18614 }
18615
18616 if (isa<InsertElementInst, InsertValueInst>(It))
18617 PostProcessInserts.insert(&*It);
18618 else if (isa<CmpInst>(It))
18619 PostProcessCmps.insert(cast<CmpInst>(&*It));
18620 }
18621
18622 return Changed;
18623}
18624
18625bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
18626 auto Changed = false;
18627 for (auto &Entry : GEPs) {
18628 // If the getelementptr list has fewer than two elements, there's nothing
18629 // to do.
18630 if (Entry.second.size() < 2)
18631 continue;
18632
18633 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
18634 << Entry.second.size() << ".\n");
18635
18636 // Process the GEP list in chunks suitable for the target's supported
18637 // vector size. If a vector register can't hold 1 element, we are done. We
18638 // are trying to vectorize the index computations, so the maximum number of
18639 // elements is based on the size of the index expression, rather than the
18640 // size of the GEP itself (the target's pointer size).
18641 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18642 unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
18643 if (MaxVecRegSize < EltSize)
18644 continue;
18645
18646 unsigned MaxElts = MaxVecRegSize / EltSize;
18647 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
18648 auto Len = std::min<unsigned>(BE - BI, MaxElts);
18649 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
18650
18651 // Initialize a set of candidate getelementptrs. Note that we use a
18652 // SetVector here to preserve program order. If the index computations
18653 // are vectorizable and begin with loads, we want to minimize the chance
18654 // of having to reorder them later.
18655 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
18656
18657 // Some of the candidates may have already been vectorized after we
18658 // initially collected them, or their index was optimized to a constant value.
18659 // If so, they are marked as deleted, so remove them from the set of
18660 // candidates.
18661 Candidates.remove_if([&R](Value *I) {
18662 return R.isDeleted(cast<Instruction>(I)) ||
18663 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
18664 });
18665
18666 // Remove from the set of candidates all pairs of getelementptrs with
18667 // constant differences. Such getelementptrs are likely not good
18668 // candidates for vectorization in a bottom-up phase since one can be
18669 // computed from the other. We also ensure all candidate getelementptr
18670 // indices are unique.
18671 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
18672 auto *GEPI = GEPList[I];
18673 if (!Candidates.count(GEPI))
18674 continue;
18675 auto *SCEVI = SE->getSCEV(GEPList[I]);
18676 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
18677 auto *GEPJ = GEPList[J];
18678 auto *SCEVJ = SE->getSCEV(GEPList[J]);
18679 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
18680 Candidates.remove(GEPI);
18681 Candidates.remove(GEPJ);
18682 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
18683 Candidates.remove(GEPJ);
18684 }
18685 }
18686 }
18687
18688 // We break out of the above computation as soon as we know there are
18689 // fewer than two candidates remaining.
18690 if (Candidates.size() < 2)
18691 continue;
18692
18693 // Add the single, non-constant index of each candidate to the bundle. We
18694 // ensured the indices met these constraints when we originally collected
18695 // the getelementptrs.
18696 SmallVector<Value *, 16> Bundle(Candidates.size());
18697 auto BundleIndex = 0u;
18698 for (auto *V : Candidates) {
18699 auto *GEP = cast<GetElementPtrInst>(V);
18700 auto *GEPIdx = GEP->idx_begin()->get();
18701 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
18702 Bundle[BundleIndex++] = GEPIdx;
18703 }
18704
18705 // Try and vectorize the indices. We are currently only interested in
18706 // gather-like cases of the form:
18707 //
18708 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
18709 //
18710 // where the loads of "a", the loads of "b", and the subtractions can be
18711 // performed in parallel. It's likely that detecting this pattern in a
18712 // bottom-up phase will be simpler and less costly than building a
18713 // full-blown top-down phase beginning at the consecutive loads.
18714 Changed |= tryToVectorizeList(Bundle, R);
18715 }
18716 }
18717 return Changed;
18718}
18719
18720bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
18721 bool Changed = false;
18722 // Sort by type, base pointer and value operand. Value operands must be
18723 // compatible (have the same opcode, same parent), otherwise it is
18724 // definitely not profitable to try to vectorize them.
18725 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
18726 if (V->getValueOperand()->getType()->getTypeID() <
18727 V2->getValueOperand()->getType()->getTypeID())
18728 return true;
18729 if (V->getValueOperand()->getType()->getTypeID() >
18730 V2->getValueOperand()->getType()->getTypeID())
18731 return false;
18732 if (V->getPointerOperandType()->getTypeID() <
18733 V2->getPointerOperandType()->getTypeID())
18734 return true;
18735 if (V->getPointerOperandType()->getTypeID() >
18736 V2->getPointerOperandType()->getTypeID())
18737 return false;
18738 // UndefValues are compatible with all other values.
18739 if (isa<UndefValue>(V->getValueOperand()) ||
18740 isa<UndefValue>(V2->getValueOperand()))
18741 return false;
18742 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
18743 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18744 DomTreeNodeBase<BasicBlock> *NodeI1 =
18745 DT->getNode(I1->getParent());
18746 DomTreeNodeBase<BasicBlock> *NodeI2 =
18747 DT->getNode(I2->getParent());
18748 assert(NodeI1 && "Should only process reachable instructions");
18749 assert(NodeI2 && "Should only process reachable instructions");
18750 assert((NodeI1 == NodeI2) ==
18751 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18752 "Different nodes should have different DFS numbers");
18753 if (NodeI1 != NodeI2)
18754 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18755 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18756 if (S.getOpcode())
18757 return false;
18758 return I1->getOpcode() < I2->getOpcode();
18759 }
18760 if (isa<Constant>(V->getValueOperand()) &&
18761 isa<Constant>(V2->getValueOperand()))
18762 return false;
18763 return V->getValueOperand()->getValueID() <
18764 V2->getValueOperand()->getValueID();
18765 };
18766
18767 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
18768 if (V1 == V2)
18769 return true;
18770 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
18771 return false;
18772 if (V1->getPointerOperandType() != V2->getPointerOperandType())
18773 return false;
18774 // Undefs are compatible with any other value.
18775 if (isa<UndefValue>(V1->getValueOperand()) ||
18776 isa<UndefValue>(V2->getValueOperand()))
18777 return true;
18778 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
18779 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18780 if (I1->getParent() != I2->getParent())
18781 return false;
18782 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18783 return S.getOpcode() > 0;
18784 }
18785 if (isa<Constant>(V1->getValueOperand()) &&
18786 isa<Constant>(V2->getValueOperand()))
18787 return true;
18788 return V1->getValueOperand()->getValueID() ==
18789 V2->getValueOperand()->getValueID();
18790 };
18791
18792 // Attempt to sort and vectorize each of the store-groups.
18793 DenseSet<std::pair<Value *, Value *>> Attempted;
18794 for (auto &Pair : Stores) {
18795 if (Pair.second.size() < 2)
18796 continue;
18797
18798 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
18799 << Pair.second.size() << ".\n");
18800
18801 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
18802 continue;
18803
18804 // Reverse stores to do bottom-to-top analysis. This is important if the
18805 // same addresses are stored to several times; in this case we need to
18806 // follow the store order (reversed to meet the memory dependencies).
18807 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
18808 Pair.second.rend());
18809 Changed |= tryToVectorizeSequence<StoreInst>(
18810 ReversedStores, StoreSorter, AreCompatibleStores,
18811 [&](ArrayRef<StoreInst *> Candidates, bool) {
18812 return vectorizeStores(Candidates, R, Attempted);
18813 },
18814 /*MaxVFOnly=*/false, R);
18815 }
18816 return Changed;
18817}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:537
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1497
Loop::LoopBounds::Direction Direction
Definition: LoopInfo.cpp:230
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(VerifyEach)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:76
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:264
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:492
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:187
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:174
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:228
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
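The ArrayRef accessors above are used pervasively for slicing scalar bundles and shuffle masks. A small illustrative sketch (array contents are arbitrary):

#include "llvm/ADT/ArrayRef.h"
#include <cassert>
using namespace llvm;

void arrayRefDemo() {
  int Data[] = {0, 1, 2, 3, 4, 5};
  ArrayRef<int> A(Data);
  assert(A.size() == 6 && !A.empty());
  assert(A.front() == 0 && A.back() == 5);
  ArrayRef<int> Head = A.take_front(2); // {0, 1}
  ArrayRef<int> Tail = A.drop_front(2); // {2, 3, 4, 5}
  ArrayRef<int> Mid = A.slice(1, 3);    // {1, 2, 3}
  assert(Head.equals(A.drop_back(4)));  // both views are {0, 1}
  (void)Tail;
  (void)Mid;
}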
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:194
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:443
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:430
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:367
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:167
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
reverse_iterator rend()
Definition: BasicBlock.h:448
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:165
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1494
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2332
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:2227
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2469
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2326
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1600
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1678
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2323
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:601
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:983
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1362
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:1023
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:1017
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:1021
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:1019
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:1167
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:1129
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:1105
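Besides the member functions listed above, CmpInst also provides static forms of the predicate helpers, which this sketch uses for brevity (the function name is illustrative):

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

void predicateDemo() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;
  // Swapping the operands of "a < b" yields "b > a".
  CmpInst::Predicate Swapped = CmpInst::getSwappedPredicate(P); // ICMP_SGT
  // Negating "a < b" yields "a >= b".
  CmpInst::Predicate Inverse = CmpInst::getInversePredicate(P); // ICMP_SGE
  (void)Swapped;
  (void)Inverse;
}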
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2126
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:154
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1449
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:484
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:905
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:101
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
value_type & FindAndConstruct(const KeyT &Key)
Definition: DenseMap.h:348
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
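A brief sketch of the DenseMap operations referenced above; key and value types are arbitrary:

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

void denseMapDemo() {
  DenseMap<int, unsigned> Counts;
  Counts.try_emplace(7, 1u);  // constructs the value only if key 7 is absent
  Counts.insert({9, 2u});     // pair-style insertion
  if (Counts.contains(7))
    ++Counts.find(7)->second; // find() yields an iterator into the table
  unsigned Missing = Counts.lookup(42); // 0: default-constructed for absent keys
  (void)Missing;
}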
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Type * getReturnType() const
Definition: DerivedTypes.h:124
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2257
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:511
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2460
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2265
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1110
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2535
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:311
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:220
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:848
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1753
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:486
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2366
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2249
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:471
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1666
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:169
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2161
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2196
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1826
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1587
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:630
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
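The pass emits all of its vector code through IRBuilder. A hedged sketch using two of the creation routines listed above; the helper name and the assumption that Vec is a 4-lane vector defined in BB are illustrative only:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Emit a reversed copy of a 4-lane vector and extract its first lane.
Value *emitReverse(BasicBlock *BB, Value *Vec) {
  IRBuilder<> Builder(BB); // append new instructions at the end of BB
  SmallVector<int, 4> Mask = {3, 2, 1, 0};
  Value *Rev = Builder.CreateShuffleVector(Vec, Mask, "rev");
  return Builder.CreateExtractElement(Rev, Builder.getInt32(0), "lane0");
}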
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field or array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:260
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:742
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:257
const BasicBlock * getParent() const
Definition: Instruction.h:152
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:258
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
Value * getPointerOperand()
Definition: Instructions.h:280
bool isSimple() const
Definition: Instructions.h:272
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:236
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
Definition: MapVector.h:64
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
T & front() const
front - Get the first element.
Definition: ArrayRef.h:363
iterator end() const
Definition: ArrayRef.h:357
iterator begin() const
Definition: ArrayRef.h:356
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:376
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:449
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:155
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
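These static mask classifiers let the vectorizer recognize cheap shuffle kinds from a constant mask. An illustrative sketch (the function name and masks are made up for the example):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

void classifyMasks() {
  SmallVector<int, 4> Identity = {0, 1, 2, 3};
  SmallVector<int, 4> Reverse = {3, 2, 1, 0};
  SmallVector<int, 2> Extract = {4, 5}; // lanes 4..5 of an 8-lane source
  bool IsId = ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4);
  bool IsRev = ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4);
  int Index = 0;
  bool IsExt =
      ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/8, Index);
  // Expected: IsId, IsRev and IsExt are all true, and Index is 4.
  (void)IsId;
  (void)IsRev;
  (void)IsExt;
}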
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
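SmallBitVector serves, for example, as the opcode mask for alternate-opcode bundles. A short sketch of the queries listed above (the single-bit set(Idx) overload is used alongside them; values are illustrative):

#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

void bitVectorDemo() {
  SmallBitVector OpcodeMask(8); // 8 lanes, initially all false
  OpcodeMask.set(1);
  OpcodeMask.set(3);
  bool Lane3 = OpcodeMask.test(3);        // true
  int First = OpcodeMask.find_first();    // 1
  int Next = OpcodeMask.find_next(First); // 3
  unsigned Ones = OpcodeMask.count();     // 2
  bool AnySet = OpcodeMask.any();         // true
  (void)Lane3;
  (void)First;
  (void)Next;
  (void)Ones;
  (void)AnySet;
}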
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:366
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:981
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
Type * getPointerOperandType() const
Definition: Instructions.h:420
Value * getValueOperand()
Definition: Instructions.h:414
Value * getPointerOperand()
Definition: Instructions.h:417
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
OperandValueKind
Additional information about an operand's possible values.
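The cost hooks above drive every vectorization decision in this pass. A hedged, standalone sketch of two queries; the function name and the 4 x i32 type are illustrative, and real call sites pass far more context (operand info, the original instructions, subvector types, and so on):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include <optional>
using namespace llvm;

// Throughput cost of a 4 x i32 add followed by a reversing shuffle.
InstructionCost addPlusReverseCost(const TargetTransformInfo &TTI,
                                   LLVMContext &Ctx) {
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  constexpr auto CostKind = TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost VecAdd =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
  InstructionCost Rev = TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                           VecTy, /*Mask=*/std::nullopt,
                                           CostKind);
  return VecAdd + Rev;
}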
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:160
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:287
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:166
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Definition: User.h:73
op_iterator op_begin()
Definition: User.h:234
Value * getOperand(unsigned i) const
Definition: User.h:169
iterator_range< value_op_iterator > operand_values()
Definition: User.h:266
The Vector Function Database.
Definition: VectorUtils.h:29
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
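A small sketch of the use-tracking queries above; the helper names are illustrative:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Replace Old with New (assumed to have the same type) only when Old has a
// single remaining use.
bool replaceIfSingleUse(Value *Old, Value *New) {
  if (!Old->hasOneUse())
    return false;
  Old->replaceAllUsesWith(New);
  return true;
}

// Count the users of V that are instructions placed in block BB.
unsigned countUsersInBlock(const Value *V, const BasicBlock *BB) {
  unsigned N = 0;
  for (const User *U : V->users())
    if (const auto *I = dyn_cast<Instruction>(U))
      N += I->getParent() == BB;
  return N;
}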
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:74
self_iterator getIterator()
Definition: ilist_node.h:109
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:690
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
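Hedged sketch only: inside this file, the pass drives the BoUpSLP entry points listed above roughly in the following order for each candidate bundle; the exact call sites add reordering heuristics, minimum-VF checks, and a command-line cost threshold, so this stands in for, rather than reproduces, the real driver code:

// R is a fully constructed BoUpSLP instance, Chain the candidate scalars.
static bool trySLPVectorize(BoUpSLP &R, ArrayRef<Value *> Chain) {
  R.buildTree(Chain);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  InstructionCost Cost = R.getTreeCost();
  // The real driver compares Cost against a negative threshold option; a
  // plain "profitable only if negative" check stands in for that here.
  if (Cost >= 0)
    return false;
  R.vectorizeTree();
  return true;
}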
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:103
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1471
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:810
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:869
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
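The pattern-match combinators above are used to recognize reduction and load-combine idioms. A minimal sketch; the helper name is illustrative:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Recognize "or (shl X, C), Y" for any constant C, capturing X and Y.
static bool matchShlOr(Value *V, Value *&X, Value *&Y) {
  return match(V, m_Or(m_Shl(m_Value(X), m_ConstantInt()), m_Value(Y)));
}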
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
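getPointersDiff underlies the consecutive-access checks in this pass. A hedged sketch, assuming DataLayout and ScalarEvolution are available at the call site; the helper name is illustrative:

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

// True if B loads the element that immediately follows the one loaded by A
// (a distance of exactly one element of the loaded type).
static bool areConsecutiveLoads(LoadInst *A, LoadInst *B, const DataLayout &DL,
                                ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(A->getType(), A->getPointerOperand(), B->getType(),
                      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}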
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:456
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1715
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:128
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:950
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:540
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
Definition: LoopUtils.cpp:1154
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:40
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7063
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2059
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1754
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:419
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be converted into a vector (I).
Definition: LoopUtils.cpp:1223
constexpr int PoisonMaskElem
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
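A hedged usage sketch (the helper name is illustrative; it assumes V is an i32 value and DL is the enclosing module's DataLayout; see also KnownBits::isNonNegative further below):
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/KnownBits.h"
// Returns true if value tracking can prove the i32 value is non-negative,
// e.g. so that a zero-extend is as good as a sign-extend.
static bool knownNonNegativeI32(const llvm::Value *V,
                                const llvm::DataLayout &DL) {
  llvm::KnownBits Known(32);          // width must match V's type
  llvm::computeKnownBits(V, Known, DL);
  return Known.isNonNegative();
}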
DWARFExpression::Operation Op
auto max_element(R &&Range)
Definition: STLExtras.h:1986
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition: STLExtras.h:1921
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
InstructionCost Cost
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:439
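Worked values (the Skew parameter shifts the lattice of allowed results):
#include "llvm/Support/MathExtras.h"
#include <cassert>
void alignDownExamples() {
  assert(llvm::alignDown(37, 8) == 32);             // largest multiple of 8 <= 37
  assert(llvm::alignDown(37, 8, /*Skew=*/5) == 37); // 37 is already 5 (mod 8)
}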
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:613
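A short sketch (the function name and argument types are arbitrary; anything with a hash_value overload participates):
#include "llvm/ADT/Hashing.h"
#include <string>
llvm::hash_code hashTriple(unsigned Opcode, const std::string &Name, int Lane) {
  // Order matters: hash_combine(a, b) != hash_combine(b, a) in general.
  return llvm::hash_combine(Opcode, Name, Lane);
}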
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than the source type.
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2490
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits functions.
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are taken from the machine instruction.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:220
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector with EC lanes.
Function object to check whether the first component of a container supported by std::get (like std::pair and std::tuple) compares less than the first component of another container.
Definition: STLExtras.h:1450
Function object to check whether the second component of a container supported by std::get (like std::pair and std::tuple) compares less than the second component of another container.
Definition: STLExtras.h:1459
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const
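A hedged illustration of how an operand edge is described inside SLPVectorizer.cpp (UserTE here is an assumed, already-built TreeEntry pointer; the debug stream uses the operator<< shown above):
// Record that the current bundle feeds operand slot 1 of UserTE.
BoUpSLP::EdgeInfo EI(UserTE, /*EdgeIdx=*/1);
LLVM_DEBUG(dbgs() << "operand edge: " << EI << "\n");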