SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
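//
// Illustrative sketch of the input pattern (hypothetical IR, for orientation
// only): given four stores to consecutive addresses,
//   store i32 %a0, ptr %p
//   store i32 %a1, ptr %p1   ; %p1 = getelementptr i32, ptr %p, i64 1
//   store i32 %a2, ptr %p2   ; %p2 = getelementptr i32, ptr %p, i64 2
//   store i32 %a3, ptr %p3   ; %p3 = getelementptr i32, ptr %p, i64 3
// the pass builds a vectorizable tree over the stored operands and, if the
// cost model approves, emits a single `store <4 x i32>` instead.
//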
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<int>
117 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
118 cl::desc("Only vectorize if you gain more than this "
119 "number "));
120
121static cl::opt<bool>
122ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
123 cl::desc("Attempt to vectorize horizontal reductions"));
124
125static cl::opt<bool> ShouldStartVectorizeHorAtStore(
126 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
127 cl::desc(
128 "Attempt to vectorize horizontal reductions feeding into a store"));
129
130// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
131// even if we match a reduction but do not vectorize in the end.
132static cl::opt<bool> AllowHorRdxIdenityOptimization(
133 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
134 cl::desc("Allow optimization of original scalar identity operations on "
135 "matched horizontal reductions."));
136
137static cl::opt<int>
138 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
139 cl::desc("Attempt to vectorize for this register size in bits"));
140
141static cl::opt<unsigned>
142MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
143 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
144
145/// Limits the size of scheduling regions in a block.
146/// It avoids long compile times for _very_ large blocks where vector
147/// instructions are spread over a wide range.
148/// This limit is way higher than needed by real-world functions.
149static cl::opt<int>
150ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
151 cl::desc("Limit the size of the SLP scheduling region per block"));
152
153static cl::opt<unsigned> MinVectorRegSizeOption(
154 "slp-min-reg-size", cl::init(128), cl::Hidden,
155 cl::desc("Attempt to vectorize for this register size in bits"));
156
157static cl::opt<unsigned> RecursionMaxDepth(
158 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
159 cl::desc("Limit the recursion depth when building a vectorizable tree"));
160
161static cl::opt<unsigned> MinTreeSize(
162 "slp-min-tree-size", cl::init(3), cl::Hidden,
163 cl::desc("Only vectorize small trees if they are fully vectorizable"));
164
165// The maximum depth that the look-ahead score heuristic will explore.
166// The higher this value, the higher the compilation time overhead.
167static cl::opt<int> LookAheadMaxDepth(
168 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
169 cl::desc("The maximum look-ahead depth for operand reordering scores"));
170
171// The maximum depth that the look-ahead score heuristic will explore
172// when it is probing among candidates for vectorization tree roots.
173// The higher this value, the higher the compilation time overhead, but unlike
174// the similar limit for operand reordering this is used less frequently, so
175// the impact of a higher value is less noticeable.
176static cl::opt<int> RootLookAheadMaxDepth(
177 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
178 cl::desc("The maximum look-ahead depth for searching best rooting option"));
179
180static cl::opt<unsigned> MinProfitableStridedLoads(
181 "slp-min-strided-loads", cl::init(2), cl::Hidden,
182 cl::desc("The minimum number of loads, which should be considered strided, "
183 "if the stride is > 1 or is runtime value"));
184
185static cl::opt<unsigned> MaxProfitableLoadStride(
186 "slp-max-stride", cl::init(8), cl::Hidden,
187 cl::desc("The maximum stride, considered to be profitable."));
188
189static cl::opt<bool>
190 ViewSLPTree("view-slp-tree", cl::Hidden,
191 cl::desc("Display the SLP trees with Graphviz"));
192
193// Limit the number of alias checks. The limit is chosen so that
194// it has no negative effect on the llvm benchmarks.
195static const unsigned AliasedCheckLimit = 10;
196
197// Limit of the number of uses for potentially transformed instructions/values,
198// used in checks to avoid compile-time explosion.
199static constexpr int UsesLimit = 8;
200
201// Another limit for the alias checks: The maximum distance between load/store
202// instructions where alias checks are done.
203// This limit is useful for very large basic blocks.
204static const unsigned MaxMemDepDistance = 160;
205
206/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
207/// regions to be handled.
208static const int MinScheduleRegionSize = 16;
209
210/// Predicate for the element types that the SLP vectorizer supports.
211///
212/// The most important thing to filter here are types which are invalid in LLVM
213/// vectors. We also filter target specific types which have absolutely no
214/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
215/// avoids spending time checking the cost model and realizing that they will
216/// be inevitably scalarized.
217static bool isValidElementType(Type *Ty) {
218 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
219 !Ty->isPPC_FP128Ty();
220}
221
222/// \returns True if the value is a constant (but not globals/constant
223/// expressions).
224static bool isConstant(Value *V) {
225 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
226}
227
228/// Checks if \p V is one of vector-like instructions, i.e. undef,
229/// insertelement/extractelement with constant indices for fixed vector type or
230/// extractvalue instruction.
231static bool isVectorLikeInstWithConstOps(Value *V) {
232 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
233 !isa<ExtractValueInst, UndefValue>(V))
234 return false;
235 auto *I = dyn_cast<Instruction>(V);
236 if (!I || isa<ExtractValueInst>(I))
237 return true;
238 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
239 return false;
240 if (isa<ExtractElementInst>(I))
241 return isConstant(I->getOperand(1));
242 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
243 return isConstant(I->getOperand(2));
244}
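// For example (illustrative): `extractelement <4 x float> %v, i32 2` is
// vector-like because its index is a constant, while
// `extractelement <4 x float> %v, i32 %n` is rejected.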
245
246#if !defined(NDEBUG)
247/// Print a short descriptor of the instruction bundle suitable for debug output.
248static std::string shortBundleName(ArrayRef<Value *> VL) {
249 std::string Result;
250 raw_string_ostream OS(Result);
251 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
252 OS.flush();
253 return Result;
254}
255#endif
256
257/// \returns true if all of the instructions in \p VL are in the same block or
258/// false otherwise.
259static bool allSameBlock(ArrayRef<Value *> VL) {
260 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
261 if (!I0)
262 return false;
263 if (all_of(VL, isVectorLikeInstWithConstOps))
264 return true;
265
266 BasicBlock *BB = I0->getParent();
267 for (int I = 1, E = VL.size(); I < E; I++) {
268 auto *II = dyn_cast<Instruction>(VL[I]);
269 if (!II)
270 return false;
271
272 if (BB != II->getParent())
273 return false;
274 }
275 return true;
276}
277
278/// \returns True if all of the values in \p VL are constants (but not
279/// globals/constant expressions).
280static bool allConstant(ArrayRef<Value *> VL) {
281 // Constant expressions and globals can't be vectorized like normal integer/FP
282 // constants.
283 return all_of(VL, isConstant);
284}
285
286/// \returns True if all of the values in \p VL are identical or some of them
287/// are UndefValue.
288static bool isSplat(ArrayRef<Value *> VL) {
289 Value *FirstNonUndef = nullptr;
290 for (Value *V : VL) {
291 if (isa<UndefValue>(V))
292 continue;
293 if (!FirstNonUndef) {
294 FirstNonUndef = V;
295 continue;
296 }
297 if (V != FirstNonUndef)
298 return false;
299 }
300 return FirstNonUndef != nullptr;
301}
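// For example (illustrative): {%x, undef, %x} is treated as a splat of %x,
// whereas an all-undef list returns false because no non-undef value exists.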
302
303/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
304static bool isCommutative(Instruction *I) {
305 if (auto *Cmp = dyn_cast<CmpInst>(I))
306 return Cmp->isCommutative();
307 if (auto *BO = dyn_cast<BinaryOperator>(I))
308 return BO->isCommutative();
309 // TODO: This should check for generic Instruction::isCommutative(), but
310 // we need to confirm that the caller code correctly handles Intrinsics
311 // for example (does not have 2 operands).
312 return false;
313}
314
315/// \returns inserting index of InsertElement or InsertValue instruction,
316/// using Offset as base offset for index.
317static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
318 unsigned Offset = 0) {
319 int Index = Offset;
320 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
321 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
322 if (!VT)
323 return std::nullopt;
324 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
325 if (!CI)
326 return std::nullopt;
327 if (CI->getValue().uge(VT->getNumElements()))
328 return std::nullopt;
329 Index *= VT->getNumElements();
330 Index += CI->getZExtValue();
331 return Index;
332 }
333
334 const auto *IV = cast<InsertValueInst>(InsertInst);
335 Type *CurrentType = IV->getType();
336 for (unsigned I : IV->indices()) {
337 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
338 Index *= ST->getNumElements();
339 CurrentType = ST->getElementType(I);
340 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
341 Index *= AT->getNumElements();
342 CurrentType = AT->getElementType();
343 } else {
344 return std::nullopt;
345 }
346 Index += I;
347 }
348 return Index;
349}
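// Worked example (illustrative): for an `insertvalue {[2 x i32], [2 x i32]}`
// with indices (1, 0), the loop above computes (0 * 2 + 1) * 2 + 0 = 2, the
// flattened element position of the inserted scalar.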
350
351namespace {
352/// Specifies the way the mask should be analyzed for undefs/poisonous elements
353/// in the shuffle mask.
354enum class UseMask {
355 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
356 ///< check for the mask elements for the first argument (mask
357 ///< indices are in range [0:VF)).
358 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
359 ///< for the mask elements for the second argument (mask indices
360 ///< are in range [VF:2*VF))
361 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
362 ///< future shuffle elements and mark them as ones as being used
363 ///< in future. Non-undef elements are considered as unused since
364 ///< they're already marked as used in the mask.
365};
366} // namespace
367
368/// Prepares a use bitset for the given mask either for the first argument or
369/// for the second.
370static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
371 UseMask MaskArg) {
372 SmallBitVector UseMask(VF, true);
373 for (auto [Idx, Value] : enumerate(Mask)) {
374 if (Value == PoisonMaskElem) {
375 if (MaskArg == UseMask::UndefsAsMask)
376 UseMask.reset(Idx);
377 continue;
378 }
379 if (MaskArg == UseMask::FirstArg && Value < VF)
380 UseMask.reset(Value);
381 else if (MaskArg == UseMask::SecondArg && Value >= VF)
382 UseMask.reset(Value - VF);
383 }
384 return UseMask;
385}
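// Illustrative example: with VF = 4, Mask = {0, 2, -1, 3} and
// UseMask::FirstArg, bits 0, 2 and 3 are cleared because those lanes of the
// first vector are read by the mask; only bit 1 stays set (element unused).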
386
387/// Checks if the given value is actually an undefined constant vector.
388/// Also, if the \p UseMask is not empty, tries to check if the non-masked
389/// elements actually mask the insertelement buildvector, if any.
390template <bool IsPoisonOnly = false>
391static SmallBitVector isUndefVector(const Value *V,
392 const SmallBitVector &UseMask = {}) {
393 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
394 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
395 if (isa<T>(V))
396 return Res;
397 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
398 if (!VecTy)
399 return Res.reset();
400 auto *C = dyn_cast<Constant>(V);
401 if (!C) {
402 if (!UseMask.empty()) {
403 const Value *Base = V;
404 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
405 Base = II->getOperand(0);
406 if (isa<T>(II->getOperand(1)))
407 continue;
408 std::optional<unsigned> Idx = getInsertIndex(II);
409 if (!Idx) {
410 Res.reset();
411 return Res;
412 }
413 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
414 Res.reset(*Idx);
415 }
416 // TODO: Add analysis for shuffles here too.
417 if (V == Base) {
418 Res.reset();
419 } else {
420 SmallBitVector SubMask(UseMask.size(), false);
421 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
422 }
423 } else {
424 Res.reset();
425 }
426 return Res;
427 }
428 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
429 if (Constant *Elem = C->getAggregateElement(I))
430 if (!isa<T>(Elem) &&
431 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
432 Res.reset(I);
433 }
434 return Res;
435}
436
437/// Checks if the vector of instructions can be represented as a shuffle, like:
438/// %x0 = extractelement <4 x i8> %x, i32 0
439/// %x3 = extractelement <4 x i8> %x, i32 3
440/// %y1 = extractelement <4 x i8> %y, i32 1
441/// %y2 = extractelement <4 x i8> %y, i32 2
442/// %x0x0 = mul i8 %x0, %x0
443/// %x3x3 = mul i8 %x3, %x3
444/// %y1y1 = mul i8 %y1, %y1
445/// %y2y2 = mul i8 %y2, %y2
446/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
447/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
448/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
449/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
450/// ret <4 x i8> %ins4
451/// can be transformed into:
452/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
453/// i32 6>
454/// %2 = mul <4 x i8> %1, %1
455/// ret <4 x i8> %2
456/// Mask will return the Shuffle Mask equivalent to the extracted elements.
457/// TODO: Can we split off and reuse the shuffle mask detection from
458/// ShuffleVectorInst/getShuffleCost?
459static std::optional<TargetTransformInfo::ShuffleKind>
460isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
461 const auto *It =
462 find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
463 if (It == VL.end())
464 return std::nullopt;
465 auto *EI0 = cast<ExtractElementInst>(*It);
466 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
467 return std::nullopt;
468 unsigned Size =
469 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
470 Value *Vec1 = nullptr;
471 Value *Vec2 = nullptr;
472 enum ShuffleMode { Unknown, Select, Permute };
473 ShuffleMode CommonShuffleMode = Unknown;
474 Mask.assign(VL.size(), PoisonMaskElem);
475 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
476 // Undef can be represented as an undef element in a vector.
477 if (isa<UndefValue>(VL[I]))
478 continue;
479 auto *EI = cast<ExtractElementInst>(VL[I]);
480 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
481 return std::nullopt;
482 auto *Vec = EI->getVectorOperand();
483 // We can extractelement from undef or poison vector.
484 if (isUndefVector(Vec).all())
485 continue;
486 // All vector operands must have the same number of vector elements.
487 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
488 return std::nullopt;
489 if (isa<UndefValue>(EI->getIndexOperand()))
490 continue;
491 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
492 if (!Idx)
493 return std::nullopt;
494 // Undefined behavior if Idx is negative or >= Size.
495 if (Idx->getValue().uge(Size))
496 continue;
497 unsigned IntIdx = Idx->getValue().getZExtValue();
498 Mask[I] = IntIdx;
499 // For correct shuffling we have to have at most 2 different vector operands
500 // in all extractelement instructions.
501 if (!Vec1 || Vec1 == Vec) {
502 Vec1 = Vec;
503 } else if (!Vec2 || Vec2 == Vec) {
504 Vec2 = Vec;
505 Mask[I] += Size;
506 } else {
507 return std::nullopt;
508 }
509 if (CommonShuffleMode == Permute)
510 continue;
511 // If the extract index is not the same as the operation number, it is a
512 // permutation.
513 if (IntIdx != I) {
514 CommonShuffleMode = Permute;
515 continue;
516 }
517 CommonShuffleMode = Select;
518 }
519 // If we're not crossing lanes in different vectors, consider it as blending.
520 if (CommonShuffleMode == Select && Vec2)
521 return TargetTransformInfo::SK_Select;
522 // If Vec2 was never used, we have a permutation of a single vector, otherwise
523 // we have permutation of 2 vectors.
524 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
525 : TargetTransformInfo::SK_PermuteSingleSrc;
526}
527
528/// \returns True if Extract{Value,Element} instruction extracts element Idx.
529static std::optional<unsigned> getExtractIndex(Instruction *E) {
530 unsigned Opcode = E->getOpcode();
531 assert((Opcode == Instruction::ExtractElement ||
532 Opcode == Instruction::ExtractValue) &&
533 "Expected extractelement or extractvalue instruction.");
534 if (Opcode == Instruction::ExtractElement) {
535 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
536 if (!CI)
537 return std::nullopt;
538 return CI->getZExtValue();
539 }
540 auto *EI = cast<ExtractValueInst>(E);
541 if (EI->getNumIndices() != 1)
542 return std::nullopt;
543 return *EI->idx_begin();
544}
545
546namespace {
547
548/// Main data required for vectorization of instructions.
549struct InstructionsState {
550 /// The very first instruction in the list with the main opcode.
551 Value *OpValue = nullptr;
552
553 /// The main/alternate instruction.
554 Instruction *MainOp = nullptr;
555 Instruction *AltOp = nullptr;
556
557 /// The main/alternate opcodes for the list of instructions.
558 unsigned getOpcode() const {
559 return MainOp ? MainOp->getOpcode() : 0;
560 }
561
562 unsigned getAltOpcode() const {
563 return AltOp ? AltOp->getOpcode() : 0;
564 }
565
566 /// Some of the instructions in the list have alternate opcodes.
567 bool isAltShuffle() const { return AltOp != MainOp; }
568
569 bool isOpcodeOrAlt(Instruction *I) const {
570 unsigned CheckedOpcode = I->getOpcode();
571 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
572 }
573
574 InstructionsState() = delete;
575 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
576 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
577};
578
579} // end anonymous namespace
580
581/// Chooses the correct key for scheduling data. If \p Op has the same (or
582/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
583/// OpValue.
584static Value *isOneOf(const InstructionsState &S, Value *Op) {
585 auto *I = dyn_cast<Instruction>(Op);
586 if (I && S.isOpcodeOrAlt(I))
587 return Op;
588 return S.OpValue;
589}
590
591/// \returns true if \p Opcode is allowed as part of the main/alternate
592/// instruction for SLP vectorization.
593///
594/// Example of unsupported opcode is SDIV that can potentially cause UB if the
595/// "shuffled out" lane would result in division by zero.
596static bool isValidForAlternation(unsigned Opcode) {
597 if (Instruction::isIntDivRem(Opcode))
598 return false;
599
600 return true;
601}
602
603static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
604 const TargetLibraryInfo &TLI,
605 unsigned BaseIndex = 0);
606
607/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
608/// compatible instructions or constants, or just some other regular values.
609static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
610 Value *Op1, const TargetLibraryInfo &TLI) {
611 return (isConstant(BaseOp0) && isConstant(Op0)) ||
612 (isConstant(BaseOp1) && isConstant(Op1)) ||
613 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
614 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
615 BaseOp0 == Op0 || BaseOp1 == Op1 ||
616 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
617 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
618}
619
620/// \returns true if a compare instruction \p CI has similar "look" and
621/// same predicate as \p BaseCI, "as is" or with its operands and predicate
622/// swapped, false otherwise.
623static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
624 const TargetLibraryInfo &TLI) {
625 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
626 "Assessing comparisons of different types?");
627 CmpInst::Predicate BasePred = BaseCI->getPredicate();
628 CmpInst::Predicate Pred = CI->getPredicate();
629 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
630
631 Value *BaseOp0 = BaseCI->getOperand(0);
632 Value *BaseOp1 = BaseCI->getOperand(1);
633 Value *Op0 = CI->getOperand(0);
634 Value *Op1 = CI->getOperand(1);
635
636 return (BasePred == Pred &&
637 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
638 (BasePred == SwappedPred &&
639 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
640}
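// For example (illustrative): a base `icmp sgt i32 %a, %b` is matched by
// another `icmp sgt i32 %a, %b` directly and by `icmp slt i32 %b, %a` through
// the swapped predicate with swapped operands.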
641
642/// \returns analysis of the Instructions in \p VL described in
643/// InstructionsState, the Opcode that we suppose the whole list
644/// could be vectorized even if its structure is diverse.
645static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
646 const TargetLibraryInfo &TLI,
647 unsigned BaseIndex) {
648 // Make sure these are all Instructions.
649 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
650 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
651
652 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
653 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
654 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
655 CmpInst::Predicate BasePred =
656 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
657 : CmpInst::BAD_ICMP_PREDICATE;
658 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
659 unsigned AltOpcode = Opcode;
660 unsigned AltIndex = BaseIndex;
661
662 // Check for one alternate opcode from another BinaryOperator.
663 // TODO - generalize to support all operators (types, calls etc.).
664 auto *IBase = cast<Instruction>(VL[BaseIndex]);
665 Intrinsic::ID BaseID = 0;
666 SmallVector<VFInfo> BaseMappings;
667 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
668 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
669 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
670 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
671 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
672 }
673 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
674 auto *I = cast<Instruction>(VL[Cnt]);
675 unsigned InstOpcode = I->getOpcode();
676 if (IsBinOp && isa<BinaryOperator>(I)) {
677 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
678 continue;
679 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
680 isValidForAlternation(Opcode)) {
681 AltOpcode = InstOpcode;
682 AltIndex = Cnt;
683 continue;
684 }
685 } else if (IsCastOp && isa<CastInst>(I)) {
686 Value *Op0 = IBase->getOperand(0);
687 Type *Ty0 = Op0->getType();
688 Value *Op1 = I->getOperand(0);
689 Type *Ty1 = Op1->getType();
690 if (Ty0 == Ty1) {
691 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
692 continue;
693 if (Opcode == AltOpcode) {
694 assert(isValidForAlternation(Opcode) &&
695 isValidForAlternation(InstOpcode) &&
696 "Cast isn't safe for alternation, logic needs to be updated!");
697 AltOpcode = InstOpcode;
698 AltIndex = Cnt;
699 continue;
700 }
701 }
702 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
703 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
704 Type *Ty0 = BaseInst->getOperand(0)->getType();
705 Type *Ty1 = Inst->getOperand(0)->getType();
706 if (Ty0 == Ty1) {
707 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
708 // Check for compatible operands. If the corresponding operands are not
709 // compatible - need to perform alternate vectorization.
710 CmpInst::Predicate CurrentPred = Inst->getPredicate();
711 CmpInst::Predicate SwappedCurrentPred =
712 CmpInst::getSwappedPredicate(CurrentPred);
713
714 if (E == 2 &&
715 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
716 continue;
717
718 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
719 continue;
720 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
721 if (AltIndex != BaseIndex) {
722 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
723 continue;
724 } else if (BasePred != CurrentPred) {
725 assert(
726 isValidForAlternation(InstOpcode) &&
727 "CmpInst isn't safe for alternation, logic needs to be updated!");
728 AltIndex = Cnt;
729 continue;
730 }
731 CmpInst::Predicate AltPred = AltInst->getPredicate();
732 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
733 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
734 continue;
735 }
736 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
737 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
738 if (Gep->getNumOperands() != 2 ||
739 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
740 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
741 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
742 if (!isVectorLikeInstWithConstOps(EI))
743 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
744 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
745 auto *BaseLI = cast<LoadInst>(IBase);
746 if (!LI->isSimple() || !BaseLI->isSimple())
747 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
748 } else if (auto *Call = dyn_cast<CallInst>(I)) {
749 auto *CallBase = cast<CallInst>(IBase);
750 if (Call->getCalledFunction() != CallBase->getCalledFunction())
751 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
752 if (Call->hasOperandBundles() &&
753 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
754 Call->op_begin() + Call->getBundleOperandsEndIndex(),
755 CallBase->op_begin() +
756 CallBase->getBundleOperandsStartIndex()))
757 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
758 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
759 if (ID != BaseID)
760 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
761 if (!ID) {
762 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
763 if (Mappings.size() != BaseMappings.size() ||
764 Mappings.front().ISA != BaseMappings.front().ISA ||
765 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
766 Mappings.front().VectorName != BaseMappings.front().VectorName ||
767 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
768 Mappings.front().Shape.Parameters !=
769 BaseMappings.front().Shape.Parameters)
770 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
771 }
772 }
773 continue;
774 }
775 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
776 }
777
778 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
779 cast<Instruction>(VL[AltIndex]));
780}
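// Illustrative example: for VL = {add, sub, add, sub} over i32 operands, the
// returned state has MainOp set to the first add and AltOp to the first sub,
// so isAltShuffle() is true and the bundle is treated as an alternate-opcode
// (add/sub) group rather than rejected.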
781
782/// \returns true if all of the values in \p VL have the same type or false
783/// otherwise.
784static bool allSameType(ArrayRef<Value *> VL) {
785 Type *Ty = VL.front()->getType();
786 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
787}
788
789/// \returns True if in-tree use also needs extract. This refers to
790/// possible scalar operand in vectorized instruction.
791static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
792 TargetLibraryInfo *TLI) {
793 unsigned Opcode = UserInst->getOpcode();
794 switch (Opcode) {
795 case Instruction::Load: {
796 LoadInst *LI = cast<LoadInst>(UserInst);
797 return (LI->getPointerOperand() == Scalar);
798 }
799 case Instruction::Store: {
800 StoreInst *SI = cast<StoreInst>(UserInst);
801 return (SI->getPointerOperand() == Scalar);
802 }
803 case Instruction::Call: {
804 CallInst *CI = cast<CallInst>(UserInst);
805 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
806 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
807 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
808 Arg.value().get() == Scalar;
809 });
810 }
811 default:
812 return false;
813 }
814}
815
816/// \returns the AA location that is being accessed by the instruction.
817static MemoryLocation getLocation(Instruction *I) {
818 if (StoreInst *SI = dyn_cast<StoreInst>(I))
819 return MemoryLocation::get(SI);
820 if (LoadInst *LI = dyn_cast<LoadInst>(I))
821 return MemoryLocation::get(LI);
822 return MemoryLocation();
823}
824
825/// \returns True if the instruction is not a volatile or atomic load/store.
826static bool isSimple(Instruction *I) {
827 if (LoadInst *LI = dyn_cast<LoadInst>(I))
828 return LI->isSimple();
829 if (StoreInst *SI = dyn_cast<StoreInst>(I))
830 return SI->isSimple();
831 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
832 return !MI->isVolatile();
833 return true;
834}
835
836/// Shuffles \p Mask in accordance with the given \p SubMask.
837/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
838/// one but two input vectors.
839static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
840 bool ExtendingManyInputs = false) {
841 if (SubMask.empty())
842 return;
843 assert(
844 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
845 // Check if input scalars were extended to match the size of other node.
846 (SubMask.size() == Mask.size() &&
847 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
848 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
849 "SubMask with many inputs support must be larger than the mask.");
850 if (Mask.empty()) {
851 Mask.append(SubMask.begin(), SubMask.end());
852 return;
853 }
854 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
855 int TermValue = std::min(Mask.size(), SubMask.size());
856 for (int I = 0, E = SubMask.size(); I < E; ++I) {
857 if (SubMask[I] == PoisonMaskElem ||
858 (!ExtendingManyInputs &&
859 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
860 continue;
861 NewMask[I] = Mask[SubMask[I]];
862 }
863 Mask.swap(NewMask);
864}
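// Worked example (illustrative): Mask = {2, 0, 1} combined with
// SubMask = {1, 1, -1} produces {0, 0, -1}: each SubMask element picks a slot
// of the existing Mask, and poison elements remain poison.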
865
866/// Order may have elements assigned special value (size) which is out of
867/// bounds. Such indices only appear on places which correspond to undef values
868/// (see canReuseExtract for details) and used in order to avoid undef values
869/// have effect on operands ordering.
870/// The first loop below simply finds all unused indices and then the next loop
871/// nest assigns these indices for undef values positions.
872/// As an example below Order has two undef positions and they have assigned
873/// values 3 and 7 respectively:
874/// before: 6 9 5 4 9 2 1 0
875/// after: 6 3 5 4 7 2 1 0
876static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
877 const unsigned Sz = Order.size();
878 SmallBitVector UnusedIndices(Sz, /*t=*/true);
879 SmallBitVector MaskedIndices(Sz);
880 for (unsigned I = 0; I < Sz; ++I) {
881 if (Order[I] < Sz)
882 UnusedIndices.reset(Order[I]);
883 else
884 MaskedIndices.set(I);
885 }
886 if (MaskedIndices.none())
887 return;
888 assert(UnusedIndices.count() == MaskedIndices.count() &&
889 "Non-synced masked/available indices.");
890 int Idx = UnusedIndices.find_first();
891 int MIdx = MaskedIndices.find_first();
892 while (MIdx >= 0) {
893 assert(Idx >= 0 && "Indices must be synced.");
894 Order[MIdx] = Idx;
895 Idx = UnusedIndices.find_next(Idx);
896 MIdx = MaskedIndices.find_next(MIdx);
897 }
898}
899
900namespace llvm {
901
902static void inversePermutation(ArrayRef<unsigned> Indices,
903 SmallVectorImpl<int> &Mask) {
904 Mask.clear();
905 const unsigned E = Indices.size();
906 Mask.resize(E, PoisonMaskElem);
907 for (unsigned I = 0; I < E; ++I)
908 Mask[Indices[I]] = I;
909}
910
911/// Reorders the list of scalars in accordance with the given \p Mask.
912static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
913 ArrayRef<int> Mask) {
914 assert(!Mask.empty() && "Expected non-empty mask.");
915 SmallVector<Value *> Prev(Scalars.size(),
916 UndefValue::get(Scalars.front()->getType()));
917 Prev.swap(Scalars);
918 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
919 if (Mask[I] != PoisonMaskElem)
920 Scalars[Mask[I]] = Prev[I];
921}
922
923/// Checks if the provided value does not require scheduling. It does not
924/// require scheduling if this is not an instruction or it is an instruction
925/// that does not read/write memory and all operands are either not instructions
926/// or phi nodes or instructions from different blocks.
927static bool areAllOperandsNonInsts(Value *V) {
928 auto *I = dyn_cast<Instruction>(V);
929 if (!I)
930 return true;
931 return !mayHaveNonDefUseDependency(*I) &&
932 all_of(I->operands(), [I](Value *V) {
933 auto *IO = dyn_cast<Instruction>(V);
934 if (!IO)
935 return true;
936 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
937 });
938}
939
940/// Checks if the provided value does not require scheduling. It does not
941/// require scheduling if this is not an instruction or it is an instruction
942/// that does not read/write memory and all users are phi nodes or instructions
943/// from the different blocks.
944static bool isUsedOutsideBlock(Value *V) {
945 auto *I = dyn_cast<Instruction>(V);
946 if (!I)
947 return true;
948 // Limits the number of uses to save compile time.
949 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
950 all_of(I->users(), [I](User *U) {
951 auto *IU = dyn_cast<Instruction>(U);
952 if (!IU)
953 return true;
954 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
955 });
956}
957
958/// Checks if the specified value does not require scheduling. It does not
959/// require scheduling if all operands and all users do not need to be scheduled
960/// in the current basic block.
961static bool doesNotNeedToBeScheduled(Value *V) {
962 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
963}
964
965/// Checks if the specified array of instructions does not require scheduling.
966/// It is so if all either instructions have operands that do not require
967/// scheduling or their users do not require scheduling since they are phis or
968/// in other basic blocks.
969static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
970 return !VL.empty() &&
971 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
972}
973
974namespace slpvectorizer {
975
976/// Bottom Up SLP Vectorizer.
977class BoUpSLP {
978 struct TreeEntry;
979 struct ScheduleData;
980 class ShuffleCostEstimator;
981 class ShuffleInstructionBuilder;
982
983public:
984 /// Tracks the state we can represent the loads in the given sequence.
985 enum class LoadsState {
986 Gather,
987 Vectorize,
988 ScatterVectorize,
989 StridedVectorize
990 };
991
992 using ValueList = SmallVector<Value *, 8>;
993 using InstrList = SmallVector<Instruction *, 2>;
994 using ValueSet = SmallPtrSet<Value *, 16>;
995 using StoreList = SmallVector<StoreInst *, 8>;
996 using ExtraValueToDebugLocsMap =
997 MapVector<Value *, SmallVector<Instruction *, 2>>;
998 using OrdersType = SmallVector<unsigned, 4>;
999
1000 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1001 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1002 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1003 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1004 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
1005 DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
1006 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1007 // Use the vector register size specified by the target unless overridden
1008 // by a command-line option.
1009 // TODO: It would be better to limit the vectorization factor based on
1010 // data type rather than just register size. For example, x86 AVX has
1011 // 256-bit registers, but it does not support integer operations
1012 // at that width (that requires AVX2).
1013 if (MaxVectorRegSizeOption.getNumOccurrences())
1014 MaxVecRegSize = MaxVectorRegSizeOption;
1015 else
1016 MaxVecRegSize =
1017 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1018 .getFixedValue();
1019
1020 if (MinVectorRegSizeOption.getNumOccurrences())
1021 MinVecRegSize = MinVectorRegSizeOption;
1022 else
1023 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1024 }
1025
1026 /// Vectorize the tree that starts with the elements in \p VL.
1027 /// Returns the vectorized root.
1028 Value *vectorizeTree();
1029
1030 /// Vectorize the tree but with the list of externally used values \p
1031 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1032 /// generated extractvalue instructions.
1033 /// \param ReplacedExternals contains the list of replaced external values
1034 /// {scalar, replace} after emitting extractelement for external uses.
1035 Value *
1036 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1037 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1038 Instruction *ReductionRoot = nullptr);
1039
1040 /// \returns the cost incurred by unwanted spills and fills, caused by
1041 /// holding live values over call sites.
1042 InstructionCost getSpillCost() const;
1043
1044 /// \returns the vectorization cost of the subtree that starts at \p VL.
1045 /// A negative number means that this is profitable.
1046 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1047
1048 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1049 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1050 void buildTree(ArrayRef<Value *> Roots,
1051 const SmallDenseSet<Value *> &UserIgnoreLst);
1052
1053 /// Construct a vectorizable tree that starts at \p Roots.
1054 void buildTree(ArrayRef<Value *> Roots);
1055
1056 /// Returns whether the root node has in-tree uses.
1058 return !VectorizableTree.empty() &&
1059 !VectorizableTree.front()->UserTreeIndices.empty();
1060 }
1061
1062 /// Return the scalars of the root node.
1063 ArrayRef<Value *> getRootNodeScalars() const {
1064 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1065 return VectorizableTree.front()->Scalars;
1066 }
1067
1068 /// Builds external uses of the vectorized scalars, i.e. the list of
1069 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1070 /// ExternallyUsedValues contains additional list of external uses to handle
1071 /// vectorization of reductions.
1072 void
1073 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1074
1075 /// Clear the internal data structures that are created by 'buildTree'.
1076 void deleteTree() {
1077 VectorizableTree.clear();
1078 ScalarToTreeEntry.clear();
1079 MultiNodeScalars.clear();
1080 MustGather.clear();
1081 EntryToLastInstruction.clear();
1082 ExternalUses.clear();
1083 for (auto &Iter : BlocksSchedules) {
1084 BlockScheduling *BS = Iter.second.get();
1085 BS->clear();
1086 }
1087 MinBWs.clear();
1088 InstrElementSize.clear();
1089 UserIgnoreList = nullptr;
1090 PostponedGathers.clear();
1091 ValueToGatherNodes.clear();
1092 }
1093
1094 unsigned getTreeSize() const { return VectorizableTree.size(); }
1095
1096 /// Perform LICM and CSE on the newly generated gather sequences.
1097 void optimizeGatherSequence();
1098
1099 /// Checks if the specified gather tree entry \p TE can be represented as a
1100 /// shuffled vector entry + (possibly) permutation with other gathers. It
1101 /// implements the checks only for possibly ordered scalars (Loads,
1102 /// ExtractElement, ExtractValue), which can be part of the graph.
1103 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1104
1105 /// Sort loads into increasing pointers offsets to allow greater clustering.
1106 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1107
1108 /// Gets reordering data for the given tree entry. If the entry is vectorized
1109 /// - just return ReorderIndices, otherwise check if the scalars can be
1110 /// reordered and return the most optimal order.
1111 /// \return std::nullopt if ordering is not important, empty order, if
1112 /// identity order is important, or the actual order.
1113 /// \param TopToBottom If true, include the order of vectorized stores and
1114 /// insertelement nodes, otherwise skip them.
1115 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1116 bool TopToBottom);
1117
1118 /// Reorders the current graph to the most profitable order starting from the
1119 /// root node to the leaf nodes. The best order is chosen only from the nodes
1120 /// of the same size (vectorization factor). Smaller nodes are considered
1121 /// parts of subgraph with smaller VF and they are reordered independently. We
1122 /// can make it because we still need to extend smaller nodes to the wider VF
1123 /// and we can merge reordering shuffles with the widening shuffles.
1124 void reorderTopToBottom();
1125
1126 /// Reorders the current graph to the most profitable order starting from
1127 /// leaves to the root. It allows us to rotate small subgraphs and reduce the
1128 /// number of reshuffles if the leaf nodes use the same order. In this case we
1129 /// can merge the orders and just shuffle user node instead of shuffling its
1130 /// operands. Plus, even if the leaf nodes have different orders, it allows us to
1131 /// sink reordering in the graph closer to the root node and merge it later
1132 /// during analysis.
1133 void reorderBottomToTop(bool IgnoreReorder = false);
1134
1135 /// \return The vector element size in bits to use when vectorizing the
1136 /// expression tree ending at \p V. If V is a store, the size is the width of
1137 /// the stored value. Otherwise, the size is the width of the largest loaded
1138 /// value reaching V. This method is used by the vectorizer to calculate
1139 /// vectorization factors.
1140 unsigned getVectorElementSize(Value *V);
1141
1142 /// Compute the minimum type sizes required to represent the entries in a
1143 /// vectorizable tree.
1144 void computeMinimumValueSizes();
1145
1146 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1147 unsigned getMaxVecRegSize() const {
1148 return MaxVecRegSize;
1149 }
1150
1151 // \returns minimum vector register size as set by cl::opt.
1152 unsigned getMinVecRegSize() const {
1153 return MinVecRegSize;
1154 }
1155
1156 unsigned getMinVF(unsigned Sz) const {
1157 return std::max(2U, getMinVecRegSize() / Sz);
1158 }
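 // Illustrative numbers: with the default MinVecRegSize of 128 bits and
 // 32-bit scalars, getMinVF(32) yields std::max(2, 128 / 32) = 4 lanes.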
1159
1160 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1161 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1162 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1163 return MaxVF ? MaxVF : UINT_MAX;
1164 }
1165
1166 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1167 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1168 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1169 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1170 ///
1171 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1172 unsigned canMapToVector(Type *T) const;
1173
1174 /// \returns True if the VectorizableTree is both tiny and not fully
1175 /// vectorizable. We do not vectorize such trees.
1176 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1177
1178 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1179 /// can be load combined in the backend. Load combining may not be allowed in
1180 /// the IR optimizer, so we do not want to alter the pattern. For example,
1181 /// partially transforming a scalar bswap() pattern into vector code is
1182 /// effectively impossible for the backend to undo.
1183 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1184 /// may not be necessary.
1185 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1186
1187 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1188 /// can be load combined in the backend. Load combining may not be allowed in
1189 /// the IR optimizer, so we do not want to alter the pattern. For example,
1190 /// partially transforming a scalar bswap() pattern into vector code is
1191 /// effectively impossible for the backend to undo.
1192 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1193 /// may not be necessary.
1194 bool isLoadCombineCandidate() const;
1195
1196 /// Checks if the given array of loads can be represented as a vectorized,
1197 /// scatter or just simple gather.
1198 /// \param VL list of loads.
1199 /// \param VL0 main load value.
1200 /// \param Order returned order of load instructions.
1201 /// \param PointerOps returned list of pointer operands.
1202 /// \param TryRecursiveCheck used to check if long masked gather can be
1203 /// represented as a series of loads/insert subvector, if profitable.
1204 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1205 SmallVectorImpl<unsigned> &Order,
1206 SmallVectorImpl<Value *> &PointerOps,
1207 bool TryRecursiveCheck = true) const;
1208
1209 OptimizationRemarkEmitter *getORE() { return ORE; }
1210
1211 /// This structure holds any data we need about the edges being traversed
1212 /// during buildTree_rec(). We keep track of:
1213 /// (i) the user TreeEntry index, and
1214 /// (ii) the index of the edge.
1215 struct EdgeInfo {
1216 EdgeInfo() = default;
1217 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1218 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1219 /// The user TreeEntry.
1220 TreeEntry *UserTE = nullptr;
1221 /// The operand index of the use.
1222 unsigned EdgeIdx = UINT_MAX;
1223#ifndef NDEBUG
1224 friend inline raw_ostream &operator<<(raw_ostream &OS,
1225 const BoUpSLP::EdgeInfo &EI) {
1226 EI.dump(OS);
1227 return OS;
1228 }
1229 /// Debug print.
1230 void dump(raw_ostream &OS) const {
1231 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1232 << " EdgeIdx:" << EdgeIdx << "}";
1233 }
1234 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1235#endif
1236 bool operator == (const EdgeInfo &Other) const {
1237 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1238 }
1239 };
1240
1241 /// A helper class used for scoring candidates for two consecutive lanes.
1242 class LookAheadHeuristics {
1243 const TargetLibraryInfo &TLI;
1244 const DataLayout &DL;
1245 ScalarEvolution &SE;
1246 const BoUpSLP &R;
1247 int NumLanes; // Total number of lanes (aka vectorization factor).
1248 int MaxLevel; // The maximum recursion depth for accumulating score.
1249
1250 public:
1251 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1252 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1253 int MaxLevel)
1254 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1255 MaxLevel(MaxLevel) {}
1256
1257 // The hard-coded scores listed here are not very important, though it shall
1258 // be higher for better matches to improve the resulting cost. When
1259 // computing the scores of matching one sub-tree with another, we are
1260 // basically counting the number of values that are matching. So even if all
1261 // scores are set to 1, we would still get a decent matching result.
1262 // However, sometimes we have to break ties. For example we may have to
1263 // choose between matching loads vs matching opcodes. This is what these
1264 // scores are helping us with: they provide the order of preference. Also,
1265 // this is important if the scalar is externally used or used in another
1266 // tree entry node in the different lane.
1267
1268 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1269 static const int ScoreConsecutiveLoads = 4;
1270 /// The same load multiple times. This should have a better score than
1271 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1272 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1273 /// a vector load and 1.0 for a broadcast.
1274 static const int ScoreSplatLoads = 3;
1275 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1276 static const int ScoreReversedLoads = 3;
1277 /// A load candidate for masked gather.
1278 static const int ScoreMaskedGatherCandidate = 1;
1279 /// ExtractElementInst from same vector and consecutive indexes.
1280 static const int ScoreConsecutiveExtracts = 4;
1281 /// ExtractElementInst from same vector and reversed indices.
1282 static const int ScoreReversedExtracts = 3;
1283 /// Constants.
1284 static const int ScoreConstants = 2;
1285 /// Instructions with the same opcode.
1286 static const int ScoreSameOpcode = 2;
1287 /// Instructions with alt opcodes (e.g, add + sub).
1288 static const int ScoreAltOpcodes = 1;
1289 /// Identical instructions (a.k.a. splat or broadcast).
1290 static const int ScoreSplat = 1;
1291 /// Matching with an undef is preferable to failing.
1292 static const int ScoreUndef = 1;
1293 /// Score for failing to find a decent match.
1294 static const int ScoreFail = 0;
1295 /// Score if all users are vectorized.
1296 static const int ScoreAllUserVectorized = 1;
1297
1298 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1299 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1300 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1301 /// MainAltOps.
1302 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1303 ArrayRef<Value *> MainAltOps) const {
1304 if (!isValidElementType(V1->getType()) ||
1305 !isValidElementType(V2->getType()))
1306 return LookAheadHeuristics::ScoreFail;
1307
1308 if (V1 == V2) {
1309 if (isa<LoadInst>(V1)) {
1310 // Returns true if the users of V1 and V2 won't need to be extracted.
1311 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1312 // Bail out if we have too many uses to save compilation time.
1313 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1314 return false;
1315
1316 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1317 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1318 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1319 });
1320 };
1321 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1322 };
1323 // A broadcast of a load can be cheaper on some targets.
1324 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1325 ElementCount::getFixed(NumLanes)) &&
1326 ((int)V1->getNumUses() == NumLanes ||
1327 AllUsersAreInternal(V1, V2)))
1328 return LookAheadHeuristics::ScoreSplatLoads;
1329 }
1330 return LookAheadHeuristics::ScoreSplat;
1331 }
1332
1333 auto *LI1 = dyn_cast<LoadInst>(V1);
1334 auto *LI2 = dyn_cast<LoadInst>(V2);
1335 if (LI1 && LI2) {
1336 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1337 !LI2->isSimple())
1338 return LookAheadHeuristics::ScoreFail;
1339
1340 std::optional<int> Dist = getPointersDiff(
1341 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1342 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1343 if (!Dist || *Dist == 0) {
1344 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1345 getUnderlyingObject(LI2->getPointerOperand()) &&
1346 R.TTI->isLegalMaskedGather(
1347 FixedVectorType::get(LI1->getType(), NumLanes),
1348 LI1->getAlign()))
1349 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1350 return LookAheadHeuristics::ScoreFail;
1351 }
1352 // The distance is too large - still may be profitable to use masked
1353 // loads/gathers.
1354 if (std::abs(*Dist) > NumLanes / 2)
1355 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1356 // This still will detect consecutive loads, but we might have "holes"
1357 // in some cases. It is ok for non-power-2 vectorization and may produce
1358 // better results. It should not affect current vectorization.
1359 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1360 : LookAheadHeuristics::ScoreReversedLoads;
1361 }
1362
1363 auto *C1 = dyn_cast<Constant>(V1);
1364 auto *C2 = dyn_cast<Constant>(V2);
1365 if (C1 && C2)
1366 return LookAheadHeuristics::ScoreConstants;
1367
1368 // Extracts from consecutive indexes of the same vector better score as
1369 // the extracts could be optimized away.
1370 Value *EV1;
1371 ConstantInt *Ex1Idx;
1372 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1373 // Undefs are always profitable for extractelements.
1374 // Compiler can easily combine poison and extractelement <non-poison> or
1375 // undef and extractelement <poison>. But combining undef +
1376 // extractelement <non-poison-but-may-produce-poison> requires some
1377 // extra operations.
1378 if (isa<UndefValue>(V2))
1379 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1380 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1381 : LookAheadHeuristics::ScoreSameOpcode;
1382 Value *EV2 = nullptr;
1383 ConstantInt *Ex2Idx = nullptr;
1384 if (match(V2,
1385 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1386 m_Undef())))) {
1387 // Undefs are always profitable for extractelements.
1388 if (!Ex2Idx)
1389 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1390 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1391 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1392 if (EV2 == EV1) {
1393 int Idx1 = Ex1Idx->getZExtValue();
1394 int Idx2 = Ex2Idx->getZExtValue();
1395 int Dist = Idx2 - Idx1;
1396 // The distance is too large - still may be profitable to use
1397 // shuffles.
1398 if (std::abs(Dist) == 0)
1399 return LookAheadHeuristics::ScoreSplat;
1400 if (std::abs(Dist) > NumLanes / 2)
1401 return LookAheadHeuristics::ScoreSameOpcode;
1402 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1403 : LookAheadHeuristics::ScoreReversedExtracts;
1404 }
1405 return LookAheadHeuristics::ScoreAltOpcodes;
1406 }
1407 return LookAheadHeuristics::ScoreFail;
1408 }
1409
1410 auto *I1 = dyn_cast<Instruction>(V1);
1411 auto *I2 = dyn_cast<Instruction>(V2);
1412 if (I1 && I2) {
1413 if (I1->getParent() != I2->getParent())
1414 return LookAheadHeuristics::ScoreFail;
1415 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1416 Ops.push_back(I1);
1417 Ops.push_back(I2);
1418 InstructionsState S = getSameOpcode(Ops, TLI);
1419 // Note: Only consider instructions with <= 2 operands to avoid
1420 // complexity explosion.
1421 if (S.getOpcode() &&
1422 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1423 !S.isAltShuffle()) &&
1424 all_of(Ops, [&S](Value *V) {
1425 return cast<Instruction>(V)->getNumOperands() ==
1426 S.MainOp->getNumOperands();
1427 }))
1428 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1429 : LookAheadHeuristics::ScoreSameOpcode;
1430 }
1431
1432 if (isa<UndefValue>(V2))
1433 return LookAheadHeuristics::ScoreUndef;
1434
1435 return LookAheadHeuristics::ScoreFail;
1436 }
1437
1438 /// Go through the operands of \p LHS and \p RHS recursively until
1439 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1440 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1441 /// of \p U1 and \p U2), except at the beginning of the recursion where
1442 /// these are set to nullptr.
1443 ///
1444 /// For example:
1445 /// \verbatim
1446 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1447 /// \ / \ / \ / \ /
1448 /// + + + +
1449 /// G1 G2 G3 G4
1450 /// \endverbatim
1451 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1452 /// each level recursively, accumulating the score. It starts from matching
1453 /// the additions at level 0, then moves on to the loads (level 1). The
1454 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1455 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1456 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1457 /// Please note that the order of the operands does not matter, as we
1458 /// evaluate the score of all profitable combinations of operands. In
1459 /// other words the score of G1 and G4 is the same as G1 and G2. This
1460 /// heuristic is based on ideas described in:
1461 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1462 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1463 /// Luís F. W. Góes
1464 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1465 Instruction *U2, int CurrLevel,
1466 ArrayRef<Value *> MainAltOps) const {
1467
1468 // Get the shallow score of V1 and V2.
1469 int ShallowScoreAtThisLevel =
1470 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1471
1472 // If reached MaxLevel,
1473 // or if V1 and V2 are not instructions,
1474 // or if they are SPLAT,
1475 // or if they are not consecutive,
1476 // or if profitable to vectorize loads or extractelements, early return
1477 // the current cost.
1478 auto *I1 = dyn_cast<Instruction>(LHS);
1479 auto *I2 = dyn_cast<Instruction>(RHS);
1480 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1481 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1482 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1483 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1484 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1485 ShallowScoreAtThisLevel))
1486 return ShallowScoreAtThisLevel;
1487 assert(I1 && I2 && "Should have early exited.");
1488
1489 // Contains the I2 operand indexes that got matched with I1 operands.
1490 SmallSet<unsigned, 4> Op2Used;
1491
1492 // Recursion towards the operands of I1 and I2. We are trying all possible
1493 // operand pairs, and keeping track of the best score.
1494 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1495 OpIdx1 != NumOperands1; ++OpIdx1) {
1496 // Try to pair op1I with the best operand of I2.
1497 int MaxTmpScore = 0;
1498 unsigned MaxOpIdx2 = 0;
1499 bool FoundBest = false;
1500 // If I2 is commutative try all combinations.
1501 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1502 unsigned ToIdx = isCommutative(I2)
1503 ? I2->getNumOperands()
1504 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1505 assert(FromIdx <= ToIdx && "Bad index");
1506 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1507 // Skip operands already paired with OpIdx1.
1508 if (Op2Used.count(OpIdx2))
1509 continue;
1510 // Recursively calculate the cost at each level
1511 int TmpScore =
1512 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1513 I1, I2, CurrLevel + 1, std::nullopt);
1514 // Look for the best score.
1515 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1516 TmpScore > MaxTmpScore) {
1517 MaxTmpScore = TmpScore;
1518 MaxOpIdx2 = OpIdx2;
1519 FoundBest = true;
1520 }
1521 }
1522 if (FoundBest) {
1523 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1524 Op2Used.insert(MaxOpIdx2);
1525 ShallowScoreAtThisLevel += MaxTmpScore;
1526 }
1527 }
1528 return ShallowScoreAtThisLevel;
1529 }
1530 };
1531 /// A helper data structure to hold the operands of a vector of instructions.
1532 /// This supports a fixed vector length for all operand vectors.
1533 class VLOperands {
1534 /// For each operand we need (i) the value, and (ii) the opcode that it
1535 /// would be attached to if the expression was in a left-linearized form.
1536 /// This is required to avoid illegal operand reordering.
1537 /// For example:
1538 /// \verbatim
1539 /// 0 Op1
1540 /// |/
1541 /// Op1 Op2 Linearized + Op2
1542 /// \ / ----------> |/
1543 /// - -
1544 ///
1545 /// Op1 - Op2 (0 + Op1) - Op2
1546 /// \endverbatim
1547 ///
1548 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1549 ///
1550 /// Another way to think of this is to track all the operations across the
1551 /// path from the operand all the way to the root of the tree and to
1552 /// calculate the operation that corresponds to this path. For example, the
1553 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1554 /// corresponding operation is a '-' (which matches the one in the
1555 /// linearized tree, as shown above).
1556 ///
1557 /// For lack of a better term, we refer to this operation as Accumulated
1558 /// Path Operation (APO).
1559 struct OperandData {
1560 OperandData() = default;
1561 OperandData(Value *V, bool APO, bool IsUsed)
1562 : V(V), APO(APO), IsUsed(IsUsed) {}
1563 /// The operand value.
1564 Value *V = nullptr;
1565 /// TreeEntries only allow a single opcode, or an alternate sequence of
1566 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1567 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1568 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1569 /// (e.g., Add/Mul)
1570 bool APO = false;
1571 /// Helper data for the reordering function.
1572 bool IsUsed = false;
1573 };
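A minimal standalone sketch (plain C++, independent of the LLVM types above, with a made-up two-lane bundle) of how the APO rule described above would play out: the LHS operand never carries the inverse flag, and the RHS carries it only for the non-commutative lane.

#include <cstdio>

// Sketch: APO assignment for the two-lane bundle { a0 + b0, a1 - b1 }.
// Operand 0 (the LHS) is never attached to an inverse operation in the
// left-linearized form, so its APO is always false; operand 1 (the RHS)
// gets APO = true only for the non-commutative (sub) lane.
int main() {
  const bool IsCommutative[2] = {true /* add lane */, false /* sub lane */};
  for (unsigned Lane = 0; Lane != 2; ++Lane) {
    for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx) {
      bool IsInverseOperation = !IsCommutative[Lane];
      bool APO = (OpIdx == 0) ? false : IsInverseOperation;
      std::printf("Lane %u, operand %u: APO = %s\n", Lane, OpIdx,
                  APO ? "true" : "false");
    }
  }
  return 0;
}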
1574
1575 /// During operand reordering, we are trying to select the operand at lane
1576 /// that matches best with the operand at the neighboring lane. Our
1577 /// selection is based on the type of value we are looking for. For example,
1578 /// if the neighboring lane has a load, we need to look for a load that is
1579 /// accessing a consecutive address. These strategies are summarized in the
1580 /// 'ReorderingMode' enumerator.
1581 enum class ReorderingMode {
1582 Load, ///< Matching loads to consecutive memory addresses
1583 Opcode, ///< Matching instructions based on opcode (same or alternate)
1584 Constant, ///< Matching constants
1585 Splat, ///< Matching the same instruction multiple times (broadcast)
1586 Failed, ///< We failed to create a vectorizable group
1587 };
1588
1589 using OperandDataVec = SmallVector<OperandData, 2>;
1590
1591 /// A vector of operand vectors.
1592 SmallVector<OperandDataVec, 4> OpsVec;
1593
1594 const TargetLibraryInfo &TLI;
1595 const DataLayout &DL;
1596 ScalarEvolution &SE;
1597 const BoUpSLP &R;
1598
1599 /// \returns the operand data at \p OpIdx and \p Lane.
1600 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1601 return OpsVec[OpIdx][Lane];
1602 }
1603
1604 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1605 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1606 return OpsVec[OpIdx][Lane];
1607 }
1608
1609 /// Clears the used flag for all entries.
1610 void clearUsed() {
1611 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1612 OpIdx != NumOperands; ++OpIdx)
1613 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1614 ++Lane)
1615 OpsVec[OpIdx][Lane].IsUsed = false;
1616 }
1617
1618 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1619 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1620 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1621 }
1622
1623 /// \param Lane lane of the operands under analysis.
1624 /// \param OpIdx operand index in lane \p Lane for which we are selecting
1625 /// the best candidate.
1626 /// \param Idx operand index of the current candidate value.
1627 /// \returns The additional score due to possible broadcasting of the
1628 /// elements in the lane. It is more profitable to have a power-of-2 number
1629 /// of unique elements in the lane, since it will be vectorized with higher
1630 /// probability after removing duplicates. Currently the SLP vectorizer
1631 /// supports only vectorization of a power-of-2 number of unique scalars.
1632 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1633 Value *IdxLaneV = getData(Idx, Lane).V;
1634 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1635 return 0;
1636 SmallDenseSet<Value *, 4> Uniques;
1637 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1638 if (Ln == Lane)
1639 continue;
1640 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1641 if (!isa<Instruction>(OpIdxLnV))
1642 return 0;
1643 Uniques.insert(OpIdxLnV);
1644 }
1645 int UniquesCount = Uniques.size();
1646 int UniquesCntWithIdxLaneV =
1647 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1648 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1649 int UniquesCntWithOpIdxLaneV =
1650 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1651 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1652 return 0;
1653 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1654 UniquesCntWithOpIdxLaneV) -
1655 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1656 }
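A standalone numeric sketch of the padding-to-power-of-two delta computed above (plain C++, with a local powerOf2Ceil standing in for llvm::PowerOf2Ceil and made-up unique counts): a candidate that brings the number of unique scalars in the lane to a power of two scores higher than one that leaves padding.

#include <cstdio>

// Local stand-in for llvm::PowerOf2Ceil for small positive N: the smallest
// power of two that is >= N.
static unsigned powerOf2Ceil(unsigned N) {
  unsigned P = 1;
  while (P < N)
    P *= 2;
  return P;
}

int main() {
  // Made-up counts: the candidate value raises the unique count to 4 (a
  // power of two, padding 0), while keeping the current operand leaves it
  // at 3 (padding 1).
  unsigned UniquesCntWithIdxLaneV = 4;   // with the candidate value
  unsigned UniquesCntWithOpIdxLaneV = 3; // with the current operand value
  int Delta = (int)(powerOf2Ceil(UniquesCntWithOpIdxLaneV) -
                    UniquesCntWithOpIdxLaneV) -
              (int)(powerOf2Ceil(UniquesCntWithIdxLaneV) -
                    UniquesCntWithIdxLaneV);
  std::printf("splat score delta = %d\n", Delta); // (4 - 3) - (4 - 4) = 1
  return 0;
}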
1657
1658 /// \param Lane lane of the operands under analysis.
1659 /// \param OpIdx operand index in lane \p Lane for which we are selecting
1660 /// the best candidate.
1661 /// \param Idx operand index of the current candidate value.
1662 /// \returns The additional score for the scalar which users are all
1663 /// vectorized.
1664 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1665 Value *IdxLaneV = getData(Idx, Lane).V;
1666 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1667 // We do not care about the number of uses for vector-like instructions
1668 // (extractelement/extractvalue with constant indices); they are extracts
1669 // themselves and are already externally used. Vectorizing such
1670 // instructions does not add an extra extractelement instruction, it may
1671 // only remove one.
1672 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1673 isVectorLikeInstWithConstOps(OpIdxLaneV))
1675 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1676 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1677 return 0;
1678 return R.areAllUsersVectorized(IdxLaneI)
1679 ? LookAheadHeuristics::ScoreAllUserVectorized
1680 : 0;
1681 }
1682
1683 /// Score scaling factor for fully compatible instructions but with
1684 /// different number of external uses. Allows better selection of the
1685 /// instructions with less external uses.
1686 static const int ScoreScaleFactor = 10;
1687
1688 /// \returns the look-ahead score, which tells us how much the sub-trees
1689 /// rooted at \p LHS and \p RHS match; the more they match, the higher the
1690 /// score. This helps break ties in an informed way when we cannot decide on
1691 /// the order of the operands by just considering the immediate
1692 /// predecessors.
1693 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1694 int Lane, unsigned OpIdx, unsigned Idx,
1695 bool &IsUsed) {
1696 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1697 LookAheadMaxDepth);
1698 // Keep track of the instruction stack as we recurse into the operands
1699 // during the look-ahead score exploration.
1700 int Score =
1701 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1702 /*CurrLevel=*/1, MainAltOps);
1703 if (Score) {
1704 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1705 if (Score <= -SplatScore) {
1706 // Set the minimum score for splat-like sequence to avoid setting
1707 // failed state.
1708 Score = 1;
1709 } else {
1710 Score += SplatScore;
1711 // Scale score to see the difference between different operands
1712 // and similar operands but all vectorized/not all vectorized
1713 // uses. It does not affect actual selection of the best
1714 // compatible operand in general, it just allows selecting the
1715 // operand with all vectorized uses.
1716 Score *= ScoreScaleFactor;
1717 Score += getExternalUseScore(Lane, OpIdx, Idx);
1718 IsUsed = true;
1719 }
1720 }
1721 return Score;
1722 }
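A standalone sketch of the score combination above, with made-up input scores: the splat bonus is folded in first, then the total is scaled by ScoreScaleFactor so that the external-use bonus can only break ties between otherwise equally good candidates.

#include <cstdio>

int main() {
  const int ScoreScaleFactor = 10; // same constant as above
  int Score = 3;            // made-up look-ahead score of a candidate pair
  int SplatScore = 1;       // made-up result of getSplatScore()
  int ExternalUseScore = 1; // made-up result of getExternalUseScore()
  if (Score <= -SplatScore) {
    // Splat-like sequence: clamp to the minimum non-failing score.
    Score = 1;
  } else {
    Score += SplatScore;
    Score = Score * ScoreScaleFactor + ExternalUseScore;
  }
  std::printf("combined score = %d\n", Score); // (3 + 1) * 10 + 1 = 41
  return 0;
}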
1723
1724 /// Best defined scores per lanes between the passes. Used to choose the
1725 /// best operand (with the highest score) between the passes.
1726 /// The key - {Operand Index, Lane}.
1727 /// The value - the best score between the passes for the lane and the
1728 /// operand.
1729 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1730 BestScoresPerLanes;
1731
1732 // Search all operands in Ops[*][Lane] for the one that matches best
1733 // Ops[OpIdx][LastLane] and return its operand index.
1734 // If no good match can be found, return std::nullopt.
1735 std::optional<unsigned>
1736 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1737 ArrayRef<ReorderingMode> ReorderingModes,
1738 ArrayRef<Value *> MainAltOps) {
1739 unsigned NumOperands = getNumOperands();
1740
1741 // The operand of the previous lane at OpIdx.
1742 Value *OpLastLane = getData(OpIdx, LastLane).V;
1743
1744 // Our strategy mode for OpIdx.
1745 ReorderingMode RMode = ReorderingModes[OpIdx];
1746 if (RMode == ReorderingMode::Failed)
1747 return std::nullopt;
1748
1749 // The linearized opcode of the operand at OpIdx, Lane.
1750 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1751
1752 // The best operand index and its score.
1753 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1754 // are using the score to differentiate between the two.
1755 struct BestOpData {
1756 std::optional<unsigned> Idx;
1757 unsigned Score = 0;
1758 } BestOp;
1759 BestOp.Score =
1760 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1761 .first->second;
1762
1763 // Track if the operand must be marked as used. If the operand is set to
1764 // Score 1 explicitly (because of a non-power-of-2 number of unique
1765 // scalars), we may want to re-estimate the operands on following iterations.
1766 bool IsUsed =
1767 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1768 // Iterate through all unused operands and look for the best.
1769 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1770 // Get the operand at Idx and Lane.
1771 OperandData &OpData = getData(Idx, Lane);
1772 Value *Op = OpData.V;
1773 bool OpAPO = OpData.APO;
1774
1775 // Skip already selected operands.
1776 if (OpData.IsUsed)
1777 continue;
1778
1779 // Skip if we are trying to move the operand to a position with a
1780 // different opcode in the linearized tree form. This would break the
1781 // semantics.
1782 if (OpAPO != OpIdxAPO)
1783 continue;
1784
1785 // Look for an operand that matches the current mode.
1786 switch (RMode) {
1787 case ReorderingMode::Load:
1788 case ReorderingMode::Constant:
1789 case ReorderingMode::Opcode: {
1790 bool LeftToRight = Lane > LastLane;
1791 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1792 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1793 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1794 OpIdx, Idx, IsUsed);
1795 if (Score > static_cast<int>(BestOp.Score)) {
1796 BestOp.Idx = Idx;
1797 BestOp.Score = Score;
1798 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1799 }
1800 break;
1801 }
1802 case ReorderingMode::Splat:
1803 if (Op == OpLastLane)
1804 BestOp.Idx = Idx;
1805 break;
1806 case ReorderingMode::Failed:
1807 llvm_unreachable("Not expected Failed reordering mode.");
1808 }
1809 }
1810
1811 if (BestOp.Idx) {
1812 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1813 return BestOp.Idx;
1814 }
1815 // If we could not find a good match return std::nullopt.
1816 return std::nullopt;
1817 }
1818
1819 /// Helper for reorderOperandVecs.
1820 /// \returns the lane that we should start reordering from. This is the one
1821 /// which has the fewest operands that can freely move about, or is the least
1822 /// profitable to reorder because it already has the optimal set of operands.
1823 unsigned getBestLaneToStartReordering() const {
1824 unsigned Min = UINT_MAX;
1825 unsigned SameOpNumber = 0;
1826 // std::pair<unsigned, unsigned> is used to implement a simple voting
1827 // algorithm and choose the lane with the fewest operands that can freely
1828 // move about, or that is least profitable because it already has the
1829 // optimal set of operands. The first unsigned is a counter for
1830 // voting, the second unsigned is the counter of lanes with instructions
1831 // with same/alternate opcodes and same parent basic block.
1833 // Try to be closer to the original results, if we have multiple lanes
1834 // with same cost. If 2 lanes have the same cost, use the one with the
1835 // lowest index.
1836 for (int I = getNumLanes(); I > 0; --I) {
1837 unsigned Lane = I - 1;
1838 OperandsOrderData NumFreeOpsHash =
1839 getMaxNumOperandsThatCanBeReordered(Lane);
1840 // Compare the number of operands that can move and choose the one with
1841 // the least number.
1842 if (NumFreeOpsHash.NumOfAPOs < Min) {
1843 Min = NumFreeOpsHash.NumOfAPOs;
1844 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1845 HashMap.clear();
1846 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1847 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1848 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1849 // Select the most optimal lane in terms of number of operands that
1850 // should be moved around.
1851 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1852 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1853 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1854 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1855 auto *It = HashMap.find(NumFreeOpsHash.Hash);
1856 if (It == HashMap.end())
1857 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1858 else
1859 ++It->second.first;
1860 }
1861 }
1862 // Select the lane with the minimum counter.
1863 unsigned BestLane = 0;
1864 unsigned CntMin = UINT_MAX;
1865 for (const auto &Data : reverse(HashMap)) {
1866 if (Data.second.first < CntMin) {
1867 CntMin = Data.second.first;
1868 BestLane = Data.second.second;
1869 }
1870 }
1871 return BestLane;
1872 }
1873
1874 /// Data structure that helps to reorder operands.
1875 struct OperandsOrderData {
1876 /// The best number of operands with the same APOs, which can be
1877 /// reordered.
1878 unsigned NumOfAPOs = UINT_MAX;
1879 /// Number of operands with the same/alternate instruction opcode and
1880 /// parent.
1881 unsigned NumOpsWithSameOpcodeParent = 0;
1882 /// Hash for the actual operands ordering.
1883 /// Used to count operands, actually their position id and opcode
1884 /// value. It is used in the voting mechanism to find the lane with the
1885 /// fewest operands that can freely move about, or that is least profitable
1886 /// because it already has the optimal set of operands. It could be replaced
1887 /// with a SmallVector<unsigned> instead, but a hash code is faster and
1888 /// requires less memory.
1889 unsigned Hash = 0;
1890 };
1891 /// \returns the maximum number of operands that are allowed to be reordered
1892 /// for \p Lane and the number of compatible instructions (with the same
1893 /// parent/opcode). This is used as a heuristic for selecting the first lane
1894 /// to start operand reordering.
1895 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1896 unsigned CntTrue = 0;
1897 unsigned NumOperands = getNumOperands();
1898 // Operands with the same APO can be reordered. We therefore need to count
1899 // how many of them we have for each APO, like this: Cnt[APO] = x.
1900 // Since we only have two APOs, namely true and false, we can avoid using
1901 // a map. Instead we can simply count the number of operands that
1902 // correspond to one of them (in this case the 'true' APO), and calculate
1903 // the other by subtracting it from the total number of operands.
1904 // Operands with the same instruction opcode and parent are more
1905 // profitable since we don't need to move them in many cases; with high
1906 // probability such a lane can already be vectorized effectively.
1907 bool AllUndefs = true;
1908 unsigned NumOpsWithSameOpcodeParent = 0;
1909 Instruction *OpcodeI = nullptr;
1910 BasicBlock *Parent = nullptr;
1911 unsigned Hash = 0;
1912 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1913 const OperandData &OpData = getData(OpIdx, Lane);
1914 if (OpData.APO)
1915 ++CntTrue;
1916 // Use Boyer-Moore majority voting for finding the majority opcode and
1917 // the number of times it occurs.
1918 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1919 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
1920 I->getParent() != Parent) {
1921 if (NumOpsWithSameOpcodeParent == 0) {
1922 NumOpsWithSameOpcodeParent = 1;
1923 OpcodeI = I;
1924 Parent = I->getParent();
1925 } else {
1926 --NumOpsWithSameOpcodeParent;
1927 }
1928 } else {
1929 ++NumOpsWithSameOpcodeParent;
1930 }
1931 }
1932 Hash = hash_combine(
1933 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1934 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1935 }
1936 if (AllUndefs)
1937 return {};
1938 OperandsOrderData Data;
1939 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
1940 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
1941 Data.Hash = Hash;
1942 return Data;
1943 }
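The comment above names Boyer-Moore majority voting; a standalone sketch of that technique on a made-up list of opcode values (the numbers are arbitrary placeholders, not real LLVM opcodes): keep one candidate and a counter, and let every mismatch cancel one earlier match.

#include <cstdio>

int main() {
  const unsigned Opcodes[] = {13 /*Add*/, 13 /*Add*/, 15 /*Mul*/, 13 /*Add*/};
  unsigned Candidate = 0, Count = 0;
  for (unsigned Op : Opcodes) {
    if (Count == 0) {
      Candidate = Op; // adopt a new candidate when the counter is exhausted
      Count = 1;
    } else if (Op == Candidate) {
      ++Count; // another vote for the current candidate
    } else {
      --Count; // a mismatch cancels one previous vote
    }
  }
  std::printf("majority candidate opcode = %u\n", Candidate); // prints 13
  return 0;
}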
1944
1945 /// Go through the instructions in VL and append their operands.
1946 void appendOperandsOfVL(ArrayRef<Value *> VL) {
1947 assert(!VL.empty() && "Bad VL");
1948 assert((empty() || VL.size() == getNumLanes()) &&
1949 "Expected same number of lanes");
1950 assert(isa<Instruction>(VL[0]) && "Expected instruction");
1951 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1952 OpsVec.resize(NumOperands);
1953 unsigned NumLanes = VL.size();
1954 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1955 OpsVec[OpIdx].resize(NumLanes);
1956 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1957 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
1958 // Our tree has just 3 nodes: the root and two operands.
1959 // It is therefore trivial to get the APO. We only need to check the
1960 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
1961 // RHS operand. The LHS operand of both add and sub is never attached
1962 /// to an inverse operation in the linearized form, therefore its APO
1963 // is false. The RHS is true only if VL[Lane] is an inverse operation.
1964
1965 // Since operand reordering is performed on groups of commutative
1966 // operations or alternating sequences (e.g., +, -), we can safely
1967 // tell the inverse operations by checking commutativity.
1968 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
1969 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
1970 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
1971 APO, false};
1972 }
1973 }
1974 }
1975
1976 /// \returns the number of operands.
1977 unsigned getNumOperands() const { return OpsVec.size(); }
1978
1979 /// \returns the number of lanes.
1980 unsigned getNumLanes() const { return OpsVec[0].size(); }
1981
1982 /// \returns the operand value at \p OpIdx and \p Lane.
1983 Value *getValue(unsigned OpIdx, unsigned Lane) const {
1984 return getData(OpIdx, Lane).V;
1985 }
1986
1987 /// \returns true if the data structure is empty.
1988 bool empty() const { return OpsVec.empty(); }
1989
1990 /// Clears the data.
1991 void clear() { OpsVec.clear(); }
1992
1993 /// \returns true if there are enough operands identical to \p Op to fill
1994 /// the whole vector.
1995 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
1996 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
1997 bool OpAPO = getData(OpIdx, Lane).APO;
1998 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
1999 if (Ln == Lane)
2000 continue;
2001 // This is set to true if we found a candidate for broadcast at Lane.
2002 bool FoundCandidate = false;
2003 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2004 OperandData &Data = getData(OpI, Ln);
2005 if (Data.APO != OpAPO || Data.IsUsed)
2006 continue;
2007 if (Data.V == Op) {
2008 FoundCandidate = true;
2009 Data.IsUsed = true;
2010 break;
2011 }
2012 }
2013 if (!FoundCandidate)
2014 return false;
2015 }
2016 return true;
2017 }
2018
2019 public:
2020 /// Initialize with all the operands of the instruction vector \p RootVL.
2021 VLOperands(ArrayRef<Value *> RootVL, const TargetLibraryInfo &TLI,
2022 const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
2023 : TLI(TLI), DL(DL), SE(SE), R(R) {
2024 // Append all the operands of RootVL.
2025 appendOperandsOfVL(RootVL);
2026 }
2027
2028 /// \returns a value vector with the operands across all lanes for the
2029 /// operand at \p OpIdx.
2030 ValueList getVL(unsigned OpIdx) const {
2031 ValueList OpVL(OpsVec[OpIdx].size());
2032 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2033 "Expected same num of lanes across all operands");
2034 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2035 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2036 return OpVL;
2037 }
2038
2039 // Performs operand reordering for 2 or more operands.
2040 // The original operands are in OrigOps[OpIdx][Lane].
2041 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2042 void reorder() {
2043 unsigned NumOperands = getNumOperands();
2044 unsigned NumLanes = getNumLanes();
2045 // Each operand has its own mode. We are using this mode to help us select
2046 // the instructions for each lane, so that they match best with the ones
2047 // we have selected so far.
2048 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2049
2050 // This is a greedy single-pass algorithm. We are going over each lane
2051 // once and deciding on the best order right away with no back-tracking.
2052 // However, in order to increase its effectiveness, we start with the lane
2053 // that has operands that can move the least. For example, given the
2054 // following lanes:
2055 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2056 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2057 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2058 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2059 // we will start at Lane 1, since the operands of the subtraction cannot
2060 // be reordered. Then we will visit the rest of the lanes in a circular
2061 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2062
2063 // Find the first lane that we will start our search from.
2064 unsigned FirstLane = getBestLaneToStartReordering();
2065
2066 // Initialize the modes.
2067 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2068 Value *OpLane0 = getValue(OpIdx, FirstLane);
2069 // Keep track if we have instructions with all the same opcode on one
2070 // side.
2071 if (isa<LoadInst>(OpLane0))
2072 ReorderingModes[OpIdx] = ReorderingMode::Load;
2073 else if (isa<Instruction>(OpLane0)) {
2074 // Check if OpLane0 should be broadcast.
2075 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2076 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2077 else
2078 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2079 }
2080 else if (isa<Constant>(OpLane0))
2081 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2082 else if (isa<Argument>(OpLane0))
2083 // Our best hope is a Splat. It may save some cost in some cases.
2084 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2085 else
2086 // NOTE: This should be unreachable.
2087 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2088 }
2089
2090 // Check that we don't have the same operands. There is no need to reorder
2091 // if the operands are just a perfect-diamond or shuffled-diamond match. Do
2092 // not skip reordering for possible broadcasts or for a non-power-of-2
2093 // number of scalars (just for now).
2094 auto &&SkipReordering = [this]() {
2095 SmallPtrSet<Value *, 4> UniqueValues;
2096 ArrayRef<OperandData> Op0 = OpsVec.front();
2097 for (const OperandData &Data : Op0)
2098 UniqueValues.insert(Data.V);
2099 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2100 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2101 return !UniqueValues.contains(Data.V);
2102 }))
2103 return false;
2104 }
2105 // TODO: Check if we can remove a check for non-power-2 number of
2106 // scalars after full support of non-power-2 vectorization.
2107 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2108 };
2109
2110 // If the initial strategy fails for any of the operand indexes, then we
2111 // perform reordering again in a second pass. This helps avoid assigning
2112 // high priority to the failed strategy, and should improve reordering for
2113 // the non-failed operand indexes.
2114 for (int Pass = 0; Pass != 2; ++Pass) {
2115 // Check if there is no need to reorder operands since they are a perfect or
2116 // shuffled diamond match.
2117 // Need to do it to avoid extra external use cost counting for
2118 // shuffled matches, which may cause regressions.
2119 if (SkipReordering())
2120 break;
2121 // Skip the second pass if the first pass did not fail.
2122 bool StrategyFailed = false;
2123 // Mark all operand data as free to use.
2124 clearUsed();
2125 // We keep the original operand order for the FirstLane, so reorder the
2126 // rest of the lanes. We are visiting the nodes in a circular fashion,
2127 // using FirstLane as the center point and increasing the radius
2128 // distance.
2129 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2130 for (unsigned I = 0; I < NumOperands; ++I)
2131 MainAltOps[I].push_back(getData(I, FirstLane).V);
2132
2133 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2134 // Visit the lane on the right and then the lane on the left.
2135 for (int Direction : {+1, -1}) {
2136 int Lane = FirstLane + Direction * Distance;
2137 if (Lane < 0 || Lane >= (int)NumLanes)
2138 continue;
2139 int LastLane = Lane - Direction;
2140 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2141 "Out of bounds");
2142 // Look for a good match for each operand.
2143 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2144 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2145 std::optional<unsigned> BestIdx = getBestOperand(
2146 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2147 // By not selecting a value, we allow the operands that follow to
2148 // select a better matching value. We will get a non-null value in
2149 // the next run of getBestOperand().
2150 if (BestIdx) {
2151 // Swap the current operand with the one returned by
2152 // getBestOperand().
2153 swap(OpIdx, *BestIdx, Lane);
2154 } else {
2155 // We failed to find a best operand, set mode to 'Failed'.
2156 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2157 // Enable the second pass.
2158 StrategyFailed = true;
2159 }
2160 // Try to get the alternate opcode and follow it during analysis.
2161 if (MainAltOps[OpIdx].size() != 2) {
2162 OperandData &AltOp = getData(OpIdx, Lane);
2163 InstructionsState OpS =
2164 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2165 if (OpS.getOpcode() && OpS.isAltShuffle())
2166 MainAltOps[OpIdx].push_back(AltOp.V);
2167 }
2168 }
2169 }
2170 }
2171 // Skip second pass if the strategy did not fail.
2172 if (!StrategyFailed)
2173 break;
2174 }
2175 }
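A standalone sketch of the circular visiting order used in the loop above: starting from FirstLane, lanes are visited at increasing Distance, the right neighbour before the left one, skipping out-of-range lanes. With FirstLane = 1 and four lanes it prints lanes 2, 0, 3, matching the worked example in the comment.

#include <cstdio>
#include <initializer_list>

int main() {
  const int NumLanes = 4;
  const int FirstLane = 1; // the lane whose operand order is kept as-is
  for (int Distance = 1; Distance != NumLanes; ++Distance) {
    for (int Direction : {+1, -1}) {
      int Lane = FirstLane + Direction * Distance;
      if (Lane < 0 || Lane >= NumLanes)
        continue; // out of range, nothing to visit there
      std::printf("visit lane %d (LastLane = %d)\n", Lane, Lane - Direction);
    }
  }
  return 0;
}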
2176
2177#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2178 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2179 switch (RMode) {
2180 case ReorderingMode::Load:
2181 return "Load";
2182 case ReorderingMode::Opcode:
2183 return "Opcode";
2184 case ReorderingMode::Constant:
2185 return "Constant";
2186 case ReorderingMode::Splat:
2187 return "Splat";
2188 case ReorderingMode::Failed:
2189 return "Failed";
2190 }
2191 llvm_unreachable("Unimplemented Reordering Type");
2192 }
2193
2194 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2195 raw_ostream &OS) {
2196 return OS << getModeStr(RMode);
2197 }
2198
2199 /// Debug print.
2200 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2201 printMode(RMode, dbgs());
2202 }
2203
2204 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2205 return printMode(RMode, OS);
2206 }
2207
2208 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2209 const unsigned Indent = 2;
2210 unsigned Cnt = 0;
2211 for (const OperandDataVec &OpDataVec : OpsVec) {
2212 OS << "Operand " << Cnt++ << "\n";
2213 for (const OperandData &OpData : OpDataVec) {
2214 OS.indent(Indent) << "{";
2215 if (Value *V = OpData.V)
2216 OS << *V;
2217 else
2218 OS << "null";
2219 OS << ", APO:" << OpData.APO << "}\n";
2220 }
2221 OS << "\n";
2222 }
2223 return OS;
2224 }
2225
2226 /// Debug print.
2227 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2228#endif
2229 };
2230
2231 /// Evaluate each pair in \p Candidates and return the index into
2232 /// \p Candidates of the pair with the highest score, deemed to have the best
2233 /// chance to form the root of a profitable tree to vectorize. Return
2234 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2235 /// \param Limit The lower limit of the score considered to be good enough.
2236 std::optional<int>
2237 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2238 int Limit = LookAheadHeuristics::ScoreFail) {
2239 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2240 RootLookAheadMaxDepth);
2241 int BestScore = Limit;
2242 std::optional<int> Index;
2243 for (int I : seq<int>(0, Candidates.size())) {
2244 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2245 Candidates[I].second,
2246 /*U1=*/nullptr, /*U2=*/nullptr,
2247 /*Level=*/1, std::nullopt);
2248 if (Score > BestScore) {
2249 BestScore = Score;
2250 Index = I;
2251 }
2252 }
2253 return Index;
2254 }
2255
2256 /// Checks if the instruction is marked for deletion.
2257 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2258
2259 /// Removes an instruction from its block and eventually deletes it.
2260 /// It's like Instruction::eraseFromParent() except that the actual deletion
2261 /// is delayed until BoUpSLP is destructed.
2262 void eraseInstruction(Instruction *I) {
2263 DeletedInstructions.insert(I);
2264 }
2265
2266 /// Checks if the instruction was already analyzed for being possible
2267 /// reduction root.
2268 bool isAnalyzedReductionRoot(Instruction *I) const {
2269 return AnalyzedReductionsRoots.count(I);
2270 }
2271 /// Register given instruction as already analyzed for being possible
2272 /// reduction root.
2273 void analyzedReductionRoot(Instruction *I) {
2274 AnalyzedReductionsRoots.insert(I);
2275 }
2276 /// Checks if the provided list of reduced values was checked already for
2277 /// vectorization.
2278 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2279 return AnalyzedReductionVals.contains(hash_value(VL));
2280 }
2281 /// Adds the list of reduced values to list of already checked values for the
2282 /// vectorization.
2283 void analyzedReductionVals(ArrayRef<Value *> VL) {
2284 AnalyzedReductionVals.insert(hash_value(VL));
2285 }
2286 /// Clear the list of the analyzed reduction root instructions.
2287 void clearReductionData() {
2288 AnalyzedReductionsRoots.clear();
2289 AnalyzedReductionVals.clear();
2290 }
2291 /// Checks if the given value is gathered in one of the nodes.
2292 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2293 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2294 }
2295
2296 /// Check if the value is vectorized in the tree.
2297 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2298
2299 ~BoUpSLP();
2300
2301private:
2302 /// Determine if a vectorized value \p V can be demoted to
2303 /// a smaller type with a truncation. We collect the values that will be
2304 /// demoted in ToDemote and additional roots that require investigating in
2305 /// Roots.
2306 /// \param DemotedConsts list of Instruction/OperandIndex pairs that are
2307 /// constant and to be demoted. Required to correctly identify constant nodes
2308 /// to be demoted.
2309 bool collectValuesToDemote(
2310 Value *V, SmallVectorImpl<Value *> &ToDemote,
2311 DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
2312 SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const;
2313
2314 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2315 /// reordering (i.e. the operands can be reordered because they have only one
2316 /// user and are reorderable).
2317 /// \param ReorderableGathers List of all gather nodes that require reordering
2318 /// (e.g., gathers of extractelements or partially vectorizable loads).
2319 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2320 /// reordering, subset of \p NonVectorized.
2321 bool
2322 canReorderOperands(TreeEntry *UserTE,
2323 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2324 ArrayRef<TreeEntry *> ReorderableGathers,
2325 SmallVectorImpl<TreeEntry *> &GatherOps);
2326
2327 /// Checks if the given \p TE is a gather node with clustered reused scalars
2328 /// and reorders it per given \p Mask.
2329 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2330
2331 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2332 /// if any. If it is not vectorized (gather node), returns nullptr.
2333 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2334 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2335 TreeEntry *TE = nullptr;
2336 const auto *It = find_if(VL, [&](Value *V) {
2337 TE = getTreeEntry(V);
2338 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2339 return true;
2340 auto It = MultiNodeScalars.find(V);
2341 if (It != MultiNodeScalars.end()) {
2342 for (TreeEntry *E : It->second) {
2343 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2344 TE = E;
2345 return true;
2346 }
2347 }
2348 }
2349 return false;
2350 });
2351 if (It != VL.end()) {
2352 assert(TE->isSame(VL) && "Expected same scalars.");
2353 return TE;
2354 }
2355 return nullptr;
2356 }
2357
2358 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2359 /// if any. If it is not vectorized (gather node), returns nullptr.
2360 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2361 unsigned OpIdx) const {
2362 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2363 const_cast<TreeEntry *>(UserTE), OpIdx);
2364 }
2365
2366 /// Checks if all users of \p I are the part of the vectorization tree.
2367 bool areAllUsersVectorized(
2368 Instruction *I,
2369 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2370
2371 /// Return information about the vector formed for the specified index
2372 /// of a vector of (the same) instruction.
2374
2375 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2376 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2377
2378 /// \returns the cost of the vectorizable entry.
2379 InstructionCost getEntryCost(const TreeEntry *E,
2380 ArrayRef<Value *> VectorizedVals,
2381 SmallPtrSetImpl<Value *> &CheckedExtracts);
2382
2383 /// This is the recursive part of buildTree.
2384 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2385 const EdgeInfo &EI);
2386
2387 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2388 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2389 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2390 /// returns false, setting \p CurrentOrder to either an empty vector or a
2391 /// non-identity permutation that allows reusing the extract instructions.
2392 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2393 /// extract order.
2394 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2395 SmallVectorImpl<unsigned> &CurrentOrder,
2396 bool ResizeAllowed = false) const;
2397
2398 /// Vectorize a single entry in the tree.
2399 /// \param PostponedPHIs true if the emission of phi nodes needs to be
2400 /// postponed to avoid issues with def-use order.
2401 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2402
2403 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2404 /// \p E.
2405 /// \param PostponedPHIs true if the emission of phi nodes needs to be
2406 /// postponed to avoid issues with def-use order.
2407 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2408
2409 /// Create a new vector from a list of scalar values. Produces a sequence
2410 /// which exploits values reused across lanes, and arranges the inserts
2411 /// for ease of later optimization.
2412 template <typename BVTy, typename ResTy, typename... Args>
2413 ResTy processBuildVector(const TreeEntry *E, Args &...Params);
2414
2415 /// Create a new vector from a list of scalar values. Produces a sequence
2416 /// which exploits values reused across lanes, and arranges the inserts
2417 /// for ease of later optimization.
2418 Value *createBuildVector(const TreeEntry *E);
2419
2420 /// Returns the instruction in the bundle, which can be used as a base point
2421 /// for scheduling. Usually it is the last instruction in the bundle, except
2422 /// for the case when all operands are external (in this case, it is the first
2423 /// instruction in the list).
2424 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2425
2426 /// Tries to find extractelement instructions with constant indices from fixed
2427 /// vector type and gather such instructions into a group, which most likely
2428 /// will be detected as a shuffle of 1 or 2 input vectors. If this attempt
2429 /// was successful, the matched scalars are replaced by poison values in \p VL
2430 /// for future analysis.
2431 std::optional<TargetTransformInfo::ShuffleKind>
2432 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2433 SmallVectorImpl<int> &Mask) const;
2434
2435 /// Tries to find extractelement instructions with constant indices from fixed
2436 /// vector type and gather such instructions into a group, which most likely
2437 /// will be detected as a shuffle of 1 or 2 input vectors. If this attempt
2438 /// was successful, the matched scalars are replaced by poison values in \p VL
2439 /// for future analysis.
2441 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2443 unsigned NumParts) const;
2444
2445 /// Checks if the gathered \p VL can be represented as a single register
2446 /// shuffle(s) of previous tree entries.
2447 /// \param TE Tree entry checked for permutation.
2448 /// \param VL List of scalars (a subset of the TE scalars), checked for
2449 /// permutations. Must form single-register vector.
2450 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2451 /// forces the mask to be built using the original vector values, without
2452 /// relying on the potential reordering.
2453 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2454 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2455 std::optional<TargetTransformInfo::ShuffleKind>
2456 isGatherShuffledSingleRegisterEntry(
2457 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2458 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2459 bool ForOrder);
2460
2461 /// Checks if the gathered \p VL can be represented as multi-register
2462 /// shuffle(s) of previous tree entries.
2463 /// \param TE Tree entry checked for permutation.
2464 /// \param VL List of scalars (a subset of the TE scalars), checked for
2465 /// permutations.
2466 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2467 /// forces the mask to be built using the original vector values, without
2468 /// relying on the potential reordering.
2469 /// \returns per-register series of ShuffleKind, if gathered values can be
2470 /// represented as shuffles of previous tree entries. \p Mask is filled with
2471 /// the shuffle mask (also on per-register base).
2473 isGatherShuffledEntry(
2474 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2476 unsigned NumParts, bool ForOrder = false);
2477
2478 /// \returns the scalarization cost for this list of values. Assuming that
2479 /// this subtree gets vectorized, we may need to extract the values from the
2480 /// roots. This method calculates the cost of extracting the values.
2481 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2482 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
2483
2484 /// Set the Builder insert point to one after the last instruction in
2485 /// the bundle
2486 void setInsertPointAfterBundle(const TreeEntry *E);
2487
2488 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
2489 /// specified, the starting vector value is poison.
2490 Value *gather(ArrayRef<Value *> VL, Value *Root);
2491
2492 /// \returns whether the VectorizableTree is fully vectorizable and will
2493 /// be beneficial even if the tree height is tiny.
2494 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2495
2496 /// Reorder commutative or alt operands to get better probability of
2497 /// generating vectorized code.
2498 static void reorderInputsAccordingToOpcode(
2499 ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
2500 SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
2501 const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R);
2502
2503 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2504 /// users of \p TE and collects the stores. It returns the map from the store
2505 /// pointers to the collected stores.
2507 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2508
2509 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2510 /// stores in \p StoresVec can form a vector instruction. If so it returns
2511 /// true and populates \p ReorderIndices with the shuffle indices of the
2512 /// stores when compared to the sorted vector.
2513 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2514 OrdersType &ReorderIndices) const;
2515
2516 /// Iterates through the users of \p TE, looking for scalar stores that can be
2517 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2518 /// their order and builds an order index vector for each store bundle. It
2519 /// returns all these order vectors found.
2520 /// We run this after the tree has formed, otherwise we may come across user
2521 /// instructions that are not yet in the tree.
2523 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2524
2525 struct TreeEntry {
2526 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2527 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2528
2529 /// \returns Common mask for reorder indices and reused scalars.
2530 SmallVector<int> getCommonMask() const {
2531 SmallVector<int> Mask;
2532 inversePermutation(ReorderIndices, Mask);
2533 ::addMask(Mask, ReuseShuffleIndices);
2534 return Mask;
2535 }
2536
2537 /// \returns true if the scalars in VL are equal to this entry.
2538 bool isSame(ArrayRef<Value *> VL) const {
2539 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2540 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2541 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2542 return VL.size() == Mask.size() &&
2543 std::equal(VL.begin(), VL.end(), Mask.begin(),
2544 [Scalars](Value *V, int Idx) {
2545 return (isa<UndefValue>(V) &&
2546 Idx == PoisonMaskElem) ||
2547 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2548 });
2549 };
2550 if (!ReorderIndices.empty()) {
2551 // TODO: implement matching if the nodes are just reordered, still can
2552 // treat the vector as the same if the list of scalars matches VL
2553 // directly, without reordering.
2554 SmallVector<int> Mask;
2555 inversePermutation(ReorderIndices, Mask);
2556 if (VL.size() == Scalars.size())
2557 return IsSame(Scalars, Mask);
2558 if (VL.size() == ReuseShuffleIndices.size()) {
2559 ::addMask(Mask, ReuseShuffleIndices);
2560 return IsSame(Scalars, Mask);
2561 }
2562 return false;
2563 }
2564 return IsSame(Scalars, ReuseShuffleIndices);
2565 }
2566
2567 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2568 return State == TreeEntry::NeedToGather &&
2569 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2570 UserTreeIndices.front().UserTE == UserEI.UserTE;
2571 }
2572
2573 /// \returns true if current entry has same operands as \p TE.
2574 bool hasEqualOperands(const TreeEntry &TE) const {
2575 if (TE.getNumOperands() != getNumOperands())
2576 return false;
2577 SmallBitVector Used(getNumOperands());
2578 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2579 unsigned PrevCount = Used.count();
2580 for (unsigned K = 0; K < E; ++K) {
2581 if (Used.test(K))
2582 continue;
2583 if (getOperand(K) == TE.getOperand(I)) {
2584 Used.set(K);
2585 break;
2586 }
2587 }
2588 // Check if we actually found the matching operand.
2589 if (PrevCount == Used.count())
2590 return false;
2591 }
2592 return true;
2593 }
2594
2595 /// \return Final vectorization factor for the node. Defined by the total
2596 /// number of vectorized scalars, including those, used several times in the
2597 /// entry and counted in the \a ReuseShuffleIndices, if any.
2598 unsigned getVectorFactor() const {
2599 if (!ReuseShuffleIndices.empty())
2600 return ReuseShuffleIndices.size();
2601 return Scalars.size();
2602 };
2603
2604 /// A vector of scalars.
2605 ValueList Scalars;
2606
2607 /// The Scalars are vectorized into this value. It is initialized to Null.
2608 WeakTrackingVH VectorizedValue = nullptr;
2609
2610 /// New vector phi instructions emitted for the vectorized phi nodes.
2611 PHINode *PHI = nullptr;
2612
2613 /// Do we need to gather this sequence or vectorize it
2614 /// (either with vector instruction or with scatter/gather
2615 /// intrinsics for store/load)?
2616 enum EntryState {
2617 Vectorize,
2618 ScatterVectorize,
2619 StridedVectorize,
2620 NeedToGather
2621 };
2622 EntryState State;
2623
2624 /// Does this sequence require some shuffling?
2625 SmallVector<int, 4> ReuseShuffleIndices;
2626
2627 /// Does this entry require reordering?
2628 SmallVector<unsigned, 4> ReorderIndices;
2629
2630 /// Points back to the VectorizableTree.
2631 ///
2632 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2633 /// to be a pointer and needs to be able to initialize the child iterator.
2634 /// Thus we need a reference back to the container to translate the indices
2635 /// to entries.
2636 VecTreeTy &Container;
2637
2638 /// The TreeEntry index containing the user of this entry. We can actually
2639 /// have multiple users so the data structure is not truly a tree.
2640 SmallVector<EdgeInfo, 1> UserTreeIndices;
2641
2642 /// The index of this treeEntry in VectorizableTree.
2643 int Idx = -1;
2644
2645 private:
2646 /// The operands of each instruction in each lane Operands[op_index][lane].
2647 /// Note: This helps avoid the replication of the code that performs the
2648 /// reordering of operands during buildTree_rec() and vectorizeTree().
2649 SmallVector<ValueList, 2> Operands;
2650
2651 /// The main/alternate instruction.
2652 Instruction *MainOp = nullptr;
2653 Instruction *AltOp = nullptr;
2654
2655 public:
2656 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2657 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2658 if (Operands.size() < OpIdx + 1)
2659 Operands.resize(OpIdx + 1);
2660 assert(Operands[OpIdx].empty() && "Already resized?");
2661 assert(OpVL.size() <= Scalars.size() &&
2662 "Number of operands is greater than the number of scalars.");
2663 Operands[OpIdx].resize(OpVL.size());
2664 copy(OpVL, Operands[OpIdx].begin());
2665 }
2666
2667 /// Set the operands of this bundle in their original order.
2668 void setOperandsInOrder() {
2669 assert(Operands.empty() && "Already initialized?");
2670 auto *I0 = cast<Instruction>(Scalars[0]);
2671 Operands.resize(I0->getNumOperands());
2672 unsigned NumLanes = Scalars.size();
2673 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2674 OpIdx != NumOperands; ++OpIdx) {
2675 Operands[OpIdx].resize(NumLanes);
2676 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2677 auto *I = cast<Instruction>(Scalars[Lane]);
2678 assert(I->getNumOperands() == NumOperands &&
2679 "Expected same number of operands");
2680 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2681 }
2682 }
2683 }
2684
2685 /// Reorders operands of the node to the given mask \p Mask.
2686 void reorderOperands(ArrayRef<int> Mask) {
2687 for (ValueList &Operand : Operands)
2688 reorderScalars(Operand, Mask);
2689 }
2690
2691 /// \returns the \p OpIdx operand of this TreeEntry.
2692 ValueList &getOperand(unsigned OpIdx) {
2693 assert(OpIdx < Operands.size() && "Off bounds");
2694 return Operands[OpIdx];
2695 }
2696
2697 /// \returns the \p OpIdx operand of this TreeEntry.
2698 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2699 assert(OpIdx < Operands.size() && "Off bounds");
2700 return Operands[OpIdx];
2701 }
2702
2703 /// \returns the number of operands.
2704 unsigned getNumOperands() const { return Operands.size(); }
2705
2706 /// \return the single \p OpIdx operand.
2707 Value *getSingleOperand(unsigned OpIdx) const {
2708 assert(OpIdx < Operands.size() && "Off bounds");
2709 assert(!Operands[OpIdx].empty() && "No operand available");
2710 return Operands[OpIdx][0];
2711 }
2712
2713 /// Some of the instructions in the list have alternate opcodes.
2714 bool isAltShuffle() const { return MainOp != AltOp; }
2715
2716 bool isOpcodeOrAlt(Instruction *I) const {
2717 unsigned CheckedOpcode = I->getOpcode();
2718 return (getOpcode() == CheckedOpcode ||
2719 getAltOpcode() == CheckedOpcode);
2720 }
2721
2722 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2723 /// alternate) opcode as this entry's main/alternate opcode, the key is \p Op.
2724 /// Otherwise the key is the main operation.
2725 Value *isOneOf(Value *Op) const {
2726 auto *I = dyn_cast<Instruction>(Op);
2727 if (I && isOpcodeOrAlt(I))
2728 return Op;
2729 return MainOp;
2730 }
2731
2732 void setOperations(const InstructionsState &S) {
2733 MainOp = S.MainOp;
2734 AltOp = S.AltOp;
2735 }
2736
2737 Instruction *getMainOp() const {
2738 return MainOp;
2739 }
2740
2741 Instruction *getAltOp() const {
2742 return AltOp;
2743 }
2744
2745 /// The main/alternate opcodes for the list of instructions.
2746 unsigned getOpcode() const {
2747 return MainOp ? MainOp->getOpcode() : 0;
2748 }
2749
2750 unsigned getAltOpcode() const {
2751 return AltOp ? AltOp->getOpcode() : 0;
2752 }
2753
2754 /// When ReuseShuffleIndices is empty it just returns the position of \p
2755 /// V within the vector of Scalars. Otherwise, it remaps via the reuse index.
2756 int findLaneForValue(Value *V) const {
2757 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2758 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2759 if (!ReorderIndices.empty())
2760 FoundLane = ReorderIndices[FoundLane];
2761 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2762 if (!ReuseShuffleIndices.empty()) {
2763 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2764 find(ReuseShuffleIndices, FoundLane));
2765 }
2766 return FoundLane;
2767 }
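A standalone sketch of the two-step lane remapping above, with made-up indices: the position of the value inside Scalars is first translated through ReorderIndices and then located inside ReuseShuffleIndices.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  unsigned FoundLane = 1; // made-up position of the scalar inside Scalars
  const std::vector<unsigned> ReorderIndices = {2, 3, 0, 1};
  const std::vector<int> ReuseShuffleIndices = {0, 1, 2, 3, 2, 3};
  if (!ReorderIndices.empty())
    FoundLane = ReorderIndices[FoundLane]; // 1 -> 3
  if (!ReuseShuffleIndices.empty())
    FoundLane = (unsigned)std::distance(
        ReuseShuffleIndices.begin(),
        std::find(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end(),
                  (int)FoundLane)); // first occurrence of 3 is at index 3
  std::printf("extract lane = %u\n", FoundLane); // prints 3
  return 0;
}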
2768
2769 /// Build a shuffle mask for graph entry which represents a merge of main
2770 /// and alternate operations.
2771 void
2772 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2773 SmallVectorImpl<int> &Mask,
2774 SmallVectorImpl<Value *> *OpScalars = nullptr,
2775 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2776
2777#ifndef NDEBUG
2778 /// Debug printer.
2779 LLVM_DUMP_METHOD void dump() const {
2780 dbgs() << Idx << ".\n";
2781 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2782 dbgs() << "Operand " << OpI << ":\n";
2783 for (const Value *V : Operands[OpI])
2784 dbgs().indent(2) << *V << "\n";
2785 }
2786 dbgs() << "Scalars: \n";
2787 for (Value *V : Scalars)
2788 dbgs().indent(2) << *V << "\n";
2789 dbgs() << "State: ";
2790 switch (State) {
2791 case Vectorize:
2792 dbgs() << "Vectorize\n";
2793 break;
2794 case ScatterVectorize:
2795 dbgs() << "ScatterVectorize\n";
2796 break;
2797 case StridedVectorize:
2798 dbgs() << "StridedVectorize\n";
2799 break;
2800 case NeedToGather:
2801 dbgs() << "NeedToGather\n";
2802 break;
2803 }
2804 dbgs() << "MainOp: ";
2805 if (MainOp)
2806 dbgs() << *MainOp << "\n";
2807 else
2808 dbgs() << "NULL\n";
2809 dbgs() << "AltOp: ";
2810 if (AltOp)
2811 dbgs() << *AltOp << "\n";
2812 else
2813 dbgs() << "NULL\n";
2814 dbgs() << "VectorizedValue: ";
2815 if (VectorizedValue)
2816 dbgs() << *VectorizedValue << "\n";
2817 else
2818 dbgs() << "NULL\n";
2819 dbgs() << "ReuseShuffleIndices: ";
2820 if (ReuseShuffleIndices.empty())
2821 dbgs() << "Empty";
2822 else
2823 for (int ReuseIdx : ReuseShuffleIndices)
2824 dbgs() << ReuseIdx << ", ";
2825 dbgs() << "\n";
2826 dbgs() << "ReorderIndices: ";
2827 for (unsigned ReorderIdx : ReorderIndices)
2828 dbgs() << ReorderIdx << ", ";
2829 dbgs() << "\n";
2830 dbgs() << "UserTreeIndices: ";
2831 for (const auto &EInfo : UserTreeIndices)
2832 dbgs() << EInfo << ", ";
2833 dbgs() << "\n";
2834 }
2835#endif
2836 };
2837
2838#ifndef NDEBUG
2839 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2840 InstructionCost VecCost, InstructionCost ScalarCost,
2841 StringRef Banner) const {
2842 dbgs() << "SLP: " << Banner << ":\n";
2843 E->dump();
2844 dbgs() << "SLP: Costs:\n";
2845 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2846 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2847 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2848 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2849 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2850 }
2851#endif
2852
2853 /// Create a new VectorizableTree entry.
2854 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2855 std::optional<ScheduleData *> Bundle,
2856 const InstructionsState &S,
2857 const EdgeInfo &UserTreeIdx,
2858 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2859 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2860 TreeEntry::EntryState EntryState =
2861 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2862 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2863 ReuseShuffleIndices, ReorderIndices);
2864 }
2865
2866 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2867 TreeEntry::EntryState EntryState,
2868 std::optional<ScheduleData *> Bundle,
2869 const InstructionsState &S,
2870 const EdgeInfo &UserTreeIdx,
2871 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2872 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2873 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2874 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2875 "Need to vectorize gather entry?");
2876 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2877 TreeEntry *Last = VectorizableTree.back().get();
2878 Last->Idx = VectorizableTree.size() - 1;
2879 Last->State = EntryState;
2880 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2881 ReuseShuffleIndices.end());
2882 if (ReorderIndices.empty()) {
2883 Last->Scalars.assign(VL.begin(), VL.end());
2884 Last->setOperations(S);
2885 } else {
2886 // Reorder scalars and build final mask.
2887 Last->Scalars.assign(VL.size(), nullptr);
2888 transform(ReorderIndices, Last->Scalars.begin(),
2889 [VL](unsigned Idx) -> Value * {
2890 if (Idx >= VL.size())
2891 return UndefValue::get(VL.front()->getType());
2892 return VL[Idx];
2893 });
2894 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
2895 Last->setOperations(S);
2896 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2897 }
2898 if (Last->State != TreeEntry::NeedToGather) {
2899 for (Value *V : VL) {
2900 const TreeEntry *TE = getTreeEntry(V);
2901 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
2902 "Scalar already in tree!");
2903 if (TE) {
2904 if (TE != Last)
2905 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
2906 continue;
2907 }
2908 ScalarToTreeEntry[V] = Last;
2909 }
2910 // Update the scheduler bundle to point to this TreeEntry.
2911 ScheduleData *BundleMember = *Bundle;
2912 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2913 isVectorLikeInstWithConstOps(S.MainOp) ||
2914 doesNotNeedToSchedule(VL)) &&
2915 "Bundle and VL out of sync");
2916 if (BundleMember) {
2917 for (Value *V : VL) {
2918 if (doesNotNeedToBeScheduled(V))
2919 continue;
2920 if (!BundleMember)
2921 continue;
2922 BundleMember->TE = Last;
2923 BundleMember = BundleMember->NextInBundle;
2924 }
2925 }
2926 assert(!BundleMember && "Bundle and VL out of sync");
2927 } else {
2928 MustGather.insert(VL.begin(), VL.end());
2929 // Build a map for gathered scalars to the nodes where they are used.
2930 for (Value *V : VL)
2931 if (!isConstant(V))
2932 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
2933 }
2934
2935 if (UserTreeIdx.UserTE)
2936 Last->UserTreeIndices.push_back(UserTreeIdx);
2937
2938 return Last;
2939 }
2940
2941 /// -- Vectorization State --
2942 /// Holds all of the tree entries.
2943 TreeEntry::VecTreeTy VectorizableTree;
2944
2945#ifndef NDEBUG
2946 /// Debug printer.
2947 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
2948 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
2949 VectorizableTree[Id]->dump();
2950 dbgs() << "\n";
2951 }
2952 }
2953#endif
2954
2955 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
2956
2957 const TreeEntry *getTreeEntry(Value *V) const {
2958 return ScalarToTreeEntry.lookup(V);
2959 }
2960
2961 /// Checks if the specified list of the instructions/values can be vectorized
2962 /// and fills required data before actual scheduling of the instructions.
2963 TreeEntry::EntryState getScalarsVectorizationState(
2964 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
2965 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
2966
2967 /// Maps a specific scalar to its tree entry.
2968 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
2969
2970 /// List of scalars, used in several vectorize nodes, and the list of the
2971 /// nodes.
2972 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
2973
2974 /// Maps a value to the proposed vectorizable size.
2975 SmallDenseMap<Value *, unsigned> InstrElementSize;
2976
2977 /// A list of scalars that we found that we need to keep as scalars.
2978 ValueSet MustGather;
2979
2980 /// A map between the vectorized entries and the last instructions in the
2981 /// bundles. The bundles are built in use order, not in the def order of the
2982 /// instructions. So, we cannot rely on the last instruction in the bundle
2983 /// also being the last instruction in program order during the
2984 /// vectorization process, since the basic blocks are modified; the last
2985 /// instructions need to be pre-gathered beforehand.
2986 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
2987
2988 /// List of gather nodes, depending on other gather/vector nodes, which should
2989 /// be emitted after the vector instruction emission process to correctly
2990 /// handle order of the vector instructions and shuffles.
2991 SetVector<const TreeEntry *> PostponedGathers;
2992
2993 using ValueToGatherNodesMap =
2994 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
2995 ValueToGatherNodesMap ValueToGatherNodes;
2996
2997 /// This POD struct describes one external user in the vectorized tree.
2998 struct ExternalUser {
2999 ExternalUser(Value *S, llvm::User *U, int L)
3000 : Scalar(S), User(U), Lane(L) {}
3001
3002 // Which scalar in our function.
3003 Value *Scalar;
3004
3005 // Which user that uses the scalar.
3006 llvm::User *User;
3007
3008 // Which lane does the scalar belong to.
3009 int Lane;
3010 };
3011 using UserList = SmallVector<ExternalUser, 16>;
3012
3013 /// Checks if two instructions may access the same memory.
3014 ///
3015 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3016 /// is invariant in the calling loop.
3017 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3018 Instruction *Inst2) {
3019 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3020 return true;
3021 // First check if the result is already in the cache.
3022 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3023 auto It = AliasCache.find(Key);
3024 if (It != AliasCache.end())
3025 return It->second;
3026 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3027 // Store the result in the cache.
3028 AliasCache.try_emplace(Key, Aliased);
3029 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3030 return Aliased;
3031 }
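// Note on isAliased(): the result is cached under both (Inst1, Inst2) and
// (Inst2, Inst1), so e.g. after querying a store against a load location the
// reversed query is answered from AliasCache without another BatchAA call.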
3032
3033 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3034
3035 /// Cache for alias results.
3036 /// TODO: consider moving this to the AliasAnalysis itself.
3037 DenseMap<AliasCacheKey, bool> AliasCache;
3038
3039 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3040 // globally through SLP because we don't perform any action which
3041 // invalidates capture results.
3042 BatchAAResults BatchAA;
3043
3044 /// Temporary store for deleted instructions. Instructions will be deleted
3045 /// eventually when the BoUpSLP is destructed. The deferral is required to
3046 /// ensure that there are no incorrect collisions in the AliasCache, which
3047 /// can happen if a new instruction is allocated at the same address as a
3048 /// previously deleted instruction.
3049 DenseSet<Instruction *> DeletedInstructions;
3050
3051 /// Set of the instruction, being analyzed already for reductions.
3052 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3053
3054 /// Set of hashes for the list of reduction values already being analyzed.
3055 DenseSet<size_t> AnalyzedReductionVals;
3056
3057 /// A list of values that need to be extracted out of the tree.
3058 /// This list holds pairs of (Internal Scalar : External User). External User
3059 /// can be nullptr, it means that this Internal Scalar will be used later,
3060 /// after vectorization.
3061 UserList ExternalUses;
3062
3063 /// Values used only by @llvm.assume calls.
3064 SmallPtrSet<const Value *, 32> EphValues;
3065
3066 /// Holds all of the instructions that we gathered, shuffle instructions and
3067 /// extractelements.
3068 SetVector<Instruction *> GatherShuffleExtractSeq;
3069
3070 /// A list of blocks that we are going to CSE.
3071 DenseSet<BasicBlock *> CSEBlocks;
3072
3073 /// Contains all scheduling relevant data for an instruction.
3074 /// A ScheduleData either represents a single instruction or a member of an
3075 /// instruction bundle (= a group of instructions which is combined into a
3076 /// vector instruction).
3077 struct ScheduleData {
3078 // The initial value for the dependency counters. It means that the
3079 // dependencies are not calculated yet.
3080 enum { InvalidDeps = -1 };
3081
3082 ScheduleData() = default;
3083
3084 void init(int BlockSchedulingRegionID, Value *OpVal) {
3085 FirstInBundle = this;
3086 NextInBundle = nullptr;
3087 NextLoadStore = nullptr;
3088 IsScheduled = false;
3089 SchedulingRegionID = BlockSchedulingRegionID;
3090 clearDependencies();
3091 OpValue = OpVal;
3092 TE = nullptr;
3093 }
3094
3095 /// Verify basic self consistency properties
3096 void verify() {
3097 if (hasValidDependencies()) {
3098 assert(UnscheduledDeps <= Dependencies && "invariant");
3099 } else {
3100 assert(UnscheduledDeps == Dependencies && "invariant");
3101 }
3102
3103 if (IsScheduled) {
3104 assert(isSchedulingEntity() &&
3105 "unexpected scheduled state");
3106 for (const ScheduleData *BundleMember = this; BundleMember;
3107 BundleMember = BundleMember->NextInBundle) {
3108 assert(BundleMember->hasValidDependencies() &&
3109 BundleMember->UnscheduledDeps == 0 &&
3110 "unexpected scheduled state");
3111 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3112 "only bundle is marked scheduled");
3113 }
3114 }
3115
3116 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3117 "all bundle members must be in same basic block");
3118 }
3119
3120 /// Returns true if the dependency information has been calculated.
3121 /// Note that dependency validity can vary between instructions within
3122 /// a single bundle.
3123 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3124
3125 /// Returns true for single instructions and for bundle representatives
3126 /// (= the head of a bundle).
3127 bool isSchedulingEntity() const { return FirstInBundle == this; }
3128
3129 /// Returns true if it represents an instruction bundle and not only a
3130 /// single instruction.
3131 bool isPartOfBundle() const {
3132 return NextInBundle != nullptr || FirstInBundle != this || TE;
3133 }
3134
3135 /// Returns true if it is ready for scheduling, i.e. it has no more
3136 /// unscheduled depending instructions/bundles.
3137 bool isReady() const {
3138 assert(isSchedulingEntity() &&
3139 "can't consider non-scheduling entity for ready list");
3140 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3141 }
3142
3143 /// Modifies the number of unscheduled dependencies for this instruction,
3144 /// and returns the number of remaining dependencies for the containing
3145 /// bundle.
3146 int incrementUnscheduledDeps(int Incr) {
3147 assert(hasValidDependencies() &&
3148 "increment of unscheduled deps would be meaningless");
3149 UnscheduledDeps += Incr;
3150 return FirstInBundle->unscheduledDepsInBundle();
3151 }
3152
3153 /// Sets the number of unscheduled dependencies to the number of
3154 /// dependencies.
3155 void resetUnscheduledDeps() {
3156 UnscheduledDeps = Dependencies;
3157 }
3158
3159 /// Clears all dependency information.
3160 void clearDependencies() {
3161 Dependencies = InvalidDeps;
3162 resetUnscheduledDeps();
3163 MemoryDependencies.clear();
3164 ControlDependencies.clear();
3165 }
3166
3167 int unscheduledDepsInBundle() const {
3168 assert(isSchedulingEntity() && "only meaningful on the bundle");
3169 int Sum = 0;
3170 for (const ScheduleData *BundleMember = this; BundleMember;
3171 BundleMember = BundleMember->NextInBundle) {
3172 if (BundleMember->UnscheduledDeps == InvalidDeps)
3173 return InvalidDeps;
3174 Sum += BundleMember->UnscheduledDeps;
3175 }
3176 return Sum;
3177 }
3178
3179 void dump(raw_ostream &os) const {
3180 if (!isSchedulingEntity()) {
3181 os << "/ " << *Inst;
3182 } else if (NextInBundle) {
3183 os << '[' << *Inst;
3184 ScheduleData *SD = NextInBundle;
3185 while (SD) {
3186 os << ';' << *SD->Inst;
3187 SD = SD->NextInBundle;
3188 }
3189 os << ']';
3190 } else {
3191 os << *Inst;
3192 }
3193 }
3194
3195 Instruction *Inst = nullptr;
3196
3197 /// Opcode of the current instruction in the schedule data.
3198 Value *OpValue = nullptr;
3199
3200 /// The TreeEntry that this instruction corresponds to.
3201 TreeEntry *TE = nullptr;
3202
3203 /// Points to the head in an instruction bundle (and always to this for
3204 /// single instructions).
3205 ScheduleData *FirstInBundle = nullptr;
3206
3207 /// Single linked list of all instructions in a bundle. Null if it is a
3208 /// single instruction.
3209 ScheduleData *NextInBundle = nullptr;
3210
3211 /// Single linked list of all memory instructions (e.g. load, store, call)
3212 /// in the block - until the end of the scheduling region.
3213 ScheduleData *NextLoadStore = nullptr;
3214
3215 /// The dependent memory instructions.
3216 /// This list is derived on demand in calculateDependencies().
3217 SmallVector<ScheduleData *, 4> MemoryDependencies;
3218
3219 /// List of instructions which this instruction could be control dependent
3220 /// on. Allowing such nodes to be scheduled below this one could introduce
3221 /// a runtime fault which didn't exist in the original program.
3222 /// e.g. this is a load or udiv following a readonly call which loops forever
3223 SmallVector<ScheduleData *, 4> ControlDependencies;
3224
3225 /// This ScheduleData is in the current scheduling region if this matches
3226 /// the current SchedulingRegionID of BlockScheduling.
3227 int SchedulingRegionID = 0;
3228
3229 /// Used for getting a "good" final ordering of instructions.
3230 int SchedulingPriority = 0;
3231
3232 /// The number of dependencies. This is the number of users of the
3233 /// instruction plus the number of dependent memory instructions (if any).
3234 /// This value is calculated on demand.
3235 /// If InvalidDeps, the number of dependencies is not calculated yet.
3236 int Dependencies = InvalidDeps;
3237
3238 /// The number of dependencies minus the number of dependencies of scheduled
3239 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3240 /// for scheduling.
3241 /// Note that this is negative as long as Dependencies is not calculated.
3242 int UnscheduledDeps = InvalidDeps;
3243
3244 /// True if this instruction is scheduled (or considered as scheduled in the
3245 /// dry-run).
3246 bool IsScheduled = false;
3247 };
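// Note on ScheduleData: a bundle of N lanes is a singly linked chain in which
// the head's FirstInBundle points to itself and NextInBundle links the
// remaining lanes, so e.g. a three-lane bundle is printed by dump() as
// "[I0;I1;I2]" while a stand-alone instruction is printed without brackets.
// The bundle becomes ready once the sum of UnscheduledDeps over all lanes
// reaches zero and it has not been scheduled yet.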
3248
3249#ifndef NDEBUG
3250 friend inline raw_ostream &operator<<(raw_ostream &os,
3251 const BoUpSLP::ScheduleData &SD) {
3252 SD.dump(os);
3253 return os;
3254 }
3255#endif
3256
3257 friend struct GraphTraits<BoUpSLP *>;
3258 friend struct DOTGraphTraits<BoUpSLP *>;
3259
3260 /// Contains all scheduling data for a basic block.
3261 /// It does not schedule instructions that are not memory read/write
3262 /// instructions and whose operands are either constants, or arguments, or
3263 /// phis, or instructions from other blocks, or whose users are phis or from
3264 /// other blocks. The resulting vector instructions can be placed at the
3265 /// beginning of the basic block without scheduling (if the operands do not
3266 /// need to be scheduled) or at the end of the block (if the users are
3267 /// outside of the block). This saves some compile time and memory used by
3268 /// the compiler.
3269 /// ScheduleData is assigned to each instruction between the boundaries of
3270 /// the tree entry, even to those that are not part of the graph. It is
3271 /// required to correctly follow the dependencies between the instructions
3272 /// and to schedule them correctly. ScheduleData is not allocated for
3273 /// instructions that do not require scheduling, such as phis, nodes with
3274 /// only extractelements/insertelements, or nodes whose instructions have
3275 /// uses/operands outside of the block.
3276 struct BlockScheduling {
3277 BlockScheduling(BasicBlock *BB)
3278 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3279
3280 void clear() {
3281 ReadyInsts.clear();
3282 ScheduleStart = nullptr;
3283 ScheduleEnd = nullptr;
3284 FirstLoadStoreInRegion = nullptr;
3285 LastLoadStoreInRegion = nullptr;
3286 RegionHasStackSave = false;
3287
3288 // Reduce the maximum schedule region size by the size of the
3289 // previous scheduling run.
3290 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3291 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3292 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3293 ScheduleRegionSize = 0;
3294
3295 // Make a new scheduling region, i.e. all existing ScheduleData is not
3296 // in the new region yet.
3297 ++SchedulingRegionID;
3298 }
3299
3300 ScheduleData *getScheduleData(Instruction *I) {
3301 if (BB != I->getParent())
3302 // Avoid lookup if can't possibly be in map.
3303 return nullptr;
3304 ScheduleData *SD = ScheduleDataMap.lookup(I);
3305 if (SD && isInSchedulingRegion(SD))
3306 return SD;
3307 return nullptr;
3308 }
3309
3310 ScheduleData *getScheduleData(Value *V) {
3311 if (auto *I = dyn_cast<Instruction>(V))
3312 return getScheduleData(I);
3313 return nullptr;
3314 }
3315
3316 ScheduleData *getScheduleData(Value *V, Value *Key) {
3317 if (V == Key)
3318 return getScheduleData(V);
3319 auto I = ExtraScheduleDataMap.find(V);
3320 if (I != ExtraScheduleDataMap.end()) {
3321 ScheduleData *SD = I->second.lookup(Key);
3322 if (SD && isInSchedulingRegion(SD))
3323 return SD;
3324 }
3325 return nullptr;
3326 }
3327
3328 bool isInSchedulingRegion(ScheduleData *SD) const {
3329 return SD->SchedulingRegionID == SchedulingRegionID;
3330 }
3331
3332 /// Marks an instruction as scheduled and puts all dependent ready
3333 /// instructions into the ready-list.
3334 template <typename ReadyListType>
3335 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3336 SD->IsScheduled = true;
3337 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3338
3339 for (ScheduleData *BundleMember = SD; BundleMember;
3340 BundleMember = BundleMember->NextInBundle) {
3341 if (BundleMember->Inst != BundleMember->OpValue)
3342 continue;
3343
3344 // Handle the def-use chain dependencies.
3345
3346 // Decrement the unscheduled counter and insert to ready list if ready.
3347 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3348 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3349 if (OpDef && OpDef->hasValidDependencies() &&
3350 OpDef->incrementUnscheduledDeps(-1) == 0) {
3351 // There are no more unscheduled dependencies after
3352 // decrementing, so we can put the dependent instruction
3353 // into the ready list.
3354 ScheduleData *DepBundle = OpDef->FirstInBundle;
3355 assert(!DepBundle->IsScheduled &&
3356 "already scheduled bundle gets ready");
3357 ReadyList.insert(DepBundle);
3358 LLVM_DEBUG(dbgs()
3359 << "SLP: gets ready (def): " << *DepBundle << "\n");
3360 }
3361 });
3362 };
3363
3364 // If BundleMember is a vector bundle, its operands may have been
3365 // reordered during buildTree(). We therefore need to get its operands
3366 // through the TreeEntry.
3367 if (TreeEntry *TE = BundleMember->TE) {
3368 // Need to search for the lane since the tree entry can be reordered.
3369 int Lane = std::distance(TE->Scalars.begin(),
3370 find(TE->Scalars, BundleMember->Inst));
3371 assert(Lane >= 0 && "Lane not set");
3372
3373 // Since vectorization tree is being built recursively this assertion
3374 // ensures that the tree entry has all operands set before reaching
3375 // this code. Couple of exceptions known at the moment are extracts
3376 // where their second (immediate) operand is not added. Since
3377 // immediates do not affect scheduler behavior this is considered
3378 // okay.
3379 auto *In = BundleMember->Inst;
3380 assert(In &&
3381 (isa<ExtractValueInst, ExtractElementInst>(In) ||
3382 In->getNumOperands() == TE->getNumOperands()) &&
3383 "Missed TreeEntry operands?");
3384 (void)In; // fake use to avoid build failure when assertions disabled
3385
3386 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3387 OpIdx != NumOperands; ++OpIdx)
3388 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3389 DecrUnsched(I);
3390 } else {
3391 // If BundleMember is a stand-alone instruction, no operand reordering
3392 // has taken place, so we directly access its operands.
3393 for (Use &U : BundleMember->Inst->operands())
3394 if (auto *I = dyn_cast<Instruction>(U.get()))
3395 DecrUnsched(I);
3396 }
3397 // Handle the memory dependencies.
3398 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3399 if (MemoryDepSD->hasValidDependencies() &&
3400 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3401 // There are no more unscheduled dependencies after decrementing,
3402 // so we can put the dependent instruction into the ready list.
3403 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3404 assert(!DepBundle->IsScheduled &&
3405 "already scheduled bundle gets ready");
3406 ReadyList.insert(DepBundle);
3408 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3409 }
3410 }
3411 // Handle the control dependencies.
3412 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3413 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3414 // There are no more unscheduled dependencies after decrementing,
3415 // so we can put the dependent instruction into the ready list.
3416 ScheduleData *DepBundle = DepSD->FirstInBundle;
3417 assert(!DepBundle->IsScheduled &&
3418 "already scheduled bundle gets ready");
3419 ReadyList.insert(DepBundle);
3421 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3422 }
3423 }
3424 }
3425 }
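// Note on schedule(): scheduling a bundle decrements the UnscheduledDeps
// counters of the bundles defining its operands and of every bundle recorded
// in its MemoryDependencies/ControlDependencies lists; any of those bundles
// whose total drops to zero is pushed onto the ready list.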
3426
3427 /// Verify basic self consistency properties of the data structure.
3428 void verify() {
3429 if (!ScheduleStart)
3430 return;
3431
3432 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3433 ScheduleStart->comesBefore(ScheduleEnd) &&
3434 "Not a valid scheduling region?");
3435
3436 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3437 auto *SD = getScheduleData(I);
3438 if (!SD)
3439 continue;
3440 assert(isInSchedulingRegion(SD) &&
3441 "primary schedule data not in window?");
3442 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3443 "entire bundle in window!");
3444 (void)SD;
3445 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3446 }
3447
3448 for (auto *SD : ReadyInsts) {
3449 assert(SD->isSchedulingEntity() && SD->isReady() &&
3450 "item in ready list not ready?");
3451 (void)SD;
3452 }
3453 }
3454
3455 void doForAllOpcodes(Value *V,
3456 function_ref<void(ScheduleData *SD)> Action) {
3457 if (ScheduleData *SD = getScheduleData(V))
3458 Action(SD);
3459 auto I = ExtraScheduleDataMap.find(V);
3460 if (I != ExtraScheduleDataMap.end())
3461 for (auto &P : I->second)
3462 if (isInSchedulingRegion(P.second))
3463 Action(P.second);
3464 }
3465
3466 /// Put all instructions into the ReadyList which are ready for scheduling.
3467 template <typename ReadyListType>
3468 void initialFillReadyList(ReadyListType &ReadyList) {
3469 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3470 doForAllOpcodes(I, [&](ScheduleData *SD) {
3471 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3472 SD->isReady()) {
3473 ReadyList.insert(SD);
3474 LLVM_DEBUG(dbgs()
3475 << "SLP: initially in ready list: " << *SD << "\n");
3476 }
3477 });
3478 }
3479 }
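// Note on initialFillReadyList(): only scheduling entities whose dependencies
// have already been calculated (hasValidDependencies()) can be seeded here;
// bundles still at InvalidDeps are inserted later, once
// calculateDependencies() runs with InsertInReadyList set.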
3480
3481 /// Build a bundle from the ScheduleData nodes corresponding to the
3482 /// scalar instruction for each lane.
3483 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3484
3485 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3486 /// cyclic dependencies. This is only a dry-run, no instructions are
3487 /// actually moved at this stage.
3488 /// \returns the scheduling bundle. The returned Optional value is not
3489 /// std::nullopt if \p VL is allowed to be scheduled.
3490 std::optional<ScheduleData *>
3491 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3492 const InstructionsState &S);
3493
3494 /// Un-bundles a group of instructions.
3495 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3496
3497 /// Allocates schedule data chunk.
3498 ScheduleData *allocateScheduleDataChunks();
3499
3500 /// Extends the scheduling region so that V is inside the region.
3501 /// \returns true if the region size is within the limit.
3502 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3503
3504 /// Initialize the ScheduleData structures for new instructions in the
3505 /// scheduling region.
3506 void initScheduleData(Instruction *FromI, Instruction *ToI,
3507 ScheduleData *PrevLoadStore,
3508 ScheduleData *NextLoadStore);
3509
3510 /// Updates the dependency information of a bundle and of all instructions/
3511 /// bundles which depend on the original bundle.
3512 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3513 BoUpSLP *SLP);
3514
3515 /// Sets all instructions in the scheduling region to un-scheduled.
3516 void resetSchedule();
3517
3518 BasicBlock *BB;
3519
3520 /// Simple memory allocation for ScheduleData.
3521 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3522
3523 /// The size of a ScheduleData array in ScheduleDataChunks.
3524 int ChunkSize;
3525
3526 /// The allocator position in the current chunk, which is the last entry
3527 /// of ScheduleDataChunks.
3528 int ChunkPos;
3529
3530 /// Attaches ScheduleData to Instruction.
3531 /// Note that the mapping survives during all vectorization iterations, i.e.
3532 /// ScheduleData structures are recycled.
3533 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3534
3535 /// Attaches ScheduleData to Instruction with the leading key.
3536 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3537 ExtraScheduleDataMap;
3538
3539 /// The ready-list for scheduling (only used for the dry-run).
3540 SetVector<ScheduleData *> ReadyInsts;
3541
3542 /// The first instruction of the scheduling region.
3543 Instruction *ScheduleStart = nullptr;
3544
3545 /// The first instruction _after_ the scheduling region.
3546 Instruction *ScheduleEnd = nullptr;
3547
3548 /// The first memory accessing instruction in the scheduling region
3549 /// (can be null).
3550 ScheduleData *FirstLoadStoreInRegion = nullptr;
3551
3552 /// The last memory accessing instruction in the scheduling region
3553 /// (can be null).
3554 ScheduleData *LastLoadStoreInRegion = nullptr;
3555
3556 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3557 /// region? Used to optimize the dependence calculation for the
3558 /// common case where there isn't.
3559 bool RegionHasStackSave = false;
3560
3561 /// The current size of the scheduling region.
3562 int ScheduleRegionSize = 0;
3563
3564 /// The maximum size allowed for the scheduling region.
3565 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3566
3567 /// The ID of the scheduling region. For a new vectorization iteration this
3568 /// is incremented which "removes" all ScheduleData from the region.
3569 /// Make sure that the initial SchedulingRegionID is greater than the
3570 /// initial SchedulingRegionID in ScheduleData (which is 0).
3571 int SchedulingRegionID = 1;
3572 };
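// Note on BlockScheduling: the scheduling region grows lazily via
// extendSchedulingRegion() and is bounded by ScheduleRegionSizeLimit; clear()
// shrinks the remaining budget by the size of the previous region so that
// repeated vectorization attempts within one very large block stay within the
// compile-time limit.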
3573
3574 /// Attaches the BlockScheduling structures to basic blocks.
3575 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3576
3577 /// Performs the "real" scheduling. Done before vectorization is actually
3578 /// performed in a basic block.
3579 void scheduleBlock(BlockScheduling *BS);
3580
3581 /// List of users to ignore during scheduling and that don't need extracting.
3582 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3583
3584 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3585 /// sorted SmallVectors of unsigned.
3586 struct OrdersTypeDenseMapInfo {
3587 static OrdersType getEmptyKey() {
3588 OrdersType V;
3589 V.push_back(~1U);
3590 return V;
3591 }
3592
3593 static OrdersType getTombstoneKey() {
3594 OrdersType V;
3595 V.push_back(~2U);
3596 return V;
3597 }
3598
3599 static unsigned getHashValue(const OrdersType &V) {
3600 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3601 }
3602
3603 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3604 return LHS == RHS;
3605 }
3606 };
3607
3608 // Analysis and block reference.
3609 Function *F;
3610 ScalarEvolution *SE;
3611 TargetTransformInfo *TTI;
3612 TargetLibraryInfo *TLI;
3613 LoopInfo *LI;
3614 DominatorTree *DT;
3615 AssumptionCache *AC;
3616 DemandedBits *DB;
3617 const DataLayout *DL;
3618 OptimizationRemarkEmitter *ORE;
3619
3620 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3621 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3622
3623 /// Instruction builder to construct the vectorized tree.
3624 IRBuilder<> Builder;
3625
3626 /// A map of scalar integer values to the smallest bit width with which they
3627 /// can legally be represented. The values map to (width, signed) pairs,
3628 /// where "width" indicates the minimum bit width and "signed" is True if the
3629 /// value must be signed-extended, rather than zero-extended, back to its
3630 /// original width.
3631 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
3632};
3633
3634} // end namespace slpvectorizer
3635
3636template <> struct GraphTraits<BoUpSLP *> {
3637 using TreeEntry = BoUpSLP::TreeEntry;
3638
3639 /// NodeRef has to be a pointer per the GraphWriter.
3640 using NodeRef = TreeEntry *;
3641
3642 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3643
3644 /// Add the VectorizableTree to the index iterator to be able to return
3645 /// TreeEntry pointers.
3646 struct ChildIteratorType
3647 : public iterator_adaptor_base<
3648 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3649 ContainerTy &VectorizableTree;
3650
3651 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3652 ContainerTy &VT)
3653 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3654
3655 NodeRef operator*() { return I->UserTE; }
3656 };
3657
3658 static NodeRef getEntryNode(BoUpSLP &R) {
3659 return R.VectorizableTree[0].get();
3660 }
3661
3662 static ChildIteratorType child_begin(NodeRef N) {
3663 return {N->UserTreeIndices.begin(), N->Container};
3664 }
3665
3666 static ChildIteratorType child_end(NodeRef N) {
3667 return {N->UserTreeIndices.end(), N->Container};
3668 }
3669
3670 /// For the node iterator we just need to turn the TreeEntry iterator into a
3671 /// TreeEntry* iterator so that it dereferences to NodeRef.
3672 class nodes_iterator {
3673 using ItTy = ContainerTy::iterator;
3674 ItTy It;
3675
3676 public:
3677 nodes_iterator(const ItTy &It2) : It(It2) {}
3678 NodeRef operator*() { return It->get(); }
3679 nodes_iterator operator++() {
3680 ++It;
3681 return *this;
3682 }
3683 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3684 };
3685
3686 static nodes_iterator nodes_begin(BoUpSLP *R) {
3687 return nodes_iterator(R->VectorizableTree.begin());
3688 }
3689
3690 static nodes_iterator nodes_end(BoUpSLP *R) {
3691 return nodes_iterator(R->VectorizableTree.end());
3692 }
3693
3694 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3695};
3696
3697template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3698 using TreeEntry = BoUpSLP::TreeEntry;
3699
3700 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3701
3702 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3703 std::string Str;
3704 raw_string_ostream OS(Str);
3705 OS << Entry->Idx << ".\n";
3706 if (isSplat(Entry->Scalars))
3707 OS << "<splat> ";
3708 for (auto *V : Entry->Scalars) {
3709 OS << *V;
3710 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3711 return EU.Scalar == V;
3712 }))
3713 OS << " <extract>";
3714 OS << "\n";
3715 }
3716 return Str;
3717 }
3718
3719 static std::string getNodeAttributes(const TreeEntry *Entry,
3720 const BoUpSLP *) {
3721 if (Entry->State == TreeEntry::NeedToGather)
3722 return "color=red";
3723 if (Entry->State == TreeEntry::ScatterVectorize ||
3724 Entry->State == TreeEntry::StridedVectorize)
3725 return "color=blue";
3726 return "";
3727 }
3728};
3729
3730} // end namespace llvm
3731
3732BoUpSLP::~BoUpSLP() {
3733 SmallVector<WeakTrackingVH> DeadInsts;
3734 for (auto *I : DeletedInstructions) {
3735 for (Use &U : I->operands()) {
3736 auto *Op = dyn_cast<Instruction>(U.get());
3737 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3738 wouldInstructionBeTriviallyDead(Op, TLI))
3739 DeadInsts.emplace_back(Op);
3740 }
3741 I->dropAllReferences();
3742 }
3743 for (auto *I : DeletedInstructions) {
3744 assert(I->use_empty() &&
3745 "trying to erase instruction with users.");
3746 I->eraseFromParent();
3747 }
3748
3749 // Cleanup any dead scalar code feeding the vectorized instructions
3750 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3751
3752#ifdef EXPENSIVE_CHECKS
3753 // If we could guarantee that this call is not extremely slow, we could
3754 // remove the ifdef limitation (see PR47712).
3755 assert(!verifyFunction(*F, &dbgs()));
3756#endif
3757}
3758
3759/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3760/// contains the original mask for the scalars reused in the node. The
3761/// procedure transforms this mask in accordance with the given \p Mask.
3762static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3763 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3764 "Expected non-empty mask.");
3765 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3766 Prev.swap(Reuses);
3767 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3768 if (Mask[I] != PoisonMaskElem)
3769 Reuses[Mask[I]] = Prev[I];
3770}
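// Worked example for reorderReuses() (illustrative values): with
// Reuses = {3, 2, 1, 0} and Mask = {1, 0, 3, 2}, Prev = {3, 2, 1, 0} and the
// loop performs Reuses[Mask[I]] = Prev[I], i.e. Reuses[1] = 3, Reuses[0] = 2,
// Reuses[3] = 1, Reuses[2] = 0, giving Reuses = {2, 3, 0, 1}.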
3771
3772/// Reorders the given \p Order according to the given \p Mask. \p Order - is
3773/// the original order of the scalars. Procedure transforms the provided order
3774/// in accordance with the given \p Mask. If the resulting \p Order is just an
3775/// identity order, \p Order is cleared.
3776static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
3777 bool BottomOrder = false) {
3778 assert(!Mask.empty() && "Expected non-empty mask.");
3779 unsigned Sz = Mask.size();
3780 if (BottomOrder) {
3781 SmallVector<unsigned> PrevOrder;
3782 if (Order.empty()) {
3783 PrevOrder.resize(Sz);
3784 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
3785 } else {
3786 PrevOrder.swap(Order);
3787 }
3788 Order.assign(Sz, Sz);
3789 for (unsigned I = 0; I < Sz; ++I)
3790 if (Mask[I] != PoisonMaskElem)
3791 Order[I] = PrevOrder[Mask[I]];
3792 if (all_of(enumerate(Order), [&](const auto &Data) {
3793 return Data.value() == Sz || Data.index() == Data.value();
3794 })) {
3795 Order.clear();
3796 return;
3797 }
3798 fixupOrderingIndices(Order);
3799 return;
3800 }
3801 SmallVector<int> MaskOrder;
3802 if (Order.empty()) {
3803 MaskOrder.resize(Sz);
3804 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3805 } else {
3806 inversePermutation(Order, MaskOrder);
3807 }
3808 reorderReuses(MaskOrder, Mask);
3809 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
3810 Order.clear();
3811 return;
3812 }
3813 Order.assign(Sz, Sz);
3814 for (unsigned I = 0; I < Sz; ++I)
3815 if (MaskOrder[I] != PoisonMaskElem)
3816 Order[MaskOrder[I]] = I;
3817 fixupOrderingIndices(Order);
3818}
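// Worked example for reorderOrder() (illustrative values): with an empty
// (identity) Order, Mask = {2, 3, 0, 1} and BottomOrder == false, MaskOrder
// starts as {0, 1, 2, 3}, becomes {2, 3, 0, 1} after reorderReuses(), and
// since that is not an identity mask the final Order is {2, 3, 0, 1}.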
3819
3820std::optional<BoUpSLP::OrdersType>
3821BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3822 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3823 // Try to find subvector extract/insert patterns and reorder only such
3824 // patterns.
3825 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
3826 Type *ScalarTy = GatheredScalars.front()->getType();
3827 int NumScalars = GatheredScalars.size();
3828 if (!isValidElementType(ScalarTy))
3829 return std::nullopt;
3830 auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
3831 int NumParts = TTI->getNumberOfParts(VecTy);
3832 if (NumParts == 0 || NumParts >= NumScalars)
3833 NumParts = 1;
3834 SmallVector<int> ExtractMask;
3835 SmallVector<int> Mask;
3836 SmallVector<SmallVector<const TreeEntry *>> Entries;
3837 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
3838 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
3839 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
3840 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
3841 /*ForOrder=*/true);
3842 // No shuffled operands - ignore.
3843 if (GatherShuffles.empty() && ExtractShuffles.empty())
3844 return std::nullopt;
3845 OrdersType CurrentOrder(NumScalars, NumScalars);
3846 if (GatherShuffles.size() == 1 &&
3847 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
3848 Entries.front().front()->isSame(TE.Scalars)) {
3849 // Perfect match in the graph, will reuse the previously vectorized
3850 // node. Cost is 0.
3851 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
3852 return CurrentOrder;
3853 }
3854 auto IsSplatMask = [](ArrayRef<int> Mask) {
3855 int SingleElt = PoisonMaskElem;
3856 return all_of(Mask, [&](int I) {
3857 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
3858 SingleElt = I;
3859 return I == PoisonMaskElem || I == SingleElt;
3860 });
3861 };
3862 // Exclusive broadcast mask - ignore.
3863 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
3864 (Entries.size() != 1 ||
3865 Entries.front().front()->ReorderIndices.empty())) ||
3866 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
3867 return std::nullopt;
3868 SmallBitVector ShuffledSubMasks(NumParts);
3869 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
3870 ArrayRef<int> Mask, int PartSz, int NumParts,
3871 function_ref<unsigned(unsigned)> GetVF) {
3872 for (int I : seq<int>(0, NumParts)) {
3873 if (ShuffledSubMasks.test(I))
3874 continue;
3875 const int VF = GetVF(I);
3876 if (VF == 0)
3877 continue;
3878 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
3879 // Shuffle of at least 2 vectors - ignore.
3880 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
3881 std::fill(Slice.begin(), Slice.end(), NumScalars);
3882 ShuffledSubMasks.set(I);
3883 continue;
3884 }
3885 // Try to include as many elements from the mask as possible.
3886 int FirstMin = INT_MAX;
3887 int SecondVecFound = false;
3888 for (int K : seq<int>(0, PartSz)) {
3889 int Idx = Mask[I * PartSz + K];
3890 if (Idx == PoisonMaskElem) {
3891 Value *V = GatheredScalars[I * PartSz + K];
3892 if (isConstant(V) && !isa<PoisonValue>(V)) {
3893 SecondVecFound = true;
3894 break;
3895 }
3896 continue;
3897 }
3898 if (Idx < VF) {
3899 if (FirstMin > Idx)
3900 FirstMin = Idx;
3901 } else {
3902 SecondVecFound = true;
3903 break;
3904 }
3905 }
3906 FirstMin = (FirstMin / PartSz) * PartSz;
3907 // Shuffle of at least 2 vectors - ignore.
3908 if (SecondVecFound) {
3909 std::fill(Slice.begin(), Slice.end(), NumScalars);
3910 ShuffledSubMasks.set(I);
3911 continue;
3912 }
3913 for (int K : seq<int>(0, PartSz)) {
3914 int Idx = Mask[I * PartSz + K];
3915 if (Idx == PoisonMaskElem)
3916 continue;
3917 Idx -= FirstMin;
3918 if (Idx >= PartSz) {
3919 SecondVecFound = true;
3920 break;
3921 }
3922 if (CurrentOrder[I * PartSz + Idx] >
3923 static_cast<unsigned>(I * PartSz + K) &&
3924 CurrentOrder[I * PartSz + Idx] !=
3925 static_cast<unsigned>(I * PartSz + Idx))
3926 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
3927 }
3928 // Shuffle of at least 2 vectors - ignore.
3929 if (SecondVecFound) {
3930 std::fill(Slice.begin(), Slice.end(), NumScalars);
3931 ShuffledSubMasks.set(I);
3932 continue;
3933 }
3934 }
3935 };
3936 int PartSz = NumScalars / NumParts;
3937 if (!ExtractShuffles.empty())
3938 TransformMaskToOrder(
3939 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
3940 if (!ExtractShuffles[I])
3941 return 0U;
3942 unsigned VF = 0;
3943 for (unsigned Idx : seq<unsigned>(0, PartSz)) {
3944 int K = I * PartSz + Idx;
3945 if (ExtractMask[K] == PoisonMaskElem)
3946 continue;
3947 if (!TE.ReuseShuffleIndices.empty())
3948 K = TE.ReuseShuffleIndices[K];
3949 if (!TE.ReorderIndices.empty())
3950 K = std::distance(TE.ReorderIndices.begin(),
3951 find(TE.ReorderIndices, K));
3952 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
3953 if (!EI)
3954 continue;
3955 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
3956 ->getElementCount()
3957 .getKnownMinValue());
3958 }
3959 return VF;
3960 });
3961 // Check special corner case - single shuffle of the same entry.
3962 if (GatherShuffles.size() == 1 && NumParts != 1) {
3963 if (ShuffledSubMasks.any())
3964 return std::nullopt;
3965 PartSz = NumScalars;
3966 NumParts = 1;
3967 }
3968 if (!Entries.empty())
3969 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
3970 if (!GatherShuffles[I])
3971 return 0U;
3972 return std::max(Entries[I].front()->getVectorFactor(),
3973 Entries[I].back()->getVectorFactor());
3974 });
3975 int NumUndefs =
3976 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
3977 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
3978 return std::nullopt;
3979 return std::move(CurrentOrder);
3980}
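// Note on findReusedOrderedScalars(): the order is reconstructed per-part; a
// sub-mask that would mix two source vectors (or needs non-poison constants)
// is marked as shuffled and skipped, and the whole attempt is abandoned when
// every part is shuffled or, for more than two scalars, at least half of the
// resulting order remains undefined.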
3981
3982static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
3983 const TargetLibraryInfo &TLI,
3984 bool CompareOpcodes = true) {
3985 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
3986 return false;
3987 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
3988 if (!GEP1)
3989 return false;
3990 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
3991 if (!GEP2)
3992 return false;
3993 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
3994 ((isConstant(GEP1->getOperand(1)) &&
3995 isConstant(GEP2->getOperand(1))) ||
3996 !CompareOpcodes ||
3997 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
3998 .getOpcode());
3999}
4000
4001/// Calculates minimal alignment as a common alignment.
4002template <typename T>
4003static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4004 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4005 for (Value *V : VL.drop_front())
4006 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4007 return CommonAlignment;
4008}
4009
4010/// Check if \p Order represents reverse order.
4011static bool isReverseOrder(ArrayRef<unsigned> Order) {
4012 unsigned Sz = Order.size();
4013 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4014 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4015 });
4016}
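// E.g. for isReverseOrder(), Order = {3, 2, 1, 0} is a reverse order of size
// 4; unset entries (equal to the size) act as wildcards and do not break the
// property.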
4017
4018/// Checks if the provided list of pointers \p Pointers represents the strided
4019/// pointers for type ElemTy. If they are not, std::nullopt is returned.
4020/// Otherwise, if \p Inst is not specified, a just-initialized optional value
4021/// is returned to show that the pointers represent strided pointers. If \p
4022/// Inst is specified, the runtime stride is materialized before the given
4023/// \p Inst. \returns std::nullopt if the pointers do not have a runtime
4024/// stride; otherwise nullptr or the actual stride value.
4025static std::optional<Value *>
4026calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4027 const DataLayout &DL, ScalarEvolution &SE,
4028 SmallVectorImpl<unsigned> &SortedIndices,
4029 Instruction *Inst = nullptr) {
4030 SmallVector<const SCEV *> SCEVs;
4031 const SCEV *PtrSCEVLowest = nullptr;
4032 const SCEV *PtrSCEVHighest = nullptr;
4033 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4034 // addresses).
4035 for (Value *Ptr : PointerOps) {
4036 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4037 if (!PtrSCEV)
4038 return std::nullopt;
4039 SCEVs.push_back(PtrSCEV);
4040 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4041 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4042 continue;
4043 }
4044 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4045 if (isa<SCEVCouldNotCompute>(Diff))
4046 return std::nullopt;
4047 if (Diff->isNonConstantNegative()) {
4048 PtrSCEVLowest = PtrSCEV;
4049 continue;
4050 }
4051 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4052 if (isa<SCEVCouldNotCompute>(Diff1))
4053 return std::nullopt;
4054 if (Diff1->isNonConstantNegative()) {
4055 PtrSCEVHighest = PtrSCEV;
4056 continue;
4057 }
4058 }
4059 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4060 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4061 if (isa<SCEVCouldNotCompute>(Dist))
4062 return std::nullopt;
4063 int Size = DL.getTypeStoreSize(ElemTy);
4064 auto TryGetStride = [&](const SCEV *Dist,
4065 const SCEV *Multiplier) -> const SCEV * {
4066 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4067 if (M->getOperand(0) == Multiplier)
4068 return M->getOperand(1);
4069 if (M->getOperand(1) == Multiplier)
4070 return M->getOperand(0);
4071 return nullptr;
4072 }
4073 if (Multiplier == Dist)
4074 return SE.getConstant(Dist->getType(), 1);
4075 return SE.getUDivExactExpr(Dist, Multiplier);
4076 };
4077 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4078 const SCEV *Stride = nullptr;
4079 if (Size != 1 || SCEVs.size() > 2) {
4080 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4081 Stride = TryGetStride(Dist, Sz);
4082 if (!Stride)
4083 return std::nullopt;
4084 }
4085 if (!Stride || isa<SCEVConstant>(Stride))
4086 return std::nullopt;
4087 // Iterate through all pointers and check if all distances are
4088 // unique multiple of Stride.
4089 using DistOrdPair = std::pair<int64_t, int>;
4090 auto Compare = llvm::less_first();
4091 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4092 int Cnt = 0;
4093 bool IsConsecutive = true;
4094 for (const SCEV *PtrSCEV : SCEVs) {
4095 unsigned Dist = 0;
4096 if (PtrSCEV != PtrSCEVLowest) {
4097 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4098 const SCEV *Coeff = TryGetStride(Diff, Stride);
4099 if (!Coeff)
4100 return std::nullopt;
4101 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4102 if (!SC || isa<SCEVCouldNotCompute>(SC))
4103 return std::nullopt;
4104 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4105 SE.getMulExpr(Stride, SC)))
4106 ->isZero())
4107 return std::nullopt;
4108 Dist = SC->getAPInt().getZExtValue();
4109 }
4110 // If the strides are not the same or repeated, we can't vectorize.
4111 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4112 return std::nullopt;
4113 auto Res = Offsets.emplace(Dist, Cnt);
4114 if (!Res.second)
4115 return std::nullopt;
4116 // Consecutive order if the inserted element is the last one.
4117 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4118 ++Cnt;
4119 }
4120 if (Offsets.size() != SCEVs.size())
4121 return std::nullopt;
4122 SortedIndices.clear();
4123 if (!IsConsecutive) {
4124 // Fill SortedIndices array only if it is non-consecutive.
4125 SortedIndices.resize(PointerOps.size());
4126 Cnt = 0;
4127 for (const std::pair<int64_t, int> &Pair : Offsets) {
4128 SortedIndices[Cnt] = Pair.second;
4129 ++Cnt;
4130 }
4131 }
4132 if (!Inst)
4133 return nullptr;
4134 SCEVExpander Expander(SE, DL, "strided-load-vec");
4135 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4136}
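// Illustration for calculateRtStride() (hypothetical pointers): given p,
// p + 3*s, p + 1*s and p + 2*s with a non-constant byte stride s, the common
// stride is recovered from the SCEV distance between the lowest and highest
// pointer, SortedIndices becomes {0, 2, 3, 1}, and, when Inst is provided,
// the stride value is expanded right before Inst for use by a strided load.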
4137
4138BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4139 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4140 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4141 // Check that a vectorized load would load the same memory as a scalar
4142 // load. For example, we don't want to vectorize loads that are smaller
4143 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4144 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4145 // from such a struct, we read/write packed bits disagreeing with the
4146 // unvectorized version.
4147 Type *ScalarTy = VL0->getType();
4148
4149 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4150 return LoadsState::Gather;
4151
4152 // Make sure all loads in the bundle are simple - we can't vectorize
4153 // atomic or volatile loads.
4154 PointerOps.clear();
4155 const unsigned Sz = VL.size();
4156 PointerOps.resize(Sz);
4157 auto *POIter = PointerOps.begin();
4158 for (Value *V : VL) {
4159 auto *L = cast<LoadInst>(V);
4160 if (!L->isSimple())
4161 return LoadsState::Gather;
4162 *POIter = L->getPointerOperand();
4163 ++POIter;
4164 }
4165
4166 Order.clear();
4167 auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
4168 // Check the order of pointer operands or that all pointers are the same.
4169 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4170 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4171 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4172 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4173 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4174 return LoadsState::StridedVectorize;
4175 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4176 return arePointersCompatible(P, PointerOps.front(), *TLI);
4177 })) {
4178 if (IsSorted) {
4179 Value *Ptr0;
4180 Value *PtrN;
4181 if (Order.empty()) {
4182 Ptr0 = PointerOps.front();
4183 PtrN = PointerOps.back();
4184 } else {
4185 Ptr0 = PointerOps[Order.front()];
4186 PtrN = PointerOps[Order.back()];
4187 }
4188 std::optional<int> Diff =
4189 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4190 // Check that the sorted loads are consecutive.
4191 if (static_cast<unsigned>(*Diff) == Sz - 1)
4192 return LoadsState::Vectorize;
4193 // Simple check if not a strided access - clear order.
4194 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4195 // Try to generate strided load node if:
4196 // 1. Target with strided load support is detected.
4197 // 2. The number of loads is greater than MinProfitableStridedLoads,
4198 // or the potential stride <= MaxProfitableLoadStride and the
4199 // potential stride is power-of-2 (to avoid perf regressions for the very
4200 // small number of loads) and max distance > number of loads, or potential
4201 // stride is -1.
4202 // 3. The loads are ordered, or number of unordered loads <=
4203 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4204 // (this check is to avoid extra costs for very expensive shuffles).
4205 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4206 (static_cast<unsigned>(std::abs(*Diff)) <=
4207 MaxProfitableLoadStride * Sz &&
4208 isPowerOf2_32(std::abs(*Diff)))) &&
4209 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4210 *Diff == -(static_cast<int>(Sz) - 1))) {
4211 int Stride = *Diff / static_cast<int>(Sz - 1);
4212 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4213 Align Alignment =
4214 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4215 ->getAlign();
4216 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4217 // Iterate through all pointers and check if all distances are
4218 // unique multiple of Dist.
4219 SmallSet<int, 4> Dists;
4220 for (Value *Ptr : PointerOps) {
4221 int Dist = 0;
4222 if (Ptr == PtrN)
4223 Dist = *Diff;
4224 else if (Ptr != Ptr0)
4225 Dist =
4226 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4227 // If the strides are not the same or repeated, we can't
4228 // vectorize.
4229 if (((Dist / Stride) * Stride) != Dist ||
4230 !Dists.insert(Dist).second)
4231 break;
4232 }
4233 if (Dists.size() == Sz)
4234 return LoadsState::StridedVectorize;
4235 }
4236 }
4237 }
4238 }
4239 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4240 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4241 unsigned MinVF = getMinVF(Sz);
4242 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4243 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4244 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4245 unsigned VectorizedCnt = 0;
4246 SmallVector<LoadsState> States;
4247 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4248 Cnt += VF, ++VectorizedCnt) {
4249 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4250 SmallVector<unsigned> Order;
4251 SmallVector<Value *> PointerOps;
4252 LoadsState LS =
4253 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4254 /*TryRecursiveCheck=*/false);
4255 // Check that the sorted loads are consecutive.
4256 if (LS == LoadsState::Gather)
4257 break;
4258 // If reordering is needed, treat it as a high-cost masked gather for now.
4259 if ((LS == LoadsState::Vectorize ||
4260 LS == LoadsState::StridedVectorize) &&
4261 !Order.empty() && !isReverseOrder(Order))
4262 LS = LoadsState::ScatterVectorize;
4263 States.push_back(LS);
4264 }
4265 // Can be vectorized later as a series of loads/insertelements.
4266 if (VectorizedCnt == VL.size() / VF) {
4267 // Compare masked gather cost and loads + insertsubvector costs.
4268 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4269 InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
4270 Instruction::Load, VecTy,
4271 cast<LoadInst>(VL0)->getPointerOperand(),
4272 /*VariableMask=*/false, CommonAlignment, CostKind);
4273 InstructionCost VecLdCost = 0;
4274 auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
4275 for (auto [I, LS] : enumerate(States)) {
4276 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4277 switch (LS) {
4278 case LoadsState::Vectorize:
4279 VecLdCost += TTI.getMemoryOpCost(
4280 Instruction::Load, SubVecTy, LI0->getAlign(),
4281 LI0->getPointerAddressSpace(), CostKind,
4282 TTI::OperandValueInfo());
4283 break;
4284 case LoadsState::StridedVectorize:
4285 VecLdCost += TTI.getStridedMemoryOpCost(
4286 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4287 /*VariableMask=*/false, CommonAlignment, CostKind);
4288 break;
4289 case LoadsState::ScatterVectorize:
4290 VecLdCost += TTI.getGatherScatterOpCost(
4291 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4292 /*VariableMask=*/false, CommonAlignment, CostKind);
4293 break;
4294 case LoadsState::Gather:
4296 "Expected only consecutive, strided or masked gather loads.");
4297 }
4298 SmallVector<int> ShuffleMask(VL.size());
4299 for (int Idx : seq<int>(0, VL.size()))
4300 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4301 VecLdCost +=
4302 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
4303 ShuffleMask, CostKind, I * VF, SubVecTy);
4304 }
4305 // If masked gather cost is higher - better to vectorize, so
4306 // consider it as a gather node. It will be better estimated
4307 // later.
4308 if (MaskedGatherCost > VecLdCost)
4309 return true;
4310 }
4311 }
4312 return false;
4313 };
4314 // TODO: need to improve analysis of the pointers, if not all of them are
4315 // GEPs or have > 2 operands, we end up with a gather node, which just
4316 // increases the cost.
4317 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4318 bool ProfitableGatherPointers =
4319 L && Sz > 2 &&
4320 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4321 return L->isLoopInvariant(V);
4322 })) <= Sz / 2;
4323 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4324 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4325 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4326 (GEP && GEP->getNumOperands() == 2 &&
4327 isa<Constant, Instruction>(GEP->getOperand(1)));
4328 })) {
4329 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4330 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4331 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4332 // Check if potential masked gather can be represented as series
4333 // of loads + insertsubvectors.
4334 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4335 // If masked gather cost is higher - better to vectorize, so
4336 // consider it as a gather node. It will be better estimated
4337 // later.
4338 return LoadsState::Gather;
4339 }
4339 }
4340 return LoadsState::ScatterVectorize;
4341 }
4342 }
4343 }
4344
4345 return LoadsState::Gather;
4346}
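// Decision order in canVectorizeLoads(): consecutive loads map to
// LoadsState::Vectorize, target-supported strided patterns to
// LoadsState::StridedVectorize, profitable masked gathers to
// LoadsState::ScatterVectorize, and everything else (including cases where
// gathering is estimated to be cheaper) to LoadsState::Gather.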
4347
4348static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4349 const DataLayout &DL, ScalarEvolution &SE,
4350 SmallVectorImpl<unsigned> &SortedIndices) {
4351 assert(llvm::all_of(
4352 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4353 "Expected list of pointer operands.");
4354 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4355 // Ptr into, sort and return the sorted indices with values next to one
4356 // another.
4357 DenseMap<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4358 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4359
4360 unsigned Cnt = 1;
4361 for (Value *Ptr : VL.drop_front()) {
4362 bool Found = any_of(Bases, [&](auto &Base) {
4363 std::optional<int> Diff =
4364 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4365 /*StrictCheck=*/true);
4366 if (!Diff)
4367 return false;
4368
4369 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4370 return true;
4371 });
4372
4373 if (!Found) {
4374 // If we haven't found enough to usefully cluster, return early.
4375 if (Bases.size() > VL.size() / 2 - 1)
4376 return false;
4377
4378 // Not found already - add a new Base
4379 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4380 }
4381 }
4382
4383 // For each of the bases sort the pointers by Offset and check if any of the
4384 // base become consecutively allocated.
4385 bool AnyConsecutive = false;
4386 for (auto &Base : Bases) {
4387 auto &Vec = Base.second;
4388 if (Vec.size() > 1) {
4389 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4390 const std::tuple<Value *, int, unsigned> &Y) {
4391 return std::get<1>(X) < std::get<1>(Y);
4392 });
4393 int InitialOffset = std::get<1>(Vec[0]);
4394 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4395 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4396 });
4397 }
4398 }
4399
4400 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4401 SortedIndices.clear();
4402 if (!AnyConsecutive)
4403 return false;
4404
4405 for (auto &Base : Bases) {
4406 for (auto &T : Base.second)
4407 SortedIndices.push_back(std::get<2>(T));
4408 }
4409
4410 assert(SortedIndices.size() == VL.size() &&
4411 "Expected SortedIndices to be the size of VL");
4412 return true;
4413}
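// Worked example for clusterSortPtrAccesses() (illustrative values): for
// VL = {A, A+1, B, B+1} (offsets in elements) with two distinct underlying
// bases A and B, the pointers are clustered as {(A,0), (A+1,1)} and
// {(B,0), (B+1,1)}; both clusters are consecutive, so SortedIndices keeps the
// pointers of each base next to each other.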
4414
4415std::optional<BoUpSLP::OrdersType>
4416BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4417 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4418 Type *ScalarTy = TE.Scalars[0]->getType();
4419
4420 SmallVector<Value *> Ptrs;
4421 Ptrs.reserve(TE.Scalars.size());
4422 for (Value *V : TE.Scalars) {
4423 auto *L = dyn_cast<LoadInst>(V);
4424 if (!L || !L->isSimple())
4425 return std::nullopt;
4426 Ptrs.push_back(L->getPointerOperand());
4427 }
4428
4429 BoUpSLP::OrdersType Order;
4430 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4431 return std::move(Order);
4432 return std::nullopt;
4433}
4434
4435/// Check if two insertelement instructions are from the same buildvector.
4436static bool areTwoInsertFromSameBuildVector(
4437 InsertElementInst *VU, InsertElementInst *V,
4438 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4439 // Instructions must be from the same basic blocks.
4440 if (VU->getParent() != V->getParent())
4441 return false;
4442 // Checks if 2 insertelements are from the same buildvector.
4443 if (VU->getType() != V->getType())
4444 return false;
4445 // Multiple used inserts are separate nodes.
4446 if (!VU->hasOneUse() && !V->hasOneUse())
4447 return false;
4448 auto *IE1 = VU;
4449 auto *IE2 = V;
4450 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4451 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4452 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4453 return false;
4454 // Go through the vector operand of insertelement instructions trying to find
4455 // either VU as the original vector for IE2 or V as the original vector for
4456 // IE1.
4457 SmallBitVector ReusedIdx(
4458 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4459 bool IsReusedIdx = false;
4460 do {
4461 if (IE2 == VU && !IE1)
4462 return VU->hasOneUse();
4463 if (IE1 == V && !IE2)
4464 return V->hasOneUse();
4465 if (IE1 && IE1 != V) {
4466 unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4467 IsReusedIdx |= ReusedIdx.test(Idx1);
4468 ReusedIdx.set(Idx1);
4469 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4470 IE1 = nullptr;
4471 else
4472 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4473 }
4474 if (IE2 && IE2 != VU) {
4475 unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4476 IsReusedIdx |= ReusedIdx.test(Idx2);
4477 ReusedIdx.set(Idx2);
4478 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4479 IE2 = nullptr;
4480 else
4481 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4482 }
4483 } while (!IsReusedIdx && (IE1 || IE2));
4484 return false;
4485}
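// Note on areTwoInsertFromSameBuildVector(): the loop walks the vector
// operands of both insertelement chains; the two instructions are treated as
// parts of a single buildvector only if one chain reaches the other
// instruction without reusing an already-seen insertion index.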
4486
4487std::optional<BoUpSLP::OrdersType>
4488BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4489 // No need to reorder if need to shuffle reuses, still need to shuffle the
4490 // node.
4491 if (!TE.ReuseShuffleIndices.empty()) {
4492 if (isSplat(TE.Scalars))
4493 return std::nullopt;
4494 // Check if reuse shuffle indices can be improved by reordering.
4495 // For this, check that the reuse mask is "clustered", i.e. each scalar value
4496 // is used once in each submask of size <number_of_scalars>.
4497 // Example: 4 scalar values.
4498 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4499 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4500 // element 3 is used twice in the second submask.
4501 unsigned Sz = TE.Scalars.size();
4502 if (TE.State == TreeEntry::NeedToGather) {
4503 if (std::optional<OrdersType> CurrentOrder =
4504 findReusedOrderedScalars(TE)) {
4505 SmallVector<int> Mask;
4506 fixupOrderingIndices(*CurrentOrder);
4507 inversePermutation(*CurrentOrder, Mask);
4508 ::addMask(Mask, TE.ReuseShuffleIndices);
4509 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4510 unsigned Sz = TE.Scalars.size();
4511 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4512 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4513 if (Idx != PoisonMaskElem)
4514 Res[Idx + K * Sz] = I + K * Sz;
4515 }
4516 return std::move(Res);
4517 }
4518 }
4519 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4520 TTI->getNumberOfParts(FixedVectorType::get(
4521 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4522 return std::nullopt;
4523 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4524 Sz)) {
4525 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4526 if (TE.ReorderIndices.empty())
4527 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4528 else
4529 inversePermutation(TE.ReorderIndices, ReorderMask);
4530 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4531 unsigned VF = ReorderMask.size();
4532 OrdersType ResOrder(VF, VF);
4533 unsigned NumParts = VF / Sz;
4534 SmallBitVector UsedVals(NumParts);
4535 for (unsigned I = 0; I < VF; I += Sz) {
4536 int Val = PoisonMaskElem;
4537 unsigned UndefCnt = 0;
4538 if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
4539 [&](int Idx) {
4540 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4541 Val = Idx;
4542 if (Idx == PoisonMaskElem)
4543 ++UndefCnt;
4544 return Idx != PoisonMaskElem && Idx != Val;
4545 }) ||
4546 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4547 UndefCnt > Sz / 2)
4548 return std::nullopt;
4549 UsedVals.set(Val);
4550 for (unsigned K = 0; K < NumParts; ++K)
4551 ResOrder[Val + Sz * K] = I + K;
4552 }
4553 return std::move(ResOrder);
4554 }
4555 unsigned VF = TE.getVectorFactor();
4556 // Try to build the correct order for extractelement instructions.
4557 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4558 TE.ReuseShuffleIndices.end());
4559 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4560 all_of(TE.Scalars, [Sz](Value *V) {
4561 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4562 return Idx && *Idx < Sz;
4563 })) {
4564 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4565 if (TE.ReorderIndices.empty())
4566 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4567 else
4568 inversePermutation(TE.ReorderIndices, ReorderMask);
4569 for (unsigned I = 0; I < VF; ++I) {
4570 int &Idx = ReusedMask[I];
4571 if (Idx == PoisonMaskElem)
4572 continue;
4573 Value *V = TE.Scalars[ReorderMask[Idx]];
4574 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4575 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4576 }
4577 }
4578 // Build the order of the VF size; the reuses shuffles need to be reordered,
4579 // and they are always of VF size.
4580 OrdersType ResOrder(VF);
4581 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4582 auto *It = ResOrder.begin();
4583 for (unsigned K = 0; K < VF; K += Sz) {
4584 OrdersType CurrentOrder(TE.ReorderIndices);
4585 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4586 if (SubMask.front() == PoisonMaskElem)
4587 std::iota(SubMask.begin(), SubMask.end(), 0);
4588 reorderOrder(CurrentOrder, SubMask);
4589 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4590 std::advance(It, Sz);
4591 }
4592 if (TE.State == TreeEntry::NeedToGather &&
4593 all_of(enumerate(ResOrder),
4594 [](const auto &Data) { return Data.index() == Data.value(); }))
4595 return std::nullopt; // No need to reorder.
4596 return std::move(ResOrder);
4597 }
4598 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4599 any_of(TE.UserTreeIndices,
4600 [](const EdgeInfo &EI) {
4601 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4602 }) &&
4603 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
4604 return std::nullopt;
4605 if ((TE.State == TreeEntry::Vectorize ||
4606 TE.State == TreeEntry::StridedVectorize) &&
4607 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4608 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4609 !TE.isAltShuffle())
4610 return TE.ReorderIndices;
4611 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
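// Comparator used below to sort the PHI scalars: a PHI with fewer uses sorts
// first; on a tie, PHIs are ordered by the position their first user writes
// to or reads from, so that related lanes end up adjacent.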
4612 auto PHICompare = [&](unsigned I1, unsigned I2) {
4613 Value *V1 = TE.Scalars[I1];
4614 Value *V2 = TE.Scalars[I2];
4615 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4616 return false;
4617 if (V1->getNumUses() < V2->getNumUses())
4618 return true;
4619 if (V1->getNumUses() > V2->getNumUses())
4620 return false;
4621 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4622 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4623 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4624 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4625 if (!areTwoInsertFromSameBuildVector(
4626 IE1, IE2,
4627 [](InsertElementInst *II) { return II->getOperand(0); }))
4628 return I1 < I2;
4629 return getInsertIndex(IE1) < getInsertIndex(IE2);
4630 }
4631 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4632 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4633 if (EE1->getOperand(0) != EE2->getOperand(0))
4634 return I1 < I2;
4635 return getInsertIndex(EE1) < getInsertIndex(EE2);
4636 }
4637 return I1 < I2;
4638 };
4639 auto IsIdentityOrder = [](const OrdersType &Order) {
4640 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4641 if (Idx != Order[Idx])
4642 return false;
4643 return true;
4644 };
4645 if (!TE.ReorderIndices.empty())
4646 return TE.ReorderIndices;
4647 DenseMap<unsigned, unsigned> PhiToId;
4648 SmallVector<unsigned> Phis(TE.Scalars.size());
4649 std::iota(Phis.begin(), Phis.end(), 0);
4650 OrdersType ResOrder(TE.Scalars.size());
4651 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4652 PhiToId[Id] = Id;
4653 stable_sort(Phis, PHICompare);
4654 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4655 ResOrder[Id] = PhiToId[Phis[Id]];
4656 if (IsIdentityOrder(ResOrder))
4657 return std::nullopt; // No need to reorder.
4658 return std::move(ResOrder);
4659 }
4660 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4661 allSameType(TE.Scalars)) {
4662 // TODO: add analysis of other gather nodes with extractelement
4663 // instructions and other values/instructions, not only undefs.
4664 if ((TE.getOpcode() == Instruction::ExtractElement ||
4665 (all_of(TE.Scalars,
4666 [](Value *V) {
4667 return isa<UndefValue, ExtractElementInst>(V);
4668 }) &&
4669 any_of(TE.Scalars,
4670 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
4671 all_of(TE.Scalars, [](Value *V) {
4672 auto *EE = dyn_cast<ExtractElementInst>(V);
4673 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4674 })) {
4675 // Check that gather of extractelements can be represented as
4676 // just a shuffle of a single vector.
4677 OrdersType CurrentOrder;
4678 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4679 /*ResizeAllowed=*/true);
4680 if (Reuse || !CurrentOrder.empty())
4681 return std::move(CurrentOrder);
4682 }
4683 // If the gather node is <undef, v, .., poison> and
4684 // insertelement poison, v, 0 [+ permute]
4685 // is cheaper than
4686 // insertelement poison, v, n - try to reorder.
4687 // If rotating the whole graph, exclude the permute cost, the whole graph
4688 // might be transformed.
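// For example, for the gather <undef, %v, undef, undef> the single defined
// scalar sits at index 1; if inserting %v at index 0 plus the permute is
// cheaper than inserting it directly at index 1, the node is reordered so
// that %v becomes the first element.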
4689 int Sz = TE.Scalars.size();
4690 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4691 count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
4692 const auto *It =
4693 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4694 if (It == TE.Scalars.begin())
4695 return OrdersType();
4696 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4697 if (It != TE.Scalars.end()) {
4698 OrdersType Order(Sz, Sz);
4699 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4700 Order[Idx] = 0;
4701 fixupOrderingIndices(Order);
4702 SmallVector<int> Mask;
4703 inversePermutation(Order, Mask);
4704 InstructionCost PermuteCost =
4705 TopToBottom
4706 ? 0
4707 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
4708 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4709 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4710 PoisonValue::get(Ty), *It);
4711 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4712 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4713 PoisonValue::get(Ty), *It);
4714 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4715 OrdersType Order(Sz, Sz);
4716 Order[Idx] = 0;
4717 return std::move(Order);
4718 }
4719 }
4720 }
4721 if (isSplat(TE.Scalars))
4722 return std::nullopt;
4723 if (TE.Scalars.size() >= 4)
4724 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4725 return Order;
4726 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4727 return CurrentOrder;
4728 }
4729 return std::nullopt;
4730}
4731
4732/// Checks if the given mask is a "clustered" mask with the same clusters of
4733/// size \p Sz, which are not identity submasks.
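/// For example, with \p Sz == 4 the mask <2, 3, 0, 1, 2, 3, 0, 1> is a
/// repeated non-identity clustered mask, while <0, 1, 2, 3, 0, 1, 2, 3>
/// (identity submask) and <2, 3, 0, 1, 1, 0, 3, 2> (clusters differ) are not.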
4734 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4735 unsigned Sz) {
4736 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4737 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
4738 return false;
4739 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4740 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4741 if (Cluster != FirstCluster)
4742 return false;
4743 }
4744 return true;
4745}
4746
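/// Reorders the reuses mask of \p TE according to \p Mask; for gathered nodes
/// with a repeated non-identity clustered reuses mask it also reorders the
/// scalars themselves so that the reuses mask becomes a sequence of identity
/// submasks.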
4747void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4748 // Reorder reuses mask.
4749 reorderReuses(TE.ReuseShuffleIndices, Mask);
4750 const unsigned Sz = TE.Scalars.size();
4751 // For vectorized nodes and non-clustered reuses there is nothing else to do.
4752 if (TE.State != TreeEntry::NeedToGather ||
4753 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4754 Sz) ||
4755 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4756 return;
4757 SmallVector<int> NewMask;
4758 inversePermutation(TE.ReorderIndices, NewMask);
4759 addMask(NewMask, TE.ReuseShuffleIndices);
4760 // Clear reorder since it is going to be applied to the new mask.
4761 TE.ReorderIndices.clear();
4762 // Try to improve gathered nodes with clustered reuses, if possible.
4763 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4764 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4765 inversePermutation(NewOrder, NewMask);
4766 reorderScalars(TE.Scalars, NewMask);
4767 // Fill the reuses mask with the identity submasks.
4768 for (auto *It = TE.ReuseShuffleIndices.begin(),
4769 *End = TE.ReuseShuffleIndices.end();
4770 It != End; std::advance(It, Sz))
4771 std::iota(It, std::next(It, Sz), 0);
4772}
4773
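/// Merges \p SecondaryOrder into the unset (== size) slots of \p Order. With
/// an empty secondary order an unset slot takes its own index if that index is
/// not already used, e.g. for size 4 the order <2, unset, unset, 0> becomes
/// <2, 1, 3, 0>.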
4774 static void combineOrders(MutableArrayRef<unsigned> Order,
4775 ArrayRef<unsigned> SecondaryOrder) {
4776 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
4777 "Expected same size of orders");
4778 unsigned Sz = Order.size();
4779 SmallBitVector UsedIndices(Sz);
4780 for (unsigned Idx : seq<unsigned>(0, Sz)) {
4781 if (Order[Idx] != Sz)
4782 UsedIndices.set(Order[Idx]);
4783 }
4784 if (SecondaryOrder.empty()) {
4785 for (unsigned Idx : seq<unsigned>(0, Sz))
4786 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
4787 Order[Idx] = Idx;
4788 } else {
4789 for (unsigned Idx : seq<unsigned>(0, Sz))
4790 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
4791 !UsedIndices.test(SecondaryOrder[Idx]))
4792 Order[Idx] = SecondaryOrder[Idx];
4793 }
4794}
4795
4796 void BoUpSLP::reorderTopToBottom() {
4797 // Maps VF to the graph nodes.
4798 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
4799 // ExtractElement gather nodes which can be vectorized and need to handle
4800 // their ordering.
4801 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4802
4803 // Phi nodes can have preferred ordering based on their result users
4804 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
4805
4806 // AltShuffles can also have a preferred ordering that leads to fewer
4807 // instructions, e.g., the addsub instruction in x86.
4808 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4809
4810 // Maps a TreeEntry to the reorder indices of external users.
4811 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
4812 ExternalUserReorderMap;
4813 // FIXME: Workaround for syntax error reported by MSVC buildbots.
4814 TargetTransformInfo &TTIRef = *TTI;
4815 // Find all reorderable nodes with the given VF.
4816 // Currently these are vectorized stores, loads, extracts + some gathering of
4817 // extracts.
4818 for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
4819 &GathersToOrders, &ExternalUserReorderMap,
4820 &AltShufflesToOrders, &PhisToOrders](
4821 const std::unique_ptr<TreeEntry> &TE) {
4822 // Look for external users that will probably be vectorized.
4823 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4824 findExternalStoreUsersReorderIndices(TE.get());
4825 if (!ExternalUserReorderIndices.empty()) {
4826 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4827 ExternalUserReorderMap.try_emplace(TE.get(),
4828 std::move(ExternalUserReorderIndices));
4829 }
4830
4831 // Patterns like [fadd,fsub] can be combined into a single instruction in
4832 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4833 // to take into account their order when looking for the most used order.
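// For example, for the scalars <fadd, fsub, fadd, fsub> with Opcode0 == FAdd
// and Opcode1 == FSub, the OpcodeMask built below has lanes 1 and 3 set.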
4834 if (TE->isAltShuffle()) {
4835 VectorType *VecTy =
4836 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
4837 unsigned Opcode0 = TE->getOpcode();
4838 unsigned Opcode1 = TE->getAltOpcode();
4839 // The opcode mask selects between the two opcodes.
4840 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4841 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4842 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4843 OpcodeMask.set(Lane);
4844 // If this pattern is supported by the target then we consider the order.
4845 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4846 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4847 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
4848 }
4849 // TODO: Check the reverse order too.
4850 }
4851
4852 if (std::optional<OrdersType> CurrentOrder =
4853 getReorderingData(*TE, /*TopToBottom=*/true)) {
4854 // Do not include ordering for nodes used in the alt opcode vectorization,
4855 // better to reorder them during the bottom-to-top stage. If we follow the
4856 // order here, it causes reordering of the whole graph, though actually it is
4857 // profitable just to reorder the subgraph that starts from the alternate
4858 // opcode vectorization node. Such nodes already end-up with the shuffle
4859 // instruction and it is just enough to change this shuffle rather than
4860 // rotate the scalars for the whole graph.
4861 unsigned Cnt = 0;
4862 const TreeEntry *UserTE = TE.get();
4863 while (UserTE && Cnt < RecursionMaxDepth) {
4864 if (UserTE->UserTreeIndices.size() != 1)
4865 break;
4866 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
4867 return EI.UserTE->State == TreeEntry::Vectorize &&
4868 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4869 }))
4870 return;
4871 UserTE = UserTE->UserTreeIndices.back().UserTE;
4872 ++Cnt;
4873 }
4874 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4875 if (!(TE->State == TreeEntry::Vectorize ||
4876 TE->State == TreeEntry::StridedVectorize) ||
4877 !TE->ReuseShuffleIndices.empty())
4878 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4879 if (TE->State == TreeEntry::Vectorize &&
4880 TE->getOpcode() == Instruction::PHI)
4881 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
4882 }
4883 });
4884
4885 // Reorder the graph nodes according to their vectorization factor.
4886 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
4887 VF /= 2) {
4888 auto It = VFToOrderedEntries.find(VF);
4889 if (It == VFToOrderedEntries.end())
4890 continue;
4891 // Try to find the most profitable order. We just are looking for the most
4892 // used order and reorder scalar elements in the nodes according to this
4893 // mostly used order.
4894 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
4895 // All operands are reordered and used only in this node - propagate the
4896 // most used order to the user node.
4897 MapVector<OrdersType, unsigned,
4898 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
4899 OrdersUses;
4901 for (const TreeEntry *OpTE : OrderedEntries) {
4902 // No need to reorder these nodes; we still need to extend and use a shuffle,
4903 // just merge the reordering shuffle and the reuse shuffle.
4904 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
4905 continue;
4906 // Count number of orders uses.
4907 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
4908 &PhisToOrders]() -> const OrdersType & {
4909 if (OpTE->State == TreeEntry::NeedToGather ||
4910 !OpTE->ReuseShuffleIndices.empty()) {
4911 auto It = GathersToOrders.find(OpTE);
4912 if (It != GathersToOrders.end())
4913 return It->second;
4914 }
4915 if (OpTE->isAltShuffle()) {
4916 auto It = AltShufflesToOrders.find(OpTE);
4917 if (It != AltShufflesToOrders.end())
4918 return It->second;
4919 }
4920 if (OpTE->State == TreeEntry::Vectorize &&
4921 OpTE->getOpcode() == Instruction::PHI) {
4922 auto It = PhisToOrders.find(OpTE);
4923 if (It != PhisToOrders.end())
4924 return It->second;
4925 }
4926 return OpTE->ReorderIndices;
4927 }();
4928 // First consider the order of the external scalar users.
4929 auto It = ExternalUserReorderMap.find(OpTE);
4930 if (It != ExternalUserReorderMap.end()) {
4931 const auto &ExternalUserReorderIndices = It->second;
4932 // If the OpTE vector factor != number of scalars, use the natural order;
4933 // it is an attempt to reorder a node with reused scalars but with
4934 // external uses.
4935 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
4936 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
4937 ExternalUserReorderIndices.size();
4938 } else {
4939 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
4940 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
4941 }
4942 // No other useful reorder data in this entry.
4943 if (Order.empty())
4944 continue;
4945 }
4946 // Stores actually store the mask, not the order; we need to invert it.
4947 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4948 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4949 SmallVector<int> Mask;
4950 inversePermutation(Order, Mask);
4951 unsigned E = Order.size();
4952 OrdersType CurrentOrder(E, E);
4953 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
4954 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
4955 });
4956 fixupOrderingIndices(CurrentOrder);
4957 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
4958 } else {
4959 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4960 }
4961 }
4962 if (OrdersUses.empty())
4963 continue;
4964 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
4965 const unsigned Sz = Order.size();
4966 for (unsigned Idx : seq<unsigned>(0, Sz))
4967 if (Idx != Order[Idx] && Order[Idx] != Sz)
4968 return false;
4969 return true;
4970 };
4971 // Choose the most used order.
4972 unsigned IdentityCnt = 0;
4973 unsigned FilledIdentityCnt = 0;
4974 OrdersType IdentityOrder(VF, VF);
4975 for (auto &Pair : OrdersUses) {
4976 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
4977 if (!Pair.first.empty())
4978 FilledIdentityCnt += Pair.second;
4979 IdentityCnt += Pair.second;
4980 combineOrders(IdentityOrder, Pair.first);
4981 }
4982 }
4983 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
4984 unsigned Cnt = IdentityCnt;
4985 for (auto &Pair : OrdersUses) {
4986 // Prefer the identity order. But if a filled identity (non-empty order) is
4987 // found with the same number of uses as the new candidate order, we can
4988 // choose this candidate order.
4989 if (Cnt < Pair.second ||
4990 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
4991 Cnt == Pair.second && !BestOrder.empty() &&
4992 IsIdentityOrder(BestOrder))) {
4993 combineOrders(Pair.first, BestOrder);
4994 BestOrder = Pair.first;
4995 Cnt = Pair.second;
4996 } else {
4997 combineOrders(BestOrder, Pair.first);
4998 }
4999 }
5000 // Set order of the user node.
5001 if (IsIdentityOrder(BestOrder))
5002 continue;
5003 fixupOrderingIndices(BestOrder);
5004 SmallVector<int> Mask;
5005 inversePermutation(BestOrder, Mask);
5006 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5007 unsigned E = BestOrder.size();
5008 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5009 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5010 });
5011 // Do an actual reordering, if profitable.
5012 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5013 // Just do the reordering for the nodes with the given VF.
5014 if (TE->Scalars.size() != VF) {
5015 if (TE->ReuseShuffleIndices.size() == VF) {
5016 // Need to reorder the reuses masks of the operands with smaller VF to
5017 // be able to find the match between the graph nodes and scalar
5018 // operands of the given node during vectorization/cost estimation.
5019 assert(all_of(TE->UserTreeIndices,
5020 [VF, &TE](const EdgeInfo &EI) {
5021 return EI.UserTE->Scalars.size() == VF ||
5022 EI.UserTE->Scalars.size() ==
5023 TE->Scalars.size();
5024 }) &&
5025 "All users must be of VF size.");
5026 // Update ordering of the operands with the smaller VF than the given
5027 // one.
5028 reorderNodeWithReuses(*TE, Mask);
5029 }
5030 continue;
5031 }
5032 if ((TE->State == TreeEntry::Vectorize ||
5033 TE->State == TreeEntry::StridedVectorize) &&
5034 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
5035 InsertElementInst>(TE->getMainOp()) &&
5036 !TE->isAltShuffle()) {
5037 // Build correct orders for extract{element,value}, loads and
5038 // stores.
5039 reorderOrder(TE->ReorderIndices, Mask);
5040 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5041 TE->reorderOperands(Mask);
5042 } else {
5043 // Reorder the node and its operands.
5044 TE->reorderOperands(Mask);
5045 assert(TE->ReorderIndices.empty() &&
5046 "Expected empty reorder sequence.");
5047 reorderScalars(TE->Scalars, Mask);
5048 }
5049 if (!TE->ReuseShuffleIndices.empty()) {
5050 // Apply reversed order to keep the original ordering of the reused
5051 // elements to avoid extra reorder indices shuffling.
5052 OrdersType CurrentOrder;
5053 reorderOrder(CurrentOrder, MaskOrder);
5054 SmallVector<int> NewReuses;
5055 inversePermutation(CurrentOrder, NewReuses);
5056 addMask(NewReuses, TE->ReuseShuffleIndices);
5057 TE->ReuseShuffleIndices.swap(NewReuses);
5058 }
5059 }
5060 }
5061}
5062
5063bool BoUpSLP::canReorderOperands(
5064 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5065 ArrayRef<TreeEntry *> ReorderableGathers,
5066 SmallVectorImpl<TreeEntry *> &GatherOps) {
5067 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5068 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5069 return OpData.first == I &&
5070 (OpData.second->State == TreeEntry::Vectorize ||
5071 OpData.second->State == TreeEntry::StridedVectorize);
5072 }))
5073 continue;
5074 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5075 // Do not reorder if operand node is used by many user nodes.
5076 if (any_of(TE->UserTreeIndices,
5077 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5078 return false;
5079 // Add the node to the list of the ordered nodes with the identity
5080 // order.
5081 Edges.emplace_back(I, TE);
5082 // Add ScatterVectorize nodes to the list of operands, where just
5083 // reordering of the scalars is required. Similar to the gathers, so
5084 // simply add to the list of gathered ops.
5085 // If there are reused scalars, process this node as a regular vectorize
5086 // node, just reorder reuses mask.
5087 if (TE->State != TreeEntry::Vectorize &&
5088 TE->State != TreeEntry::StridedVectorize &&
5089 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5090 GatherOps.push_back(TE);
5091 continue;
5092 }
5093 TreeEntry *Gather = nullptr;
5094 if (count_if(ReorderableGathers,
5095 [&Gather, UserTE, I](TreeEntry *TE) {
5096 assert(TE->State != TreeEntry::Vectorize &&
5097 TE->State != TreeEntry::StridedVectorize &&
5098 "Only non-vectorized nodes are expected.");
5099 if (any_of(TE->UserTreeIndices,
5100 [UserTE, I](const EdgeInfo &EI) {
5101 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5102 })) {
5103 assert(TE->isSame(UserTE->getOperand(I)) &&
5104 "Operand entry does not match operands.");
5105 Gather = TE;
5106 return true;
5107 }
5108 return false;
5109 }) > 1 &&
5110 !allConstant(UserTE->getOperand(I)))
5111 return false;
5112 if (Gather)
5113 GatherOps.push_back(Gather);
5114 }
5115 return true;
5116}
5117
5118void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5119 SetVector<TreeEntry *> OrderedEntries;
5120 DenseSet<const TreeEntry *> GathersToOrders;
5121 // Find all reorderable leaf nodes with the given VF.
5122 // Currently these are vectorized loads, extracts without alternate operands +
5123 // some gathering of extracts.
5124 SmallVector<TreeEntry *> NonVectorized;
5125 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5126 if (TE->State != TreeEntry::Vectorize &&
5127 TE->State != TreeEntry::StridedVectorize)
5128 NonVectorized.push_back(TE.get());
5129 if (std::optional<OrdersType> CurrentOrder =
5130 getReorderingData(*TE, /*TopToBottom=*/false)) {
5131 OrderedEntries.insert(TE.get());
5132 if (!(TE->State == TreeEntry::Vectorize ||
5133 TE->State == TreeEntry::StridedVectorize) ||
5134 !TE->ReuseShuffleIndices.empty())
5135 GathersToOrders.insert(TE.get());
5136 }
5137 }
5138
5139 // 1. Propagate order to the graph nodes, which use only reordered nodes.
5140 // I.e., if the node has operands that are reordered, try to keep at least
5141 // one operand in the natural order and reorder the others + reorder the
5142 // user node itself.
5143 SmallPtrSet<const TreeEntry *, 4> Visited;
5144 while (!OrderedEntries.empty()) {
5145 // 1. Filter out only reordered nodes.
5146 // 2. If the entry has multiple uses - skip it and jump to the next node.
5147 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5148 SmallVector<TreeEntry *> Filtered;
5149 for (TreeEntry *TE : OrderedEntries) {
5150 if (!(TE->State == TreeEntry::Vectorize ||
5151 TE->State == TreeEntry::StridedVectorize ||
5152 (TE->State == TreeEntry::NeedToGather &&
5153 GathersToOrders.contains(TE))) ||
5154 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5155 !all_of(drop_begin(TE->UserTreeIndices),
5156 [TE](const EdgeInfo &EI) {
5157 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5158 }) ||
5159 !Visited.insert(TE).second) {
5160 Filtered.push_back(TE);
5161 continue;
5162 }
5163 // Build a map between user nodes and their operand order to speed up the
5164 // search. The graph currently does not provide this dependency directly.
5165 for (EdgeInfo &EI : TE->UserTreeIndices) {
5166 TreeEntry *UserTE = EI.UserTE;
5167 auto It = Users.find(UserTE);
5168 if (It == Users.end())
5169 It = Users.insert({UserTE, {}}).first;
5170 It->second.emplace_back(EI.EdgeIdx, TE);
5171 }
5172 }
5173 // Erase filtered entries.
5174 for (TreeEntry *TE : Filtered)
5175 OrderedEntries.remove(TE);
5176 SmallVector<
5177 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5178 UsersVec(Users.begin(), Users.end());
5179 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5180 return Data1.first->Idx > Data2.first->Idx;
5181 });
5182 for (auto &Data : UsersVec) {
5183 // Check that operands are used only in the User node.
5184 SmallVector<TreeEntry *> GatherOps;
5185 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5186 GatherOps)) {
5187 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5188 OrderedEntries.remove(Op.second);
5189 continue;
5190 }
5191 // All operands are reordered and used only in this node - propagate the
5192 // most used order to the user node.
5193 MapVector<OrdersType, unsigned,
5194 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5195 OrdersUses;
5196 // Do the analysis for each tree entry only once, otherwise the order of
5197 // the same node may be considered several times, though it might not be
5198 // profitable.
5199 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5200 SmallPtrSet<TreeEntry *, 4> VisitedUsers;
5201 for (const auto &Op : Data.second) {
5202 TreeEntry *OpTE = Op.second;
5203 if (!VisitedOps.insert(OpTE).second)
5204 continue;
5205 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5206 continue;
5207 const auto Order = [&]() -> const OrdersType {
5208 if (OpTE->State == TreeEntry::NeedToGather ||
5209 !OpTE->ReuseShuffleIndices.empty())
5210 return getReorderingData(*OpTE, /*TopToBottom=*/false)
5211 .value_or(OrdersType(1));
5212 return OpTE->ReorderIndices;
5213 }();
5214 // The order is partially ordered, skip it in favor of fully non-ordered
5215 // orders.
5216 if (Order.size() == 1)
5217 continue;
5218 unsigned NumOps = count_if(
5219 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5220 return P.second == OpTE;
5221 });
5222 // Stores actually store the mask, not the order; we need to invert it.
5223 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5224 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5225 SmallVector<int> Mask;
5226 inversePermutation(Order, Mask);
5227 unsigned E = Order.size();
5228 OrdersType CurrentOrder(E, E);
5229 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5230 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5231 });
5232 fixupOrderingIndices(CurrentOrder);
5233 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5234 NumOps;
5235 } else {
5236 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5237 }
5238 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5239 const auto AllowsReordering = [&](const TreeEntry *TE) {
5240 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5241 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5242 (IgnoreReorder && TE->Idx == 0))
5243 return true;
5244 if (TE->State == TreeEntry::NeedToGather) {
5245 if (GathersToOrders.contains(TE))
5246 return !getReorderingData(*TE, /*TopToBottom=*/false)
5247 .value_or(OrdersType(1))
5248 .empty();
5249 return true;
5250 }
5251 return false;
5252 };
5253 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5254 TreeEntry *UserTE = EI.UserTE;
5255 if (!VisitedUsers.insert(UserTE).second)
5256 continue;
5257 // May reorder user node if it requires reordering, has reused
5258 // scalars, is an alternate op vectorize node or its op nodes require
5259 // reordering.
5260 if (AllowsReordering(UserTE))
5261 continue;
5262 // Check if users allow reordering.
5263 // Currently look up just 1 level of operands to avoid increase of
5264 // the compile time.
5265 // Profitable to reorder if definitely more operands allow
5266 // reordering rather than those with natural order.
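// For example, if only one of four operand entries allows reordering, the
// vote below goes to the natural (empty) order.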
5267 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
5268 if (static_cast<unsigned>(count_if(
5269 Ops, [UserTE, &AllowsReordering](
5270 const std::pair<unsigned, TreeEntry *> &Op) {
5271 return AllowsReordering(Op.second) &&
5272 all_of(Op.second->UserTreeIndices,
5273 [UserTE](const EdgeInfo &EI) {
5274 return EI.UserTE == UserTE;
5275 });
5276 })) <= Ops.size() / 2)
5277 ++Res.first->second;
5278 }
5279 }
5280 if (OrdersUses.empty()) {
5281 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5282 OrderedEntries.remove(Op.second);
5283 continue;
5284 }
5285 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5286 const unsigned Sz = Order.size();
5287 for (unsigned Idx : seq<unsigned>(0, Sz))
5288 if (Idx != Order[Idx] && Order[Idx] != Sz)
5289 return false;
5290 return true;
5291 };
5292 // Choose the most used order.
5293 unsigned IdentityCnt = 0;
5294 unsigned VF = Data.second.front().second->getVectorFactor();
5295 OrdersType IdentityOrder(VF, VF);
5296 for (auto &Pair : OrdersUses) {
5297 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5298 IdentityCnt += Pair.second;
5299 combineOrders(IdentityOrder, Pair.first);
5300 }
5301 }
5302 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5303 unsigned Cnt = IdentityCnt;
5304 for (auto &Pair : OrdersUses) {
5305 // Prefer the identity order. But if a filled identity (non-empty order)
5306 // is found with the same number of uses as the new candidate order, we
5307 // can choose this candidate order.
5308 if (Cnt < Pair.second) {
5309 combineOrders(Pair.first, BestOrder);
5310 BestOrder = Pair.first;
5311 Cnt = Pair.second;
5312 } else {
5313 combineOrders(BestOrder, Pair.first);
5314 }
5315 }
5316 // Set order of the user node.
5317 if (IsIdentityOrder(BestOrder)) {
5318 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5319 OrderedEntries.remove(Op.second);
5320 continue;
5321 }
5322 fixupOrderingIndices(BestOrder);
5323 // Erase operands from OrderedEntries list and adjust their orders.
5324 VisitedOps.clear();
5325 SmallVector<int> Mask;
5326 inversePermutation(BestOrder, Mask);
5327 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5328 unsigned E = BestOrder.size();
5329 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5330 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5331 });
5332 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5333 TreeEntry *TE = Op.second;
5334 OrderedEntries.remove(TE);
5335 if (!VisitedOps.insert(TE).second)
5336 continue;
5337 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5338 reorderNodeWithReuses(*TE, Mask);
5339 continue;
5340 }
5341 // Gathers are processed separately.
5342 if (TE->State != TreeEntry::Vectorize &&
5343 TE->State != TreeEntry::StridedVectorize &&
5344 (TE->State != TreeEntry::ScatterVectorize ||
5345 TE->ReorderIndices.empty()))
5346 continue;
5347 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5348 TE->ReorderIndices.empty()) &&
5349 "Non-matching sizes of user/operand entries.");
5350 reorderOrder(TE->ReorderIndices, Mask);
5351 if (IgnoreReorder && TE == VectorizableTree.front().get())
5352 IgnoreReorder = false;
5353 }
5354 // For gathers just need to reorder its scalars.
5355 for (TreeEntry *Gather : GatherOps) {
5356 assert(Gather->ReorderIndices.empty() &&
5357 "Unexpected reordering of gathers.");
5358 if (!Gather->ReuseShuffleIndices.empty()) {
5359 // Just reorder reuses indices.
5360 reorderReuses(Gather->ReuseShuffleIndices, Mask);
5361 continue;
5362 }
5363 reorderScalars(Gather->Scalars, Mask);
5364 OrderedEntries.remove(Gather);
5365 }
5366 // Reorder operands of the user node and set the ordering for the user
5367 // node itself.
5368 if (Data.first->State != TreeEntry::Vectorize ||
5369 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5370 Data.first->getMainOp()) ||
5371 Data.first->isAltShuffle())
5372 Data.first->reorderOperands(Mask);
5373 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5374 Data.first->isAltShuffle() ||
5375 Data.first->State == TreeEntry::StridedVectorize) {
5376 reorderScalars(Data.first->Scalars, Mask);
5377 reorderOrder(Data.first->ReorderIndices, MaskOrder,
5378 /*BottomOrder=*/true);
5379 if (Data.first->ReuseShuffleIndices.empty() &&
5380 !Data.first->ReorderIndices.empty() &&
5381 !Data.first->isAltShuffle()) {
5382 // Insert user node to the list to try to sink reordering deeper in
5383 // the graph.
5384 OrderedEntries.insert(Data.first);
5385 }
5386 } else {
5387 reorderOrder(Data.first->ReorderIndices, Mask);
5388 }
5389 }
5390 }
5391 // If the reordering is unnecessary, just remove the reorder.
5392 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5393 VectorizableTree.front()->ReuseShuffleIndices.empty())
5394 VectorizableTree.front()->ReorderIndices.clear();
5395}
5396
5397 void BoUpSLP::buildExternalUses(
5398 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5399 // Collect the values that we need to extract from the tree.
5400 for (auto &TEPtr : VectorizableTree) {
5401 TreeEntry *Entry = TEPtr.get();
5402
5403 // No need to handle users of gathered values.
5404 if (Entry->State == TreeEntry::NeedToGather)
5405 continue;
5406
5407 // For each lane:
5408 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5409 Value *Scalar = Entry->Scalars[Lane];
5410 if (!isa<Instruction>(Scalar))
5411 continue;
5412 int FoundLane = Entry->findLaneForValue(Scalar);
5413
5414 // Check if the scalar is externally used as an extra arg.
5415 const auto *ExtI = ExternallyUsedValues.find(Scalar);
5416 if (ExtI != ExternallyUsedValues.end()) {
5417 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5418 << Lane << " from " << *Scalar << ".\n");
5419 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5420 }
5421 for (User *U : Scalar->users()) {
5422 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5423
5424 Instruction *UserInst = dyn_cast<Instruction>(U);
5425 if (!UserInst || isDeleted(UserInst))
5426 continue;
5427
5428 // Ignore users in the user ignore list.
5429 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5430 continue;
5431
5432 // Skip in-tree scalars that become vectors
5433 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5434 // Some in-tree scalars will remain as scalar in vectorized
5435 // instructions. If that is the case, the one in FoundLane will
5436 // be used.
5437 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5438 !doesInTreeUserNeedToExtract(
5439 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5440 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5441 << ".\n");
5442 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
5443 continue;
5444 }
5445 U = nullptr;
5446 }
5447
5448 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5449 << " from lane " << Lane << " from " << *Scalar
5450 << ".\n");
5451 ExternalUses.emplace_back(Scalar, U, FoundLane);
5452 }
5453 }
5454 }
5455}
5456
5457 DenseMap<Value *, SmallVector<StoreInst *>>
5458 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5459 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5460 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5461 Value *V = TE->Scalars[Lane];
5462 // To save compilation time we don't visit if we have too many users.
5463 if (V->hasNUsesOrMore(UsesLimit))
5464 break;
5465
5466 // Collect stores per pointer object.
5467 for (User *U : V->users()) {
5468 auto *SI = dyn_cast<StoreInst>(U);
5469 if (SI == nullptr || !SI->isSimple() ||
5470 !isValidElementType(SI->getValueOperand()->getType()))
5471 continue;
5472 // Skip the store if it already has a tree entry.
5473 if (getTreeEntry(U))
5474 continue;
5475
5476 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5477 auto &StoresVec = PtrToStoresMap[Ptr];
5478 // For now just keep one store per pointer object per lane.
5479 // TODO: Extend this to support multiple stores per pointer per lane
5480 if (StoresVec.size() > Lane)
5481 continue;
5482 // Skip if in different BBs.
5483 if (!StoresVec.empty() &&
5484 SI->getParent() != StoresVec.back()->getParent())
5485 continue;
5486 // Make sure that the stores are of the same type.
5487 if (!StoresVec.empty() &&
5488 SI->getValueOperand()->getType() !=
5489 StoresVec.back()->getValueOperand()->getType())
5490 continue;
5491 StoresVec.push_back(SI);
5492 }
5493 }
5494 return PtrToStoresMap;
5495}
5496
5497bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5498 OrdersType &ReorderIndices) const {
5499 // We check whether the stores in StoresVec can form a vector by sorting them
5500 // and checking whether they are consecutive.
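// For example, stores whose element offsets relative to the first store are
// {0, 2, 1, 3} sort into the consecutive sequence 0..3, and the resulting
// ReorderIndices is {0, 2, 1, 3}.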
5501
5502 // To avoid calling getPointersDiff() while sorting we create a vector of
5503 // pairs {store, offset from first} and sort this instead.
5504 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5505 StoreInst *S0 = StoresVec[0];
5506 StoreOffsetVec[0] = {S0, 0};
5507 Type *S0Ty = S0->getValueOperand()->getType();
5508 Value *S0Ptr = S0->getPointerOperand();
5509 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5510 StoreInst *SI = StoresVec[Idx];
5511 std::optional<int> Diff =
5512 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5513 SI->getPointerOperand(), *DL, *SE,
5514 /*StrictCheck=*/true);
5515 // We failed to compare the pointers so just abandon this StoresVec.
5516 if (!Diff)
5517 return false;
5518 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5519 }
5520
5521 // Sort the vector based on the pointers. We create a copy because we may
5522 // need the original later for calculating the reorder (shuffle) indices.
5523 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5524 const std::pair<StoreInst *, int> &Pair2) {
5525 int Offset1 = Pair1.second;
5526 int Offset2 = Pair2.second;
5527 return Offset1 < Offset2;
5528 });
5529
5530 // Check if the stores are consecutive by checking if their difference is 1.
5531 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5532 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5533 return false;
5534
5535 // Calculate the shuffle indices according to their offset against the sorted
5536 // StoreOffsetVec.
5537 ReorderIndices.reserve(StoresVec.size());
5538 for (StoreInst *SI : StoresVec) {
5539 unsigned Idx = find_if(StoreOffsetVec,
5540 [SI](const std::pair<StoreInst *, int> &Pair) {
5541 return Pair.first == SI;
5542 }) -
5543 StoreOffsetVec.begin();
5544 ReorderIndices.push_back(Idx);
5545 }
5546 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
5547 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
5548 // same convention here.
5549 auto IsIdentityOrder = [](const OrdersType &Order) {
5550 for (unsigned Idx : seq<unsigned>(0, Order.size()))
5551 if (Idx != Order[Idx])
5552 return false;
5553 return true;
5554 };
5555 if (IsIdentityOrder(ReorderIndices))
5556 ReorderIndices.clear();
5557
5558 return true;
5559}
5560
5561#ifndef NDEBUG
5562 static void dumpOrder(const BoUpSLP::OrdersType &Order) {
5563 for (unsigned Idx : Order)
5564 dbgs() << Idx << ", ";
5565 dbgs() << "\n";
5566}
5567#endif
5568
5569 SmallVector<BoUpSLP::OrdersType, 1>
5570 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
5571 unsigned NumLanes = TE->Scalars.size();
5572
5573 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
5574 collectUserStores(TE);
5575
5576 // Holds the reorder indices for each candidate store vector that is a user of
5577 // the current TreeEntry.
5578 SmallVector<OrdersType, 1> ExternalReorderIndices;
5579
5580 // Now inspect the stores collected per pointer and look for vectorization
5581 // candidates. For each candidate calculate the reorder index vector and push
5582 // it into `ExternalReorderIndices`
5583 for (const auto &Pair : PtrToStoresMap) {
5584 auto &StoresVec = Pair.second;
5585 // If we have fewer than NumLanes stores, then we can't form a vector.
5586 if (StoresVec.size() != NumLanes)
5587 continue;
5588
5589 // If the stores are not consecutive then abandon this StoresVec.
5590 OrdersType ReorderIndices;
5591 if (!canFormVector(StoresVec, ReorderIndices))
5592 continue;
5593
5594 // We now know that the scalars in StoresVec can form a vector instruction,
5595 // so set the reorder indices.
5596 ExternalReorderIndices.push_back(ReorderIndices);
5597 }
5598 return ExternalReorderIndices;
5599}
5600
5601 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
5602 const SmallDenseSet<Value *> &UserIgnoreLst) {
5603 deleteTree();
5604 UserIgnoreList = &UserIgnoreLst;
5605 if (!allSameType(Roots))
5606 return;
5607 buildTree_rec(Roots, 0, EdgeInfo());
5608}
5609
5610 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
5611 deleteTree();
5612 if (!allSameType(Roots))
5613 return;
5614 buildTree_rec(Roots, 0, EdgeInfo());
5615}
5616
5617/// \return true if the specified list of values has only one instruction that
5618/// requires scheduling, false otherwise.
5619#ifndef NDEBUG
5620 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
5621 Value *NeedsScheduling = nullptr;
5622 for (Value *V : VL) {
5623 if (doesNotNeedToBeScheduled(V))
5624 continue;
5625 if (!NeedsScheduling) {
5626 NeedsScheduling = V;
5627 continue;
5628 }
5629 return false;
5630 }
5631 return NeedsScheduling;
5632}
5633#endif
5634
5635/// Generates key/subkey pair for the given value to provide effective sorting
5636/// of the values and better detection of the vectorizable values sequences. The
5637/// keys/subkeys can be used for better sorting of the values themselves (keys)
5638/// and in values subgroups (subkeys).
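/// For example, two simple loads get the same key (value kind, type and load
/// opcode) and a subkey produced by \p LoadsSubkeyGenerator, so loads that are
/// close in memory can be grouped, while an integer division by a non-constant
/// divisor gets a per-instruction subkey and is never grouped.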
5639static std::pair<size_t, size_t> generateKeySubkey(
5640 Value *V, const TargetLibraryInfo *TLI,
5641 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
5642 bool AllowAlternate) {
5643 hash_code Key = hash_value(V->getValueID() + 2);
5644 hash_code SubKey = hash_value(0);
5645 // Sort the loads by the distance between the pointers.
5646 if (auto *LI = dyn_cast<LoadInst>(V)) {
5647 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
5648 if (LI->isSimple())
5649 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
5650 else
5651 Key = SubKey = hash_value(LI);
5652 } else if (isVectorLikeInstWithConstOps(V)) {
5653 // Sort extracts by the vector operands.
5654 if (isa<ExtractElementInst, UndefValue>(V))
5655 Key = hash_value(Value::UndefValueVal + 1);
5656 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
5657 if (!isUndefVector(EI->getVectorOperand()).all() &&
5658 !isa<UndefValue>(EI->getIndexOperand()))
5659 SubKey = hash_value(EI->getVectorOperand());
5660 }
5661 } else if (auto *I = dyn_cast<Instruction>(V)) {
5662 // Sort other instructions just by the opcodes except for CMPInst.
5663 // For CMP also sort by the predicate kind.
5664 if ((isa<BinaryOperator, CastInst>(I)) &&
5665 isValidForAlternation(I->getOpcode())) {
5666 if (AllowAlternate)
5667 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
5668 else
5669 Key = hash_combine(hash_value(I->getOpcode()), Key);
5670 SubKey = hash_combine(
5671 hash_value(I->getOpcode()), hash_value(I->getType()),
5672 hash_value(isa<BinaryOperator>(I)
5673 ? I->getType()
5674 : cast<CastInst>(I)->getOperand(0)->getType()));
5675 // For casts, look through the only operand to improve compile time.
5676 if (isa<CastInst>(I)) {
5677 std::pair<size_t, size_t> OpVals =
5678 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
5679 /*AllowAlternate=*/true);
5680 Key = hash_combine(OpVals.first, Key);
5681 SubKey = hash_combine(OpVals.first, SubKey);
5682 }
5683 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
5684 CmpInst::Predicate Pred = CI->getPredicate();
5685 if (CI->isCommutative())
5686 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
5687 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
5688 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
5689 hash_value(SwapPred),
5690 hash_value(CI->getOperand(0)->getType()));
5691 } else if (auto *Call = dyn_cast<CallInst>(I)) {
5692 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
5693 if (isTriviallyVectorizable(ID)) {
5694 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
5695 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
5696 SubKey = hash_combine(hash_value(I->getOpcode()),
5697 hash_value(Call->getCalledFunction()));
5698 } else {
5699 Key = hash_combine(hash_value(Call), Key);
5700 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
5701 }
5702 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
5703 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
5704 hash_value(Op.Tag), SubKey);
5705 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
5706 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5707 SubKey = hash_value(Gep->getPointerOperand());
5708 else
5709 SubKey = hash_value(Gep);
5710 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
5711 !isa<ConstantInt>(I->getOperand(1))) {
5712 // Do not try to vectorize instructions with potentially high cost.
5713 SubKey = hash_value(I);
5714 } else {
5715 SubKey = hash_value(I->getOpcode());
5716 }
5717 Key = hash_combine(hash_value(I->getParent()), Key);
5718 }
5719 return std::make_pair(Key, SubKey);
5720}
5721
5722/// Checks if the specified instruction \p I is an alternate operation for
5723/// the given \p MainOp and \p AltOp instructions.
5724static bool isAlternateInstruction(const Instruction *I,
5725 const Instruction *MainOp,
5726 const Instruction *AltOp,
5727 const TargetLibraryInfo &TLI);
5728
5729BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
5730 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
5731 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
5732 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
5733
5734 unsigned ShuffleOrOp =
5735 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
5736 auto *VL0 = cast<Instruction>(S.OpValue);
5737 switch (ShuffleOrOp) {
5738 case Instruction::PHI: {
5739 // Check for terminator values (e.g. invoke).
5740 for (Value *V : VL)
5741 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
5742 Instruction *Term = dyn_cast<Instruction>(Incoming);
5743 if (Term && Term->isTerminator()) {
5745 << "SLP: Need to swizzle PHINodes (terminator use).\n");
5746 return TreeEntry::NeedToGather;
5747 }
5748 }
5749
5750 return TreeEntry::Vectorize;
5751 }
5752 case Instruction::ExtractValue:
5753 case Instruction::ExtractElement: {
5754 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
5755 if (Reuse || !CurrentOrder.empty())
5756 return TreeEntry::Vectorize;
5757 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
5758 return TreeEntry::NeedToGather;
5759 }
5760 case Instruction::InsertElement: {
5761 // Check that we have a buildvector and not a shuffle of 2 or more
5762 // different vectors.
5763 ValueSet SourceVectors;
5764 for (Value *V : VL) {
5765 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
5766 assert(getInsertIndex(V) != std::nullopt &&
5767 "Non-constant or undef index?");
5768 }
5769
5770 if (count_if(VL, [&SourceVectors](Value *V) {
5771 return !SourceVectors.contains(V);
5772 }) >= 2) {
5773 // Found 2nd source vector - cancel.
5774 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
5775 "different source vectors.\n");
5776 return TreeEntry::NeedToGather;
5777 }
5778
5779 return TreeEntry::Vectorize;
5780 }
5781 case Instruction::Load: {
5782 // Check that a vectorized load would load the same memory as a scalar
5783 // load. For example, we don't want to vectorize loads that are smaller
5784 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
5785 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
5786 // from such a struct, we read/write packed bits disagreeing with the
5787 // unvectorized version.
5788 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
5789 case LoadsState::Vectorize:
5790 return TreeEntry::Vectorize;
5791 case LoadsState::ScatterVectorize:
5792 return TreeEntry::ScatterVectorize;
5793 case LoadsState::StridedVectorize:
5794 return TreeEntry::StridedVectorize;
5795 case LoadsState::Gather:
5796#ifndef NDEBUG
5797 Type *ScalarTy = VL0->getType();
5798 if (DL->getTypeSizeInBits(ScalarTy) !=
5799 DL->getTypeAllocSizeInBits(ScalarTy))
5800 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
5801 else if (any_of(VL,
5802 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
5803 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
5804 else
5805 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
5806#endif // NDEBUG
5807 return TreeEntry::NeedToGather;
5808 }
5809 llvm_unreachable("Unexpected state of loads");
5810 }
5811 case Instruction::ZExt:
5812 case Instruction::SExt:
5813 case Instruction::FPToUI:
5814 case Instruction::FPToSI:
5815 case Instruction::FPExt:
5816 case Instruction::PtrToInt:
5817 case Instruction::IntToPtr:
5818 case Instruction::SIToFP:
5819 case Instruction::UIToFP:
5820 case Instruction::Trunc:
5821 case Instruction::FPTrunc:
5822 case Instruction::BitCast: {
5823 Type *SrcTy = VL0->getOperand(0)->getType();
5824 for (Value *V : VL) {
5825 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
5826 if (Ty != SrcTy || !isValidElementType(Ty)) {
5827 LLVM_DEBUG(
5828 dbgs() << "SLP: Gathering casts with different src types.\n");
5829 return TreeEntry::NeedToGather;
5830 }
5831 }
5832 return TreeEntry::Vectorize;
5833 }
5834 case Instruction::ICmp:
5835 case Instruction::FCmp: {
5836 // Check that all of the compares have the same predicate.
5837 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
5838 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
5839 Type *ComparedTy = VL0->getOperand(0)->getType();
5840 for (Value *V : VL) {
5841 CmpInst *Cmp = cast<CmpInst>(V);
5842 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
5843 Cmp->getOperand(0)->getType() != ComparedTy) {
5844 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
5845 return TreeEntry::NeedToGather;
5846 }
5847 }
5848 return TreeEntry::Vectorize;
5849 }
5850 case Instruction::Select:
5851 case Instruction::FNeg:
5852 case Instruction::Add:
5853 case Instruction::FAdd:
5854 case Instruction::Sub:
5855 case Instruction::FSub:
5856 case Instruction::Mul:
5857 case Instruction::FMul:
5858 case Instruction::UDiv:
5859 case Instruction::SDiv:
5860 case Instruction::FDiv:
5861 case Instruction::URem:
5862 case Instruction::SRem:
5863 case Instruction::FRem:
5864 case Instruction::Shl:
5865 case Instruction::LShr:
5866 case Instruction::AShr:
5867 case Instruction::And:
5868 case Instruction::Or:
5869 case Instruction::Xor:
5870 return TreeEntry::Vectorize;
5871 case Instruction::GetElementPtr: {
5872 // We don't combine GEPs with complicated (nested) indexing.
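// For example, a two-operand GEP such as "getelementptr i32, ptr %p, i64 %i"
// is acceptable, while "getelementptr [4 x i32], ptr %p, i64 0, i64 %i" has
// nested indexing and forces gathering.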
5873 for (Value *V : VL) {
5874 auto *I = dyn_cast<GetElementPtrInst>(V);
5875 if (!I)
5876 continue;
5877 if (I->getNumOperands() != 2) {
5878 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
5879 return TreeEntry::NeedToGather;
5880 }
5881 }
5882
5883 // We can't combine several GEPs into one vector if they operate on
5884 // different types.
5885 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
5886 for (Value *V : VL) {
5887 auto *GEP = dyn_cast<GEPOperator>(V);
5888 if (!GEP)
5889 continue;
5890 Type *CurTy = GEP->getSourceElementType();
5891 if (Ty0 != CurTy) {
5892 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
5893 return TreeEntry::NeedToGather;
5894 }
5895 }
5896
5897 // We don't combine GEPs with non-constant indexes.
5898 Type *Ty1 = VL0->getOperand(1)->getType();
5899 for (Value *V : VL) {
5900 auto *I = dyn_cast<GetElementPtrInst>(V);
5901 if (!I)
5902 continue;
5903 auto *Op = I->getOperand(1);
5904 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
5905 (Op->getType() != Ty1 &&
5906 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
5907 Op->getType()->getScalarSizeInBits() >
5908 DL->getIndexSizeInBits(
5909 V->getType()->getPointerAddressSpace())))) {
5910 LLVM_DEBUG(
5911 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
5912 return TreeEntry::NeedToGather;
5913 }
5914 }
5915
5916 return TreeEntry::Vectorize;
5917 }
5918 case Instruction::Store: {
5919 // Check if the stores are consecutive or if we need to swizzle them.
5920 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
5921 // Avoid types that are padded when being allocated as scalars, while
5922 // being packed together in a vector (such as i1).
5923 if (DL->getTypeSizeInBits(ScalarTy) !=
5924 DL->getTypeAllocSizeInBits(ScalarTy)) {
5925 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
5926 return TreeEntry::NeedToGather;
5927 }
5928 // Make sure all stores in the bundle are simple - we can't vectorize
5929 // atomic or volatile stores.
5930 for (Value *V : VL) {
5931 auto *SI = cast<StoreInst>(V);
5932 if (!SI->isSimple()) {
5933 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
5934 return TreeEntry::NeedToGather;
5935 }
5936 PointerOps.push_back(SI->getPointerOperand());
5937 }
5938
5939 // Check the order of pointer operands.
5940 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
5941 Value *Ptr0;
5942 Value *PtrN;
5943 if (CurrentOrder.empty()) {
5944 Ptr0 = PointerOps.front();
5945 PtrN = PointerOps.back();
5946 } else {
5947 Ptr0 = PointerOps[CurrentOrder.front()];
5948 PtrN = PointerOps[CurrentOrder.back()];
5949 }
5950 std::optional<int> Dist =
5951 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5952 // Check that the sorted pointer operands are consecutive.
5953 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
5954 return TreeEntry::Vectorize;
5955 }
5956
5957 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
5958 return TreeEntry::NeedToGather;
5959 }
5960 case Instruction::Call: {
5961 // Check if the calls are all to the same vectorizable intrinsic or
5962 // library function.
5963 CallInst *CI = cast<CallInst>(VL0);
5964 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
5965
5966 VFShape Shape = VFShape::get(
5967 CI->getFunctionType(),
5968 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
5969 false /*HasGlobalPred*/);
5970 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
5971
5972 if (!VecFunc && !isTriviallyVectorizable(ID)) {
5973 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
5974 return TreeEntry::NeedToGather;
5975 }
5976 Function *F = CI->getCalledFunction();
5977 unsigned NumArgs = CI->arg_size();
5978 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
5979 for (unsigned J = 0; J != NumArgs; ++J)
5980 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
5981 ScalarArgs[J] = CI->getArgOperand(J);
5982 for (Value *V : VL) {
5983 CallInst *CI2 = dyn_cast<CallInst>(V);
5984 if (!CI2 || CI2->getCalledFunction() != F ||
5985 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
5986 (VecFunc &&
5987 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
5988 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
5989 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
5990 << "\n");
5991 return TreeEntry::NeedToGather;
5992 }
5993 // Some intrinsics have scalar arguments and should be same in order for
5994 // them to be vectorized.
5995 for (unsigned J = 0; J != NumArgs; ++J) {
5996 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
5997 Value *A1J = CI2->getArgOperand(J);
5998 if (ScalarArgs[J] != A1J) {
6000 << "SLP: mismatched arguments in call:" << *CI
6001 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6002 return TreeEntry::NeedToGather;
6003 }
6004 }
6005 }
6006 // Verify that the bundle operands are identical between the two calls.
6007 if (CI->hasOperandBundles() &&
6008 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6009 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6010 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6011 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6012 << "!=" << *V << '\n');
6013 return TreeEntry::NeedToGather;
6014 }
6015 }
6016
6017 return TreeEntry::Vectorize;
6018 }
6019 case Instruction::ShuffleVector: {
6020 // If this is not an alternate sequence of opcodes like add-sub
6021 // then do not vectorize this instruction.
6022 if (!S.isAltShuffle()) {
6023 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6024 return TreeEntry::NeedToGather;
6025 }
6026 return TreeEntry::Vectorize;
6027 }
6028 default:
6029 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6030 return TreeEntry::NeedToGather;
6031 }
6032}
6033
6034void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6035 const EdgeInfo &UserTreeIdx) {
6036 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6037
6038 SmallVector<int> ReuseShuffleIndicies;
6039 SmallVector<Value *> UniqueValues;
6040 SmallVector<Value *> NonUniqueValueVL;
6041 auto TryToFindDuplicates = [&](const InstructionsState &S,
6042 bool DoNotFail = false) {
6043 // Check that every instruction appears once in this bundle.
6044 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6045 for (Value *V : VL) {
6046 if (isConstant(V)) {
6047 ReuseShuffleIndicies.emplace_back(
6048 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6049 UniqueValues.emplace_back(V);
6050 continue;
6051 }
6052 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6053 ReuseShuffleIndicies.emplace_back(Res.first->second);
6054 if (Res.second)
6055 UniqueValues.emplace_back(V);
6056 }
6057 size_t NumUniqueScalarValues = UniqueValues.size();
6058 if (NumUniqueScalarValues == VL.size()) {
6059 ReuseShuffleIndicies.clear();
6060 } else {
6061 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6062 if (NumUniqueScalarValues <= 1 ||
6063 (UniquePositions.size() == 1 && all_of(UniqueValues,
6064 [](Value *V) {
6065 return isa<UndefValue>(V) ||
6066 !isConstant(V);
6067 })) ||
6068 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6069 if (DoNotFail && UniquePositions.size() > 1 &&
6070 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6071 all_of(UniqueValues, [=](Value *V) {
6072 return isa<ExtractElementInst>(V) ||
6073 areAllUsersVectorized(cast<Instruction>(V),
6074 UserIgnoreList);
6075 })) {
6076 unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6077 if (PWSz == VL.size()) {
6078 ReuseShuffleIndicies.clear();
6079 } else {
6080 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6081 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6082 UniqueValues.back());
6083 VL = NonUniqueValueVL;
6084 }
6085 return true;
6086 }
6087 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6088 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6089 return false;
6090 }
6091 VL = UniqueValues;
6092 }
6093 return true;
6094 };
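// Illustrative example (hypothetical scalars, not taken from a real IR
// module): for a bundle VL = {A, B, A, C} the loop above produces
// UniqueValues = {A, B, C} and ReuseShuffleIndicies = {0, 1, 0, 2}, i.e. the
// mask that rebuilds the original bundle from the deduplicated scalars.
// Because 3 unique scalars are not a power of two, the code then either pads
// the unique scalars (DoNotFail path) or gives up and gathers the bundle.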
6095
6096 InstructionsState S = getSameOpcode(VL, *TLI);
6097
6098 // Don't vectorize ephemeral values.
6099 if (!EphValues.empty()) {
6100 for (Value *V : VL) {
6101 if (EphValues.count(V)) {
6102 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6103 << ") is ephemeral.\n");
6104 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6105 return;
6106 }
6107 }
6108 }
6109
6110 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6111 // a load), in which case peek through to include it in the tree, without
6112 // ballooning over-budget.
6113 if (Depth >= RecursionMaxDepth &&
6114 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6115 VL.size() >= 4 &&
6116 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6117 return match(I,
6118 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
6119 cast<Instruction>(I)->getOpcode() ==
6120 cast<Instruction>(S.MainOp)->getOpcode();
6121 })))) {
6122 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6123 if (TryToFindDuplicates(S))
6124 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6125 ReuseShuffleIndicies);
6126 return;
6127 }
6128
6129 // Don't handle scalable vectors
6130 if (S.getOpcode() == Instruction::ExtractElement &&
6131 isa<ScalableVectorType>(
6132 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6133 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6134 if (TryToFindDuplicates(S))
6135 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6136 ReuseShuffleIndicies);
6137 return;
6138 }
6139
6140 // Don't handle vectors.
6141 if (S.OpValue->getType()->isVectorTy() &&
6142 !isa<InsertElementInst>(S.OpValue)) {
6143 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6144 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6145 return;
6146 }
6147
6148 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6149 if (SI->getValueOperand()->getType()->isVectorTy()) {
6150 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6151 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6152 return;
6153 }
6154
6155 // If all of the operands are identical or constant we have a simple solution.
6156 // If we deal with insert/extract instructions, they all must have constant
6157 // indices, otherwise we should gather them, not try to vectorize.
6158 // If this is an alternate-op node with 2 elements and gathered operands, do
6159 // not vectorize.
6160 auto &&NotProfitableForVectorization = [&S, this,
6161 Depth](ArrayRef<Value *> VL) {
6162 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6163 return false;
6164 if (VectorizableTree.size() < MinTreeSize)
6165 return false;
6166 if (Depth >= RecursionMaxDepth - 1)
6167 return true;
6168 // Check if all operands are extracts, part of vector node or can build a
6169 // regular vectorize node.
6170 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6171 for (Value *V : VL) {
6172 auto *I = cast<Instruction>(V);
6173 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6174 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6175 }));
6176 }
6177 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6178 if ((IsCommutative &&
6179 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6180 (!IsCommutative &&
6181 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6182 return true;
6183 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6184 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6185 auto *I1 = cast<Instruction>(VL.front());
6186 auto *I2 = cast<Instruction>(VL.back());
6187 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6188 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6189 I2->getOperand(Op));
6190 if (static_cast<unsigned>(count_if(
6191 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6192 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6193 })) >= S.MainOp->getNumOperands() / 2)
6194 return false;
6195 if (S.MainOp->getNumOperands() > 2)
6196 return true;
6197 if (IsCommutative) {
6198 // Check permuted operands.
6199 Candidates.clear();
6200 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6201 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6202 I2->getOperand((Op + 1) % E));
6203 if (any_of(
6204 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6205 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6206 }))
6207 return false;
6208 }
6209 return true;
6210 };
6211 SmallVector<unsigned> SortedIndices;
6212 BasicBlock *BB = nullptr;
6213 bool IsScatterVectorizeUserTE =
6214 UserTreeIdx.UserTE &&
6215 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6216 bool AreAllSameInsts =
6217 (S.getOpcode() && allSameBlock(VL)) ||
6218 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6219 VL.size() > 2 &&
6220 all_of(VL,
6221 [&BB](Value *V) {
6222 auto *I = dyn_cast<GetElementPtrInst>(V);
6223 if (!I)
6224 return doesNotNeedToBeScheduled(V);
6225 if (!BB)
6226 BB = I->getParent();
6227 return BB == I->getParent() && I->getNumOperands() == 2;
6228 }) &&
6229 BB &&
6230 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6231 SortedIndices));
6232 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6233 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6234 S.OpValue) &&
6235 !all_of(VL, isVectorLikeInstWithConstOps)) ||
6236 NotProfitableForVectorization(VL)) {
6237 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6238 if (TryToFindDuplicates(S))
6239 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6240 ReuseShuffleIndicies);
6241 return;
6242 }
6243
6244 // We now know that this is a vector of instructions of the same type from
6245 // the same block.
6246
6247 // Check if this is a duplicate of another entry.
6248 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6249 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6250 if (!E->isSame(VL)) {
6251 auto It = MultiNodeScalars.find(S.OpValue);
6252 if (It != MultiNodeScalars.end()) {
6253 auto *TEIt = find_if(It->getSecond(),
6254 [&](TreeEntry *ME) { return ME->isSame(VL); });
6255 if (TEIt != It->getSecond().end())
6256 E = *TEIt;
6257 else
6258 E = nullptr;
6259 } else {
6260 E = nullptr;
6261 }
6262 }
6263 if (!E) {
6264 if (!doesNotNeedToBeScheduled(S.OpValue)) {
6265 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6266 if (TryToFindDuplicates(S))
6267 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6268 ReuseShuffleIndicies);
6269 return;
6270 }
6271 } else {
6272 // Record the reuse of the tree node. FIXME, currently this is only used
6273 // to properly draw the graph rather than for the actual vectorization.
6274 E->UserTreeIndices.push_back(UserTreeIdx);
6275 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6276 << ".\n");
6277 return;
6278 }
6279 }
6280
6281 // Check that none of the instructions in the bundle are already in the tree.
6282 for (Value *V : VL) {
6283 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6284 doesNotNeedToBeScheduled(V))
6285 continue;
6286 if (getTreeEntry(V)) {
6287 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6288 << ") is already in tree.\n");
6289 if (TryToFindDuplicates(S))
6290 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6291 ReuseShuffleIndicies);
6292 return;
6293 }
6294 }
6295
6296 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6297 if (UserIgnoreList && !UserIgnoreList->empty()) {
6298 for (Value *V : VL) {
6299 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6300 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6301 if (TryToFindDuplicates(S))
6302 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6303 ReuseShuffleIndicies);
6304 return;
6305 }
6306 }
6307 }
6308
6309 // Special processing for sorted pointers for ScatterVectorize node with
6310 // constant indices only.
6311 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6312 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6313 !(S.getOpcode() && allSameBlock(VL))) {
6314 assert(S.OpValue->getType()->isPointerTy() &&
6315 count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
6316 2 &&
6317 "Expected pointers only.");
6318 // Reset S to make it GetElementPtr kind of node.
6319 const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
6320 assert(It != VL.end() && "Expected at least one GEP.");
6321 S = getSameOpcode(*It, *TLI);
6322 }
6323
6324 // Check that all of the users of the scalars that we want to vectorize are
6325 // schedulable.
6326 auto *VL0 = cast<Instruction>(S.OpValue);
6327 BB = VL0->getParent();
6328
6329 if (!DT->isReachableFromEntry(BB)) {
6330 // Don't go into unreachable blocks. They may contain instructions with
6331 // dependency cycles which confuse the final scheduling.
6332 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6333 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6334 return;
6335 }
6336
6337 // Don't go into catchswitch blocks, which can happen with PHIs.
6338 // Such blocks can only have PHIs and the catchswitch. There is no
6339 // place to insert a shuffle if we need to, so just avoid that issue.
6340 if (isa<CatchSwitchInst>(BB->getTerminator())) {
6341 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
6342 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6343 return;
6344 }
6345
6346 // Check that every instruction appears once in this bundle.
6347 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
6348 return;
6349
6350 // Perform specific checks for each particular instruction kind.
6351 OrdersType CurrentOrder;
6352 SmallVector<Value *> PointerOps;
6353 TreeEntry::EntryState State = getScalarsVectorizationState(
6354 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6355 if (State == TreeEntry::NeedToGather) {
6356 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6357 ReuseShuffleIndicies);
6358 return;
6359 }
6360
6361 auto &BSRef = BlocksSchedules[BB];
6362 if (!BSRef)
6363 BSRef = std::make_unique<BlockScheduling>(BB);
6364
6365 BlockScheduling &BS = *BSRef;
6366
6367 std::optional<ScheduleData *> Bundle =
6368 BS.tryScheduleBundle(UniqueValues, this, S);
6369#ifdef EXPENSIVE_CHECKS
6370 // Make sure we didn't break any internal invariants
6371 BS.verify();
6372#endif
6373 if (!Bundle) {
6374 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
6375 assert((!BS.getScheduleData(VL0) ||
6376 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6377 "tryScheduleBundle should cancelScheduling on failure");
6378 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6379 ReuseShuffleIndicies);
6380 return;
6381 }
6382 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
6383
6384 unsigned ShuffleOrOp = S.isAltShuffle() ?
6385 (unsigned) Instruction::ShuffleVector : S.getOpcode();
6386 switch (ShuffleOrOp) {
6387 case Instruction::PHI: {
6388 auto *PH = cast<PHINode>(VL0);
6389
6390 TreeEntry *TE =
6391 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
6392 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
6393
6394 // Keeps the reordered operands to avoid code duplication.
6395 SmallVector<ValueList, 2> OperandsVec;
6396 for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
6397 if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
6398 ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));
6399 TE->setOperand(I, Operands);
6400 OperandsVec.push_back(Operands);
6401 continue;
6402 }
6403 ValueList Operands;
6404 // Prepare the operand vector.
6405 for (Value *V : VL)
6406 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
6407 PH->getIncomingBlock(I)));
6408 TE->setOperand(I, Operands);
6409 OperandsVec.push_back(Operands);
6410 }
6411 for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
6412 buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
6413 return;
6414 }
6415 case Instruction::ExtractValue:
6416 case Instruction::ExtractElement: {
6417 if (CurrentOrder.empty()) {
6418 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
6419 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6420 ReuseShuffleIndicies);
6421 // This is a special case, as it does not gather, but at the same time
6422 // we are not extending buildTree_rec() towards the operands.
6423 ValueList Op0;
6424 Op0.assign(VL.size(), VL0->getOperand(0));
6425 VectorizableTree.back()->setOperand(0, Op0);
6426 return;
6427 }
6428 LLVM_DEBUG({
6429 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
6430 "with order";
6431 for (unsigned Idx : CurrentOrder)
6432 dbgs() << " " << Idx;
6433 dbgs() << "\n";
6434 });
6435 fixupOrderingIndices(CurrentOrder);
6436 // Insert new order with initial value 0, if it does not exist,
6437 // otherwise return the iterator to the existing one.
6438 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6439 ReuseShuffleIndicies, CurrentOrder);
6440 // This is a special case, as it does not gather, but at the same time
6441 // we are not extending buildTree_rec() towards the operands.
6442 ValueList Op0;
6443 Op0.assign(VL.size(), VL0->getOperand(0));
6444 VectorizableTree.back()->setOperand(0, Op0);
6445 return;
6446 }
6447 case Instruction::InsertElement: {
6448 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
6449
6450 auto OrdCompare = [](const std::pair<int, int> &P1,
6451 const std::pair<int, int> &P2) {
6452 return P1.first > P2.first;
6453 };
6454 std::priority_queue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
6455 decltype(OrdCompare)>
6456 Indices(OrdCompare);
6457 for (int I = 0, E = VL.size(); I < E; ++I) {
6458 unsigned Idx = *getInsertIndex(VL[I]);
6459 Indices.emplace(Idx, I);
6460 }
6461 OrdersType CurrentOrder(VL.size(), VL.size());
6462 bool IsIdentity = true;
6463 for (int I = 0, E = VL.size(); I < E; ++I) {
6464 CurrentOrder[Indices.top().second] = I;
6465 IsIdentity &= Indices.top().second == I;
6466 Indices.pop();
6467 }
6468 if (IsIdentity)
6469 CurrentOrder.clear();
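// Illustrative example (hypothetical bundle): for inserts into lanes
// {2, 0, 3, 1} the priority queue pops the elements in increasing lane
// order, so CurrentOrder becomes {2, 0, 3, 1}; if the inserts already
// appear in lane order {0, 1, 2, 3}, IsIdentity stays true and
// CurrentOrder is cleared.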
6470 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6471 std::nullopt, CurrentOrder);
6472 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
6473
6474 constexpr int NumOps = 2;
6475 ValueList VectorOperands[NumOps];
6476 for (int I = 0; I < NumOps; ++I) {
6477 for (Value *V : VL)
6478 VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
6479
6480 TE->setOperand(I, VectorOperands[I]);
6481 }
6482 buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
6483 return;
6484 }
6485 case Instruction::Load: {
6486 // Check that a vectorized load would load the same memory as a scalar
6487 // load. For example, we don't want to vectorize loads that are smaller
6488 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6489 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6490 // from such a struct, we read/write packed bits disagreeing with the
6491 // unvectorized version.
6492 TreeEntry *TE = nullptr;
6493 fixupOrderingIndices(CurrentOrder);
6494 switch (State) {
6495 case TreeEntry::Vectorize:
6496 if (CurrentOrder.empty()) {
6497 // Original loads are consecutive and do not require reordering.
6498 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6499 ReuseShuffleIndicies);
6500 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
6501 } else {
6502 // Need to reorder.
6503 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6504 ReuseShuffleIndicies, CurrentOrder);
6505 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
6506 }
6507 TE->setOperandsInOrder();
6508 break;
6509 case TreeEntry::StridedVectorize:
6510 // Vectorizing non-consecutive loads as strided loads.
6511 if (CurrentOrder.empty()) {
6512 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6513 UserTreeIdx, ReuseShuffleIndicies);
6514 } else {
6515 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6516 UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
6517 }
6518 TE->setOperandsInOrder();
6519 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
6520 break;
6521 case TreeEntry::ScatterVectorize:
6522 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
6523 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6524 UserTreeIdx, ReuseShuffleIndicies);
6525 TE->setOperandsInOrder();
6526 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
6527 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
6528 break;
6529 case TreeEntry::NeedToGather:
6530 llvm_unreachable("Unexpected loads state.");
6531 }
6532 return;
6533 }
6534 case Instruction::ZExt:
6535 case Instruction::SExt:
6536 case Instruction::FPToUI:
6537 case Instruction::FPToSI:
6538 case Instruction::FPExt:
6539 case Instruction::PtrToInt:
6540 case Instruction::IntToPtr:
6541 case Instruction::SIToFP:
6542 case Instruction::UIToFP:
6543 case Instruction::Trunc:
6544 case Instruction::FPTrunc:
6545 case Instruction::BitCast: {
6546 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6547 ReuseShuffleIndicies);
6548 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
6549
6550 TE->setOperandsInOrder();
6551 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6552 ValueList Operands;
6553 // Prepare the operand vector.
6554 for (Value *V : VL)
6555 Operands.push_back(cast<Instruction>(V)->getOperand(I));
6556
6557 buildTree_rec(Operands, Depth + 1, {TE, I});
6558 }
6559 return;
6560 }
6561 case Instruction::ICmp:
6562 case Instruction::FCmp: {
6563 // Check that all of the compares have the same predicate.
6564 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6565 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6566 ReuseShuffleIndicies);
6567 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
6568
6569 ValueList Left, Right;
6570 if (cast<CmpInst>(VL0)->isCommutative()) {
6571 // Commutative predicate - collect + sort operands of the instructions
6572 // so that each side is more likely to have the same opcode.
6573 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
6574 "Commutative Predicate mismatch");
6575 reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this);
6576 } else {
6577 // Collect operands - commute if it uses the swapped predicate.
6578 for (Value *V : VL) {
6579 auto *Cmp = cast<CmpInst>(V);
6580 Value *LHS = Cmp->getOperand(0);
6581 Value *RHS = Cmp->getOperand(1);
6582 if (Cmp->getPredicate() != P0)
6583 std::swap(LHS, RHS);
6584 Left.push_back(LHS);
6585 Right.push_back(RHS);
6586 }
6587 }
6588 TE->setOperand(0, Left);
6589 TE->setOperand(1, Right);
6590 buildTree_rec(Left, Depth + 1, {TE, 0});
6591 buildTree_rec(Right, Depth + 1, {TE, 1});
6592 return;
6593 }
6594 case Instruction::Select:
6595 case Instruction::FNeg:
6596 case Instruction::Add:
6597 case Instruction::FAdd:
6598 case Instruction::Sub:
6599 case Instruction::FSub:
6600 case Instruction::Mul:
6601 case Instruction::FMul:
6602 case Instruction::UDiv:
6603 case Instruction::SDiv:
6604 case Instruction::FDiv:
6605 case Instruction::URem:
6606 case Instruction::SRem:
6607 case Instruction::FRem:
6608 case Instruction::Shl:
6609 case Instruction::LShr:
6610 case Instruction::AShr:
6611 case Instruction::And:
6612 case Instruction::Or:
6613 case Instruction::Xor: {
6614 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6615 ReuseShuffleIndicies);
6616 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
6617
6618 // Sort operands of the instructions so that each side is more likely to
6619 // have the same opcode.
6620 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
6621 ValueList Left, Right;
6622 reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this);
6623 TE->setOperand(0, Left);
6624 TE->setOperand(1, Right);
6625 buildTree_rec(Left, Depth + 1, {TE, 0});
6626 buildTree_rec(Right, Depth + 1, {TE, 1});
6627 return;
6628 }
6629
6630 TE->setOperandsInOrder();
6631 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6632 ValueList Operands;
6633 // Prepare the operand vector.
6634 for (Value *V : VL)
6635 Operands.push_back(cast<Instruction>(V)->getOperand(I));
6636
6637 buildTree_rec(Operands, Depth + 1, {TE, I});
6638 }
6639 return;
6640 }
6641 case Instruction::GetElementPtr: {
6642 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6643 ReuseShuffleIndicies);
6644 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
6645 SmallVector<ValueList, 2> Operands(2);
6646 // Prepare the operand vector for pointer operands.
6647 for (Value *V : VL) {
6648 auto *GEP = dyn_cast<GetElementPtrInst>(V);
6649 if (!GEP) {
6650 Operands.front().push_back(V);
6651 continue;
6652 }
6653 Operands.front().push_back(GEP->getPointerOperand());
6654 }
6655 TE->setOperand(0, Operands.front());
6656 // Need to cast all indices to the same type before vectorization to
6657 // avoid a crash.
6658 // Required to be able to find correct matches between different gather
6659 // nodes and reuse the vectorized values rather than trying to gather them
6660 // again.
6661 int IndexIdx = 1;
6662 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
6663 Type *Ty = all_of(VL,
6664 [VL0Ty, IndexIdx](Value *V) {
6665 auto *GEP = dyn_cast<GetElementPtrInst>(V);
6666 if (!GEP)
6667 return true;
6668 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
6669 })
6670 ? VL0Ty
6671 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
6672 ->getPointerOperandType()
6673 ->getScalarType());
6674 // Prepare the operand vector.
6675 for (Value *V : VL) {
6676 auto *I = dyn_cast<GetElementPtrInst>(V);
6677 if (!I) {
6678 Operands.back().push_back(
6679 ConstantInt::get(Ty, 0, /*isSigned=*/false));
6680 continue;
6681 }
6682 auto *Op = I->getOperand(IndexIdx);
6683 auto *CI = dyn_cast<ConstantInt>(Op);
6684 if (!CI)
6685 Operands.back().push_back(Op);
6686 else
6687 Operands.back().push_back(ConstantFoldIntegerCast(
6688 CI, Ty, CI->getValue().isSignBitSet(), *DL));
6689 }
6690 TE->setOperand(IndexIdx, Operands.back());
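// Illustrative example: if one GEP in the bundle uses an i32 index and
// another uses an i64 index, Ty falls back to the DataLayout's pointer
// index type and every constant index above is folded to that common type,
// so a single vector GEP can be emitted later without a type mismatch.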
6691
6692 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
6693 buildTree_rec(Operands[I], Depth + 1, {TE, I});
6694 return;
6695 }
6696 case Instruction::Store: {
6697 // Check if the stores are consecutive or if we need to swizzle them.
6698 ValueList Operands(VL.size());
6699 auto *OIter = Operands.begin();
6700 for (Value *V : VL) {
6701 auto *SI = cast<StoreInst>(V);
6702 *OIter = SI->getValueOperand();
6703 ++OIter;
6704 }
6705 // Check that the sorted pointer operands are consecutive.
6706 if (CurrentOrder.empty()) {
6707 // Original stores are consecutive and do not require reordering.
6708 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6709 ReuseShuffleIndicies);
6710 TE->setOperandsInOrder();
6711 buildTree_rec(Operands, Depth + 1, {TE, 0});
6712 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
6713 } else {
6714 fixupOrderingIndices(CurrentOrder);
6715 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6716 ReuseShuffleIndicies, CurrentOrder);
6717 TE->setOperandsInOrder();
6718 buildTree_rec(Operands, Depth + 1, {TE, 0});
6719 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
6720 }
6721 return;
6722 }
6723 case Instruction::Call: {
6724 // Check if the calls are all to the same vectorizable intrinsic or
6725 // library function.
6726 CallInst *CI = cast<CallInst>(VL0);
6727 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6728
6729 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6730 ReuseShuffleIndicies);
6731 TE->setOperandsInOrder();
6732 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
6733 // For scalar operands, there is no need to create an entry since we do
6734 // not need to vectorize them.
6735 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
6736 continue;
6737 ValueList Operands;
6738 // Prepare the operand vector.
6739 for (Value *V : VL) {
6740 auto *CI2 = cast<CallInst>(V);
6741 Operands.push_back(CI2->getArgOperand(I));
6742 }
6743 buildTree_rec(Operands, Depth + 1, {TE, I});
6744 }
6745 return;
6746 }
6747 case Instruction::ShuffleVector: {
6748 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6749 ReuseShuffleIndicies);
6750 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
6751
6752 // Reorder operands if reordering would enable vectorization.
6753 auto *CI = dyn_cast<CmpInst>(VL0);
6754 if (isa<BinaryOperator>(VL0) || CI) {
6755 ValueList Left, Right;
6756 if (!CI || all_of(VL, [](Value *V) {
6757 return cast<CmpInst>(V)->isCommutative();
6758 })) {
6759 reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE,
6760 *this);
6761 } else {
6762 auto *MainCI = cast<CmpInst>(S.MainOp);
6763 auto *AltCI = cast<CmpInst>(S.AltOp);
6764 CmpInst::Predicate MainP = MainCI->getPredicate();
6765 CmpInst::Predicate AltP = AltCI->getPredicate();
6766 assert(MainP != AltP &&
6767 "Expected different main/alternate predicates.");
6768 // Collect operands - commute if it uses the swapped predicate or
6769 // alternate operation.
6770 for (Value *V : VL) {
6771 auto *Cmp = cast<CmpInst>(V);
6772 Value *LHS = Cmp->getOperand(0);
6773 Value *RHS = Cmp->getOperand(1);
6774
6775 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
6776 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
6777 std::swap(LHS, RHS);
6778 } else {
6779 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
6780 std::swap(LHS, RHS);
6781 }
6782 Left.push_back(LHS);
6783 Right.push_back(RHS);
6784 }
6785 }
6786 TE->setOperand(0, Left);
6787 TE->setOperand(1, Right);
6788 buildTree_rec(Left, Depth + 1, {TE, 0});
6789 buildTree_rec(Right, Depth + 1, {TE, 1});
6790 return;
6791 }
6792
6793 TE->setOperandsInOrder();
6794 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6795 ValueList Operands;
6796 // Prepare the operand vector.
6797 for (Value *V : VL)
6798 Operands.push_back(cast<Instruction>(V)->getOperand(I));
6799
6800 buildTree_rec(Operands, Depth + 1, {TE, I});
6801 }
6802 return;
6803 }
6804 default:
6805 break;
6806 }
6807 llvm_unreachable("Unexpected vectorization of the instructions.");
6808}
6809
6810unsigned BoUpSLP::canMapToVector(Type *T) const {
6811 unsigned N = 1;
6812 Type *EltTy = T;
6813
6814 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
6815 if (auto *ST = dyn_cast<StructType>(EltTy)) {
6816 // Check that struct is homogeneous.
6817 for (const auto *Ty : ST->elements())
6818 if (Ty != *ST->element_begin())
6819 return 0;
6820 N *= ST->getNumElements();
6821 EltTy = *ST->element_begin();
6822 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
6823 N *= AT->getNumElements();
6824 EltTy = AT->getElementType();
6825 } else {
6826 auto *VT = cast<FixedVectorType>(EltTy);
6827 N *= VT->getNumElements();
6828 EltTy = VT->getElementType();
6829 }
6830 }
6831
6832 if (!isValidElementType(EltTy))
6833 return 0;
6834 uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
6835 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
6836 VTSize != DL->getTypeStoreSizeInBits(T))
6837 return 0;
6838 return N;
6839}
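// Illustrative example: a homogeneous aggregate such as {float, float,
// float, float} (or [4 x float]) maps to N = 4 with a float element type,
// provided its 128-bit flattened size fits the min/max vector register
// limits; a mixed struct such as {float, i32} returns 0.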
6840
6841bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
6842 SmallVectorImpl<unsigned> &CurrentOrder,
6843 bool ResizeAllowed) const {
6844 const auto *It = find_if(VL, [](Value *V) {
6845 return isa<ExtractElementInst, ExtractValueInst>(V);
6846 });
6847 assert(It != VL.end() && "Expected at least one extract instruction.");
6848 auto *E0 = cast<Instruction>(*It);
6849 assert(all_of(VL,
6850 [](Value *V) {
6851 return isa<UndefValue, ExtractElementInst, ExtractValueInst>(
6852 V);
6853 }) &&
6854 "Invalid opcode");
6855 // Check if all of the extracts come from the same vector and from the
6856 // correct offset.
6857 Value *Vec = E0->getOperand(0);
6858
6859 CurrentOrder.clear();
6860
6861 // We have to extract from a vector/aggregate with the same number of elements.
6862 unsigned NElts;
6863 if (E0->getOpcode() == Instruction::ExtractValue) {
6864 NElts = canMapToVector(Vec->getType());
6865 if (!NElts)
6866 return false;
6867 // Check if load can be rewritten as load of vector.
6868 LoadInst *LI = dyn_cast<LoadInst>(Vec);
6869 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
6870 return false;
6871 } else {
6872 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
6873 }
6874
6875 unsigned E = VL.size();
6876 if (!ResizeAllowed && NElts != E)
6877 return false;
6878 SmallVector<int> Indices(E, PoisonMaskElem);
6879 unsigned MinIdx = NElts, MaxIdx = 0;
6880 for (auto [I, V] : enumerate(VL)) {
6881 auto *Inst = dyn_cast<Instruction>(V);
6882 if (!Inst)
6883 continue;
6884 if (Inst->getOperand(0) != Vec)
6885 return false;
6886 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
6887 if (isa<UndefValue>(EE->getIndexOperand()))
6888 continue;
6889 std::optional<unsigned> Idx = getExtractIndex(Inst);
6890 if (!Idx)
6891 return false;
6892 const unsigned ExtIdx = *Idx;
6893 if (ExtIdx >= NElts)
6894 continue;
6895 Indices[I] = ExtIdx;
6896 if (MinIdx > ExtIdx)
6897 MinIdx = ExtIdx;
6898 if (MaxIdx < ExtIdx)
6899 MaxIdx = ExtIdx;
6900 }
6901 if (MaxIdx - MinIdx + 1 > E)
6902 return false;
6903 if (MaxIdx + 1 <= E)
6904 MinIdx = 0;
6905
6906 // Check that all of the indices extract from the correct offset.
6907 bool ShouldKeepOrder = true;
6908 // Assign to all items the initial value E so we can check if the extract
6909 // instruction index was used already.
6910 // Also, later we can check that all the indices are used and we have a
6911 // consecutive access in the extract instructions, by checking that no
6912 // element of CurrentOrder still has value E.
6913 CurrentOrder.assign(E, E);
6914 for (unsigned I = 0; I < E; ++I) {
6915 if (Indices[I] == PoisonMaskElem)
6916 continue;
6917 const unsigned ExtIdx = Indices[I] - MinIdx;
6918 if (CurrentOrder[ExtIdx] != E) {
6919 CurrentOrder.clear();
6920 return false;
6921 }
6922 ShouldKeepOrder &= ExtIdx == I;
6923 CurrentOrder[ExtIdx] = I;
6924 }
6925 if (ShouldKeepOrder)
6926 CurrentOrder.clear();
6927
6928 return ShouldKeepOrder;
6929}
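// Illustrative example: for extracts of lanes {1, 0, 3, 2} from a single
// 4-element vector the loop above fills CurrentOrder = {1, 0, 3, 2} and the
// function returns false (the extracts are usable only with a reordered
// access); for the identity lanes {0, 1, 2, 3} CurrentOrder is cleared and
// the function returns true.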
6930
6931bool BoUpSLP::areAllUsersVectorized(
6932 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
6933 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
6934 all_of(I->users(), [this](User *U) {
6935 return ScalarToTreeEntry.contains(U) ||
6936 isVectorLikeInstWithConstOps(U) ||
6937 (isa<ExtractElementInst>(U) && MustGather.contains(U));
6938 });
6939}
6940
6941static std::pair<InstructionCost, InstructionCost>
6942getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
6943 TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
6944 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6945
6946 // Calculate the cost of the scalar and vector calls.
6947 SmallVector<Type *> VecTys;
6948 for (Use &Arg : CI->args())
6949 VecTys.push_back(
6950 FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
6951 FastMathFlags FMF;
6952 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
6953 FMF = FPCI->getFastMathFlags();
6954 SmallVector<Value *> Arguments(CI->args());
6955 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
6956 dyn_cast<IntrinsicInst>(CI));
6957 auto IntrinsicCost =
6958 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
6959
6960 auto Shape = VFShape::get(CI->getFunctionType(),
6961 ElementCount::getFixed(VecTy->getNumElements()),
6962 false /*HasGlobalPred*/);
6963 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6964 auto LibCost = IntrinsicCost;
6965 if (!CI->isNoBuiltin() && VecFunc) {
6966 // Calculate the cost of the vector library call.
6967 // If the corresponding vector call is cheaper, return its cost.
6968 LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
6969 TTI::TCK_RecipThroughput);
6970 }
6971 return {IntrinsicCost, LibCost};
6972}
6973
6974void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
6975 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
6976 SmallVectorImpl<Value *> *OpScalars,
6977 SmallVectorImpl<Value *> *AltScalars) const {
6978 unsigned Sz = Scalars.size();
6979 Mask.assign(Sz, PoisonMaskElem);
6980 SmallVector<int> OrderMask;
6981 if (!ReorderIndices.empty())
6982 inversePermutation(ReorderIndices, OrderMask);
6983 for (unsigned I = 0; I < Sz; ++I) {
6984 unsigned Idx = I;
6985 if (!ReorderIndices.empty())
6986 Idx = OrderMask[I];
6987 auto *OpInst = cast<Instruction>(Scalars[Idx]);
6988 if (IsAltOp(OpInst)) {
6989 Mask[I] = Sz + Idx;
6990 if (AltScalars)
6991 AltScalars->push_back(OpInst);
6992 } else {
6993 Mask[I] = Idx;
6994 if (OpScalars)
6995 OpScalars->push_back(OpInst);
6996 }
6997 }
6998 if (!ReuseShuffleIndices.empty()) {
6999 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7000 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7001 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7002 });
7003 Mask.swap(NewMask);
7004 }
7005}
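// Illustrative example: for an alternate add/sub node with Scalars =
// {add0, sub1, add2, sub3} and IsAltOp selecting the subs, the resulting
// Mask is {0, 5, 2, 7}: lanes taken from the main (add) vector keep their
// index, lanes taken from the alternate (sub) vector are offset by Sz = 4,
// which is exactly the mask used to blend the two vectorized halves.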
7006
7007static bool isAlternateInstruction(const Instruction *I,
7008 const Instruction *MainOp,
7009 const Instruction *AltOp,
7010 const TargetLibraryInfo &TLI) {
7011 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7012 auto *AltCI = cast<CmpInst>(AltOp);
7013 CmpInst::Predicate MainP = MainCI->getPredicate();
7014 CmpInst::Predicate AltP = AltCI->getPredicate();
7015 assert(MainP != AltP && "Expected different main/alternate predicates.");
7016 auto *CI = cast<CmpInst>(I);
7017 if (isCmpSameOrSwapped(MainCI, CI, TLI))
7018 return false;
7019 if (isCmpSameOrSwapped(AltCI, CI, TLI))
7020 return true;
7021 CmpInst::Predicate P = CI->getPredicate();
7022 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7023
7024 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7025 "CmpInst expected to match either main or alternate predicate or "
7026 "their swap.");
7027 (void)AltP;
7028 return MainP != P && MainP != SwappedP;
7029 }
7030 return I->getOpcode() == AltOp->getOpcode();
7031}
7032
7033TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7034 assert(!Ops.empty());
7035 const auto *Op0 = Ops.front();
7036
7037 const bool IsConstant = all_of(Ops, [](Value *V) {
7038 // TODO: We should allow undef elements here
7039 return isConstant(V) && !isa<UndefValue>(V);
7040 });
7041 const bool IsUniform = all_of(Ops, [=](Value *V) {
7042 // TODO: We should allow undef elements here
7043 return V == Op0;
7044 });
7045 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7046 // TODO: We should allow undef elements here
7047 if (auto *CI = dyn_cast<ConstantInt>(V))
7048 return CI->getValue().isPowerOf2();
7049 return false;
7050 });
7051 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7052 // TODO: We should allow undef elements here
7053 if (auto *CI = dyn_cast<ConstantInt>(V))
7054 return CI->getValue().isNegatedPowerOf2();
7055 return false;
7056 });
7057
7058 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7059 if (IsConstant && IsUniform)
7060 VK = TTI::OK_UniformConstantValue;
7061 else if (IsConstant)
7062 VK = TTI::OK_NonUniformConstantValue;
7063 else if (IsUniform)
7064 VK = TTI::OK_UniformValue;
7065
7066 TTI::OperandValueProperties VP = TTI::OP_None;
7067 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7068 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7069
7070 return {VK, VP};
7071}
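// Illustrative example (assuming the usual TTI operand kinds): if every
// value in Ops is the constant 8, the result is likely
// {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2}; a mix of unrelated
// non-constant values degrades to {TTI::OK_AnyValue, TTI::OP_None}.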
7072
7073namespace {
7074/// The base class for shuffle instruction emission and shuffle cost estimation.
7075class BaseShuffleAnalysis {
7076protected:
7077 /// Checks if the mask is an identity mask.
7078 /// \param IsStrict if is true the function returns false if mask size does
7079 /// not match vector size.
7080 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7081 bool IsStrict) {
7082 int Limit = Mask.size();
7083 int VF = VecTy->getNumElements();
7084 int Index = -1;
7085 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7086 return true;
7087 if (!IsStrict) {
7088 // Consider extract subvector starting from index 0.
7089 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7090 Index == 0)
7091 return true;
7092 // All VF-size submasks are identity (e.g.
7093 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7094 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7095 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7096 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7097 ShuffleVectorInst::isIdentityMask(Slice, VF);
7098 }))
7099 return true;
7100 }
7101 return false;
7102 }
7103
7104 /// Tries to combine 2 different masks into single one.
7105 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7106 /// change the size of the vector, \p LocalVF is the original size of the
7107 /// shuffled vector.
7108 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7109 ArrayRef<int> ExtMask) {
7110 unsigned VF = Mask.size();
7111 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7112 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7113 if (ExtMask[I] == PoisonMaskElem)
7114 continue;
7115 int MaskedIdx = Mask[ExtMask[I] % VF];
7116 NewMask[I] =
7117 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7118 }
7119 Mask.swap(NewMask);
7120 }
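// Illustrative example: combining an inner permute Mask = {1, 0} (VF = 2)
// with an outer ExtMask = {1, 0, 3, 2} yields NewMask = {0, 1, 0, 1}; each
// outer index is looked up modulo VF in the inner mask and then reduced
// modulo LocalVF, so the two shuffles collapse into a single one.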
7121
7122 /// Looks through shuffles trying to reduce final number of shuffles in the
7123 /// code. The function looks through the previously emitted shuffle
7124 /// instructions and properly marks indices in the mask as undef.
7125 /// For example, given the code
7126 /// \code
7127 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7128 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7129 /// \endcode
7130 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7131 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7132 /// <0, 1, 2, 3> for the shuffle.
7133 /// If 2 operands are of different size, the smallest one will be resized and
7134 /// the mask recalculated properly.
7135 /// For example, given the code
7136 /// \code
7137 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7138 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7139 /// \endcode
7140 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
7141 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7142 /// <0, 1, 2, 3> for the shuffle.
7143 /// So, it tries to transform permutations to simple vector merge, if
7144 /// possible.
7145 /// \param V The input vector which must be shuffled using the given \p Mask.
7146 /// If the better candidate is found, \p V is set to this best candidate
7147 /// vector.
7148 /// \param Mask The input mask for the shuffle. If the best candidate is found
7149 /// during looking-through-shuffles attempt, it is updated accordingly.
7150 /// \param SinglePermute true if the shuffle operation is originally a
7151 /// single-value-permutation. In this case the look-through-shuffles procedure
7152 /// may look for resizing shuffles as the best candidates.
7153 /// \return true if the shuffle results in the non-resizing identity shuffle
7154 /// (and thus can be ignored), false - otherwise.
7155 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7156 bool SinglePermute) {
7157 Value *Op = V;
7158 ShuffleVectorInst *IdentityOp = nullptr;
7159 SmallVector<int> IdentityMask;
7160 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7161 // Exit if not a fixed vector type or changing size shuffle.
7162 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7163 if (!SVTy)
7164 break;
7165 // Remember the identity or broadcast mask, if it is not a resizing
7166 // shuffle. If no better candidates are found, this Op and Mask will be
7167 // used in the final shuffle.
7168 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7169 if (!IdentityOp || !SinglePermute ||
7170 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7171 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7172 IdentityMask.size()))) {
7173 IdentityOp = SV;
7174 // Store the current mask in IdentityMask so that we do not lose
7175 // this info if IdentityOp is selected as the best candidate for the
7176 // permutation.
7177 IdentityMask.assign(Mask);
7178 }
7179 }
7180 // Remember the broadcast mask. If no better candidates are found, this Op
7181 // and Mask will be used in the final shuffle.
7182 // Zero splat can be used as identity too, since it might be used with
7183 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7184 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
7185 // expensive, and the analysis finds out that the source vector is just a
7186 // broadcast, the original mask can be transformed to the identity mask <0,
7187 // 1, 2, 3>.
7188 // \code
7189 // %0 = shuffle %v, poison, zeroinitalizer
7190 // %res = shuffle %0, poison, <3, 1, 2, 0>
7191 // \endcode
7192 // may be transformed to
7193 // \code
7194 // %0 = shuffle %v, poison, zeroinitalizer
7195 // %res = shuffle %0, poison, <0, 1, 2, 3>
7196 // \endcode
7197 if (SV->isZeroEltSplat()) {
7198 IdentityOp = SV;
7199 IdentityMask.assign(Mask);
7200 }
7201 int LocalVF = Mask.size();
7202 if (auto *SVOpTy =
7203 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7204 LocalVF = SVOpTy->getNumElements();
7205 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7206 for (auto [Idx, I] : enumerate(Mask)) {
7207 if (I == PoisonMaskElem ||
7208 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7209 continue;
7210 ExtMask[Idx] = SV->getMaskValue(I);
7211 }
7212 bool IsOp1Undef =
7213 isUndefVector(SV->getOperand(0),
7214 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7215 .all();
7216 bool IsOp2Undef =
7217 isUndefVector(SV->getOperand(1),
7218 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7219 .all();
7220 if (!IsOp1Undef && !IsOp2Undef) {
7221 // Update mask and mark undef elems.
7222 for (int &I : Mask) {
7223 if (I == PoisonMaskElem)
7224 continue;
7225 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
7226 PoisonMaskElem)
7227 I = PoisonMaskElem;
7228 }
7229 break;
7230 }
7231 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7232 SV->getShuffleMask().end());
7233 combineMasks(LocalVF, ShuffleMask, Mask);
7234 Mask.swap(ShuffleMask);
7235 if (IsOp2Undef)
7236 Op = SV->getOperand(0);
7237 else
7238 Op = SV->getOperand(1);
7239 }
7240 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
7241 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7242 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
7243 if (IdentityOp) {
7244 V = IdentityOp;
7245 assert(Mask.size() == IdentityMask.size() &&
7246 "Expected masks of same sizes.");
7247 // Clear known poison elements.
7248 for (auto [I, Idx] : enumerate(Mask))
7249 if (Idx == PoisonMaskElem)
7250 IdentityMask[I] = PoisonMaskElem;
7251 Mask.swap(IdentityMask);
7252 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7253 return SinglePermute &&
7254 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
7255 /*IsStrict=*/true) ||
7256 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7257 Shuffle->isZeroEltSplat() &&
7258 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
7259 }
7260 V = Op;
7261 return false;
7262 }
7263 V = Op;
7264 return true;
7265 }
7266
7267 /// Smart shuffle instruction emission, walks through shuffles trees and
7268 /// tries to find the best matching vector for the actual shuffle
7269 /// instruction.
7270 template <typename T, typename ShuffleBuilderTy>
7271 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7272 ShuffleBuilderTy &Builder) {
7273 assert(V1 && "Expected at least one vector value.");
7274 if (V2)
7275 Builder.resizeToMatch(V1, V2);
7276 int VF = Mask.size();
7277 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
7278 VF = FTy->getNumElements();
7279 if (V2 &&
7280 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
7281 // Peek through shuffles.
7282 Value *Op1 = V1;
7283 Value *Op2 = V2;
7284 int VF =
7285 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7286 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7287 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7288 for (int I = 0, E = Mask.size(); I < E; ++I) {
7289 if (Mask[I] < VF)
7290 CombinedMask1[I] = Mask[I];
7291 else
7292 CombinedMask2[I] = Mask[I] - VF;
7293 }
7294 Value *PrevOp1;
7295 Value *PrevOp2;
7296 do {
7297 PrevOp1 = Op1;
7298 PrevOp2 = Op2;
7299 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
7300 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
7301 // Check if we have 2 resizing shuffles - need to peek through operands
7302 // again.
7303 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7304 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7305 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7306 for (auto [Idx, I] : enumerate(CombinedMask1)) {
7307 if (I == PoisonMaskElem)
7308 continue;
7309 ExtMask1[Idx] = SV1->getMaskValue(I);
7310 }
7311 SmallBitVector UseMask1 = buildUseMask(
7312 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7313 ->getNumElements(),
7314 ExtMask1, UseMask::SecondArg);
7315 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7316 for (auto [Idx, I] : enumerate(CombinedMask2)) {
7317 if (I == PoisonMaskElem)
7318 continue;
7319 ExtMask2[Idx] = SV2->getMaskValue(I);
7320 }
7321 SmallBitVector UseMask2 = buildUseMask(
7322 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7323 ->getNumElements(),
7324 ExtMask2, UseMask::SecondArg);
7325 if (SV1->getOperand(0)->getType() ==
7326 SV2->getOperand(0)->getType() &&
7327 SV1->getOperand(0)->getType() != SV1->getType() &&
7328 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
7329 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
7330 Op1 = SV1->getOperand(0);
7331 Op2 = SV2->getOperand(0);
7332 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7333 SV1->getShuffleMask().end());
7334 int LocalVF = ShuffleMask1.size();
7335 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
7336 LocalVF = FTy->getNumElements();
7337 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7338 CombinedMask1.swap(ShuffleMask1);
7339 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7340 SV2->getShuffleMask().end());
7341 LocalVF = ShuffleMask2.size();
7342 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
7343 LocalVF = FTy->getNumElements();
7344 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7345 CombinedMask2.swap(ShuffleMask2);
7346 }
7347 }
7348 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
7349 Builder.resizeToMatch(Op1, Op2);
7350 VF = std::max(cast<VectorType>(Op1->getType())
7351 ->getElementCount()
7352 .getKnownMinValue(),
7353 cast<VectorType>(Op2->getType())
7354 ->getElementCount()
7355 .getKnownMinValue());
7356 for (int I = 0, E = Mask.size(); I < E; ++I) {
7357 if (CombinedMask2[I] != PoisonMaskElem) {
7358 assert(CombinedMask1[I] == PoisonMaskElem &&
7359 "Expected undefined mask element");
7360 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
7361 }
7362 }
7363 if (Op1 == Op2 &&
7364 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
7365 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
7366 isa<ShuffleVectorInst>(Op1) &&
7367 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7368 ArrayRef(CombinedMask1))))
7369 return Builder.createIdentity(Op1);
7370 return Builder.createShuffleVector(
7371 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
7372 CombinedMask1);
7373 }
7374 if (isa<PoisonValue>(V1))
7375 return Builder.createPoison(
7376 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
7377 SmallVector<int> NewMask(Mask.begin(), Mask.end());
7378 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
7379 assert(V1 && "Expected non-null value after looking through shuffles.");
7380
7381 if (!IsIdentity)
7382 return Builder.createShuffleVector(V1, NewMask);
7383 return Builder.createIdentity(V1);
7384 }
7385};
7386} // namespace
7387
7388/// Returns the cost of the shuffle instructions with the given \p Kind, vector
7389/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
7390/// insert-subvector pattern.
7391static InstructionCost
7392getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
7393 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
7394 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
7395 int Index = 0, VectorType *SubTp = nullptr,
7396 ArrayRef<const Value *> Args = std::nullopt) {
7397 if (Kind != TTI::SK_PermuteTwoSrc)
7398 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7399 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7400 int NumSubElts;
7401 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
7402 Mask, NumSrcElts, NumSubElts, Index)) {
7403 if (Index + NumSubElts > NumSrcElts &&
7404 Index + NumSrcElts <= static_cast<int>(Mask.size()))
7405 return TTI.getShuffleCost(
7406 TTI::SK_InsertSubvector,
7407 FixedVectorType::get(Tp->getElementType(), Mask.size()), Mask,
7408 TTI::TCK_RecipThroughput, Index, Tp);
7409 }
7410 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7411}
7412
7413/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
7414static std::pair<InstructionCost, InstructionCost>
7415getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
7416 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
7417 Type *ScalarTy, VectorType *VecTy) {
7418 InstructionCost ScalarCost = 0;
7419 InstructionCost VecCost = 0;
7420 // Here we differentiate two cases: (1) when Ptrs represent a regular
7421 // vectorization tree node (as they are pointer arguments of scattered
7422 // loads) or (2) when Ptrs are the arguments of loads or stores being
7423 // vectorized as a plain wide unit-stride load/store since all the
7424 // loads/stores are known to be from/to adjacent locations.
7425 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7426 // Case 2: estimate costs for pointer related costs when vectorizing to
7427 // a wide load/store.
7428 // Scalar cost is estimated as a set of pointers with known relationship
7429 // between them.
7430 // For vector code we will use BasePtr as argument for the wide load/store
7431 // but we also need to account for all the instructions which are going to
7432 // stay in vectorized code due to uses outside of these scalar
7433 // loads/stores.
7434 ScalarCost = TTI.getPointersChainCost(
7435 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7436 CostKind);
7437
7438 SmallVector<const Value *> PtrsRetainedInVecCode;
7439 for (Value *V : Ptrs) {
7440 if (V == BasePtr) {
7441 PtrsRetainedInVecCode.push_back(V);
7442 continue;
7443 }
7444 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7445 // For simplicity, assume that Ptr stays in vectorized code if it's not a
7446 // GEP instruction. We don't care since its cost is considered free.
7447 // TODO: We should check for any uses outside of the vectorizable tree
7448 // rather than just a single use.
7449 if (!Ptr || !Ptr->hasOneUse())
7450 PtrsRetainedInVecCode.push_back(V);
7451 }
7452
7453 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
7454 // If all pointers stay in vectorized code then we don't have
7455 // any savings on that.
7456 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
7457 }
7458 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
7459 TTI::PointersChainInfo::getKnownStride(),
7460 VecTy, CostKind);
7461 } else {
7462 // Case 1: Ptrs are the arguments of loads that we are going to transform
7463 // into masked gather load intrinsic.
7464 // All the scalar GEPs will be removed as a result of vectorization.
7465 // For any external uses of some lanes, extractelement instructions will
7466 // be generated (their cost is estimated separately).
7467 TTI::PointersChainInfo PtrsInfo =
7468 all_of(Ptrs,
7469 [](const Value *V) {
7470 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7471 return Ptr && !Ptr->hasAllConstantIndices();
7472 })
7473 ? TTI::PointersChainInfo::getUnknownStride()
7474 : TTI::PointersChainInfo::getKnownStride();
7475
7476 ScalarCost =
7477 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
7478 if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
7479 SmallVector<const Value *> Indices(BaseGEP->indices());
7480 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
7481 BaseGEP->getPointerOperand(), Indices, VecTy,
7482 CostKind);
7483 }
7484 }
7485
7486 return std::make_pair(ScalarCost, VecCost);
7487}
7488
7489/// Merges shuffle masks and emits final shuffle instruction, if required. It
7490/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
7491/// when the actual shuffle instruction is generated only if this is actually
7492/// required. Otherwise, the shuffle instruction emission is delayed till the
7493/// end of the process, to reduce the number of emitted instructions and further
7494/// analysis/transformations.
7495class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7496 bool IsFinalized = false;
7497 SmallVector<int> CommonMask;
7499 const TargetTransformInfo &TTI;
7501 SmallDenseSet<Value *> VectorizedVals;
7502 BoUpSLP &R;
7503 SmallPtrSetImpl<Value *> &CheckedExtracts;
7504 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7505 /// While set, we are still trying to estimate the cost for the same nodes and
7506 /// can delay the actual cost estimation (virtual shuffle instruction emission).
7507 /// May help better estimate the cost if the same nodes must be permuted and
7508 /// allows moving most of the long shuffle cost estimation to TTI.
7509 bool SameNodesEstimated = true;
7510
7511 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
7512 if (Ty->getScalarType()->isPointerTy()) {
7516 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
7517 Ty->getScalarType());
7518 if (auto *VTy = dyn_cast<VectorType>(Ty))
7519 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
7520 return Res;
7521 }
7522 return Constant::getAllOnesValue(Ty);
7523 }
7524
7525 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
7526 if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
7527 return TTI::TCC_Free;
7528 auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
7529 InstructionCost GatherCost = 0;
7530 SmallVector<Value *> Gathers(VL.begin(), VL.end());
7531 // Improve gather cost for gather of loads, if we can group some of the
7532 // loads into vector loads.
7533 InstructionsState S = getSameOpcode(VL, *R.TLI);
7534 const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
7535 unsigned MinVF = R.getMinVF(2 * Sz);
7536 if (VL.size() > 2 &&
7537 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
7538 (InVectors.empty() &&
7539 any_of(seq<unsigned>(0, VL.size() / MinVF),
7540 [&](unsigned Idx) {
7541 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
7542 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
7543 return S.getOpcode() == Instruction::Load &&
7544 !S.isAltShuffle();
7545 }))) &&
7546 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
7547 !isSplat(Gathers)) {
7548 InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
7549 SetVector<Value *> VectorizedLoads;
7550 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
7551 SmallVector<unsigned> ScatterVectorized;
7552 unsigned StartIdx = 0;
7553 unsigned VF = VL.size() / 2;
7554 for (; VF >= MinVF; VF /= 2) {
7555 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
7556 Cnt += VF) {
7557 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7558 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
7559 InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
7560 if (SliceS.getOpcode() != Instruction::Load ||
7561 SliceS.isAltShuffle())
7562 continue;
7563 }
7564 if (!VectorizedLoads.count(Slice.front()) &&
7565 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
7566 SmallVector<Value *> PointerOps;
7567 OrdersType CurrentOrder;
7568 LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
7569 CurrentOrder, PointerOps);
7570 switch (LS) {
7571 case LoadsState::Vectorize:
7572 case LoadsState::ScatterVectorize:
7573 case LoadsState::StridedVectorize:
7574 // Mark the vectorized loads so that we don't vectorize them
7575 // again.
7576 // TODO: better handling of loads with reorders.
7577 if (((LS == LoadsState::Vectorize ||
7578 LS == LoadsState::StridedVectorize) &&
7579 CurrentOrder.empty()) ||
7580 (LS == LoadsState::StridedVectorize &&
7581 isReverseOrder(CurrentOrder)))
7582 VectorizedStarts.emplace_back(Cnt, LS);
7583 else
7584 ScatterVectorized.push_back(Cnt);
7585 VectorizedLoads.insert(Slice.begin(), Slice.end());
7586 // If we vectorized initial block, no need to try to vectorize
7587 // it again.
7588 if (Cnt == StartIdx)
7589 StartIdx += VF;
7590 break;
7591 case LoadsState::Gather:
7592 break;
7593 }
7594 }
7595 }
7596 // Check if the whole array was vectorized already - exit.
7597 if (StartIdx >= VL.size())
7598 break;
7599 // Found vectorizable parts - exit.
7600 if (!VectorizedLoads.empty())
7601 break;
7602 }
7603 if (!VectorizedLoads.empty()) {
7604 unsigned NumParts = TTI.getNumberOfParts(VecTy);
7605 bool NeedInsertSubvectorAnalysis =
7606 !NumParts || (VL.size() / VF) > NumParts;
7607 // Get the cost for gathered loads.
7608 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
7609 if (VectorizedLoads.contains(VL[I]))
7610 continue;
7611 GatherCost += getBuildVectorCost(VL.slice(I, VF), Root);
7612 }
7613 // Exclude potentially vectorized loads from list of gathered
7614 // scalars.
7615 Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
7616 // The cost for vectorized loads.
7617 InstructionCost ScalarsCost = 0;
7618 for (Value *V : VectorizedLoads) {
7619 auto *LI = cast<LoadInst>(V);
7620 ScalarsCost +=
7621 TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
7622 LI->getAlign(), LI->getPointerAddressSpace(),
7623 CostKind, TTI::OperandValueInfo(), LI);
7624 }
7625 auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
7626 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
7627 auto *LI = cast<LoadInst>(VL[P.first]);
7628 Align Alignment = LI->getAlign();
7629 GatherCost +=
7630 P.second == LoadsState::Vectorize
7631 ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
7632 LI->getPointerAddressSpace(), CostKind,
7633 TTI::OperandValueInfo(), LI)
7634 : TTI.getStridedMemoryOpCost(
7635 Instruction::Load, LoadTy, LI->getPointerOperand(),
7636 /*VariableMask=*/false, Alignment, CostKind, LI);
7637 // Estimate GEP cost.
7638 SmallVector<Value *> PointerOps(VF);
7639 for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
7640 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
7641 auto [ScalarGEPCost, VectorGEPCost] =
7642 getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
7643 Instruction::Load, CostKind, LI->getType(), LoadTy);
7644 GatherCost += VectorGEPCost - ScalarGEPCost;
7645 }
7646 for (unsigned P : ScatterVectorized) {
7647 auto *LI0 = cast<LoadInst>(VL[P]);
7648 ArrayRef<Value *> Slice = VL.slice(P, VF);
7649 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
7650 GatherCost += TTI.getGatherScatterOpCost(
7651 Instruction::Load, LoadTy, LI0->getPointerOperand(),
7652 /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
7653 // Estimate GEP cost.
7654 SmallVector<Value *> PointerOps(VF);
7655 for (auto [I, V] : enumerate(Slice))
7656 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
7657 OrdersType Order;
7658 if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
7659 Order)) {
7660 // TODO: improve checks if GEPs can be vectorized.
7661 Value *Ptr0 = PointerOps.front();
7662 Type *ScalarTy = Ptr0->getType();
7663 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
7664 auto [ScalarGEPCost, VectorGEPCost] =
7665 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
7666 CostKind, ScalarTy, VecTy);
7667 GatherCost += VectorGEPCost - ScalarGEPCost;
7668 if (!Order.empty()) {
7669 SmallVector<int> Mask;
7670 inversePermutation(Order, Mask);
7671 GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
7672 VecTy, Mask, CostKind);
7673 }
7674 } else {
7675 GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true);
7676 }
7677 }
7678 if (NeedInsertSubvectorAnalysis) {
7679 // Add the cost for the subvectors insert.
7680 SmallVector<int> ShuffleMask(VL.size());
7681 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
7682 for (unsigned Idx : seq<unsigned>(0, E))
7683 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
7684 GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
7685 ShuffleMask, CostKind, I, LoadTy);
7686 }
7687 }
7688 GatherCost -= ScalarsCost;
7689 }
7690 GatherCost = std::min(BaseCost, GatherCost);
7691 } else if (!Root && isSplat(VL)) {
7692 // Found the broadcasting of the single scalar, calculate the cost as
7693 // the broadcast.
7694 const auto *It =
7695 find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
7696 assert(It != VL.end() && "Expected at least one non-undef value.");
7697 // Add broadcast for non-identity shuffle only.
7698 bool NeedShuffle =
7699 count(VL, *It) > 1 &&
7700 (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof));
7701 if (!NeedShuffle)
7702 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
7703 CostKind, std::distance(VL.begin(), It),
7704 PoisonValue::get(VecTy), *It);
7705
7706 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
7707 transform(VL, ShuffleMask.begin(), [](Value *V) {
7708 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
7709 });
7710 InstructionCost InsertCost = TTI.getVectorInstrCost(
7711 Instruction::InsertElement, VecTy, CostKind, 0,
7712 PoisonValue::get(VecTy), *It);
7713 return InsertCost +
7714 TTI.getShuffleCost(TTI::SK_Broadcast, VecTy,
7715 ShuffleMask, CostKind, /*Index=*/0,
7716 /*SubTp=*/nullptr, /*Args=*/*It);
7717 }
7718 return GatherCost +
7719 (all_of(Gathers, UndefValue::classof)
7720 ? TTI::TCC_Free
7721 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
7722 };
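 // Illustrative example (hypothetical costs): for 8 gathered i32 loads where
 // canVectorizeLoads() accepts two 4-wide slices, BaseCost is the cost of
 // building the vector from 8 scalars, while GatherCost is roughly two vector
 // loads plus the insert-subvector shuffles minus the 8 now-dead scalar loads;
 // the std::min above returns whichever figure is cheaper.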
7723
7724 /// Compute the cost of creating a vector containing the extracted values from
7725 /// \p VL.
7726 InstructionCost
7727 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
7728 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
7729 unsigned NumParts) {
7730 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
7731 unsigned NumElts =
7732 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
7733 auto *EE = dyn_cast<ExtractElementInst>(V);
7734 if (!EE)
7735 return Sz;
7736 auto *VecTy = cast<FixedVectorType>(EE->getVectorOperandType());
7737 return std::max(Sz, VecTy->getNumElements());
7738 });
7739 unsigned NumSrcRegs = TTI.getNumberOfParts(
7740 FixedVectorType::get(VL.front()->getType(), NumElts));
7741 if (NumSrcRegs == 0)
7742 NumSrcRegs = 1;
7743 // FIXME: this must be moved to TTI for better estimation.
7744 unsigned EltsPerVector = PowerOf2Ceil(std::max(
7745 divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
7746 auto CheckPerRegistersShuffle =
7747 [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
7748 DenseSet<int> RegIndices;
7749 // Check whether the sub-mask reads from no more than two per-register sources.
7750 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
7751 int FirstRegId = -1;
7752 for (int &I : Mask) {
7753 if (I == PoisonMaskElem)
7754 continue;
7755 int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
7756 if (FirstRegId < 0)
7757 FirstRegId = RegId;
7758 RegIndices.insert(RegId);
7759 if (RegIndices.size() > 2)
7760 return std::nullopt;
7761 if (RegIndices.size() == 2)
7762 ShuffleKind = TTI::SK_PermuteTwoSrc;
7763 I = (I % NumElts) % EltsPerVector +
7764 (RegId == FirstRegId ? 0 : EltsPerVector);
7765 }
7766 return ShuffleKind;
7767 };
7768 InstructionCost Cost = 0;
7769
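 // Worked example for CheckPerRegistersShuffle (hypothetical sizes): with
 // NumElts = 8, NumParts = 2 and EltsPerVector = 4, mask element I = 10
 // addresses lane 2 of the second source vector, so
 // RegId = (10 / 8) * 2 + (10 % 8) / 4 = 2; if that is FirstRegId, I is
 // remapped to (10 % 8) % 4 + 0 = 2. A later element I = 5 gives
 // RegId = 0 * 2 + 5 / 4 = 1, a second register, so ShuffleKind becomes
 // SK_PermuteTwoSrc and I is remapped to (5 % 8) % 4 + 4 = 5.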
7770 // Process extracts in blocks of EltsPerVector to check if the source vector
7771 // operand can be re-used directly. If not, add the cost of creating a
7772 // shuffle to extract the values into a vector register.
7773 for (unsigned Part = 0; Part < NumParts; ++Part) {
7774 if (!ShuffleKinds[Part])
7775 continue;
7776 ArrayRef<int> MaskSlice =
7777 Mask.slice(Part * EltsPerVector,
7778 (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
7779 ? Mask.size() % EltsPerVector
7780 : EltsPerVector);
7781 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
7782 copy(MaskSlice, SubMask.begin());
7783 std::optional<TTI::ShuffleKind> RegShuffleKind =
7784 CheckPerRegistersShuffle(SubMask);
7785 if (!RegShuffleKind) {
7786 Cost += ::getShuffleCost(
7787 TTI, *ShuffleKinds[Part],
7788 FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
7789 continue;
7790 }
7791 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
7792 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
7793 Cost += ::getShuffleCost(
7794 TTI, *RegShuffleKind,
7795 FixedVectorType::get(VL.front()->getType(), EltsPerVector),
7796 SubMask);
7797 }
7798 }
7799 return Cost;
7800 }
7801 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
7802 /// shuffle emission.
7803 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
7804 ArrayRef<int> Mask) {
7805 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
7806 if (Mask[Idx] != PoisonMaskElem)
7807 CommonMask[Idx] = Idx;
7808 }
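 // Worked example (hypothetical mask): with CommonMask == Mask == {2, -1, 0, 1}
 // (-1 denoting PoisonMaskElem), the emitted shuffle already realizes the
 // permutation, so the helper rewrites CommonMask to the identity positions
 // {0, -1, 2, 3} of the shuffle result.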
7809 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
7810 /// mask \p Mask, register number \p Part, that includes \p SliceSize
7811 /// elements.
7812 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
7813 ArrayRef<int> Mask, unsigned Part,
7814 unsigned SliceSize) {
7815 if (SameNodesEstimated) {
7816 // Delay the cost estimation if the same nodes are being reshuffled.
7817 // If the cost of reshuffling E1 and E2 was already requested, there is no
7818 // need to estimate another cost with the sub-Mask; instead, merge this
7819 // sub-Mask into CommonMask so it is estimated later and a double cost
7820 // estimation is avoided.
7821 if ((InVectors.size() == 2 &&
7822 InVectors.front().get<const TreeEntry *>() == &E1 &&
7823 InVectors.back().get<const TreeEntry *>() == E2) ||
7824 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
7825 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
7826 [](int Idx) { return Idx == PoisonMaskElem; }) &&
7827 "Expected all poisoned elements.");
7828 ArrayRef<int> SubMask =
7829 ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
7830 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
7831 return;
7832 }
7833 // Found non-matching nodes - need to estimate the cost for the matched
7834 // and transform mask.
7835 Cost += createShuffle(InVectors.front(),
7836 InVectors.size() == 1 ? nullptr : InVectors.back(),
7837 CommonMask);
7838 transformMaskAfterShuffle(CommonMask, CommonMask);
7839 }
7840 SameNodesEstimated = false;
7841 if (!E2 && InVectors.size() == 1) {
7842 unsigned VF = E1.getVectorFactor();
7843 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
7844 VF = std::max(VF,
7845 cast<FixedVectorType>(V1->getType())->getNumElements());
7846 } else {
7847 const auto *E = InVectors.front().get<const TreeEntry *>();
7848 VF = std::max(VF, E->getVectorFactor());
7849 }
7850 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
7851 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
7852 CommonMask[Idx] = Mask[Idx] + VF;
7853 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
7854 transformMaskAfterShuffle(CommonMask, CommonMask);
7855 } else {
7856 Cost += createShuffle(&E1, E2, Mask);
7857 transformMaskAfterShuffle(CommonMask, Mask);
7858 }
7859 }
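 // Illustrative scenario (hypothetical tree entries E1/E2): when part 0 and
 // part 1 of a split mask both reshuffle the same pair (E1, E2), the second
 // call with SameNodesEstimated still set only copies its sub-mask into
 // CommonMask; a single virtual shuffle for the combined mask is costed later
 // instead of costing each part separately.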
7860
7861 class ShuffleCostBuilder {
7862 const TargetTransformInfo &TTI;
7863
7864 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
7865 int Index = -1;
7866 return Mask.empty() ||
7867 (VF == Mask.size() &&
7868 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
7869 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7870 Index == 0);
7871 }
7872
7873 public:
7874 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
7875 ~ShuffleCostBuilder() = default;
7876 InstructionCost createShuffleVector(Value *V1, Value *,
7877 ArrayRef<int> Mask) const {
7878 // Empty mask or identity mask are free.
7879 unsigned VF =
7880 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7881 if (isEmptyOrIdentity(Mask, VF))
7882 return TTI::TCC_Free;
7883 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
7884 cast<VectorType>(V1->getType()), Mask);
7885 }
7886 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
7887 // Empty mask or identity mask are free.
7888 unsigned VF =
7889 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7890 if (isEmptyOrIdentity(Mask, VF))
7891 return TTI::TCC_Free;
7892 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
7893 cast<VectorType>(V1->getType()), Mask);
7894 }
7895 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
7896 InstructionCost createPoison(Type *Ty, unsigned VF) const {
7897 return TTI::TCC_Free;
7898 }
7899 void resizeToMatch(Value *&, Value *&) const {}
7900 };
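 // Minimal usage sketch (assumed placeholder values V4I32/V4I32b, for
 // illustration only): the builder prices virtual shuffles without creating IR.
 //
 //   ShuffleCostBuilder Builder(TTI);
 //   SmallVector<int> Identity = {0, 1, 2, 3};
 //   // Empty or identity masks are reported as free:
 //   InstructionCost C0 = Builder.createShuffleVector(V4I32, Identity);
 //   // A genuine two-source permutation is priced via ::getShuffleCost():
 //   SmallVector<int> TwoSrc = {0, 4, 1, 5};
 //   InstructionCost C1 = Builder.createShuffleVector(V4I32, V4I32b, TwoSrc);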
7901
7902 /// Smart shuffle instruction emission, walks through shuffles trees and
7903 /// tries to find the best matching vector for the actual shuffle
7904 /// instruction.
7905 InstructionCost
7906 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
7907 const PointerUnion<Value *, const TreeEntry *> &P2,
7908 ArrayRef<int> Mask) {
7909 ShuffleCostBuilder Builder(TTI);
7910 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
7911 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
7912 unsigned CommonVF = Mask.size();
7913 if (!V1 && !V2 && !P2.isNull()) {
7914 // Shuffle 2 entry nodes.
7915 const TreeEntry *E = P1.get<const TreeEntry *>();
7916 unsigned VF = E->getVectorFactor();
7917 const TreeEntry *E2 = P2.get<const TreeEntry *>();
7918 CommonVF = std::max(VF, E2->getVectorFactor());
7919 assert(all_of(Mask,
7920 [=](int Idx) {
7921 return Idx < 2 * static_cast<int>(CommonVF);
7922 }) &&
7923 "All elements in mask must be less than 2 * CommonVF.");
7924 if (E->Scalars.size() == E2->Scalars.size()) {
7925 SmallVector<int> EMask = E->getCommonMask();
7926 SmallVector<int> E2Mask = E2->getCommonMask();
7927 if (!EMask.empty() || !E2Mask.empty()) {
7928 for (int &Idx : CommonMask) {
7929 if (Idx == PoisonMaskElem)
7930 continue;
7931 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
7932 Idx = EMask[Idx];
7933 else if (Idx >= static_cast<int>(CommonVF))
7934 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
7935 E->Scalars.size();
7936 }
7937 }
7938 CommonVF = E->Scalars.size();
7939 }
7940 V1 = Constant::getNullValue(
7941 FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
7942 V2 = getAllOnesValue(
7943 *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
7944 } else if (!V1 && P2.isNull()) {
7945 // Shuffle single entry node.
7946 const TreeEntry *E = P1.get<const TreeEntry *>();
7947 unsigned VF = E->getVectorFactor();
7948 CommonVF = VF;
7949 assert(
7950 all_of(Mask,
7951 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
7952 "All elements in mask must be less than CommonVF.");
7953 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
7954 SmallVector<int> EMask = E->getCommonMask();
7955 assert(!EMask.empty() && "Expected non-empty common mask.");
7956 for (int &Idx : CommonMask) {
7957 if (Idx != PoisonMaskElem)
7958 Idx = EMask[Idx];
7959 }
7960 CommonVF = E->Scalars.size();
7961 }
7962 V1 = Constant::getNullValue(
7963 FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
7964 // Not identity/broadcast? Try to see if the original vector is better.
7965 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
7966 CommonVF == CommonMask.size() &&
7967 any_of(enumerate(CommonMask),
7968 [](const auto &&P) {
7969 return P.value() != PoisonMaskElem &&
7970 static_cast<unsigned>(P.value()) != P.index();
7971 }) &&
7972 any_of(CommonMask,
7973 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
7974 SmallVector<int> ReorderMask;
7975 inversePermutation(E->ReorderIndices, ReorderMask);
7976 ::addMask(CommonMask, ReorderMask);
7977 }
7978 } else if (V1 && P2.isNull()) {
7979 // Shuffle single vector.
7980 CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
7981 assert(
7982 all_of(Mask,
7983 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
7984 "All elements in mask must be less than CommonVF.");
7985 } else if (V1 && !V2) {
7986 // Shuffle vector and tree node.
7987 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
7988 const TreeEntry *E2 = P2.get<const TreeEntry *>();
7989 CommonVF = std::max(VF, E2->getVectorFactor());
7990 assert(all_of(Mask,
7991 [=](int Idx) {
7992 return Idx < 2 * static_cast<int>(CommonVF);
7993 }) &&
7994 "All elements in mask must be less than 2 * CommonVF.");
7995 if (E2->Scalars.size() == VF && VF != CommonVF) {
7996 SmallVector<int> E2Mask = E2->getCommonMask();
7997 assert(!E2Mask.empty() && "Expected non-empty common mask.");
7998 for (int &Idx : CommonMask) {
7999 if (Idx == PoisonMaskElem)
8000 continue;
8001 if (Idx >= static_cast<int>(CommonVF))
8002 Idx = E2Mask[Idx - CommonVF] + VF;
8003 }
8004 CommonVF = VF;
8005 }
8006 V1 = Constant::getNullValue(
8007 FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
8008 V2 = getAllOnesValue(
8009 *R.DL,
8010 FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
8011 } else if (!V1 && V2) {
8012 // Shuffle vector and tree node.
8013 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8014 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8015 CommonVF = std::max(VF, E1->getVectorFactor());
8016 assert(all_of(Mask,
8017 [=](int Idx) {
8018 return Idx < 2 * static_cast<int>(CommonVF);
8019 }) &&
8020 "All elements in mask must be less than 2 * CommonVF.");
8021 if (E1->Scalars.size() == VF && VF != CommonVF) {
8022 SmallVector<int> E1Mask = E1->getCommonMask();
8023 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8024 for (int &Idx : CommonMask) {
8025 if (Idx == PoisonMaskElem)
8026 continue;
8027 if (Idx >= static_cast<int>(CommonVF))
8028 Idx = E1Mask[Idx - CommonVF] + VF;
8029 else
8030 Idx = E1Mask[Idx];
8031 }
8032 CommonVF = VF;
8033 }
8034 V1 = Constant::getNullValue(
8035 FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
8036 V2 = getAllOnesValue(
8037 *R.DL,
8038 FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
8039 } else {
8040 assert(V1 && V2 && "Expected both vectors.");
8041 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8042 CommonVF =
8043 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8044 assert(all_of(Mask,
8045 [=](int Idx) {
8046 return Idx < 2 * static_cast<int>(CommonVF);
8047 }) &&
8048 "All elements in mask must be less than 2 * CommonVF.");
8049 if (V1->getType() != V2->getType()) {
8050 V1 = Constant::getNullValue(FixedVectorType::get(
8051 cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
8052 V2 = getAllOnesValue(
8053 *R.DL, FixedVectorType::get(
8054 cast<FixedVectorType>(V1->getType())->getElementType(),
8055 CommonVF));
8056 }
8057 }
8058 InVectors.front() = Constant::getNullValue(FixedVectorType::get(
8059 cast<FixedVectorType>(V1->getType())->getElementType(),
8060 CommonMask.size()));
8061 if (InVectors.size() == 2)
8062 InVectors.pop_back();
8063 return BaseShuffleAnalysis::createShuffle<InstructionCost>(
8064 V1, V2, CommonMask, Builder);
8065 }
8066
8067public:
8068 ShuffleCostEstimator(TargetTransformInfo &TTI,
8069 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8070 SmallPtrSetImpl<Value *> &CheckedExtracts)
8071 : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
8072 R(R), CheckedExtracts(CheckedExtracts) {}
8073 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8074 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8075 unsigned NumParts, bool &UseVecBaseAsInput) {
8076 UseVecBaseAsInput = false;
8077 if (Mask.empty())
8078 return nullptr;
8079 Value *VecBase = nullptr;
8080 ArrayRef<Value *> VL = E->Scalars;
8081 // If the resulting type is scalarized, do not adjust the cost.
8082 if (NumParts == VL.size())
8083 return nullptr;
8084 // Check if it can be considered reused if same extractelements were
8085 // vectorized already.
8086 bool PrevNodeFound = any_of(
8087 ArrayRef(R.VectorizableTree).take_front(E->Idx),
8088 [&](const std::unique_ptr<TreeEntry> &TE) {
8089 return ((!TE->isAltShuffle() &&
8090 TE->getOpcode() == Instruction::ExtractElement) ||
8091 TE->State == TreeEntry::NeedToGather) &&
8092 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8093 return VL.size() > Data.index() &&
8094 (Mask[Data.index()] == PoisonMaskElem ||
8095 isa<UndefValue>(VL[Data.index()]) ||
8096 Data.value() == VL[Data.index()]);
8097 });
8098 });
8099 SmallPtrSet<Value *, 4> UniqueBases;
8100 unsigned SliceSize = VL.size() / NumParts;
8101 for (unsigned Part = 0; Part < NumParts; ++Part) {
8102 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8103 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
8104 // Ignore non-extractelement scalars.
8105 if (isa<UndefValue>(V) ||
8106 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8107 continue;
8108 // If all users of instruction are going to be vectorized and this
8109 // instruction itself is not going to be vectorized, consider this
8110 // instruction as dead and remove its cost from the final cost of the
8111 // vectorized tree.
8112 // Also, avoid adjusting the cost for extractelements with multiple uses
8113 // in different graph entries.
8114 auto *EE = cast<ExtractElementInst>(V);
8115 VecBase = EE->getVectorOperand();
8116 UniqueBases.insert(VecBase);
8117 const TreeEntry *VE = R.getTreeEntry(V);
8118 if (!CheckedExtracts.insert(V).second ||
8119 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8120 (VE && VE != E))
8121 continue;
8122 std::optional<unsigned> EEIdx = getExtractIndex(EE);
8123 if (!EEIdx)
8124 continue;
8125 unsigned Idx = *EEIdx;
8126 // Take credit for instruction that will become dead.
8127 if (EE->hasOneUse() || !PrevNodeFound) {
8128 Instruction *Ext = EE->user_back();
8129 if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
8130 return isa<GetElementPtrInst>(U);
8131 })) {
8132 // Use getExtractWithExtendCost() to calculate the cost of
8133 // extractelement/ext pair.
8134 Cost -=
8135 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
8136 EE->getVectorOperandType(), Idx);
8137 // Add back the cost of s|zext which is subtracted separately.
8138 Cost += TTI.getCastInstrCost(
8139 Ext->getOpcode(), Ext->getType(), EE->getType(),
8140 TTI::getCastContextHint(Ext), CostKind, Ext);
8141 continue;
8142 }
8143 }
8144 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
8145 CostKind, Idx);
8146 }
8147 }
8148 // Check that the gather of extractelements can be represented as just a
8149 // shuffle of one or two vectors from which the scalars are extracted.
8150 // We found a bunch of extractelement instructions that must be gathered
8151 // into a vector and that can be represented as a permutation of the
8152 // elements of one or two input vectors.
8153 // This also covers reuse, when the same extractelements were vectorized already.
8154 if (!PrevNodeFound)
8155 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8156 InVectors.assign(1, E);
8157 CommonMask.assign(Mask.begin(), Mask.end());
8158 transformMaskAfterShuffle(CommonMask, CommonMask);
8159 SameNodesEstimated = false;
8160 if (NumParts != 1 && UniqueBases.size() != 1) {
8161 UseVecBaseAsInput = true;
8162 VecBase = Constant::getNullValue(
8163 FixedVectorType::get(VL.front()->getType(), CommonMask.size()));
8164 }
8165 return VecBase;
8166 }
8167 /// Checks if the specified entry \p E needs to be delayed because of its
8168 /// dependency nodes.
8169 std::optional<InstructionCost>
8170 needToDelay(const TreeEntry *,
8171 ArrayRef<SmallVector<const TreeEntry *>>) const {
8172 // No need to delay the cost estimation during analysis.
8173 return std::nullopt;
8174 }
8175 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
8176 if (&E1 == &E2) {
8177 assert(all_of(Mask,
8178 [&](int Idx) {
8179 return Idx < static_cast<int>(E1.getVectorFactor());
8180 }) &&
8181 "Expected single vector shuffle mask.");
8182 add(E1, Mask);
8183 return;
8184 }
8185 if (InVectors.empty()) {
8186 CommonMask.assign(Mask.begin(), Mask.end());
8187 InVectors.assign({&E1, &E2});
8188 return;
8189 }
8190 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8191 auto *MaskVecTy =
8192 FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
8193 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8194 if (NumParts == 0 || NumParts >= Mask.size())
8195 NumParts = 1;
8196 unsigned SliceSize = Mask.size() / NumParts;
8197 const auto *It =
8198 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8199 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8200 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8201 }
8202 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
8203 if (InVectors.empty()) {
8204 CommonMask.assign(Mask.begin(), Mask.end());
8205 InVectors.assign(1, &E1);
8206 return;
8207 }
8208 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8209 auto *MaskVecTy =
8210 FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
8211 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8212 if (NumParts == 0 || NumParts >= Mask.size())
8213 NumParts = 1;
8214 unsigned SliceSize = Mask.size() / NumParts;
8215 const auto *It =
8216 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8217 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8218 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
8219 if (!SameNodesEstimated && InVectors.size() == 1)
8220 InVectors.emplace_back(&E1);
8221 }
8222 /// Adds 2 input vectors and the mask for their shuffling.
8223 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
8224 // This can only be reached when shuffling 2 vectors of extractelements,
8225 // which is already handled in adjustExtracts.
8226 assert(InVectors.size() == 1 &&
8227 all_of(enumerate(CommonMask),
8228 [&](auto P) {
8229 if (P.value() == PoisonMaskElem)
8230 return Mask[P.index()] == PoisonMaskElem;
8231 auto *EI =
8232 cast<ExtractElementInst>(InVectors.front()
8233 .get<const TreeEntry *>()
8234 ->Scalars[P.index()]);
8235 return EI->getVectorOperand() == V1 ||
8236 EI->getVectorOperand() == V2;
8237 }) &&
8238 "Expected extractelement vectors.");
8239 }
8240 /// Adds another one input vector and the mask for the shuffling.
8241 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
8242 if (InVectors.empty()) {
8243 assert(CommonMask.empty() && !ForExtracts &&
8244 "Expected empty input mask/vectors.");
8245 CommonMask.assign(Mask.begin(), Mask.end());
8246 InVectors.assign(1, V1);
8247 return;
8248 }
8249 if (ForExtracts) {
8250 // No need to add vectors here, already handled them in adjustExtracts.
8251 assert(InVectors.size() == 1 &&
8252 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
8253 all_of(enumerate(CommonMask),
8254 [&](auto P) {
8255 Value *Scalar = InVectors.front()
8256 .get<const TreeEntry *>()
8257 ->Scalars[P.index()];
8258 if (P.value() == PoisonMaskElem)
8259 return P.value() == Mask[P.index()] ||
8260 isa<UndefValue>(Scalar);
8261 if (isa<Constant>(V1))
8262 return true;
8263 auto *EI = cast<ExtractElementInst>(Scalar);
8264 return EI->getVectorOperand() == V1;
8265 }) &&
8266 "Expected only tree entry for extractelement vectors.");
8267 return;
8268 }
8269 assert(!InVectors.empty() && !CommonMask.empty() &&
8270 "Expected only tree entries from extracts/reused buildvectors.");
8271 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8272 if (InVectors.size() == 2) {
8273 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
8274 transformMaskAfterShuffle(CommonMask, CommonMask);
8275 VF = std::max<unsigned>(VF, CommonMask.size());
8276 } else if (const auto *InTE =
8277 InVectors.front().dyn_cast<const TreeEntry *>()) {
8278 VF = std::max(VF, InTE->getVectorFactor());
8279 } else {
8280 VF = std::max(
8281 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
8282 ->getNumElements());
8283 }
8284 InVectors.push_back(V1);
8285 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8286 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8287 CommonMask[Idx] = Mask[Idx] + VF;
8288 }
8289 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
8290 Value *Root = nullptr) {
8291 Cost += getBuildVectorCost(VL, Root);
8292 if (!Root) {
8293 // FIXME: Need to find a way to avoid use of getNullValue here.
8294 SmallVector<Constant *> Vals;
8295 unsigned VF = VL.size();
8296 if (MaskVF != 0)
8297 VF = std::min(VF, MaskVF);
8298 for (Value *V : VL.take_front(VF)) {
8299 if (isa<UndefValue>(V)) {
8300 Vals.push_back(cast<Constant>(V));
8301 continue;
8302 }
8303 Vals.push_back(Constant::getNullValue(V->getType()));
8304 }
8305 return ConstantVector::get(Vals);
8306 }
8307 return ConstantVector::getSplat(
8308 ElementCount::getFixed(
8309 cast<FixedVectorType>(Root->getType())->getNumElements()),
8310 getAllOnesValue(*R.DL, VL.front()->getType()));
8311 }
8313 /// Finalize emission of the shuffles.
8314 InstructionCost
8315 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
8316 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
8317 IsFinalized = true;
8318 if (Action) {
8319 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
8320 if (InVectors.size() == 2)
8321 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
8322 else
8323 Cost += createShuffle(Vec, nullptr, CommonMask);
8324 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8325 if (CommonMask[Idx] != PoisonMaskElem)
8326 CommonMask[Idx] = Idx;
8327 assert(VF > 0 &&
8328 "Expected vector length for the final value before action.");
8329 Value *V = Vec.get<Value *>();
8330 Action(V, CommonMask);
8331 InVectors.front() = V;
8332 }
8333 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
8334 if (CommonMask.empty()) {
8335 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
8336 return Cost;
8337 }
8338 return Cost +
8339 createShuffle(InVectors.front(),
8340 InVectors.size() == 2 ? InVectors.back() : nullptr,
8341 CommonMask);
8342 }
8343
8344 ~ShuffleCostEstimator() {
8345 assert((IsFinalized || CommonMask.empty()) &&
8346 "Shuffle construction must be finalized.");
8347 }
8348};
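// Typical driver flow (a simplified sketch with assumed locals; the real call
// sites are processBuildVector() and getEntryCost() below):
//
//   SmallPtrSet<Value *, 4> CheckedExtracts;
//   ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this,
//                                  CheckedExtracts);
//   Estimator.add(*GatherTE, Mask);        // queue a virtual shuffle
//   Estimator.gather(GatherTE->Scalars);   // account for the build vector
//   InstructionCost C = Estimator.finalize(GatherTE->ReuseShuffleIndices);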
8349
8350const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
8351 unsigned Idx) const {
8352 Value *Op = E->getOperand(Idx).front();
8353 if (const TreeEntry *TE = getTreeEntry(Op)) {
8354 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8355 return EI.EdgeIdx == Idx && EI.UserTE == E;
8356 }) != TE->UserTreeIndices.end())
8357 return TE;
8358 auto MIt = MultiNodeScalars.find(Op);
8359 if (MIt != MultiNodeScalars.end()) {
8360 for (const TreeEntry *TE : MIt->second) {
8361 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8362 return EI.EdgeIdx == Idx && EI.UserTE == E;
8363 }) != TE->UserTreeIndices.end())
8364 return TE;
8365 }
8366 }
8367 }
8368 const auto *It =
8369 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8370 return TE->State == TreeEntry::NeedToGather &&
8371 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8372 return EI.EdgeIdx == Idx && EI.UserTE == E;
8373 }) != TE->UserTreeIndices.end();
8374 });
8375 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
8376 return It->get();
8377}
8378
8379InstructionCost
8380BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
8381 SmallPtrSetImpl<Value *> &CheckedExtracts) {
8382 ArrayRef<Value *> VL = E->Scalars;
8383
8384 Type *ScalarTy = VL[0]->getType();
8385 if (E->State != TreeEntry::NeedToGather) {
8386 if (auto *SI = dyn_cast<StoreInst>(VL[0]))
8387 ScalarTy = SI->getValueOperand()->getType();
8388 else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
8389 ScalarTy = CI->getOperand(0)->getType();
8390 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
8391 ScalarTy = IE->getOperand(1)->getType();
8392 }
8395 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
8396 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8397
8398 // If we have computed a smaller type for the expression, update VecTy so
8399 // that the costs will be accurate.
8400 auto It = MinBWs.find(E);
8401 if (It != MinBWs.end()) {
8402 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
8403 VecTy = FixedVectorType::get(ScalarTy, VL.size());
8404 }
8405 unsigned EntryVF = E->getVectorFactor();
8406 auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
8407
8408 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
8409 if (E->State == TreeEntry::NeedToGather) {
8410 if (allConstant(VL))
8411 return 0;
8412 if (isa<InsertElementInst>(VL[0]))
8414 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
8415 E, *TTI, VectorizedVals, *this, CheckedExtracts);
8416 }
8417 InstructionCost CommonCost = 0;
8418 SmallVector<int> Mask;
8419 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
8420 if (!E->ReorderIndices.empty() &&
8421 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
8422 SmallVector<int> NewMask;
8423 if (E->getOpcode() == Instruction::Store) {
8424 // For stores the order is actually a mask.
8425 NewMask.resize(E->ReorderIndices.size());
8426 copy(E->ReorderIndices, NewMask.begin());
8427 } else {
8428 inversePermutation(E->ReorderIndices, NewMask);
8429 }
8430 ::addMask(Mask, NewMask);
8431 }
8432 if (NeedToShuffleReuses)
8433 ::addMask(Mask, E->ReuseShuffleIndices);
8434 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
8435 CommonCost =
8436 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
8437 assert((E->State == TreeEntry::Vectorize ||
8438 E->State == TreeEntry::ScatterVectorize ||
8439 E->State == TreeEntry::StridedVectorize) &&
8440 "Unhandled state");
8441 assert(E->getOpcode() &&
8442 ((allSameType(VL) && allSameBlock(VL)) ||
8443 (E->getOpcode() == Instruction::GetElementPtr &&
8444 E->getMainOp()->getType()->isPointerTy())) &&
8445 "Invalid VL");
8446 Instruction *VL0 = E->getMainOp();
8447 unsigned ShuffleOrOp =
8448 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
8449 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
8450 const unsigned Sz = UniqueValues.size();
8451 SmallBitVector UsedScalars(Sz, false);
8452 for (unsigned I = 0; I < Sz; ++I) {
8453 if (getTreeEntry(UniqueValues[I]) == E)
8454 continue;
8455 UsedScalars.set(I);
8456 }
8457 auto GetCastContextHint = [&](Value *V) {
8458 if (const TreeEntry *OpTE = getTreeEntry(V)) {
8459 if (OpTE->State == TreeEntry::ScatterVectorize ||
8460 OpTE->State == TreeEntry::StridedVectorize)
8461 return TTI::CastContextHint::GatherScatter;
8462 if (OpTE->State == TreeEntry::Vectorize &&
8463 OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
8464 if (OpTE->ReorderIndices.empty())
8465 return TTI::CastContextHint::Normal;
8466 SmallVector<int> Mask;
8467 inversePermutation(OpTE->ReorderIndices, Mask);
8468 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
8469 return TTI::CastContextHint::Reversed;
8470 }
8471 } else {
8472 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
8473 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
8474 return TTI::CastContextHint::GatherScatter;
8475 }
8476 return TTI::CastContextHint::None;
8477 };
8478 auto GetCostDiff =
8479 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
8480 function_ref<InstructionCost(InstructionCost)> VectorCost) {
8481 // Calculate the cost of this instruction.
8482 InstructionCost ScalarCost = 0;
8483 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
8484 // For some of the instructions no need to calculate cost for each
8485 // particular instruction, we can use the cost of the single
8486 // instruction x total number of scalar instructions.
8487 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
8488 } else {
8489 for (unsigned I = 0; I < Sz; ++I) {
8490 if (UsedScalars.test(I))
8491 continue;
8492 ScalarCost += ScalarEltCost(I);
8493 }
8494 }
8495
8496 InstructionCost VecCost = VectorCost(CommonCost);
8497 // Check if the current node must be resized, if the parent node is not
8498 // resized.
8499 if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
8500 const EdgeInfo &EI = E->UserTreeIndices.front();
8501 if ((EI.UserTE->getOpcode() != Instruction::Select ||
8502 EI.EdgeIdx != 0) &&
8503 It != MinBWs.end()) {
8504 auto UserBWIt = MinBWs.find(EI.UserTE);
8505 Type *UserScalarTy =
8506 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
8507 if (UserBWIt != MinBWs.end())
8508 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
8509 UserBWIt->second.first);
8510 if (ScalarTy != UserScalarTy) {
8511 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
8512 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
8513 unsigned VecOpcode;
8514 auto *UserVecTy =
8515 FixedVectorType::get(UserScalarTy, E->getVectorFactor());
8516 if (BWSz > SrcBWSz)
8517 VecOpcode = Instruction::Trunc;
8518 else
8519 VecOpcode =
8520 It->second.second ? Instruction::SExt : Instruction::ZExt;
8521 TTI::CastContextHint CCH = GetCastContextHint(VL0);
8522 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
8523 CostKind);
8524 ScalarCost += Sz * TTI->getCastInstrCost(VecOpcode, UserScalarTy,
8525 ScalarTy, CCH, CostKind);
8526 }
8527 }
8528 }
8529 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
8530 ScalarCost, "Calculated costs for Tree"));
8531 return VecCost - ScalarCost;
8532 };
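 // Illustrative arithmetic (hypothetical costs): for four scalar adds costing
 // 1 each, ScalarCost = 4; if the vector add costs 1 and CommonCost (the final
 // permutation) is 1, GetCostDiff returns (1 + 1) - 4 = -2, and a negative
 // value means the vector form is cheaper than the scalars it replaces.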
8533 // Calculate cost difference from vectorizing set of GEPs.
8534 // Negative value means vectorizing is profitable.
8535 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
8536 assert((E->State == TreeEntry::Vectorize ||
8537 E->State == TreeEntry::StridedVectorize) &&
8538 "Entry state expected to be Vectorize or StridedVectorize here.");
8539 InstructionCost ScalarCost = 0;
8540 InstructionCost VecCost = 0;
8541 std::tie(ScalarCost, VecCost) = getGEPCosts(
8542 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, ScalarTy, VecTy);
8543 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
8544 "Calculated GEPs cost for Tree"));
8545
8546 return VecCost - ScalarCost;
8547 };
8548
8549 switch (ShuffleOrOp) {
8550 case Instruction::PHI: {
8551 // Count reused scalars.
8552 InstructionCost ScalarCost = 0;
8553 SmallPtrSet<const TreeEntry *, 4> CountedOps;
8554 for (Value *V : UniqueValues) {
8555 auto *PHI = dyn_cast<PHINode>(V);
8556 if (!PHI)
8557 continue;
8558
8559 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
8560 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
8561 Value *Op = PHI->getIncomingValue(I);
8562 Operands[I] = Op;
8563 }
8564 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
8565 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
8566 if (!OpTE->ReuseShuffleIndices.empty())
8567 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
8568 OpTE->Scalars.size());
8569 }
8570
8571 return CommonCost - ScalarCost;
8572 }
8573 case Instruction::ExtractValue:
8574 case Instruction::ExtractElement: {
8575 auto GetScalarCost = [&](unsigned Idx) {
8576 auto *I = cast<Instruction>(UniqueValues[Idx]);
8577 VectorType *SrcVecTy;
8578 if (ShuffleOrOp == Instruction::ExtractElement) {
8579 auto *EE = cast<ExtractElementInst>(I);
8580 SrcVecTy = EE->getVectorOperandType();
8581 } else {
8582 auto *EV = cast<ExtractValueInst>(I);
8583 Type *AggregateTy = EV->getAggregateOperand()->getType();
8584 unsigned NumElts;
8585 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
8586 NumElts = ATy->getNumElements();
8587 else
8588 NumElts = AggregateTy->getStructNumElements();
8589 SrcVecTy = FixedVectorType::get(ScalarTy, NumElts);
8590 }
8591 if (I->hasOneUse()) {
8592 Instruction *Ext = I->user_back();
8593 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
8594 all_of(Ext->users(),
8595 [](User *U) { return isa<GetElementPtrInst>(U); })) {
8596 // Use getExtractWithExtendCost() to calculate the cost of
8597 // extractelement/ext pair.
8598 InstructionCost Cost = TTI->getExtractWithExtendCost(
8599 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
8600 // Subtract the cost of s|zext which is subtracted separately.
8601 Cost -= TTI->getCastInstrCost(
8602 Ext->getOpcode(), Ext->getType(), I->getType(),
8603 TTI::getCastContextHint(Ext), CostKind, Ext);
8604 return Cost;
8605 }
8606 }
8607 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
8608 CostKind, *getExtractIndex(I));
8609 };
8610 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
8611 return GetCostDiff(GetScalarCost, GetVectorCost);
8612 }
8613 case Instruction::InsertElement: {
8614 assert(E->ReuseShuffleIndices.empty() &&
8615 "Unique insertelements only are expected.");
8616 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
8617 unsigned const NumElts = SrcVecTy->getNumElements();
8618 unsigned const NumScalars = VL.size();
8619
8620 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
8621
8622 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
8623 unsigned OffsetBeg = *getInsertIndex(VL.front());
8624 unsigned OffsetEnd = OffsetBeg;
8625 InsertMask[OffsetBeg] = 0;
8626 for (auto [I, V] : enumerate(VL.drop_front())) {
8627 unsigned Idx = *getInsertIndex(V);
8628 if (OffsetBeg > Idx)
8629 OffsetBeg = Idx;
8630 else if (OffsetEnd < Idx)
8631 OffsetEnd = Idx;
8632 InsertMask[Idx] = I + 1;
8633 }
8634 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
8635 if (NumOfParts > 0)
8636 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
8637 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
8638 VecScalarsSz;
8639 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
8640 unsigned InsertVecSz = std::min<unsigned>(
8641 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
8642 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
8643 bool IsWholeSubvector =
8644 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
8645 // Check if we can safely insert a subvector. If it is not possible, just
8646 // generate a whole-sized vector and shuffle the source vector and the new
8647 // subvector.
8648 if (OffsetBeg + InsertVecSz > VecSz) {
8649 // Align OffsetBeg to generate correct mask.
8650 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
8651 InsertVecSz = VecSz;
8652 }
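 // Worked example (hypothetical sizes): NumElts = 16 with NumOfParts = 2 gives
 // VecScalarsSz = 8; inserts at indices 3..6 give OffsetBeg = 3, OffsetEnd = 6,
 // VecSz = 8, Offset = 0 and InsertVecSz = min(PowerOf2Ceil(4), 8) = 4; since
 // 3 + 4 <= 8 no realignment is needed, and IsWholeSubvector is false because
 // OffsetBeg != Offset.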
8653
8654 APInt DemandedElts = APInt::getZero(NumElts);
8655 // TODO: Add support for Instruction::InsertValue.
8656 SmallVector<int> Mask;
8657 if (!E->ReorderIndices.empty()) {
8658 inversePermutation(E->ReorderIndices, Mask);
8659 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
8660 } else {
8661 Mask.assign(VecSz, PoisonMaskElem);
8662 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
8663 }
8664 bool IsIdentity = true;
8665 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
8666 Mask.swap(PrevMask);
8667 for (unsigned I = 0; I < NumScalars; ++I) {
8668 unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
8669 DemandedElts.setBit(InsertIdx);
8670 IsIdentity &= InsertIdx - OffsetBeg == I;
8671 Mask[InsertIdx - OffsetBeg] = I;
8672 }
8673 assert(Offset < NumElts && "Failed to find vector index offset");
8674
8675 InstructionCost Cost = 0;
8676 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
8677 /*Insert*/ true, /*Extract*/ false,
8678 CostKind);
8679
8680 // First cost - resize to actual vector size if not identity shuffle or
8681 // need to shift the vector.
8682 // Do not calculate the cost if the actual size is the register size and
8683 // we can merge this shuffle with the following SK_Select.
8684 auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz);
8685 if (!IsIdentity)
8686 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
8687 InsertVecTy, Mask);
8688 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
8689 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
8690 }));
8691 // Second cost - permutation with subvector, if some elements are from the
8692 // initial vector or inserting a subvector.
8693 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
8694 // subvector of ActualVecTy.
8695 SmallBitVector InMask =
8696 isUndefVector(FirstInsert->getOperand(0),
8697 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
8698 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
8699 if (InsertVecSz != VecSz) {
8700 auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz);
8701 Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
8702 std::nullopt, CostKind, OffsetBeg - Offset,
8703 InsertVecTy);
8704 } else {
8705 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
8706 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
8707 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
8708 I <= End; ++I)
8709 if (Mask[I] != PoisonMaskElem)
8710 Mask[I] = I + VecSz;
8711 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
8712 Mask[I] =
8713 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
8714 Cost +=
8715 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
8716 }
8717 }
8718 return Cost;
8719 }
8720 case Instruction::ZExt:
8721 case Instruction::SExt:
8722 case Instruction::FPToUI:
8723 case Instruction::FPToSI:
8724 case Instruction::FPExt:
8725 case Instruction::PtrToInt:
8726 case Instruction::IntToPtr:
8727 case Instruction::SIToFP:
8728 case Instruction::UIToFP:
8729 case Instruction::Trunc:
8730 case Instruction::FPTrunc:
8731 case Instruction::BitCast: {
8732 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
8733 Type *SrcScalarTy = VL0->getOperand(0)->getType();
8734 auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
8735 unsigned Opcode = ShuffleOrOp;
8736 unsigned VecOpcode = Opcode;
8737 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
8738 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
8739 // Check if the values are candidates to demote.
8740 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
8741 if (SrcIt != MinBWs.end()) {
8742 SrcBWSz = SrcIt->second.first;
8743 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
8744 SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
8745 }
8746 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
8747 if (BWSz == SrcBWSz) {
8748 VecOpcode = Instruction::BitCast;
8749 } else if (BWSz < SrcBWSz) {
8750 VecOpcode = Instruction::Trunc;
8751 } else if (It != MinBWs.end()) {
8752 assert(BWSz > SrcBWSz && "Invalid cast!");
8753 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
8754 }
8755 }
8756 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
8757 // Do not count cost here if minimum bitwidth is in effect and it is just
8758 // a bitcast (here it is just a noop).
8759 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
8760 return TTI::TCC_Free;
8761 auto *VI = VL0->getOpcode() == Opcode
8762 ? cast<Instruction>(UniqueValues[Idx])
8763 : nullptr;
8764 return TTI->getCastInstrCost(Opcode, VL0->getType(),
8765 VL0->getOperand(0)->getType(),
8766 TTI::getCastContextHint(VI), CostKind, VI);
8767 };
8768 auto GetVectorCost = [=](InstructionCost CommonCost) {
8769 // Do not count cost here if minimum bitwidth is in effect and it is just
8770 // a bitcast (here it is just a noop).
8771 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
8772 return CommonCost;
8773 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
8774 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
8775 return CommonCost +
8776 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
8777 VecOpcode == Opcode ? VI : nullptr);
8778 };
8779 return GetCostDiff(GetScalarCost, GetVectorCost);
8780 }
8781 case Instruction::FCmp:
8782 case Instruction::ICmp:
8783 case Instruction::Select: {
8784 CmpInst::Predicate VecPred, SwappedVecPred;
8785 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
8786 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
8787 match(VL0, MatchCmp))
8788 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
8789 else
8790 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
8791 ? CmpInst::BAD_FCMP_PREDICATE
8792 : CmpInst::BAD_ICMP_PREDICATE;
8793 auto GetScalarCost = [&](unsigned Idx) {
8794 auto *VI = cast<Instruction>(UniqueValues[Idx]);
8795 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
8796 ? CmpInst::BAD_FCMP_PREDICATE
8797 : CmpInst::BAD_ICMP_PREDICATE;
8798 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
8799 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
8800 !match(VI, MatchCmp)) ||
8801 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
8802 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
8803 ? CmpInst::BAD_FCMP_PREDICATE
8804 : CmpInst::BAD_ICMP_PREDICATE;
8805
8806 return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
8807 Builder.getInt1Ty(), CurrentPred, CostKind,
8808 VI);
8809 };
8810 auto GetVectorCost = [&](InstructionCost CommonCost) {
8811 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
8812
8813 InstructionCost VecCost = TTI->getCmpSelInstrCost(
8814 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
8815 // Check if it is possible and profitable to use min/max for selects
8816 // in VL.
8817 //
8818 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
8819 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
8820 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
8821 {VecTy, VecTy});
8822 InstructionCost IntrinsicCost =
8823 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
8824 // If the selects are the only uses of the compares, they will be
8825 // dead and we can adjust the cost by removing their cost.
8826 if (IntrinsicAndUse.second)
8827 IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
8828 MaskTy, VecPred, CostKind);
8829 VecCost = std::min(VecCost, IntrinsicCost);
8830 }
8831 return VecCost + CommonCost;
8832 };
8833 return GetCostDiff(GetScalarCost, GetVectorCost);
8834 }
8835 case Instruction::FNeg:
8836 case Instruction::Add:
8837 case Instruction::FAdd:
8838 case Instruction::Sub:
8839 case Instruction::FSub:
8840 case Instruction::Mul:
8841 case Instruction::FMul:
8842 case Instruction::UDiv:
8843 case Instruction::SDiv:
8844 case Instruction::FDiv:
8845 case Instruction::URem:
8846 case Instruction::SRem:
8847 case Instruction::FRem:
8848 case Instruction::Shl:
8849 case Instruction::LShr:
8850 case Instruction::AShr:
8851 case Instruction::And:
8852 case Instruction::Or:
8853 case Instruction::Xor: {
8854 auto GetScalarCost = [&](unsigned Idx) {
8855 auto *VI = cast<Instruction>(UniqueValues[Idx]);
8856 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
8857 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
8858 TTI::OperandValueInfo Op2Info =
8859 TTI::getOperandInfo(VI->getOperand(OpIdx));
8860 SmallVector<const Value *> Operands(VI->operand_values());
8861 return TTI->getArithmeticInstrCost(ShuffleOrOp, ScalarTy, CostKind,
8862 Op1Info, Op2Info, Operands, VI);
8863 };
8864 auto GetVectorCost = [=](InstructionCost CommonCost) {
8865 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
8866 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
8867 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
8868 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
8869 Op2Info, std::nullopt, nullptr, TLI) +
8870 CommonCost;
8871 };
8872 return GetCostDiff(GetScalarCost, GetVectorCost);
8873 }
8874 case Instruction::GetElementPtr: {
8875 return CommonCost + GetGEPCostDiff(VL, VL0);
8876 }
8877 case Instruction::Load: {
8878 auto GetScalarCost = [&](unsigned Idx) {
8879 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
8880 return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
8881 VI->getPointerAddressSpace(), CostKind,
8882 TTI::OperandValueInfo(), VI);
8883 };
8884 auto *LI0 = cast<LoadInst>(VL0);
8885 auto GetVectorCost = [&](InstructionCost CommonCost) {
8886 InstructionCost VecLdCost;
8887 if (E->State == TreeEntry::Vectorize) {
8888 VecLdCost = TTI->getMemoryOpCost(
8889 Instruction::Load, VecTy, LI0->getAlign(),
8890 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
8891 } else if (E->State == TreeEntry::StridedVectorize) {
8892 Align CommonAlignment =
8893 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
8894 VecLdCost = TTI->getStridedMemoryOpCost(
8895 Instruction::Load, VecTy, LI0->getPointerOperand(),
8896 /*VariableMask=*/false, CommonAlignment, CostKind);
8897 } else {
8898 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
8899 Align CommonAlignment =
8900 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
8901 VecLdCost = TTI->getGatherScatterOpCost(
8902 Instruction::Load, VecTy, LI0->getPointerOperand(),
8903 /*VariableMask=*/false, CommonAlignment, CostKind);
8904 }
8905 return VecLdCost + CommonCost;
8906 };
8907
8908 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
8909 // If this node generates a masked gather load, it is not a terminal node;
8910 // hence the address operand cost is estimated separately.
8911 if (E->State == TreeEntry::ScatterVectorize)
8912 return Cost;
8913
8914 // Estimate cost of GEPs since this tree node is a terminator.
8915 SmallVector<Value *> PointerOps(VL.size());
8916 for (auto [I, V] : enumerate(VL))
8917 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8918 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
8919 }
8920 case Instruction::Store: {
8921 bool IsReorder = !E->ReorderIndices.empty();
8922 auto GetScalarCost = [=](unsigned Idx) {
8923 auto *VI = cast<StoreInst>(VL[Idx]);
8924 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
8925 return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(),
8926 VI->getPointerAddressSpace(), CostKind,
8927 OpInfo, VI);
8928 };
8929 auto *BaseSI =
8930 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
8931 auto GetVectorCost = [=](InstructionCost CommonCost) {
8932 // We know that we can merge the stores. Calculate the cost.
8933 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
8934 return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
8935 BaseSI->getPointerAddressSpace(), CostKind,
8936 OpInfo) +
8937 CommonCost;
8938 };
8939 SmallVector<Value *> PointerOps(VL.size());
8940 for (auto [I, V] : enumerate(VL)) {
8941 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
8942 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
8943 }
8944
8945 return GetCostDiff(GetScalarCost, GetVectorCost) +
8946 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
8947 }
8948 case Instruction::Call: {
8949 auto GetScalarCost = [&](unsigned Idx) {
8950 auto *CI = cast<CallInst>(UniqueValues[Idx]);
8951 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8952 if (ID != Intrinsic::not_intrinsic) {
8953 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
8954 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
8955 }
8956 return TTI->getCallInstrCost(CI->getCalledFunction(),
8957 CI->getFunctionType()->getReturnType(),
8958 CI->getFunctionType()->params(), CostKind);
8959 };
8960 auto GetVectorCost = [=](InstructionCost CommonCost) {
8961 auto *CI = cast<CallInst>(VL0);
8962 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
8963 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
8964 };
8965 return GetCostDiff(GetScalarCost, GetVectorCost);
8966 }
8967 case Instruction::ShuffleVector: {
8968 assert(E->isAltShuffle() &&
8969 ((Instruction::isBinaryOp(E->getOpcode()) &&
8970 Instruction::isBinaryOp(E->getAltOpcode())) ||
8971 (Instruction::isCast(E->getOpcode()) &&
8972 Instruction::isCast(E->getAltOpcode())) ||
8973 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
8974 "Invalid Shuffle Vector Operand");
8975 // Try to find the previous shuffle node with the same operands and same
8976 // main/alternate ops.
8977 auto TryFindNodeWithEqualOperands = [=]() {
8978 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8979 if (TE.get() == E)
8980 break;
8981 if (TE->isAltShuffle() &&
8982 ((TE->getOpcode() == E->getOpcode() &&
8983 TE->getAltOpcode() == E->getAltOpcode()) ||
8984 (TE->getOpcode() == E->getAltOpcode() &&
8985 TE->getAltOpcode() == E->getOpcode())) &&
8986 TE->hasEqualOperands(*E))
8987 return true;
8988 }
8989 return false;
8990 };
8991 auto GetScalarCost = [&](unsigned Idx) {
8992 auto *VI = cast<Instruction>(UniqueValues[Idx]);
8993 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
8994 (void)E;
8995 return TTI->getInstructionCost(VI, CostKind);
8996 };
8997 // FIXME: Workaround for syntax error reported by MSVC buildbots.
8998 TargetTransformInfo &TTIRef = *TTI;
8999 // Need to clear CommonCost since the final shuffle cost is included into
9000 // vector cost.
9001 auto GetVectorCost = [&](InstructionCost) {
9002 // VecCost is equal to sum of the cost of creating 2 vectors
9003 // and the cost of creating shuffle.
9004 InstructionCost VecCost = 0;
9005 if (TryFindNodeWithEqualOperands()) {
9006 LLVM_DEBUG({
9007 dbgs() << "SLP: diamond match for alternate node found.\n";
9008 E->dump();
9009 });
9010 // No need to add new vector costs here since we're going to reuse
9011 // same main/alternate vector ops, just do different shuffling.
9012 } else if (Instruction::isBinaryOp(E->getOpcode())) {
9013 VecCost =
9014 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
9015 VecCost +=
9016 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
9017 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9018 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9019 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9020 CI0->getPredicate(), CostKind, VL0);
9021 VecCost += TTIRef.getCmpSelInstrCost(
9022 E->getOpcode(), VecTy, MaskTy,
9023 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
9024 E->getAltOp());
9025 } else {
9026 Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
9027 Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
9028 auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
9029 auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
9030 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
9031 TTI::CastContextHint::None, CostKind);
9032 VecCost +=
9033 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
9034 TTI::CastContextHint::None, CostKind);
9035 }
9036 SmallVector<int> Mask;
9037 E->buildAltOpShuffleMask(
9038 [E](Instruction *I) {
9039 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9040 return I->getOpcode() == E->getAltOpcode();
9041 },
9042 Mask);
9043 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
9044 FinalVecTy, Mask);
9045 // Patterns like [fadd,fsub] can be combined into a single instruction
9046 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9047 // need to take into account their order when looking for the most used
9048 // order.
9049 unsigned Opcode0 = E->getOpcode();
9050 unsigned Opcode1 = E->getAltOpcode();
9051 // The opcode mask selects between the two opcodes.
9052 SmallBitVector OpcodeMask(E->Scalars.size(), false);
9053 for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
9054 if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
9055 OpcodeMask.set(Lane);
9056 // If this pattern is supported by the target then we consider the
9057 // order.
9058 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9059 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9060 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9061 return AltVecCost < VecCost ? AltVecCost : VecCost;
9062 }
9063 // TODO: Check the reverse order too.
9064 return VecCost;
9065 };
9066 return GetCostDiff(GetScalarCost, GetVectorCost);
9067 }
9068 default:
9069 llvm_unreachable("Unknown instruction");
9070 }
9071}
9072
9073bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9074 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
9075 << VectorizableTree.size() << " is fully vectorizable.\n");
9076
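// A gather node is treated as cheap enough here if it contains no ephemeral
// values and its scalars are all constants, a splat, a short list, a set of
// extractelements/undefs that form a fixed-vector shuffle, or a non-alternate
// load bundle.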
9077 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
9078 SmallVector<int> Mask;
9079 return TE->State == TreeEntry::NeedToGather &&
9080 !any_of(TE->Scalars,
9081 [this](Value *V) { return EphValues.contains(V); }) &&
9082 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
9083 TE->Scalars.size() < Limit ||
9084 ((TE->getOpcode() == Instruction::ExtractElement ||
9085 all_of(TE->Scalars,
9086 [](Value *V) {
9087 return isa<ExtractElementInst, UndefValue>(V);
9088 })) &&
9089 isFixedVectorShuffle(TE->Scalars, Mask)) ||
9090 (TE->State == TreeEntry::NeedToGather &&
9091 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
9092 };
9093
9094 // We only handle trees of heights 1 and 2.
9095 if (VectorizableTree.size() == 1 &&
9096 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9097 (ForReduction &&
9098 AreVectorizableGathers(VectorizableTree[0].get(),
9099 VectorizableTree[0]->Scalars.size()) &&
9100 VectorizableTree[0]->getVectorFactor() > 2)))
9101 return true;
9102
9103 if (VectorizableTree.size() != 2)
9104 return false;
9105
9106 // Handle splat and all-constants stores. Also try to vectorize tiny trees
9107 // with a second gather node if it has fewer scalar operands than the
9108 // initial tree element (it may be profitable to shuffle the second gather),
9109 // or if its scalars are extractelements which form a shuffle.
9110 SmallVector<int> Mask;
9111 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9112 AreVectorizableGathers(VectorizableTree[1].get(),
9113 VectorizableTree[0]->Scalars.size()))
9114 return true;
9115
9116 // Gathering cost would be too much for tiny trees.
9117 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9118 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9119 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9120 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9121 return false;
9122
9123 return true;
9124}
9125
9126static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
9127 TargetTransformInfo *TTI,
9128 bool MustMatchOrInst) {
9129 // Look past the root to find a source value. Arbitrarily follow the
9130 // path through operand 0 of any 'or'. Also, peek through optional
9131 // shift-left-by-multiple-of-8-bits.
9132 Value *ZextLoad = Root;
9133 const APInt *ShAmtC;
9134 bool FoundOr = false;
9135 while (!isa<ConstantExpr>(ZextLoad) &&
9136 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
9137 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
9138 ShAmtC->urem(8) == 0))) {
9139 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9140 ZextLoad = BinOp->getOperand(0);
9141 if (BinOp->getOpcode() == Instruction::Or)
9142 FoundOr = true;
9143 }
9144 // Check if the input is an extended load of the required or/shift expression.
9145 Value *Load;
9146 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9147 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
9148 return false;
9149
9150 // Require that the total load bit width is a legal integer type.
9151 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
9152 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
9153 Type *SrcTy = Load->getType();
9154 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
9155 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
9156 return false;
9157
9158 // Everything matched - assume that we can fold the whole sequence using
9159 // load combining.
9160 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
9161 << *(cast<Instruction>(Root)) << "\n");
9162
9163 return true;
9164}
9165
9167 if (RdxKind != RecurKind::Or)
9168 return false;
9169
9170 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9171 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9172 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
9173 /* MatchOr */ false);
9174}
9175
9176bool BoUpSLP::isLoadCombineCandidate() const {
9177 // Peek through a final sequence of stores and check if all operations are
9178 // likely to be load-combined.
9179 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9180 for (Value *Scalar : VectorizableTree[0]->Scalars) {
9181 Value *X;
9182 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
9183 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
9184 return false;
9185 }
9186 return true;
9187}
9188
9189bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
9190 // No need to vectorize inserts of gathered values.
9191 if (VectorizableTree.size() == 2 &&
9192 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9193 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9194 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9195 !(isSplat(VectorizableTree[1]->Scalars) ||
9196 allConstant(VectorizableTree[1]->Scalars))))
9197 return true;
9198
9199 // If the graph includes only PHI nodes and gathers, it is definitely not
9200 // profitable for vectorization, so we can skip it if the cost threshold is
9201 // at its default. The cost of vectorized PHI nodes is almost always 0 plus
9202 // the cost of gathers/buildvectors.
9203 constexpr int Limit = 4;
9204 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
9205 !VectorizableTree.empty() &&
9206 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9207 return (TE->State == TreeEntry::NeedToGather &&
9208 TE->getOpcode() != Instruction::ExtractElement &&
9209 count_if(TE->Scalars,
9210 [](Value *V) { return isa<ExtractElementInst>(V); }) <=
9211 Limit) ||
9212 TE->getOpcode() == Instruction::PHI;
9213 }))
9214 return true;
9215
9216 // We can vectorize the tree if its size is greater than or equal to the
9217 // minimum size specified by the MinTreeSize command line option.
9218 if (VectorizableTree.size() >= MinTreeSize)
9219 return false;
9220
9221 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
9222 // can vectorize it if we can prove it fully vectorizable.
9223 if (isFullyVectorizableTinyTree(ForReduction))
9224 return false;
9225
9226 // Check if any of the gather nodes forms an insertelement buildvector
9227 // somewhere.
9228 if (any_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
9229 return TE->State == TreeEntry::NeedToGather &&
9230 all_of(TE->Scalars, [](Value *V) {
9231 return isa<ExtractElementInst, UndefValue>(V) ||
9232 (!V->hasNUsesOrMore(UsesLimit) &&
9233 any_of(V->users(), [](User *U) {
9234 return isa<InsertElementInst>(U);
9235 }));
9236 });
9237 }))
9238 return false;
9239
9240 assert(VectorizableTree.empty()
9241 ? ExternalUses.empty()
9242 : true && "We shouldn't have any external users");
9243
9244 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
9245 // vectorizable.
9246 return true;
9247}
9248
9249InstructionCost BoUpSLP::getSpillCost() const {
9250 // Walk from the bottom of the tree to the top, tracking which values are
9251 // live. When we see a call instruction that is not part of our tree,
9252 // query TTI to see if there is a cost to keeping values live over it
9253 // (for example, if spills and fills are required).
9254 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9255 InstructionCost Cost = 0;
9256
9257 SmallPtrSet<Instruction *, 4> LiveValues;
9258 Instruction *PrevInst = nullptr;
9259
9260 // The entries in VectorizableTree are not necessarily ordered by their
9261 // position in basic blocks. Collect them and order them by dominance so later
9262 // instructions are guaranteed to be visited first. For instructions in
9263 // different basic blocks, we only scan to the beginning of the block, so
9264 // their order does not matter, as long as all instructions in a basic block
9265 // are grouped together. Using dominance ensures a deterministic order.
9266 SmallVector<Instruction *, 16> OrderedScalars;
9267 for (const auto &TEPtr : VectorizableTree) {
9268 if (TEPtr->State != TreeEntry::Vectorize)
9269 continue;
9270 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9271 if (!Inst)
9272 continue;
9273 OrderedScalars.push_back(Inst);
9274 }
9275 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
9276 auto *NodeA = DT->getNode(A->getParent());
9277 auto *NodeB = DT->getNode(B->getParent());
9278 assert(NodeA && "Should only process reachable instructions");
9279 assert(NodeB && "Should only process reachable instructions");
9280 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9281 "Different nodes should have different DFS numbers");
9282 if (NodeA != NodeB)
9283 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9284 return B->comesBefore(A);
9285 });
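// After the sort, OrderedScalars is in reverse program order: within a block
// later instructions come first, and dominated blocks precede their
// dominators, matching the bottom-up walk below.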
9286
9287 for (Instruction *Inst : OrderedScalars) {
9288 if (!PrevInst) {
9289 PrevInst = Inst;
9290 continue;
9291 }
9292
9293 // Update LiveValues.
9294 LiveValues.erase(PrevInst);
9295 for (auto &J : PrevInst->operands()) {
9296 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
9297 LiveValues.insert(cast<Instruction>(&*J));
9298 }
9299
9300 LLVM_DEBUG({
9301 dbgs() << "SLP: #LV: " << LiveValues.size();
9302 for (auto *X : LiveValues)
9303 dbgs() << " " << X->getName();
9304 dbgs() << ", Looking at ";
9305 Inst->dump();
9306 });
9307
9308 // Now find the sequence of instructions between PrevInst and Inst.
9309 unsigned NumCalls = 0;
9310 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
9311 PrevInstIt =
9312 PrevInst->getIterator().getReverse();
9313 while (InstIt != PrevInstIt) {
9314 if (PrevInstIt == PrevInst->getParent()->rend()) {
9315 PrevInstIt = Inst->getParent()->rbegin();
9316 continue;
9317 }
9318
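// Returns true for intrinsics that are not expected to be lowered to an
// actual call (assume-like intrinsics, or intrinsics whose intrinsic cost is
// cheaper than the corresponding call cost), so they do not force live values
// to be spilled around them.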
9319 auto NoCallIntrinsic = [this](Instruction *I) {
9320 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
9321 if (II->isAssumeLikeIntrinsic())
9322 return true;
9323 FastMathFlags FMF;
9324 SmallVector<Type *, 4> Tys;
9325 for (auto &ArgOp : II->args())
9326 Tys.push_back(ArgOp->getType());
9327 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
9328 FMF = FPMO->getFastMathFlags();
9329 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
9330 FMF);
9331 InstructionCost IntrCost =
9332 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
9333 InstructionCost CallCost = TTI->getCallInstrCost(
9334 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
9335 if (IntrCost < CallCost)
9336 return true;
9337 }
9338 return false;
9339 };
9340
9341 // Debug information does not impact spill cost.
9342 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
9343 &*PrevInstIt != PrevInst)
9344 NumCalls++;
9345
9346 ++PrevInstIt;
9347 }
9348
9349 if (NumCalls) {
9350 SmallVector<Type *, 4> V;
9351 for (auto *II : LiveValues) {
9352 auto *ScalarTy = II->getType();
9353 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
9354 ScalarTy = VectorTy->getElementType();
9355 V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
9356 }
9357 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
9358 }
9359
9360 PrevInst = Inst;
9361 }
9362
9363 return Cost;
9364}
9365
9366/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
9367/// the buildvector sequence.
9368static bool isFirstInsertElement(const InsertElementInst *IE1,
9369 const InsertElementInst *IE2) {
9370 if (IE1 == IE2)
9371 return false;
9372 const auto *I1 = IE1;
9373 const auto *I2 = IE2;
9374 const InsertElementInst *PrevI1;
9375 const InsertElementInst *PrevI2;
9376 unsigned Idx1 = *getInsertIndex(IE1);
9377 unsigned Idx2 = *getInsertIndex(IE2);
9378 do {
9379 if (I2 == IE1)
9380 return true;
9381 if (I1 == IE2)
9382 return false;
9383 PrevI1 = I1;
9384 PrevI2 = I2;
9385 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
9386 getInsertIndex(I1).value_or(Idx2) != Idx2)
9387 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
9388 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
9389 getInsertIndex(I2).value_or(Idx1) != Idx1)
9390 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
9391 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
9392 llvm_unreachable("Two different buildvectors not expected.");
9393}
9394
9395namespace {
9396/// Returns the incoming Value * if the requested type is Value * too, or a
9397/// default-constructed value otherwise.
9398struct ValueSelect {
9399 template <typename U>
9400 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
9401 return V;
9402 }
9403 template <typename U>
9404 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
9405 return U();
9406 }
9407};
9408} // namespace
9409
9410/// Does the analysis of the provided shuffle masks and performs the requested
9411/// actions on the vectors with the given shuffle masks. It tries to do it in
9412/// several steps.
9413/// 1. If the Base vector is not an undef vector, resize the very first mask to
9414/// have a common VF and perform the action for 2 input vectors (including the
9415/// non-undef Base). Other shuffle masks are combined with the result of the
9416/// first stage and processed as a shuffle of 2 vectors.
9417/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
9418/// the action only for 1 vector with the given mask, if it is not the identity
9419/// mask.
9420/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
9421/// vectors, combining the masks properly between the steps.
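/// \returns the result of the last applied \p Action (or, when a single
/// identity mask is used, the result of \p ResizeAction).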
9422 template <typename T>
9423static T *performExtractsShuffleAction(
9424 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
9425 function_ref<unsigned(T *)> GetVF,
9426 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
9427 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
9428 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
9429 SmallVector<int> Mask(ShuffleMask.begin()->second);
9430 auto VMIt = std::next(ShuffleMask.begin());
9431 T *Prev = nullptr;
9432 SmallBitVector UseMask =
9433 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
9434 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
9435 if (!IsBaseUndef.all()) {
9436 // Base is not undef, need to combine it with the next subvectors.
9437 std::pair<T *, bool> Res =
9438 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
9439 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
9440 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
9441 if (Mask[Idx] == PoisonMaskElem)
9442 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
9443 else
9444 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
9445 }
9446 auto *V = ValueSelect::get<T *>(Base);
9447 (void)V;
9448 assert((!V || GetVF(V) == Mask.size()) &&
9449 "Expected base vector of VF number of elements.");
9450 Prev = Action(Mask, {nullptr, Res.first});
9451 } else if (ShuffleMask.size() == 1) {
9452 // Base is undef and only 1 vector is shuffled - perform the action only
9453 // for a single vector, if the mask is not the identity mask.
9454 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
9455 /*ForSingleMask=*/true);
9456 if (Res.second)
9457 // Identity mask is found.
9458 Prev = Res.first;
9459 else
9460 Prev = Action(Mask, {ShuffleMask.begin()->first});
9461 } else {
9462 // Base is undef and at least 2 input vectors are shuffled - perform the
9463 // 2-vector shuffles step by step, combining the shuffles between the steps.
9464 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
9465 unsigned Vec2VF = GetVF(VMIt->first);
9466 if (Vec1VF == Vec2VF) {
9467 // No need to resize the input vectors since they are of the same size, we
9468 // can shuffle them directly.
9469 ArrayRef<int> SecMask = VMIt->second;
9470 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9471 if (SecMask[I] != PoisonMaskElem) {
9472 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9473 Mask[I] = SecMask[I] + Vec1VF;
9474 }
9475 }
9476 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
9477 } else {
9478 // Vectors of different sizes - resize and reshuffle.
9479 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
9480 /*ForSingleMask=*/false);
9481 std::pair<T *, bool> Res2 =
9482 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9483 ArrayRef<int> SecMask = VMIt->second;
9484 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9485 if (Mask[I] != PoisonMaskElem) {
9486 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9487 if (Res1.second)
9488 Mask[I] = I;
9489 } else if (SecMask[I] != PoisonMaskElem) {
9490 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9491 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
9492 }
9493 }
9494 Prev = Action(Mask, {Res1.first, Res2.first});
9495 }
9496 VMIt = std::next(VMIt);
9497 }
9498 bool IsBaseNotUndef = !IsBaseUndef.all();
9499 (void)IsBaseNotUndef;
9500 // Perform requested actions for the remaining masks/vectors.
9501 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
9502 // Shuffle other input vectors, if any.
9503 std::pair<T *, bool> Res =
9504 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9505 ArrayRef<int> SecMask = VMIt->second;
9506 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9507 if (SecMask[I] != PoisonMaskElem) {
9508 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
9509 "Multiple uses of scalars.");
9510 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
9511 } else if (Mask[I] != PoisonMaskElem) {
9512 Mask[I] = I;
9513 }
9514 }
9515 Prev = Action(Mask, {Prev, Res.first});
9516 }
9517 return Prev;
9518}
9519
9520InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
9521 InstructionCost Cost = 0;
9522 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
9523 << VectorizableTree.size() << ".\n");
9524
9525 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
9526
9527 SmallPtrSet<Value *, 4> CheckedExtracts;
9528 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
9529 TreeEntry &TE = *VectorizableTree[I];
9530 if (TE.State == TreeEntry::NeedToGather) {
9531 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
9532 E && E->getVectorFactor() == TE.getVectorFactor() &&
9533 E->isSame(TE.Scalars)) {
9534 // Some gather nodes might be absolutely the same as some vectorizable
9535 // nodes after reordering, need to handle it.
9536 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
9537 << shortBundleName(TE.Scalars) << ".\n"
9538 << "SLP: Current total cost = " << Cost << "\n");
9539 continue;
9540 }
9541 }
9542
9543 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
9544 Cost += C;
9545 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
9546 << shortBundleName(TE.Scalars) << ".\n"
9547 << "SLP: Current total cost = " << Cost << "\n");
9548 }
9549
9550 SmallPtrSet<Value *, 16> ExtractCostCalculated;
9551 InstructionCost ExtractCost = 0;
9552 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
9553 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
9554 SmallVector<APInt> DemandedElts;
9555 SmallDenseSet<Value *, 4> UsedInserts;
9556 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
9557 for (ExternalUser &EU : ExternalUses) {
9558 // We only add extract cost once for the same scalar.
9559 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
9560 !ExtractCostCalculated.insert(EU.Scalar).second)
9561 continue;
9562
9563 // Uses by ephemeral values are free (because the ephemeral value will be
9564 // removed prior to code generation, and so the extraction will be
9565 // removed as well).
9566 if (EphValues.count(EU.User))
9567 continue;
9568
9569 // No extract cost for vector "scalar"
9570 if (isa<FixedVectorType>(EU.Scalar->getType()))
9571 continue;
9572
9573 // If found user is an insertelement, do not calculate extract cost but try
9574 // to detect it as a final shuffled/identity match.
9575 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
9576 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
9577 if (!UsedInserts.insert(VU).second)
9578 continue;
9579 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
9580 if (InsertIdx) {
9581 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
9582 auto *It = find_if(
9583 FirstUsers,
9584 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
9585 return areTwoInsertFromSameBuildVector(
9586 VU, cast<InsertElementInst>(Pair.first),
9587 [this](InsertElementInst *II) -> Value * {
9588 Value *Op0 = II->getOperand(0);
9589 if (getTreeEntry(II) && !getTreeEntry(Op0))
9590 return nullptr;
9591 return Op0;
9592 });
9593 });
9594 int VecId = -1;
9595 if (It == FirstUsers.end()) {
9596 (void)ShuffleMasks.emplace_back();
9597 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
9598 if (Mask.empty())
9599 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
9600 // Find the insertvector, vectorized in tree, if any.
9601 Value *Base = VU;
9602 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
9603 if (IEBase != EU.User &&
9604 (!IEBase->hasOneUse() ||
9605 getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
9606 break;
9607 // Build the mask for the vectorized insertelement instructions.
9608 if (const TreeEntry *E = getTreeEntry(IEBase)) {
9609 VU = IEBase;
9610 do {
9611 IEBase = cast<InsertElementInst>(Base);
9612 int Idx = *getInsertIndex(IEBase);
9613 assert(Mask[Idx] == PoisonMaskElem &&
9614 "InsertElementInstruction used already.");
9615 Mask[Idx] = Idx;
9616 Base = IEBase->getOperand(0);
9617 } while (E == getTreeEntry(Base));
9618 break;
9619 }
9620 Base = cast<InsertElementInst>(Base)->getOperand(0);
9621 }
9622 FirstUsers.emplace_back(VU, ScalarTE);
9623 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
9624 VecId = FirstUsers.size() - 1;
9625 auto It = MinBWs.find(ScalarTE);
9626 if (It != MinBWs.end() &&
9627 VectorCasts
9628 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
9629 .second) {
9630 unsigned BWSz = It->second.first;
9631 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
9632 unsigned VecOpcode;
9633 if (DstBWSz < BWSz)
9634 VecOpcode = Instruction::Trunc;
9635 else
9636 VecOpcode =
9637 It->second.second ? Instruction::SExt : Instruction::ZExt;
9640 VecOpcode, FTy,
9642 IntegerType::get(FTy->getContext(), BWSz),
9643 FTy->getNumElements()),
9645 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
9646 << " for extending externally used vector with "
9647 "non-equal minimum bitwidth.\n");
9648 Cost += C;
9649 }
9650 } else {
9651 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
9652 It->first = VU;
9653 VecId = std::distance(FirstUsers.begin(), It);
9654 }
9655 int InIdx = *InsertIdx;
9656 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
9657 if (Mask.empty())
9658 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
9659 Mask[InIdx] = EU.Lane;
9660 DemandedElts[VecId].setBit(InIdx);
9661 continue;
9662 }
9663 }
9664 }
9665
9666 // If we plan to rewrite the tree in a smaller type, we will need to sign
9667 // extend the extracted value back to the original type. Here, we account
9668 // for the extract and the added cost of the sign extend if needed.
9669 auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
9671 auto It = MinBWs.find(getTreeEntry(EU.Scalar));
9672 if (It != MinBWs.end()) {
9673 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
9674 unsigned Extend =
9675 It->second.second ? Instruction::SExt : Instruction::ZExt;
9676 VecTy = FixedVectorType::get(MinTy, BundleWidth);
9677 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
9678 VecTy, EU.Lane);
9679 } else {
9680 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
9681 CostKind, EU.Lane);
9682 }
9683 }
9684 // Add reduced value cost, if resized.
9685 if (!VectorizedVals.empty()) {
9686 const TreeEntry &Root = *VectorizableTree.front().get();
9687 auto BWIt = MinBWs.find(&Root);
9688 if (BWIt != MinBWs.end()) {
9689 Type *DstTy = Root.Scalars.front()->getType();
9690 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
9691 if (OriginalSz != BWIt->second.first) {
9692 unsigned Opcode = Instruction::Trunc;
9693 if (OriginalSz < BWIt->second.first)
9694 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
9695 Type *SrcTy = IntegerType::get(DstTy->getContext(), BWIt->second.first);
9696 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
9697 TTI::CastContextHint::None,
9698 TTI::TCK_RecipThroughput);
9699 }
9700 }
9701 }
9702
9703 InstructionCost SpillCost = getSpillCost();
9704 Cost += SpillCost + ExtractCost;
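// ResizeToVF accounts for the extra shuffle that is needed when a tree
// entry's vector factor does not match the width of the external
// insertelement mask; the returned flag reports whether such a resizing
// shuffle was costed.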
9705 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
9706 bool) {
9707 InstructionCost C = 0;
9708 unsigned VF = Mask.size();
9709 unsigned VecVF = TE->getVectorFactor();
9710 if (VF != VecVF &&
9711 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
9712 !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
9713 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
9714 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
9715 OrigMask.begin());
9716 C = TTI->getShuffleCost(
9718 FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
9719 LLVM_DEBUG(
9720 dbgs() << "SLP: Adding cost " << C
9721 << " for final shuffle of insertelement external users.\n";
9722 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
9723 Cost += C;
9724 return std::make_pair(TE, true);
9725 }
9726 return std::make_pair(TE, false);
9727 };
9728 // Calculate the cost of the reshuffled vectors, if any.
9729 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
9730 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
9731 auto Vector = ShuffleMasks[I].takeVector();
9732 unsigned VF = 0;
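// Estimates the cost of the final shuffles that combine one or two
// vectorized tree entries into the external insertelement buildvector; it is
// passed as the Action callback to performExtractsShuffleAction below.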
9733 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
9734 ArrayRef<const TreeEntry *> TEs) {
9735 assert((TEs.size() == 1 || TEs.size() == 2) &&
9736 "Expected exactly 1 or 2 tree entries.");
9737 if (TEs.size() == 1) {
9738 if (VF == 0)
9739 VF = TEs.front()->getVectorFactor();
9740 auto *FTy =
9741 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
9742 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
9743 !all_of(enumerate(Mask), [=](const auto &Data) {
9744 return Data.value() == PoisonMaskElem ||
9745 (Data.index() < VF &&
9746 static_cast<int>(Data.index()) == Data.value());
9747 })) {
9750 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
9751 << " for final shuffle of insertelement "
9752 "external users.\n";
9753 TEs.front()->dump();
9754 dbgs() << "SLP: Current total cost = " << Cost << "\n");
9755 Cost += C;
9756 }
9757 } else {
9758 if (VF == 0) {
9759 if (TEs.front() &&
9760 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
9761 VF = TEs.front()->getVectorFactor();
9762 else
9763 VF = Mask.size();
9764 }
9765 auto *FTy =
9766 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
9769 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
9770 << " for final shuffle of vector node and external "
9771 "insertelement users.\n";
9772 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
9773 dbgs() << "SLP: Current total cost = " << Cost << "\n");
9774 Cost += C;
9775 }
9776 VF = Mask.size();
9777 return TEs.back();
9778 };
9779 (void)performExtractsShuffleAction<const TreeEntry>(
9780 MutableArrayRef(Vector.data(), Vector.size()), Base,
9781 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
9782 EstimateShufflesCost);
9783 InstructionCost InsertCost = TTI->getScalarizationOverhead(
9784 cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
9785 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
9786 Cost -= InsertCost;
9787 }
9788
9789#ifndef NDEBUG
9790 SmallString<256> Str;
9791 {
9792 raw_svector_ostream OS(Str);
9793 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
9794 << "SLP: Extract Cost = " << ExtractCost << ".\n"
9795 << "SLP: Total Cost = " << Cost << ".\n";
9796 }
9797 LLVM_DEBUG(dbgs() << Str);
9798 if (ViewSLPTree)
9799 ViewGraph(this, "SLP" + F->getName(), false, Str);
9800#endif
9801
9802 return Cost;
9803}
9804
9805/// Tries to find extractelement instructions with constant indices from a fixed
9806/// vector type and gathers such instructions into a bunch, which is highly
9807/// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
9808/// was successful, the matched scalars are replaced by poison values in \p VL
9809/// for future analysis.
9810std::optional<TTI::ShuffleKind>
9811BoUpSLP::tryToGatherSingleRegisterExtractElements(
9812 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
9813 // Scan list of gathered scalars for extractelements that can be represented
9814 // as shuffles.
9816 SmallVector<int> UndefVectorExtracts;
9817 for (int I = 0, E = VL.size(); I < E; ++I) {
9818 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
9819 if (!EI) {
9820 if (isa<UndefValue>(VL[I]))
9821 UndefVectorExtracts.push_back(I);
9822 continue;
9823 }
9824 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
9825 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
9826 continue;
9827 std::optional<unsigned> Idx = getExtractIndex(EI);
9828 // Undefined index.
9829 if (!Idx) {
9830 UndefVectorExtracts.push_back(I);
9831 continue;
9832 }
9833 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
9834 ExtractMask.reset(*Idx);
9835 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
9836 UndefVectorExtracts.push_back(I);
9837 continue;
9838 }
9839 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
9840 }
9841 // Sort the vector operands by the maximum number of uses in extractelements.
9843 for (const auto &Data : VectorOpToIdx)
9844 VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
9845 .push_back(Data.first);
9846 for (auto &Data : VFToVector) {
9847 stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
9848 return VectorOpToIdx.find(V1)->second.size() >
9849 VectorOpToIdx.find(V2)->second.size();
9850 });
9851 }
9852 // Find the best pair of the vectors with the same number of elements or a
9853 // single vector.
9854 const int UndefSz = UndefVectorExtracts.size();
9855 unsigned SingleMax = 0;
9856 Value *SingleVec = nullptr;
9857 unsigned PairMax = 0;
9858 std::pair<Value *, Value *> PairVec(nullptr, nullptr);
9859 for (auto &Data : VFToVector) {
9860 Value *V1 = Data.second.front();
9861 if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
9862 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
9863 SingleVec = V1;
9864 }
9865 Value *V2 = nullptr;
9866 if (Data.second.size() > 1)
9867 V2 = *std::next(Data.second.begin());
9868 if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
9869 UndefSz) {
9870 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
9871 PairVec = std::make_pair(V1, V2);
9872 }
9873 }
9874 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
9875 return std::nullopt;
9876 // Check if it is better to perform a shuffle of 2 vectors or just of a
9877 // single vector.
9878 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
9879 SmallVector<Value *> GatheredExtracts(
9880 VL.size(), PoisonValue::get(VL.front()->getType()));
9881 if (SingleMax >= PairMax && SingleMax) {
9882 for (int Idx : VectorOpToIdx[SingleVec])
9883 std::swap(GatheredExtracts[Idx], VL[Idx]);
9884 } else {
9885 for (Value *V : {PairVec.first, PairVec.second})
9886 for (int Idx : VectorOpToIdx[V])
9887 std::swap(GatheredExtracts[Idx], VL[Idx]);
9888 }
9889 // Add extracts from undefs too.
9890 for (int Idx : UndefVectorExtracts)
9891 std::swap(GatheredExtracts[Idx], VL[Idx]);
9892 // Check that the gather of extractelements can be represented as just a
9893 // shuffle of one or two vectors from which the scalars are extracted.
9894 std::optional<TTI::ShuffleKind> Res =
9895 isFixedVectorShuffle(GatheredExtracts, Mask);
9896 if (!Res) {
9897 // TODO: try to check other subsets if possible.
9898 // Restore the original VL if attempt was not successful.
9899 copy(SavedVL, VL.begin());
9900 return std::nullopt;
9901 }
9902 // Restore unused scalars from mask, if some of the extractelements were not
9903 // selected for shuffle.
9904 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
9905 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
9906 isa<UndefValue>(GatheredExtracts[I])) {
9907 std::swap(VL[I], GatheredExtracts[I]);
9908 continue;
9909 }
9910 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
9911 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
9912 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
9913 is_contained(UndefVectorExtracts, I))
9914 continue;
9915 }
9916 return Res;
9917}
9918
9919/// Tries to find extractelement instructions with constant indices from a fixed
9920/// vector type and gathers such instructions into a bunch, which is highly
9921/// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
9922/// was successful, the matched scalars are replaced by poison values in \p VL
9923/// for future analysis.
9924SmallVector<std::optional<TTI::ShuffleKind>>
9925BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
9926 SmallVectorImpl<int> &Mask,
9927 unsigned NumParts) const {
9928 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
9929 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
9930 Mask.assign(VL.size(), PoisonMaskElem);
9931 unsigned SliceSize = VL.size() / NumParts;
9932 for (unsigned Part = 0; Part < NumParts; ++Part) {
9933 // Scan list of gathered scalars for extractelements that can be represented
9934 // as shuffles.
9935 MutableArrayRef<Value *> SubVL =
9936 MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
9937 SmallVector<int> SubMask;
9938 std::optional<TTI::ShuffleKind> Res =
9939 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
9940 ShufflesRes[Part] = Res;
9941 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
9942 }
9943 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
9944 return Res.has_value();
9945 }))
9946 ShufflesRes.clear();
9947 return ShufflesRes;
9948}
9949
9950std::optional<TargetTransformInfo::ShuffleKind>
9951BoUpSLP::isGatherShuffledSingleRegisterEntry(
9952 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
9953 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
9954 Entries.clear();
9955 // TODO: currently checking only for Scalars in the tree entry, need to count
9956 // reused elements too for better cost estimation.
9957 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
9958 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
9959 const BasicBlock *TEInsertBlock = nullptr;
9960 // Main node of PHI entries keeps the correct order of operands/incoming
9961 // blocks.
9962 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
9963 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
9964 TEInsertPt = TEInsertBlock->getTerminator();
9965 } else {
9966 TEInsertBlock = TEInsertPt->getParent();
9967 }
9968 if (!DT->isReachableFromEntry(TEInsertBlock))
9969 return std::nullopt;
9970 auto *NodeUI = DT->getNode(TEInsertBlock);
9971 assert(NodeUI && "Should only process reachable instructions");
9972 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
9973 auto CheckOrdering = [&](const Instruction *InsertPt) {
9974 // Argument InsertPt is an instruction where vector code for some other
9975 // tree entry (one that shares one or more scalars with TE) is going to be
9976 // generated. This lambda returns true if insertion point of vector code
9977 // for the TE dominates that point (otherwise dependency is the other way
9978 // around). The other node is not limited to be of a gather kind. Gather
9979 // nodes are not scheduled and their vector code is inserted before their
9980 // first user. If user is PHI, that is supposed to be at the end of a
9981 // predecessor block. Otherwise it is the last instruction among scalars of
9982 // the user node. So, instead of checking dependency between instructions
9983 // themselves, we check dependency between their insertion points for vector
9984 // code (since each scalar instruction ends up as a lane of a vector
9985 // instruction).
9986 const BasicBlock *InsertBlock = InsertPt->getParent();
9987 auto *NodeEUI = DT->getNode(InsertBlock);
9988 if (!NodeEUI)
9989 return false;
9990 assert((NodeUI == NodeEUI) ==
9991 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
9992 "Different nodes should have different DFS numbers");
9993 // Check the order of the gather nodes users.
9994 if (TEInsertPt->getParent() != InsertBlock &&
9995 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
9996 return false;
9997 if (TEInsertPt->getParent() == InsertBlock &&
9998 TEInsertPt->comesBefore(InsertPt))
9999 return false;
10000 return true;
10001 };
10002 // Find all tree entries used by the gathered values. If no common entries
10003 // found - not a shuffle.
10004 // Here we build a set of tree nodes for each gathered value and try to
10005 // find the intersection between these sets. If we have at least one common
10006 // tree node for each gathered value - we have just a permutation of a
10007 // single vector. If we have 2 different sets, we're in a situation where we
10008 // have a permutation of 2 input vectors.
10009 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10010 DenseMap<Value *, int> UsedValuesEntry;
10011 for (Value *V : VL) {
10012 if (isConstant(V))
10013 continue;
10014 // Build a list of tree entries where V is used.
10015 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10016 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10017 if (TEPtr == TE)
10018 continue;
10019 assert(any_of(TEPtr->Scalars,
10020 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10021 "Must contain at least single gathered value.");
10022 assert(TEPtr->UserTreeIndices.size() == 1 &&
10023 "Expected only single user of a gather node.");
10024 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10025
10026 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10027 const Instruction *InsertPt =
10028 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
10029 : &getLastInstructionInBundle(UseEI.UserTE);
10030 if (TEInsertPt == InsertPt) {
10031 // If 2 gathers are operands of the same entry (regardless of whether
10032 // user is PHI or else), compare operands indices, use the earlier one
10033 // as the base.
10034 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10035 continue;
10036 // If the user instruction is used for some reason in different
10037 // vectorized nodes - make it depend on index.
10038 if (TEUseEI.UserTE != UseEI.UserTE &&
10039 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10040 continue;
10041 }
10042
10043 // Check if the user node of the TE comes after user node of TEPtr,
10044 // otherwise TEPtr depends on TE.
10045 if ((TEInsertBlock != InsertPt->getParent() ||
10046 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10047 !CheckOrdering(InsertPt))
10048 continue;
10049 VToTEs.insert(TEPtr);
10050 }
10051 if (const TreeEntry *VTE = getTreeEntry(V)) {
10052 if (ForOrder) {
10053 if (VTE->State != TreeEntry::Vectorize) {
10054 auto It = MultiNodeScalars.find(V);
10055 if (It == MultiNodeScalars.end())
10056 continue;
10057 VTE = *It->getSecond().begin();
10058 // Iterate through all vectorized nodes.
10059 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
10060 return MTE->State == TreeEntry::Vectorize;
10061 });
10062 if (MIt == It->getSecond().end())
10063 continue;
10064 VTE = *MIt;
10065 }
10066 }
10067 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10068 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10069 continue;
10070 auto It = MinBWs.find(VTE);
10071 // If vectorize node is demoted - do not match.
10072 if (It != MinBWs.end() &&
10073 It->second.first != DL->getTypeSizeInBits(V->getType()))
10074 continue;
10075 VToTEs.insert(VTE);
10076 }
10077 if (VToTEs.empty())
10078 continue;
10079 if (UsedTEs.empty()) {
10080 // The first iteration, just insert the list of nodes to vector.
10081 UsedTEs.push_back(VToTEs);
10082 UsedValuesEntry.try_emplace(V, 0);
10083 } else {
10084 // Need to check if there are any previously used tree nodes which use V.
10085 // If there are no such nodes, consider that we have another one input
10086 // vector.
10087 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
10088 unsigned Idx = 0;
10089 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
10090 // Do we have a non-empty intersection of previously listed tree entries
10091 // and tree entries using current V?
10092 set_intersect(VToTEs, Set);
10093 if (!VToTEs.empty()) {
10094 // Yes, write the new subset and continue analysis for the next
10095 // scalar.
10096 Set.swap(VToTEs);
10097 break;
10098 }
10099 VToTEs = SavedVToTEs;
10100 ++Idx;
10101 }
10102 // No non-empty intersection found - need to add a second set of possible
10103 // source vectors.
10104 if (Idx == UsedTEs.size()) {
10105 // If the number of input vectors is greater than 2 - not a permutation,
10106 // fallback to the regular gather.
10107 // TODO: support multiple reshuffled nodes.
10108 if (UsedTEs.size() == 2)
10109 continue;
10110 UsedTEs.push_back(SavedVToTEs);
10111 Idx = UsedTEs.size() - 1;
10112 }
10113 UsedValuesEntry.try_emplace(V, Idx);
10114 }
10115 }
10116
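// At this point UsedTEs holds at most 2 candidate sets of tree entries and
// UsedValuesEntry maps each non-constant gathered scalar to the set it can
// be taken from.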
10117 if (UsedTEs.empty()) {
10118 Entries.clear();
10119 return std::nullopt;
10120 }
10121
10122 unsigned VF = 0;
10123 if (UsedTEs.size() == 1) {
10124 // Keep the order to avoid non-determinism.
10125 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
10126 UsedTEs.front().end());
10127 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10128 return TE1->Idx < TE2->Idx;
10129 });
10130 // Try to find the perfect match in another gather node at first.
10131 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
10132 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
10133 });
10134 if (It != FirstEntries.end() &&
10135 ((*It)->getVectorFactor() == VL.size() ||
10136 ((*It)->getVectorFactor() == TE->Scalars.size() &&
10137 TE->ReuseShuffleIndices.size() == VL.size() &&
10138 (*It)->isSame(TE->Scalars)))) {
10139 Entries.push_back(*It);
10140 if ((*It)->getVectorFactor() == VL.size()) {
10141 std::iota(std::next(Mask.begin(), Part * VL.size()),
10142 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
10143 } else {
10144 SmallVector<int> CommonMask = TE->getCommonMask();
10145 copy(CommonMask, Mask.begin());
10146 }
10147 // Clear undef scalars.
10148 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10149 if (isa<PoisonValue>(VL[I]))
10152 }
10153 // No perfect match, just shuffle, so choose the first tree node from the
10154 // tree.
10155 Entries.push_back(FirstEntries.front());
10156 } else {
10157 // Try to find nodes with the same vector factor.
10158 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
10159 // Keep the order of tree nodes to avoid non-determinism.
10160 DenseMap<unsigned, const TreeEntry *> VFToTE;
10161 for (const TreeEntry *TE : UsedTEs.front()) {
10162 unsigned VF = TE->getVectorFactor();
10163 auto It = VFToTE.find(VF);
10164 if (It != VFToTE.end()) {
10165 if (It->second->Idx > TE->Idx)
10166 It->getSecond() = TE;
10167 continue;
10168 }
10169 VFToTE.try_emplace(VF, TE);
10170 }
10171 // Same, keep the order to avoid non-determinism.
10172 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
10173 UsedTEs.back().end());
10174 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10175 return TE1->Idx < TE2->Idx;
10176 });
10177 for (const TreeEntry *TE : SecondEntries) {
10178 auto It = VFToTE.find(TE->getVectorFactor());
10179 if (It != VFToTE.end()) {
10180 VF = It->first;
10181 Entries.push_back(It->second);
10182 Entries.push_back(TE);
10183 break;
10184 }
10185 }
10186 // No 2 source vectors with the same vector factor - just choose 2 with max
10187 // index.
10188 if (Entries.empty()) {
10189 Entries.push_back(*llvm::max_element(
10190 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
10191 return TE1->Idx < TE2->Idx;
10192 }));
10193 Entries.push_back(SecondEntries.front());
10194 VF = std::max(Entries.front()->getVectorFactor(),
10195 Entries.back()->getVectorFactor());
10196 }
10197 }
10198
10199 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, UndefValue::classof);
10200 // Checks if the 2 PHIs are compatible in terms of high possibility to be
10201 // vectorized.
10202 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
10203 auto *PHI = cast<PHINode>(V);
10204 auto *PHI1 = cast<PHINode>(V1);
10205 // Check that all incoming values are compatible/from same parent (if they
10206 // are instructions).
10207 // The incoming values are compatible if they all are constants, or
10208 // instruction with the same/alternate opcodes from the same basic block.
10209 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
10210 Value *In = PHI->getIncomingValue(I);
10211 Value *In1 = PHI1->getIncomingValue(I);
10212 if (isConstant(In) && isConstant(In1))
10213 continue;
10214 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
10215 return false;
10216 if (cast<Instruction>(In)->getParent() !=
10217 cast<Instruction>(In1)->getParent())
10218 return false;
10219 }
10220 return true;
10221 };
10222 // Check if the value can be ignored during analysis for shuffled gathers.
10223 // We suppose it is better to ignore instructions which do not form splats,
10224 // are not vectorized/not extractelements (these instructions will be handled
10225 // by extractelements processing) or may form a vector node in the future.
10226 auto MightBeIgnored = [=](Value *V) {
10227 auto *I = dyn_cast<Instruction>(V);
10228 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
10230 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
10231 };
10232 // Check that the neighbor instruction may form a full vector node with the
10233 // current instruction V. It is possible, if they have same/alternate opcode
10234 // and same parent basic block.
10235 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
10236 Value *V1 = VL[Idx];
10237 bool UsedInSameVTE = false;
10238 auto It = UsedValuesEntry.find(V1);
10239 if (It != UsedValuesEntry.end())
10240 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
10241 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
10242 getSameOpcode({V, V1}, *TLI).getOpcode() &&
10243 cast<Instruction>(V)->getParent() ==
10244 cast<Instruction>(V1)->getParent() &&
10245 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
10246 };
10247 // Build a shuffle mask for better cost estimation and vector emission.
10248 SmallBitVector UsedIdxs(Entries.size());
10249 SmallVector<std::pair<unsigned, int>> EntryLanes;
10250 for (int I = 0, E = VL.size(); I < E; ++I) {
10251 Value *V = VL[I];
10252 auto It = UsedValuesEntry.find(V);
10253 if (It == UsedValuesEntry.end())
10254 continue;
10255 // Do not try to shuffle scalars, if they are constants, or instructions
10256 // that can be vectorized as a result of the following vector build
10257 // vectorization.
10258 if (isConstant(V) || (MightBeIgnored(V) &&
10259 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
10260 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
10261 continue;
10262 unsigned Idx = It->second;
10263 EntryLanes.emplace_back(Idx, I);
10264 UsedIdxs.set(Idx);
10265 }
10266 // Iterate through all shuffled scalars and select entries, which can be used
10267 // for final shuffle.
10268 SmallVector<const TreeEntry *> TempEntries;
10269 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
10270 if (!UsedIdxs.test(I))
10271 continue;
10272 // Fix the entry number for the given scalar. If it is the first entry, set
10273 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
10274 // These indices are used when calculating final shuffle mask as the vector
10275 // offset.
10276 for (std::pair<unsigned, int> &Pair : EntryLanes)
10277 if (Pair.first == I)
10278 Pair.first = TempEntries.size();
10279 TempEntries.push_back(Entries[I]);
10280 }
10281 Entries.swap(TempEntries);
10282 if (EntryLanes.size() == Entries.size() &&
10283 !VL.equals(ArrayRef(TE->Scalars)
10284 .slice(Part * VL.size(),
10285 std::min<int>(VL.size(), TE->Scalars.size())))) {
10286 // We may have here 1 or 2 entries only. If the number of scalars is equal
10287 // to the number of entries, no need to do the analysis, it is not very
10288 // profitable. Since VL is not the same as TE->Scalars, it means we already
10289 // have some shuffles before. Cut off not profitable case.
10290 Entries.clear();
10291 return std::nullopt;
10292 }
10293 // Build the final mask, check for the identity shuffle, if possible.
10294 bool IsIdentity = Entries.size() == 1;
10295 // Pair.first is the offset to the vector, while Pair.second is the index of
10296 // scalar in the list.
10297 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
10298 unsigned Idx = Part * VL.size() + Pair.second;
10299 Mask[Idx] =
10300 Pair.first * VF +
10301 (ForOrder ? std::distance(
10302 Entries[Pair.first]->Scalars.begin(),
10303 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
10304 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
10305 IsIdentity &= Mask[Idx] == Pair.second;
10306 }
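// Accept the shuffle only if it is cheap enough: a single source must be an
// identity or supply more than one lane, two sources must supply more than
// two lanes (tiny scalar lists are always accepted); otherwise clear the
// state and fall back to a plain gather.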
10307 switch (Entries.size()) {
10308 case 1:
10309 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
10311 break;
10312 case 2:
10313 if (EntryLanes.size() > 2 || VL.size() <= 2)
10314 return TargetTransformInfo::SK_PermuteTwoSrc;
10315 break;
10316 default:
10317 break;
10318 }
10319 Entries.clear();
10320 // Clear the corresponding mask elements.
10321 std::fill(std::next(Mask.begin(), Part * VL.size()),
10322 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
10323 return std::nullopt;
10324}
10325
10326SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
10327 BoUpSLP::isGatherShuffledEntry(
10328 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
10329 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
10330 bool ForOrder) {
10331 assert(NumParts > 0 && NumParts < VL.size() &&
10332 "Expected positive number of registers.");
10333 Entries.clear();
10334 // No need to check for the topmost gather node.
10335 if (TE == VectorizableTree.front().get())
10336 return {};
10337 Mask.assign(VL.size(), PoisonMaskElem);
10338 assert(TE->UserTreeIndices.size() == 1 &&
10339 "Expected only single user of the gather node.");
10340 assert(VL.size() % NumParts == 0 &&
10341 "Number of scalars must be divisible by NumParts.");
10342 unsigned SliceSize = VL.size() / NumParts;
10343 SmallVector<std::optional<TTI::ShuffleKind>> Res;
10344 for (unsigned Part = 0; Part < NumParts; ++Part) {
10345 ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
10346 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
10347 std::optional<TTI::ShuffleKind> SubRes =
10348 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
10349 ForOrder);
10350 if (!SubRes)
10351 SubEntries.clear();
10352 Res.push_back(SubRes);
10353 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
10354 SubEntries.front()->getVectorFactor() == VL.size() &&
10355 (SubEntries.front()->isSame(TE->Scalars) ||
10356 SubEntries.front()->isSame(VL))) {
10357 SmallVector<const TreeEntry *> LocalSubEntries;
10358 LocalSubEntries.swap(SubEntries);
10359 Entries.clear();
10360 Res.clear();
10361 std::iota(Mask.begin(), Mask.end(), 0);
10362 // Clear undef scalars.
10363 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10364 if (isa<PoisonValue>(VL[I]))
10365 Mask[I] = PoisonMaskElem;
10366 Entries.emplace_back(1, LocalSubEntries.front());
10367 Res.push_back(TTI::SK_PermuteSingleSrc);
10368 return Res;
10369 }
10370 }
10371 if (all_of(Res,
10372 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
10373 Entries.clear();
10374 return {};
10375 }
10376 return Res;
10377}
10378
10379InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
10380 bool ForPoisonSrc) const {
10381 // Find the type of the operands in VL.
10382 Type *ScalarTy = VL[0]->getType();
10383 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
10384 ScalarTy = SI->getValueOperand()->getType();
10385 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
10386 bool DuplicateNonConst = false;
10387 // Find the cost of inserting/extracting values from the vector.
10388 // Check if the same elements are inserted several times and count them as
10389 // shuffle candidates.
10390 APInt ShuffledElements = APInt::getZero(VL.size());
10391 DenseMap<Value *, unsigned> UniqueElements;
10392 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10393 InstructionCost Cost;
10394 auto EstimateInsertCost = [&](unsigned I, Value *V) {
10395 if (!ForPoisonSrc)
10396 Cost +=
10397 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
10398 I, Constant::getNullValue(VecTy), V);
10399 };
10400 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10401 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
10402 Value *V = VL[I];
10403 // No need to shuffle duplicates for constants.
10404 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
10405 ShuffledElements.setBit(I);
10406 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
10407 continue;
10408 }
10409
10410 auto Res = UniqueElements.try_emplace(V, I);
10411 if (Res.second) {
10412 EstimateInsertCost(I, V);
10413 ShuffleMask[I] = I;
10414 continue;
10415 }
10416
10417 DuplicateNonConst = true;
10418 ShuffledElements.setBit(I);
10419 ShuffleMask[I] = Res.first->second;
10420 }
10421 if (ForPoisonSrc)
10422 Cost =
10423 TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
10424 /*Extract*/ false, CostKind);
10425 if (DuplicateNonConst)
10427 VecTy, ShuffleMask);
10428 return Cost;
10429}
10430
10431// Perform operand reordering on the instructions in VL and return the reordered
10432// operands in Left and Right.
10433void BoUpSLP::reorderInputsAccordingToOpcode(
10434 ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
10435 SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
10436 const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) {
10437 if (VL.empty())
10438 return;
10439 VLOperands Ops(VL, TLI, DL, SE, R);
10440 // Reorder the operands in place.
10441 Ops.reorder();
10442 Left = Ops.getVL(0);
10443 Right = Ops.getVL(1);
10444}
10445
10446Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
10447 auto &Res = EntryToLastInstruction.FindAndConstruct(E);
10448 if (Res.second)
10449 return *Res.second;
10450 // Get the basic block this bundle is in. All instructions in the bundle
10451 // should be in this block (except for extractelement-like instructions with
10452 // constant indices).
10453 auto *Front = E->getMainOp();
10454 auto *BB = Front->getParent();
10455 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
10456 if (E->getOpcode() == Instruction::GetElementPtr &&
10457 !isa<GetElementPtrInst>(V))
10458 return true;
10459 auto *I = cast<Instruction>(V);
10460 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
10461 isVectorLikeInstWithConstOps(I);
10462 }));
10463
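// The two helpers below pick the last/first instruction of the bundle in
// program order, comparing instructions from different blocks via their
// dominator-tree DFS numbers.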
10464 auto FindLastInst = [&]() {
10465 Instruction *LastInst = Front;
10466 for (Value *V : E->Scalars) {
10467 auto *I = dyn_cast<Instruction>(V);
10468 if (!I)
10469 continue;
10470 if (LastInst->getParent() == I->getParent()) {
10471 if (LastInst->comesBefore(I))
10472 LastInst = I;
10473 continue;
10474 }
10475 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10476 !isa<GetElementPtrInst>(I)) ||
10477 (isVectorLikeInstWithConstOps(LastInst) &&
10478 isVectorLikeInstWithConstOps(I))) &&
10479 "Expected vector-like or non-GEP in GEP node insts only.");
10480 if (!DT->isReachableFromEntry(LastInst->getParent())) {
10481 LastInst = I;
10482 continue;
10483 }
10484 if (!DT->isReachableFromEntry(I->getParent()))
10485 continue;
10486 auto *NodeA = DT->getNode(LastInst->getParent());
10487 auto *NodeB = DT->getNode(I->getParent());
10488 assert(NodeA && "Should only process reachable instructions");
10489 assert(NodeB && "Should only process reachable instructions");
10490 assert((NodeA == NodeB) ==
10491 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10492 "Different nodes should have different DFS numbers");
10493 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
10494 LastInst = I;
10495 }
10496 BB = LastInst->getParent();
10497 return LastInst;
10498 };
10499
10500 auto FindFirstInst = [&]() {
10501 Instruction *FirstInst = Front;
10502 for (Value *V : E->Scalars) {
10503 auto *I = dyn_cast<Instruction>(V);
10504 if (!I)
10505 continue;
10506 if (FirstInst->getParent() == I->getParent()) {
10507 if (I->comesBefore(FirstInst))
10508 FirstInst = I;
10509 continue;
10510 }
10511 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10512 !isa<GetElementPtrInst>(I)) ||
10513 (isVectorLikeInstWithConstOps(FirstInst) &&
10514 isVectorLikeInstWithConstOps(I))) &&
10515 "Expected vector-like or non-GEP in GEP node insts only.");
10516 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
10517 FirstInst = I;
10518 continue;
10519 }
10520 if (!DT->isReachableFromEntry(I->getParent()))
10521 continue;
10522 auto *NodeA = DT->getNode(FirstInst->getParent());
10523 auto *NodeB = DT->getNode(I->getParent());
10524 assert(NodeA && "Should only process reachable instructions");
10525 assert(NodeB && "Should only process reachable instructions");
10526 assert((NodeA == NodeB) ==
10527 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10528 "Different nodes should have different DFS numbers");
10529 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
10530 FirstInst = I;
10531 }
10532 return FirstInst;
10533 };
10534
10535 // Set the insert point to the beginning of the basic block if the entry
10536 // should not be scheduled.
10537 if (doesNotNeedToSchedule(E->Scalars) ||
10538 (E->State != TreeEntry::NeedToGather &&
10539 all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
10540 if ((E->getOpcode() == Instruction::GetElementPtr &&
10541 any_of(E->Scalars,
10542 [](Value *V) {
10543 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
10544 })) ||
10545 all_of(E->Scalars, [](Value *V) {
10546 return !isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V);
10547 }))
10548 Res.second = FindLastInst();
10549 else
10550 Res.second = FindFirstInst();
10551 return *Res.second;
10552 }
10553
10554 // Find the last instruction. The common case should be that BB has been
10555 // scheduled, and the last instruction is VL.back(). So we start with
10556 // VL.back() and iterate over schedule data until we reach the end of the
10557 // bundle. The end of the bundle is marked by null ScheduleData.
10558 if (BlocksSchedules.count(BB)) {
10559 Value *V = E->isOneOf(E->Scalars.back());
10560 if (doesNotNeedToBeScheduled(V))
10561 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
10562 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
10563 if (Bundle && Bundle->isPartOfBundle())
10564 for (; Bundle; Bundle = Bundle->NextInBundle)
10565 if (Bundle->OpValue == Bundle->Inst)
10566 Res.second = Bundle->Inst;
10567 }
10568
10569 // LastInst can still be null at this point if there's either not an entry
10570 // for BB in BlocksSchedules or there's no ScheduleData available for
10571 // VL.back(). This can be the case if buildTree_rec aborts for various
10572 // reasons (e.g., the maximum recursion depth is reached, the maximum region
10573 // size is reached, etc.). ScheduleData is initialized in the scheduling
10574 // "dry-run".
10575 //
10576 // If this happens, we can still find the last instruction by brute force. We
10577 // iterate forwards from Front (inclusive) until we either see all
10578 // instructions in the bundle or reach the end of the block. If Front is the
10579 // last instruction in program order, LastInst will be set to Front, and we
10580 // will visit all the remaining instructions in the block.
10581 //
10582 // One of the reasons we exit early from buildTree_rec is to place an upper
10583 // bound on compile-time. Thus, taking an additional compile-time hit here is
10584 // not ideal. However, this should be exceedingly rare since it requires that
10585 // we both exit early from buildTree_rec and that the bundle be out-of-order
10586 // (causing us to iterate all the way to the end of the block).
10587 if (!Res.second)
10588 Res.second = FindLastInst();
10589 assert(Res.second && "Failed to find last instruction in bundle");
10590 return *Res.second;
10591}
10592
10593void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
10594 auto *Front = E->getMainOp();
10595 Instruction *LastInst = &getLastInstructionInBundle(E);
10596 assert(LastInst && "Failed to find last instruction in bundle");
10597 BasicBlock::iterator LastInstIt = LastInst->getIterator();
10598 // If the instruction is PHI, set the insert point after all the PHIs.
10599 bool IsPHI = isa<PHINode>(LastInst);
10600 if (IsPHI)
10601 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
10602 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
10603 doesNotNeedToSchedule(E->Scalars))) {
10604 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
10605 } else {
10606 // Set the insertion point after the last instruction in the bundle. Set the
10607 // debug location to Front.
10608 Builder.SetInsertPoint(
10609 LastInst->getParent(),
10610 LastInst->getNextNonDebugInstruction()->getIterator());
10611 }
10612 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
10613}
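// Note (illustrative, not from the original source): the insertion point is
// placed no earlier than the last scalar of the bundle, so the vector
// instruction that replaces the bundle is dominated by all of the values it
// needs, and extracts for external users can be emitted right after it.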
10614
10615Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
10616 // List of instructions/lanes from the current block and/or the blocks which
10617 // are part of the current loop. These instructions will be inserted at the end
10618 // to make it possible to optimize loops and hoist invariant instructions out of
10619 // the loop's body with better chances for success.
10620 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
10621 SmallSet<int, 4> PostponedIndices;
10622 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
10623 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
10624 SmallPtrSet<BasicBlock *, 4> Visited;
10625 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
10626 InsertBB = InsertBB->getSinglePredecessor();
10627 return InsertBB && InsertBB == InstBB;
10628 };
10629 for (int I = 0, E = VL.size(); I < E; ++I) {
10630 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
10631 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
10632 getTreeEntry(Inst) ||
10633 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
10634 PostponedIndices.insert(I).second)
10635 PostponedInsts.emplace_back(Inst, I);
10636 }
10637
10638 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos) {
10639 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(Pos));
10640 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
10641 if (!InsElt)
10642 return Vec;
10643 GatherShuffleExtractSeq.insert(InsElt);
10644 CSEBlocks.insert(InsElt->getParent());
10645 // Add to our 'need-to-extract' list.
10646 if (isa<Instruction>(V)) {
10647 if (TreeEntry *Entry = getTreeEntry(V)) {
10648 // Find which lane we need to extract.
10649 unsigned FoundLane = Entry->findLaneForValue(V);
10650 ExternalUses.emplace_back(V, InsElt, FoundLane);
10651 }
10652 }
10653 return Vec;
10654 };
10655 Value *Val0 =
10656 isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
10657 FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
10658 Value *Vec = Root ? Root : PoisonValue::get(VecTy);
10659 SmallVector<int> NonConsts;
10660 // Insert constant values at first.
10661 for (int I = 0, E = VL.size(); I < E; ++I) {
10662 if (PostponedIndices.contains(I))
10663 continue;
10664 if (!isConstant(VL[I])) {
10665 NonConsts.push_back(I);
10666 continue;
10667 }
10668 if (Root) {
10669 if (!isa<UndefValue>(VL[I])) {
10670 NonConsts.push_back(I);
10671 continue;
10672 }
10673 if (isa<PoisonValue>(VL[I]))
10674 continue;
10675 if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
10676 if (SV->getMaskValue(I) == PoisonMaskElem)
10677 continue;
10678 }
10679 }
10680 Vec = CreateInsertElement(Vec, VL[I], I);
10681 }
10682 // Insert non-constant values.
10683 for (int I : NonConsts)
10684 Vec = CreateInsertElement(Vec, VL[I], I);
10685 // Append instructions, which are/may be part of the loop, in the end to make
10686 // it possible to hoist non-loop-based instructions.
10687 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
10688 Vec = CreateInsertElement(Vec, Pair.first, Pair.second);
10689
10690 return Vec;
10691}
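// Illustrative example (not from the original source): gathering {%a, 1, %b}
// into a <4 x i32> with the last lane unused would emit roughly
//   %v0 = insertelement <4 x i32> poison, i32 1, i32 1
//   %v1 = insertelement <4 x i32> %v0, i32 %a, i32 0
//   %v2 = insertelement <4 x i32> %v1, i32 %b, i32 2
// with constants inserted first, then the remaining scalars, and values defined
// inside the current loop postponed to the very end of the sequence.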
10692
10693/// Merges shuffle masks and emits final shuffle instruction, if required. It
10694 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
10695 /// the actual shuffle instruction is generated only if this is really
10696 /// required. Otherwise, the shuffle instruction emission is delayed till the
10697 /// end of the process, to reduce the number of emitted instructions and enable
10698 /// further analysis/transformations.
10699 /// The class will also look through the previously emitted shuffle instructions
10700 /// and properly mark indices in the mask as undef.
10701/// For example, given the code
10702/// \code
10703/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
10704/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
10705/// \endcode
10706 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
10707/// look through %s1 and %s2 and emit
10708/// \code
10709/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
10710/// \endcode
10711/// instead.
10712/// If 2 operands are of different size, the smallest one will be resized and
10713/// the mask recalculated properly.
10714/// For example, given the code
10715/// \code
10716/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
10717/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
10718/// \endcode
10719 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
10720/// look through %s1 and %s2 and emit
10721/// \code
10722/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
10723/// \endcode
10724/// instead.
10725class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
10726 bool IsFinalized = false;
10727 /// Combined mask for all applied operands and masks. It is built during
10728 /// analysis and actual emission of shuffle vector instructions.
10729 SmallVector<int> CommonMask;
10730 /// List of operands for the shuffle vector instruction. It holds at most 2
10731 /// operands. If a 3rd one is going to be added, the first 2 are combined into
10732 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
10733 /// resulting shuffle and the second operand is set to be the newly added
10734 /// operand. The \p CommonMask is transformed in the proper way after that.
10735 SmallVector<Value *, 2> InVectors;
10736 IRBuilderBase &Builder;
10737 BoUpSLP &R;
10738
10739 class ShuffleIRBuilder {
10740 IRBuilderBase &Builder;
10741 /// Holds all of the instructions that we gathered.
10742 SetVector<Instruction *> &GatherShuffleExtractSeq;
10743 /// A list of blocks that we are going to CSE.
10744 DenseSet<BasicBlock *> &CSEBlocks;
10745
10746 public:
10747 ShuffleIRBuilder(IRBuilderBase &Builder,
10748 SetVector<Instruction *> &GatherShuffleExtractSeq,
10749 DenseSet<BasicBlock *> &CSEBlocks)
10750 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
10751 CSEBlocks(CSEBlocks) {}
10752 ~ShuffleIRBuilder() = default;
10753 /// Creates shufflevector for the 2 operands with the given mask.
10754 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
10755 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
10756 if (auto *I = dyn_cast<Instruction>(Vec)) {
10757 GatherShuffleExtractSeq.insert(I);
10758 CSEBlocks.insert(I->getParent());
10759 }
10760 return Vec;
10761 }
10762 /// Creates a permutation of the single vector operand with the given mask, if
10763 /// it is not an identity mask.
10764 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
10765 if (Mask.empty())
10766 return V1;
10767 unsigned VF = Mask.size();
10768 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
10769 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
10770 return V1;
10771 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
10772 if (auto *I = dyn_cast<Instruction>(Vec)) {
10773 GatherShuffleExtractSeq.insert(I);
10774 CSEBlocks.insert(I->getParent());
10775 }
10776 return Vec;
10777 }
10778 Value *createIdentity(Value *V) { return V; }
10779 Value *createPoison(Type *Ty, unsigned VF) {
10780 return PoisonValue::get(FixedVectorType::get(Ty, VF));
10781 }
10782 /// Resizes 2 input vectors to match their sizes, if they are not equal
10783 /// yet. The smallest vector is resized to the size of the larger vector.
10784 void resizeToMatch(Value *&V1, Value *&V2) {
10785 if (V1->getType() == V2->getType())
10786 return;
10787 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
10788 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
10789 int VF = std::max(V1VF, V2VF);
10790 int MinVF = std::min(V1VF, V2VF);
10791 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
10792 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
10793 0);
10794 Value *&Op = MinVF == V1VF ? V1 : V2;
10795 Op = Builder.CreateShuffleVector(Op, IdentityMask);
10796 if (auto *I = dyn_cast<Instruction>(Op)) {
10797 GatherShuffleExtractSeq.insert(I);
10798 CSEBlocks.insert(I->getParent());
10799 }
10800 if (MinVF == V1VF)
10801 V1 = Op;
10802 else
10803 V2 = Op;
10804 }
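    // Illustrative example (not from the original source): resizing a
    // <2 x i32> operand to match a <4 x i32> one emits
    //   %wide = shufflevector <2 x i32> %small, <2 x i32> poison,
    //                         <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
    // so that both operands can then be combined by a single two-source shuffle.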
10805 };
10806
10807 /// Smart shuffle instruction emission, walks through shuffle trees and
10808 /// tries to find the best matching vector for the actual shuffle
10809 /// instruction.
10810 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
10811 assert(V1 && "Expected at least one vector value.");
10812 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
10813 R.CSEBlocks);
10814 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
10815 ShuffleBuilder);
10816 }
10817
10818 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
10819 /// shuffle emission.
10820 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
10821 ArrayRef<int> Mask) {
10822 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10823 if (Mask[Idx] != PoisonMaskElem)
10824 CommonMask[Idx] = Idx;
10825 }
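  // Illustrative example (not from the original source): if CommonMask was
  // <1, 0, 3, 2> and the corresponding shuffle has just been emitted, the
  // elements already sit in their final lanes of the new vector, so CommonMask
  // is rewritten to the identity positions <0, 1, 2, 3>.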
10826
10827public:
10828 ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
10829 : Builder(Builder), R(R) {}
10830
10831 /// Adjusts extractelements after reusing them.
10832 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10833 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10834 unsigned NumParts, bool &UseVecBaseAsInput) {
10835 UseVecBaseAsInput = false;
10836 SmallPtrSet<Value *, 4> UniqueBases;
10837 Value *VecBase = nullptr;
10838 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
10839 int Idx = Mask[I];
10840 if (Idx == PoisonMaskElem)
10841 continue;
10842 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
10843 VecBase = EI->getVectorOperand();
10844 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
10845 VecBase = TE->VectorizedValue;
10846 assert(VecBase && "Expected vectorized value.");
10847 UniqueBases.insert(VecBase);
10848 // If the only use is vectorized - we can delete the extractelement
10849 // itself.
10850 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
10851 any_of(EI->users(), [&](User *U) {
10852 const TreeEntry *UTE = R.getTreeEntry(U);
10853 return !UTE || R.MultiNodeScalars.contains(U) ||
10854 count_if(R.VectorizableTree,
10855 [&](const std::unique_ptr<TreeEntry> &TE) {
10856 return any_of(TE->UserTreeIndices,
10857 [&](const EdgeInfo &Edge) {
10858 return Edge.UserTE == UTE;
10859 }) &&
10860 is_contained(TE->Scalars, EI);
10861 }) != 1;
10862 }))
10863 continue;
10864 R.eraseInstruction(EI);
10865 }
10866 if (NumParts == 1 || UniqueBases.size() == 1)
10867 return VecBase;
10868 UseVecBaseAsInput = true;
10869 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
10870 for (auto [I, Idx] : enumerate(Mask))
10871 if (Idx != PoisonMaskElem)
10872 Idx = I;
10873 };
10874 // Perform a multi-register vector shuffle, joining the parts into a single
10875 // virtual long vector.
10876 // Each part needs to be shuffled independently and then all these parts are
10877 // inserted into a long virtual vector register, forming the original vector.
10878 Value *Vec = nullptr;
10879 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
10880 unsigned SliceSize = E->Scalars.size() / NumParts;
10881 for (unsigned Part = 0; Part < NumParts; ++Part) {
10882 ArrayRef<Value *> VL =
10883 ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
10884 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
10885 constexpr int MaxBases = 2;
10886 SmallVector<Value *, MaxBases> Bases(MaxBases);
10887#ifndef NDEBUG
10888 int PrevSize = 0;
10889#endif // NDEBUG
10890 for (const auto [I, V]: enumerate(VL)) {
10891 if (SubMask[I] == PoisonMaskElem)
10892 continue;
10893 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
10894 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
10895 VecOp = TE->VectorizedValue;
10896 assert(VecOp && "Expected vectorized value.");
10897 const int Size =
10898 cast<FixedVectorType>(VecOp->getType())->getNumElements();
10899#ifndef NDEBUG
10900 assert((PrevSize == Size || PrevSize == 0) &&
10901 "Expected vectors of the same size.");
10902 PrevSize = Size;
10903#endif // NDEBUG
10904 Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
10905 }
10906 if (!Bases.front())
10907 continue;
10908 Value *SubVec;
10909 if (Bases.back()) {
10910 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
10911 TransformToIdentity(SubMask);
10912 } else {
10913 SubVec = Bases.front();
10914 }
10915 if (!Vec) {
10916 Vec = SubVec;
10917 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
10918 [&](unsigned P) {
10919 ArrayRef<int> SubMask =
10920 Mask.slice(P * SliceSize, SliceSize);
10921 return all_of(SubMask, [](int Idx) {
10922 return Idx == PoisonMaskElem;
10923 });
10924 })) &&
10925 "Expected first part or all previous parts masked.");
10926 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
10927 } else {
10928 unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
10929 if (Vec->getType() != SubVec->getType()) {
10930 unsigned SubVecVF =
10931 cast<FixedVectorType>(SubVec->getType())->getNumElements();
10932 VF = std::max(VF, SubVecVF);
10933 }
10934 // Adjust SubMask.
10935 for (auto [I, Idx] : enumerate(SubMask))
10936 if (Idx != PoisonMaskElem)
10937 Idx += VF;
10938 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
10939 Vec = createShuffle(Vec, SubVec, VecMask);
10940 TransformToIdentity(VecMask);
10941 }
10942 }
10943 copy(VecMask, Mask.begin());
10944 return Vec;
10945 }
10946 /// Checks if the specified entry \p E needs to be delayed because of its
10947 /// dependency nodes.
10948 std::optional<Value *>
10949 needToDelay(const TreeEntry *E,
10950 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
10951 // No need to delay emission if all deps are ready.
10952 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
10953 return all_of(
10954 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
10955 }))
10956 return std::nullopt;
10957 // Postpone gather emission, will be emitted after the end of the
10958 // process to keep correct order.
10959 auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
10960 E->getVectorFactor());
10961 return Builder.CreateAlignedLoad(
10962 VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
10963 MaybeAlign());
10964 }
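  // Note (illustrative, not from the original source): the aligned load from a
  // poison pointer above is only a type-correct placeholder; the real gather is
  // re-emitted later, once the dependent entries have been vectorized, and the
  // placeholder value is replaced and erased at that point.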
10965 /// Adds 2 input vectors (in form of tree entries) and the mask for their
10966 /// shuffling.
10967 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
10968 add(E1.VectorizedValue, E2.VectorizedValue, Mask);
10969 }
10970 /// Adds single input vector (in form of tree entry) and the mask for its
10971 /// shuffling.
10972 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
10973 add(E1.VectorizedValue, Mask);
10974 }
10975 /// Adds 2 input vectors and the mask for their shuffling.
10976 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
10977 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
10978 if (InVectors.empty()) {
10979 InVectors.push_back(V1);
10980 InVectors.push_back(V2);
10981 CommonMask.assign(Mask.begin(), Mask.end());
10982 return;
10983 }
10984 Value *Vec = InVectors.front();
10985 if (InVectors.size() == 2) {
10986 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
10987 transformMaskAfterShuffle(CommonMask, CommonMask);
10988 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
10989 Mask.size()) {
10990 Vec = createShuffle(Vec, nullptr, CommonMask);
10991 transformMaskAfterShuffle(CommonMask, CommonMask);
10992 }
10993 V1 = createShuffle(V1, V2, Mask);
10994 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10995 if (Mask[Idx] != PoisonMaskElem)
10996 CommonMask[Idx] = Idx + Sz;
10997 InVectors.front() = Vec;
10998 if (InVectors.size() == 2)
10999 InVectors.back() = V1;
11000 else
11001 InVectors.push_back(V1);
11002 }
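  // Illustrative note (not from the original source): when two vectors are
  // already queued, adding another pair first folds the queued pair into a
  // single shuffle (CommonMask is rewritten to identity positions), then the
  // new pair is pre-shuffled as well and its lanes are recorded at an offset of
  // CommonMask.size(), so at most two operands are ever kept in InVectors.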
11003 /// Adds another input vector and the mask for the shuffling.
11004 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
11005 if (InVectors.empty()) {
11006 if (!isa<FixedVectorType>(V1->getType())) {
11007 V1 = createShuffle(V1, nullptr, CommonMask);
11008 CommonMask.assign(Mask.size(), PoisonMaskElem);
11009 transformMaskAfterShuffle(CommonMask, Mask);
11010 }
11011 InVectors.push_back(V1);
11012 CommonMask.assign(Mask.begin(), Mask.end());
11013 return;
11014 }
11015 const auto *It = find(InVectors, V1);
11016 if (It == InVectors.end()) {
11017 if (InVectors.size() == 2 ||
11018 InVectors.front()->getType() != V1->getType() ||
11019 !isa<FixedVectorType>(V1->getType())) {
11020 Value *V = InVectors.front();
11021 if (InVectors.size() == 2) {
11022 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11023 transformMaskAfterShuffle(CommonMask, CommonMask);
11024 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11025 CommonMask.size()) {
11026 V = createShuffle(InVectors.front(), nullptr, CommonMask);
11027 transformMaskAfterShuffle(CommonMask, CommonMask);
11028 }
11029 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11030 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
11031 CommonMask[Idx] =
11032 V->getType() != V1->getType()
11033 ? Idx + Sz
11034 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
11035 ->getNumElements();
11036 if (V->getType() != V1->getType())
11037 V1 = createShuffle(V1, nullptr, Mask);
11038 InVectors.front() = V;
11039 if (InVectors.size() == 2)
11040 InVectors.back() = V1;
11041 else
11042 InVectors.push_back(V1);
11043 return;
11044 }
11045 // Check if second vector is required if the used elements are already
11046 // used from the first one.
11047 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11048 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
11049 InVectors.push_back(V1);
11050 break;
11051 }
11052 }
11053 int VF = CommonMask.size();
11054 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
11055 VF = FTy->getNumElements();
11056 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11057 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
11058 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
11059 }
11060 /// Adds another input vector and the reorder indices for its shuffling.
11061 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
11062 SmallVector<int> NewMask;
11063 inversePermutation(Order, NewMask);
11064 add(V1, NewMask);
11065 }
11066 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
11067 Value *Root = nullptr) {
11068 return R.gather(VL, Root);
11069 }
11070 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
11071 /// Finalize emission of the shuffles.
11072 /// \param Action the action (if any) to be performed before final applying of
11073 /// the \p ExtMask mask.
11074 Value *
11075 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
11076 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
11077 IsFinalized = true;
11078 if (Action) {
11079 Value *Vec = InVectors.front();
11080 if (InVectors.size() == 2) {
11081 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11082 InVectors.pop_back();
11083 } else {
11084 Vec = createShuffle(Vec, nullptr, CommonMask);
11085 }
11086 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11087 if (CommonMask[Idx] != PoisonMaskElem)
11088 CommonMask[Idx] = Idx;
11089 assert(VF > 0 &&
11090 "Expected vector length for the final value before action.");
11091 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11092 if (VecVF < VF) {
11093 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
11094 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11095 Vec = createShuffle(Vec, nullptr, ResizeMask);
11096 }
11097 Action(Vec, CommonMask);
11098 InVectors.front() = Vec;
11099 }
11100 if (!ExtMask.empty()) {
11101 if (CommonMask.empty()) {
11102 CommonMask.assign(ExtMask.begin(), ExtMask.end());
11103 } else {
11104 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11105 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11106 if (ExtMask[I] == PoisonMaskElem)
11107 continue;
11108 NewMask[I] = CommonMask[ExtMask[I]];
11109 }
11110 CommonMask.swap(NewMask);
11111 }
11112 }
11113 if (CommonMask.empty()) {
11114 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11115 return InVectors.front();
11116 }
11117 if (InVectors.size() == 2)
11118 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11119 return createShuffle(InVectors.front(), nullptr, CommonMask);
11120 }
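  // Illustrative example (not from the original source): with an accumulated
  // CommonMask of <2, 3, 0, 1> and a reuse mask ExtMask of <0, 0, 1, 1>, the
  // final mask becomes <2, 2, 3, 3>, i.e. the reuse mask is folded into the
  // pending shuffle instead of being emitted as a separate instruction.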
11121
11122 ~ShuffleInstructionBuilder() {
11123 assert((IsFinalized || CommonMask.empty()) &&
11124 "Shuffle construction must be finalized.");
11125 }
11126};
11127
11128Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
11129 bool PostponedPHIs) {
11130 ValueList &VL = E->getOperand(NodeIdx);
11131 const unsigned VF = VL.size();
11132 InstructionsState S = getSameOpcode(VL, *TLI);
11133 // Special processing for GEPs bundle, which may include non-gep values.
11134 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11135 const auto *It =
11136 find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
11137 if (It != VL.end())
11138 S = getSameOpcode(*It, *TLI);
11139 }
11140 if (S.getOpcode()) {
11141 auto CheckSameVE = [&](const TreeEntry *VE) {
11142 return VE->isSame(VL) &&
11143 (any_of(VE->UserTreeIndices,
11144 [E, NodeIdx](const EdgeInfo &EI) {
11145 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11146 }) ||
11147 any_of(VectorizableTree,
11148 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
11149 return TE->isOperandGatherNode({E, NodeIdx}) &&
11150 VE->isSame(TE->Scalars);
11151 }));
11152 };
11153 TreeEntry *VE = getTreeEntry(S.OpValue);
11154 bool IsSameVE = VE && CheckSameVE(VE);
11155 if (!IsSameVE) {
11156 auto It = MultiNodeScalars.find(S.OpValue);
11157 if (It != MultiNodeScalars.end()) {
11158 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
11159 return TE != VE && CheckSameVE(TE);
11160 });
11161 if (I != It->getSecond().end()) {
11162 VE = *I;
11163 IsSameVE = true;
11164 }
11165 }
11166 }
11167 if (IsSameVE) {
11168 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
11169 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
11170 ShuffleBuilder.add(V, Mask);
11171 return ShuffleBuilder.finalize(std::nullopt);
11172 };
11173 Value *V = vectorizeTree(VE, PostponedPHIs);
11174 if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
11175 if (!VE->ReuseShuffleIndices.empty()) {
11176 // Reshuffle to get only unique values.
11177 // If some of the scalars are duplicated in the vectorization
11178 // tree entry, we do not vectorize them but instead generate a
11179 // mask for the reuses. But if there are several users of the
11180 // same entry, they may have different vectorization factors.
11181 // This is especially important for PHI nodes. In this case, we
11182 // need to adapt the resulting instruction for the user
11183 // vectorization factor and have to reshuffle it again to take
11184 // only unique elements of the vector. Without this code the
11185 // function incorrectly returns reduced vector instruction with
11186 // the same elements, not with the unique ones.
11187
11188 // block:
11189 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
11190 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
11191 // ... (use %2)
11192 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
11193 // br %block
11194 SmallVector<int> Mask(VF, PoisonMaskElem);
11195 for (auto [I, V] : enumerate(VL)) {
11196 if (isa<PoisonValue>(V))
11197 continue;
11198 Mask[I] = VE->findLaneForValue(V);
11199 }
11200 V = FinalShuffle(V, Mask);
11201 } else {
11202 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
11203 "Expected vectorization factor less "
11204 "than original vector size.");
11205 SmallVector<int> UniformMask(VF, 0);
11206 std::iota(UniformMask.begin(), UniformMask.end(), 0);
11207 V = FinalShuffle(V, UniformMask);
11208 }
11209 }
11210 // Need to update the operand gather node, if the operand actually is not a
11211 // vectorized node but a buildvector/gather node that matches one of the
11212 // vectorized nodes.
11213 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
11214 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11215 }) == VE->UserTreeIndices.end()) {
11216 auto *It = find_if(
11217 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11218 return TE->State == TreeEntry::NeedToGather &&
11219 TE->UserTreeIndices.front().UserTE == E &&
11220 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
11221 });
11222 assert(It != VectorizableTree.end() && "Expected gather node operand.");
11223 (*It)->VectorizedValue = V;
11224 }
11225 return V;
11226 }
11227 }
11228
11229 // Find the corresponding gather entry and vectorize it.
11230 // This allows us to be more accurate with tree/graph transformations and
11231 // checks the correctness of the transformations in many cases.
11232 auto *I = find_if(VectorizableTree,
11233 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
11234 return TE->isOperandGatherNode({E, NodeIdx});
11235 });
11236 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
11237 assert(I->get()->UserTreeIndices.size() == 1 &&
11238 "Expected only single user for the gather node.");
11239 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
11240 return vectorizeTree(I->get(), PostponedPHIs);
11241}
11242
11243template <typename BVTy, typename ResTy, typename... Args>
11244ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
11245 assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
11246 unsigned VF = E->getVectorFactor();
11247
11248 bool NeedFreeze = false;
11249 SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
11250 E->ReuseShuffleIndices.end());
11251 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
11252 // Build a mask out of the reorder indices and reorder scalars per this
11253 // mask.
11254 SmallVector<int> ReorderMask;
11255 inversePermutation(E->ReorderIndices, ReorderMask);
11256 if (!ReorderMask.empty())
11257 reorderScalars(GatheredScalars, ReorderMask);
11258 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
11259 unsigned I, unsigned SliceSize) {
11260 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
11261 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
11262 }))
11263 return false;
11264 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
11265 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
11266 if (UserTE->getNumOperands() != 2)
11267 return false;
11268 auto *It =
11269 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
11270 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
11271 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
11272 }) != TE->UserTreeIndices.end();
11273 });
11274 if (It == VectorizableTree.end())
11275 return false;
11276 int Idx;
11277 if ((Mask.size() < InputVF &&
11278 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
11279 Idx == 0) ||
11280 (Mask.size() == InputVF &&
11281 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
11282 std::iota(std::next(Mask.begin(), I * SliceSize),
11283 std::next(Mask.begin(), (I + 1) * SliceSize), 0);
11284 } else {
11285 unsigned IVal =
11286 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
11287 std::fill(std::next(Mask.begin(), I * SliceSize),
11288 std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
11289 }
11290 return true;
11291 };
11292 BVTy ShuffleBuilder(Params...);
11293 ResTy Res = ResTy();
11294 SmallVector<int> Mask;
11295 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
11296 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
11297 Value *ExtractVecBase = nullptr;
11298 bool UseVecBaseAsInput = false;
11299 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
11300 SmallVector<SmallVector<const TreeEntry *>> Entries;
11301 Type *ScalarTy = GatheredScalars.front()->getType();
11302 auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
11303 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11304 if (NumParts == 0 || NumParts >= GatheredScalars.size())
11305 NumParts = 1;
11306 if (!all_of(GatheredScalars, UndefValue::classof)) {
11307 // Check for gathered extracts.
11308 bool Resized = false;
11309 ExtractShuffles =
11310 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
11311 if (!ExtractShuffles.empty()) {
11312 SmallVector<const TreeEntry *> ExtractEntries;
11313 for (auto [Idx, I] : enumerate(ExtractMask)) {
11314 if (I == PoisonMaskElem)
11315 continue;
11316 if (const auto *TE = getTreeEntry(
11317 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
11318 ExtractEntries.push_back(TE);
11319 }
11320 if (std::optional<ResTy> Delayed =
11321 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
11322 // Delay emission of gathers which are not ready yet.
11323 PostponedGathers.insert(E);
11324 // Postpone gather emission, will be emitted after the end of the
11325 // process to keep correct order.
11326 return *Delayed;
11327 }
11328 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
11329 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
11330 ExtractVecBase = VecBase;
11331 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
11332 if (VF == VecBaseTy->getNumElements() &&
11333 GatheredScalars.size() != VF) {
11334 Resized = true;
11335 GatheredScalars.append(VF - GatheredScalars.size(),
11336 PoisonValue::get(ScalarTy));
11337 }
11338 }
11339 }
11340 // Gather extracts after we check for full matched gathers only.
11341 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
11342 E->isAltShuffle() ||
11343 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
11344 isSplat(E->Scalars) ||
11345 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
11346 GatherShuffles =
11347 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
11348 }
11349 if (!GatherShuffles.empty()) {
11350 if (std::optional<ResTy> Delayed =
11351 ShuffleBuilder.needToDelay(E, Entries)) {
11352 // Delay emission of gathers which are not ready yet.
11353 PostponedGathers.insert(E);
11354 // Postpone gather emission, will be emitted after the end of the
11355 // process to keep correct order.
11356 return *Delayed;
11357 }
11358 if (GatherShuffles.size() == 1 &&
11359 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
11360 Entries.front().front()->isSame(E->Scalars)) {
11361 // Perfect match in the graph, will reuse the previously vectorized
11362 // node. Cost is 0.
11363 LLVM_DEBUG(
11364 dbgs()
11365 << "SLP: perfect diamond match for gather bundle "
11366 << shortBundleName(E->Scalars) << ".\n");
11367 // Restore the mask for previous partially matched values.
11368 Mask.resize(E->Scalars.size());
11369 const TreeEntry *FrontTE = Entries.front().front();
11370 if (FrontTE->ReorderIndices.empty() &&
11371 ((FrontTE->ReuseShuffleIndices.empty() &&
11372 E->Scalars.size() == FrontTE->Scalars.size()) ||
11373 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
11374 std::iota(Mask.begin(), Mask.end(), 0);
11375 } else {
11376 for (auto [I, V] : enumerate(E->Scalars)) {
11377 if (isa<PoisonValue>(V)) {
11378 Mask[I] = PoisonMaskElem;
11379 continue;
11380 }
11381 Mask[I] = FrontTE->findLaneForValue(V);
11382 }
11383 }
11384 ShuffleBuilder.add(*FrontTE, Mask);
11385 Res = ShuffleBuilder.finalize(E->getCommonMask());
11386 return Res;
11387 }
11388 if (!Resized) {
11389 if (GatheredScalars.size() != VF &&
11390 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
11391 return any_of(TEs, [&](const TreeEntry *TE) {
11392 return TE->getVectorFactor() == VF;
11393 });
11394 }))
11395 GatheredScalars.append(VF - GatheredScalars.size(),
11396 PoisonValue::get(ScalarTy));
11397 }
11398 // Remove shuffled elements from list of gathers.
11399 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11400 if (Mask[I] != PoisonMaskElem)
11401 GatheredScalars[I] = PoisonValue::get(ScalarTy);
11402 }
11403 }
11404 }
11405 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
11406 SmallVectorImpl<int> &ReuseMask,
11407 bool IsRootPoison) {
11408 // For splats we can emit broadcasts instead of gathers, so try to find
11409 // such sequences.
11410 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
11411 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
11412 Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
11413 SmallVector<int> UndefPos;
11414 DenseMap<Value *, unsigned> UniquePositions;
11415 // Gather unique non-const values and all constant values.
11416 // For repeated values, just shuffle them.
11417 int NumNonConsts = 0;
11418 int SinglePos = 0;
11419 for (auto [I, V] : enumerate(Scalars)) {
11420 if (isa<UndefValue>(V)) {
11421 if (!isa<PoisonValue>(V)) {
11422 ReuseMask[I] = I;
11423 UndefPos.push_back(I);
11424 }
11425 continue;
11426 }
11427 if (isConstant(V)) {
11428 ReuseMask[I] = I;
11429 continue;
11430 }
11431 ++NumNonConsts;
11432 SinglePos = I;
11433 Value *OrigV = V;
11434 Scalars[I] = PoisonValue::get(ScalarTy);
11435 if (IsSplat) {
11436 Scalars.front() = OrigV;
11437 ReuseMask[I] = 0;
11438 } else {
11439 const auto Res = UniquePositions.try_emplace(OrigV, I);
11440 Scalars[Res.first->second] = OrigV;
11441 ReuseMask[I] = Res.first->second;
11442 }
11443 }
11444 if (NumNonConsts == 1) {
11445 // Restore single insert element.
11446 if (IsSplat) {
11447 ReuseMask.assign(VF, PoisonMaskElem);
11448 std::swap(Scalars.front(), Scalars[SinglePos]);
11449 if (!UndefPos.empty() && UndefPos.front() == 0)
11450 Scalars.front() = UndefValue::get(ScalarTy);
11451 }
11452 ReuseMask[SinglePos] = SinglePos;
11453 } else if (!UndefPos.empty() && IsSplat) {
11454 // For undef values, try to replace them with the simple broadcast.
11455 // We can do it if the broadcasted value is guaranteed to be
11456 // non-poisonous, or by freezing the incoming scalar value first.
11457 auto *It = find_if(Scalars, [this, E](Value *V) {
11458 return !isa<UndefValue>(V) &&
11459 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
11460 (E->UserTreeIndices.size() == 1 &&
11461 any_of(V->uses(), [E](const Use &U) {
11462 // Check if the value already used in the same operation in
11463 // one of the nodes already.
11464 return E->UserTreeIndices.front().EdgeIdx !=
11465 U.getOperandNo() &&
11466 is_contained(
11467 E->UserTreeIndices.front().UserTE->Scalars,
11468 U.getUser());
11469 })));
11470 });
11471 if (It != Scalars.end()) {
11472 // Replace undefs by the non-poisoned scalars and emit broadcast.
11473 int Pos = std::distance(Scalars.begin(), It);
11474 for (int I : UndefPos) {
11475 // Set the undef position to the non-poisoned scalar.
11476 ReuseMask[I] = Pos;
11477 // Replace the undef by the poison, in the mask it is replaced by
11478 // non-poisoned scalar already.
11479 if (I != Pos)
11480 Scalars[I] = PoisonValue::get(ScalarTy);
11481 }
11482 } else {
11483 // Replace undefs by the poisons, emit broadcast and then emit
11484 // freeze.
11485 for (int I : UndefPos) {
11486 ReuseMask[I] = PoisonMaskElem;
11487 if (isa<UndefValue>(Scalars[I]))
11488 Scalars[I] = PoisonValue::get(ScalarTy);
11489 }
11490 NeedFreeze = true;
11491 }
11492 }
11493 };
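  // Illustrative example (not from the original source): packing the splat
  // {%x, undef, %x, %x} keeps %x in lane 0 and yields the broadcast reuse mask
  // <0, 0, 0, 0> when %x can be shown not to be poison; otherwise the undef
  // lane is left as poison in the mask and NeedFreeze forces a freeze of the
  // final vector.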
11494 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
11495 bool IsNonPoisoned = true;
11496 bool IsUsedInExpr = true;
11497 Value *Vec1 = nullptr;
11498 if (!ExtractShuffles.empty()) {
11499 // Gather of extractelements can be represented as just a shuffle of
11500 // a single/two vectors the scalars are extracted from.
11501 // Find input vectors.
11502 Value *Vec2 = nullptr;
11503 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
11504 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
11505 ExtractMask[I] = PoisonMaskElem;
11506 }
11507 if (UseVecBaseAsInput) {
11508 Vec1 = ExtractVecBase;
11509 } else {
11510 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
11511 if (ExtractMask[I] == PoisonMaskElem)
11512 continue;
11513 if (isa<UndefValue>(E->Scalars[I]))
11514 continue;
11515 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
11516 Value *VecOp = EI->getVectorOperand();
11517 if (const auto *TE = getTreeEntry(VecOp))
11518 if (TE->VectorizedValue)
11519 VecOp = TE->VectorizedValue;
11520 if (!Vec1) {
11521 Vec1 = VecOp;
11522 } else if (Vec1 != EI->getVectorOperand()) {
11523 assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
11524 "Expected only 1 or 2 vectors shuffle.");
11525 Vec2 = VecOp;
11526 }
11527 }
11528 }
11529 if (Vec2) {
11530 IsUsedInExpr = false;
11531 IsNonPoisoned &=
11532 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
11533 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
11534 } else if (Vec1) {
11535 IsUsedInExpr &= FindReusedSplat(
11536 ExtractMask,
11537 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
11538 ExtractMask.size());
11539 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
11540 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
11541 } else {
11542 IsUsedInExpr = false;
11543 ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
11544 ScalarTy, GatheredScalars.size())),
11545 ExtractMask, /*ForExtracts=*/true);
11546 }
11547 }
11548 if (!GatherShuffles.empty()) {
11549 unsigned SliceSize = E->Scalars.size() / NumParts;
11550 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11551 for (const auto [I, TEs] : enumerate(Entries)) {
11552 if (TEs.empty()) {
11553 assert(!GatherShuffles[I] &&
11554 "No shuffles with empty entries list expected.");
11555 continue;
11556 }
11557 assert((TEs.size() == 1 || TEs.size() == 2) &&
11558 "Expected shuffle of 1 or 2 entries.");
11559 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
11560 VecMask.assign(VecMask.size(), PoisonMaskElem);
11561 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
11562 if (TEs.size() == 1) {
11563 IsUsedInExpr &=
11564 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
11565 ShuffleBuilder.add(*TEs.front(), VecMask);
11566 if (TEs.front()->VectorizedValue)
11567 IsNonPoisoned &=
11568 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
11569 } else {
11570 IsUsedInExpr = false;
11571 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
11572 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
11573 IsNonPoisoned &=
11574 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
11575 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
11576 }
11577 }
11578 }
11579 // Try to figure out best way to combine values: build a shuffle and insert
11580 // elements or just build several shuffles.
11581 // Insert non-constant scalars.
11582 SmallVector<Value *> NonConstants(GatheredScalars);
11583 int EMSz = ExtractMask.size();
11584 int MSz = Mask.size();
11585 // Try to build constant vector and shuffle with it only if currently we
11586 // have a single permutation and more than 1 scalar constant.
11587 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
11588 bool IsIdentityShuffle =
11589 ((UseVecBaseAsInput ||
11590 all_of(ExtractShuffles,
11591 [](const std::optional<TTI::ShuffleKind> &SK) {
11592 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
11593 TTI::SK_PermuteSingleSrc;
11594 })) &&
11595 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
11596 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
11597 (!GatherShuffles.empty() &&
11598 all_of(GatherShuffles,
11599 [](const std::optional<TTI::ShuffleKind> &SK) {
11600 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
11601 TTI::SK_PermuteSingleSrc;
11602 }) &&
11603 none_of(Mask, [&](int I) { return I >= MSz; }) &&
11604 ShuffleVectorInst::isIdentityMask(Mask, MSz));
11605 bool EnoughConstsForShuffle =
11606 IsSingleShuffle &&
11607 (none_of(GatheredScalars,
11608 [](Value *V) {
11609 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
11610 }) ||
11611 any_of(GatheredScalars,
11612 [](Value *V) {
11613 return isa<Constant>(V) && !isa<UndefValue>(V);
11614 })) &&
11615 (!IsIdentityShuffle ||
11616 (GatheredScalars.size() == 2 &&
11617 any_of(GatheredScalars,
11618 [](Value *V) { return !isa<UndefValue>(V); })) ||
11619 count_if(GatheredScalars, [](Value *V) {
11620 return isa<Constant>(V) && !isa<PoisonValue>(V);
11621 }) > 1);
11622 // NonConstants array contains just non-constant values, GatheredScalars
11623 // contains only constants to build the final vector and then shuffle.
11624 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
11625 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
11626 NonConstants[I] = PoisonValue::get(ScalarTy);
11627 else
11628 GatheredScalars[I] = PoisonValue::get(ScalarTy);
11629 }
11630 // Generate constants for final shuffle and build a mask for them.
11631 if (!all_of(GatheredScalars, PoisonValue::classof)) {
11632 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
11633 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
11634 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
11635 ShuffleBuilder.add(BV, BVMask);
11636 }
11637 if (all_of(NonConstants, [=](Value *V) {
11638 return isa<PoisonValue>(V) ||
11639 (IsSingleShuffle && ((IsIdentityShuffle &&
11640 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
11641 }))
11642 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
11643 else
11644 Res = ShuffleBuilder.finalize(
11645 E->ReuseShuffleIndices, E->Scalars.size(),
11646 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
11647 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
11648 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
11649 });
11650 } else if (!allConstant(GatheredScalars)) {
11651 // Gather unique scalars and all constants.
11652 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
11653 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
11654 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
11655 ShuffleBuilder.add(BV, ReuseMask);
11656 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
11657 } else {
11658 // Gather all constants.
11659 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
11660 for (auto [I, V] : enumerate(E->Scalars)) {
11661 if (!isa<PoisonValue>(V))
11662 Mask[I] = I;
11663 }
11664 Value *BV = ShuffleBuilder.gather(E->Scalars);
11665 ShuffleBuilder.add(BV, Mask);
11666 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
11667 }
11668
11669 if (NeedFreeze)
11670 Res = ShuffleBuilder.createFreeze(Res);
11671 return Res;
11672}
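// Note (illustrative summary, not from the original source): for a gather node
// this routine prefers, in order, a perfect reuse of an already vectorized
// entry (the "perfect diamond match" above), shuffles of the vectors the
// scalars were extracted from or of other vectorized entries, and only then a
// plain build vector of the remaining scalars, optionally followed by a freeze
// when a possibly-poisonous broadcast had to be used.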
11673
11674Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
11675 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
11676 *this);
11677}
11678
11679Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
11680 IRBuilder<>::InsertPointGuard Guard(Builder);
11681
11682 if (E->VectorizedValue &&
11683 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
11684 E->isAltShuffle())) {
11685 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
11686 return E->VectorizedValue;
11687 }
11688
11689 if (E->State == TreeEntry::NeedToGather) {
11690 // Set insert point for non-reduction initial nodes.
11691 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
11692 setInsertPointAfterBundle(E);
11693 Value *Vec = createBuildVector(E);
11694 E->VectorizedValue = Vec;
11695 return Vec;
11696 }
11697
11698 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
11699 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy,
11700 bool IsSigned) {
11701 if (V->getType() != VecTy)
11702 V = Builder.CreateIntCast(V, VecTy, IsSigned);
11703 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
11704 if (E->getOpcode() == Instruction::Store) {
11705 ArrayRef<int> Mask =
11706 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
11707 E->ReorderIndices.size());
11708 ShuffleBuilder.add(V, Mask);
11709 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
11710 ShuffleBuilder.addOrdered(V, std::nullopt);
11711 } else {
11712 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
11713 }
11714 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
11715 };
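  // Note (illustrative, not from the original source): FinalShuffle first casts
  // the value to the (possibly bitwidth-demoted) vector type if needed, then
  // re-applies the entry's reorder indices (or the store order) and finally the
  // reuse mask, so every vectorized node is handed back in the lane order its
  // users expect.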
11716
11717 assert((E->State == TreeEntry::Vectorize ||
11718 E->State == TreeEntry::ScatterVectorize ||
11719 E->State == TreeEntry::StridedVectorize) &&
11720 "Unhandled state");
11721 unsigned ShuffleOrOp =
11722 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11723 Instruction *VL0 = E->getMainOp();
11724 Type *ScalarTy = VL0->getType();
11725 if (auto *Store = dyn_cast<StoreInst>(VL0))
11726 ScalarTy = Store->getValueOperand()->getType();
11727 else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
11728 ScalarTy = IE->getOperand(1)->getType();
11729 bool IsSigned = false;
11730 auto It = MinBWs.find(E);
11731 if (It != MinBWs.end()) {
11732 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
11733 IsSigned = It->second.second;
11734 }
11735 auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
11736 switch (ShuffleOrOp) {
11737 case Instruction::PHI: {
11738 assert((E->ReorderIndices.empty() ||
11739 E != VectorizableTree.front().get() ||
11740 !E->UserTreeIndices.empty()) &&
11741 "PHI reordering is free.");
11742 if (PostponedPHIs && E->VectorizedValue)
11743 return E->VectorizedValue;
11744 auto *PH = cast<PHINode>(VL0);
11745 Builder.SetInsertPoint(PH->getParent(),
11746 PH->getParent()->getFirstNonPHIIt());
11747 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
11748 if (PostponedPHIs || !E->VectorizedValue) {
11749 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
11750 E->PHI = NewPhi;
11751 Value *V = NewPhi;
11752
11753 // Adjust insertion point once all PHI's have been generated.
11754 Builder.SetInsertPoint(PH->getParent(),
11755 PH->getParent()->getFirstInsertionPt());
11756 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
11757
11758 V = FinalShuffle(V, E, VecTy, IsSigned);
11759
11760 E->VectorizedValue = V;
11761 if (PostponedPHIs)
11762 return V;
11763 }
11764 PHINode *NewPhi = cast<PHINode>(E->PHI);
11765 // If phi node is fully emitted - exit.
11766 if (NewPhi->getNumIncomingValues() != 0)
11767 return NewPhi;
11768
11769 // PHINodes may have multiple entries from the same block. We want to
11770 // visit every block once.
11771 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
11772
11773 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
11774 ValueList Operands;
11775 BasicBlock *IBB = PH->getIncomingBlock(I);
11776
11777 // Stop emission if all incoming values are generated.
11778 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
11779 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11780 return NewPhi;
11781 }
11782
11783 if (!VisitedBBs.insert(IBB).second) {
11784 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
11785 continue;
11786 }
11787
11788 Builder.SetInsertPoint(IBB->getTerminator());
11789 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
11790 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
11791 if (VecTy != Vec->getType()) {
11792 assert(MinBWs.contains(getOperandEntry(E, I)) &&
11793 "Expected item in MinBWs.");
11794 Vec = Builder.CreateIntCast(Vec, VecTy, It->second.second);
11795 }
11796 NewPhi->addIncoming(Vec, IBB);
11797 }
11798
11799 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
11800 "Invalid number of incoming values");
11801 return NewPhi;
11802 }
11803
11804 case Instruction::ExtractElement: {
11805 Value *V = E->getSingleOperand(0);
11806 if (const TreeEntry *TE = getTreeEntry(V))
11807 V = TE->VectorizedValue;
11808 setInsertPointAfterBundle(E);
11809 V = FinalShuffle(V, E, VecTy, IsSigned);
11810 E->VectorizedValue = V;
11811 return V;
11812 }
11813 case Instruction::ExtractValue: {
11814 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
11815 Builder.SetInsertPoint(LI);
11816 Value *Ptr = LI->getPointerOperand();
11817 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
11818 Value *NewV = propagateMetadata(V, E->Scalars);
11819 NewV = FinalShuffle(NewV, E, VecTy, IsSigned);
11820 E->VectorizedValue = NewV;
11821 return NewV;
11822 }
11823 case Instruction::InsertElement: {
11824 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
11825 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
11826 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
11827 ArrayRef<Value *> Op = E->getOperand(1);
11828 Type *ScalarTy = Op.front()->getType();
11829 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
11830 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
11831 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
11832 assert(Res.first > 0 && "Expected item in MinBWs.");
11833 V = Builder.CreateIntCast(
11834 V,
11835 FixedVectorType::get(
11836 ScalarTy,
11837 cast<FixedVectorType>(V->getType())->getNumElements()),
11838 Res.second);
11839 }
11840
11841 // Create InsertVector shuffle if necessary
11842 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11843 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11844 }));
11845 const unsigned NumElts =
11846 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
11847 const unsigned NumScalars = E->Scalars.size();
11848
11849 unsigned Offset = *getInsertIndex(VL0);
11850 assert(Offset < NumElts && "Failed to find vector index offset");
11851
11852 // Create shuffle to resize vector
11853 SmallVector<int> Mask;
11854 if (!E->ReorderIndices.empty()) {
11855 inversePermutation(E->ReorderIndices, Mask);
11856 Mask.append(NumElts - NumScalars, PoisonMaskElem);
11857 } else {
11858 Mask.assign(NumElts, PoisonMaskElem);
11859 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
11860 }
11861 // Create InsertVector shuffle if necessary
11862 bool IsIdentity = true;
11863 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
11864 Mask.swap(PrevMask);
11865 for (unsigned I = 0; I < NumScalars; ++I) {
11866 Value *Scalar = E->Scalars[PrevMask[I]];
11867 unsigned InsertIdx = *getInsertIndex(Scalar);
11868 IsIdentity &= InsertIdx - Offset == I;
11869 Mask[InsertIdx - Offset] = I;
11870 }
11871 if (!IsIdentity || NumElts != NumScalars) {
11872 Value *V2 = nullptr;
11873 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
11874 SmallVector<int> InsertMask(Mask);
11875 if (NumElts != NumScalars && Offset == 0) {
11876 // Follow all insert element instructions from the current buildvector
11877 // sequence.
11878 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
11879 do {
11880 std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
11881 if (!InsertIdx)
11882 break;
11883 if (InsertMask[*InsertIdx] == PoisonMaskElem)
11884 InsertMask[*InsertIdx] = *InsertIdx;
11885 if (!Ins->hasOneUse())
11886 break;
11887 Ins = dyn_cast_or_null<InsertElementInst>(
11888 Ins->getUniqueUndroppableUser());
11889 } while (Ins);
11890 SmallBitVector UseMask =
11891 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
11892 SmallBitVector IsFirstPoison =
11893 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
11894 SmallBitVector IsFirstUndef =
11895 isUndefVector(FirstInsert->getOperand(0), UseMask);
11896 if (!IsFirstPoison.all()) {
11897 unsigned Idx = 0;
11898 for (unsigned I = 0; I < NumElts; I++) {
11899 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
11900 IsFirstUndef.test(I)) {
11901 if (IsVNonPoisonous) {
11902 InsertMask[I] = I < NumScalars ? I : 0;
11903 continue;
11904 }
11905 if (!V2)
11906 V2 = UndefValue::get(V->getType());
11907 if (Idx >= NumScalars)
11908 Idx = NumScalars - 1;
11909 InsertMask[I] = NumScalars + Idx;
11910 ++Idx;
11911 } else if (InsertMask[I] != PoisonMaskElem &&
11912 Mask[I] == PoisonMaskElem) {
11913 InsertMask[I] = PoisonMaskElem;
11914 }
11915 }
11916 } else {
11917 InsertMask = Mask;
11918 }
11919 }
11920 if (!V2)
11921 V2 = PoisonValue::get(V->getType());
11922 V = Builder.CreateShuffleVector(V, V2, InsertMask);
11923 if (auto *I = dyn_cast<Instruction>(V)) {
11924 GatherShuffleExtractSeq.insert(I);
11925 CSEBlocks.insert(I->getParent());
11926 }
11927 }
11928
11929 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
11930 for (unsigned I = 0; I < NumElts; I++) {
11931 if (Mask[I] != PoisonMaskElem)
11932 InsertMask[Offset + I] = I;
11933 }
11934 SmallBitVector UseMask =
11935 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
11936 SmallBitVector IsFirstUndef =
11937 isUndefVector(FirstInsert->getOperand(0), UseMask);
11938 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
11939 NumElts != NumScalars) {
11940 if (IsFirstUndef.all()) {
11941 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
11942 SmallBitVector IsFirstPoison =
11943 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
11944 if (!IsFirstPoison.all()) {
11945 for (unsigned I = 0; I < NumElts; I++) {
11946 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
11947 InsertMask[I] = I + NumElts;
11948 }
11949 }
11950 V = Builder.CreateShuffleVector(
11951 V,
11952 IsFirstPoison.all() ? PoisonValue::get(V->getType())
11953 : FirstInsert->getOperand(0),
11954 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
11955 if (auto *I = dyn_cast<Instruction>(V)) {
11956 GatherShuffleExtractSeq.insert(I);
11957 CSEBlocks.insert(I->getParent());
11958 }
11959 }
11960 } else {
11961 SmallBitVector IsFirstPoison =
11962 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
11963 for (unsigned I = 0; I < NumElts; I++) {
11964 if (InsertMask[I] == PoisonMaskElem)
11965 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
11966 else
11967 InsertMask[I] += NumElts;
11968 }
11969 V = Builder.CreateShuffleVector(
11970 FirstInsert->getOperand(0), V, InsertMask,
11971 cast<Instruction>(E->Scalars.back())->getName());
11972 if (auto *I = dyn_cast<Instruction>(V)) {
11973 GatherShuffleExtractSeq.insert(I);
11974 CSEBlocks.insert(I->getParent());
11975 }
11976 }
11977 }
11978
11979 ++NumVectorInstructions;
11980 E->VectorizedValue = V;
11981 return V;
11982 }
11983 case Instruction::ZExt:
11984 case Instruction::SExt:
11985 case Instruction::FPToUI:
11986 case Instruction::FPToSI:
11987 case Instruction::FPExt:
11988 case Instruction::PtrToInt:
11989 case Instruction::IntToPtr:
11990 case Instruction::SIToFP:
11991 case Instruction::UIToFP:
11992 case Instruction::Trunc:
11993 case Instruction::FPTrunc:
11994 case Instruction::BitCast: {
11995 setInsertPointAfterBundle(E);
11996
11997 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
11998 if (E->VectorizedValue) {
11999 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12000 return E->VectorizedValue;
12001 }
12002
12003 auto *CI = cast<CastInst>(VL0);
12004 Instruction::CastOps VecOpcode = CI->getOpcode();
12005 Type *SrcScalarTy = VL0->getOperand(0)->getType();
12006 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
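// If the source and/or destination scalars were demoted to a narrower
// integer type (recorded in MinBWs), recompute the cast opcode for the
// demoted widths: equal widths degrade to a bitcast, a narrower destination
// becomes a trunc, and a wider destination is extended according to the
// signedness recorded for the demoted source.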
12007 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
12008 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
12009 // Check if the values are candidates to demote.
12010 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
12011 if (SrcIt != MinBWs.end())
12012 SrcBWSz = SrcIt->second.first;
12013 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
12014 if (BWSz == SrcBWSz) {
12015 VecOpcode = Instruction::BitCast;
12016 } else if (BWSz < SrcBWSz) {
12017 VecOpcode = Instruction::Trunc;
12018 } else if (SrcIt != MinBWs.end()) {
12019 assert(BWSz > SrcBWSz && "Invalid cast!");
12020 VecOpcode =
12021 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12022 }
12023 }
12024 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12025 ? InVec
12026 : Builder.CreateCast(VecOpcode, InVec, VecTy);
12027 V = FinalShuffle(V, E, VecTy, IsSigned);
12028
12029 E->VectorizedValue = V;
12030 ++NumVectorInstructions;
12031 return V;
12032 }
12033 case Instruction::FCmp:
12034 case Instruction::ICmp: {
12035 setInsertPointAfterBundle(E);
12036
12037 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
12038 if (E->VectorizedValue) {
12039 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12040 return E->VectorizedValue;
12041 }
12042 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
12043 if (E->VectorizedValue) {
12044 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12045 return E->VectorizedValue;
12046 }
12047 if (L->getType() != R->getType()) {
12048 assert((MinBWs.contains(getOperandEntry(E, 0)) ||
12049 MinBWs.contains(getOperandEntry(E, 1))) &&
12050 "Expected item in MinBWs.");
12051 L = Builder.CreateIntCast(L, VecTy, IsSigned);
12052 R = Builder.CreateIntCast(R, VecTy, IsSigned);
12053 }
12054
12055 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
12056 Value *V = Builder.CreateCmp(P0, L, R);
12057 propagateIRFlags(V, E->Scalars, VL0);
12058 // Do not cast for cmps.
12059 VecTy = cast<FixedVectorType>(V->getType());
12060 V = FinalShuffle(V, E, VecTy, IsSigned);
12061
12062 E->VectorizedValue = V;
12063 ++NumVectorInstructions;
12064 return V;
12065 }
12066 case Instruction::Select: {
12067 setInsertPointAfterBundle(E);
12068
12069 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
12070 if (E->VectorizedValue) {
12071 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12072 return E->VectorizedValue;
12073 }
12074 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
12075 if (E->VectorizedValue) {
12076 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12077 return E->VectorizedValue;
12078 }
12079 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
12080 if (E->VectorizedValue) {
12081 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12082 return E->VectorizedValue;
12083 }
12084 if (True->getType() != False->getType()) {
12085 assert((MinBWs.contains(getOperandEntry(E, 1)) ||
12086 MinBWs.contains(getOperandEntry(E, 2))) &&
12087 "Expected item in MinBWs.");
12088 True = Builder.CreateIntCast(True, VecTy, IsSigned);
12089 False = Builder.CreateIntCast(False, VecTy, IsSigned);
12090 }
12091
12092 Value *V = Builder.CreateSelect(Cond, True, False);
12093 V = FinalShuffle(V, E, VecTy, IsSigned);
12094
12095 E->VectorizedValue = V;
12096 ++NumVectorInstructions;
12097 return V;
12098 }
12099 case Instruction::FNeg: {
12100 setInsertPointAfterBundle(E);
12101
12102 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
12103
12104 if (E->VectorizedValue) {
12105 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12106 return E->VectorizedValue;
12107 }
12108
12109 Value *V = Builder.CreateUnOp(
12110 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
12111 propagateIRFlags(V, E->Scalars, VL0);
12112 if (auto *I = dyn_cast<Instruction>(V))
12113 V = propagateMetadata(I, E->Scalars);
12114
12115 V = FinalShuffle(V, E, VecTy, IsSigned);
12116
12117 E->VectorizedValue = V;
12118 ++NumVectorInstructions;
12119
12120 return V;
12121 }
12122 case Instruction::Add:
12123 case Instruction::FAdd:
12124 case Instruction::Sub:
12125 case Instruction::FSub:
12126 case Instruction::Mul:
12127 case Instruction::FMul:
12128 case Instruction::UDiv:
12129 case Instruction::SDiv:
12130 case Instruction::FDiv:
12131 case Instruction::URem:
12132 case Instruction::SRem:
12133 case Instruction::FRem:
12134 case Instruction::Shl:
12135 case Instruction::LShr:
12136 case Instruction::AShr:
12137 case Instruction::And:
12138 case Instruction::Or:
12139 case Instruction::Xor: {
12140 setInsertPointAfterBundle(E);
12141
12142 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
12143 if (E->VectorizedValue) {
12144 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12145 return E->VectorizedValue;
12146 }
12147 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
12148 if (E->VectorizedValue) {
12149 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12150 return E->VectorizedValue;
12151 }
12152 if (LHS->getType() != RHS->getType()) {
12153 assert((MinBWs.contains(getOperandEntry(E, 0)) ||
12154 MinBWs.contains(getOperandEntry(E, 1))) &&
12155 "Expected item in MinBWs.");
12156 LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned);
12157 RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned);
12158 }
12159
12160 Value *V = Builder.CreateBinOp(
12161 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
12162 RHS);
12163 propagateIRFlags(V, E->Scalars, VL0, !MinBWs.contains(E));
12164 if (auto *I = dyn_cast<Instruction>(V))
12165 V = propagateMetadata(I, E->Scalars);
12166
12167 V = FinalShuffle(V, E, VecTy, IsSigned);
12168
12169 E->VectorizedValue = V;
12170 ++NumVectorInstructions;
12171
12172 return V;
12173 }
12174 case Instruction::Load: {
12175 // Loads are inserted at the head of the tree because we don't want to
12176 // sink them all the way down past store instructions.
12177 setInsertPointAfterBundle(E);
12178
12179 LoadInst *LI = cast<LoadInst>(VL0);
12180 Instruction *NewLI;
12181 Value *PO = LI->getPointerOperand();
12182 if (E->State == TreeEntry::Vectorize) {
12183 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
12184 } else if (E->State == TreeEntry::StridedVectorize) {
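// Strided loads are lowered to a call to the
// llvm.experimental.vp.strided.load intrinsic. The stride is either a
// compile-time constant derived from the pointer difference between the
// first and last scalar load, or a run-time value computed by
// calculateRtStride; it is negated when the scalars are traversed in
// reverse order.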
12185 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
12186 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
12187 PO = IsReverseOrder ? PtrN : Ptr0;
12188 std::optional<int> Diff = getPointersDiff(
12189 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
12190 Type *StrideTy = DL->getIndexType(PO->getType());
12191 Value *StrideVal;
12192 if (Diff) {
12193 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
12194 StrideVal =
12195 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
12196 DL->getTypeAllocSize(ScalarTy));
12197 } else {
12198 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
12199 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
12200 return cast<LoadInst>(V)->getPointerOperand();
12201 });
12202 OrdersType Order;
12203 std::optional<Value *> Stride =
12204 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
12205 &*Builder.GetInsertPoint());
12206 Value *NewStride =
12207 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
12208 StrideVal = Builder.CreateMul(
12209 NewStride,
12210 ConstantInt::get(
12211 StrideTy,
12212 (IsReverseOrder ? -1 : 1) *
12213 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
12214 }
12215 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
12216 auto *Inst = Builder.CreateIntrinsic(
12217 Intrinsic::experimental_vp_strided_load,
12218 {VecTy, PO->getType(), StrideTy},
12219 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
12220 Builder.getInt32(E->Scalars.size())});
12221 Inst->addParamAttr(
12222 /*ArgNo=*/0,
12223 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
12224 NewLI = Inst;
12225 } else {
12226 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
12227 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
12228 if (E->VectorizedValue) {
12229 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12230 return E->VectorizedValue;
12231 }
12232 // Use the minimum alignment of the gathered loads.
12233 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
12234 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
12235 }
12236 Value *V = propagateMetadata(NewLI, E->Scalars);
12237
12238 V = FinalShuffle(V, E, VecTy, IsSigned);
12239 E->VectorizedValue = V;
12240 ++NumVectorInstructions;
12241 return V;
12242 }
12243 case Instruction::Store: {
12244 auto *SI = cast<StoreInst>(VL0);
12245
12246 setInsertPointAfterBundle(E);
12247
12248 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
12249 VecValue = FinalShuffle(VecValue, E, VecTy, IsSigned);
12250
12251 Value *Ptr = SI->getPointerOperand();
12252 StoreInst *ST =
12253 Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
12254
12255 Value *V = propagateMetadata(ST, E->Scalars);
12256
12257 E->VectorizedValue = V;
12258 ++NumVectorInstructions;
12259 return V;
12260 }
12261 case Instruction::GetElementPtr: {
12262 auto *GEP0 = cast<GetElementPtrInst>(VL0);
12263 setInsertPointAfterBundle(E);
12264
12265 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
12266 if (E->VectorizedValue) {
12267 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12268 return E->VectorizedValue;
12269 }
12270
12271 SmallVector<Value *> OpVecs;
12272 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
12273 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
12274 if (E->VectorizedValue) {
12275 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12276 return E->VectorizedValue;
12277 }
12278 OpVecs.push_back(OpVec);
12279 }
12280
12281 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
12282 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
12283 SmallVector<Value *> GEPs;
12284 for (Value *V : E->Scalars) {
12285 if (isa<GetElementPtrInst>(V))
12286 GEPs.push_back(V);
12287 }
12288 V = propagateMetadata(I, GEPs);
12289 }
12290
12291 V = FinalShuffle(V, E, VecTy, IsSigned);
12292
12293 E->VectorizedValue = V;
12294 ++NumVectorInstructions;
12295
12296 return V;
12297 }
12298 case Instruction::Call: {
12299 CallInst *CI = cast<CallInst>(VL0);
12300 setInsertPointAfterBundle(E);
12301
12302 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12303
12304 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
12305 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
12306 VecCallCosts.first <= VecCallCosts.second;
12307
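// Choose between a vector library function and the vector intrinsic based
// on the estimated call costs. Scalar-only intrinsic operands are passed
// through unvectorized, and overloaded types are collected so that the
// matching intrinsic declaration can be requested.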
12308 Value *ScalarArg = nullptr;
12309 SmallVector<Value *> OpVecs;
12310 SmallVector<Type *, 2> TysForDecl;
12311 // Add return type if intrinsic is overloaded on it.
12312 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
12313 TysForDecl.push_back(
12314 FixedVectorType::get(CI->getType(), E->Scalars.size()));
12315 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
12316 ValueList OpVL;
12317 // Some intrinsics have scalar arguments. Such an argument should not be
12318 // vectorized.
12319 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
12320 CallInst *CEI = cast<CallInst>(VL0);
12321 ScalarArg = CEI->getArgOperand(I);
12322 OpVecs.push_back(CEI->getArgOperand(I));
12323 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
12324 TysForDecl.push_back(ScalarArg->getType());
12325 continue;
12326 }
12327
12328 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
12329 if (E->VectorizedValue) {
12330 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12331 return E->VectorizedValue;
12332 }
12333 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
12334 OpVecs.push_back(OpVec);
12335 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
12336 TysForDecl.push_back(OpVec->getType());
12337 }
12338
12339 Function *CF;
12340 if (!UseIntrinsic) {
12341 VFShape Shape =
12342 VFShape::get(CI->getFunctionType(),
12343 ElementCount::getFixed(
12344 static_cast<unsigned>(VecTy->getNumElements())),
12345 false /*HasGlobalPred*/);
12346 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
12347 } else {
12348 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
12349 }
12350
12351 SmallVector<OperandBundleDef, 1> OpBundles;
12352 CI->getOperandBundlesAsDefs(OpBundles);
12353 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
12354
12355 propagateIRFlags(V, E->Scalars, VL0);
12356 V = FinalShuffle(V, E, VecTy, IsSigned);
12357
12358 E->VectorizedValue = V;
12359 ++NumVectorInstructions;
12360 return V;
12361 }
12362 case Instruction::ShuffleVector: {
12363 assert(E->isAltShuffle() &&
12364 ((Instruction::isBinaryOp(E->getOpcode()) &&
12365 Instruction::isBinaryOp(E->getAltOpcode())) ||
12366 (Instruction::isCast(E->getOpcode()) &&
12367 Instruction::isCast(E->getAltOpcode())) ||
12368 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
12369 "Invalid Shuffle Vector Operand");
12370
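// Alternate-opcode nodes are lowered by emitting both the main and the
// alternate operation on full-width vectors and then blending the two
// results with a shuffle whose mask picks, per lane, the operation that the
// original scalar used.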
12371 Value *LHS = nullptr, *RHS = nullptr;
12372 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
12373 setInsertPointAfterBundle(E);
12374 LHS = vectorizeOperand(E, 0, PostponedPHIs);
12375 if (E->VectorizedValue) {
12376 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12377 return E->VectorizedValue;
12378 }
12379 RHS = vectorizeOperand(E, 1, PostponedPHIs);
12380 } else {
12381 setInsertPointAfterBundle(E);
12382 LHS = vectorizeOperand(E, 0, PostponedPHIs);
12383 }
12384 if (E->VectorizedValue) {
12385 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12386 return E->VectorizedValue;
12387 }
12388 if (LHS && RHS && LHS->getType() != RHS->getType()) {
12389 assert((MinBWs.contains(getOperandEntry(E, 0)) ||
12390 MinBWs.contains(getOperandEntry(E, 1))) &&
12391 "Expected item in MinBWs.");
12392 LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned);
12393 RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned);
12394 }
12395
12396 Value *V0, *V1;
12397 if (Instruction::isBinaryOp(E->getOpcode())) {
12398 V0 = Builder.CreateBinOp(
12399 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
12400 V1 = Builder.CreateBinOp(
12401 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
12402 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
12403 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
12404 auto *AltCI = cast<CmpInst>(E->getAltOp());
12405 CmpInst::Predicate AltPred = AltCI->getPredicate();
12406 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
12407 } else {
12408 V0 = Builder.CreateCast(
12409 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
12410 V1 = Builder.CreateCast(
12411 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
12412 }
12413 // Add V0 and V1 to later analysis to try to find and remove matching
12414 // instructions, if any.
12415 for (Value *V : {V0, V1}) {
12416 if (auto *I = dyn_cast<Instruction>(V)) {
12417 GatherShuffleExtractSeq.insert(I);
12418 CSEBlocks.insert(I->getParent());
12419 }
12420 }
12421
12422 // Create shuffle to take alternate operations from the vector.
12423 // Also, gather up main and alt scalar ops to propagate IR flags to
12424 // each vector operation.
12425 ValueList OpScalars, AltScalars;
12426 SmallVector<int> Mask;
12427 E->buildAltOpShuffleMask(
12428 [E, this](Instruction *I) {
12429 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
12430 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
12431 *TLI);
12432 },
12433 Mask, &OpScalars, &AltScalars);
12434
12435 propagateIRFlags(V0, OpScalars);
12436 propagateIRFlags(V1, AltScalars);
12437
12438 Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
12439 if (auto *I = dyn_cast<Instruction>(V)) {
12440 V = propagateMetadata(I, E->Scalars);
12441 GatherShuffleExtractSeq.insert(I);
12442 CSEBlocks.insert(I->getParent());
12443 }
12444
12445 if (V->getType() != VecTy && !isa<CmpInst>(VL0))
12446 V = Builder.CreateIntCast(
12447 V, FixedVectorType::get(ScalarTy, E->getVectorFactor()), IsSigned);
12448 E->VectorizedValue = V;
12449 ++NumVectorInstructions;
12450
12451 return V;
12452 }
12453 default:
12454 llvm_unreachable("unknown inst");
12455 }
12456 return nullptr;
12457}
12458
12459 Value *BoUpSLP::vectorizeTree() {
12460 ExtraValueToDebugLocsMap ExternallyUsedValues;
12461 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
12462 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
12463}
12464
12465namespace {
12466/// Data type for handling buildvector sequences with the reused scalars from
12467/// other tree entries.
12468struct ShuffledInsertData {
12469 /// List of insertelements to be replaced by shuffles.
12470 SmallVector<InsertElementInst *> InsertElements;
12471 /// The parent vectors and shuffle mask for the given list of inserts.
12473};
12474} // namespace
12475
12476 Value *BoUpSLP::vectorizeTree(
12477 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
12478 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
12479 Instruction *ReductionRoot) {
12480 // All blocks must be scheduled before any instructions are inserted.
12481 for (auto &BSIter : BlocksSchedules) {
12482 scheduleBlock(BSIter.second.get());
12483 }
12484 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
12485 // need to rebuild it.
12486 EntryToLastInstruction.clear();
12487
12488 if (ReductionRoot)
12489 Builder.SetInsertPoint(ReductionRoot->getParent(),
12490 ReductionRoot->getIterator());
12491 else
12492 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
12493
12494 // Postpone emission of PHI operands to avoid cyclic dependency issues.
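// First pass: vectorize the whole tree, creating vector PHIs without
// operands. Second pass: revisit the vectorized PHI nodes and fill in their
// operands now that all other entries have been emitted.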
12495 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
12496 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
12497 if (TE->State == TreeEntry::Vectorize &&
12498 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
12499 TE->VectorizedValue)
12500 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
12501 // Run through the list of postponed gathers and emit them, replacing the temp
12502 // emitted allocas with actual vector instructions.
12503 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
12504 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
12505 for (const TreeEntry *E : PostponedNodes) {
12506 auto *TE = const_cast<TreeEntry *>(E);
12507 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
12508 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
12509 TE->UserTreeIndices.front().EdgeIdx)))
12510 // Found a gather node which is exactly the same as one of the
12511 // vectorized nodes. It may happen after reordering.
12512 continue;
12513 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
12514 TE->VectorizedValue = nullptr;
12515 auto *UserI =
12516 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
12517 // If the user is a PHI node, its vector code has to be inserted right before
12518 // the block terminator. Since the node was delayed, there were some unresolved
12519 // dependencies at the moment when the stub instruction was emitted. In a case
12520 // when any of these dependencies turns out to be an operand of another PHI coming
12521 // from this same block, the position of the stub instruction becomes invalid.
12522 // This is because the source vector that is supposed to feed this gather node was
12523 // inserted at the end of the block [after the stub instruction]. So we need
12524 // to adjust the insertion point again to the end of the block.
12525 if (isa<PHINode>(UserI)) {
12526 // Insert before all users.
12527 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
12528 for (User *U : PrevVec->users()) {
12529 if (U == UserI)
12530 continue;
12531 auto *UI = dyn_cast<Instruction>(U);
12532 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
12533 continue;
12534 if (UI->comesBefore(InsertPt))
12535 InsertPt = UI;
12536 }
12537 Builder.SetInsertPoint(InsertPt);
12538 } else {
12539 Builder.SetInsertPoint(PrevVec);
12540 }
12541 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
12542 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
12543 PrevVec->replaceAllUsesWith(Vec);
12544 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
12545 // Replace the stub vector node, if it was already used for one of the
12546 // buildvector nodes.
12547 auto It = PostponedValues.find(PrevVec);
12548 if (It != PostponedValues.end()) {
12549 for (TreeEntry *VTE : It->getSecond())
12550 VTE->VectorizedValue = Vec;
12551 }
12552 eraseInstruction(PrevVec);
12553 }
12554
12555 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
12556 << " values .\n");
12557
12558 SmallVector<ShuffledInsertData> ShuffledInserts;
12559 // Maps vector instruction to original insertelement instruction
12560 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
12561 // Maps extract Scalar to the corresponding extractelement instruction in the
12562 // basic block. Only one extractelement per block should be emitted.
12563 DenseMap<Value *,
12564 DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
12565 ScalarToEEs;
12566 SmallDenseSet<Value *, 4> UsedInserts;
12567 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
12568 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
12569 // Extract all of the elements with the external uses.
12570 for (const auto &ExternalUse : ExternalUses) {
12571 Value *Scalar = ExternalUse.Scalar;
12572 llvm::User *User = ExternalUse.User;
12573
12574 // Skip users that we have already replaced (RAUW). This happens when one
12575 // instruction has multiple uses of the same value.
12576 if (User && !is_contained(Scalar->users(), User))
12577 continue;
12578 TreeEntry *E = getTreeEntry(Scalar);
12579 assert(E && "Invalid scalar");
12580 assert(E->State != TreeEntry::NeedToGather &&
12581 "Extracting from a gather list");
12582 // Non-instruction pointers are not deleted, just skip them.
12583 if (E->getOpcode() == Instruction::GetElementPtr &&
12584 !isa<GetElementPtrInst>(Scalar))
12585 continue;
12586
12587 Value *Vec = E->VectorizedValue;
12588 assert(Vec && "Can't find vectorizable value");
12589
12590 Value *Lane = Builder.getInt32(ExternalUse.Lane);
12591 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
12592 if (Scalar->getType() != Vec->getType()) {
12593 Value *Ex = nullptr;
12594 Value *ExV = nullptr;
12595 auto It = ScalarToEEs.find(Scalar);
12596 if (It != ScalarToEEs.end()) {
12597 // No need to emit many extracts, just move the only one in the
12598 // current block.
12599 auto EEIt = It->second.find(Builder.GetInsertBlock());
12600 if (EEIt != It->second.end()) {
12601 Instruction *I = EEIt->second.first;
12602 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
12603 Builder.GetInsertPoint()->comesBefore(I)) {
12604 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
12605 Builder.GetInsertPoint());
12606 if (auto *CI = EEIt->second.second)
12607 CI->moveAfter(I);
12608 }
12609 Ex = I;
12610 ExV = EEIt->second.second ? EEIt->second.second : Ex;
12611 }
12612 }
12613 if (!Ex) {
12614 // "Reuse" the existing extract to improve final codegen.
12615 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
12616 Value *V = ES->getVectorOperand();
12617 if (const TreeEntry *ETE = getTreeEntry(V))
12618 V = ETE->VectorizedValue;
12619 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
12620 } else {
12621 Ex = Builder.CreateExtractElement(Vec, Lane);
12622 }
12623 // If necessary, sign-extend or zero-extend ScalarRoot
12624 // to the larger type.
12625 ExV = Ex;
12626 if (Scalar->getType() != Ex->getType())
12627 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
12628 MinBWs.find(E)->second.second);
12629 if (auto *I = dyn_cast<Instruction>(Ex))
12630 ScalarToEEs[Scalar].try_emplace(
12631 Builder.GetInsertBlock(),
12632 std::make_pair(I, cast<Instruction>(ExV)));
12633 }
12634 // The then-branch of the previous if may produce constants, since
12635 // operand 0 might be a constant.
12636 if (auto *ExI = dyn_cast<Instruction>(Ex)) {
12637 GatherShuffleExtractSeq.insert(ExI);
12638 CSEBlocks.insert(ExI->getParent());
12639 }
12640 return ExV;
12641 }
12642 assert(isa<FixedVectorType>(Scalar->getType()) &&
12643 isa<InsertElementInst>(Scalar) &&
12644 "In-tree scalar of vector type is not insertelement?");
12645 auto *IE = cast<InsertElementInst>(Scalar);
12646 VectorToInsertElement.try_emplace(Vec, IE);
12647 return Vec;
12648 };
12649 // If User == nullptr, the Scalar remains as scalar in vectorized
12650 // instructions or is used as extra arg. Generate ExtractElement instruction
12651 // and update the record for this scalar in ExternallyUsedValues.
12652 if (!User) {
12653 if (!ScalarsWithNullptrUser.insert(Scalar).second)
12654 continue;
12655 assert((ExternallyUsedValues.count(Scalar) ||
12656 any_of(Scalar->users(),
12657 [&](llvm::User *U) {
12658 TreeEntry *UseEntry = getTreeEntry(U);
12659 return UseEntry &&
12660 (UseEntry->State == TreeEntry::Vectorize ||
12661 UseEntry->State ==
12662 TreeEntry::StridedVectorize) &&
12663 (E->State == TreeEntry::Vectorize ||
12664 E->State == TreeEntry::StridedVectorize) &&
12665 doesInTreeUserNeedToExtract(
12666 Scalar,
12667 cast<Instruction>(UseEntry->Scalars.front()),
12668 TLI);
12669 })) &&
12670 "Scalar with nullptr User must be registered in "
12671 "ExternallyUsedValues map or remain as scalar in vectorized "
12672 "instructions");
12673 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
12674 if (auto *PHI = dyn_cast<PHINode>(VecI))
12675 Builder.SetInsertPoint(PHI->getParent(),
12676 PHI->getParent()->getFirstNonPHIIt());
12677 else
12678 Builder.SetInsertPoint(VecI->getParent(),
12679 std::next(VecI->getIterator()));
12680 } else {
12681 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
12682 }
12683 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
12684 // Required to update internally referenced instructions.
12685 Scalar->replaceAllUsesWith(NewInst);
12686 ReplacedExternals.emplace_back(Scalar, NewInst);
12687 continue;
12688 }
12689
12690 if (auto *VU = dyn_cast<InsertElementInst>(User)) {
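// If the external user is an insertelement that belongs to a buildvector
// sequence, do not emit an extract; instead record the lane in
// ShuffledInserts so the whole buildvector can later be rewritten as a
// shuffle of the vectorized value.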
12691 // Skip if the scalar is another vector op or Vec is not an instruction.
12692 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
12693 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
12694 if (!UsedInserts.insert(VU).second)
12695 continue;
12696 // Need to use original vector, if the root is truncated.
12697 auto BWIt = MinBWs.find(E);
12698 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
12699 auto *ScalarTy = FTy->getElementType();
12700 auto Key = std::make_pair(Vec, ScalarTy);
12701 auto VecIt = VectorCasts.find(Key);
12702 if (VecIt == VectorCasts.end()) {
12703 IRBuilder<>::InsertPointGuard Guard(Builder);
12704 if (auto *IVec = dyn_cast<Instruction>(Vec))
12705 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
12706 Vec = Builder.CreateIntCast(
12707 Vec,
12708 FixedVectorType::get(
12709 ScalarTy,
12710 cast<FixedVectorType>(Vec->getType())->getNumElements()),
12711 BWIt->second.second);
12712 VectorCasts.try_emplace(Key, Vec);
12713 } else {
12714 Vec = VecIt->second;
12715 }
12716 }
12717
12718 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
12719 if (InsertIdx) {
12720 auto *It =
12721 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
12722 // Checks if 2 insertelements are from the same buildvector.
12723 InsertElementInst *VecInsert = Data.InsertElements.front();
12724 return areTwoInsertFromSameBuildVector(
12725 VU, VecInsert,
12726 [](InsertElementInst *II) { return II->getOperand(0); });
12727 });
12728 unsigned Idx = *InsertIdx;
12729 if (It == ShuffledInserts.end()) {
12730 (void)ShuffledInserts.emplace_back();
12731 It = std::next(ShuffledInserts.begin(),
12732 ShuffledInserts.size() - 1);
12733 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
12734 if (Mask.empty())
12735 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
12736 // Find the insertelement chain that was vectorized in the tree, if any.
12737 Value *Base = VU;
12738 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
12739 if (IEBase != User &&
12740 (!IEBase->hasOneUse() ||
12741 getInsertIndex(IEBase).value_or(Idx) == Idx))
12742 break;
12743 // Build the mask for the vectorized insertelement instructions.
12744 if (const TreeEntry *E = getTreeEntry(IEBase)) {
12745 do {
12746 IEBase = cast<InsertElementInst>(Base);
12747 int IEIdx = *getInsertIndex(IEBase);
12748 assert(Mask[Idx] == PoisonMaskElem &&
12749 "InsertElementInstruction used already.");
12750 Mask[IEIdx] = IEIdx;
12751 Base = IEBase->getOperand(0);
12752 } while (E == getTreeEntry(Base));
12753 break;
12754 }
12755 Base = cast<InsertElementInst>(Base)->getOperand(0);
12756 // After the vectorization the def-use chain has changed, so we need
12757 // to look through the original insertelement instructions if they
12758 // got replaced by vector instructions.
12759 auto It = VectorToInsertElement.find(Base);
12760 if (It != VectorToInsertElement.end())
12761 Base = It->second;
12762 }
12763 }
12764 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
12765 if (Mask.empty())
12766 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
12767 Mask[Idx] = ExternalUse.Lane;
12768 It->InsertElements.push_back(cast<InsertElementInst>(User));
12769 continue;
12770 }
12771 }
12772 }
12773 }
12774
12775 // Generate extracts for out-of-tree users.
12776 // Find the insertion point for the extractelement lane.
12777 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
12778 if (PHINode *PH = dyn_cast<PHINode>(User)) {
12779 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12780 if (PH->getIncomingValue(I) == Scalar) {
12781 Instruction *IncomingTerminator =
12782 PH->getIncomingBlock(I)->getTerminator();
12783 if (isa<CatchSwitchInst>(IncomingTerminator)) {
12784 Builder.SetInsertPoint(VecI->getParent(),
12785 std::next(VecI->getIterator()));
12786 } else {
12787 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
12788 }
12789 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
12790 PH->setOperand(I, NewInst);
12791 }
12792 }
12793 } else {
12794 Builder.SetInsertPoint(cast<Instruction>(User));
12795 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
12796 User->replaceUsesOfWith(Scalar, NewInst);
12797 }
12798 } else {
12799 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
12800 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
12801 User->replaceUsesOfWith(Scalar, NewInst);
12802 }
12803
12804 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
12805 }
12806
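// Helper that splits a combined two-source shuffle mask into one mask per
// operand and emits the shuffle through the ShuffleInstructionBuilder.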
12807 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
12808 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12809 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12810 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
12811 for (int I = 0, E = Mask.size(); I < E; ++I) {
12812 if (Mask[I] < VF)
12813 CombinedMask1[I] = Mask[I];
12814 else
12815 CombinedMask2[I] = Mask[I] - VF;
12816 }
12817 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
12818 ShuffleBuilder.add(V1, CombinedMask1);
12819 if (V2)
12820 ShuffleBuilder.add(V2, CombinedMask2);
12821 return ShuffleBuilder.finalize(std::nullopt);
12822 };
12823
12824 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
12825 bool ForSingleMask) {
12826 unsigned VF = Mask.size();
12827 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
12828 if (VF != VecVF) {
12829 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
12830 Vec = CreateShuffle(Vec, nullptr, Mask);
12831 return std::make_pair(Vec, true);
12832 }
12833 if (!ForSingleMask) {
12834 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
12835 for (unsigned I = 0; I < VF; ++I) {
12836 if (Mask[I] != PoisonMaskElem)
12837 ResizeMask[Mask[I]] = Mask[I];
12838 }
12839 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
12840 }
12841 }
12842
12843 return std::make_pair(Vec, false);
12844 };
12845 // Perform shuffling of the vectorized tree entries for better handling of
12846 // external extracts.
12847 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
12848 // Find the first and the last instruction in the list of insertelements.
12849 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
12850 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
12851 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
12852 Builder.SetInsertPoint(LastInsert);
12853 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
12854 Value *NewInst = performExtractsShuffleAction<Value>(
12855 MutableArrayRef(Vector.data(), Vector.size()),
12856 FirstInsert->getOperand(0),
12857 [](Value *Vec) {
12858 return cast<VectorType>(Vec->getType())
12859 ->getElementCount()
12860 .getKnownMinValue();
12861 },
12862 ResizeToVF,
12863 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
12864 ArrayRef<Value *> Vals) {
12865 assert((Vals.size() == 1 || Vals.size() == 2) &&
12866 "Expected exactly 1 or 2 input values.");
12867 if (Vals.size() == 1) {
12868 // Do not create shuffle if the mask is a simple identity
12869 // non-resizing mask.
12870 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
12871 ->getNumElements() ||
12872 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
12873 return CreateShuffle(Vals.front(), nullptr, Mask);
12874 return Vals.front();
12875 }
12876 return CreateShuffle(Vals.front() ? Vals.front()
12877 : FirstInsert->getOperand(0),
12878 Vals.back(), Mask);
12879 });
12880 auto It = ShuffledInserts[I].InsertElements.rbegin();
12881 // Rebuild buildvector chain.
12882 InsertElementInst *II = nullptr;
12883 if (It != ShuffledInserts[I].InsertElements.rend())
12884 II = *It;
12885 SmallVector<Instruction *> Inserts;
12886 while (It != ShuffledInserts[I].InsertElements.rend()) {
12887 assert(II && "Must be an insertelement instruction.");
12888 if (*It == II)
12889 ++It;
12890 else
12891 Inserts.push_back(cast<Instruction>(II));
12892 II = dyn_cast<InsertElementInst>(II->getOperand(0));
12893 }
12894 for (Instruction *II : reverse(Inserts)) {
12895 II->replaceUsesOfWith(II->getOperand(0), NewInst);
12896 if (auto *NewI = dyn_cast<Instruction>(NewInst))
12897 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
12898 II->moveAfter(NewI);
12899 NewInst = II;
12900 }
12901 LastInsert->replaceAllUsesWith(NewInst);
12902 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
12903 IE->replaceUsesOfWith(IE->getOperand(0),
12904 PoisonValue::get(IE->getOperand(0)->getType()));
12905 IE->replaceUsesOfWith(IE->getOperand(1),
12906 PoisonValue::get(IE->getOperand(1)->getType()));
12907 eraseInstruction(IE);
12908 }
12909 CSEBlocks.insert(LastInsert->getParent());
12910 }
12911
12912 SmallVector<Instruction *> RemovedInsts;
12913 // For each vectorized value:
12914 for (auto &TEPtr : VectorizableTree) {
12915 TreeEntry *Entry = TEPtr.get();
12916
12917 // No need to handle users of gathered values.
12918 if (Entry->State == TreeEntry::NeedToGather)
12919 continue;
12920
12921 assert(Entry->VectorizedValue && "Can't find vectorizable value");
12922
12923 // For each lane:
12924 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
12925 Value *Scalar = Entry->Scalars[Lane];
12926
12927 if (Entry->getOpcode() == Instruction::GetElementPtr &&
12928 !isa<GetElementPtrInst>(Scalar))
12929 continue;
12930#ifndef NDEBUG
12931 Type *Ty = Scalar->getType();
12932 if (!Ty->isVoidTy()) {
12933 for (User *U : Scalar->users()) {
12934 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
12935
12936 // It is legal to delete users in the ignorelist.
12937 assert((getTreeEntry(U) ||
12938 (UserIgnoreList && UserIgnoreList->contains(U)) ||
12939 (isa_and_nonnull<Instruction>(U) &&
12940 isDeleted(cast<Instruction>(U)))) &&
12941 "Deleting out-of-tree value");
12942 }
12943 }
12944#endif
12945 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
12946 eraseInstruction(cast<Instruction>(Scalar));
12947 // Retain to-be-deleted instructions for some debug-info
12948 // bookkeeping. NOTE: eraseInstruction only marks the instruction for
12949 // deletion - instructions are not deleted until later.
12950 RemovedInsts.push_back(cast<Instruction>(Scalar));
12951 }
12952 }
12953
12954 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
12955 // new vector instruction.
12956 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
12957 V->mergeDIAssignID(RemovedInsts);
12958
12959 Builder.ClearInsertionPoint();
12960 InstrElementSize.clear();
12961
12962 return VectorizableTree[0]->VectorizedValue;
12963}
12964
12965 void BoUpSLP::optimizeGatherSequence() {
12966 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
12967 << " gather sequences instructions.\n");
12968 // LICM InsertElementInst sequences.
12969 for (Instruction *I : GatherShuffleExtractSeq) {
12970 if (isDeleted(I))
12971 continue;
12972
12973 // Check if this block is inside a loop.
12974 Loop *L = LI->getLoopFor(I->getParent());
12975 if (!L)
12976 continue;
12977
12978 // Check if it has a preheader.
12979 BasicBlock *PreHeader = L->getLoopPreheader();
12980 if (!PreHeader)
12981 continue;
12982
12983 // If the vector or the element that we insert into it are
12984 // instructions that are defined in this basic block then we can't
12985 // hoist this instruction.
12986 if (any_of(I->operands(), [L](Value *V) {
12987 auto *OpI = dyn_cast<Instruction>(V);
12988 return OpI && L->contains(OpI);
12989 }))
12990 continue;
12991
12992 // We can hoist this instruction. Move it to the pre-header.
12993 I->moveBefore(PreHeader->getTerminator());
12994 CSEBlocks.insert(PreHeader);
12995 }
12996
12997 // Make a list of all reachable blocks in our CSE queue.
12998 SmallVector<const DomTreeNode *, 8> CSEWorkList;
12999 CSEWorkList.reserve(CSEBlocks.size());
13000 for (BasicBlock *BB : CSEBlocks)
13001 if (DomTreeNode *N = DT->getNode(BB)) {
13002 assert(DT->isReachableFromEntry(N));
13003 CSEWorkList.push_back(N);
13004 }
13005
13006 // Sort blocks by domination. This ensures we visit a block after all blocks
13007 // dominating it are visited.
13008 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
13009 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
13010 "Different nodes should have different DFS numbers");
13011 return A->getDFSNumIn() < B->getDFSNumIn();
13012 });
13013
13014 // Less defined shuffles can be replaced by more defined copies.
13015 // Between two shuffles, one is less defined if it has the same vector operands
13016 // and each of its mask indices is either the same as in the other one or undef.
13017 // E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
13018 // poison, <0, 0, 0, 0>.
13019 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
13020 SmallVectorImpl<int> &NewMask) {
13021 if (I1->getType() != I2->getType())
13022 return false;
13023 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
13024 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
13025 if (!SI1 || !SI2)
13026 return I1->isIdenticalTo(I2);
13027 if (SI1->isIdenticalTo(SI2))
13028 return true;
13029 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
13030 if (SI1->getOperand(I) != SI2->getOperand(I))
13031 return false;
13032 // Check if the second instruction is more defined than the first one.
13033 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
13034 ArrayRef<int> SM1 = SI1->getShuffleMask();
13035 // Count trailing undefs in the mask to check the final number of used
13036 // registers.
13037 unsigned LastUndefsCnt = 0;
13038 for (int I = 0, E = NewMask.size(); I < E; ++I) {
13039 if (SM1[I] == PoisonMaskElem)
13040 ++LastUndefsCnt;
13041 else
13042 LastUndefsCnt = 0;
13043 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
13044 NewMask[I] != SM1[I])
13045 return false;
13046 if (NewMask[I] == PoisonMaskElem)
13047 NewMask[I] = SM1[I];
13048 }
13049 // Check if the last undefs actually change the final number of used vector
13050 // registers.
13051 return SM1.size() - LastUndefsCnt > 1 &&
13052 TTI->getNumberOfParts(SI1->getType()) ==
13053 TTI->getNumberOfParts(
13054 FixedVectorType::get(SI1->getType()->getElementType(),
13055 SM1.size() - LastUndefsCnt));
13056 };
13057 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
13058 // instructions. TODO: We can further optimize this scan if we split the
13059 // instructions into different buckets based on the insert lane.
13060 SmallVector<Instruction *, 16> Visited;
13061 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
13062 assert(*I &&
13063 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
13064 "Worklist not sorted properly!");
13065 BasicBlock *BB = (*I)->getBlock();
13066 // For all instructions in blocks containing gather sequences:
13067 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
13068 if (isDeleted(&In))
13069 continue;
13070 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
13071 !GatherShuffleExtractSeq.contains(&In))
13072 continue;
13073
13074 // Check if we can replace this instruction with any of the
13075 // visited instructions.
13076 bool Replaced = false;
13077 for (Instruction *&V : Visited) {
13078 SmallVector<int> NewMask;
13079 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
13080 DT->dominates(V->getParent(), In.getParent())) {
13081 In.replaceAllUsesWith(V);
13082 eraseInstruction(&In);
13083 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
13084 if (!NewMask.empty())
13085 SI->setShuffleMask(NewMask);
13086 Replaced = true;
13087 break;
13088 }
13089 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
13090 GatherShuffleExtractSeq.contains(V) &&
13091 IsIdenticalOrLessDefined(V, &In, NewMask) &&
13092 DT->dominates(In.getParent(), V->getParent())) {
13093 In.moveAfter(V);
13094 V->replaceAllUsesWith(&In);
13095 eraseInstruction(V);
13096 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
13097 if (!NewMask.empty())
13098 SI->setShuffleMask(NewMask);
13099 V = &In;
13100 Replaced = true;
13101 break;
13102 }
13103 }
13104 if (!Replaced) {
13105 assert(!is_contained(Visited, &In));
13106 Visited.push_back(&In);
13107 }
13108 }
13109 }
13110 CSEBlocks.clear();
13111 GatherShuffleExtractSeq.clear();
13112}
13113
13114BoUpSLP::ScheduleData *
13115BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
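// Chain the ScheduleData of all bundle members together; the first member
// becomes the scheduling entity that represents the whole bundle.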
13116 ScheduleData *Bundle = nullptr;
13117 ScheduleData *PrevInBundle = nullptr;
13118 for (Value *V : VL) {
13119 if (doesNotNeedToBeScheduled(V))
13120 continue;
13121 ScheduleData *BundleMember = getScheduleData(V);
13122 assert(BundleMember &&
13123 "no ScheduleData for bundle member "
13124 "(maybe not in same basic block)");
13125 assert(BundleMember->isSchedulingEntity() &&
13126 "bundle member already part of other bundle");
13127 if (PrevInBundle) {
13128 PrevInBundle->NextInBundle = BundleMember;
13129 } else {
13130 Bundle = BundleMember;
13131 }
13132
13133 // Group the instructions into a bundle.
13134 BundleMember->FirstInBundle = Bundle;
13135 PrevInBundle = BundleMember;
13136 }
13137 assert(Bundle && "Failed to find schedule bundle");
13138 return Bundle;
13139}
13140
13141 // Groups the instructions into a bundle (which is then a single scheduling entity)
13142// and schedules instructions until the bundle gets ready.
13143std::optional<BoUpSLP::ScheduleData *>
13144BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
13145 const InstructionsState &S) {
13146 // No need to schedule PHIs, insertelement, extractelement and extractvalue
13147 // instructions.
13148 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
13149 doesNotNeedToSchedule(VL))
13150 return nullptr;
13151
13152 // Initialize the instruction bundle.
13153 Instruction *OldScheduleEnd = ScheduleEnd;
13154 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
13155
13156 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
13157 ScheduleData *Bundle) {
13158 // The scheduling region got new instructions at the lower end (or it is a
13159 // new region for the first bundle). This makes it necessary to
13160 // recalculate all dependencies.
13161 // It is seldom that this needs to be done a second time after adding the
13162 // initial bundle to the region.
13163 if (ScheduleEnd != OldScheduleEnd) {
13164 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
13165 doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
13166 ReSchedule = true;
13167 }
13168 if (Bundle) {
13169 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
13170 << " in block " << BB->getName() << "\n");
13171 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
13172 }
13173
13174 if (ReSchedule) {
13175 resetSchedule();
13176 initialFillReadyList(ReadyInsts);
13177 }
13178
13179 // Now try to schedule the new bundle or (if no bundle) just calculate
13180 // dependencies. As soon as the bundle is "ready" it means that there are no
13181 // cyclic dependencies and we can schedule it. Note that it's important that we
13182 // don't "schedule" the bundle yet (see cancelScheduling).
13183 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
13184 !ReadyInsts.empty()) {
13185 ScheduleData *Picked = ReadyInsts.pop_back_val();
13186 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
13187 "must be ready to schedule");
13188 schedule(Picked, ReadyInsts);
13189 }
13190 };
13191
13192 // Make sure that the scheduling region contains all
13193 // instructions of the bundle.
13194 for (Value *V : VL) {
13195 if (doesNotNeedToBeScheduled(V))
13196 continue;
13197 if (!extendSchedulingRegion(V, S)) {
13198 // If the scheduling region got new instructions at the lower end (or it
13199 // is a new region for the first bundle), it becomes necessary to
13200 // recalculate all dependencies.
13201 // Otherwise the compiler may crash trying to calculate dependencies
13202 // incorrectly and emit instructions in the wrong order at the actual
13203 // scheduling.
13204 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
13205 return std::nullopt;
13206 }
13207 }
13208
13209 bool ReSchedule = false;
13210 for (Value *V : VL) {
13211 if (doesNotNeedToBeScheduled(V))
13212 continue;
13213 ScheduleData *BundleMember = getScheduleData(V);
13214 assert(BundleMember &&
13215 "no ScheduleData for bundle member (maybe not in same basic block)");
13216
13217 // Make sure we don't leave the pieces of the bundle in the ready list when
13218 // the whole bundle might not be ready.
13219 ReadyInsts.remove(BundleMember);
13220
13221 if (!BundleMember->IsScheduled)
13222 continue;
13223 // A bundle member was scheduled as a single instruction before and now
13224 // needs to be scheduled as part of the bundle. We just get rid of the
13225 // existing schedule.
13226 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
13227 << " was already scheduled\n");
13228 ReSchedule = true;
13229 }
13230
13231 auto *Bundle = buildBundle(VL);
13232 TryScheduleBundleImpl(ReSchedule, Bundle);
13233 if (!Bundle->isReady()) {
13234 cancelScheduling(VL, S.OpValue);
13235 return std::nullopt;
13236 }
13237 return Bundle;
13238}
13239
13240void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
13241 Value *OpValue) {
13242 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
13243 doesNotNeedToSchedule(VL))
13244 return;
13245
13246 if (doesNotNeedToBeScheduled(OpValue))
13247 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
13248 ScheduleData *Bundle = getScheduleData(OpValue);
13249 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
13250 assert(!Bundle->IsScheduled &&
13251 "Can't cancel bundle which is already scheduled");
13252 assert(Bundle->isSchedulingEntity() &&
13253 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
13254 "tried to unbundle something which is not a bundle");
13255
13256 // Remove the bundle from the ready list.
13257 if (Bundle->isReady())
13258 ReadyInsts.remove(Bundle);
13259
13260 // Un-bundle: make single instructions out of the bundle.
13261 ScheduleData *BundleMember = Bundle;
13262 while (BundleMember) {
13263 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
13264 BundleMember->FirstInBundle = BundleMember;
13265 ScheduleData *Next = BundleMember->NextInBundle;
13266 BundleMember->NextInBundle = nullptr;
13267 BundleMember->TE = nullptr;
13268 if (BundleMember->unscheduledDepsInBundle() == 0) {
13269 ReadyInsts.insert(BundleMember);
13270 }
13271 BundleMember = Next;
13272 }
13273}
13274
13275BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
13276 // Allocate a new ScheduleData for the instruction.
13277 if (ChunkPos >= ChunkSize) {
13278 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
13279 ChunkPos = 0;
13280 }
13281 return &(ScheduleDataChunks.back()[ChunkPos++]);
13282}
13283
13284bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
13285 const InstructionsState &S) {
13286 if (getScheduleData(V, isOneOf(S, V)))
13287 return true;
13288 Instruction *I = dyn_cast<Instruction>(V);
13289 assert(I && "bundle member must be an instruction");
13290 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
13291 !doesNotNeedToBeScheduled(I) &&
13292 "phi nodes/insertelements/extractelements/extractvalues don't need to "
13293 "be scheduled");
13294 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
13295 ScheduleData *ISD = getScheduleData(I);
13296 if (!ISD)
13297 return false;
13298 assert(isInSchedulingRegion(ISD) &&
13299 "ScheduleData not in scheduling region");
13300 ScheduleData *SD = allocateScheduleDataChunks();
13301 SD->Inst = I;
13302 SD->init(SchedulingRegionID, S.OpValue);
13303 ExtraScheduleDataMap[I][S.OpValue] = SD;
13304 return true;
13305 };
13306 if (CheckScheduleForI(I))
13307 return true;
13308 if (!ScheduleStart) {
13309 // It's the first instruction in the new region.
13310 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
13311 ScheduleStart = I;
13312 ScheduleEnd = I->getNextNode();
13313 if (isOneOf(S, I) != I)
13314 CheckScheduleForI(I);
13315 assert(ScheduleEnd && "tried to vectorize a terminator?");
13316 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
13317 return true;
13318 }
13319 // Search up and down at the same time, because we don't know if the new
13320 // instruction is above or below the existing scheduling region.
13321 // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not counted
13322 // against the budget. Otherwise debug info could affect codegen.
13323 BasicBlock::reverse_iterator UpIter =
13324 ++ScheduleStart->getIterator().getReverse();
13325 BasicBlock::reverse_iterator UpperEnd = BB->rend();
13326 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
13327 BasicBlock::iterator LowerEnd = BB->end();
13328 auto IsAssumeLikeIntr = [](const Instruction &I) {
13329 if (auto *II = dyn_cast<IntrinsicInst>(&I))
13330 return II->isAssumeLikeIntrinsic();
13331 return false;
13332 };
13333 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
13334 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
13335 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
13336 &*DownIter != I) {
13337 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
13338 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
13339 return false;
13340 }
13341
13342 ++UpIter;
13343 ++DownIter;
13344
13345 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
13346 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
13347 }
13348 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
13349 assert(I->getParent() == ScheduleStart->getParent() &&
13350 "Instruction is in wrong basic block.");
13351 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
13352 ScheduleStart = I;
13353 if (isOneOf(S, I) != I)
13354 CheckScheduleForI(I);
13355 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
13356 << "\n");
13357 return true;
13358 }
13359 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
13360 "Expected to reach top of the basic block or instruction down the "
13361 "lower end.");
13362 assert(I->getParent() == ScheduleEnd->getParent() &&
13363 "Instruction is in wrong basic block.");
13364 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
13365 nullptr);
13366 ScheduleEnd = I->getNextNode();
13367 if (isOneOf(S, I) != I)
13368 CheckScheduleForI(I);
13369 assert(ScheduleEnd && "tried to vectorize a terminator?");
13370 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
13371 return true;
13372}
13373
13374void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
13375 Instruction *ToI,
13376 ScheduleData *PrevLoadStore,
13377 ScheduleData *NextLoadStore) {
13378 ScheduleData *CurrentLoadStore = PrevLoadStore;
13379 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
13380 // No need to allocate data for non-schedulable instructions.
13381 if (doesNotNeedToBeScheduled(I))
13382 continue;
13383 ScheduleData *SD = ScheduleDataMap.lookup(I);
13384 if (!SD) {
13385 SD = allocateScheduleDataChunks();
13386 ScheduleDataMap[I] = SD;
13387 SD->Inst = I;
13388 }
13389 assert(!isInSchedulingRegion(SD) &&
13390 "new ScheduleData already in scheduling region");
13391 SD->init(SchedulingRegionID, I);
13392
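// Maintain a linked list of memory-accessing instructions in program order;
// sideeffect and pseudoprobe intrinsics are not treated as memory accesses.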
13393 if (I->mayReadOrWriteMemory() &&
13394 (!isa<IntrinsicInst>(I) ||
13395 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
13396 cast<IntrinsicInst>(I)->getIntrinsicID() !=
13397 Intrinsic::pseudoprobe))) {
13398 // Update the linked list of memory accessing instructions.
13399 if (CurrentLoadStore) {
13400 CurrentLoadStore->NextLoadStore = SD;
13401 } else {
13402 FirstLoadStoreInRegion = SD;
13403 }
13404 CurrentLoadStore = SD;
13405 }
13406
13407 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
13408 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
13409 RegionHasStackSave = true;
13410 }
13411 if (NextLoadStore) {
13412 if (CurrentLoadStore)
13413 CurrentLoadStore->NextLoadStore = NextLoadStore;
13414 } else {
13415 LastLoadStoreInRegion = CurrentLoadStore;
13416 }
13417}
13418
13419void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
13420 bool InsertInReadyList,
13421 BoUpSLP *SLP) {
13422 assert(SD->isSchedulingEntity());
13423
13424 SmallVector<ScheduleData *, 256> WorkList;
13425 WorkList.push_back(SD);
13426
13427 while (!WorkList.empty()) {
13428 ScheduleData *SD = WorkList.pop_back_val();
13429 for (ScheduleData *BundleMember = SD; BundleMember;
13430 BundleMember = BundleMember->NextInBundle) {
13431 assert(isInSchedulingRegion(BundleMember));
13432 if (BundleMember->hasValidDependencies())
13433 continue;
13434
13435 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
13436 << "\n");
13437 BundleMember->Dependencies = 0;
13438 BundleMember->resetUnscheduledDeps();
13439
13440 // Handle def-use chain dependencies.
13441 if (BundleMember->OpValue != BundleMember->Inst) {
13442 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
13443 BundleMember->Dependencies++;
13444 ScheduleData *DestBundle = UseSD->FirstInBundle;
13445 if (!DestBundle->IsScheduled)
13446 BundleMember->incrementUnscheduledDeps(1);
13447 if (!DestBundle->hasValidDependencies())
13448 WorkList.push_back(DestBundle);
13449 }
13450 } else {
13451 for (User *U : BundleMember->Inst->users()) {
13452 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
13453 BundleMember->Dependencies++;
13454 ScheduleData *DestBundle = UseSD->FirstInBundle;
13455 if (!DestBundle->IsScheduled)
13456 BundleMember->incrementUnscheduledDeps(1);
13457 if (!DestBundle->hasValidDependencies())
13458 WorkList.push_back(DestBundle);
13459 }
13460 }
13461 }
13462
13463 auto MakeControlDependent = [&](Instruction *I) {
13464 auto *DepDest = getScheduleData(I);
13465 assert(DepDest && "must be in schedule window");
13466 DepDest->ControlDependencies.push_back(BundleMember);
13467 BundleMember->Dependencies++;
13468 ScheduleData *DestBundle = DepDest->FirstInBundle;
13469 if (!DestBundle->IsScheduled)
13470 BundleMember->incrementUnscheduledDeps(1);
13471 if (!DestBundle->hasValidDependencies())
13472 WorkList.push_back(DestBundle);
13473 };
13474
13475 // Any instruction which isn't safe to speculate at the beginning of the
13476 // block is control dependent on any early exit or non-willreturn call
13477 // which precedes it.
13478 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
13479 for (Instruction *I = BundleMember->Inst->getNextNode();
13480 I != ScheduleEnd; I = I->getNextNode()) {
13481 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
13482 continue;
13483
13484 // Add the dependency
13485 MakeControlDependent(I);
13486
13487 if (!isGuaranteedToTransferExecutionToSuccessor(I))
13488 // Everything past here must be control dependent on I.
13489 break;
13490 }
13491 }
13492
13493 if (RegionHasStackSave) {
13494 // If we have an inalloca alloca instruction, it needs to be scheduled
13495 // after any preceding stacksave. We also need to prevent any alloca
13496 // from reordering above a preceding stackrestore.
13497 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
13498 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
13499 for (Instruction *I = BundleMember->Inst->getNextNode();
13500 I != ScheduleEnd; I = I->getNextNode()) {
13501 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
13502 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
13503 // Any allocas past here must be control dependent on I, and I
13504 // must be memory dependent on BundleMember->Inst.
13505 break;
13506
13507 if (!isa<AllocaInst>(I))
13508 continue;
13509
13510 // Add the dependency
13511 MakeControlDependent(I);
13512 }
13513 }
13514
13515 // In addition to the cases handled just above, we need to prevent
13516 // allocas and loads/stores from moving below a stacksave or a
13517 // stackrestore. Avoiding moving allocas below a stackrestore is currently
13518 // thought to be merely conservative. Moving loads/stores below a
13519 // stackrestore can lead to incorrect code.
13520 if (isa<AllocaInst>(BundleMember->Inst) ||
13521 BundleMember->Inst->mayReadOrWriteMemory()) {
13522 for (Instruction *I = BundleMember->Inst->getNextNode();
13523 I != ScheduleEnd; I = I->getNextNode()) {
13524 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
13525 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
13526 continue;
13527
13528 // Add the dependency
13529 MakeControlDependent(I);
13530 break;
13531 }
13532 }
13533 }
13534
13535 // Handle the memory dependencies (if any).
13536 ScheduleData *DepDest = BundleMember->NextLoadStore;
13537 if (!DepDest)
13538 continue;
13539 Instruction *SrcInst = BundleMember->Inst;
13540 assert(SrcInst->mayReadOrWriteMemory() &&
13541 "NextLoadStore list for non-memory-affecting bundle?");
13542 MemoryLocation SrcLoc = getLocation(SrcInst);
13543 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
13544 unsigned NumAliased = 0;
13545 unsigned DistToSrc = 1;
13546
13547 for (; DepDest; DepDest = DepDest->NextLoadStore) {
13548 assert(isInSchedulingRegion(DepDest));
13549
13550 // We have two limits to reduce the complexity:
13551 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
13552 // SLP->isAliased (which is the expensive part in this loop).
13553 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
13554 // the whole loop (even if the loop is fast, it's quadratic).
13555 // It's important for the loop break condition (see below) to
13556 // check this limit even between two read-only instructions.
13557 if (DistToSrc >= MaxMemDepDistance ||
13558 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
13559 (NumAliased >= AliasedCheckLimit ||
13560 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
13561
13562 // We increment the counter only if the locations are aliased
13563 // (instead of counting all alias checks). This gives a better
13564 // balance between reduced runtime and accurate dependencies.
13565 NumAliased++;
13566
13567 DepDest->MemoryDependencies.push_back(BundleMember);
13568 BundleMember->Dependencies++;
13569 ScheduleData *DestBundle = DepDest->FirstInBundle;
13570 if (!DestBundle->IsScheduled) {
13571 BundleMember->incrementUnscheduledDeps(1);
13572 }
13573 if (!DestBundle->hasValidDependencies()) {
13574 WorkList.push_back(DestBundle);
13575 }
13576 }
13577
13578 // Example, explaining the loop break condition: Let's assume our
13579 // starting instruction is i0 and MaxMemDepDistance = 3.
13580 //
13581 // +--------v--v--v
13582 // i0,i1,i2,i3,i4,i5,i6,i7,i8
13583 // +--------^--^--^
13584 //
13585 // MaxMemDepDistance let us stop alias-checking at i3 and we add
13586 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
13587 // Previously we already added dependencies from i3 to i6,i7,i8
13588 // (because of MaxMemDepDistance). As we added a dependency from
13589 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
13590 // and we can abort this loop at i6.
13591 if (DistToSrc >= 2 * MaxMemDepDistance)
13592 break;
13593 DistToSrc++;
13594 }
13595 }
13596 if (InsertInReadyList && SD->isReady()) {
13597 ReadyInsts.insert(SD);
13598 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
13599 << "\n");
13600 }
13601 }
13602}
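The memory-dependence loop above is deliberately bounded: AliasedCheckLimit caps the number of expensive alias queries, and MaxMemDepDistance (with its doubling in the break condition) caps how far the scan walks. A rough standalone sketch of that windowed scan, with a caller-supplied MayAlias predicate standing in for the real SLP->isAliased() query:

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

namespace {
struct Access {
  bool MayWrite = false;
};

// Record a conservative dependency from SrcIdx to every later access that may
// alias it, but stop paying for alias checks after AliasedCheckLimit hits and
// abort the walk entirely at 2 * MaxMemDepDistance, mirroring the limits above.
void scanMemoryDeps(const std::vector<Access> &Accesses, size_t SrcIdx,
                    const std::function<bool(size_t, size_t)> &MayAlias,
                    std::vector<std::pair<size_t, size_t>> &Deps,
                    unsigned AliasedCheckLimit, unsigned MaxMemDepDistance) {
  unsigned NumAliased = 0;
  unsigned DistToSrc = 1;
  for (size_t I = SrcIdx + 1; I < Accesses.size(); ++I) {
    bool EitherWrites = Accesses[SrcIdx].MayWrite || Accesses[I].MayWrite;
    if (DistToSrc >= MaxMemDepDistance ||
        (EitherWrites &&
         (NumAliased >= AliasedCheckLimit || MayAlias(SrcIdx, I)))) {
      ++NumAliased;                 // count only the (possibly) aliased cases
      Deps.emplace_back(SrcIdx, I); // conservative dependency edge
    }
    if (DistToSrc >= 2 * MaxMemDepDistance)
      break;
    ++DistToSrc;
  }
}
} // namespace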
13603
13604void BoUpSLP::BlockScheduling::resetSchedule() {
13605 assert(ScheduleStart &&
13606 "tried to reset schedule on block which has not been scheduled");
13607 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
13608 doForAllOpcodes(I, [&](ScheduleData *SD) {
13609 assert(isInSchedulingRegion(SD) &&
13610 "ScheduleData not in scheduling region");
13611 SD->IsScheduled = false;
13612 SD->resetUnscheduledDeps();
13613 });
13614 }
13615 ReadyInsts.clear();
13616}
13617
13618void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
13619 if (!BS->ScheduleStart)
13620 return;
13621
13622 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
13623
13624 // A key point - if we got here, pre-scheduling was able to find a valid
13625 // scheduling of the sub-graph of the scheduling window which consists
13626 // of all vector bundles and their transitive users. As such, we do not
13627 // need to reschedule anything *outside of* that subgraph.
13628
13629 BS->resetSchedule();
13630
13631 // For the real scheduling we use a more sophisticated ready-list: it is
13632 // sorted by the original instruction location. This lets the final schedule
13633 // be as close as possible to the original instruction order.
13634 // WARNING: If changing this order causes a correctness issue, that means
13635 // there is some missing dependence edge in the schedule data graph.
13636 struct ScheduleDataCompare {
13637 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
13638 return SD2->SchedulingPriority < SD1->SchedulingPriority;
13639 }
13640 };
13641 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
13642
13643 // Ensure that all dependency data is updated (for nodes in the sub-graph)
13644 // and fill the ready-list with initial instructions.
13645 int Idx = 0;
13646 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
13647 I = I->getNextNode()) {
13648 BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
13649 TreeEntry *SDTE = getTreeEntry(SD->Inst);
13650 (void)SDTE;
13651 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
13652 SD->isPartOfBundle() ==
13653 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
13654 "scheduler and vectorizer bundle mismatch");
13655 SD->FirstInBundle->SchedulingPriority = Idx++;
13656
13657 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
13658 BS->calculateDependencies(SD, false, this);
13659 });
13660 }
13661 BS->initialFillReadyList(ReadyInsts);
13662
13663 Instruction *LastScheduledInst = BS->ScheduleEnd;
13664
13665 // Do the "real" scheduling.
13666 while (!ReadyInsts.empty()) {
13667 ScheduleData *Picked = *ReadyInsts.begin();
13668 ReadyInsts.erase(ReadyInsts.begin());
13669
13670 // Move the scheduled instruction(s) to their dedicated places, if not
13671 // there yet.
13672 for (ScheduleData *BundleMember = Picked; BundleMember;
13673 BundleMember = BundleMember->NextInBundle) {
13674 Instruction *PickedInst = BundleMember->Inst;
13675 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
13676 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
13677 LastScheduledInst = PickedInst;
13678 }
13679
13680 BS->schedule(Picked, ReadyInsts);
13681 }
13682
13683 // Check that we didn't break any of our invariants.
13684#ifdef EXPENSIVE_CHECKS
13685 BS->verify();
13686#endif
13687
13688#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
13689 // Check that all schedulable entities got scheduled
13690 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
13691 BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
13692 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
13693 assert(SD->IsScheduled && "must be scheduled at this point");
13694 }
13695 });
13696 }
13697#endif
13698
13699 // Avoid duplicate scheduling of the block.
13700 BS->ScheduleStart = nullptr;
13701}
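The scheduler above is a ready-list scheduler: a bundle becomes ready once all of its dependencies have been scheduled, and among ready entries the one with the smallest SchedulingPriority (i.e. the earliest original position) is picked, so the final order stays close to the source order. A compact standalone sketch of the same discipline over a toy dependence graph:

#include <cstddef>
#include <set>
#include <vector>

namespace {
struct ToyNode {
  unsigned Priority;          // original position in the block
  unsigned UnscheduledDeps;   // remaining unscheduled predecessors
  std::vector<size_t> Users;  // nodes that depend on this one
};

// Emit nodes in dependence order, always picking the ready node that appeared
// earliest in the original order (priorities are assumed unique here).
std::vector<size_t> listSchedule(std::vector<ToyNode> &Nodes) {
  auto ByPriority = [&](size_t A, size_t B) {
    return Nodes[A].Priority < Nodes[B].Priority;
  };
  std::set<size_t, decltype(ByPriority)> Ready(ByPriority);
  for (size_t I = 0; I < Nodes.size(); ++I)
    if (Nodes[I].UnscheduledDeps == 0)
      Ready.insert(I);

  std::vector<size_t> Order;
  while (!Ready.empty()) {
    size_t Picked = *Ready.begin();
    Ready.erase(Ready.begin());
    Order.push_back(Picked);
    for (size_t User : Nodes[Picked].Users)
      if (--Nodes[User].UnscheduledDeps == 0)
        Ready.insert(User);
  }
  return Order;
}
} // namespace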
13702
13703 unsigned BoUpSLP::getVectorElementSize(Value *V) {
13704 // If V is a store, just return the width of the stored value (or value
13705 // truncated just before storing) without traversing the expression tree.
13706 // This is the common case.
13707 if (auto *Store = dyn_cast<StoreInst>(V))
13708 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
13709
13710 if (auto *IEI = dyn_cast<InsertElementInst>(V))
13711 return getVectorElementSize(IEI->getOperand(1));
13712
13713 auto E = InstrElementSize.find(V);
13714 if (E != InstrElementSize.end())
13715 return E->second;
13716
13717 // If V is not a store, we can traverse the expression tree to find loads
13718 // that feed it. The type of the loaded value may indicate a more suitable
13719 // width than V's type. We want to base the vector element size on the width
13720 // of memory operations where possible.
13721 SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
13722 SmallPtrSet<Instruction *, 16> Visited;
13723 if (auto *I = dyn_cast<Instruction>(V)) {
13724 Worklist.emplace_back(I, I->getParent());
13725 Visited.insert(I);
13726 }
13727
13728 // Traverse the expression tree in bottom-up order looking for loads. If we
13729 // encounter an instruction we don't yet handle, we give up.
13730 auto Width = 0u;
13731 while (!Worklist.empty()) {
13732 Instruction *I;
13733 BasicBlock *Parent;
13734 std::tie(I, Parent) = Worklist.pop_back_val();
13735
13736 // We should only be looking at scalar instructions here. If the current
13737 // instruction has a vector type, skip.
13738 auto *Ty = I->getType();
13739 if (isa<VectorType>(Ty))
13740 continue;
13741
13742 // If the current instruction is a load, extractelement or extractvalue,
13743 // update Width to reflect the width of the produced value.
13744 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
13745 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
13746
13747 // Otherwise, we need to visit the operands of the instruction. We only
13748 // handle the interesting cases from buildTree here. If an operand is an
13749 // instruction we haven't yet visited and from the same basic block as the
13750 // user or the use is a PHI node, we add it to the worklist.
13751 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
13752 BinaryOperator, UnaryOperator>(I)) {
13753 for (Use &U : I->operands())
13754 if (auto *J = dyn_cast<Instruction>(U.get()))
13755 if (Visited.insert(J).second &&
13756 (isa<PHINode>(I) || J->getParent() == Parent))
13757 Worklist.emplace_back(J, J->getParent());
13758 } else {
13759 break;
13760 }
13761 }
13762
13763 // If we didn't encounter a memory access in the expression tree, or if we
13764 // gave up for some reason, just return the width of V. Otherwise, return the
13765 // maximum width we found.
13766 if (!Width) {
13767 if (auto *CI = dyn_cast<CmpInst>(V))
13768 V = CI->getOperand(0);
13769 Width = DL->getTypeSizeInBits(V->getType());
13770 }
13771
13772 for (Instruction *I : Visited)
13773 InstrElementSize[I] = Width;
13774
13775 return Width;
13776}
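In other words, the element size is the width of the widest load (or extract) reachable from V through the handled instructions, with a fallback to V's own type width. A condensed standalone sketch of that worklist walk over a toy expression node (ExprNode and its fields are inventions of this sketch, not IR classes):

#include <algorithm>
#include <unordered_set>
#include <vector>

namespace {
struct ExprNode {
  bool IsLoad = false;
  unsigned WidthInBits = 0;         // width of the value this node produces
  std::vector<ExprNode *> Operands;
};

// Return the widest load feeding Root, or Root's own width if no load is
// reachable -- the same policy getVectorElementSize() applies to real IR.
unsigned elementSizeInBits(ExprNode *Root) {
  unsigned Width = 0;
  std::unordered_set<ExprNode *> Visited{Root};
  std::vector<ExprNode *> Worklist{Root};
  while (!Worklist.empty()) {
    ExprNode *N = Worklist.back();
    Worklist.pop_back();
    if (N->IsLoad)
      Width = std::max(Width, N->WidthInBits);
    for (ExprNode *Op : N->Operands)
      if (Visited.insert(Op).second)
        Worklist.push_back(Op);
  }
  return Width ? Width : Root->WidthInBits;
}
} // namespace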
13777
13778// Determine if a value V in a vectorizable expression Expr can be demoted to a
13779// smaller type with a truncation. We collect the values that will be demoted
13780// in ToDemote and additional roots that require investigating in Roots.
13781bool BoUpSLP::collectValuesToDemote(
13782 Value *V, SmallVectorImpl<Value *> &ToDemote,
13783 DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
13784 SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const {
13785 // We can always demote constants.
13786 if (isa<Constant>(V))
13787 return true;
13788
13789 // If the value is not a vectorized instruction in the expression, is used in
13790 // multiple vector nodes, has already been visited, or is only used by
13791 // non-vectorized insertelement instructions, it cannot be demoted.
13792 auto *I = dyn_cast<Instruction>(V);
13793 if (!I || !getTreeEntry(I) || MultiNodeScalars.contains(I) ||
13794 !Visited.insert(I).second || all_of(I->users(), [&](User *U) {
13795 return isa<InsertElementInst>(U) && !getTreeEntry(U);
13796 }))
13797 return false;
13798
13799 unsigned Start = 0;
13800 unsigned End = I->getNumOperands();
13801 switch (I->getOpcode()) {
13802
13803 // We can always demote truncations and extensions. Since truncations can
13804 // seed additional demotion, we save the truncated value.
13805 case Instruction::Trunc:
13806 Roots.push_back(I->getOperand(0));
13807 break;
13808 case Instruction::ZExt:
13809 case Instruction::SExt:
13810 if (isa<ExtractElementInst, InsertElementInst>(I->getOperand(0)))
13811 return false;
13812 break;
13813
13814 // We can demote certain binary operations if we can demote both of their
13815 // operands.
13816 case Instruction::Add:
13817 case Instruction::Sub:
13818 case Instruction::Mul:
13819 case Instruction::And:
13820 case Instruction::Or:
13821 case Instruction::Xor:
13822 if (!collectValuesToDemote(I->getOperand(0), ToDemote, DemotedConsts, Roots,
13823 Visited) ||
13824 !collectValuesToDemote(I->getOperand(1), ToDemote, DemotedConsts, Roots,
13825 Visited))
13826 return false;
13827 break;
13828
13829 // We can demote selects if we can demote their true and false values.
13830 case Instruction::Select: {
13831 Start = 1;
13832 SelectInst *SI = cast<SelectInst>(I);
13833 if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts,
13834 Roots, Visited) ||
13835 !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts,
13836 Roots, Visited))
13837 return false;
13838 break;
13839 }
13840
13841 // We can demote phis if we can demote all their incoming operands. Note that
13842 // we don't need to worry about cycles since we ensure single use above.
13843 case Instruction::PHI: {
13844 PHINode *PN = cast<PHINode>(I);
13845 for (Value *IncValue : PN->incoming_values())
13846 if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots,
13847 Visited))
13848 return false;
13849 break;
13850 }
13851
13852 // Otherwise, conservatively give up.
13853 default:
13854 return false;
13855 }
13856
13857 // Gather demoted constant operands.
13858 for (unsigned Idx : seq<unsigned>(Start, End))
13859 if (isa<Constant>(I->getOperand(Idx)))
13860 DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx);
13861 // Record the value that we can demote.
13862 ToDemote.push_back(V);
13863 return true;
13864}
13865
13866 void BoUpSLP::computeMinimumValueSizes() {
13867 // We only attempt to truncate integer expressions.
13868 auto &TreeRoot = VectorizableTree[0]->Scalars;
13869 auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
13870 if (!TreeRootIT || VectorizableTree.front()->State == TreeEntry::NeedToGather)
13871 return;
13872
13873 // Ensure the roots of the vectorizable tree don't form a cycle.
13874 if (!VectorizableTree.front()->UserTreeIndices.empty())
13875 return;
13876
13877 // Conservatively determine if we can actually truncate the roots of the
13878 // expression. Collect the values that can be demoted in ToDemote and
13879 // additional roots that require investigating in Roots.
13880 SmallVector<Value *, 32> ToDemote;
13881 DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts;
13882 SmallVector<Value *, 4> Roots;
13883 for (auto *Root : TreeRoot) {
13884 DenseSet<Value *> Visited;
13885 if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited))
13886 return;
13887 }
13888
13889 // The maximum bit width required to represent all the values that can be
13890 // demoted without loss of precision. It would be safe to truncate the roots
13891 // of the expression to this width.
13892 auto MaxBitWidth = 1u;
13893
13894 // We first check if all the bits of the roots are demanded. If they're not,
13895 // we can truncate the roots to this narrower type.
13896 for (auto *Root : TreeRoot) {
13897 auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
13898 MaxBitWidth = std::max<unsigned>(Mask.getBitWidth() - Mask.countl_zero(),
13899 MaxBitWidth);
13900 }
13901
13902 // True if the roots can be zero-extended back to their original type, rather
13903 // than sign-extended. We know that if the leading bits are not demanded, we
13904 // can safely zero-extend. So we initialize IsKnownPositive to True.
13905 bool IsKnownPositive = true;
13906
13907 // If all the bits of the roots are demanded, we can try a little harder to
13908 // compute a narrower type. This can happen, for example, if the roots are
13909 // getelementptr indices. InstCombine promotes these indices to the pointer
13910 // width. Thus, all their bits are technically demanded even though the
13911 // address computation might be vectorized in a smaller type.
13912 //
13913 // We start by looking at each entry that can be demoted. We compute the
13914 // maximum bit width required to store the scalar by using ValueTracking to
13915 // compute the number of high-order bits we can truncate.
13916 if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
13917 all_of(TreeRoot, [](Value *V) {
13918 return all_of(V->users(),
13919 [](User *U) { return isa<GetElementPtrInst>(U); });
13920 })) {
13921 MaxBitWidth = 8u;
13922
13923 // Determine if the sign bit of all the roots is known to be zero. If not,
13924 // IsKnownPositive is set to False.
13925 IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
13926 KnownBits Known = computeKnownBits(R, *DL);
13927 return Known.isNonNegative();
13928 });
13929
13930 // Determine the maximum number of bits required to store the scalar
13931 // values.
13932 for (auto *Scalar : ToDemote) {
13933 auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
13934 auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
13935 MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
13936 }
13937
13938 // If we can't prove that the sign bit is zero, we must add one to the
13939 // maximum bit width to account for the unknown sign bit. This preserves
13940 // the existing sign bit so we can safely sign-extend the root back to the
13941 // original type. Otherwise, if we know the sign bit is zero, we will
13942 // zero-extend the root instead.
13943 //
13944 // FIXME: This is somewhat suboptimal, as there will be cases where adding
13945 // one to the maximum bit width will yield a larger-than-necessary
13946 // type. In general, we need to add an extra bit only if we can't
13947 // prove that the upper bit of the original type is equal to the
13948 // upper bit of the proposed smaller type. If these two bits are the
13949 // same (either zero or one) we know that sign-extending from the
13950 // smaller type will result in the same value. Here, since we can't
13951 // yet prove this, we are just making the proposed smaller type
13952 // larger to ensure correctness.
13953 if (!IsKnownPositive)
13954 ++MaxBitWidth;
13955 }
13956
13957 // Round MaxBitWidth up to the next power-of-two.
13958 MaxBitWidth = llvm::bit_ceil(MaxBitWidth);
13959
13960 // If the maximum bit width we compute is less than the width of the roots'
13961 // type, we can proceed with the narrowing. Otherwise, do nothing.
13962 if (MaxBitWidth >= TreeRootIT->getBitWidth())
13963 return;
13964
13965 // If we can truncate the root, we must collect additional values that might
13966 // be demoted as a result. That is, those seeded by truncations we will
13967 // modify.
13968 while (!Roots.empty()) {
13969 DenseSet<Value *> Visited;
13970 collectValuesToDemote(Roots.pop_back_val(), ToDemote, DemotedConsts, Roots,
13971 Visited);
13972 }
13973
13974 // Check that all users are marked for demotion.
13975 DenseSet<Value *> Demoted(ToDemote.begin(), ToDemote.end());
13976 DenseSet<const TreeEntry *> Visited;
13977 for (Value *V : ToDemote) {
13978 const TreeEntry *TE = getTreeEntry(V);
13979 assert(TE && "Expected vectorized scalar.");
13980 if (!Visited.insert(TE).second)
13981 continue;
13982 if (!all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
13983 return all_of(EI.UserTE->Scalars,
13984 [&](Value *V) { return Demoted.contains(V); });
13985 }))
13986 return;
13987 }
13988 // Finally, map the values we can demote to the maximum bit width we computed.
13989 for (auto *Scalar : ToDemote) {
13990 auto *TE = getTreeEntry(Scalar);
13991 assert(TE && "Expected vectorized scalar.");
13992 if (MinBWs.contains(TE))
13993 continue;
13994 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
13995 KnownBits Known = computeKnownBits(R, *DL);
13996 return !Known.isNonNegative();
13997 });
13998 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
13999 const auto *I = cast<Instruction>(Scalar);
14000 auto DCIt = DemotedConsts.find(I);
14001 if (DCIt != DemotedConsts.end()) {
14002 for (unsigned Idx : DCIt->getSecond()) {
14003 // Check that this constant operand is demoted in all of the tree entry's scalars.
14004 if (all_of(TE->Scalars, [&](Value *V) {
14005 auto SIt = DemotedConsts.find(cast<Instruction>(V));
14006 return SIt != DemotedConsts.end() &&
14007 is_contained(SIt->getSecond(), Idx);
14008 })) {
14009 const TreeEntry *CTE = getOperandEntry(TE, Idx);
14010 MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned);
14011 }
14012 }
14013 }
14014 }
14015}
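Stripped of the IR plumbing, the narrowing decision above reduces to simple arithmetic: keep the demanded value bits (type width minus redundant sign bits), add one bit when the value may be negative so sign-extension remains correct, and round up to a power of two. A small standalone sketch of just that computation (the inputs correspond to what the real pass obtains from ComputeNumSignBits and computeKnownBits):

#include <bit>

namespace {
// Compute the narrowest power-of-two bit width that can hold a value of
// OrigBits bits which is known to have NumSignBits redundant sign bits.
unsigned minDemotedBitWidth(unsigned OrigBits, unsigned NumSignBits,
                            bool KnownNonNegative) {
  unsigned Needed = OrigBits - NumSignBits; // value bits that carry data
  if (!KnownNonNegative)
    ++Needed;                               // keep a sign bit for sext
  if (Needed == 0)
    Needed = 1;
  return std::bit_ceil(Needed);             // mirrors llvm::bit_ceil above
}
} // namespace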
14016
14017 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
14018 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
14019 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
14020 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
14021 auto *AA = &AM.getResult<AAManager>(F);
14022 auto *LI = &AM.getResult<LoopAnalysis>(F);
14023 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
14024 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
14025 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
14026 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
14027
14028 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
14029 if (!Changed)
14030 return PreservedAnalyses::all();
14031
14032 PreservedAnalyses PA;
14033 PA.preserveSet<CFGAnalyses>();
14034 return PA;
14035}
14036
14037 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
14038 TargetTransformInfo *TTI_,
14039 TargetLibraryInfo *TLI_, AAResults *AA_,
14040 LoopInfo *LI_, DominatorTree *DT_,
14041 AssumptionCache *AC_, DemandedBits *DB_,
14042 OptimizationRemarkEmitter *ORE_) {
14043 if (!RunSLPVectorization)
14044 return false;
14045 SE = SE_;
14046 TTI = TTI_;
14047 TLI = TLI_;
14048 AA = AA_;
14049 LI = LI_;
14050 DT = DT_;
14051 AC = AC_;
14052 DB = DB_;
14053 DL = &F.getParent()->getDataLayout();
14054
14055 Stores.clear();
14056 GEPs.clear();
14057 bool Changed = false;
14058
14059 // If the target claims to have no vector registers don't attempt
14060 // vectorization.
14061 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
14062 LLVM_DEBUG(
14063 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
14064 return false;
14065 }
14066
14067 // Don't vectorize when the attribute NoImplicitFloat is used.
14068 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
14069 return false;
14070
14071 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
14072
14073 // Use the bottom up slp vectorizer to construct chains that start with
14074 // store instructions.
14075 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
14076
14077 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
14078 // delete instructions.
14079
14080 // Update DFS numbers now so that we can use them for ordering.
14081 DT->updateDFSNumbers();
14082
14083 // Scan the blocks in the function in post order.
14084 for (auto *BB : post_order(&F.getEntryBlock())) {
14085 // Start new block - clear the list of reduction roots.
14086 R.clearReductionData();
14087 collectSeedInstructions(BB);
14088
14089 // Vectorize trees that end at stores.
14090 if (!Stores.empty()) {
14091 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
14092 << " underlying objects.\n");
14093 Changed |= vectorizeStoreChains(R);
14094 }
14095
14096 // Vectorize trees that end at reductions.
14097 Changed |= vectorizeChainsInBlock(BB, R);
14098
14099 // Vectorize the index computations of getelementptr instructions. This
14100 // is primarily intended to catch gather-like idioms ending at
14101 // non-consecutive loads.
14102 if (!GEPs.empty()) {
14103 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
14104 << " underlying objects.\n");
14105 Changed |= vectorizeGEPIndices(BB, R);
14106 }
14107 }
14108
14109 if (Changed) {
14110 R.optimizeGatherSequence();
14111 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
14112 }
14113 return Changed;
14114}
14115
14116bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
14117 unsigned Idx, unsigned MinVF) {
14118 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
14119 << "\n");
14120 const unsigned Sz = R.getVectorElementSize(Chain[0]);
14121 unsigned VF = Chain.size();
14122
14123 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
14124 return false;
14125
14126 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
14127 << "\n");
14128
14129 R.buildTree(Chain);
14130 if (R.isTreeTinyAndNotFullyVectorizable())
14131 return false;
14132 if (R.isLoadCombineCandidate())
14133 return false;
14134 R.reorderTopToBottom();
14135 R.reorderBottomToTop();
14136 R.buildExternalUses();
14137
14138 R.computeMinimumValueSizes();
14139
14140 InstructionCost Cost = R.getTreeCost();
14141
14142 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
14143 if (Cost < -SLPCostThreshold) {
14144 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
14145
14146 using namespace ore;
14147
14148 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
14149 cast<StoreInst>(Chain[0]))
14150 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
14151 << " and with tree size "
14152 << NV("TreeSize", R.getTreeSize()));
14153
14154 R.vectorizeTree();
14155 return true;
14156 }
14157
14158 return false;
14159}
14160
14161bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
14162 BoUpSLP &R) {
14163 // We may run into multiple chains that merge into a single chain. We mark the
14164 // stores that we vectorized so that we don't visit the same store twice.
14165 BoUpSLP::ValueSet VectorizedStores;
14166 bool Changed = false;
14167
14168 // Stores the pairs of stores (first_store, last_store) in a range that we
14169 // have already tried to vectorize, so that store ranges whose vectorization
14170 // attempts were unsuccessful are not re-analyzed.
14171 DenseSet<std::pair<Value *, Value *>> TriedSequences;
14172 struct StoreDistCompare {
14173 bool operator()(const std::pair<unsigned, int> &Op1,
14174 const std::pair<unsigned, int> &Op2) const {
14175 return Op1.second < Op2.second;
14176 }
14177 };
14178 // A set of pairs (index of store in Stores array ref, Distance of the store
14179 // address relative to base store address in units).
14180 using StoreIndexToDistSet =
14181 std::set<std::pair<unsigned, int>, StoreDistCompare>;
14182 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
14183 int PrevDist = -1;
14184 BoUpSLP::ValueList Operands;
14185 // Collect the chain into a list.
14186 for (auto [Idx, Data] : enumerate(Set)) {
14187 if (Operands.empty() || Data.second - PrevDist == 1) {
14188 Operands.push_back(Stores[Data.first]);
14189 PrevDist = Data.second;
14190 if (Idx != Set.size() - 1)
14191 continue;
14192 }
14193 auto E = make_scope_exit([&, &DataVar = Data]() {
14194 Operands.clear();
14195 Operands.push_back(Stores[DataVar.first]);
14196 PrevDist = DataVar.second;
14197 });
14198
14199 if (Operands.size() <= 1)
14200 continue;
14201
14202 unsigned MaxVecRegSize = R.getMaxVecRegSize();
14203 unsigned EltSize = R.getVectorElementSize(Operands[0]);
14204 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
14205
14206 unsigned MaxVF =
14207 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
14208 auto *Store = cast<StoreInst>(Operands[0]);
14209 Type *StoreTy = Store->getValueOperand()->getType();
14210 Type *ValueTy = StoreTy;
14211 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
14212 ValueTy = Trunc->getSrcTy();
14213 unsigned MinVF = TTI->getStoreMinimumVF(
14214 R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
14215
14216 if (MaxVF < MinVF) {
14217 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
14218 << ") < "
14219 << "MinVF (" << MinVF << ")\n");
14220 continue;
14221 }
14222
14223 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
14224 SmallVector<unsigned> CandidateVFs(Sz);
14225 // FIXME: Is division-by-2 the correct step? Should we assert that the
14226 // register size is a power-of-2?
14227 unsigned Size = MaxVF;
14228 for_each(CandidateVFs, [&](unsigned &VF) {
14229 VF = Size;
14230 Size /= 2;
14231 });
14232 unsigned StartIdx = 0;
14233 for (unsigned Size : CandidateVFs) {
14234 for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
14235 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
14236 assert(
14237 all_of(
14238 Slice,
14239 [&](Value *V) {
14240 return cast<StoreInst>(V)->getValueOperand()->getType() ==
14241 cast<StoreInst>(Slice.front())
14242 ->getValueOperand()
14243 ->getType();
14244 }) &&
14245 "Expected all operands of same type.");
14246 if (!VectorizedStores.count(Slice.front()) &&
14247 !VectorizedStores.count(Slice.back()) &&
14248 TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
14249 .second &&
14250 vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
14251 // Mark the vectorized stores so that we don't vectorize them again.
14252 VectorizedStores.insert(Slice.begin(), Slice.end());
14253 Changed = true;
14254 // If we vectorized initial block, no need to try to vectorize it
14255 // again.
14256 if (Cnt == StartIdx)
14257 StartIdx += Size;
14258 Cnt += Size;
14259 continue;
14260 }
14261 ++Cnt;
14262 }
14263 // Check if the whole array was vectorized already - exit.
14264 if (StartIdx >= Operands.size())
14265 break;
14266 }
14267 }
14268 };
14269
14270 // Stores pairs of (index of the store in the Stores array whose address is
14271 // taken as the base, sorted set of {index, dist} pairs), where the second
14272 // element holds the indices of stores in the set and their distances from
14273 // the base address.
14274
14275 // Need to store the index of the very first store separately, since the set
14276 // may be reordered after the insertion and the first store may be moved. This
14277 // container allows us to reduce the number of calls to getPointersDiff().
14278 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
14279 // Inserts the store SI with the given index Idx into the set of stores. If a
14280 // store with the same distance is already present, stop the insertion and try
14281 // to vectorize the stores found so far. If some stores from this sequence were
14282 // not vectorized, try to vectorize them together with the new store later.
14283 // This logic is applied only to the stores that come before the previous store
14284 // with the same distance.
14285 // Example:
14286 // 1. store x, %p
14287 // 2. store y, %p+1
14288 // 3. store z, %p+2
14289 // 4. store a, %p
14290 // 5. store b, %p+3
14291 // - Scan this from the last to first store. The very first bunch of stores is
14292 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
14293 // vector).
14294 // - The next store in the list - #1 - has the same distance from store #5 as
14295 // the store #4.
14296 // - Try to vectorize sequence of stores 4,2,3,5.
14297 // - If all these stores are vectorized - just drop them.
14298 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
14299 // - Start new stores sequence.
14300 // The new bunch of stores is {1, {1, 0}}.
14301 // - Add the stores from previous sequence, that were not vectorized.
14302 // Here we consider the stores in reverse order, rather than the order in which
14303 // they are used in the IR (Stores is already reversed, see vectorizeStoreChains()).
14304 // Store #3 can be added -> comes after store #4 with the same distance as
14305 // store #1.
14306 // Store #5 cannot be added - comes before store #4.
14307 // This logic improves compile time: we assume that stores which come after a
14308 // previous store with the same distance most likely have memory dependencies,
14309 // so there is no need to waste compile time trying to vectorize them.
14310 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
14311 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
14312 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
14313 std::optional<int> Diff = getPointersDiff(
14314 Stores[Set.first]->getValueOperand()->getType(),
14315 Stores[Set.first]->getPointerOperand(),
14316 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
14317 /*StrictCheck=*/true);
14318 if (!Diff)
14319 continue;
14320 auto It = Set.second.find(std::make_pair(Idx, *Diff));
14321 if (It == Set.second.end()) {
14322 Set.second.emplace(Idx, *Diff);
14323 return;
14324 }
14325 // Try to vectorize the first found set to avoid duplicate analysis.
14326 TryToVectorize(Set.second);
14327 StoreIndexToDistSet PrevSet;
14328 PrevSet.swap(Set.second);
14329 Set.first = Idx;
14330 Set.second.emplace(Idx, 0);
14331 // Insert stores that followed previous match to try to vectorize them
14332 // with this store.
14333 unsigned StartIdx = It->first + 1;
14334 SmallBitVector UsedStores(Idx - StartIdx);
14335 // Distances to previously found dup store (or this store, since they
14336 // store to the same addresses).
14337 SmallVector<int> Dists(Idx - StartIdx, 0);
14338 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
14339 // Do not try to vectorize sequences, we already tried.
14340 if (Pair.first <= It->first ||
14341 VectorizedStores.contains(Stores[Pair.first]))
14342 break;
14343 unsigned BI = Pair.first - StartIdx;
14344 UsedStores.set(BI);
14345 Dists[BI] = Pair.second - It->second;
14346 }
14347 for (unsigned I = StartIdx; I < Idx; ++I) {
14348 unsigned BI = I - StartIdx;
14349 if (UsedStores.test(BI))
14350 Set.second.emplace(I, Dists[BI]);
14351 }
14352 return;
14353 }
14354 auto &Res = SortedStores.emplace_back();
14355 Res.first = Idx;
14356 Res.second.emplace(Idx, 0);
14357 };
14358 StoreInst *PrevStore = Stores.front();
14359 for (auto [I, SI] : enumerate(Stores)) {
14360 // Check that we do not try to vectorize stores of different types.
14361 if (PrevStore->getValueOperand()->getType() !=
14362 SI->getValueOperand()->getType()) {
14363 for (auto &Set : SortedStores)
14364 TryToVectorize(Set.second);
14365 SortedStores.clear();
14366 PrevStore = SI;
14367 }
14368 FillStoresSet(I, SI);
14369 }
14370
14371 // Final vectorization attempt.
14372 for (auto &Set : SortedStores)
14373 TryToVectorize(Set.second);
14374
14375 return Changed;
14376}
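TryToVectorize above consumes a set of stores sorted by their distance from a base address and cuts it into maximal runs whose distances grow by exactly one; only such runs are worth handing to the tree builder. A standalone sketch of that chain-splitting step over plain (index, distance) pairs:

#include <utility>
#include <vector>

namespace {
// Split stores, given as (store index, distance-from-base) pairs sorted by
// distance, into runs whose distances increase by exactly one -- the only
// runs that can form consecutive vector-store chains.
std::vector<std::vector<unsigned>>
splitIntoConsecutiveChains(const std::vector<std::pair<unsigned, int>> &Sorted) {
  std::vector<std::vector<unsigned>> Chains;
  std::vector<unsigned> Current;
  int PrevDist = 0;
  for (const auto &[Idx, Dist] : Sorted) {
    if (!Current.empty() && Dist - PrevDist != 1) {
      Chains.push_back(Current); // close the previous consecutive run
      Current.clear();
    }
    Current.push_back(Idx);
    PrevDist = Dist;
  }
  if (!Current.empty())
    Chains.push_back(Current);
  return Chains;
}
} // namespace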
14377
14378void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
14379 // Initialize the collections. We will make a single pass over the block.
14380 Stores.clear();
14381 GEPs.clear();
14382
14383 // Visit the store and getelementptr instructions in BB and organize them in
14384 // Stores and GEPs according to the underlying objects of their pointer
14385 // operands.
14386 for (Instruction &I : *BB) {
14387 // Ignore store instructions that are volatile or have a pointer operand
14388 // that doesn't point to a scalar type.
14389 if (auto *SI = dyn_cast<StoreInst>(&I)) {
14390 if (!SI->isSimple())
14391 continue;
14392 if (!isValidElementType(SI->getValueOperand()->getType()))
14393 continue;
14394 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
14395 }
14396
14397 // Ignore getelementptr instructions that have more than one index, a
14398 // constant index, or a pointer operand that doesn't point to a scalar
14399 // type.
14400 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
14401 if (GEP->getNumIndices() != 1)
14402 continue;
14403 Value *Idx = GEP->idx_begin()->get();
14404 if (isa<Constant>(Idx))
14405 continue;
14406 if (!isValidElementType(Idx->getType()))
14407 continue;
14408 if (GEP->getType()->isVectorTy())
14409 continue;
14410 GEPs[GEP->getPointerOperand()].push_back(GEP);
14411 }
14412 }
14413}
14414
14415bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
14416 bool MaxVFOnly) {
14417 if (VL.size() < 2)
14418 return false;
14419
14420 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
14421 << VL.size() << ".\n");
14422
14423 // Check that all of the parts are instructions of the same type,
14424 // we permit an alternate opcode via InstructionsState.
14425 InstructionsState S = getSameOpcode(VL, *TLI);
14426 if (!S.getOpcode())
14427 return false;
14428
14429 Instruction *I0 = cast<Instruction>(S.OpValue);
14430 // Make sure invalid types (including vector type) are rejected before
14431 // determining vectorization factor for scalar instructions.
14432 for (Value *V : VL) {
14433 Type *Ty = V->getType();
14434 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
14435 // NOTE: the following will give the user an internal LLVM type name, which
14436 // may not be useful.
14437 R.getORE()->emit([&]() {
14438 std::string TypeStr;
14439 llvm::raw_string_ostream rso(TypeStr);
14440 Ty->print(rso);
14441 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
14442 << "Cannot SLP vectorize list: type "
14443 << rso.str() + " is unsupported by vectorizer";
14444 });
14445 return false;
14446 }
14447 }
14448
14449 unsigned Sz = R.getVectorElementSize(I0);
14450 unsigned MinVF = R.getMinVF(Sz);
14451 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
14452 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
14453 if (MaxVF < 2) {
14454 R.getORE()->emit([&]() {
14455 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
14456 << "Cannot SLP vectorize list: vectorization factor "
14457 << "less than 2 is not supported";
14458 });
14459 return false;
14460 }
14461
14462 bool Changed = false;
14463 bool CandidateFound = false;
14464 InstructionCost MinCost = SLPCostThreshold.getValue();
14465 Type *ScalarTy = VL[0]->getType();
14466 if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
14467 ScalarTy = IE->getOperand(1)->getType();
14468
14469 unsigned NextInst = 0, MaxInst = VL.size();
14470 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
14471 // No actual vectorization should happen if the number of parts is the same
14472 // as the provided vectorization factor (i.e. the scalar type is used for the
14473 // vector code during codegen).
14474 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
14475 if (TTI->getNumberOfParts(VecTy) == VF)
14476 continue;
14477 for (unsigned I = NextInst; I < MaxInst; ++I) {
14478 unsigned ActualVF = std::min(MaxInst - I, VF);
14479
14480 if (!isPowerOf2_32(ActualVF))
14481 continue;
14482
14483 if (MaxVFOnly && ActualVF < MaxVF)
14484 break;
14485 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
14486 break;
14487
14488 ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
14489 // Check that a previous iteration of this loop did not delete the Value.
14490 if (llvm::any_of(Ops, [&R](Value *V) {
14491 auto *I = dyn_cast<Instruction>(V);
14492 return I && R.isDeleted(I);
14493 }))
14494 continue;
14495
14496 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
14497 << "\n");
14498
14499 R.buildTree(Ops);
14500 if (R.isTreeTinyAndNotFullyVectorizable())
14501 continue;
14502 R.reorderTopToBottom();
14503 R.reorderBottomToTop(
14504 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
14505 !R.doesRootHaveInTreeUses());
14506 R.buildExternalUses();
14507
14508 R.computeMinimumValueSizes();
14509 InstructionCost Cost = R.getTreeCost();
14510 CandidateFound = true;
14511 MinCost = std::min(MinCost, Cost);
14512
14513 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
14514 << " for VF=" << ActualVF << "\n");
14515 if (Cost < -SLPCostThreshold) {
14516 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
14517 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
14518 cast<Instruction>(Ops[0]))
14519 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
14520 << " and with tree size "
14521 << ore::NV("TreeSize", R.getTreeSize()));
14522
14523 R.vectorizeTree();
14524 // Move to the next bundle.
14525 I += VF - 1;
14526 NextInst = I + 1;
14527 Changed = true;
14528 }
14529 }
14530 }
14531
14532 if (!Changed && CandidateFound) {
14533 R.getORE()->emit([&]() {
14534 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
14535 << "List vectorization was possible but not beneficial with cost "
14536 << ore::NV("Cost", MinCost) << " >= "
14537 << ore::NV("Threshold", -SLPCostThreshold);
14538 });
14539 } else if (!Changed) {
14540 R.getORE()->emit([&]() {
14541 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
14542 << "Cannot SLP vectorize list: vectorization was impossible"
14543 << " with available vectorization factors";
14544 });
14545 }
14546 return Changed;
14547}
14548
14549bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
14550 if (!I)
14551 return false;
14552
14553 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
14554 return false;
14555
14556 Value *P = I->getParent();
14557
14558 // Vectorize in current basic block only.
14559 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
14560 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
14561 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
14562 return false;
14563
14564 // First collect all possible candidates
14565 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
14566 Candidates.emplace_back(Op0, Op1);
14567
14568 auto *A = dyn_cast<BinaryOperator>(Op0);
14569 auto *B = dyn_cast<BinaryOperator>(Op1);
14570 // Try to skip B.
14571 if (A && B && B->hasOneUse()) {
14572 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
14573 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
14574 if (B0 && B0->getParent() == P)
14575 Candidates.emplace_back(A, B0);
14576 if (B1 && B1->getParent() == P)
14577 Candidates.emplace_back(A, B1);
14578 }
14579 // Try to skip A.
14580 if (B && A && A->hasOneUse()) {
14581 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
14582 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
14583 if (A0 && A0->getParent() == P)
14584 Candidates.emplace_back(A0, B);
14585 if (A1 && A1->getParent() == P)
14586 Candidates.emplace_back(A1, B);
14587 }
14588
14589 if (Candidates.size() == 1)
14590 return tryToVectorizeList({Op0, Op1}, R);
14591
14592 // We have multiple options. Try to pick the single best.
14593 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
14594 if (!BestCandidate)
14595 return false;
14596 return tryToVectorizeList(
14597 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
14598}
14599
14600namespace {
14601
14602/// Model horizontal reductions.
14603///
14604/// A horizontal reduction is a tree of reduction instructions that has values
14605/// that can be put into a vector as its leaves. For example:
14606///
14607/// mul mul mul mul
14608/// \ / \ /
14609/// + +
14610/// \ /
14611/// +
14612/// This tree has "mul" as its leaf values and "+" as its reduction
14613/// instructions. A reduction can feed into a store or a binary operation
14614/// feeding a phi.
14615/// ...
14616/// \ /
14617/// +
14618/// |
14619/// phi +=
14620///
14621/// Or:
14622/// ...
14623/// \ /
14624/// +
14625/// |
14626/// *p =
14627///
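/// In source terms, the first tree corresponds to a pattern such as
///   s = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
/// where the multiplies are the leaves that can be packed into a vector and
/// the chain of additions is the reduction matched here.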
14628class HorizontalReduction {
14629 using ReductionOpsType = SmallVector<Value *, 16>;
14630 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
14631 ReductionOpsListType ReductionOps;
14632 /// List of possibly reduced values.
14633 SmallVector<SmallVector<Value *>> ReducedVals;
14634 /// Maps reduced value to the corresponding reduction operation.
14635 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
14636 // Use map vector to make stable output.
14637 MapVector<Instruction *, Value *> ExtraArgs;
14638 WeakTrackingVH ReductionRoot;
14639 /// The type of reduction operation.
14640 RecurKind RdxKind;
14641 /// Checks if the optimization of original scalar identity operations on
14642 /// matched horizontal reductions is enabled and allowed.
14643 bool IsSupportedHorRdxIdentityOp = false;
14644
14645 static bool isCmpSelMinMax(Instruction *I) {
14646 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
14647 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
14648 }
14649
14650 // And/or are potentially poison-safe logical patterns like:
14651 // select x, y, false
14652 // select x, true, y
14653 static bool isBoolLogicOp(Instruction *I) {
14654 return isa<SelectInst>(I) &&
14655 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
14656 }
14657
14658 /// Checks if instruction is associative and can be vectorized.
14659 static bool isVectorizable(RecurKind Kind, Instruction *I) {
14660 if (Kind == RecurKind::None)
14661 return false;
14662
14663 // Integer ops that map to select instructions or intrinsics are fine.
14664 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
14665 isBoolLogicOp(I))
14666 return true;
14667
14668 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
14669 // FP min/max are associative except for NaN and -0.0. We do not
14670 // have to rule out -0.0 here because the intrinsic semantics do not
14671 // specify a fixed result for it.
14672 return I->getFastMathFlags().noNaNs();
14673 }
14674
14675 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
14676 return true;
14677
14678 return I->isAssociative();
14679 }
14680
14681 static Value *getRdxOperand(Instruction *I, unsigned Index) {
14682 // Poison-safe 'or' takes the form: select X, true, Y
14683 // To make that work with the normal operand processing, we skip the
14684 // true value operand.
14685 // TODO: Change the code and data structures to handle this without a hack.
14686 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
14687 return I->getOperand(2);
14688 return I->getOperand(Index);
14689 }
14690
14691 /// Creates reduction operation with the current opcode.
14692 static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
14693 Value *RHS, const Twine &Name, bool UseSelect) {
14694 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
14695 bool IsConstant = isConstant(LHS) && isConstant(RHS);
14696 switch (Kind) {
14697 case RecurKind::Or:
14698 if (UseSelect &&
14699 LHS->getType()->isIntOrIntVectorTy(1))
14700 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
14701 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
14702 Name);
14703 case RecurKind::And:
14704 if (UseSelect &&
14705 LHS->getType()->isIntOrIntVectorTy(1))
14706 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
14707 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
14708 Name);
14709 case RecurKind::Add:
14710 case RecurKind::Mul:
14711 case RecurKind::Xor:
14712 case RecurKind::FAdd:
14713 case RecurKind::FMul:
14714 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
14715 Name);
14716 case RecurKind::FMax:
14717 if (IsConstant)
14718 return ConstantFP::get(LHS->getType(),
14719 maxnum(cast<ConstantFP>(LHS)->getValueAPF(),
14720 cast<ConstantFP>(RHS)->getValueAPF()));
14721 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
14722 case RecurKind::FMin:
14723 if (IsConstant)
14724 return ConstantFP::get(LHS->getType(),
14725 minnum(cast<ConstantFP>(LHS)->getValueAPF(),
14726 cast<ConstantFP>(RHS)->getValueAPF()));
14727 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
14728 case RecurKind::FMaximum:
14729 if (IsConstant)
14730 return ConstantFP::get(LHS->getType(),
14731 maximum(cast<ConstantFP>(LHS)->getValueAPF(),
14732 cast<ConstantFP>(RHS)->getValueAPF()));
14733 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
14734 case RecurKind::FMinimum:
14735 if (IsConstant)
14736 return ConstantFP::get(LHS->getType(),
14737 minimum(cast<ConstantFP>(LHS)->getValueAPF(),
14738 cast<ConstantFP>(RHS)->getValueAPF()));
14739 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
14740 case RecurKind::SMax:
14741 if (IsConstant || UseSelect) {
14742 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
14743 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
14744 }
14745 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
14746 case RecurKind::SMin:
14747 if (IsConstant || UseSelect) {
14748 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
14749 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
14750 }
14751 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
14752 case RecurKind::UMax:
14753 if (IsConstant || UseSelect) {
14754 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
14755 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
14756 }
14757 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
14758 case RecurKind::UMin:
14759 if (IsConstant || UseSelect) {
14760 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
14761 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
14762 }
14763 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
14764 default:
14765 llvm_unreachable("Unknown reduction operation.");
14766 }
14767 }
14768
14769 /// Creates reduction operation with the current opcode with the IR flags
14770 /// from \p ReductionOps, dropping nuw/nsw flags.
14771 static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
14772 Value *RHS, const Twine &Name,
14773 const ReductionOpsListType &ReductionOps) {
14774 bool UseSelect =
14775 ReductionOps.size() == 2 ||
14776 // Logical or/and.
14777 (ReductionOps.size() == 1 && any_of(ReductionOps.front(), [](Value *V) {
14778 return isa<SelectInst>(V);
14779 }));
14780 assert((!UseSelect || ReductionOps.size() != 2 ||
14781 isa<SelectInst>(ReductionOps[1][0])) &&
14782 "Expected cmp + select pairs for reduction");
14783 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
14784 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
14785 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
14786 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
14787 /*IncludeWrapFlags=*/false);
14788 propagateIRFlags(Op, ReductionOps[1], nullptr,
14789 /*IncludeWrapFlags=*/false);
14790 return Op;
14791 }
14792 }
14793 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
14794 return Op;
14795 }
14796
14797public:
14798 static RecurKind getRdxKind(Value *V) {
14799 auto *I = dyn_cast<Instruction>(V);
14800 if (!I)
14801 return RecurKind::None;
14802 if (match(I, m_Add(m_Value(), m_Value())))
14803 return RecurKind::Add;
14804 if (match(I, m_Mul(m_Value(), m_Value())))
14805 return RecurKind::Mul;
14806 if (match(I, m_And(m_Value(), m_Value())) ||
14807 match(I, m_LogicalAnd(m_Value(), m_Value())))
14808 return RecurKind::And;
14809 if (match(I, m_Or(m_Value(), m_Value())) ||
14810 match(I, m_LogicalOr(m_Value(), m_Value())))
14811 return RecurKind::Or;
14812 if (match(I, m_Xor(m_Value(), m_Value())))
14813 return RecurKind::Xor;
14814 if (match(I, m_FAdd(m_Value(), m_Value())))
14815 return RecurKind::FAdd;
14816 if (match(I, m_FMul(m_Value(), m_Value())))
14817 return RecurKind::FMul;
14818
14819 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
14820 return RecurKind::FMax;
14821 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
14822 return RecurKind::FMin;
14823
14824 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
14825 return RecurKind::FMaximum;
14826 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
14827 return RecurKind::FMinimum;
14828 // This matches either cmp+select or intrinsics. SLP is expected to handle
14829 // either form.
14830 // TODO: If we are canonicalizing to intrinsics, we can remove several
14831 // special-case paths that deal with selects.
14832 if (match(I, m_SMax(m_Value(), m_Value())))
14833 return RecurKind::SMax;
14834 if (match(I, m_SMin(m_Value(), m_Value())))
14835 return RecurKind::SMin;
14836 if (match(I, m_UMax(m_Value(), m_Value())))
14837 return RecurKind::UMax;
14838 if (match(I, m_UMin(m_Value(), m_Value())))
14839 return RecurKind::UMin;
14840
14841 if (auto *Select = dyn_cast<SelectInst>(I)) {
14842 // Try harder: look for min/max pattern based on instructions producing
14843 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
14844 // During the intermediate stages of SLP, it's very common to have
14845 // pattern like this (since optimizeGatherSequence is run only once
14846 // at the end):
14847 // %1 = extractelement <2 x i32> %a, i32 0
14848 // %2 = extractelement <2 x i32> %a, i32 1
14849 // %cond = icmp sgt i32 %1, %2
14850 // %3 = extractelement <2 x i32> %a, i32 0
14851 // %4 = extractelement <2 x i32> %a, i32 1
14852 // %select = select i1 %cond, i32 %3, i32 %4
14853 CmpInst::Predicate Pred;
14854 Instruction *L1;
14855 Instruction *L2;
14856
14857 Value *LHS = Select->getTrueValue();
14858 Value *RHS = Select->getFalseValue();
14859 Value *Cond = Select->getCondition();
14860
14861 // TODO: Support inverse predicates.
14862 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
14863 if (!isa<ExtractElementInst>(RHS) ||
14864 !L2->isIdenticalTo(cast<Instruction>(RHS)))
14865 return RecurKind::None;
14866 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
14867 if (!isa<ExtractElementInst>(LHS) ||
14868 !L1->isIdenticalTo(cast<Instruction>(LHS)))
14869 return RecurKind::None;
14870 } else {
14871 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
14872 return RecurKind::None;
14873 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
14874 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
14875 !L2->isIdenticalTo(cast<Instruction>(RHS)))
14876 return RecurKind::None;
14877 }
14878
14879 switch (Pred) {
14880 default:
14881 return RecurKind::None;
14882 case CmpInst::ICMP_SGT:
14883 case CmpInst::ICMP_SGE:
14884 return RecurKind::SMax;
14885 case CmpInst::ICMP_SLT:
14886 case CmpInst::ICMP_SLE:
14887 return RecurKind::SMin;
14888 case CmpInst::ICMP_UGT:
14889 case CmpInst::ICMP_UGE:
14890 return RecurKind::UMax;
14891 case CmpInst::ICMP_ULT:
14892 case CmpInst::ICMP_ULE:
14893 return RecurKind::UMin;
14894 }
14895 }
14896 return RecurKind::None;
14897 }
14898
14899 /// Get the index of the first operand.
14900 static unsigned getFirstOperandIndex(Instruction *I) {
14901 return isCmpSelMinMax(I) ? 1 : 0;
14902 }
14903
14904private:
14905 /// Total number of operands in the reduction operation.
14906 static unsigned getNumberOfOperands(Instruction *I) {
14907 return isCmpSelMinMax(I) ? 3 : 2;
14908 }
14909
14910 /// Checks if the instruction is in basic block \p BB.
14911 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
14912 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
14913 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
14914 auto *Sel = cast<SelectInst>(I);
14915 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
14916 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
14917 }
14918 return I->getParent() == BB;
14919 }
14920
14921 /// Expected number of uses for reduction operations/reduced values.
14922 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
14923 if (IsCmpSelMinMax) {
14924 // SelectInst must be used twice while the condition op must have single
14925 // use only.
14926 if (auto *Sel = dyn_cast<SelectInst>(I))
14927 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
14928 return I->hasNUses(2);
14929 }
14930
14931 // Arithmetic reduction operation must be used once only.
14932 return I->hasOneUse();
14933 }
14934
14935 /// Initializes the list of reduction operations.
14936 void initReductionOps(Instruction *I) {
14937 if (isCmpSelMinMax(I))
14938 ReductionOps.assign(2, ReductionOpsType());
14939 else
14940 ReductionOps.assign(1, ReductionOpsType());
14941 }
14942
14943 /// Add all reduction operations for the reduction instruction \p I.
14944 void addReductionOps(Instruction *I) {
14945 if (isCmpSelMinMax(I)) {
14946 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
14947 ReductionOps[1].emplace_back(I);
14948 } else {
14949 ReductionOps[0].emplace_back(I);
14950 }
14951 }
14952
14953 static bool isGoodForReduction(ArrayRef<Value *> Data) {
14954 int Sz = Data.size();
14955 auto *I = dyn_cast<Instruction>(Data.front());
14956 return Sz > 1 || isConstant(Data.front()) ||
14957 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
14958 }
14959
14960public:
14961 HorizontalReduction() = default;
14962
14963 /// Try to find a reduction tree.
14964 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
14965 ScalarEvolution &SE, const DataLayout &DL,
14966 const TargetLibraryInfo &TLI) {
14967 RdxKind = HorizontalReduction::getRdxKind(Root);
14968 if (!isVectorizable(RdxKind, Root))
14969 return false;
14970
14971 // Analyze "regular" integer/FP types for reductions - no target-specific
14972 // types or pointers.
14973 Type *Ty = Root->getType();
14974 if (!isValidElementType(Ty) || Ty->isPointerTy())
14975 return false;
14976
14977 // Though the ultimate reduction may have multiple uses, its condition must
14978    // have only a single use.
14979 if (auto *Sel = dyn_cast<SelectInst>(Root))
14980 if (!Sel->getCondition()->hasOneUse())
14981 return false;
14982
14983 ReductionRoot = Root;
14984
14985 // Iterate through all the operands of the possible reduction tree and
14986 // gather all the reduced values, sorting them by their value id.
14987 BasicBlock *BB = Root->getParent();
14988 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
14989 SmallVector<Instruction *> Worklist(1, Root);
14990 // Checks if the operands of the \p TreeN instruction are also reduction
14991 // operations or should be treated as reduced values or an extra argument,
14992 // which is not part of the reduction.
14993 auto CheckOperands = [&](Instruction *TreeN,
14994 SmallVectorImpl<Value *> &ExtraArgs,
14995 SmallVectorImpl<Value *> &PossibleReducedVals,
14996 SmallVectorImpl<Instruction *> &ReductionOps) {
14997 for (int I = getFirstOperandIndex(TreeN),
14998 End = getNumberOfOperands(TreeN);
14999 I < End; ++I) {
15000 Value *EdgeVal = getRdxOperand(TreeN, I);
15001 ReducedValsToOps[EdgeVal].push_back(TreeN);
15002 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
15003 // Edge has wrong parent - mark as an extra argument.
15004 if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
15005 !hasSameParent(EdgeInst, BB)) {
15006 ExtraArgs.push_back(EdgeVal);
15007 continue;
15008 }
15009 // If the edge is not an instruction, or it is different from the main
15010 // reduction opcode or has too many uses - possible reduced value.
15011 // Also, do not try to reduce const values, if the operation is not
15012 // foldable.
15013 if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
15014 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
15015 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
15016 !isVectorizable(RdxKind, EdgeInst) ||
15017 (R.isAnalyzedReductionRoot(EdgeInst) &&
15018 all_of(EdgeInst->operands(), Constant::classof))) {
15019 PossibleReducedVals.push_back(EdgeVal);
15020 continue;
15021 }
15022 ReductionOps.push_back(EdgeInst);
15023 }
15024 };
15025 // Try to regroup reduced values so that it gets more profitable to try to
15026 // reduce them. Values are grouped by their value ids, instructions - by
15027 // instruction op id and/or alternate op id, plus do extra analysis for
15028    // loads (grouping them by the distance between pointers) and cmp
15029 // instructions (grouping them by the predicate).
15030    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
15031        PossibleReducedVals;
15032 initReductionOps(Root);
15033    DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
15034    SmallSet<size_t, 2> LoadKeyUsed;
15035 SmallPtrSet<Value *, 4> DoNotReverseVals;
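    // Computes a grouping subkey for a load: loads whose pointers are a known
    // constant distance apart (or are otherwise compatible with an already
    // seen load) reuse that load's subkey, so they land in the same group of
    // possible reduced values.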
15036
15037 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
15038      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
15039      if (LoadKeyUsed.contains(Key)) {
15040 auto LIt = LoadsMap.find(Ptr);
15041 if (LIt != LoadsMap.end()) {
15042 for (LoadInst *RLI : LIt->second) {
15043 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
15044 LI->getType(), LI->getPointerOperand(), DL, SE,
15045 /*StrictCheck=*/true))
15046 return hash_value(RLI->getPointerOperand());
15047 }
15048 for (LoadInst *RLI : LIt->second) {
15049          if (arePointersCompatible(RLI->getPointerOperand(),
15050                                    LI->getPointerOperand(), TLI)) {
15051 hash_code SubKey = hash_value(RLI->getPointerOperand());
15052 DoNotReverseVals.insert(RLI);
15053 return SubKey;
15054 }
15055 }
15056 if (LIt->second.size() > 2) {
15057 hash_code SubKey =
15058 hash_value(LIt->second.back()->getPointerOperand());
15059 DoNotReverseVals.insert(LIt->second.back());
15060 return SubKey;
15061 }
15062 }
15063 }
15064 LoadKeyUsed.insert(Key);
15065 LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
15066 return hash_value(LI->getPointerOperand());
15067 };
15068
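    // Walk the reduction tree from the root. Operands of each visited
    // instruction are classified as further reduction ops (pushed onto the
    // worklist), reduced values, or extra arguments; an instruction with too
    // many extra arguments is itself treated as a reduced value.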
15069 while (!Worklist.empty()) {
15070 Instruction *TreeN = Worklist.pop_back_val();
15071      SmallVector<Value *> Args;
15072      SmallVector<Value *> PossibleRedVals;
15073 SmallVector<Instruction *> PossibleReductionOps;
15074 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
15075 // If too many extra args - mark the instruction itself as a reduction
15076 // value, not a reduction operation.
15077 if (Args.size() < 2) {
15078 addReductionOps(TreeN);
15079 // Add extra args.
15080 if (!Args.empty()) {
15081 assert(Args.size() == 1 && "Expected only single argument.");
15082 ExtraArgs[TreeN] = Args.front();
15083 }
15084 // Add reduction values. The values are sorted for better vectorization
15085 // results.
15086 for (Value *V : PossibleRedVals) {
15087 size_t Key, Idx;
15088 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
15089 /*AllowAlternate=*/false);
15090 ++PossibleReducedVals[Key][Idx]
15091 .insert(std::make_pair(V, 0))
15092 .first->second;
15093 }
15094 Worklist.append(PossibleReductionOps.rbegin(),
15095 PossibleReductionOps.rend());
15096 } else {
15097 size_t Key, Idx;
15098 std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
15099 /*AllowAlternate=*/false);
15100 ++PossibleReducedVals[Key][Idx]
15101 .insert(std::make_pair(TreeN, 0))
15102 .first->second;
15103 }
15104 }
15105 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
15106    // Sort values by the total number of value kinds so that the reduction
15107    // starts from the longest possible sequences of reduced values.
15108 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
15109 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
15110 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
15111 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
15112 It != E; ++It) {
15113 PossibleRedValsVect.emplace_back();
15114 auto RedValsVect = It->second.takeVector();
15115 stable_sort(RedValsVect, llvm::less_second());
15116 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
15117 PossibleRedValsVect.back().append(Data.second, Data.first);
15118 }
15119 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
15120 return P1.size() > P2.size();
15121 });
15122 int NewIdx = -1;
15123 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
15124 if (isGoodForReduction(Data) ||
15125 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
15126 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
15127             getUnderlyingObject(
15128                 cast<LoadInst>(Data.front())->getPointerOperand()) ==
15129 getUnderlyingObject(cast<LoadInst>(ReducedVals[NewIdx].front())
15130 ->getPointerOperand()))) {
15131 if (NewIdx < 0) {
15132 NewIdx = ReducedVals.size();
15133 ReducedVals.emplace_back();
15134 }
15135 if (DoNotReverseVals.contains(Data.front()))
15136 ReducedVals[NewIdx].append(Data.begin(), Data.end());
15137 else
15138 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
15139 } else {
15140 ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
15141 }
15142 }
15143 }
15144 // Sort the reduced values by number of same/alternate opcode and/or pointer
15145 // operand.
15146 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
15147 return P1.size() > P2.size();
15148 });
15149 return true;
15150 }
15151
15152 /// Attempt to vectorize the tree found by matchAssociativeReduction.
15153 Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI,
15154 const TargetLibraryInfo &TLI) {
15155 constexpr int ReductionLimit = 4;
15156 constexpr unsigned RegMaxNumber = 4;
15157 constexpr unsigned RedValsMaxNumber = 128;
15158 // If there are a sufficient number of reduction values, reduce
15159 // to a nearby power-of-2. We can safely generate oversized
15160 // vectors and rely on the backend to split them to legal sizes.
15161 unsigned NumReducedVals =
15162 std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
15163 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
15164 if (!isGoodForReduction(Vals))
15165 return Num;
15166 return Num + Vals.size();
15167 });
15168 if (NumReducedVals < ReductionLimit &&
15169        (!AllowHorRdxIdenityOptimization ||
15170         all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
15171 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
15172 }))) {
15173 for (ReductionOpsType &RdxOps : ReductionOps)
15174 for (Value *RdxOp : RdxOps)
15175 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
15176 return nullptr;
15177 }
15178
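    // From here on, code is materialized: the builder is anchored at the
    // reduction root and each successfully vectorized slice of reduced values
    // is folded into VectorizedTree via GetNewVectorizedTree below.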
15179 IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
15180
15181    // Track the reduced values in case they are replaced by extractelement
15182    // because of the vectorization.
15183    DenseMap<Value *, Value *> TrackedVals(
15184        ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
15185 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
15186 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
15187 ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
15188 // The same extra argument may be used several times, so log each attempt
15189 // to use it.
15190 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
15191 assert(Pair.first && "DebugLoc must be set.");
15192 ExternallyUsedValues[Pair.second].push_back(Pair.first);
15193 TrackedVals.try_emplace(Pair.second, Pair.second);
15194 }
15195
15196 // The compare instruction of a min/max is the insertion point for new
15197 // instructions and may be replaced with a new compare instruction.
15198 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
15199 assert(isa<SelectInst>(RdxRootInst) &&
15200 "Expected min/max reduction to have select root instruction");
15201 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
15202 assert(isa<Instruction>(ScalarCond) &&
15203 "Expected min/max reduction to have compare condition");
15204 return cast<Instruction>(ScalarCond);
15205 };
15206
15207 // Return new VectorizedTree, based on previous value.
15208 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
15209 if (VectorizedTree) {
15210 // Update the final value in the reduction.
15211        Builder.SetCurrentDebugLocation(
15212            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
15213 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
15214            (isGuaranteedNotToBePoison(Res) &&
15215             !isGuaranteedNotToBePoison(VectorizedTree))) {
15216 auto It = ReducedValsToOps.find(Res);
15217 if (It != ReducedValsToOps.end() &&
15218 any_of(It->getSecond(),
15219 [](Instruction *I) { return isBoolLogicOp(I); }))
15220 std::swap(VectorizedTree, Res);
15221 }
15222
15223 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
15224 ReductionOps);
15225 }
15226 // Initialize the final value in the reduction.
15227 return Res;
15228 };
15229 bool AnyBoolLogicOp =
15230 any_of(ReductionOps.back(), [](Value *V) {
15231 return isBoolLogicOp(cast<Instruction>(V));
15232 });
15233 // The reduction root is used as the insertion point for new instructions,
15234 // so set it as externally used to prevent it from being deleted.
15235 ExternallyUsedValues[ReductionRoot];
15236 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
15237 ReductionOps.front().size());
15238 for (ReductionOpsType &RdxOps : ReductionOps)
15239 for (Value *RdxOp : RdxOps) {
15240 if (!RdxOp)
15241 continue;
15242 IgnoreList.insert(RdxOp);
15243 }
15244 // Intersect the fast-math-flags from all reduction operations.
15245 FastMathFlags RdxFMF;
15246 RdxFMF.set();
15247 for (Value *U : IgnoreList)
15248 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
15249 RdxFMF &= FPMO->getFastMathFlags();
15250 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
15251
15252 // Need to track reduced vals, they may be changed during vectorization of
15253 // subvectors.
15254 for (ArrayRef<Value *> Candidates : ReducedVals)
15255 for (Value *V : Candidates)
15256 TrackedVals.try_emplace(V, V);
15257
15258 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
15259 // List of the values that were reduced in other trees as part of gather
15260 // nodes and thus requiring extract if fully vectorized in other trees.
15261 SmallPtrSet<Value *, 4> RequiredExtract;
15262 Value *VectorizedTree = nullptr;
15263 bool CheckForReusedReductionOps = false;
15264 // Try to vectorize elements based on their type.
15265 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
15266 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
15267 InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
15268 SmallVector<Value *> Candidates;
15269 Candidates.reserve(2 * OrigReducedVals.size());
15270 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
15271 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
15272 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
15273        // Check if the reduction value was not overridden by the extractelement
15274 // instruction because of the vectorization and exclude it, if it is not
15275 // compatible with other values.
15276 // Also check if the instruction was folded to constant/other value.
15277 auto *Inst = dyn_cast<Instruction>(RdxVal);
15278 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
15279 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
15280 (S.getOpcode() && !Inst))
15281 continue;
15282 Candidates.push_back(RdxVal);
15283 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
15284 }
15285 bool ShuffledExtracts = false;
15286 // Try to handle shuffled extractelements.
15287 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
15288 I + 1 < E) {
15289 InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
15290 if (NextS.getOpcode() == Instruction::ExtractElement &&
15291 !NextS.isAltShuffle()) {
15292 SmallVector<Value *> CommonCandidates(Candidates);
15293 for (Value *RV : ReducedVals[I + 1]) {
15294 Value *RdxVal = TrackedVals.find(RV)->second;
15295          // Check if the reduction value was not overridden by the
15296 // extractelement instruction because of the vectorization and
15297 // exclude it, if it is not compatible with other values.
15298 if (auto *Inst = dyn_cast<Instruction>(RdxVal))
15299 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
15300 continue;
15301 CommonCandidates.push_back(RdxVal);
15302 TrackedToOrig.try_emplace(RdxVal, RV);
15303 }
15304          SmallVector<int> Mask;
15305          if (isFixedVectorShuffle(CommonCandidates, Mask)) {
15306 ++I;
15307 Candidates.swap(CommonCandidates);
15308 ShuffledExtracts = true;
15309 }
15310 }
15311 }
15312
15313 // Emit code for constant values.
15314 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
15315 allConstant(Candidates)) {
15316 Value *Res = Candidates.front();
15317 ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
15318 for (Value *VC : ArrayRef(Candidates).drop_front()) {
15319 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
15320 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
15321 if (auto *ResI = dyn_cast<Instruction>(Res))
15322 V.analyzedReductionRoot(ResI);
15323 }
15324 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
15325 continue;
15326 }
15327
15328 unsigned NumReducedVals = Candidates.size();
15329 if (NumReducedVals < ReductionLimit &&
15330 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
15331 !isSplat(Candidates)))
15332 continue;
15333
15334 // Check if we support repeated scalar values processing (optimization of
15335 // original scalar identity operations on matched horizontal reductions).
15336 IsSupportedHorRdxIdentityOp =
15337 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
15338 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
15339 // Gather same values.
15340 MapVector<Value *, unsigned> SameValuesCounter;
15341 if (IsSupportedHorRdxIdentityOp)
15342 for (Value *V : Candidates)
15343 ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
15344 // Used to check if the reduced values used same number of times. In this
15345 // case the compiler may produce better code. E.g. if reduced values are
15346 // aabbccdd (8 x values), then the first node of the tree will have a node
15347 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
15348 // Plus, the final reduction will be performed on <8 x aabbccdd>.
15349 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
15350 // x abcd) * 2.
15351 // Currently it only handles add/fadd/xor. and/or/min/max do not require
15352 // this analysis, other operations may require an extra estimation of
15353 // the profitability.
15354 bool SameScaleFactor = false;
15355 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
15356 SameValuesCounter.size() != Candidates.size();
15357 if (OptReusedScalars) {
15358 SameScaleFactor =
15359 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
15360 RdxKind == RecurKind::Xor) &&
15361 all_of(drop_begin(SameValuesCounter),
15362 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
15363 return P.second == SameValuesCounter.front().second;
15364 });
15365 Candidates.resize(SameValuesCounter.size());
15366 transform(SameValuesCounter, Candidates.begin(),
15367 [](const auto &P) { return P.first; });
15368 NumReducedVals = Candidates.size();
15369 // Have a reduction of the same element.
15370 if (NumReducedVals == 1) {
15371 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
15372 unsigned Cnt = SameValuesCounter.lookup(OrigV);
15373 Value *RedVal =
15374 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
15375 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
15376 VectorizedVals.try_emplace(OrigV, Cnt);
15377 continue;
15378 }
15379 }
15380
15381 unsigned MaxVecRegSize = V.getMaxVecRegSize();
15382 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
15383 unsigned MaxElts =
15384 RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
15385
15386 unsigned ReduxWidth = std::min<unsigned>(
15387 llvm::bit_floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
15388 unsigned Start = 0;
15389 unsigned Pos = Start;
15390 // Restarts vectorization attempt with lower vector factor.
15391 unsigned PrevReduxWidth = ReduxWidth;
15392 bool CheckForReusedReductionOpsLocal = false;
15393 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
15394 &CheckForReusedReductionOpsLocal,
15395 &PrevReduxWidth, &V,
15396 &IgnoreList](bool IgnoreVL = false) {
15397 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
15398 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
15399          // Check if any of the reduction ops are gathered. If so, it is worth
15400          // trying again with a smaller number of reduction ops.
15401 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
15402 }
15403 ++Pos;
15404 if (Pos < NumReducedVals - ReduxWidth + 1)
15405 return IsAnyRedOpGathered;
15406 Pos = Start;
15407 ReduxWidth /= 2;
15408 return IsAnyRedOpGathered;
15409 };
15410 bool AnyVectorized = false;
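      // Slide a window of ReduxWidth values over the candidates; whenever an
      // attempt fails, AdjustReducedVals advances the window and eventually
      // halves ReduxWidth until it drops below the reduction limit.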
15411 while (Pos < NumReducedVals - ReduxWidth + 1 &&
15412 ReduxWidth >= ReductionLimit) {
15413 // Dependency in tree of the reduction ops - drop this attempt, try
15414 // later.
15415 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
15416 Start == 0) {
15417 CheckForReusedReductionOps = true;
15418 break;
15419 }
15420 PrevReduxWidth = ReduxWidth;
15421 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
15422        // Being analyzed already - skip.
15423 if (V.areAnalyzedReductionVals(VL)) {
15424 (void)AdjustReducedVals(/*IgnoreVL=*/true);
15425 continue;
15426 }
15427 // Early exit if any of the reduction values were deleted during
15428 // previous vectorization attempts.
15429 if (any_of(VL, [&V](Value *RedVal) {
15430 auto *RedValI = dyn_cast<Instruction>(RedVal);
15431 if (!RedValI)
15432 return false;
15433 return V.isDeleted(RedValI);
15434 }))
15435 break;
15436 V.buildTree(VL, IgnoreList);
15437 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
15438 if (!AdjustReducedVals())
15439 V.analyzedReductionVals(VL);
15440 continue;
15441 }
15442 if (V.isLoadCombineReductionCandidate(RdxKind)) {
15443 if (!AdjustReducedVals())
15444 V.analyzedReductionVals(VL);
15445 continue;
15446 }
15447 V.reorderTopToBottom();
15448 // No need to reorder the root node at all.
15449 V.reorderBottomToTop(/*IgnoreReorder=*/true);
15450 // Keep extracted other reduction values, if they are used in the
15451 // vectorization trees.
15452 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
15453 ExternallyUsedValues);
15454 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
15455 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
15456 continue;
15457 for (Value *V : ReducedVals[Cnt])
15458 if (isa<Instruction>(V))
15459 LocalExternallyUsedValues[TrackedVals[V]];
15460 }
15461 if (!IsSupportedHorRdxIdentityOp) {
15462 // Number of uses of the candidates in the vector of values.
15463 assert(SameValuesCounter.empty() &&
15464 "Reused values counter map is not empty");
15465 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
15466 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
15467 continue;
15468 Value *V = Candidates[Cnt];
15469 Value *OrigV = TrackedToOrig.find(V)->second;
15470 ++SameValuesCounter[OrigV];
15471 }
15472 }
15473 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
15474 // Gather externally used values.
15475        SmallPtrSet<Value *, 4> Visited;
15476        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
15477 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
15478 continue;
15479 Value *RdxVal = Candidates[Cnt];
15480 if (!Visited.insert(RdxVal).second)
15481 continue;
15482 // Check if the scalar was vectorized as part of the vectorization
15483 // tree but not the top node.
15484 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
15485 LocalExternallyUsedValues[RdxVal];
15486 continue;
15487 }
15488 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
15489 unsigned NumOps =
15490 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
15491 if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
15492 LocalExternallyUsedValues[RdxVal];
15493 }
15494 // Do not need the list of reused scalars in regular mode anymore.
15495 if (!IsSupportedHorRdxIdentityOp)
15496 SameValuesCounter.clear();
15497 for (Value *RdxVal : VL)
15498 if (RequiredExtract.contains(RdxVal))
15499 LocalExternallyUsedValues[RdxVal];
15500 // Update LocalExternallyUsedValues for the scalar, replaced by
15501 // extractelement instructions.
15502 DenseMap<Value *, Value *> ReplacementToExternal;
15503 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
15504 ReplacementToExternal.try_emplace(Pair.second, Pair.first);
15505 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
15506 Value *Ext = Pair.first;
15507 auto RIt = ReplacementToExternal.find(Ext);
15508 while (RIt != ReplacementToExternal.end()) {
15509 Ext = RIt->second;
15510 RIt = ReplacementToExternal.find(Ext);
15511 }
15512 auto *It = ExternallyUsedValues.find(Ext);
15513 if (It == ExternallyUsedValues.end())
15514 continue;
15515 LocalExternallyUsedValues[Pair.second].append(It->second);
15516 }
15517 V.buildExternalUses(LocalExternallyUsedValues);
15518
15519 V.computeMinimumValueSizes();
15520
15521 // Estimate cost.
15522 InstructionCost TreeCost = V.getTreeCost(VL);
15523 InstructionCost ReductionCost =
15524 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
15525 InstructionCost Cost = TreeCost + ReductionCost;
15526 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
15527 << " for reduction\n");
15528 if (!Cost.isValid())
15529 return nullptr;
15530 if (Cost >= -SLPCostThreshold) {
15531 V.getORE()->emit([&]() {
15532            return OptimizationRemarkMissed(
15533                SV_NAME, "HorSLPNotBeneficial",
15534 ReducedValsToOps.find(VL[0])->second.front())
15535 << "Vectorizing horizontal reduction is possible "
15536 << "but not beneficial with cost " << ore::NV("Cost", Cost)
15537 << " and threshold "
15538 << ore::NV("Threshold", -SLPCostThreshold);
15539 });
15540 if (!AdjustReducedVals())
15541 V.analyzedReductionVals(VL);
15542 continue;
15543 }
15544
15545 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
15546 << Cost << ". (HorRdx)\n");
15547 V.getORE()->emit([&]() {
15548 return OptimizationRemark(
15549 SV_NAME, "VectorizedHorizontalReduction",
15550 ReducedValsToOps.find(VL[0])->second.front())
15551 << "Vectorized horizontal reduction with cost "
15552 << ore::NV("Cost", Cost) << " and with tree size "
15553 << ore::NV("TreeSize", V.getTreeSize());
15554 });
15555
15556 Builder.setFastMathFlags(RdxFMF);
15557
15558 // Emit a reduction. If the root is a select (min/max idiom), the insert
15559 // point is the compare condition of that select.
15560 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
15561 Instruction *InsertPt = RdxRootInst;
15562 if (IsCmpSelMinMax)
15563 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
15564
15565 // Vectorize a tree.
15566 Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
15567 ReplacedExternals, InsertPt);
15568
15569 Builder.SetInsertPoint(InsertPt);
15570
15571 // To prevent poison from leaking across what used to be sequential,
15572 // safe, scalar boolean logic operations, the reduction operand must be
15573 // frozen.
15574 if ((isBoolLogicOp(RdxRootInst) ||
15575 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
15576 !isGuaranteedNotToBePoison(VectorizedRoot))
15577 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
15578
15579 // Emit code to correctly handle reused reduced values, if required.
15580 if (OptReusedScalars && !SameScaleFactor) {
15581 VectorizedRoot =
15582 emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
15583 SameValuesCounter, TrackedToOrig);
15584 }
15585
15586 Value *ReducedSubTree =
15587 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
15588 if (ReducedSubTree->getType() != VL.front()->getType()) {
15589 ReducedSubTree = Builder.CreateIntCast(
15590 ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
15591              KnownBits Known = computeKnownBits(
15592                  R, cast<Instruction>(ReductionOps.front().front())
15593 ->getModule()
15594 ->getDataLayout());
15595 return !Known.isNonNegative();
15596 }));
15597 }
15598
15599 // Improved analysis for add/fadd/xor reductions with same scale factor
15600 // for all operands of reductions. We can emit scalar ops for them
15601 // instead.
15602 if (OptReusedScalars && SameScaleFactor)
15603 ReducedSubTree = emitScaleForReusedOps(
15604 ReducedSubTree, Builder, SameValuesCounter.front().second);
15605
15606 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
15607 // Count vectorized reduced values to exclude them from final reduction.
15608 for (Value *RdxVal : VL) {
15609 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
15610 if (IsSupportedHorRdxIdentityOp) {
15611 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
15612 continue;
15613 }
15614 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
15615 if (!V.isVectorized(RdxVal))
15616 RequiredExtract.insert(RdxVal);
15617 }
15618 Pos += ReduxWidth;
15619 Start = Pos;
15620 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
15621 AnyVectorized = true;
15622 }
15623 if (OptReusedScalars && !AnyVectorized) {
15624 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
15625 Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
15626 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
15627 Value *OrigV = TrackedToOrig.find(P.first)->second;
15628 VectorizedVals.try_emplace(OrigV, P.second);
15629 }
15630 continue;
15631 }
15632 }
15633 if (VectorizedTree) {
15634 // Reorder operands of bool logical op in the natural order to avoid
15635 // possible problem with poison propagation. If not possible to reorder
15636 // (both operands are originally RHS), emit an extra freeze instruction
15637 // for the LHS operand.
15638 // I.e., if we have original code like this:
15639 // RedOp1 = select i1 ?, i1 LHS, i1 false
15640 // RedOp2 = select i1 RHS, i1 ?, i1 false
15641
15642 // Then, we swap LHS/RHS to create a new op that matches the poison
15643 // semantics of the original code.
15644
15645 // If we have original code like this and both values could be poison:
15646 // RedOp1 = select i1 ?, i1 LHS, i1 false
15647 // RedOp2 = select i1 ?, i1 RHS, i1 false
15648
15649 // Then, we must freeze LHS in the new op.
15650 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
15651 Instruction *RedOp1,
15652 Instruction *RedOp2,
15653 bool InitStep) {
15654 if (!AnyBoolLogicOp)
15655 return;
15656 if (isBoolLogicOp(RedOp1) &&
15657 ((!InitStep && LHS == VectorizedTree) ||
15658 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
15659 return;
15660 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
15661 getRdxOperand(RedOp2, 0) == RHS ||
15662                                      isGuaranteedNotToBePoison(RHS))) {
15663          std::swap(LHS, RHS);
15664 return;
15665 }
15666 if (LHS != VectorizedTree)
15667 LHS = Builder.CreateFreeze(LHS);
15668 };
15669 // Finish the reduction.
15670 // Need to add extra arguments and not vectorized possible reduction
15671 // values.
15672 // Try to avoid dependencies between the scalar remainders after
15673 // reductions.
15674 auto FinalGen =
15675          [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
15676              bool InitStep) {
15677 unsigned Sz = InstVals.size();
15678            SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
15679                                                                     Sz % 2);
15680 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
15681 Instruction *RedOp = InstVals[I + 1].first;
15682 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
15683 Value *RdxVal1 = InstVals[I].second;
15684 Value *StableRdxVal1 = RdxVal1;
15685 auto It1 = TrackedVals.find(RdxVal1);
15686 if (It1 != TrackedVals.end())
15687 StableRdxVal1 = It1->second;
15688 Value *RdxVal2 = InstVals[I + 1].second;
15689 Value *StableRdxVal2 = RdxVal2;
15690 auto It2 = TrackedVals.find(RdxVal2);
15691 if (It2 != TrackedVals.end())
15692 StableRdxVal2 = It2->second;
15693 // To prevent poison from leaking across what used to be
15694 // sequential, safe, scalar boolean logic operations, the
15695 // reduction operand must be frozen.
15696 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
15697 RedOp, InitStep);
15698 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
15699 StableRdxVal2, "op.rdx", ReductionOps);
15700 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
15701 }
15702 if (Sz % 2 == 1)
15703 ExtraReds[Sz / 2] = InstVals.back();
15704 return ExtraReds;
15705 };
15706      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
15707      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
15708 VectorizedTree);
15709      SmallPtrSet<Value *, 8> Visited;
15710      for (ArrayRef<Value *> Candidates : ReducedVals) {
15711 for (Value *RdxVal : Candidates) {
15712 if (!Visited.insert(RdxVal).second)
15713 continue;
15714 unsigned NumOps = VectorizedVals.lookup(RdxVal);
15715 for (Instruction *RedOp :
15716 ArrayRef(ReducedValsToOps.find(RdxVal)->second)
15717 .drop_back(NumOps))
15718 ExtraReductions.emplace_back(RedOp, RdxVal);
15719 }
15720 }
15721 for (auto &Pair : ExternallyUsedValues) {
15722 // Add each externally used value to the final reduction.
15723 for (auto *I : Pair.second)
15724 ExtraReductions.emplace_back(I, Pair.first);
15725 }
15726 // Iterate through all not-vectorized reduction values/extra arguments.
15727 bool InitStep = true;
15728 while (ExtraReductions.size() > 1) {
15729 VectorizedTree = ExtraReductions.front().second;
15730        SmallVector<std::pair<Instruction *, Value *>> NewReds =
15731            FinalGen(ExtraReductions, InitStep);
15732 ExtraReductions.swap(NewReds);
15733 InitStep = false;
15734 }
15735 VectorizedTree = ExtraReductions.front().second;
15736
15737 ReductionRoot->replaceAllUsesWith(VectorizedTree);
15738
15739 // The original scalar reduction is expected to have no remaining
15740 // uses outside the reduction tree itself. Assert that we got this
15741 // correct, replace internal uses with undef, and mark for eventual
15742 // deletion.
15743#ifndef NDEBUG
15744 SmallSet<Value *, 4> IgnoreSet;
15745 for (ArrayRef<Value *> RdxOps : ReductionOps)
15746 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
15747#endif
15748 for (ArrayRef<Value *> RdxOps : ReductionOps) {
15749 for (Value *Ignore : RdxOps) {
15750 if (!Ignore)
15751 continue;
15752#ifndef NDEBUG
15753 for (auto *U : Ignore->users()) {
15754 assert(IgnoreSet.count(U) &&
15755                 "All users must be in the reduction ops list.");
15756 }
15757#endif
15758 if (!Ignore->use_empty()) {
15759 Value *Undef = UndefValue::get(Ignore->getType());
15760 Ignore->replaceAllUsesWith(Undef);
15761 }
15762 V.eraseInstruction(cast<Instruction>(Ignore));
15763 }
15764 }
15765 } else if (!CheckForReusedReductionOps) {
15766 for (ReductionOpsType &RdxOps : ReductionOps)
15767 for (Value *RdxOp : RdxOps)
15768 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
15769 }
15770 return VectorizedTree;
15771 }
15772
15773private:
15774 /// Calculate the cost of a reduction.
15775 InstructionCost getReductionCost(TargetTransformInfo *TTI,
15776 ArrayRef<Value *> ReducedVals,
15777 bool IsCmpSelMinMax, unsigned ReduxWidth,
15778 FastMathFlags FMF) {
15779    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15780    Type *ScalarTy = ReducedVals.front()->getType();
15781 FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
15782 InstructionCost VectorCost = 0, ScalarCost;
15783 // If all of the reduced values are constant, the vector cost is 0, since
15784 // the reduction value can be calculated at the compile time.
15785 bool AllConsts = allConstant(ReducedVals);
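    // Accumulates the scalar cost of the reduction operations that consume
    // each reduced value; if a value's users cannot be costed individually,
    // the generic per-operation cost from GenCostFn is used instead.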
15786 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
15787      InstructionCost Cost = 0;
15788      // Scalar cost is repeated for N-1 elements.
15789 int Cnt = ReducedVals.size();
15790 for (Value *RdxVal : ReducedVals) {
15791 if (Cnt == 1)
15792 break;
15793 --Cnt;
15794 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
15795 Cost += GenCostFn();
15796 continue;
15797 }
15798 InstructionCost ScalarCost = 0;
15799 for (User *U : RdxVal->users()) {
15800 auto *RdxOp = cast<Instruction>(U);
15801 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
15802 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
15803 continue;
15804 }
15805 ScalarCost = InstructionCost::getInvalid();
15806 break;
15807 }
15808 if (ScalarCost.isValid())
15809 Cost += ScalarCost;
15810 else
15811 Cost += GenCostFn();
15812 }
15813 return Cost;
15814 };
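    // Compare the cost of a single vector reduction against the scalar
    // operations it replaces; the smaller (more negative) the result, the
    // more profitable the reduction.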
15815 switch (RdxKind) {
15816 case RecurKind::Add:
15817 case RecurKind::Mul:
15818 case RecurKind::Or:
15819 case RecurKind::And:
15820 case RecurKind::Xor:
15821 case RecurKind::FAdd:
15822 case RecurKind::FMul: {
15823 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
15824 if (!AllConsts)
15825 VectorCost =
15826 TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
15827 ScalarCost = EvaluateScalarCost([&]() {
15828 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
15829 });
15830 break;
15831 }
15832 case RecurKind::FMax:
15833 case RecurKind::FMin:
15834 case RecurKind::FMaximum:
15835 case RecurKind::FMinimum:
15836 case RecurKind::SMax:
15837 case RecurKind::SMin:
15838 case RecurKind::UMax:
15839 case RecurKind::UMin: {
15840      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
15841      if (!AllConsts)
15842 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
15843 ScalarCost = EvaluateScalarCost([&]() {
15844 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
15845 return TTI->getIntrinsicInstrCost(ICA, CostKind);
15846 });
15847 break;
15848 }
15849 default:
15850 llvm_unreachable("Expected arithmetic or min/max reduction operation");
15851 }
15852
15853 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
15854 << " for reduction of " << shortBundleName(ReducedVals)
15855 << " (It is a splitting reduction)\n");
15856 return VectorCost - ScalarCost;
15857 }
15858
15859 /// Emit a horizontal reduction of the vectorized value.
15860 Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
15861 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
15862 assert(VectorizedValue && "Need to have a vectorized tree node");
15863 assert(isPowerOf2_32(ReduxWidth) &&
15864 "We only handle power-of-two reductions for now");
15865 assert(RdxKind != RecurKind::FMulAdd &&
15866 "A call to the llvm.fmuladd intrinsic is not handled yet");
15867
15868 ++NumVectorInstructions;
15869 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
15870 }
15871
15872 /// Emits optimized code for unique scalar value reused \p Cnt times.
15873 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
15874 unsigned Cnt) {
15875 assert(IsSupportedHorRdxIdentityOp &&
15876 "The optimization of matched scalar identity horizontal reductions "
15877 "must be supported.");
15878 switch (RdxKind) {
15879 case RecurKind::Add: {
15880 // res = mul vv, n
15881 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
15882      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
15883 << VectorizedValue << ". (HorRdx)\n");
15884 return Builder.CreateMul(VectorizedValue, Scale);
15885 }
15886 case RecurKind::Xor: {
15887 // res = n % 2 ? 0 : vv
15888      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
15889 << ". (HorRdx)\n");
15890 if (Cnt % 2 == 0)
15891 return Constant::getNullValue(VectorizedValue->getType());
15892 return VectorizedValue;
15893 }
15894 case RecurKind::FAdd: {
15895 // res = fmul v, n
15896 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
15897      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
15898 << VectorizedValue << ". (HorRdx)\n");
15899 return Builder.CreateFMul(VectorizedValue, Scale);
15900 }
15901 case RecurKind::And:
15902 case RecurKind::Or:
15903 case RecurKind::SMax:
15904 case RecurKind::SMin:
15905 case RecurKind::UMax:
15906 case RecurKind::UMin:
15907 case RecurKind::FMax:
15908 case RecurKind::FMin:
15909 case RecurKind::FMaximum:
15910 case RecurKind::FMinimum:
15911 // res = vv
15912 return VectorizedValue;
15913 case RecurKind::Mul:
15914 case RecurKind::FMul:
15915 case RecurKind::FMulAdd:
15916 case RecurKind::IAnyOf:
15917 case RecurKind::FAnyOf:
15918 case RecurKind::None:
15919 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
15920 }
15921 return nullptr;
15922 }
15923
15924 /// Emits actual operation for the scalar identity values, found during
15925 /// horizontal reduction analysis.
15926 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
15927                       ArrayRef<Value *> VL,
15928                       const MapVector<Value *, unsigned> &SameValuesCounter,
15929 const DenseMap<Value *, Value *> &TrackedToOrig) {
15930 assert(IsSupportedHorRdxIdentityOp &&
15931 "The optimization of matched scalar identity horizontal reductions "
15932 "must be supported.");
15933 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
15934 if (VTy->getElementType() != VL.front()->getType()) {
15935 VectorizedValue = Builder.CreateIntCast(
15936 VectorizedValue,
15937 FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
15938 any_of(VL, [&](Value *R) {
15939            KnownBits Known = computeKnownBits(
15940                R, cast<Instruction>(ReductionOps.front().front())
15941 ->getModule()
15942 ->getDataLayout());
15943 return !Known.isNonNegative();
15944 }));
15945 }
15946 switch (RdxKind) {
15947 case RecurKind::Add: {
15948 // root = mul prev_root, <1, 1, n, 1>
15949      SmallVector<Constant *> Vals;
15950      for (Value *V : VL) {
15951 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
15952 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
15953 }
15954 auto *Scale = ConstantVector::get(Vals);
15955      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
15956 << VectorizedValue << ". (HorRdx)\n");
15957 return Builder.CreateMul(VectorizedValue, Scale);
15958 }
15959 case RecurKind::And:
15960 case RecurKind::Or:
15961 // No need for multiple or/and(s).
15962 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
15963 << ". (HorRdx)\n");
15964 return VectorizedValue;
15965 case RecurKind::SMax:
15966 case RecurKind::SMin:
15967 case RecurKind::UMax:
15968 case RecurKind::UMin:
15969 case RecurKind::FMax:
15970 case RecurKind::FMin:
15971 case RecurKind::FMaximum:
15972 case RecurKind::FMinimum:
15973 // No need for multiple min/max(s) of the same value.
15974 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
15975 << ". (HorRdx)\n");
15976 return VectorizedValue;
15977 case RecurKind::Xor: {
15978 // Replace values with even number of repeats with 0, since
15979 // x xor x = 0.
15980      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
15981      // 7>, if the 4th and 6th elements have an even number of repeats.
15982      SmallVector<int> Mask(
15983          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
15984          PoisonMaskElem);
15985      std::iota(Mask.begin(), Mask.end(), 0);
15986 bool NeedShuffle = false;
15987 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
15988 Value *V = VL[I];
15989 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
15990 if (Cnt % 2 == 0) {
15991 Mask[I] = VF;
15992 NeedShuffle = true;
15993 }
15994 }
15995      LLVM_DEBUG(dbgs() << "SLP: Xor <";
15996                 for (int I : Mask)
15997                   dbgs() << I << " ";
15998                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
15999 if (NeedShuffle)
16000 VectorizedValue = Builder.CreateShuffleVector(
16001 VectorizedValue,
16002 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
16003 return VectorizedValue;
16004 }
16005 case RecurKind::FAdd: {
16006 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
16007      SmallVector<Constant *> Vals;
16008      for (Value *V : VL) {
16009 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
16010 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
16011 }
16012 auto *Scale = ConstantVector::get(Vals);
16013 return Builder.CreateFMul(VectorizedValue, Scale);
16014 }
16015 case RecurKind::Mul:
16016 case RecurKind::FMul:
16017 case RecurKind::FMulAdd:
16018 case RecurKind::IAnyOf:
16019 case RecurKind::FAnyOf:
16020 case RecurKind::None:
16021 llvm_unreachable("Unexpected reduction kind for reused scalars.");
16022 }
16023 return nullptr;
16024 }
16025};
16026} // end anonymous namespace
16027
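/// Computes the total number of scalar elements in the aggregate being built
/// by \p InsertInst. Returns std::nullopt for non-homogeneous aggregates.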
16028static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
16029 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
16030 return cast<FixedVectorType>(IE->getType())->getNumElements();
16031
16032 unsigned AggregateSize = 1;
16033 auto *IV = cast<InsertValueInst>(InsertInst);
16034 Type *CurrentType = IV->getType();
16035 do {
16036 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
16037 for (auto *Elt : ST->elements())
16038 if (Elt != ST->getElementType(0)) // check homogeneity
16039 return std::nullopt;
16040 AggregateSize *= ST->getNumElements();
16041 CurrentType = ST->getElementType(0);
16042 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
16043 AggregateSize *= AT->getNumElements();
16044 CurrentType = AT->getElementType();
16045 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
16046 AggregateSize *= VT->getNumElements();
16047 return AggregateSize;
16048 } else if (CurrentType->isSingleValueType()) {
16049 return AggregateSize;
16050 } else {
16051 return std::nullopt;
16052 }
16053 } while (true);
16054}
16055
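/// Walks a chain of insertelement/insertvalue instructions bottom-up,
/// recording each inserted scalar operand and its insert instruction at the
/// corresponding aggregate index.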
16056static void findBuildAggregate_rec(Instruction *LastInsertInst,
16057                                   TargetTransformInfo *TTI,
16058                                   SmallVectorImpl<Value *> &BuildVectorOpds,
16059 SmallVectorImpl<Value *> &InsertElts,
16060 unsigned OperandOffset) {
16061 do {
16062 Value *InsertedOperand = LastInsertInst->getOperand(1);
16063 std::optional<unsigned> OperandIndex =
16064 getInsertIndex(LastInsertInst, OperandOffset);
16065 if (!OperandIndex)
16066 return;
16067 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
16068 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
16069 BuildVectorOpds, InsertElts, *OperandIndex);
16070
16071 } else {
16072 BuildVectorOpds[*OperandIndex] = InsertedOperand;
16073 InsertElts[*OperandIndex] = LastInsertInst;
16074 }
16075 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
16076 } while (LastInsertInst != nullptr &&
16077 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
16078 LastInsertInst->hasOneUse());
16079}
16080
16081/// Recognize construction of vectors like
16082/// %ra = insertelement <4 x float> poison, float %s0, i32 0
16083/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
16084/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
16085/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
16086/// starting from the last insertelement or insertvalue instruction.
16087///
16088/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
16089/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
16090/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
16091///
16092/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
16093///
16094/// \return true if it matches.
16095static bool findBuildAggregate(Instruction *LastInsertInst,
16096                               TargetTransformInfo *TTI,
16097                               SmallVectorImpl<Value *> &BuildVectorOpds,
16098 SmallVectorImpl<Value *> &InsertElts) {
16099
16100 assert((isa<InsertElementInst>(LastInsertInst) ||
16101 isa<InsertValueInst>(LastInsertInst)) &&
16102 "Expected insertelement or insertvalue instruction!");
16103
16104 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
16105 "Expected empty result vectors!");
16106
16107 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
16108 if (!AggregateSize)
16109 return false;
16110 BuildVectorOpds.resize(*AggregateSize);
16111 InsertElts.resize(*AggregateSize);
16112
16113 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
16114 llvm::erase(BuildVectorOpds, nullptr);
16115 llvm::erase(InsertElts, nullptr);
16116 if (BuildVectorOpds.size() >= 2)
16117 return true;
16118
16119 return false;
16120}
16121
16122/// Try and get a reduction instruction from a phi node.
16123///
16124/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
16125/// if they come from either \p ParentBB or a containing loop latch.
16126///
16127/// \returns A candidate reduction value if possible, or \code nullptr \endcode
16128/// if not possible.
16129static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
16130                                      BasicBlock *ParentBB, LoopInfo *LI) {
16131 // There are situations where the reduction value is not dominated by the
16132 // reduction phi. Vectorizing such cases has been reported to cause
16133 // miscompiles. See PR25787.
16134 auto DominatedReduxValue = [&](Value *R) {
16135 return isa<Instruction>(R) &&
16136 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
16137 };
16138
16139 Instruction *Rdx = nullptr;
16140
16141 // Return the incoming value if it comes from the same BB as the phi node.
16142 if (P->getIncomingBlock(0) == ParentBB) {
16143 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
16144 } else if (P->getIncomingBlock(1) == ParentBB) {
16145 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
16146 }
16147
16148 if (Rdx && DominatedReduxValue(Rdx))
16149 return Rdx;
16150
16151 // Otherwise, check whether we have a loop latch to look at.
16152 Loop *BBL = LI->getLoopFor(ParentBB);
16153 if (!BBL)
16154 return nullptr;
16155 BasicBlock *BBLatch = BBL->getLoopLatch();
16156 if (!BBLatch)
16157 return nullptr;
16158
16159 // There is a loop latch, return the incoming value if it comes from
16160 // that. This reduction pattern occasionally turns up.
16161 if (P->getIncomingBlock(0) == BBLatch) {
16162 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
16163 } else if (P->getIncomingBlock(1) == BBLatch) {
16164 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
16165 }
16166
16167 if (Rdx && DominatedReduxValue(Rdx))
16168 return Rdx;
16169
16170 return nullptr;
16171}
16172
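/// Matches a reduction binary operation: either a plain binary operator or
/// one of the min/max intrinsics, extracting its two operands into \p V0 and
/// \p V1.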
16173static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
16174 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
16175 return true;
16176 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
16177 return true;
16178 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
16179 return true;
16180 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
16181 return true;
16182 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
16183 return true;
16184 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
16185 return true;
16186 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
16187 return true;
16188 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
16189 return true;
16190 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
16191 return true;
16192 return false;
16193}
16194
16195/// We could have an initial reduction that is not an add.
16196/// r *= v1 + v2 + v3 + v4
16197/// In such a case start looking for a tree rooted in the first '+'.
16198/// \Returns the new root if found, which may be nullptr if not an instruction.
16199static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
16200                                                 Instruction *Root) {
16201 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
16202 isa<IntrinsicInst>(Root)) &&
16203 "Expected binop, select, or intrinsic for reduction matching");
16204 Value *LHS =
16205 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
16206 Value *RHS =
16207 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
16208 if (LHS == Phi)
16209 return dyn_cast<Instruction>(RHS);
16210 if (RHS == Phi)
16211 return dyn_cast<Instruction>(LHS);
16212 return nullptr;
16213}
16214
16215/// \p Returns the first operand of \p I that does not match \p Phi. If
16216/// operand is not an instruction it returns nullptr.
16217static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
16218  Value *Op0 = nullptr;
16219 Value *Op1 = nullptr;
16220 if (!matchRdxBop(I, Op0, Op1))
16221 return nullptr;
16222 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
16223}
16224
16225/// \Returns true if \p I is a candidate instruction for reduction vectorization.
16226static bool isReductionCandidate(Instruction *I) {
16227  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
16228 Value *B0 = nullptr, *B1 = nullptr;
16229 bool IsBinop = matchRdxBop(I, B0, B1);
16230 return IsBinop || IsSelect;
16231}
16232
16233bool SLPVectorizerPass::vectorizeHorReduction(
16234    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
16235    TargetTransformInfo *TTI, SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
16236 if (!ShouldVectorizeHor)
16237 return false;
16238 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
16239
16240 if (Root->getParent() != BB || isa<PHINode>(Root))
16241 return false;
16242
16243 // If we can find a secondary reduction root, use that instead.
16244 auto SelectRoot = [&]() {
16245 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
16246 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
16247 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
16248 return NewRoot;
16249 return Root;
16250 };
16251
16252 // Start analysis starting from Root instruction. If horizontal reduction is
16253 // found, try to vectorize it. If it is not a horizontal reduction or
16254 // vectorization is not possible or not effective, and currently analyzed
16255 // instruction is a binary operation, try to vectorize the operands, using
16256 // pre-order DFS traversal order. If the operands were not vectorized, repeat
16257 // the same procedure considering each operand as a possible root of the
16258 // horizontal reduction.
16259 // Interrupt the process if the Root instruction itself was vectorized or all
16260  // sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
16261  // If a horizontal reduction was not matched or vectorized, we collect
16262 // instructions for possible later attempts for vectorization.
16263 std::queue<std::pair<Instruction *, unsigned>> Stack;
16264 Stack.emplace(SelectRoot(), 0);
16265 SmallPtrSet<Value *, 8> VisitedInstrs;
16266 bool Res = false;
16267 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
16268 if (R.isAnalyzedReductionRoot(Inst))
16269 return nullptr;
16270 if (!isReductionCandidate(Inst))
16271 return nullptr;
16272 HorizontalReduction HorRdx;
16273 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
16274 return nullptr;
16275 return HorRdx.tryToReduce(R, TTI, *TLI);
16276 };
16277 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
16278 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
16279 FutureSeed = getNonPhiOperand(Root, P);
16280 if (!FutureSeed)
16281 return false;
16282 }
16283 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
16284 // analysis is done separately.
16285 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
16286 PostponedInsts.push_back(FutureSeed);
16287 return true;
16288 };
16289
16290 while (!Stack.empty()) {
16291 Instruction *Inst;
16292 unsigned Level;
16293 std::tie(Inst, Level) = Stack.front();
16294 Stack.pop();
16295 // Do not try to analyze instruction that has already been vectorized.
16296 // This may happen when we vectorize instruction operands on a previous
16297 // iteration while stack was populated before that happened.
16298 if (R.isDeleted(Inst))
16299 continue;
16300 if (Value *VectorizedV = TryToReduce(Inst)) {
16301 Res = true;
16302 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
16303 // Try to find another reduction.
16304 Stack.emplace(I, Level);
16305 continue;
16306 }
16307 } else {
16308 // We could not vectorize `Inst` so try to use it as a future seed.
16309 if (!TryAppendToPostponedInsts(Inst)) {
16310 assert(Stack.empty() && "Expected empty stack");
16311 break;
16312 }
16313 }
16314
16315 // Try to vectorize operands.
16316 // Continue analysis for the instruction from the same basic block only to
16317 // save compile time.
16318 if (++Level < RecursionMaxDepth)
16319 for (auto *Op : Inst->operand_values())
16320 if (VisitedInstrs.insert(Op).second)
16321 if (auto *I = dyn_cast<Instruction>(Op))
16322 // Do not try to vectorize CmpInst operands, this is done
16323 // separately.
16324 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
16325 !R.isDeleted(I) && I->getParent() == BB)
16326 Stack.emplace(I, Level);
16327 }
16328 return Res;
16329}
16330
16331bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
16332 BasicBlock *BB, BoUpSLP &R,
16333                                                 TargetTransformInfo *TTI) {
16334  SmallVector<WeakTrackingVH> PostponedInsts;
16335 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
16336 Res |= tryToVectorize(PostponedInsts, R);
16337 return Res;
16338}
16339
16340bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
16341 BoUpSLP &R) {
16342 bool Res = false;
16343 for (Value *V : Insts)
16344 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
16345 Res |= tryToVectorize(Inst, R);
16346 return Res;
16347}
16348
16349bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
16350 BasicBlock *BB, BoUpSLP &R) {
16351 if (!R.canMapToVector(IVI->getType()))
16352 return false;
16353
16354 SmallVector<Value *, 16> BuildVectorOpds;
16355 SmallVector<Value *, 16> BuildVectorInsts;
16356 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
16357 return false;
16358
16359 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
16360 // Aggregate value is unlikely to be processed in vector register.
16361 return tryToVectorizeList(BuildVectorOpds, R);
16362}
16363
16364bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
16365 BasicBlock *BB, BoUpSLP &R) {
16366 SmallVector<Value *, 16> BuildVectorInsts;
16367 SmallVector<Value *, 16> BuildVectorOpds;
16368  SmallVector<int> Mask;
16369  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
16370 (llvm::all_of(
16371 BuildVectorOpds,
16372 [](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) &&
16373 isFixedVectorShuffle(BuildVectorOpds, Mask)))
16374 return false;
16375
16376 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
16377 return tryToVectorizeList(BuildVectorInsts, R);
16378}
16379
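/// Sorts \p Incoming with \p Comparator, splits it into runs of mutually
/// compatible instructions and tries to vectorize each run, falling back to
/// grouping by type (and to smaller runs) when the first attempt fails.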
16380template <typename T>
16381static bool tryToVectorizeSequence(
16382    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
16383 function_ref<bool(T *, T *)> AreCompatible,
16384 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
16385 bool MaxVFOnly, BoUpSLP &R) {
16386 bool Changed = false;
16387 // Sort by type, parent, operands.
16388 stable_sort(Incoming, Comparator);
16389
16390  // Try to vectorize elements based on their type.
16391 SmallVector<T *> Candidates;
16392 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
16393 // Look for the next elements with the same type, parent and operand
16394 // kinds.
16395 auto *SameTypeIt = IncIt;
16396 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
16397 ++SameTypeIt;
16398
16399 // Try to vectorize them.
16400 unsigned NumElts = (SameTypeIt - IncIt);
16401 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
16402 << NumElts << ")\n");
16403    // The vectorization is a three-step attempt:
16404    // 1. Try to vectorize instructions with the same/alternate opcodes with the
16405    // size of maximal register at first.
16406    // 2. Try to vectorize remaining instructions with the same type, if
16407    // possible. This may produce better vectorization results than trying
16408    // to vectorize only instructions with the same/alternate opcodes.
16409    // 3. Final attempt to try to vectorize all instructions with the
16410    // same/alternate ops only; this may result in some extra final
16411    // vectorization.
16412 if (NumElts > 1 &&
16413 TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
16414 // Success. Start over because instructions might have been changed.
16415 Changed = true;
16416 } else {
16417 /// \Returns the minimum number of elements that we will attempt to
16418 /// vectorize.
16419 auto GetMinNumElements = [&R](Value *V) {
16420 unsigned EltSize = R.getVectorElementSize(V);
16421 return std::max(2U, R.getMaxVecRegSize() / EltSize);
16422 };
16423 if (NumElts < GetMinNumElements(*IncIt) &&
16424 (Candidates.empty() ||
16425 Candidates.front()->getType() == (*IncIt)->getType())) {
16426 Candidates.append(IncIt, std::next(IncIt, NumElts));
16427 }
16428 }
16429 // Final attempt to vectorize instructions with the same types.
16430 if (Candidates.size() > 1 &&
16431 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
16432 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
16433 // Success. Start over because instructions might have been changed.
16434 Changed = true;
16435 } else if (MaxVFOnly) {
16436 // Try to vectorize using small vectors.
16437 for (auto *It = Candidates.begin(), *End = Candidates.end();
16438 It != End;) {
16439 auto *SameTypeIt = It;
16440 while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
16441 ++SameTypeIt;
16442 unsigned NumElts = (SameTypeIt - It);
16443 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
16444 /*MaxVFOnly=*/false))
16445 Changed = true;
16446 It = SameTypeIt;
16447 }
16448 }
16449 Candidates.clear();
16450 }
16451
16452 // Start over at the next instruction of a different type (or the end).
16453 IncIt = SameTypeIt;
16454 }
16455 return Changed;
16456}
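
As an aside, here is a small self-contained sketch (plain C++, not LLVM code) of the sort-then-group strategy used by tryToVectorizeSequence above: stable-sort with a strict-weak-ordering comparator, sweep runs of mutually compatible neighbours, and hand every run of two or more elements to a vectorization callback. The names trySequence and TryGroup are hypothetical.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

template <typename T>
bool trySequence(std::vector<T *> &Items,
                 const std::function<bool(T *, T *)> &Less,
                 const std::function<bool(T *, T *)> &Compatible,
                 const std::function<bool(const std::vector<T *> &)> &TryGroup) {
  bool Changed = false;
  // Sort so that compatible elements end up adjacent.
  std::stable_sort(Items.begin(), Items.end(), Less);
  for (std::size_t I = 0; I != Items.size();) {
    // Extend the run while neighbours stay compatible with the first element.
    std::size_t J = I;
    while (J != Items.size() && Compatible(Items[J], Items[I]))
      ++J;
    if (J - I > 1)
      Changed |= TryGroup(std::vector<T *>(Items.begin() + I, Items.begin() + J));
    I = J; // continue with the next run
  }
  return Changed;
}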
16457
16458/// Compare two cmp instructions. If IsCompatibility is true, the function
16459/// returns true if the 2 cmps have the same/swapped predicates and compatible
16460/// corresponding operands. If IsCompatibility is false, the function implements a
16461/// strict weak ordering relation between two cmp instructions, returning true if the first
16462/// instruction is "less" than the second, i.e. its predicate is less than the
16463/// predicate of the second or the operands IDs are less than the operands IDs
16464/// of the second cmp instruction.
16465template <bool IsCompatibility>
16466static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
16467 const DominatorTree &DT) {
16468 assert(isValidElementType(V->getType()) &&
16469 isValidElementType(V2->getType()) &&
16470 "Expected valid element types only.");
16471 if (V == V2)
16472 return IsCompatibility;
16473 auto *CI1 = cast<CmpInst>(V);
16474 auto *CI2 = cast<CmpInst>(V2);
16475 if (CI1->getOperand(0)->getType()->getTypeID() <
16476 CI2->getOperand(0)->getType()->getTypeID())
16477 return !IsCompatibility;
16478 if (CI1->getOperand(0)->getType()->getTypeID() >
16479 CI2->getOperand(0)->getType()->getTypeID())
16480 return false;
16481 CmpInst::Predicate Pred1 = CI1->getPredicate();
16482 CmpInst::Predicate Pred2 = CI2->getPredicate();
16483 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
16484 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
16485 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
16486 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
16487 if (BasePred1 < BasePred2)
16488 return !IsCompatibility;
16489 if (BasePred1 > BasePred2)
16490 return false;
16491 // Compare operands.
16492 bool CI1Preds = Pred1 == BasePred1;
16493 bool CI2Preds = Pred2 == BasePred1;
16494 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
16495 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
16496 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
16497 if (Op1 == Op2)
16498 continue;
16499 if (Op1->getValueID() < Op2->getValueID())
16500 return !IsCompatibility;
16501 if (Op1->getValueID() > Op2->getValueID())
16502 return false;
16503 if (auto *I1 = dyn_cast<Instruction>(Op1))
16504 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
16505 if (IsCompatibility) {
16506 if (I1->getParent() != I2->getParent())
16507 return false;
16508 } else {
16509 // Try to compare nodes with same parent.
16510 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
16511 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
16512 if (!NodeI1)
16513 return NodeI2 != nullptr;
16514 if (!NodeI2)
16515 return false;
16516 assert((NodeI1 == NodeI2) ==
16517 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
16518 "Different nodes should have different DFS numbers");
16519 if (NodeI1 != NodeI2)
16520 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
16521 }
16522 InstructionsState S = getSameOpcode({I1, I2}, TLI);
16523 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
16524 continue;
16525 if (IsCompatibility)
16526 return false;
16527 if (I1->getOpcode() != I2->getOpcode())
16528 return I1->getOpcode() < I2->getOpcode();
16529 }
16530 }
16531 return IsCompatibility;
16532}
16533
16534template <typename ItT>
16535bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
16536 BasicBlock *BB, BoUpSLP &R) {
16537 bool Changed = false;
16538 // Try to find reductions first.
16539 for (CmpInst *I : CmpInsts) {
16540 if (R.isDeleted(I))
16541 continue;
16542 for (Value *Op : I->operands())
16543 if (auto *RootOp = dyn_cast<Instruction>(Op))
16544 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
16545 }
16546 // Try to vectorize operands as vector bundles.
16547 for (CmpInst *I : CmpInsts) {
16548 if (R.isDeleted(I))
16549 continue;
16550 Changed |= tryToVectorize(I, R);
16551 }
16552 // Try to vectorize list of compares.
16553 // Sort by type, compare predicate, etc.
16554 auto CompareSorter = [&](Value *V, Value *V2) {
16555 if (V == V2)
16556 return false;
16557 return compareCmp<false>(V, V2, *TLI, *DT);
16558 };
16559
16560 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
16561 if (V1 == V2)
16562 return true;
16563 return compareCmp<true>(V1, V2, *TLI, *DT);
16564 };
16565
16566 SmallVector<Value *> Vals;
16567 for (Instruction *V : CmpInsts)
16568 if (!R.isDeleted(V) && isValidElementType(V->getType()))
16569 Vals.push_back(V);
16570 if (Vals.size() <= 1)
16571 return Changed;
16572 Changed |= tryToVectorizeSequence<Value>(
16573 Vals, CompareSorter, AreCompatibleCompares,
16574 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
16575 // Exclude possible reductions from other blocks.
16576 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
16577 return any_of(V->users(), [V](User *U) {
16578 auto *Select = dyn_cast<SelectInst>(U);
16579 return Select &&
16580 Select->getParent() != cast<Instruction>(V)->getParent();
16581 });
16582 });
16583 if (ArePossiblyReducedInOtherBlock)
16584 return false;
16585 return tryToVectorizeList(Candidates, R, MaxVFOnly);
16586 },
16587 /*MaxVFOnly=*/true, R);
16588 return Changed;
16589}
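
For illustration, a hypothetical scalar function (not from this file) whose four independent compares, with the same predicate and matching operand shapes, form the kind of compare list that vectorizeCmpInsts hands to tryToVectorizeList.

// Hypothetical example input; each element-wise compare is an independent
// icmp with the same predicate, so the four results can become one <4 x i1>.
void cmp_bundle(const int *a, const int *b, bool *out) {
  out[0] = a[0] < b[0];
  out[1] = a[1] < b[1];
  out[2] = a[2] < b[2];
  out[3] = a[3] < b[3];
}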
16590
16591bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
16592 BasicBlock *BB, BoUpSLP &R) {
16593 assert(all_of(Instructions,
16594 [](auto *I) {
16595 return isa<InsertElementInst, InsertValueInst>(I);
16596 }) &&
16597 "This function only accepts Insert instructions");
16598 bool OpsChanged = false;
16599 SmallVector<WeakTrackingVH> PostponedInsts;
16600 // pass1 - try to vectorize reductions only
16601 for (auto *I : reverse(Instructions)) {
16602 if (R.isDeleted(I))
16603 continue;
16604 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
16605 }
16606 // pass2 - try to match and vectorize a buildvector sequence.
16607 for (auto *I : reverse(Instructions)) {
16608 if (R.isDeleted(I) || isa<CmpInst>(I))
16609 continue;
16610 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
16611 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
16612 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
16613 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
16614 }
16615 }
16616 // Now try to vectorize postponed instructions.
16617 OpsChanged |= tryToVectorize(PostponedInsts, R);
16618
16619 Instructions.clear();
16620 return OpsChanged;
16621}
16622
16623bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
16624 bool Changed = false;
16625 SmallVector<Value *, 4> Incoming;
16626 SmallPtrSet<Value *, 16> VisitedInstrs;
16627 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
16628 // node. This allows us to better identify the chains that can be
16629 // vectorized.
16630 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
16631 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
16632 assert(isValidElementType(V1->getType()) &&
16633 isValidElementType(V2->getType()) &&
16634 "Expected vectorizable types only.");
16635 // It is fine to compare type IDs here, since we expect only vectorizable
16636 // types, like ints, floats and pointers; we don't care about other types.
16637 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
16638 return true;
16639 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
16640 return false;
16641 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
16642 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
16643 if (Opcodes1.size() < Opcodes2.size())
16644 return true;
16645 if (Opcodes1.size() > Opcodes2.size())
16646 return false;
16647 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
16648 {
16649 // Instructions come first.
16650 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
16651 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
16652 if (I1 && I2) {
16653 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
16654 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
16655 if (!NodeI1)
16656 return NodeI2 != nullptr;
16657 if (!NodeI2)
16658 return false;
16659 assert((NodeI1 == NodeI2) ==
16660 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
16661 "Different nodes should have different DFS numbers");
16662 if (NodeI1 != NodeI2)
16663 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
16664 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
16665 if (S.getOpcode() && !S.isAltShuffle())
16666 continue;
16667 return I1->getOpcode() < I2->getOpcode();
16668 }
16669 if (I1)
16670 return true;
16671 if (I2)
16672 return false;
16673 }
16674 {
16675 // Non-undef constants come next.
16676 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
16677 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
16678 if (C1 && C2)
16679 continue;
16680 if (C1)
16681 return true;
16682 if (C2)
16683 return false;
16684 }
16685 bool U1 = isa<UndefValue>(Opcodes1[I]);
16686 bool U2 = isa<UndefValue>(Opcodes2[I]);
16687 {
16688 // Non-constant non-instructions come next.
16689 if (!U1 && !U2) {
16690 auto ValID1 = Opcodes1[I]->getValueID();
16691 auto ValID2 = Opcodes2[I]->getValueID();
16692 if (ValID1 == ValID2)
16693 continue;
16694 if (ValID1 < ValID2)
16695 return true;
16696 if (ValID1 > ValID2)
16697 return false;
16698 }
16699 if (!U1)
16700 return true;
16701 if (!U2)
16702 return false;
16703 }
16704 // Undefs come last.
16705 assert(U1 && U2 && "The only thing left should be undef & undef.");
16706 continue;
16707 }
16708 return false;
16709 };
16710 auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
16711 if (V1 == V2)
16712 return true;
16713 if (V1->getType() != V2->getType())
16714 return false;
16715 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
16716 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
16717 if (Opcodes1.size() != Opcodes2.size())
16718 return false;
16719 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
16720 // Undefs are compatible with any other value.
16721 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
16722 continue;
16723 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
16724 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
16725 if (I1->getParent() != I2->getParent())
16726 return false;
16727 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
16728 if (S.getOpcode())
16729 continue;
16730 return false;
16731 }
16732 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
16733 continue;
16734 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
16735 return false;
16736 }
16737 return true;
16738 };
16739
16740 bool HaveVectorizedPhiNodes = false;
16741 do {
16742 // Collect the incoming values from the PHIs.
16743 Incoming.clear();
16744 for (Instruction &I : *BB) {
16745 PHINode *P = dyn_cast<PHINode>(&I);
16746 if (!P)
16747 break;
16748
16749 // No need to analyze deleted, vectorized and non-vectorizable
16750 // instructions.
16751 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
16752 isValidElementType(P->getType()))
16753 Incoming.push_back(P);
16754 }
16755
16756 if (Incoming.size() <= 1)
16757 break;
16758
16759 // Find the corresponding non-phi nodes for better matching when trying to
16760 // build the tree.
16761 for (Value *V : Incoming) {
16762 SmallVectorImpl<Value *> &Opcodes =
16763 PHIToOpcodes.try_emplace(V).first->getSecond();
16764 if (!Opcodes.empty())
16765 continue;
16766 SmallVector<Value *, 4> Nodes(1, V);
16767 SmallPtrSet<PHINode *, 4> Visited;
16768 while (!Nodes.empty()) {
16769 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
16770 if (!Visited.insert(PHI).second)
16771 continue;
16772 for (Value *V : PHI->incoming_values()) {
16773 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
16774 Nodes.push_back(PHI1);
16775 continue;
16776 }
16777 Opcodes.emplace_back(V);
16778 }
16779 }
16780 }
16781
16782 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
16783 Incoming, PHICompare, AreCompatiblePHIs,
16784 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
16785 return tryToVectorizeList(Candidates, R, MaxVFOnly);
16786 },
16787 /*MaxVFOnly=*/true, R);
16788 Changed |= HaveVectorizedPhiNodes;
16789 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
16790 } while (HaveVectorizedPhiNodes);
16791
16792 VisitedInstrs.clear();
16793
16794 InstSetVector PostProcessInserts;
16795 SmallSetVector<CmpInst *, 8> PostProcessCmps;
16796 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
16797 // also vectorizes `PostProcessCmps`.
16798 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
16799 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
16800 if (VectorizeCmps) {
16801 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
16802 PostProcessCmps.clear();
16803 }
16804 PostProcessInserts.clear();
16805 return Changed;
16806 };
16807 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
16808 auto IsInPostProcessInstrs = [&](Instruction *I) {
16809 if (auto *Cmp = dyn_cast<CmpInst>(I))
16810 return PostProcessCmps.contains(Cmp);
16811 return isa<InsertElementInst, InsertValueInst>(I) &&
16812 PostProcessInserts.contains(I);
16813 };
16814 // Returns true if `I` is an instruction without users, like a terminator, a
16815 // store, or a function call with an ignored return value. Unused instructions
16816 // are detected based on the instruction type, except for CallInst and InvokeInst.
16817 auto HasNoUsers = [](Instruction *I) {
16818 return I->use_empty() &&
16819 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
16820 };
16821 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
16822 // Skip instructions with scalable types. The number of elements is unknown
16823 // at compile time for scalable types.
16824 if (isa<ScalableVectorType>(It->getType()))
16825 continue;
16826
16827 // Skip instructions marked for deletion.
16828 if (R.isDeleted(&*It))
16829 continue;
16830 // We may go through BB multiple times, so skip the ones we have already checked.
16831 if (!VisitedInstrs.insert(&*It).second) {
16832 if (HasNoUsers(&*It) &&
16833 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
16834 // We would like to start over since some instructions are deleted
16835 // and the iterator may become invalid.
16836 Changed = true;
16837 It = BB->begin();
16838 E = BB->end();
16839 }
16840 continue;
16841 }
16842
16843 if (isa<DbgInfoIntrinsic>(It))
16844 continue;
16845
16846 // Try to vectorize reductions that use PHINodes.
16847 if (PHINode *P = dyn_cast<PHINode>(It)) {
16848 // Check that the PHI is a reduction PHI.
16849 if (P->getNumIncomingValues() == 2) {
16850 // Try to match and vectorize a horizontal reduction.
16851 Instruction *Root = getReductionInstr(DT, P, BB, LI);
16852 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
16853 Changed = true;
16854 It = BB->begin();
16855 E = BB->end();
16856 continue;
16857 }
16858 }
16859 // Try to vectorize the incoming values of the PHI, to catch reductions
16860 // that feed into PHIs.
16861 for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
16862 // Skip if the incoming block is the current BB for now. Also, bypass
16863 // unreachable IR for efficiency and to avoid crashing.
16864 // TODO: Collect the skipped incoming values and try to vectorize them
16865 // after processing BB.
16866 if (BB == P->getIncomingBlock(I) ||
16867 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
16868 continue;
16869
16870 // Postponed instructions should not be vectorized here, delay their
16871 // vectorization.
16872 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
16873 PI && !IsInPostProcessInstrs(PI))
16874 Changed |= vectorizeRootInstruction(nullptr, PI,
16875 P->getIncomingBlock(I), R, TTI);
16876 }
16877 continue;
16878 }
16879
16880 if (HasNoUsers(&*It)) {
16881 bool OpsChanged = false;
16882 auto *SI = dyn_cast<StoreInst>(It);
16883 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
16884 if (SI) {
16885 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
16886 // Try to vectorize chain in store, if this is the only store to the
16887 // address in the block.
16888 // TODO: This is just a temporary solution to save compile time. Need
16889 // to investigate if we can safely turn on slp-vectorize-hor-store
16890 // instead to allow lookup for reduction chains in all non-vectorized
16891 // stores (need to check side effects and compile time).
16892 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
16893 SI->getValueOperand()->hasOneUse();
16894 }
16895 if (TryToVectorizeRoot) {
16896 for (auto *V : It->operand_values()) {
16897 // Postponed instructions should not be vectorized here, delay their
16898 // vectorization.
16899 if (auto *VI = dyn_cast<Instruction>(V);
16900 VI && !IsInPostProcessInstrs(VI))
16901 // Try to match and vectorize a horizontal reduction.
16902 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
16903 }
16904 }
16905 // Start vectorization of the post-process list of instructions from the
16906 // top-tree instructions to try to vectorize as many instructions as
16907 // possible.
16908 OpsChanged |=
16909 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
16910 if (OpsChanged) {
16911 // We would like to start over since some instructions are deleted
16912 // and the iterator may become invalid.
16913 Changed = true;
16914 It = BB->begin();
16915 E = BB->end();
16916 continue;
16917 }
16918 }
16919
16920 if (isa<InsertElementInst, InsertValueInst>(It))
16921 PostProcessInserts.insert(&*It);
16922 else if (isa<CmpInst>(It))
16923 PostProcessCmps.insert(cast<CmpInst>(&*It));
16924 }
16925
16926 return Changed;
16927}
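
As a pointer to what the per-block scan above looks for, here is a hypothetical scalar function (not from this file): the return instruction has no users, so its add operand is handed to vectorizeRootInstruction, which can match the add chain over consecutive loads as a horizontal reduction.

// Hypothetical example input: the chain of adds over consecutive loads is the
// horizontal-reduction shape that the root-instruction matching targets.
int hadd4(const int *a) {
  return a[0] + a[1] + a[2] + a[3];
}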
16928
16929bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
16930 auto Changed = false;
16931 for (auto &Entry : GEPs) {
16932 // If the getelementptr list has fewer than two elements, there's nothing
16933 // to do.
16934 if (Entry.second.size() < 2)
16935 continue;
16936
16937 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
16938 << Entry.second.size() << ".\n");
16939
16940 // Process the GEP list in chunks suitable for the target's supported
16941 // vector size. If a vector register can't hold 1 element, we are done. We
16942 // are trying to vectorize the index computations, so the maximum number of
16943 // elements is based on the size of the index expression, rather than the
16944 // size of the GEP itself (the target's pointer size).
16945 unsigned MaxVecRegSize = R.getMaxVecRegSize();
16946 unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
16947 if (MaxVecRegSize < EltSize)
16948 continue;
16949
16950 unsigned MaxElts = MaxVecRegSize / EltSize;
16951 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
16952 auto Len = std::min<unsigned>(BE - BI, MaxElts);
16953 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
16954
16955 // Initialize a set of candidate getelementptrs. Note that we use a
16956 // SetVector here to preserve program order. If the index computations
16957 // are vectorizable and begin with loads, we want to minimize the chance
16958 // of having to reorder them later.
16959 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
16960
16961 // Some of the candidates may have already been vectorized after we
16962 // initially collected them, or their index was optimized to a constant value.
16963 // If so, they are marked as deleted, so remove them from the set of
16964 // candidates.
16965 Candidates.remove_if([&R](Value *I) {
16966 return R.isDeleted(cast<Instruction>(I)) ||
16967 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
16968 });
16969
16970 // Remove from the set of candidates all pairs of getelementptrs with
16971 // constant differences. Such getelementptrs are likely not good
16972 // candidates for vectorization in a bottom-up phase since one can be
16973 // computed from the other. We also ensure all candidate getelementptr
16974 // indices are unique.
16975 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
16976 auto *GEPI = GEPList[I];
16977 if (!Candidates.count(GEPI))
16978 continue;
16979 auto *SCEVI = SE->getSCEV(GEPList[I]);
16980 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
16981 auto *GEPJ = GEPList[J];
16982 auto *SCEVJ = SE->getSCEV(GEPList[J]);
16983 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
16984 Candidates.remove(GEPI);
16985 Candidates.remove(GEPJ);
16986 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
16987 Candidates.remove(GEPJ);
16988 }
16989 }
16990 }
16991
16992 // We break out of the above computation as soon as we know there are
16993 // fewer than two candidates remaining.
16994 if (Candidates.size() < 2)
16995 continue;
16996
16997 // Add the single, non-constant index of each candidate to the bundle. We
16998 // ensured the indices met these constraints when we originally collected
16999 // the getelementptrs.
17000 SmallVector<Value *, 16> Bundle(Candidates.size());
17001 auto BundleIndex = 0u;
17002 for (auto *V : Candidates) {
17003 auto *GEP = cast<GetElementPtrInst>(V);
17004 auto *GEPIdx = GEP->idx_begin()->get();
17005 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
17006 Bundle[BundleIndex++] = GEPIdx;
17007 }
17008
17009 // Try and vectorize the indices. We are currently only interested in
17010 // gather-like cases of the form:
17011 //
17012 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
17013 //
17014 // where the loads of "a", the loads of "b", and the subtractions can be
17015 // performed in parallel. It's likely that detecting this pattern in a
17016 // bottom-up phase will be simpler and less costly than building a
17017 // full-blown top-down phase beginning at the consecutive loads.
17018 Changed |= tryToVectorizeList(Bundle, R);
17019 }
17020 }
17021 return Changed;
17022}
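
A hypothetical scalar source for the gather-like shape mentioned in the comment above; the loads of a[], the loads of b[], and the subtractions forming the g[] indices are the operations that end up in the index bundle.

// Hypothetical example input matching ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
int gather_sum(const int *g, const int *a, const int *b) {
  return g[a[0] - b[0]] + g[a[1] - b[1]] + g[a[2] - b[2]] + g[a[3] - b[3]];
}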
17023
17024bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
17025 bool Changed = false;
17026 // Sort by type, base pointers and value operands. Value operands must be
17027 // compatible (have the same opcode, same parent), otherwise it is
17028 // definitely not profitable to try to vectorize them.
17029 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
17030 if (V->getValueOperand()->getType()->getTypeID() <
17031 V2->getValueOperand()->getType()->getTypeID())
17032 return true;
17033 if (V->getValueOperand()->getType()->getTypeID() >
17034 V2->getValueOperand()->getType()->getTypeID())
17035 return false;
17036 if (V->getPointerOperandType()->getTypeID() <
17037 V2->getPointerOperandType()->getTypeID())
17038 return true;
17039 if (V->getPointerOperandType()->getTypeID() >
17040 V2->getPointerOperandType()->getTypeID())
17041 return false;
17042 // UndefValues are compatible with all other values.
17043 if (isa<UndefValue>(V->getValueOperand()) ||
17044 isa<UndefValue>(V2->getValueOperand()))
17045 return false;
17046 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
17047 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
17048 DomTreeNodeBase<BasicBlock> *NodeI1 =
17049 DT->getNode(I1->getParent());
17050 DomTreeNodeBase<BasicBlock> *NodeI2 =
17051 DT->getNode(I2->getParent());
17052 assert(NodeI1 && "Should only process reachable instructions");
17053 assert(NodeI2 && "Should only process reachable instructions");
17054 assert((NodeI1 == NodeI2) ==
17055 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
17056 "Different nodes should have different DFS numbers");
17057 if (NodeI1 != NodeI2)
17058 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
17059 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
17060 if (S.getOpcode())
17061 return false;
17062 return I1->getOpcode() < I2->getOpcode();
17063 }
17064 if (isa<Constant>(V->getValueOperand()) &&
17065 isa<Constant>(V2->getValueOperand()))
17066 return false;
17067 return V->getValueOperand()->getValueID() <
17068 V2->getValueOperand()->getValueID();
17069 };
17070
17071 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
17072 if (V1 == V2)
17073 return true;
17074 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
17075 return false;
17076 if (V1->getPointerOperandType() != V2->getPointerOperandType())
17077 return false;
17078 // Undefs are compatible with any other value.
17079 if (isa<UndefValue>(V1->getValueOperand()) ||
17080 isa<UndefValue>(V2->getValueOperand()))
17081 return true;
17082 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
17083 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
17084 if (I1->getParent() != I2->getParent())
17085 return false;
17086 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
17087 return S.getOpcode() > 0;
17088 }
17089 if (isa<Constant>(V1->getValueOperand()) &&
17090 isa<Constant>(V2->getValueOperand()))
17091 return true;
17092 return V1->getValueOperand()->getValueID() ==
17093 V2->getValueOperand()->getValueID();
17094 };
17095
17096 // Attempt to sort and vectorize each of the store-groups.
17097 for (auto &Pair : Stores) {
17098 if (Pair.second.size() < 2)
17099 continue;
17100
17101 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
17102 << Pair.second.size() << ".\n");
17103
17104 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
17105 continue;
17106
17107 // Reverse stores to do bottom-to-top analysis. This is important if the
17108 // values are stored to the same addresses several times, in which case we
17109 // need to follow the store order (reversed to meet the memory dependencies).
17110 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
17111 Pair.second.rend());
17112 Changed |= tryToVectorizeSequence<StoreInst>(
17113 ReversedStores, StoreSorter, AreCompatibleStores,
17114 [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
17115 return vectorizeStores(Candidates, R);
17116 },
17117 /*MaxVFOnly=*/false, R);
17118 }
17119 return Changed;
17120}
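
Finally, a hypothetical scalar function showing the seed pattern for store-chain vectorization: stores to consecutive addresses with compatible value operands, which vectorizeStores can merge into a single vector store if the cost model agrees.

// Hypothetical example input: four consecutive i32 stores that can be widened
// into one <4 x i32> store.
void store_chain(int *p, int a, int b, int c, int d) {
  p[0] = a;
  p[1] = b;
  p[2] = c;
  p[3] = d;
}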
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:529
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1497
Loop::LoopBounds::Direction Direction
Definition: LoopInfo.cpp:230
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
#define T
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(VerifyEach)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI)
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleCostEstimator(TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:76
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1672
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:348
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:519
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:500
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:187
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:174
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:228
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:194
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:442
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:429
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:354
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:166
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:205
reverse_iterator rend()
Definition: BasicBlock.h:447
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:164
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:220
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1455
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2284
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:2179
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1703
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2421
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2278
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1648
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1561
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1639
unsigned arg_size() const
Definition: InstrTypes.h:1646
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1832
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2275
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:579
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:955
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1323
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:965
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:994
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:995
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:989
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:988
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:992
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:990
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:993
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:991
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:1128
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:1090
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:1066
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2126
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:153
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1449
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
static bool classof(const Value *V)
Definition: Constant.h:167
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:101
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
value_type & FindAndConstruct(const KeyT &Key)
Definition: DenseMap.h:348
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Type * getReturnType() const
Definition: DerivedTypes.h:124
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2240
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2443
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2248
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1806
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:460
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1110
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2518
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:305
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:220
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:842
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1752
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:480
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2349
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2380
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2232
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2477
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:465
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1660
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:169
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2256
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2144
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2179
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1825
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2395
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1865
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1581
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1355
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:630
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2649
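The IRBuilder calls listed above (SetInsertPoint, CreateAlignedLoad, CreateShuffleVector, the compare/select creators, ...) are the instruction-emission API used when a vectorizable tree is turned into vector IR. A minimal sketch of the style; the function name and the "load, then splat lane 0" shape are illustrative only:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

// Load <4 x i32> from Ptr and broadcast lane 0 to all lanes.
static llvm::Value *loadAndSplat(llvm::IRBuilder<> &Builder, llvm::Value *Ptr) {
  auto *VecTy = llvm::FixedVectorType::get(Builder.getInt32Ty(), 4);
  llvm::Value *Vec =
      Builder.CreateAlignedLoad(VecTy, Ptr, llvm::MaybeAlign(), "vec.load");
  llvm::SmallVector<int, 4> SplatMask(4, 0);   // every lane reads element 0
  return Builder.CreateShuffleVector(Vec, SplatMask, "vec.splat");
}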
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct member or array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:259
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:731
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:453
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:256
const BasicBlock * getParent() const
Definition: Instruction.h:151
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:251
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:257
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
Value * getPointerOperand()
Definition: Instructions.h:280
bool isSimple() const
Definition: Instructions.h:272
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:236
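The LoadInst accessors above (isSimple, getAlign, getPointerOperand, getPointerAddressSpace) are the usual first-line filters before loads are considered for merging. A tiny illustrative check, not the pass's own legality logic:

#include "llvm/IR/Instructions.h"

// Only plain (non-volatile, non-atomic) loads in the default address space
// pass this illustrative filter.
static bool isPlainLoad(const llvm::LoadInst *LI) {
  return LI->isSimple() && LI->getPointerAddressSpace() == 0;
}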
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
Definition: MapVector.h:64
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
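MapVector (members above) pairs hash lookup with an insertion-ordered backing vector, which keeps downstream decisions deterministic. A short sketch of try_emplace/find; the helper name is made up:

#include <optional>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instruction.h"

// Position at which I first appears in Insts, if at all (hypothetical helper).
static std::optional<unsigned>
firstPosition(llvm::ArrayRef<llvm::Instruction *> Insts, llvm::Instruction *I) {
  llvm::MapVector<llvm::Instruction *, unsigned> Order;
  for (auto En : llvm::enumerate(Insts))
    Order.try_emplace(En.value(), unsigned(En.index()));  // first index wins
  auto It = Order.find(I);
  if (It == Order.end())
    return std::nullopt;
  return It->second;
}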
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
T & front() const
front - Get the first element.
Definition: ArrayRef.h:363
iterator end() const
Definition: ArrayRef.h:357
iterator begin() const
Definition: ArrayRef.h:356
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:376
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:155
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
static bool classof(const Value *V)
Methods for support type inquiry through isa, cast, and dyn_cast:
Definition: Constants.h:1427
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
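The ScalarEvolution entry points above (getSCEV, getMinusSCEV, getConstant, ...) are how pointer-distance questions get answered symbolically: take the SCEV of both addresses and subtract. A hedged sketch that is far simpler than the real consecutiveness checks; the helper name is made up:

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Value.h"

// True if SCEV can prove PtrA and PtrB compute the same address,
// i.e. PtrA - PtrB folds to the constant zero.
static bool haveSameAddress(llvm::ScalarEvolution &SE, llvm::Value *PtrA,
                            llvm::Value *PtrB) {
  const llvm::SCEV *A = SE.getSCEV(PtrA);
  const llvm::SCEV *B = SE.getSCEV(PtrB);
  return SE.getMinusSCEV(A, B)->isZero();   // isZero() as listed above
}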
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
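SetVector (entries above) provides set semantics with deterministic, insertion-ordered iteration. A minimal dedup sketch using the small variant; the helper name is made up:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"

// Deduplicate VL while preserving the order in which values first appear.
static llvm::SmallVector<llvm::Value *, 8>
uniqueInOrder(llvm::ArrayRef<llvm::Value *> VL) {
  llvm::SmallSetVector<llvm::Value *, 8> Seen;
  for (llvm::Value *V : VL)
    Seen.insert(V);                      // insert() ignores duplicates
  return {Seen.begin(), Seen.end()};     // iteration order == insertion order
}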
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
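The static ShuffleVectorInst predicates above classify an integer shuffle mask without creating any IR, which is how cheaper shuffle kinds are recognized before asking the cost model. A hypothetical classifier over a single-source mask (SK_Reverse is a real TTI::ShuffleKind even though it does not appear in this listing):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

// Map a single-source mask onto a coarse TTI shuffle kind (illustrative).
static llvm::TargetTransformInfo::ShuffleKind
classifySingleSourceMask(llvm::ArrayRef<int> Mask, int NumSrcElts) {
  using SVI = llvm::ShuffleVectorInst;
  if (SVI::isZeroEltSplatMask(Mask, NumSrcElts))
    return llvm::TargetTransformInfo::SK_Broadcast;
  if (SVI::isReverseMask(Mask, NumSrcElts))
    return llvm::TargetTransformInfo::SK_Reverse;
  return llvm::TargetTransformInfo::SK_PermuteSingleSrc;
}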
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
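SmallBitVector (above) is the usual per-lane bookkeeping structure; find_first/find_next let callers visit only the set bits. A short sketch; what the bits mean is left hypothetical:

#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallVector.h"

// Collect the indices of all set bits using find_first/find_next.
static llvm::SmallVector<unsigned, 8>
setBitIndices(const llvm::SmallBitVector &BV) {
  llvm::SmallVector<unsigned, 8> Idxs;
  for (int I = BV.find_first(); I != -1; I = BV.find_next(I))
    Idxs.push_back(I);
  return Idxs;
}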
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:366
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:981
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
Type * getPointerOperandType() const
Definition: Instructions.h:420
Value * getValueOperand()
Definition: Instructions.h:414
Value * getPointerOperand()
Definition: Instructions.h:417
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Returns the cost estimation for alternating opcode pattern that can be lowered to a single instructio...
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
OperandValueKind
Additional information about an operand's possible values.
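The TargetTransformInfo queries above are the cost-model half of the pass: a candidate is kept only when the vector-side costs (arithmetic, shuffles, memory ops, extracts) beat the summed scalar costs. A deliberately tiny sketch of that comparison, not the pass's own tree-costing logic; the helper name is made up:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/InstructionCost.h"

// Compare VF scalar adds against one <VF x ScalarTy> add (illustrative only;
// ignores shuffles, extracts, and user costs).
static bool vectorAddLooksProfitable(const llvm::TargetTransformInfo &TTI,
                                     llvm::Type *ScalarTy, unsigned VF) {
  auto CostKind = llvm::TargetTransformInfo::TCK_RecipThroughput;
  llvm::InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(llvm::Instruction::Add, ScalarTy, CostKind) * VF;
  auto *VecTy = llvm::FixedVectorType::get(ScalarTy, VF);
  llvm::InstructionCost VecCost =
      TTI.getArithmeticInstrCost(llvm::Instruction::Add, VecTy, CostKind);
  return VecCost < ScalarCost;
}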
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:160
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getStructNumElements() const
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:287
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:166
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
static bool classof(const Value *V)
Methods for support type inquiry through isa, cast, and dyn_cast:
Definition: Constants.h:1385
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Definition: User.h:73
op_iterator op_begin()
Definition: User.h:234
Value * getOperand(unsigned i) const
Definition: User.h:169
iterator_range< value_op_iterator > operand_values()
Definition: User.h:266
The Vector Function Database.
Definition: VectorUtils.h:29
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Definition: Type.cpp:683
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:74
self_iterator getIterator()
Definition: ilist_node.h:109
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:690
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isLoadCombineCandidate() const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail)
Evaluate each pair in Candidates and return the index into Candidates for the pair which has the highest score...
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
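The BoUpSLP members above are, in essence, the per-bundle driver sequence: build the tree, bail out if it is too small, reorder it, record external uses, shrink value sizes, cost it, and only then vectorize. The sketch below is schematic only: BoUpSLP is defined inside SLPVectorizer.cpp itself, so this would have to live in that file, and the helper name and the zero-cost threshold are hypothetical (the real pass compares against a command-line threshold):

// Schematic call order, assuming R is an already-constructed BoUpSLP and
// Chain is a bundle of isomorphic scalars.
static bool tryVectorizeBundle(BoUpSLP &R, ArrayRef<Value *> Chain) {
  R.buildTree(Chain);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  InstructionCost Cost = R.getTreeCost();
  if (Cost >= 0)          // hypothetical threshold: demand a strict gain
    return false;
  R.vectorizeTree();
  return true;
}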
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:103
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1451
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:765
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:821
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:163
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:294
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:234
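The PatternMatch helpers above (m_Add, m_ZExt, m_Load, m_Value, m_Specific, ...) express IR shapes declaratively and are consumed through match(). A small sketch; the predicate name is made up:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// True if V is the sum of two zero-extended loads, i.e. zext(load) + zext(load).
static bool isAddOfZExtLoads(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  return match(V, m_Add(m_ZExt(m_Load(m_Value())),
                        m_ZExt(m_Load(m_Value()))));
}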
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
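getPointersDiff (above) returns the distance between two pointers in units of the element type when SCEV can prove it, and is the building block behind "are these accesses consecutive" checks. A minimal sketch; the helper name is made up:

#include <optional>
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"

// True if B reads the element immediately after A (distance of exactly one
// element of A's loaded type).
static bool areConsecutiveLoads(llvm::LoadInst *A, llvm::LoadInst *B,
                                const llvm::DataLayout &DL,
                                llvm::ScalarEvolution &SE) {
  std::optional<int> Diff =
      llvm::getPointersDiff(A->getType(), A->getPointerOperand(),
                            B->getType(), B->getPointerOperand(), DL, SE,
                            /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}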
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:456
void stable_sort(R &&Range)
Definition: STLExtras.h:2004
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1724
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1731
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:128
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:950
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1689
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:533
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
Definition: LoopUtils.cpp:1166
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2415
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:40
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:6966
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
LLVM_READONLY APFloat maximum(const APFloat &A, const APFloat &B)
Implements IEEE 754-2019 maximum semantics.
Definition: APFloat.h:1436
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:665
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:361
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2068
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1937
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE-754 2019 maximumNumber semantics.
Definition: APFloat.h:1410
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
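The integer helpers scattered through this list (divideCeil, PowerOf2Ceil, Log2_32, isPowerOf2_32, plus bit_floor/bit_ceil from llvm/ADT/bit.h) all deal with rounding counts to powers of two, which is how scalar element counts become legal vector factors. A few spot checks of what they evaluate to:

#include <cassert>
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"

// Worked values for the arithmetic helpers listed above.
static void mathHelperSpotChecks() {
  assert(llvm::divideCeil(7, 4) == 2);     // ceil(7 / 4)
  assert(llvm::PowerOf2Ceil(6) == 8);      // smallest power of two >= 6
  assert(llvm::Log2_32(32) == 5);          // floor(log2(32))
  assert(!llvm::isPowerOf2_32(6));
  assert(llvm::bit_floor(6u) == 4u);       // largest power of two <= 6
}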
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1656
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1763
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1745
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:418
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1235
constexpr int PoisonMaskElem
@ Other
Any other memory.
TargetTransformInfo TTI
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE-754 2019 minimumNumber semantics.
Definition: APFloat.h:1396
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1923
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Definition: STLExtras.h:1995
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1833
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1930
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1888
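The llvm range wrappers that dominate the tail of this list (all_of, any_of, none_of, count_if, find_if, is_contained, enumerate, reverse, ...) accept whole ranges instead of iterator pairs. A tiny sketch of the style over a plain vector of integers:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Range-based helpers: no explicit begin()/end() calls needed.
static bool rangeHelperExamples() {
  llvm::SmallVector<int, 4> Vals = {1, 2, 3, 4};
  bool AllPositive = llvm::all_of(Vals, [](int V) { return V > 0; });
  bool HasEven = llvm::any_of(Vals, [](int V) { return V % 2 == 0; });
  auto NumOdd = llvm::count_if(Vals, [](int V) { return V % 2 != 0; });
  return AllPositive && HasEven && NumOdd == 2 && llvm::is_contained(Vals, 3);
}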
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
InstructionCost Cost
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:428
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:613
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
LLVM_READONLY APFloat minimum(const APFloat &A, const APFloat &B)
Implements IEEE 754-2019 minimum semantics.
Definition: APFloat.h:1423
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:45
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2442
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane masks phi as machine instruction, incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:220
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1459
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1468
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const