1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
40//
41// Overview of the algorithm and terminology in this pass:
42//
43// - Break up each basic block into pseudo-BBs, composed of instructions which
44// are guaranteed to transfer control to their successors.
45// - Within a single pseudo-BB, find all loads, and group them into
46// "equivalence classes" according to getUnderlyingObject() and loaded
47// element size. Do the same for stores.
48// - For each equivalence class, greedily build "chains". Each chain has a
49// leader instruction, and every other member of the chain has a known
50// constant offset from the first instr in the chain.
51// - Break up chains so that they contain only contiguous accesses of legal
52// size with no intervening may-alias instrs.
53// - Convert each chain to vector instructions.
54//
55// The O(n^2) behavior of this pass comes from initially building the chains.
56// In the worst case we have to compare each new instruction to all of those
57// that came before. To limit this, we only calculate the offset to the leaders
58// of the N most recently-used chains.
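//
// A rough before/after sketch (hypothetical IR, not taken from any real test):
//
//   %p1 = getelementptr i32, ptr %p, i64 1
//   %a  = load i32, ptr %p
//   %b  = load i32, ptr %p1
//
// ideally becomes something like
//
//   %v = load <2 x i32>, ptr %p
//   %a = extractelement <2 x i32> %v, i32 0
//   %b = extractelement <2 x i32> %v, i32 1
//
// where the extracts are effectively free on the GPU targets described above.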
59
60#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/MapVector.h"
66#include "llvm/ADT/STLExtras.h"
67#include "llvm/ADT/Sequence.h"
70#include "llvm/ADT/Statistic.h"
79#include "llvm/IR/Attributes.h"
80#include "llvm/IR/BasicBlock.h"
82#include "llvm/IR/Constants.h"
83#include "llvm/IR/DataLayout.h"
85#include "llvm/IR/Dominators.h"
86#include "llvm/IR/Function.h"
88#include "llvm/IR/IRBuilder.h"
89#include "llvm/IR/InstrTypes.h"
90#include "llvm/IR/Instruction.h"
92#include "llvm/IR/LLVMContext.h"
93#include "llvm/IR/Module.h"
94#include "llvm/IR/Type.h"
95#include "llvm/IR/Value.h"
97#include "llvm/Pass.h"
100#include "llvm/Support/Debug.h"
103#include "llvm/Support/ModRef.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <numeric>
112#include <optional>
113#include <tuple>
114#include <type_traits>
115#include <utility>
116#include <vector>
117
118using namespace llvm;
119
120#define DEBUG_TYPE "load-store-vectorizer"
121
122STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125namespace {
126
127// Equivalence class key, the initial tuple by which we group loads/stores.
128// Loads/stores with different EqClassKeys are never merged.
129//
130// (We could in theory remove element-size from this tuple. We'd just need
131// to fix up the vector packing/unpacking code.)
132using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
138[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
139 const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144}
145
146// A Chain is a set of instructions such that:
147// - All instructions have the same equivalence class, so in particular all are
148// loads, or all are stores.
149// - We know the address accessed by the i'th chain elem relative to the
150// chain's leader instruction, which is the first instr of the chain in BB
151// order.
152//
153// Chains have two canonical orderings:
154// - BB order, sorted by Instr->comesBefore.
155// - Offset order, sorted by OffsetFromLeader.
156// This pass switches back and forth between these orders.
157struct ChainElem {
158 Instruction *Inst;
159 APInt OffsetFromLeader;
160};
161using Chain = SmallVector<ChainElem, 1>;
162
163void sortChainInBBOrder(Chain &C) {
164 sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
165}
166
167void sortChainInOffsetOrder(Chain &C) {
168 sort(C, [](const auto &A, const auto &B) {
169 if (A.OffsetFromLeader != B.OffsetFromLeader)
170 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
171 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
172 });
173}
174
175[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
176 for (const auto &E : C) {
177 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
178 }
179}
180
181using EquivalenceClassMap =
182 MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
183
184// FIXME: Assuming stack alignment of 4 is always good enough
185constexpr unsigned StackAdjustedAlignment = 4;
186
187Instruction *propagateMetadata(Instruction *I, const Chain &C) {
188 SmallVector<Value *, 8> Values;
189 for (const ChainElem &E : C)
190 Values.push_back(E.Inst);
191 return propagateMetadata(I, Values);
192}
193
194bool isInvariantLoad(const Instruction *I) {
195 const LoadInst *LI = dyn_cast<LoadInst>(I);
196 return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
197}
198
199/// Reorders the instructions that I depends on (the instructions defining its
200/// operands), to ensure they dominate I.
201void reorder(Instruction *I) {
202 SmallPtrSet<Instruction *, 16> InstructionsToMove;
203 SmallVector<Instruction *, 16> Worklist;
204
205 Worklist.push_back(I);
206 while (!Worklist.empty()) {
207 Instruction *IW = Worklist.pop_back_val();
208 int NumOperands = IW->getNumOperands();
209 for (int i = 0; i < NumOperands; i++) {
210 Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
211 if (!IM || IM->getOpcode() == Instruction::PHI)
212 continue;
213
214 // If IM is in another BB, no need to move it, because this pass only
215 // vectorizes instructions within one BB.
216 if (IM->getParent() != I->getParent())
217 continue;
218
219 if (!IM->comesBefore(I)) {
220 InstructionsToMove.insert(IM);
221 Worklist.push_back(IM);
222 }
223 }
224 }
225
226 // All instructions to move should follow I. Start from I, not from begin().
227 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
228 Instruction *IM = &*(BBI++);
229 if (!InstructionsToMove.count(IM))
230 continue;
231 IM->moveBefore(I);
232 }
233}
234
235class Vectorizer {
236 Function &F;
237 AliasAnalysis &AA;
238 AssumptionCache &AC;
239 DominatorTree &DT;
240 ScalarEvolution &SE;
241 TargetTransformInfo &TTI;
242 const DataLayout &DL;
243 IRBuilder<> Builder;
244
245 // We could erase instrs right after vectorizing them, but that can mess up
246 // our BB iterators, and also can make the equivalence class keys point to
247 // freed memory. This is fixable, but it's simpler just to wait until we're
248 // done with the BB and erase all at once.
249 SmallVector<Instruction *, 128> ToErase;
250
251public:
252 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
253 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
254 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
255 DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
256
257 bool run();
258
259private:
260 static const unsigned MaxDepth = 3;
261
262 /// Runs the vectorizer on a "pseudo basic block", which is a range of
263 /// instructions [Begin, End) within one BB all of which have
264 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
265 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
266
267 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
268 /// in the same BB with the same value for getUnderlyingObject() etc.
269 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
270 ArrayRef<Instruction *> EqClass);
271
272 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
273 /// where all instructions access a known, constant offset from the first
274 /// instruction.
275 bool runOnChain(Chain &C);
276
277 /// Splits the chain into subchains of instructions which read/write a
278 /// contiguous block of memory. Discards any length-1 subchains (because
279 /// there's nothing to vectorize in there).
280 std::vector<Chain> splitChainByContiguity(Chain &C);
281
282 /// Splits the chain into subchains where it's safe to hoist loads up to the
283 /// beginning of the sub-chain and it's safe to sink loads up to the end of
284 /// the sub-chain. Discards any length-1 subchains.
285 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
286
287 /// Splits the chain into subchains that make legal, aligned accesses.
288 /// Discards any length-1 subchains.
289 std::vector<Chain> splitChainByAlignment(Chain &C);
290
291 /// Converts the instrs in the chain into a single vectorized load or store.
292 /// Adds the old scalar loads/stores to ToErase.
293 bool vectorizeChain(Chain &C);
294
295 /// Tries to compute the offset in bytes PtrB - PtrA.
296 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
297 Instruction *ContextInst,
298 unsigned Depth = 0);
299 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
300 Instruction *ContextInst,
301 unsigned Depth);
302 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
303 Instruction *ContextInst,
304 unsigned Depth);
305
306 /// Gets the element type of the vector that the chain will load or store.
307 /// This is nontrivial because the chain may contain elements of different
308 /// types; e.g. it's legal to have a chain that contains both i32 and float.
309 Type *getChainElemTy(const Chain &C);
310
311 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
312 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
313 /// instructions.
314 ///
315 /// The map ChainElemOffsets must contain all of the elements in
316 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
317 /// address. It's ok if it contains additional entries.
318 template <bool IsLoadChain>
319 bool isSafeToMove(
320 Instruction *ChainElem, Instruction *ChainBegin,
321 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
322
323 /// Collects loads and stores grouped by "equivalence class", where:
324 /// - all elements in an eq class are a load or all are a store,
325 /// - they all load/store the same element size (it's OK to have e.g. i8 and
326 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
327 /// - they all have the same value for getUnderlyingObject().
328 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
329 BasicBlock::iterator End);
330
331 /// Partitions Instrs into "chains" where every instruction has a known
332 /// constant offset from the first instr in the chain.
333 ///
334 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
335 /// in the chain is the leader, and an instr touches distance 0 from itself.
336 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
337};
338
339class LoadStoreVectorizerLegacyPass : public FunctionPass {
340public:
341 static char ID;
342
343 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
344 initializeLoadStoreVectorizerLegacyPassPass(
345 *PassRegistry::getPassRegistry());
346 }
347
348 bool runOnFunction(Function &F) override;
349
350 StringRef getPassName() const override {
351 return "GPU Load and Store Vectorizer";
352 }
353
354 void getAnalysisUsage(AnalysisUsage &AU) const override {
355 AU.addRequired<AAResultsWrapperPass>();
356 AU.addRequired<AssumptionCacheTracker>();
357 AU.addRequired<ScalarEvolutionWrapperPass>();
358 AU.addRequired<DominatorTreeWrapperPass>();
359 AU.addRequired<TargetTransformInfoWrapperPass>();
360 AU.setPreservesCFG();
361 }
362};
363
364} // end anonymous namespace
365
366char LoadStoreVectorizerLegacyPass::ID = 0;
367
368INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
369 "Vectorize load and Store instructions", false, false)
370INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
371INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
372INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
373INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
374INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
375INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
376INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
377 "Vectorize load and store instructions", false, false)
378
379Pass *llvm::createLoadStoreVectorizerPass() {
380 return new LoadStoreVectorizerLegacyPass();
381}
382
383bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
384 // Don't vectorize when the attribute NoImplicitFloat is used.
385 if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
386 return false;
387
388 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
389 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
390 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
391 TargetTransformInfo &TTI =
392 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
393
394 AssumptionCache &AC =
395 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
396
397 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
398}
399
400PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
401 FunctionAnalysisManager &AM) {
402 // Don't vectorize when the attribute NoImplicitFloat is used.
403 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
404 return PreservedAnalyses::all();
405
406 AliasAnalysis &AA = AM.getResult<AAManager>(F);
407 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
408 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
409 ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
410 TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
411
412 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
413 PreservedAnalyses PA;
414 PA.preserveSet<CFGAnalyses>();
415 return Changed ? PA : PreservedAnalyses::all();
416}
417
418bool Vectorizer::run() {
419 bool Changed = false;
420 // Break up the BB if there are any instrs which aren't guaranteed to transfer
421 // execution to their successor.
422 //
423 // Consider, for example:
424 //
425 // def assert_arr_len(int n) { if (n < 2) exit(); }
426 //
427 // load arr[0]
428 // call assert_arr_len(arr.length)
429 // load arr[1]
430 //
431 // Even though assert_arr_len does not read or write any memory, we can't
432 // speculate the second load before the call. More info at
433 // https://github.com/llvm/llvm-project/issues/52950.
434 for (BasicBlock *BB : post_order(&F)) {
435 // BB must at least have a terminator.
436 assert(!BB->empty());
437
438 SmallVector<BasicBlock::iterator, 8> Barriers;
439 Barriers.push_back(BB->begin());
440 for (Instruction &I : *BB)
441 if (!isGuaranteedToTransferExecutionToSuccessor(&I))
442 Barriers.push_back(I.getIterator());
443 Barriers.push_back(BB->end());
444
445 for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
446 ++It)
447 Changed |= runOnPseudoBB(*It, *std::next(It));
448
449 for (Instruction *I : ToErase) {
450 auto *PtrOperand = getLoadStorePointerOperand(I);
451 if (I->use_empty())
452 I->eraseFromParent();
453 RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
454 }
455 ToErase.clear();
456 }
457
458 return Changed;
459}
460
461bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
462 BasicBlock::iterator End) {
463 LLVM_DEBUG({
464 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
465 if (End != Begin->getParent()->end())
466 dbgs() << *End;
467 else
468 dbgs() << "<BB end>";
469 dbgs() << ")\n";
470 });
471
472 bool Changed = false;
473 for (const auto &[EqClassKey, EqClass] :
474 collectEquivalenceClasses(Begin, End))
475 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
476
477 return Changed;
478}
479
480bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
481 ArrayRef<Instruction *> EqClass) {
482 bool Changed = false;
483
484 LLVM_DEBUG({
485 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
486 << " keyed on " << EqClassKey << ":\n";
487 for (Instruction *I : EqClass)
488 dbgs() << " " << *I << "\n";
489 });
490
491 std::vector<Chain> Chains = gatherChains(EqClass);
492 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
493 << " nontrivial chains.\n";);
494 for (Chain &C : Chains)
495 Changed |= runOnChain(C);
496 return Changed;
497}
498
499bool Vectorizer::runOnChain(Chain &C) {
500 LLVM_DEBUG({
501 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
502 dumpChain(C);
503 });
504
505 // Split up the chain into increasingly smaller chains, until we can finally
506 // vectorize the chains.
507 //
508 // (Don't be scared by the depth of the loop nest here. These operations are
509 // all at worst O(n lg n) in the number of instructions, and splitting chains
510 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
511 bool Changed = false;
512 for (auto &C : splitChainByMayAliasInstrs(C))
513 for (auto &C : splitChainByContiguity(C))
514 for (auto &C : splitChainByAlignment(C))
515 Changed |= vectorizeChain(C);
516 return Changed;
517}
518
519std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
520 if (C.empty())
521 return {};
522
523 sortChainInBBOrder(C);
524
525 LLVM_DEBUG({
526 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
527 dumpChain(C);
528 });
529
530 // We know that elements in the chain with non-overlapping offsets can't
531 // alias, but AA may not be smart enough to figure this out. Use a
532 // hashtable.
533 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
534 for (const auto &E : C)
535 ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
536
537 // Loads get hoisted up to the first load in the chain. Stores get sunk
538 // down to the last store in the chain. Our algorithm for loads is:
539 //
540 // - Take the first element of the chain. This is the start of a new chain.
541 // - Take the next element of `Chain` and check for may-alias instructions
542 // up to the start of NewChain. If no may-alias instrs, add it to
543 // NewChain. Otherwise, start a new NewChain.
544 //
545 // For stores it's the same except in the reverse direction.
546 //
547 // We expect IsLoad to be an std::bool_constant.
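  //
  // As a hypothetical illustration: given the load chain [A(+0), B(+4), C(+8)]
  // with a may-alias store sitting between B and C in BB order, we emit [A, B]
  // as one subchain and drop C, since the length-1 remainder [C] is discarded.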
548 auto Impl = [&](auto IsLoad) {
549 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
550 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
551 if constexpr (IsLoad())
552 return std::make_pair(C.begin(), C.end());
553 else
554 return std::make_pair(C.rbegin(), C.rend());
555 }(IsLoad);
556 assert(ChainBegin != ChainEnd);
557
558 std::vector<Chain> Chains;
559 SmallVector<ChainElem, 1> NewChain;
560 NewChain.push_back(*ChainBegin);
561 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
562 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
563 ChainOffsets)) {
564 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
565 << *ChainIt->Inst << " into " << *ChainBegin->Inst
566 << "\n");
567 NewChain.push_back(*ChainIt);
568 } else {
569 LLVM_DEBUG(
570 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
571 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
572 if (NewChain.size() > 1) {
573 LLVM_DEBUG({
574 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
575 dumpChain(NewChain);
576 });
577 Chains.push_back(std::move(NewChain));
578 }
579
580 // Start a new chain.
581 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
582 }
583 }
584 if (NewChain.size() > 1) {
585 LLVM_DEBUG({
586 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
587 dumpChain(NewChain);
588 });
589 Chains.push_back(std::move(NewChain));
590 }
591 return Chains;
592 };
593
594 if (isa<LoadInst>(C[0].Inst))
595 return Impl(/*IsLoad=*/std::bool_constant<true>());
596
597 assert(isa<StoreInst>(C[0].Inst));
598 return Impl(/*IsLoad=*/std::bool_constant<false>());
599}
600
601std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
602 if (C.empty())
603 return {};
604
605 sortChainInOffsetOrder(C);
606
607 LLVM_DEBUG({
608 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
609 dumpChain(C);
610 });
611
612 std::vector<Chain> Ret;
613 Ret.push_back({C.front()});
614
615 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
616 // `Prev` accesses offsets [Prev.OffsetFromLeader, PrevReadEnd).
617 auto &CurChain = Ret.back();
618 const ChainElem &Prev = CurChain.back();
619 unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
620 assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
621 "collectEquivalenceClass");
622 APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
623
624 // Add this instruction to the end of the current chain, or start a new one.
625 bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
626 LLVM_DEBUG(dbgs() << "LSV: Instructions are "
627 << (AreContiguous ? "" : "not ") << "contiguous: "
628 << *Prev.Inst << " (ends at offset " << PrevReadEnd
629 << ") -> " << *It->Inst << " (starts at offset "
630 << It->OffsetFromLeader << ")\n");
631 if (AreContiguous)
632 CurChain.push_back(*It);
633 else
634 Ret.push_back({*It});
635 }
636
637 // Filter out length-1 chains, these are uninteresting.
638 llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
639 return Ret;
640}
641
642Type *Vectorizer::getChainElemTy(const Chain &C) {
643 assert(!C.empty());
644 // The rules are:
645 // - If there are any pointer types in the chain, use an integer type.
646 // - Prefer an integer type if it appears in the chain.
647 // - Otherwise, use the first type in the chain.
648 //
649 // The rule about pointer types is a simplification when we merge e.g. a load
650 // of a ptr and a double. There's no direct conversion from a ptr to a
651 // double; it requires a ptrtoint followed by a bitcast.
652 //
653 // It's unclear to me if the other rules have any practical effect, but we do
654 // it to match this pass's previous behavior.
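  //
  // Illustrative examples (not exhaustive): a 32-bit chain of {float, i32}
  // picks i32; a chain containing a ptr on a 64-bit target picks i64; a chain
  // of only floats keeps float.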
655 if (any_of(C, [](const ChainElem &E) {
656 return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
657 })) {
658 return Type::getIntNTy(
659 F.getContext(),
660 DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
661 }
662
663 for (const ChainElem &E : C)
664 if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
665 return T;
666 return getLoadStoreType(C[0].Inst)->getScalarType();
667}
668
669std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
670 // We use a simple greedy algorithm.
671 // - Given a chain of length N, find all prefixes that
672 // (a) are not longer than the max register length, and
673 // (b) are a power of 2.
674 // - Starting from the longest prefix, try to create a vector of that length.
675 // - If one of them works, great. Repeat the algorithm on any remaining
676 // elements in the chain.
677 // - If none of them work, discard the first element and repeat on a chain
678 // of length N-1.
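  //
  // Hypothetical example: with a 16-byte vector register and a chain of four
  // contiguous i32 accesses, we first try the full 16-byte candidate [0, 3];
  // if its alignment or legality checks fail, we fall back to shorter
  // candidates such as [0, 1], and the scan resumes after whatever succeeds.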
679 if (C.empty())
680 return {};
681
682 sortChainInOffsetOrder(C);
683
684 LLVM_DEBUG({
685 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
686 dumpChain(C);
687 });
688
689 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
690 auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
691 unsigned ChainSizeBytes, VectorType *VecTy) {
692 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
693 ChainSizeBytes, VecTy)
694 : TTI.getStoreVectorFactor(VF, LoadStoreSize,
695 ChainSizeBytes, VecTy);
696 };
697
698#ifndef NDEBUG
699 for (const auto &E : C) {
700 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
701 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
702 "Should have filtered out non-power-of-two elements in "
703 "collectEquivalenceClasses.");
704 }
705#endif
706
707 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
708 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
709
710 std::vector<Chain> Ret;
711 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
712 // Find candidate chains of size not greater than the largest vector reg.
713 // These chains are over the closed interval [CBegin, CEnd].
714 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
715 CandidateChains;
716 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
717 APInt Sz = C[CEnd].OffsetFromLeader +
718 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
719 C[CBegin].OffsetFromLeader;
720 if (Sz.sgt(VecRegBytes))
721 break;
722 CandidateChains.push_back(
723 {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});
724 }
725
726 // Consider the longest chain first.
727 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
728 It != End; ++It) {
729 auto [CEnd, SizeBytes] = *It;
730 LLVM_DEBUG(
731 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
732 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
733
734 Type *VecElemTy = getChainElemTy(C);
735 // Note, VecElemTy is a power of 2, but might be less than one byte. For
736 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
737 // VecElemTy would be i4.
738 unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
739
740 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
741 assert((8 * SizeBytes) % VecElemBits == 0);
742 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
743 FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
744 unsigned VF = 8 * VecRegBytes / VecElemBits;
745
746 // Check that TTI is happy with this vectorization factor.
747 unsigned TargetVF = getVectorFactor(VF, VecElemBits,
748 VecElemBits * NumVecElems / 8, VecTy);
749 if (TargetVF != VF && TargetVF < NumVecElems) {
750 LLVM_DEBUG(
751 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
752 "because TargetVF="
753 << TargetVF << " != VF=" << VF
754 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
755 continue;
756 }
757
758 // Is a load/store with this alignment allowed by TTI and at least as fast
759 // as an unvectorized load/store?
760 //
761 // TTI and F are passed as explicit captures to work around an MSVC misparse (??).
762 auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
763 &F = F](Align Alignment) {
764 if (Alignment.value() % SizeBytes == 0)
765 return true;
766 unsigned VectorizedSpeed = 0;
767 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
768 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
769 if (!AllowsMisaligned) {
770 LLVM_DEBUG(dbgs()
771 << "LSV: Access of " << SizeBytes << "B in addrspace "
772 << AS << " with alignment " << Alignment.value()
773 << " is misaligned, and therefore can't be vectorized.\n");
774 return false;
775 }
776
777 unsigned ElementwiseSpeed = 0;
778 (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
779 Alignment, &ElementwiseSpeed);
780 if (VectorizedSpeed < ElementwiseSpeed) {
781 LLVM_DEBUG(dbgs()
782 << "LSV: Access of " << SizeBytes << "B in addrspace "
783 << AS << " with alignment " << Alignment.value()
784 << " has relative speed " << VectorizedSpeed
785 << ", which is lower than the elementwise speed of "
786 << ElementwiseSpeed
787 << ". Therefore this access won't be vectorized.\n");
788 return false;
789 }
790 return true;
791 };
792
793 // If we're loading/storing from an alloca, align it if possible.
794 //
795 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
796 // tells us this is beneficial. This feels a bit odd, but it matches
797 // existing tests. This isn't *so* bad, because at most we align to 4
798 // bytes (current value of StackAdjustedAlignment).
799 //
800 // FIXME: We will upgrade the alignment of the alloca even if it turns out
801 // we can't vectorize for some other reason.
802 Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
803 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
804 isa<AllocaInst>(PtrOperand->stripPointerCasts());
805 Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
806 Align PrefAlign = Align(StackAdjustedAlignment);
807 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
808 IsAllowedAndFast(PrefAlign)) {
809 Align NewAlign = getOrEnforceKnownAlignment(
810 PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
811 if (NewAlign >= Alignment) {
812 LLVM_DEBUG(dbgs()
813 << "LSV: splitByChain upgrading alloca alignment from "
814 << Alignment.value() << " to " << NewAlign.value()
815 << "\n");
816 Alignment = NewAlign;
817 }
818 }
819
820 if (!IsAllowedAndFast(Alignment)) {
821 LLVM_DEBUG(
822 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
823 "because its alignment is not AllowedAndFast: "
824 << Alignment.value() << "\n");
825 continue;
826 }
827
828 if ((IsLoadChain &&
829 !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
830 (!IsLoadChain &&
831 !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
832 LLVM_DEBUG(
833 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
834 "because !isLegalToVectorizeLoad/StoreChain.");
835 continue;
836 }
837
838 // Hooray, we can vectorize this chain!
839 Chain &NewChain = Ret.emplace_back();
840 for (unsigned I = CBegin; I <= CEnd; ++I)
841 NewChain.push_back(C[I]);
842 CBegin = CEnd; // Skip over the instructions we've added to the chain.
843 break;
844 }
845 }
846 return Ret;
847}
848
849bool Vectorizer::vectorizeChain(Chain &C) {
850 if (C.size() < 2)
851 return false;
852
853 sortChainInOffsetOrder(C);
854
855 LLVM_DEBUG({
856 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
857 dumpChain(C);
858 });
859
860 Type *VecElemTy = getChainElemTy(C);
861 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
862 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
863 unsigned ChainBytes = std::accumulate(
864 C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
865 return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
866 });
867 assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
868 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
869 // than 1 byte (e.g. VecTy == <32 x i1>).
870 Type *VecTy = FixedVectorType::get(
871 VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
872
873 Align Alignment = getLoadStoreAlignment(C[0].Inst);
874 // If this is a load/store of an alloca, we might have upgraded the alloca's
875 // alignment earlier. Get the new alignment.
876 if (AS == DL.getAllocaAddrSpace()) {
877 Alignment = std::max(
878 Alignment,
879 getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
880 MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
881 }
882
883 // All elements of the chain must have the same scalar-type size.
884#ifndef NDEBUG
885 for (const ChainElem &E : C)
886 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
887 DL.getTypeStoreSize(VecElemTy));
888#endif
889
890 Instruction *VecInst;
891 if (IsLoadChain) {
892 // Loads get hoisted to the location of the first load in the chain. We may
893 // also need to hoist the (transitive) operands of the loads.
894 Builder.SetInsertPoint(
895 std::min_element(C.begin(), C.end(), [](const auto &A, const auto &B) {
896 return A.Inst->comesBefore(B.Inst);
897 })->Inst);
898
899 // Chain is in offset order, so C[0] is the instr with the lowest offset,
900 // i.e. the root of the vector.
901 VecInst = Builder.CreateAlignedLoad(VecTy,
902 getLoadStorePointerOperand(C[0].Inst),
903 Alignment);
904
905 unsigned VecIdx = 0;
906 for (const ChainElem &E : C) {
907 Instruction *I = E.Inst;
908 Value *V;
909 Type *T = getLoadStoreType(I);
910 if (auto *VT = dyn_cast<FixedVectorType>(T)) {
911 auto Mask = llvm::to_vector<8>(
912 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
913 V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
914 VecIdx += VT->getNumElements();
915 } else {
916 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
917 I->getName());
918 ++VecIdx;
919 }
920 if (V->getType() != I->getType())
921 V = Builder.CreateBitOrPointerCast(V, I->getType());
922 I->replaceAllUsesWith(V);
923 }
924
925 // Finally, we need to reorder the instrs in the BB so that the (transitive)
926 // operands of VecInst appear before it. To see why, suppose we have
927 // vectorized the following code:
928 //
929 // ptr1 = gep a, 1
930 // load1 = load i32 ptr1
931 // ptr0 = gep a, 0
932 // load0 = load i32 ptr0
933 //
934 // We will put the vectorized load at the location of the earliest load in
935 // the BB, i.e. load1. We get:
936 //
937 // ptr1 = gep a, 1
938 // loadv = load <2 x i32> ptr0
939 // load0 = extractelement loadv, 0
940 // load1 = extractelement loadv, 1
941 // ptr0 = gep a, 0
942 //
943 // Notice that loadv uses ptr0, which is defined *after* it!
944 reorder(VecInst);
945 } else {
946 // Stores get sunk to the location of the last store in the chain.
947 Builder.SetInsertPoint(
948 std::max_element(C.begin(), C.end(), [](auto &A, auto &B) {
949 return A.Inst->comesBefore(B.Inst);
950 })->Inst);
951
952 // Build the vector to store.
953 Value *Vec = PoisonValue::get(VecTy);
954 unsigned VecIdx = 0;
955 auto InsertElem = [&](Value *V) {
956 if (V->getType() != VecElemTy)
957 V = Builder.CreateBitOrPointerCast(V, VecElemTy);
958 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
959 };
960 for (const ChainElem &E : C) {
961 auto I = cast<StoreInst>(E.Inst);
962 if (FixedVectorType *VT =
963 dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
964 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
965 InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
966 Builder.getInt32(J)));
967 }
968 } else {
969 InsertElem(I->getValueOperand());
970 }
971 }
972
973 // Chain is in offset order, so C[0] is the instr with the lowest offset,
974 // i.e. the root of the vector.
975 VecInst = Builder.CreateAlignedStore(
976 Vec,
977 getLoadStorePointerOperand(C[0].Inst),
978 Alignment);
979 }
980
981 propagateMetadata(VecInst, C);
982
983 for (const ChainElem &E : C)
984 ToErase.push_back(E.Inst);
985
986 ++NumVectorInstructions;
987 NumScalarsVectorized += C.size();
988 return true;
989}
990
991template <bool IsLoadChain>
992bool Vectorizer::isSafeToMove(
993 Instruction *ChainElem, Instruction *ChainBegin,
994 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
995 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
996 << *ChainBegin << ")\n");
997
998 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
999 if (ChainElem == ChainBegin)
1000 return true;
1001
1002 // Invariant loads can always be reordered; by definition they are not
1003 // clobbered by stores.
1004 if (isInvariantLoad(ChainElem))
1005 return true;
1006
1007 auto BBIt = std::next([&] {
1008 if constexpr (IsLoadChain)
1009 return BasicBlock::reverse_iterator(ChainElem);
1010 else
1011 return BasicBlock::iterator(ChainElem);
1012 }());
1013 auto BBItEnd = std::next([&] {
1014 if constexpr (IsLoadChain)
1015 return BasicBlock::reverse_iterator(ChainBegin);
1016 else
1017 return BasicBlock::iterator(ChainBegin);
1018 }());
1019
1020 const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1021 const unsigned ChainElemSize =
1022 DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1023
1024 for (; BBIt != BBItEnd; ++BBIt) {
1025 Instruction *I = &*BBIt;
1026
1027 if (!I->mayReadOrWriteMemory())
1028 continue;
1029
1030 // Loads can be reordered with other loads.
1031 if (IsLoadChain && isa<LoadInst>(I))
1032 continue;
1033
1034 // Stores can be sunk below invariant loads.
1035 if (!IsLoadChain && isInvariantLoad(I))
1036 continue;
1037
1038 // If I is in the chain, we can tell whether it aliases ChainIt by checking
1039 // what offset ChainIt accesses. This may be better than AA is able to do.
1040 //
1041 // We should really only have duplicate offsets for stores (the duplicate
1042 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1043 // split the chain so we don't have to handle this case specially.
1044 if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1045 // I and ChainElem overlap if:
1046 // - I and ChainElem have the same offset, OR
1047 // - I's offset is less than ChainElem's, but I touches past the
1048 // beginning of ChainElem, OR
1049 // - ChainElem's offset is less than I's, but ChainElem touches past the
1050 // beginning of I.
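      // For instance (hypothetical offsets/sizes): an i32 store at offset 4
      // overlaps an i32 ChainElem at offset 6 (4 + 4 > 6), but not one at
      // offset 8.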
1051 const APInt &IOffset = OffsetIt->second;
1052 unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1053 if (IOffset == ChainElemOffset ||
1054 (IOffset.sle(ChainElemOffset) &&
1055 (IOffset + IElemSize).sgt(ChainElemOffset)) ||
1056 (ChainElemOffset.sle(IOffset) &&
1057 (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1058 LLVM_DEBUG({
1059 // Double check that AA also sees this alias. If not, we probably
1060 // have a bug.
1061 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1062 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1063 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1064 });
1065 return false; // We found an aliasing instruction; bail.
1066 }
1067
1068 continue; // We're confident there's no alias.
1069 }
1070
1071 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1072 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1073 if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
1074 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1075 << " Aliasing instruction:\n"
1076 << " " << *I << '\n'
1077 << " Aliased instruction and pointer:\n"
1078 << " " << *ChainElem << '\n'
1079 << " " << *getLoadStorePointerOperand(ChainElem)
1080 << '\n');
1081
1082 return false;
1083 }
1084 }
1085 return true;
1086}
1087
1088static bool checkNoWrapFlags(Instruction *I, bool Signed) {
1089 BinaryOperator *BinOpI = cast<BinaryOperator>(I);
1090 return (Signed && BinOpI->hasNoSignedWrap()) ||
1091 (!Signed && BinOpI->hasNoUnsignedWrap());
1092}
1093
1094static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1095 unsigned MatchingOpIdxA, Instruction *AddOpB,
1096 unsigned MatchingOpIdxB, bool Signed) {
1097 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1098 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1099 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1100 << ", MatchingOpIdxB=" << MatchingOpIdxB
1101 << ", Signed=" << Signed << "\n");
1102 // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1103 // being the same, we can guarantee that the transformation is safe if we can
1104 // prove that OpA won't overflow when Ret added to the other operand of OpA.
1105 // For example:
1106 // %tmp7 = add nsw i32 %tmp2, %v0
1107 // %tmp8 = sext i32 %tmp7 to i64
1108 // ...
1109 // %tmp11 = add nsw i32 %v0, 1
1110 // %tmp12 = add nsw i32 %tmp2, %tmp11
1111 // %tmp13 = sext i32 %tmp12 to i64
1112 //
1113 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1114 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1115 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1116 assert(AddOpA->getOpcode() == Instruction::Add &&
1117 AddOpB->getOpcode() == Instruction::Add &&
1118 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1119 if (AddOpA->getOperand(MatchingOpIdxA) ==
1120 AddOpB->getOperand(MatchingOpIdxB)) {
1121 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1122 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1123 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1124 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1125 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1126 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1127 checkNoWrapFlags(OtherInstrB, Signed) &&
1128 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1129 int64_t CstVal =
1130 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1131 if (OtherInstrB->getOperand(0) == OtherOperandA &&
1132 IdxDiff.getSExtValue() == CstVal)
1133 return true;
1134 }
1135 // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
1136 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1137 checkNoWrapFlags(OtherInstrA, Signed) &&
1138 isa<ConstantInt>(OtherInstrA->getOperand(1))) {
1139 int64_t CstVal =
1140 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1141 if (OtherInstrA->getOperand(0) == OtherOperandB &&
1142 IdxDiff.getSExtValue() == -CstVal)
1143 return true;
1144 }
1145 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1146 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1147 if (OtherInstrA && OtherInstrB &&
1148 OtherInstrA->getOpcode() == Instruction::Add &&
1149 OtherInstrB->getOpcode() == Instruction::Add &&
1150 checkNoWrapFlags(OtherInstrA, Signed) &&
1151 checkNoWrapFlags(OtherInstrB, Signed) &&
1152 isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
1153 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1154 int64_t CstValA =
1155 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1156 int64_t CstValB =
1157 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1158 if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
1159 IdxDiff.getSExtValue() == (CstValB - CstValA))
1160 return true;
1161 }
1162 }
1163 return false;
1164}
1165
1166std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1167 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1168 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1169 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1170 << " Depth=" << Depth << "\n");
1171 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1172 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
1173 if (!GEPA || !GEPB)
1174 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1175
1176 // Look through GEPs after checking they're the same except for the last
1177 // index.
1178 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1179 GEPA->getPointerOperand() != GEPB->getPointerOperand())
1180 return std::nullopt;
1181 gep_type_iterator GTIA = gep_type_begin(GEPA);
1182 gep_type_iterator GTIB = gep_type_begin(GEPB);
1183 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1184 if (GTIA.getOperand() != GTIB.getOperand())
1185 return std::nullopt;
1186 ++GTIA;
1187 ++GTIB;
1188 }
1189
1190 Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
1191 Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
1192 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1193 OpA->getType() != OpB->getType())
1194 return std::nullopt;
1195
1196 uint64_t Stride = GTIA.getSequentialElementStride(DL);
1197
1198 // Only look through a ZExt/SExt.
1199 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1200 return std::nullopt;
1201
1202 bool Signed = isa<SExtInst>(OpA);
1203
1204 // At this point A could be a function parameter, i.e. not an instruction
1205 Value *ValA = OpA->getOperand(0);
1206 OpB = dyn_cast<Instruction>(OpB->getOperand(0));
1207 if (!OpB || ValA->getType() != OpB->getType())
1208 return std::nullopt;
1209
1210 const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
1211 const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
1212 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1213 if (IdxDiffSCEV == SE.getCouldNotCompute())
1214 return std::nullopt;
1215
1216 ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
1217 if (!IdxDiffRange.isSingleElement())
1218 return std::nullopt;
1219 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1220
1221 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1222 << "\n");
1223
1224 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1225 bool Safe = false;
1226
1227 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1228 // ValA, we're okay.
1229 if (OpB->getOpcode() == Instruction::Add &&
1230 isa<ConstantInt>(OpB->getOperand(1)) &&
1231 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
1232 checkNoWrapFlags(OpB, Signed))
1233 Safe = true;
1234
1235 // Second attempt: check if we have eligible add NSW/NUW instruction
1236 // sequences.
1237 OpA = dyn_cast<Instruction>(ValA);
1238 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1239 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
1240 checkNoWrapFlags(OpB, Signed)) {
1241 // In the checks below a matching operand in OpA and OpB is an operand which
1242 // is the same in those two instructions. Below we account for possible
1243 // orders of the operands of these add instructions.
1244 for (unsigned MatchingOpIdxA : {0, 1})
1245 for (unsigned MatchingOpIdxB : {0, 1})
1246 if (!Safe)
1247 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
1248 MatchingOpIdxB, Signed);
1249 }
1250
1251 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1252
1253 // Third attempt:
1254 //
1255 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1256 // order bit other than the sign bit are known to be zero in ValA, we can add
1257 // Diff to it while guaranteeing no overflow of any sort.
1258 //
1259 // If IdxDiff is negative, do the same, but swap ValA and ValB.
1260 if (!Safe) {
1261 // When computing known bits, use the GEPs as context instructions, since
1262 // they likely are in the same BB as the load/store.
1263 KnownBits Known(BitWidth);
1264 computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
1265 ContextInst, &DT);
1266 APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
1267 if (Signed)
1268 BitsAllowedToBeSet.clearBit(BitWidth - 1);
1269 if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
1270 return std::nullopt;
1271 Safe = true;
1272 }
1273
1274 if (Safe)
1275 return IdxDiff * Stride;
1276 return std::nullopt;
1277}
1278
1279std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1280 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1281 if (Depth++ == MaxDepth)
1282 return std::nullopt;
1283
1284 if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1285 if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1286 if (SelectA->getCondition() != SelectB->getCondition())
1287 return std::nullopt;
1288 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1289 << ", PtrB=" << *PtrB << ", ContextInst="
1290 << *ContextInst << ", Depth=" << Depth << "\n");
1291 std::optional<APInt> TrueDiff = getConstantOffset(
1292 SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
1293 if (!TrueDiff.has_value())
1294 return std::nullopt;
1295 std::optional<APInt> FalseDiff =
1296 getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1297 ContextInst, Depth);
1298 if (TrueDiff == FalseDiff)
1299 return TrueDiff;
1300 }
1301 }
1302 return std::nullopt;
1303}
1304
1305EquivalenceClassMap
1306Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1307 BasicBlock::iterator End) {
1308 EquivalenceClassMap Ret;
1309
1310 auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {
1311 const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
1312 if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1313 // The selects themselves are distinct instructions even if they share
1314 // the same condition and evaluate to consecutive pointers for the true
1315 // and false values of the condition. Therefore using the selects
1316 // themselves for grouping instructions would put consecutive accesses
1317 // into different lists, so they wouldn't even be checked for being
1318 // consecutive, and wouldn't be vectorized.
1319 return Sel->getCondition();
1320 }
1321 return ObjPtr;
1322 };
1323
1324 for (Instruction &I : make_range(Begin, End)) {
1325 auto *LI = dyn_cast<LoadInst>(&I);
1326 auto *SI = dyn_cast<StoreInst>(&I);
1327 if (!LI && !SI)
1328 continue;
1329
1330 if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1331 continue;
1332
1333 if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1334 (SI && !TTI.isLegalToVectorizeStore(SI)))
1335 continue;
1336
1337 Type *Ty = getLoadStoreType(&I);
1338 if (!VectorType::isValidElementType(Ty->getScalarType()))
1339 continue;
1340
1341 // Skip weird non-byte sizes. They probably aren't worth the effort of
1342 // handling correctly.
1343 unsigned TySize = DL.getTypeSizeInBits(Ty);
1344 if ((TySize % 8) != 0)
1345 continue;
1346
1347 // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1348 // functions currently use an integer type for the vectorized
1349 // load/store, and do not support casting between that integer type and
1350 // a vector of pointers (e.g. i64 to <2 x i16*>).
1351 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1352 continue;
1353
1354 Value *Ptr = getLoadStorePointerOperand(&I);
1355 unsigned AS = Ptr->getType()->getPointerAddressSpace();
1356 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1357
1358 unsigned VF = VecRegSize / TySize;
1359 VectorType *VecTy = dyn_cast<VectorType>(Ty);
1360
1361 // Only handle power-of-two sized elements.
1362 if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
1363 (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
1364 continue;
1365
1366 // No point in looking at these if they're too big to vectorize.
1367 if (TySize > VecRegSize / 2 ||
1368 (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
1369 continue;
1370
1371 Ret[{getUnderlyingObject(Ptr), AS,
1372 DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
1373 /*IsLoad=*/LI != nullptr}]
1374 .push_back(&I);
1375 }
1376
1377 return Ret;
1378}
1379
1380std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1381 if (Instrs.empty())
1382 return {};
1383
1384 unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
1385 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1386
1387#ifndef NDEBUG
1388 // Check that Instrs is in BB order and all have the same addr space.
1389 for (size_t I = 1; I < Instrs.size(); ++I) {
1390 assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1391 assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1392 }
1393#endif
1394
1395 // Machinery to build an MRU-hashtable of Chains.
1396 //
1397 // (Ideally this could be done with MapVector, but as currently implemented,
1398 // moving an element to the front of a MapVector is O(n).)
1399 struct InstrListElem : ilist_node<InstrListElem>,
1400 std::pair<Instruction *, Chain> {
1401 explicit InstrListElem(Instruction *I)
1402 : std::pair<Instruction *, Chain>(I, {}) {}
1403 };
1404 struct InstrListElemDenseMapInfo {
1405 using PtrInfo = DenseMapInfo<InstrListElem *>;
1406 using IInfo = DenseMapInfo<Instruction *>;
1407 static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1408 static InstrListElem *getTombstoneKey() {
1409 return PtrInfo::getTombstoneKey();
1410 }
1411 static unsigned getHashValue(const InstrListElem *E) {
1412 return IInfo::getHashValue(E->first);
1413 }
1414 static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
1415 if (A == getEmptyKey() || B == getEmptyKey())
1416 return A == getEmptyKey() && B == getEmptyKey();
1417 if (A == getTombstoneKey() || B == getTombstoneKey())
1418 return A == getTombstoneKey() && B == getTombstoneKey();
1419 return IInfo::isEqual(A->first, B->first);
1420 }
1421 };
1422 SpecificBumpPtrAllocator<InstrListElem> Allocator;
1423 simple_ilist<InstrListElem> MRU;
1424 DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
1425
1426 // Compare each instruction in `instrs` to leader of the N most recently-used
1427 // chains. This limits the O(n^2) behavior of this pass while also allowing
1428 // us to build arbitrarily long chains.
1429 for (Instruction *I : Instrs) {
1430 constexpr int MaxChainsToTry = 64;
1431
1432 bool MatchFound = false;
1433 auto ChainIter = MRU.begin();
1434 for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1435 ++J, ++ChainIter) {
1436 std::optional<APInt> Offset = getConstantOffset(
1437 getLoadStorePointerOperand(ChainIter->first),
1438 getLoadStorePointerOperand(I),
1439 /*ContextInst=*/
1440 (ChainIter->first->comesBefore(I) ? I : ChainIter->first));
1441 if (Offset.has_value()) {
1442 // `Offset` might not have the expected number of bits, if e.g. AS has a
1443 // different number of bits than opaque pointers.
1444 ChainIter->second.push_back(ChainElem{I, Offset.value()});
1445 // Move ChainIter to the front of the MRU list.
1446 MRU.remove(*ChainIter);
1447 MRU.push_front(*ChainIter);
1448 MatchFound = true;
1449 break;
1450 }
1451 }
1452
1453 if (!MatchFound) {
1454 APInt ZeroOffset(ASPtrBits, 0);
1455 InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1456 E->second.push_back(ChainElem{I, ZeroOffset});
1457 MRU.push_front(*E);
1458 Chains.insert(E);
1459 }
1460 }
1461
1462 std::vector<Chain> Ret;
1463 Ret.reserve(Chains.size());
1464 // Iterate over MRU rather than Chains so the order is deterministic.
1465 for (auto &E : MRU)
1466 if (E.second.size() > 1)
1467 Ret.push_back(std::move(E.second));
1468 return Ret;
1469}
1470
1471std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1472 Instruction *ContextInst,
1473 unsigned Depth) {
1474 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1475 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
1476 << ", Depth=" << Depth << "\n");
1477 // We'll ultimately return a value of this bit width, even if computations
1478 // happen in a different width.
1479 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
1480 APInt OffsetA(OrigBitWidth, 0);
1481 APInt OffsetB(OrigBitWidth, 0);
1482 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1483 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1484 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
1485 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
1486 return std::nullopt;
1487
1488 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1489 // should properly handle a possible overflow and the value should fit into
1490 // the smallest data type used in the cast/gep chain.
1491 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1492 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1493
1494 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1495 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
1496 if (PtrA == PtrB)
1497 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1498
1499 // Try to compute B - A.
1500 const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
1501 if (DistScev != SE.getCouldNotCompute()) {
1502 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1503 ConstantRange DistRange = SE.getSignedRange(DistScev);
1504 if (DistRange.isSingleElement()) {
1505 // Handle index width (the width of Dist) != pointer width (the width of
1506 // the Offset*s at this point).
1507 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
1508 return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
1509 }
1510 }
1511 std::optional<APInt> Diff =
1512 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth);
1513 if (Diff.has_value())
1514 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1515 .sextOrTrunc(OrigBitWidth);
1516 return std::nullopt;
1517}
Definition: DataLayout.h:110
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:275
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:313
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
Legacy wrapper pass to provide the GlobalsAAResult object.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2649
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:339
const BasicBlock * getParent() const
Definition: Instruction.h:150
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:250
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
An instruction for reading from memory.
Definition: Instructions.h:184
bool isSimple() const
Definition: Instructions.h:272
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
Legacy wrapper pass to provide the SCEVAAResult object.
This class represents an analyzed expression in the program.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getCouldNotCompute()
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:382
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool isLegalToVectorizeLoad(LoadInst *LI) const
bool isLegalToVectorizeStore(StoreInst *SI) const
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace=0, Align Alignment=Align(1), unsigned *Fast=nullptr) const
Determine if the target supports unaligned memory accesses.
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:736
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:693
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Definition: Type.cpp:683
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
TypeSize getSequentialElementStride(const DataLayout &DL) const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A simple intrusive list implementation.
Definition: simple_ilist.h:81
void push_front(reference Node)
Insert a node at the front; never copies.
Definition: simple_ilist.h:150
void remove(reference N)
Remove a node by reference; never deletes.
Definition: simple_ilist.h:189
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:456
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:533
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
Align getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to ensure that the alignment of V is at least PrefAlign bytes.
Definition: Local.cpp:1536
bool isModSet(const ModRefInfo MRI)
Definition: ModRef.h:48
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1656
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
Align getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
TargetTransformInfo TTI
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:293
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
gep_type_iterator gep_type_begin(const User *GEP)
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2031
void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &)
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:50
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117