LoadStoreVectorizer.cpp
1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
40//
41// Overview of the algorithm and terminology in this pass:
42//
43// - Break up each basic block into pseudo-BBs, composed of instructions which
44// are guaranteed to transfer control to their successors.
45// - Within a single pseudo-BB, find all loads, and group them into
46// "equivalence classes" according to getUnderlyingObject() and loaded
47// element size. Do the same for stores.
48// - For each equivalence class, greedily build "chains". Each chain has a
49// leader instruction, and every other member of the chain has a known
50// constant offset from the first instr in the chain.
51// - Break up chains so that they contain only contiguous accesses of legal
52// size with no intervening may-alias instrs.
53// - Convert each chain to vector instructions.
54//
55// The O(n^2) behavior of this pass comes from initially building the chains.
56// In the worst case we have to compare each new instruction to all of those
57// that came before. To limit this, we only calculate the offset to the leaders
58// of the N most recently-used chains.
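//
// Illustrative sketch of the transformation (not taken from this file's
// tests; the names are made up). Given two contiguous i32 loads:
//
//   %p0 = getelementptr i32, ptr %p, i64 0
//   %v0 = load i32, ptr %p0, align 8
//   %p1 = getelementptr i32, ptr %p, i64 1
//   %v1 = load i32, ptr %p1, align 4
//
// the pass emits roughly:
//
//   %vec = load <2 x i32>, ptr %p0, align 8
//   %v0  = extractelement <2 x i32> %vec, i32 0
//   %v1  = extractelement <2 x i32> %vec, i32 1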
59
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/MapVector.h"
66#include "llvm/ADT/STLExtras.h"
67#include "llvm/ADT/Sequence.h"
70#include "llvm/ADT/Statistic.h"
79#include "llvm/IR/Attributes.h"
80#include "llvm/IR/BasicBlock.h"
82#include "llvm/IR/Constants.h"
83#include "llvm/IR/DataLayout.h"
85#include "llvm/IR/Dominators.h"
86#include "llvm/IR/Function.h"
88#include "llvm/IR/IRBuilder.h"
89#include "llvm/IR/InstrTypes.h"
90#include "llvm/IR/Instruction.h"
92#include "llvm/IR/LLVMContext.h"
93#include "llvm/IR/Module.h"
94#include "llvm/IR/Type.h"
95#include "llvm/IR/Value.h"
97#include "llvm/Pass.h"
100#include "llvm/Support/Debug.h"
103#include "llvm/Support/ModRef.h"
107#include <algorithm>
108#include <cassert>
109#include <cstdint>
110#include <cstdlib>
111#include <iterator>
112#include <limits>
113#include <numeric>
114#include <optional>
115#include <tuple>
116#include <type_traits>
117#include <utility>
118#include <vector>
119
120using namespace llvm;
121
122#define DEBUG_TYPE "load-store-vectorizer"
123
124STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
125STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
126
127namespace {
128
129// Equivalence class key, the initial tuple by which we group loads/stores.
130// Loads/stores with different EqClassKeys are never merged.
131//
132// (We could in theory remove element-size from this tuple. We'd just need
133// to fix up the vector packing/unpacking code.)
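//
// For example (illustrative, not from a test): a load of i8 and a load of
// <4 x i8> from the same underlying object and address space share a key
// (element size 8 bits), while a load of i32 from the same object gets a
// different key (element size 32 bits).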
134using EqClassKey =
135 std::tuple<const Value * /* result of getUnderlyingObject() */,
136 unsigned /* AddrSpace */,
137 unsigned /* Load/Store element size bits */,
138 char /* IsLoad; char b/c bool can't be a DenseMap key */
139 >;
140[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
141                                               const EqClassKey &K) {
142 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
143 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
144 << " of element size " << ElementSize << " bits in addrspace "
145 << AddrSpace;
146}
147
148// A Chain is a set of instructions such that:
149// - All instructions have the same equivalence class, so in particular all are
150// loads, or all are stores.
151// - We know the address accessed by the i'th chain elem relative to the
152// chain's leader instruction, which is the first instr of the chain in BB
153// order.
154//
155// Chains have two canonical orderings:
156// - BB order, sorted by Instr->comesBefore.
157// - Offset order, sorted by OffsetFromLeader.
158// This pass switches back and forth between these orders.
159struct ChainElem {
160 Instruction *Inst;
161 APInt OffsetFromLeader;
162};
163using Chain = SmallVector<ChainElem, 1>;
164
165void sortChainInBBOrder(Chain &C) {
166 sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
167}
168
169void sortChainInOffsetOrder(Chain &C) {
170 sort(C, [](const auto &A, const auto &B) {
171 if (A.OffsetFromLeader != B.OffsetFromLeader)
172 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
173 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
174 });
175}
176
177[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
178 for (const auto &E : C) {
179 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
180 }
181}
182
183using EquivalenceClassMap =
184    MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
185
186// FIXME: Assuming stack alignment of 4 is always good enough
187constexpr unsigned StackAdjustedAlignment = 4;
188
189Instruction *propagateMetadata(Instruction *I, const Chain &C) {
190  SmallVector<Value *, 8> Values;
191  for (const ChainElem &E : C)
192 Values.push_back(E.Inst);
193 return propagateMetadata(I, Values);
194}
195
196bool isInvariantLoad(const Instruction *I) {
197 const LoadInst *LI = dyn_cast<LoadInst>(I);
198 return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
199}
200
201/// Reorders the instructions that I depends on (the instructions defining its
202/// operands), to ensure they dominate I.
203void reorder(Instruction *I) {
204 SmallPtrSet<Instruction *, 16> InstructionsToMove;
205  SmallVector<Instruction *, 16> Worklist;
206
207 Worklist.push_back(I);
208 while (!Worklist.empty()) {
209 Instruction *IW = Worklist.pop_back_val();
210 int NumOperands = IW->getNumOperands();
211 for (int i = 0; i < NumOperands; i++) {
212 Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
213 if (!IM || IM->getOpcode() == Instruction::PHI)
214 continue;
215
216 // If IM is in another BB, no need to move it, because this pass only
217 // vectorizes instructions within one BB.
218 if (IM->getParent() != I->getParent())
219 continue;
220
221 if (!IM->comesBefore(I)) {
222 InstructionsToMove.insert(IM);
223 Worklist.push_back(IM);
224 }
225 }
226 }
227
228 // All instructions to move should follow I. Start from I, not from begin().
229 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
230 Instruction *IM = &*(BBI++);
231 if (!InstructionsToMove.count(IM))
232 continue;
233 IM->moveBefore(I);
234 }
235}
236
237class Vectorizer {
238 Function &F;
239 AliasAnalysis &AA;
240 AssumptionCache &AC;
241 DominatorTree &DT;
242 ScalarEvolution &SE;
243  TargetTransformInfo &TTI;
244  const DataLayout &DL;
245  IRBuilder<> Builder;
246
247 // We could erase instrs right after vectorizing them, but that can mess up
248 // our BB iterators, and also can make the equivalence class keys point to
249 // freed memory. This is fixable, but it's simpler just to wait until we're
250 // done with the BB and erase all at once.
251  SmallVector<Instruction *, 128> ToErase;
252
253public:
254 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
255             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
256      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
257 DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
258
259 bool run();
260
261private:
262 static const unsigned MaxDepth = 3;
263
264 /// Runs the vectorizer on a "pseudo basic block", which is a range of
265 /// instructions [Begin, End) within one BB all of which have
266 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
267 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
268
269 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
270 /// in the same BB with the same value for getUnderlyingObject() etc.
271 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
272                             ArrayRef<Instruction *> EqClass);
273
274 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
275 /// where all instructions access a known, constant offset from the first
276 /// instruction.
277 bool runOnChain(Chain &C);
278
279 /// Splits the chain into subchains of instructions which read/write a
280 /// contiguous block of memory. Discards any length-1 subchains (because
281 /// there's nothing to vectorize in there).
282 std::vector<Chain> splitChainByContiguity(Chain &C);
283
284 /// Splits the chain into subchains where it's safe to hoist loads up to the
285 /// beginning of the sub-chain and it's safe to sink loads up to the end of
286 /// the sub-chain. Discards any length-1 subchains.
287 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
288
289 /// Splits the chain into subchains that make legal, aligned accesses.
290 /// Discards any length-1 subchains.
291 std::vector<Chain> splitChainByAlignment(Chain &C);
292
293 /// Converts the instrs in the chain into a single vectorized load or store.
294 /// Adds the old scalar loads/stores to ToErase.
295 bool vectorizeChain(Chain &C);
296
297 /// Tries to compute the offset in bytes PtrB - PtrA.
298 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
299 Instruction *ContextInst,
300 unsigned Depth = 0);
301 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
302 Instruction *ContextInst,
303 unsigned Depth);
304 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
305 Instruction *ContextInst,
306 unsigned Depth);
307
308 /// Gets the element type of the vector that the chain will load or store.
309 /// This is nontrivial because the chain may contain elements of different
310 /// types; e.g. it's legal to have a chain that contains both i32 and float.
311 Type *getChainElemTy(const Chain &C);
312
313 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
314 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
315 /// instructions.
316 ///
317 /// The map ChainElemOffsets must contain all of the elements in
318 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
319 /// address. It's ok if it contains additional entries.
320 template <bool IsLoadChain>
321 bool isSafeToMove(
322 Instruction *ChainElem, Instruction *ChainBegin,
323      const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
324
325 /// Collects loads and stores grouped by "equivalence class", where:
326 /// - all elements in an eq class are a load or all are a store,
327 /// - they all load/store the same element size (it's OK to have e.g. i8 and
328 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
329 /// - they all have the same value for getUnderlyingObject().
330 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
331                                                BasicBlock::iterator End);
332
333 /// Partitions Instrs into "chains" where every instruction has a known
334 /// constant offset from the first instr in the chain.
335 ///
336 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
337 /// in the chain is the leader, and an instr touches distance 0 from itself.
338 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
339};
340
341class LoadStoreVectorizerLegacyPass : public FunctionPass {
342public:
343 static char ID;
344
345 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
346    initializeLoadStoreVectorizerLegacyPassPass(
347        *PassRegistry::getPassRegistry());
348  }
349
350 bool runOnFunction(Function &F) override;
351
352 StringRef getPassName() const override {
353 return "GPU Load and Store Vectorizer";
354 }
355
356 void getAnalysisUsage(AnalysisUsage &AU) const override {
357    AU.addRequired<AAResultsWrapperPass>();
358    AU.addRequired<AssumptionCacheTracker>();
359    AU.addRequired<ScalarEvolutionWrapperPass>();
360    AU.addRequired<DominatorTreeWrapperPass>();
361    AU.addRequired<TargetTransformInfoWrapperPass>();
362    AU.setPreservesCFG();
363 }
364};
365
366} // end anonymous namespace
367
368char LoadStoreVectorizerLegacyPass::ID = 0;
369
370INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
371 "Vectorize load and Store instructions", false, false)
378INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
379 "Vectorize load and store instructions", false, false)
380
381Pass *llvm::createLoadStoreVectorizerLegacyPass() {
382 return new LoadStoreVectorizerLegacyPass();
383}
384
385bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
386 // Don't vectorize when the attribute NoImplicitFloat is used.
387 if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
388 return false;
389
390 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
391 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
392 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
393  TargetTransformInfo &TTI =
394      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
395
396 AssumptionCache &AC =
397 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
398
399 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
400}
401
402PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
403                                               FunctionAnalysisManager &AM) {
404 // Don't vectorize when the attribute NoImplicitFloat is used.
405 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
406 return PreservedAnalyses::all();
407
408  AliasAnalysis &AA = AM.getResult<AAManager>(F);
409  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
410  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
411  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
412  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
413
414 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
415  PreservedAnalyses PA;
416  PA.preserveSet<CFGAnalyses>();
417  return Changed ? PA : PreservedAnalyses::all();
418}
419
420bool Vectorizer::run() {
421 bool Changed = false;
422 // Break up the BB if there are any instrs which aren't guaranteed to transfer
423 // execution to their successor.
424 //
425 // Consider, for example:
426 //
427 // def assert_arr_len(int n) { if (n < 2) exit(); }
428 //
429 // load arr[0]
430 // call assert_array_len(arr.length)
431 // load arr[1]
432 //
433 // Even though assert_arr_len does not read or write any memory, we can't
434 // speculate the second load before the call. More info at
435 // https://github.com/llvm/llvm-project/issues/52950.
436 for (BasicBlock *BB : post_order(&F)) {
437 // BB must at least have a terminator.
438 assert(!BB->empty());
439
440    SmallVector<BasicBlock::iterator, 8> Barriers;
441 Barriers.push_back(BB->begin());
442 for (Instruction &I : *BB)
443      if (!isGuaranteedToTransferExecutionToSuccessor(&I))
444        Barriers.push_back(I.getIterator());
445 Barriers.push_back(BB->end());
446
447 for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
448 ++It)
449 Changed |= runOnPseudoBB(*It, *std::next(It));
450
451 for (Instruction *I : ToErase) {
452 auto *PtrOperand = getLoadStorePointerOperand(I);
453 if (I->use_empty())
454 I->eraseFromParent();
455      RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
456    }
457 ToErase.clear();
458 }
459
460 return Changed;
461}
462
463bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
464                               BasicBlock::iterator End) {
465 LLVM_DEBUG({
466 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
467 if (End != Begin->getParent()->end())
468 dbgs() << *End;
469 else
470 dbgs() << "<BB end>";
471 dbgs() << ")\n";
472 });
473
474 bool Changed = false;
475 for (const auto &[EqClassKey, EqClass] :
476 collectEquivalenceClasses(Begin, End))
477 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
478
479 return Changed;
480}
481
482bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
483 ArrayRef<Instruction *> EqClass) {
484 bool Changed = false;
485
486 LLVM_DEBUG({
487 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
488 << " keyed on " << EqClassKey << ":\n";
489 for (Instruction *I : EqClass)
490 dbgs() << " " << *I << "\n";
491 });
492
493 std::vector<Chain> Chains = gatherChains(EqClass);
494 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
495 << " nontrivial chains.\n";);
496 for (Chain &C : Chains)
497 Changed |= runOnChain(C);
498 return Changed;
499}
500
501bool Vectorizer::runOnChain(Chain &C) {
502 LLVM_DEBUG({
503 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
504 dumpChain(C);
505 });
506
507 // Split up the chain into increasingly smaller chains, until we can finally
508 // vectorize the chains.
509 //
510 // (Don't be scared by the depth of the loop nest here. These operations are
511 // all at worst O(n lg n) in the number of instructions, and splitting chains
512 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
513 bool Changed = false;
514 for (auto &C : splitChainByMayAliasInstrs(C))
515 for (auto &C : splitChainByContiguity(C))
516 for (auto &C : splitChainByAlignment(C))
517 Changed |= vectorizeChain(C);
518 return Changed;
519}
520
521std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
522 if (C.empty())
523 return {};
524
525 sortChainInBBOrder(C);
526
527 LLVM_DEBUG({
528 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
529 dumpChain(C);
530 });
531
532  // We know that elements in the chain with non-overlapping offsets can't
533 // alias, but AA may not be smart enough to figure this out. Use a
534 // hashtable.
535 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
536 for (const auto &E : C)
537 ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
538
539 // Loads get hoisted up to the first load in the chain. Stores get sunk
540 // down to the last store in the chain. Our algorithm for loads is:
541 //
542 // - Take the first element of the chain. This is the start of a new chain.
543 // - Take the next element of `Chain` and check for may-alias instructions
544 // up to the start of NewChain. If no may-alias instrs, add it to
545 // NewChain. Otherwise, start a new NewChain.
546 //
547 // For stores it's the same except in the reverse direction.
548 //
549 // We expect IsLoad to be an std::bool_constant.
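  //
  // Illustrative example (assumed, not from a test): for the load chain
  //   load A[0];  store to %maybe_alias;  load A[1]
  // the intervening store may alias A[1], so A[1] can't be hoisted up to
  // A[0]; we split into {A[0]} and {A[1]}, both discarded as length-1.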
550 auto Impl = [&](auto IsLoad) {
551 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
552 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
553 if constexpr (IsLoad())
554 return std::make_pair(C.begin(), C.end());
555 else
556 return std::make_pair(C.rbegin(), C.rend());
557 }(IsLoad);
558 assert(ChainBegin != ChainEnd);
559
560 std::vector<Chain> Chains;
561    SmallVector<ChainElem, 1> NewChain;
562    NewChain.push_back(*ChainBegin);
563 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
564 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
565 ChainOffsets)) {
566 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
567 << *ChainIt->Inst << " into " << *ChainBegin->Inst
568 << "\n");
569 NewChain.push_back(*ChainIt);
570 } else {
571        LLVM_DEBUG(
572            dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
573 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
574 if (NewChain.size() > 1) {
575 LLVM_DEBUG({
576 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
577 dumpChain(NewChain);
578 });
579 Chains.push_back(std::move(NewChain));
580 }
581
582 // Start a new chain.
583 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
584 }
585 }
586 if (NewChain.size() > 1) {
587 LLVM_DEBUG({
588 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
589 dumpChain(NewChain);
590 });
591 Chains.push_back(std::move(NewChain));
592 }
593 return Chains;
594 };
595
596 if (isa<LoadInst>(C[0].Inst))
597 return Impl(/*IsLoad=*/std::bool_constant<true>());
598
599 assert(isa<StoreInst>(C[0].Inst));
600 return Impl(/*IsLoad=*/std::bool_constant<false>());
601}
602
603std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
604 if (C.empty())
605 return {};
606
607 sortChainInOffsetOrder(C);
608
609 LLVM_DEBUG({
610 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
611 dumpChain(C);
612 });
613
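  // Illustrative example (assumed): i32 elements at offsets {0, 4, 12} give
  // chains {0, 4} (since 0 + 4 bytes == 4) and {12} (since 4 + 4 bytes != 12);
  // the singleton {12} is filtered out below.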
614 std::vector<Chain> Ret;
615 Ret.push_back({C.front()});
616
617 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
618    // `Prev` accesses offsets [Prev.OffsetFromLeader, PrevReadEnd).
619 auto &CurChain = Ret.back();
620 const ChainElem &Prev = CurChain.back();
621 unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
622 assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
623 "collectEquivalenceClass");
624 APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
625
626 // Add this instruction to the end of the current chain, or start a new one.
627 bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
628 LLVM_DEBUG(dbgs() << "LSV: Instructions are "
629 << (AreContiguous ? "" : "not ") << "contiguous: "
630 << *Prev.Inst << " (ends at offset " << PrevReadEnd
631 << ") -> " << *It->Inst << " (starts at offset "
632 << It->OffsetFromLeader << ")\n");
633 if (AreContiguous)
634 CurChain.push_back(*It);
635 else
636 Ret.push_back({*It});
637 }
638
639 // Filter out length-1 chains, these are uninteresting.
640 llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
641 return Ret;
642}
643
644Type *Vectorizer::getChainElemTy(const Chain &C) {
645 assert(!C.empty());
646 // The rules are:
647 // - If there are any pointer types in the chain, use an integer type.
648 // - Prefer an integer type if it appears in the chain.
649 // - Otherwise, use the first type in the chain.
650 //
651 // The rule about pointer types is a simplification when we merge e.g. a load
652 // of a ptr and a double. There's no direct conversion from a ptr to a
653 // double; it requires a ptrtoint followed by a bitcast.
654 //
655 // It's unclear to me if the other rules have any practical effect, but we do
656 // it to match this pass's previous behavior.
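  //
  // Illustrative examples (assumed): merging a load of ptr with a load of
  // double (both 64 bits) yields i64; merging float with i32 yields i32;
  // merging float with float yields float.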
657 if (any_of(C, [](const ChainElem &E) {
658 return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
659 })) {
660 return Type::getIntNTy(
661 F.getContext(),
662 DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
663 }
664
665 for (const ChainElem &E : C)
666 if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
667 return T;
668 return getLoadStoreType(C[0].Inst)->getScalarType();
669}
670
671std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
672 // We use a simple greedy algorithm.
673 // - Given a chain of length N, find all prefixes that
674 // (a) are not longer than the max register length, and
675 // (b) are a power of 2.
676 // - Starting from the longest prefix, try to create a vector of that length.
677 // - If one of them works, great. Repeat the algorithm on any remaining
678 // elements in the chain.
679 // - If none of them work, discard the first element and repeat on a chain
680 // of length N-1.
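  //
  // Illustrative example (assumed): with a 16-byte vector register and six
  // contiguous i32 accesses, the longest candidate starting at element 0 is
  // [0, 3] (16 bytes); if that vectorizes, we continue from element 4, where
  // only the two-element candidate [4, 5] remains.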
681 if (C.empty())
682 return {};
683
684 sortChainInOffsetOrder(C);
685
686 LLVM_DEBUG({
687 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
688 dumpChain(C);
689 });
690
691 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
692 auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
693 unsigned ChainSizeBytes, VectorType *VecTy) {
694 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
695 ChainSizeBytes, VecTy)
696 : TTI.getStoreVectorFactor(VF, LoadStoreSize,
697 ChainSizeBytes, VecTy);
698 };
699
700#ifndef NDEBUG
701 for (const auto &E : C) {
702 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
703 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
704 "Should have filtered out non-power-of-two elements in "
705 "collectEquivalenceClasses.");
706 }
707#endif
708
709 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
710 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
711
712 std::vector<Chain> Ret;
713 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
714 // Find candidate chains of size not greater than the largest vector reg.
715 // These chains are over the closed interval [CBegin, CEnd].
716 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
717 CandidateChains;
718 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
719 APInt Sz = C[CEnd].OffsetFromLeader +
720 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
721 C[CBegin].OffsetFromLeader;
722 if (Sz.sgt(VecRegBytes))
723 break;
724 CandidateChains.push_back(
725 {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});
726 }
727
728 // Consider the longest chain first.
729 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
730 It != End; ++It) {
731 auto [CEnd, SizeBytes] = *It;
732      LLVM_DEBUG(
733          dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
734 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
735
736 Type *VecElemTy = getChainElemTy(C);
737 // Note, VecElemTy is a power of 2, but might be less than one byte. For
738 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
739 // VecElemTy would be i4.
740 unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
741
742 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
743 assert((8 * SizeBytes) % VecElemBits == 0);
744 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
745 FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
746 unsigned VF = 8 * VecRegBytes / VecElemBits;
747
748 // Check that TTI is happy with this vectorization factor.
749 unsigned TargetVF = getVectorFactor(VF, VecElemBits,
750 VecElemBits * NumVecElems / 8, VecTy);
751 if (TargetVF != VF && TargetVF < NumVecElems) {
752        LLVM_DEBUG(
753            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
754 "because TargetVF="
755 << TargetVF << " != VF=" << VF
756 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
757 continue;
758 }
759
760 // Is a load/store with this alignment allowed by TTI and at least as fast
761 // as an unvectorized load/store?
762 //
763 // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
764 auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
765 &F = F](Align Alignment) {
766 if (Alignment.value() % SizeBytes == 0)
767 return true;
768 unsigned VectorizedSpeed = 0;
769 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
770 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
771 if (!AllowsMisaligned) {
772          LLVM_DEBUG(dbgs()
773                     << "LSV: Access of " << SizeBytes << "B in addrspace "
774 << AS << " with alignment " << Alignment.value()
775 << " is misaligned, and therefore can't be vectorized.\n");
776 return false;
777 }
778
779 unsigned ElementwiseSpeed = 0;
780 (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
781 Alignment, &ElementwiseSpeed);
782 if (VectorizedSpeed < ElementwiseSpeed) {
783          LLVM_DEBUG(dbgs()
784                     << "LSV: Access of " << SizeBytes << "B in addrspace "
785 << AS << " with alignment " << Alignment.value()
786 << " has relative speed " << VectorizedSpeed
787 << ", which is lower than the elementwise speed of "
788 << ElementwiseSpeed
789 << ". Therefore this access won't be vectorized.\n");
790 return false;
791 }
792 return true;
793 };
794
795 // If we're loading/storing from an alloca, align it if possible.
796 //
797 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
798 // tells us this is beneficial. This feels a bit odd, but it matches
799 // existing tests. This isn't *so* bad, because at most we align to 4
800 // bytes (current value of StackAdjustedAlignment).
801 //
802 // FIXME: We will upgrade the alignment of the alloca even if it turns out
803 // we can't vectorize for some other reason.
804 Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
805 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
806 isa<AllocaInst>(PtrOperand->stripPointerCasts());
807 Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
808 Align PrefAlign = Align(StackAdjustedAlignment);
809 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
810 IsAllowedAndFast(PrefAlign)) {
811        Align NewAlign = getOrEnforceKnownAlignment(
812            PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
813 if (NewAlign >= Alignment) {
814          LLVM_DEBUG(dbgs()
815                     << "LSV: splitByChain upgrading alloca alignment from "
816 << Alignment.value() << " to " << NewAlign.value()
817 << "\n");
818 Alignment = NewAlign;
819 }
820 }
821
822 if (!IsAllowedAndFast(Alignment)) {
823        LLVM_DEBUG(
824            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
825 "because its alignment is not AllowedAndFast: "
826 << Alignment.value() << "\n");
827 continue;
828 }
829
830 if ((IsLoadChain &&
831 !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
832 (!IsLoadChain &&
833 !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
834        LLVM_DEBUG(
835            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
836 "because !isLegalToVectorizeLoad/StoreChain.");
837 continue;
838 }
839
840 // Hooray, we can vectorize this chain!
841 Chain &NewChain = Ret.emplace_back();
842 for (unsigned I = CBegin; I <= CEnd; ++I)
843 NewChain.push_back(C[I]);
844 CBegin = CEnd; // Skip over the instructions we've added to the chain.
845 break;
846 }
847 }
848 return Ret;
849}
850
851bool Vectorizer::vectorizeChain(Chain &C) {
852 if (C.size() < 2)
853 return false;
854
855 sortChainInOffsetOrder(C);
856
857 LLVM_DEBUG({
858 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
859 dumpChain(C);
860 });
861
862 Type *VecElemTy = getChainElemTy(C);
863 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
864 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
865 unsigned ChainBytes = std::accumulate(
866 C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
867 return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
868 });
869 assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
870  // VecTy's size is a power of 2 and at least 1 byte, but VecElemTy may be
871  // smaller than 1 byte (e.g. VecTy == <32 x i1>).
872 Type *VecTy = FixedVectorType::get(
873 VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
874
875 Align Alignment = getLoadStoreAlignment(C[0].Inst);
876 // If this is a load/store of an alloca, we might have upgraded the alloca's
877 // alignment earlier. Get the new alignment.
878 if (AS == DL.getAllocaAddrSpace()) {
879 Alignment = std::max(
880 Alignment,
881        getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
882                                   MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
883 }
884
885 // All elements of the chain must have the same scalar-type size.
886#ifndef NDEBUG
887 for (const ChainElem &E : C)
888 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
889 DL.getTypeStoreSize(VecElemTy));
890#endif
891
892 Instruction *VecInst;
893 if (IsLoadChain) {
894 // Loads get hoisted to the location of the first load in the chain. We may
895 // also need to hoist the (transitive) operands of the loads.
896 Builder.SetInsertPoint(
897 std::min_element(C.begin(), C.end(), [](const auto &A, const auto &B) {
898 return A.Inst->comesBefore(B.Inst);
899 })->Inst);
900
901 // Chain is in offset order, so C[0] is the instr with the lowest offset,
902 // i.e. the root of the vector.
903 VecInst = Builder.CreateAlignedLoad(VecTy,
904                                         getLoadStorePointerOperand(C[0].Inst),
905                                         Alignment);
906
907 unsigned VecIdx = 0;
908 for (const ChainElem &E : C) {
909 Instruction *I = E.Inst;
910 Value *V;
911      Type *T = getLoadStoreType(I);
912      if (auto *VT = dyn_cast<FixedVectorType>(T)) {
913 auto Mask = llvm::to_vector<8>(
914 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
915 V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
916 VecIdx += VT->getNumElements();
917 } else {
918 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
919 I->getName());
920 ++VecIdx;
921 }
922 if (V->getType() != I->getType())
923 V = Builder.CreateBitOrPointerCast(V, I->getType());
924 I->replaceAllUsesWith(V);
925 }
926
927 // Finally, we need to reorder the instrs in the BB so that the (transitive)
928 // operands of VecInst appear before it. To see why, suppose we have
929 // vectorized the following code:
930 //
931 // ptr1 = gep a, 1
932 // load1 = load i32 ptr1
933 // ptr0 = gep a, 0
934 // load0 = load i32 ptr0
935 //
936 // We will put the vectorized load at the location of the earliest load in
937 // the BB, i.e. load1. We get:
938 //
939 // ptr1 = gep a, 1
940 // loadv = load <2 x i32> ptr0
941 // load0 = extractelement loadv, 0
942 // load1 = extractelement loadv, 1
943 // ptr0 = gep a, 0
944 //
945 // Notice that loadv uses ptr0, which is defined *after* it!
946 reorder(VecInst);
947 } else {
948 // Stores get sunk to the location of the last store in the chain.
949 Builder.SetInsertPoint(
950 std::max_element(C.begin(), C.end(), [](auto &A, auto &B) {
951 return A.Inst->comesBefore(B.Inst);
952 })->Inst);
953
954 // Build the vector to store.
955 Value *Vec = PoisonValue::get(VecTy);
956 unsigned VecIdx = 0;
957 auto InsertElem = [&](Value *V) {
958 if (V->getType() != VecElemTy)
959 V = Builder.CreateBitOrPointerCast(V, VecElemTy);
960 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
961 };
962 for (const ChainElem &E : C) {
963 auto I = cast<StoreInst>(E.Inst);
964 if (FixedVectorType *VT =
965 dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
966 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
967 InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
968 Builder.getInt32(J)));
969 }
970 } else {
971 InsertElem(I->getValueOperand());
972 }
973 }
974
975 // Chain is in offset order, so C[0] is the instr with the lowest offset,
976 // i.e. the root of the vector.
977 VecInst = Builder.CreateAlignedStore(
978 Vec,
979        getLoadStorePointerOperand(C[0].Inst),
980        Alignment);
981 }
982
983 propagateMetadata(VecInst, C);
984
985 for (const ChainElem &E : C)
986 ToErase.push_back(E.Inst);
987
988 ++NumVectorInstructions;
989 NumScalarsVectorized += C.size();
990 return true;
991}
992
993template <bool IsLoadChain>
994bool Vectorizer::isSafeToMove(
995 Instruction *ChainElem, Instruction *ChainBegin,
996    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
997  LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
998 << *ChainBegin << ")\n");
999
1000 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
1001 if (ChainElem == ChainBegin)
1002 return true;
1003
1004 // Invariant loads can always be reordered; by definition they are not
1005 // clobbered by stores.
1006 if (isInvariantLoad(ChainElem))
1007 return true;
1008
1009 auto BBIt = std::next([&] {
1010 if constexpr (IsLoadChain)
1011 return BasicBlock::reverse_iterator(ChainElem);
1012 else
1013 return BasicBlock::iterator(ChainElem);
1014 }());
1015 auto BBItEnd = std::next([&] {
1016 if constexpr (IsLoadChain)
1017 return BasicBlock::reverse_iterator(ChainBegin);
1018 else
1019 return BasicBlock::iterator(ChainBegin);
1020 }());
1021
1022 const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1023 const unsigned ChainElemSize =
1024 DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1025
1026 for (; BBIt != BBItEnd; ++BBIt) {
1027 Instruction *I = &*BBIt;
1028
1029 if (!I->mayReadOrWriteMemory())
1030 continue;
1031
1032 // Loads can be reordered with other loads.
1033 if (IsLoadChain && isa<LoadInst>(I))
1034 continue;
1035
1036 // Stores can be sunk below invariant loads.
1037 if (!IsLoadChain && isInvariantLoad(I))
1038 continue;
1039
1040    // If I is in the chain, we can tell whether it aliases ChainElem by
1041    // checking what offset ChainElem accesses. This may be better than AA can do.
1042 //
1043 // We should really only have duplicate offsets for stores (the duplicate
1044 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1045 // split the chain so we don't have to handle this case specially.
1046 if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1047 // I and ChainElem overlap if:
1048 // - I and ChainElem have the same offset, OR
1049 // - I's offset is less than ChainElem's, but I touches past the
1050 // beginning of ChainElem, OR
1051 // - ChainElem's offset is less than I's, but ChainElem touches past the
1052 // beginning of I.
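      //
      // Worked example (illustrative): if I stores 4 bytes at offset 4 and
      // ChainElem accesses 4 bytes at offset 6, then 4 + 4 = 8 > 6, so the
      // accesses overlap and we conservatively report an alias.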
1053 const APInt &IOffset = OffsetIt->second;
1054 unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1055 if (IOffset == ChainElemOffset ||
1056 (IOffset.sle(ChainElemOffset) &&
1057 (IOffset + IElemSize).sgt(ChainElemOffset)) ||
1058 (ChainElemOffset.sle(IOffset) &&
1059 (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1060 LLVM_DEBUG({
1061 // Double check that AA also sees this alias. If not, we probably
1062 // have a bug.
1063 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1064 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1065 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1066 });
1067 return false; // We found an aliasing instruction; bail.
1068 }
1069
1070 continue; // We're confident there's no alias.
1071 }
1072
1073 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1074 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1075 if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
1076 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1077 << " Aliasing instruction:\n"
1078 << " " << *I << '\n'
1079 << " Aliased instruction and pointer:\n"
1080 << " " << *ChainElem << '\n'
1081 << " " << *getLoadStorePointerOperand(ChainElem)
1082 << '\n');
1083
1084 return false;
1085 }
1086 }
1087 return true;
1088}
1089
1090static bool checkNoWrapFlags(Instruction *I, bool Signed) {
1091 BinaryOperator *BinOpI = cast<BinaryOperator>(I);
1092 return (Signed && BinOpI->hasNoSignedWrap()) ||
1093 (!Signed && BinOpI->hasNoUnsignedWrap());
1094}
1095
1096static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1097 unsigned MatchingOpIdxA, Instruction *AddOpB,
1098 unsigned MatchingOpIdxB, bool Signed) {
1099 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1100 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1101 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1102 << ", MatchingOpIdxB=" << MatchingOpIdxB
1103 << ", Signed=" << Signed << "\n");
1104 // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1105 // being the same, we can guarantee that the transformation is safe if we can
1106 // prove that OpA won't overflow when Ret added to the other operand of OpA.
1107 // For example:
1108 // %tmp7 = add nsw i32 %tmp2, %v0
1109 // %tmp8 = sext i32 %tmp7 to i64
1110 // ...
1111 // %tmp11 = add nsw i32 %v0, 1
1112 // %tmp12 = add nsw i32 %tmp2, %tmp11
1113 // %tmp13 = sext i32 %tmp12 to i64
1114 //
1115 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1116 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1117 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1118 assert(AddOpA->getOpcode() == Instruction::Add &&
1119 AddOpB->getOpcode() == Instruction::Add &&
1120 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1121 if (AddOpA->getOperand(MatchingOpIdxA) ==
1122 AddOpB->getOperand(MatchingOpIdxB)) {
1123 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1124 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1125 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1126 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1127 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1128 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1129 checkNoWrapFlags(OtherInstrB, Signed) &&
1130 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1131 int64_t CstVal =
1132 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1133 if (OtherInstrB->getOperand(0) == OtherOperandA &&
1134 IdxDiff.getSExtValue() == CstVal)
1135 return true;
1136 }
1137    // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
1138 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1139 checkNoWrapFlags(OtherInstrA, Signed) &&
1140 isa<ConstantInt>(OtherInstrA->getOperand(1))) {
1141 int64_t CstVal =
1142 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1143 if (OtherInstrA->getOperand(0) == OtherOperandB &&
1144 IdxDiff.getSExtValue() == -CstVal)
1145 return true;
1146 }
1147 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1148 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1149 if (OtherInstrA && OtherInstrB &&
1150 OtherInstrA->getOpcode() == Instruction::Add &&
1151 OtherInstrB->getOpcode() == Instruction::Add &&
1152 checkNoWrapFlags(OtherInstrA, Signed) &&
1153 checkNoWrapFlags(OtherInstrB, Signed) &&
1154 isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
1155 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1156 int64_t CstValA =
1157 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1158 int64_t CstValB =
1159 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1160 if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
1161 IdxDiff.getSExtValue() == (CstValB - CstValA))
1162 return true;
1163 }
1164 }
1165 return false;
1166}
1167
1168std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1169 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1170 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1171 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1172 << " Depth=" << Depth << "\n");
1173 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1174 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
1175 if (!GEPA || !GEPB)
1176 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1177
1178 // Look through GEPs after checking they're the same except for the last
1179 // index.
1180 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1181 GEPA->getPointerOperand() != GEPB->getPointerOperand())
1182 return std::nullopt;
1183 gep_type_iterator GTIA = gep_type_begin(GEPA);
1184 gep_type_iterator GTIB = gep_type_begin(GEPB);
1185 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1186 if (GTIA.getOperand() != GTIB.getOperand())
1187 return std::nullopt;
1188 ++GTIA;
1189 ++GTIB;
1190 }
1191
1192 Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
1193 Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
1194 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1195 OpA->getType() != OpB->getType())
1196 return std::nullopt;
1197
1198 uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
1199
1200 // Only look through a ZExt/SExt.
1201 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1202 return std::nullopt;
1203
1204 bool Signed = isa<SExtInst>(OpA);
1205
1206  // At this point ValA could be a function parameter, i.e. not an instruction.
1207 Value *ValA = OpA->getOperand(0);
1208 OpB = dyn_cast<Instruction>(OpB->getOperand(0));
1209 if (!OpB || ValA->getType() != OpB->getType())
1210 return std::nullopt;
1211
1212 const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
1213 const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
1214 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1215 if (IdxDiffSCEV == SE.getCouldNotCompute())
1216 return std::nullopt;
1217
1218 ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
1219 if (!IdxDiffRange.isSingleElement())
1220 return std::nullopt;
1221 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1222
1223 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1224 << "\n");
1225
1226 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1227 bool Safe = false;
1228
1229 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1230 // ValA, we're okay.
1231 if (OpB->getOpcode() == Instruction::Add &&
1232 isa<ConstantInt>(OpB->getOperand(1)) &&
1233 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
1234      checkNoWrapFlags(OpB, Signed))
1235    Safe = true;
1236
1237 // Second attempt: check if we have eligible add NSW/NUW instruction
1238 // sequences.
1239 OpA = dyn_cast<Instruction>(ValA);
1240 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1241 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
1242 checkNoWrapFlags(OpB, Signed)) {
1243 // In the checks below a matching operand in OpA and OpB is an operand which
1244 // is the same in those two instructions. Below we account for possible
1245 // orders of the operands of these add instructions.
1246 for (unsigned MatchingOpIdxA : {0, 1})
1247 for (unsigned MatchingOpIdxB : {0, 1})
1248 if (!Safe)
1249 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
1250 MatchingOpIdxB, Signed);
1251 }
1252
1253 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1254
1255 // Third attempt:
1256 //
1257 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1258 // order bit other than the sign bit are known to be zero in ValA, we can add
1259 // Diff to it while guaranteeing no overflow of any sort.
1260 //
1261 // If IdxDiff is negative, do the same, but swap ValA and ValB.
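  //
  // Illustrative reading of the check below (assumed): Known.Zero, viewed as
  // an integer (with the sign bit cleared for signed adds), must be
  // unsigned >= |IdxDiff|; e.g. IdxDiff == 4 passes when Known.Zero == 0b1100.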
1262 if (!Safe) {
1263 // When computing known bits, use the GEPs as context instructions, since
1264 // they likely are in the same BB as the load/store.
1265 KnownBits Known(BitWidth);
1266 computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
1267 ContextInst, &DT);
1268 APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
1269 if (Signed)
1270 BitsAllowedToBeSet.clearBit(BitWidth - 1);
1271 if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
1272 return std::nullopt;
1273 Safe = true;
1274 }
1275
1276 if (Safe)
1277 return IdxDiff * Stride;
1278 return std::nullopt;
1279}
1280
1281std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1282 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1283 if (Depth++ == MaxDepth)
1284 return std::nullopt;
1285
1286 if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1287 if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1288 if (SelectA->getCondition() != SelectB->getCondition())
1289 return std::nullopt;
1290 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1291 << ", PtrB=" << *PtrB << ", ContextInst="
1292 << *ContextInst << ", Depth=" << Depth << "\n");
1293 std::optional<APInt> TrueDiff = getConstantOffset(
1294 SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
1295 if (!TrueDiff.has_value())
1296 return std::nullopt;
1297 std::optional<APInt> FalseDiff =
1298 getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1299 ContextInst, Depth);
1300 if (TrueDiff == FalseDiff)
1301 return TrueDiff;
1302 }
1303 }
1304 return std::nullopt;
1305}
1306
1307EquivalenceClassMap
1308Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1309                                      BasicBlock::iterator End) {
1310  EquivalenceClassMap Ret;
1311
1312 auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {
1313 const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
1314 if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1315      // The selects themselves are distinct instructions even if they share
1316      // the same condition and evaluate to consecutive pointers for the true
1317      // and false values of the condition. Therefore using the selects
1318      // themselves for grouping instructions would put consecutive accesses
1319      // into different lists, and they wouldn't even be checked for being
1320      // consecutive, and so wouldn't be vectorized.
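      //
      // Illustrative example (assumed):
      //   %p = select i1 %c, ptr %a, ptr %b
      //   %q = select i1 %c, ptr %a1, ptr %b1  ; one element past %a / %b
      // Keying both accesses on %c keeps them in the same equivalence class.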
1321 return Sel->getCondition();
1322 }
1323 return ObjPtr;
1324 };
1325
1326 for (Instruction &I : make_range(Begin, End)) {
1327 auto *LI = dyn_cast<LoadInst>(&I);
1328 auto *SI = dyn_cast<StoreInst>(&I);
1329 if (!LI && !SI)
1330 continue;
1331
1332 if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1333 continue;
1334
1335 if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1336 (SI && !TTI.isLegalToVectorizeStore(SI)))
1337 continue;
1338
1339 Type *Ty = getLoadStoreType(&I);
1340    if (!VectorType::isValidElementType(Ty->getScalarType()))
1341      continue;
1342
1343 // Skip weird non-byte sizes. They probably aren't worth the effort of
1344 // handling correctly.
1345 unsigned TySize = DL.getTypeSizeInBits(Ty);
1346 if ((TySize % 8) != 0)
1347 continue;
1348
1349    // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1350    // functions currently use an integer type for the vectorized
1351    // load/store, and do not support casting between the integer type and a
1352    // vector of pointers (e.g. i64 to <2 x i16*>).
1353 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1354 continue;
1355
1356    Value *Ptr = getLoadStorePointerOperand(&I);
1357    unsigned AS = Ptr->getType()->getPointerAddressSpace();
1358 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1359
1360 unsigned VF = VecRegSize / TySize;
1361 VectorType *VecTy = dyn_cast<VectorType>(Ty);
1362
1363 // Only handle power-of-two sized elements.
1364 if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
1365 (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
1366 continue;
1367
1368 // No point in looking at these if they're too big to vectorize.
1369 if (TySize > VecRegSize / 2 ||
1370 (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
1371 continue;
1372
1373    Ret[{getUnderlyingObject(Ptr), AS,
1374         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
1375 /*IsLoad=*/LI != nullptr}]
1376 .push_back(&I);
1377 }
1378
1379 return Ret;
1380}
1381
1382std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1383 if (Instrs.empty())
1384 return {};
1385
1386 unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
1387 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1388
1389#ifndef NDEBUG
1390 // Check that Instrs is in BB order and all have the same addr space.
1391 for (size_t I = 1; I < Instrs.size(); ++I) {
1392 assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1393 assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1394 }
1395#endif
1396
1397 // Machinery to build an MRU-hashtable of Chains.
1398 //
1399 // (Ideally this could be done with MapVector, but as currently implemented,
1400 // moving an element to the front of a MapVector is O(n).)
1401 struct InstrListElem : ilist_node<InstrListElem>,
1402 std::pair<Instruction *, Chain> {
1403 explicit InstrListElem(Instruction *I)
1404 : std::pair<Instruction *, Chain>(I, {}) {}
1405 };
1406 struct InstrListElemDenseMapInfo {
1407 using PtrInfo = DenseMapInfo<InstrListElem *>;
1408 using IInfo = DenseMapInfo<Instruction *>;
1409 static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1410 static InstrListElem *getTombstoneKey() {
1411 return PtrInfo::getTombstoneKey();
1412 }
1413 static unsigned getHashValue(const InstrListElem *E) {
1414 return IInfo::getHashValue(E->first);
1415 }
1416 static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
1417 if (A == getEmptyKey() || B == getEmptyKey())
1418 return A == getEmptyKey() && B == getEmptyKey();
1419 if (A == getTombstoneKey() || B == getTombstoneKey())
1420 return A == getTombstoneKey() && B == getTombstoneKey();
1421 return IInfo::isEqual(A->first, B->first);
1422 }
1423 };
1424  SpecificBumpPtrAllocator<InstrListElem> Allocator;
1425  simple_ilist<InstrListElem> MRU; // Most recently used chains.
1426  DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
1427
1428 // Compare each instruction in `instrs` to leader of the N most recently-used
1429 // chains. This limits the O(n^2) behavior of this pass while also allowing
1430 // us to build arbitrarily long chains.
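  //
  // Illustrative cost sketch (assumed): with MaxChainsToTry == 64, a block
  // with thousands of accesses to unrelated objects costs at most 64
  // constant-offset queries per access, instead of one per existing chain.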
1431 for (Instruction *I : Instrs) {
1432 constexpr int MaxChainsToTry = 64;
1433
1434 bool MatchFound = false;
1435 auto ChainIter = MRU.begin();
1436 for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1437 ++J, ++ChainIter) {
1438 std::optional<APInt> Offset = getConstantOffset(
1439 getLoadStorePointerOperand(ChainIter->first),
1440        getLoadStorePointerOperand(I),
1441        /*ContextInst=*/
1442 (ChainIter->first->comesBefore(I) ? I : ChainIter->first));
1443 if (Offset.has_value()) {
1444 // `Offset` might not have the expected number of bits, if e.g. AS has a
1445 // different number of bits than opaque pointers.
1446 ChainIter->second.push_back(ChainElem{I, Offset.value()});
1447 // Move ChainIter to the front of the MRU list.
1448 MRU.remove(*ChainIter);
1449 MRU.push_front(*ChainIter);
1450 MatchFound = true;
1451 break;
1452 }
1453 }
1454
1455 if (!MatchFound) {
1456 APInt ZeroOffset(ASPtrBits, 0);
1457 InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1458 E->second.push_back(ChainElem{I, ZeroOffset});
1459 MRU.push_front(*E);
1460 Chains.insert(E);
1461 }
1462 }
1463
1464 std::vector<Chain> Ret;
1465 Ret.reserve(Chains.size());
1466 // Iterate over MRU rather than Chains so the order is deterministic.
1467 for (auto &E : MRU)
1468 if (E.second.size() > 1)
1469 Ret.push_back(std::move(E.second));
1470 return Ret;
1471}
1472
1473std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1474 Instruction *ContextInst,
1475 unsigned Depth) {
1476 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1477 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
1478 << ", Depth=" << Depth << "\n");
1479 // We'll ultimately return a value of this bit width, even if computations
1480 // happen in a different width.
1481 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
1482 APInt OffsetA(OrigBitWidth, 0);
1483 APInt OffsetB(OrigBitWidth, 0);
1484 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1485 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1486 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
1487 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
1488 return std::nullopt;
1489
1490 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1491 // should properly handle a possible overflow and the value should fit into
1492 // the smallest data type used in the cast/gep chain.
1493 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1494 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1495
1496 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1497 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
1498 if (PtrA == PtrB)
1499 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1500
1501 // Try to compute B - A.
1502 const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
1503 if (DistScev != SE.getCouldNotCompute()) {
1504 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1505 ConstantRange DistRange = SE.getSignedRange(DistScev);
1506 if (DistRange.isSingleElement()) {
1507 // Handle index width (the width of Dist) != pointer width (the width of
1508 // the Offset*s at this point).
1509 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
1510 return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
1511 }
1512 }
1513 std::optional<APInt> Diff =
1514 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth);
1515 if (Diff.has_value())
1516 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1517 .sextOrTrunc(OrigBitWidth);
1518 return std::nullopt;
1519}
Definition: DataLayout.h:110
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:314
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:536
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:693
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
Legacy wrapper pass to provide the GlobalsAAResult object.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2628
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:284
const BasicBlock * getParent() const
Definition: Instruction.h:90
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:195
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
An instruction for reading from memory.
Definition: Instructions.h:177
bool isSimple() const
Definition: Instructions.h:256
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1743
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
void preserveSet()
Mark an analysis set as preserved.
Definition: PassManager.h:188
Legacy wrapper pass to provide the SCEVAAResult object.
This class represents an analyzed expression in the program.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getCouldNotCompute()
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:384
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:366
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:451
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:382
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool isLegalToVectorizeLoad(LoadInst *LI) const
bool isLegalToVectorizeStore(StoreInst *SI) const
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace=0, Align Alignment=Align(1), unsigned *Fast=nullptr) const
Determine if the target supports unaligned memory accesses.
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:724
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:688
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Definition: Type.cpp:684
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A simple intrusive list implementation.
Definition: simple_ilist.h:81
void push_front(reference Node)
Insert a node at the front; never copies.
Definition: simple_ilist.h:142
void remove(reference N)
Remove a node by reference; never deletes.
Definition: simple_ilist.h:181
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:119
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:440
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:529
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1734
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
Align getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to ensure that the alignment of V is at least PrefAlign bytes.
Definition: Local.cpp:1439
bool isModSet(const ModRefInfo MRI)
Definition: ModRef.h:48
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1652
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
Align getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
TargetTransformInfo TTI
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:292
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:184
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
gep_type_iterator gep_type_begin(const User *GEP)
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2021
void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &)
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:50
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117