LoadStoreVectorizer.cpp
1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
40//
41// Overview of the algorithm and terminology in this pass:
42//
43// - Break up each basic block into pseudo-BBs, composed of instructions which
44// are guaranteed to transfer control to their successors.
45// - Within a single pseudo-BB, find all loads, and group them into
46// "equivalence classes" according to getUnderlyingObject() and loaded
47// element size. Do the same for stores.
48// - For each equivalence class, greedily build "chains". Each chain has a
49// leader instruction, and every other member of the chain has a known
50// constant offset from the first instr in the chain.
51// - Break up chains so that they contain only contiguous accesses of legal
52// size with no intervening may-alias instrs.
53// - Convert each chain to vector instructions.
54//
55// The O(n^2) behavior of this pass comes from initially building the chains.
56// In the worst case we have to compare each new instruction to all of those
57// that came before. To limit this, we only calculate the offset to the leaders
58// of the N most recently-used chains.
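//
// As a schematic illustration (editor's example, not taken from a particular
// test), the pass rewrites IR such as
//
//   %p1 = getelementptr inbounds i32, ptr %p, i64 1
//   %v0 = load i32, ptr %p
//   %v1 = load i32, ptr %p1
//
// into one wide access plus cheap element extracts:
//
//   %vec = load <2 x i32>, ptr %p
//   %v0 = extractelement <2 x i32> %vec, i32 0
//   %v1 = extractelement <2 x i32> %vec, i32 1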
59
60#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/MapVector.h"
65#include "llvm/ADT/PostOrderIterator.h"
66#include "llvm/ADT/STLExtras.h"
67#include "llvm/ADT/Sequence.h"
68#include "llvm/ADT/SmallPtrSet.h"
69#include "llvm/ADT/SmallVector.h"
70#include "llvm/ADT/Statistic.h"
71#include "llvm/ADT/iterator_range.h"
72#include "llvm/Analysis/AliasAnalysis.h"
73#include "llvm/Analysis/AssumptionCache.h"
74#include "llvm/Analysis/MemoryLocation.h"
75#include "llvm/Analysis/ScalarEvolution.h"
76#include "llvm/Analysis/TargetTransformInfo.h"
77#include "llvm/Analysis/ValueTracking.h"
78#include "llvm/Analysis/VectorUtils.h"
79#include "llvm/IR/Attributes.h"
80#include "llvm/IR/BasicBlock.h"
81#include "llvm/IR/ConstantRange.h"
82#include "llvm/IR/Constants.h"
83#include "llvm/IR/DataLayout.h"
84#include "llvm/IR/DerivedTypes.h"
85#include "llvm/IR/Dominators.h"
86#include "llvm/IR/Function.h"
87#include "llvm/IR/GetElementPtrTypeIterator.h"
88#include "llvm/IR/IRBuilder.h"
89#include "llvm/IR/InstrTypes.h"
90#include "llvm/IR/Instruction.h"
91#include "llvm/IR/Instructions.h"
92#include "llvm/IR/LLVMContext.h"
93#include "llvm/IR/Module.h"
94#include "llvm/IR/Type.h"
95#include "llvm/IR/Value.h"
96#include "llvm/InitializePasses.h"
97#include "llvm/Pass.h"
98#include "llvm/Support/Alignment.h"
99#include "llvm/Support/Casting.h"
100#include "llvm/Support/Debug.h"
101#include "llvm/Support/KnownBits.h"
102#include "llvm/Support/MathExtras.h"
103#include "llvm/Support/ModRef.h"
104#include "llvm/Support/raw_ostream.h"
105#include "llvm/Transforms/Utils/Local.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <numeric>
112#include <optional>
113#include <tuple>
114#include <type_traits>
115#include <utility>
116#include <vector>
117
118using namespace llvm;
119
120#define DEBUG_TYPE "load-store-vectorizer"
121
122STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125namespace {
126
127// Equivalence class key, the initial tuple by which we group loads/stores.
128// Loads/stores with different EqClassKeys are never merged.
129//
130// (We could in theory remove element-size from this tuple. We'd just need
131// to fix up the vector packing/unpacking code.)
132using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
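//
// Illustrative example (editor's note): given
//   %a = load i32, ptr %p       ; key = {obj(%p), AS, 32 bits, load}
//   %b = load <4 x i8>, ptr %q  ; key = {obj(%q), AS, 8 bits, load}
// the two loads land in different classes even if %p and %q share an
// underlying object, because their scalar element sizes differ.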
138[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
139                                               const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144}
145
146// A Chain is a set of instructions such that:
147// - All instructions have the same equivalence class, so in particular all are
148// loads, or all are stores.
149// - We know the address accessed by the i'th chain elem relative to the
150// chain's leader instruction, which is the first instr of the chain in BB
151// order.
152//
153// Chains have two canonical orderings:
154// - BB order, sorted by Instr->comesBefore.
155// - Offset order, sorted by OffsetFromLeader.
156// This pass switches back and forth between these orders.
157struct ChainElem {
158 Instruction *Inst;
159 APInt OffsetFromLeader;
160};
161using Chain = SmallVector<ChainElem, 1>;
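//
// For instance (illustrative), three contiguous i32 loads of a[0], a[1], a[2]
// form one Chain whose elements carry OffsetFromLeader values 0, 4, and 8
// relative to the load of a[0] (the leader in BB order).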
162
163void sortChainInBBOrder(Chain &C) {
164 sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
165}
166
167void sortChainInOffsetOrder(Chain &C) {
168 sort(C, [](const auto &A, const auto &B) {
169 if (A.OffsetFromLeader != B.OffsetFromLeader)
170 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
171 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
172 });
173}
174
175[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
176 for (const auto &E : C) {
177 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
178 }
179}
180
181using EquivalenceClassMap =
182 MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
183
184// FIXME: Assuming stack alignment of 4 is always good enough
185constexpr unsigned StackAdjustedAlignment = 4;
186
187Instruction *propagateMetadata(Instruction *I, const Chain &C) {
188 SmallVector<Value *, 8> Values;
189 for (const ChainElem &E : C)
190 Values.push_back(E.Inst);
191 return propagateMetadata(I, Values);
192}
193
194bool isInvariantLoad(const Instruction *I) {
195 const LoadInst *LI = dyn_cast<LoadInst>(I);
196 return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
197}
198
199/// Reorders the instructions that I depends on (the instructions defining its
200/// operands), to ensure they dominate I.
201void reorder(Instruction *I) {
202 SmallPtrSet<Instruction *, 16> InstructionsToMove;
203 SmallVector<Instruction *, 16> Worklist;
204
205 Worklist.push_back(I);
206 while (!Worklist.empty()) {
207 Instruction *IW = Worklist.pop_back_val();
208 int NumOperands = IW->getNumOperands();
209 for (int i = 0; i < NumOperands; i++) {
210 Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
211 if (!IM || IM->getOpcode() == Instruction::PHI)
212 continue;
213
214 // If IM is in another BB, no need to move it, because this pass only
215 // vectorizes instructions within one BB.
216 if (IM->getParent() != I->getParent())
217 continue;
218
219 assert(IM != I && "Unexpected cycle while re-ordering instructions");
220
221 if (!IM->comesBefore(I)) {
222 InstructionsToMove.insert(IM);
223 Worklist.push_back(IM);
224 }
225 }
226 }
227
228 // All instructions to move should follow I. Start from I, not from begin().
229 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
230 Instruction *IM = &*(BBI++);
231 if (!InstructionsToMove.count(IM))
232 continue;
233 IM->moveBefore(I);
234 }
235}
236
237class Vectorizer {
238 Function &F;
239 AliasAnalysis &AA;
240 AssumptionCache &AC;
241 DominatorTree &DT;
242 ScalarEvolution &SE;
243 TargetTransformInfo &TTI;
244 const DataLayout &DL;
245 IRBuilder<> Builder;
246
247 // We could erase instrs right after vectorizing them, but that can mess up
248 // our BB iterators, and also can make the equivalence class keys point to
249 // freed memory. This is fixable, but it's simpler just to wait until we're
250 // done with the BB and erase all at once.
251 SmallVector<Instruction *, 128> ToErase;
252
253public:
254 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
255 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
256 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
257 DL(F.getDataLayout()), Builder(SE.getContext()) {}
258
259 bool run();
260
261private:
262 static const unsigned MaxDepth = 3;
263
264 /// Runs the vectorizer on a "pseudo basic block", which is a range of
265 /// instructions [Begin, End) within one BB all of which have
266 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
267 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
268
269 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
270 /// in the same BB with the same value for getUnderlyingObject() etc.
271 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
272 ArrayRef<Instruction *> EqClass);
273
274 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
275 /// where all instructions access a known, constant offset from the first
276 /// instruction.
277 bool runOnChain(Chain &C);
278
279 /// Splits the chain into subchains of instructions which read/write a
280 /// contiguous block of memory. Discards any length-1 subchains (because
281 /// there's nothing to vectorize in there).
282 std::vector<Chain> splitChainByContiguity(Chain &C);
283
284 /// Splits the chain into subchains where it's safe to hoist loads up to the
285 /// beginning of the sub-chain and it's safe to sink loads up to the end of
286 /// the sub-chain. Discards any length-1 subchains.
287 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
288
289 /// Splits the chain into subchains that make legal, aligned accesses.
290 /// Discards any length-1 subchains.
291 std::vector<Chain> splitChainByAlignment(Chain &C);
292
293 /// Converts the instrs in the chain into a single vectorized load or store.
294 /// Adds the old scalar loads/stores to ToErase.
295 bool vectorizeChain(Chain &C);
296
297 /// Tries to compute the offset in bytes PtrB - PtrA.
298 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
299 Instruction *ContextInst,
300 unsigned Depth = 0);
301 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
302 Instruction *ContextInst,
303 unsigned Depth);
304 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
305 Instruction *ContextInst,
306 unsigned Depth);
307
308 /// Gets the element type of the vector that the chain will load or store.
309 /// This is nontrivial because the chain may contain elements of different
310 /// types; e.g. it's legal to have a chain that contains both i32 and float.
311 Type *getChainElemTy(const Chain &C);
312
313 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
314 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
315 /// instructions.
316 ///
317 /// The map ChainElemOffsets must contain all of the elements in
318 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
319 /// address. It's ok if it contains additional entries.
320 template <bool IsLoadChain>
321 bool isSafeToMove(
322 Instruction *ChainElem, Instruction *ChainBegin,
323 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
324
325 /// Collects loads and stores grouped by "equivalence class", where:
326 /// - all elements in an eq class are a load or all are a store,
327 /// - they all load/store the same element size (it's OK to have e.g. i8 and
328 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
329 /// - they all have the same value for getUnderlyingObject().
330 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
331 BasicBlock::iterator End);
332
333 /// Partitions Instrs into "chains" where every instruction has a known
334 /// constant offset from the first instr in the chain.
335 ///
336 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
337 /// in the chain is the leader, and an instr touches distance 0 from itself.
338 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
339};
340
341class LoadStoreVectorizerLegacyPass : public FunctionPass {
342public:
343 static char ID;
344
345 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
346 initializeLoadStoreVectorizerLegacyPassPass(
347 *PassRegistry::getPassRegistry());
348 }
349
350 bool runOnFunction(Function &F) override;
351
352 StringRef getPassName() const override {
353 return "GPU Load and Store Vectorizer";
354 }
355
356 void getAnalysisUsage(AnalysisUsage &AU) const override {
357 AU.addRequired<AAResultsWrapperPass>();
358 AU.addRequired<AssumptionCacheTracker>();
359 AU.addRequired<ScalarEvolutionWrapperPass>();
360 AU.addRequired<DominatorTreeWrapperPass>();
361 AU.addRequired<TargetTransformInfoWrapperPass>();
362 AU.setPreservesCFG();
363 }
364};
365
366} // end anonymous namespace
367
368char LoadStoreVectorizerLegacyPass::ID = 0;
369
370INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
371 "Vectorize load and Store instructions", false, false)
372INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
373INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
374INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
375INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
376INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
377INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
378INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
379 "Vectorize load and store instructions", false, false)
380
381Pass *llvm::createLoadStoreVectorizerPass() {
382 return new LoadStoreVectorizerLegacyPass();
383}
384
385bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
386 // Don't vectorize when the attribute NoImplicitFloat is used.
387 if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
388 return false;
389
390 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
391 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
392 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
393 TargetTransformInfo &TTI =
394 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
395
396 AssumptionCache &AC =
397 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
398
399 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
400}
401
402PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
403                                               FunctionAnalysisManager &AM) {
404 // Don't vectorize when the attribute NoImplicitFloat is used.
405 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
406 return PreservedAnalyses::all();
407
408 AliasAnalysis &AA = AM.getResult<AAManager>(F);
409 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
410 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
411 ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
412 TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
413
414 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
415 PreservedAnalyses PA;
416 PA.preserveSet<CFGAnalyses>();
417 return Changed ? PA : PreservedAnalyses::all();
418}
419
420bool Vectorizer::run() {
421 bool Changed = false;
422 // Break up the BB if there are any instrs which aren't guaranteed to transfer
423 // execution to their successor.
424 //
425 // Consider, for example:
426 //
427 // def assert_arr_len(int n) { if (n < 2) exit(); }
428 //
429 // load arr[0]
430 // call assert_array_len(arr.length)
431 // load arr[1]
432 //
433 // Even though assert_arr_len does not read or write any memory, we can't
434 // speculate the second load before the call. More info at
435 // https://github.com/llvm/llvm-project/issues/52950.
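 //
 // In that example the call is a barrier: the BB is split into the pseudo-BBs
 // [load arr[0], call) and [call, <BB end>), so the two loads are never even
 // considered for merging.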
436 for (BasicBlock *BB : post_order(&F)) {
437 // BB must at least have a terminator.
438 assert(!BB->empty());
439
440 SmallVector<BasicBlock::iterator, 8> Barriers;
441 Barriers.push_back(BB->begin());
442 for (Instruction &I : *BB)
443 if (!isGuaranteedToTransferExecutionToSuccessor(&I))
444 Barriers.push_back(I.getIterator());
445 Barriers.push_back(BB->end());
446
447 for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
448 ++It)
449 Changed |= runOnPseudoBB(*It, *std::next(It));
450
451 for (Instruction *I : ToErase) {
452 auto *PtrOperand = getLoadStorePointerOperand(I);
453 if (I->use_empty())
454 I->eraseFromParent();
455 RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
456 }
457 ToErase.clear();
458 }
459
460 return Changed;
461}
462
463bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
464                                BasicBlock::iterator End) {
465 LLVM_DEBUG({
466 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
467 if (End != Begin->getParent()->end())
468 dbgs() << *End;
469 else
470 dbgs() << "<BB end>";
471 dbgs() << ")\n";
472 });
473
474 bool Changed = false;
475 for (const auto &[EqClassKey, EqClass] :
476 collectEquivalenceClasses(Begin, End))
477 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
478
479 return Changed;
480}
481
482bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
483 ArrayRef<Instruction *> EqClass) {
484 bool Changed = false;
485
486 LLVM_DEBUG({
487 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
488 << " keyed on " << EqClassKey << ":\n";
489 for (Instruction *I : EqClass)
490 dbgs() << " " << *I << "\n";
491 });
492
493 std::vector<Chain> Chains = gatherChains(EqClass);
494 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
495 << " nontrivial chains.\n";);
496 for (Chain &C : Chains)
497 Changed |= runOnChain(C);
498 return Changed;
499}
500
501bool Vectorizer::runOnChain(Chain &C) {
502 LLVM_DEBUG({
503 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
504 dumpChain(C);
505 });
506
507 // Split up the chain into increasingly smaller chains, until we can finally
508 // vectorize the chains.
509 //
510 // (Don't be scared by the depth of the loop nest here. These operations are
511 // all at worst O(n lg n) in the number of instructions, and splitting chains
512 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
513 bool Changed = false;
514 for (auto &C : splitChainByMayAliasInstrs(C))
515 for (auto &C : splitChainByContiguity(C))
516 for (auto &C : splitChainByAlignment(C))
517 Changed |= vectorizeChain(C);
518 return Changed;
519}
520
521std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
522 if (C.empty())
523 return {};
524
525 sortChainInBBOrder(C);
526
527 LLVM_DEBUG({
528 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
529 dumpChain(C);
530 });
531
532 // We know that elements in the chain with non-overlapping offsets can't
533 // alias, but AA may not be smart enough to figure this out. Use a
534 // hashtable.
535 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
536 for (const auto &E : C)
537 ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
538
539 // Loads get hoisted up to the first load in the chain. Stores get sunk
540 // down to the last store in the chain. Our algorithm for loads is:
541 //
542 // - Take the first element of the chain. This is the start of a new chain.
543 // - Take the next element of `Chain` and check for may-alias instructions
544 // up to the start of NewChain. If no may-alias instrs, add it to
545 // NewChain. Otherwise, start a new NewChain.
546 //
547 // For stores it's the same except in the reverse direction.
548 //
549 // We expect IsLoad to be an std::bool_constant.
550 auto Impl = [&](auto IsLoad) {
551 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
552 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
553 if constexpr (IsLoad())
554 return std::make_pair(C.begin(), C.end());
555 else
556 return std::make_pair(C.rbegin(), C.rend());
557 }(IsLoad);
558 assert(ChainBegin != ChainEnd);
559
560 std::vector<Chain> Chains;
561 Chain NewChain;
562 NewChain.push_back(*ChainBegin);
563 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
564 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
565 ChainOffsets)) {
566 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
567 << *ChainIt->Inst << " into " << *ChainBegin->Inst
568 << "\n");
569 NewChain.push_back(*ChainIt);
570 } else {
572 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
573 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
574 if (NewChain.size() > 1) {
575 LLVM_DEBUG({
576 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
577 dumpChain(NewChain);
578 });
579 Chains.push_back(std::move(NewChain));
580 }
581
582 // Start a new chain.
583 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
584 }
585 }
586 if (NewChain.size() > 1) {
587 LLVM_DEBUG({
588 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
589 dumpChain(NewChain);
590 });
591 Chains.push_back(std::move(NewChain));
592 }
593 return Chains;
594 };
595
596 if (isa<LoadInst>(C[0].Inst))
597 return Impl(/*IsLoad=*/std::bool_constant<true>());
598
599 assert(isa<StoreInst>(C[0].Inst));
600 return Impl(/*IsLoad=*/std::bool_constant<false>());
601}
602
603std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
604 if (C.empty())
605 return {};
606
607 sortChainInOffsetOrder(C);
608
609 LLVM_DEBUG({
610 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
611 dumpChain(C);
612 });
613
614 std::vector<Chain> Ret;
615 Ret.push_back({C.front()});
616
617 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
618 // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
619 auto &CurChain = Ret.back();
620 const ChainElem &Prev = CurChain.back();
621 unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
622 assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
623 "collectEquivalenceClass");
624 APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
625
626 // Add this instruction to the end of the current chain, or start a new one.
627 bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
628 LLVM_DEBUG(dbgs() << "LSV: Instructions are "
629 << (AreContiguous ? "" : "not ") << "contiguous: "
630 << *Prev.Inst << " (ends at offset " << PrevReadEnd
631 << ") -> " << *It->Inst << " (starts at offset "
632 << It->OffsetFromLeader << ")\n");
633 if (AreContiguous)
634 CurChain.push_back(*It);
635 else
636 Ret.push_back({*It});
637 }
638
639 // Filter out length-1 chains, these are uninteresting.
640 llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
641 return Ret;
642}
643
644Type *Vectorizer::getChainElemTy(const Chain &C) {
645 assert(!C.empty());
646 // The rules are:
647 // - If there are any pointer types in the chain, use an integer type.
648 // - Prefer an integer type if it appears in the chain.
649 // - Otherwise, use the first type in the chain.
650 //
651 // The rule about pointer types is a simplification when we merge e.g. a load
652 // of a ptr and a double. There's no direct conversion from a ptr to a
653 // double; it requires a ptrtoint followed by a bitcast.
654 //
655 // It's unclear to me if the other rules have any practical effect, but we do
656 // it to match this pass's previous behavior.
657 if (any_of(C, [](const ChainElem &E) {
658 return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
659 })) {
660 return Type::getIntNTy(
661 F.getContext(),
662 DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
663 }
664
665 for (const ChainElem &E : C)
666 if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
667 return T;
668 return getLoadStoreType(C[0].Inst)->getScalarType();
669}
670
671std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
672 // We use a simple greedy algorithm.
673 // - Given a chain of length N, find all prefixes that
674 // (a) are not longer than the max register length, and
675 // (b) are a power of 2.
676 // - Starting from the longest prefix, try to create a vector of that length.
677 // - If one of them works, great. Repeat the algorithm on any remaining
678 // elements in the chain.
679 // - If none of them work, discard the first element and repeat on a chain
680 // of length N-1.
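 //
 // Worked example (illustrative): with 16-byte vector registers and a chain
 // of six contiguous i32 accesses, the longest candidate starting at element
 // 0 covers elements [0, 3] (16 bytes). If that candidate is legal and fast,
 // it is emitted and the search restarts at element 4, which can still yield
 // a <2 x i32> access for elements [4, 5].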
681 if (C.empty())
682 return {};
683
684 sortChainInOffsetOrder(C);
685
686 LLVM_DEBUG({
687 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
688 dumpChain(C);
689 });
690
691 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
692 auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
693 unsigned ChainSizeBytes, VectorType *VecTy) {
694 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
695 ChainSizeBytes, VecTy)
696 : TTI.getStoreVectorFactor(VF, LoadStoreSize,
697 ChainSizeBytes, VecTy);
698 };
699
700#ifndef NDEBUG
701 for (const auto &E : C) {
702 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
703 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
704 "Should have filtered out non-power-of-two elements in "
705 "collectEquivalenceClasses.");
706 }
707#endif
708
709 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
710 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
711
712 std::vector<Chain> Ret;
713 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
714 // Find candidate chains of size not greater than the largest vector reg.
715 // These chains are over the closed interval [CBegin, CEnd].
716 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
717 CandidateChains;
718 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
719 APInt Sz = C[CEnd].OffsetFromLeader +
720 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
721 C[CBegin].OffsetFromLeader;
722 if (Sz.sgt(VecRegBytes))
723 break;
724 CandidateChains.push_back(
725 {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});
726 }
727
728 // Consider the longest chain first.
729 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
730 It != End; ++It) {
731 auto [CEnd, SizeBytes] = *It;
732 LLVM_DEBUG(
733 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
734 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
735
736 Type *VecElemTy = getChainElemTy(C);
737 // Note, VecElemTy is a power of 2, but might be less than one byte. For
738 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
739 // VecElemTy would be i4.
740 unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
741
742 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
743 assert((8 * SizeBytes) % VecElemBits == 0);
744 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
745 FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
746 unsigned VF = 8 * VecRegBytes / VecElemBits;
747
748 // Check that TTI is happy with this vectorization factor.
749 unsigned TargetVF = getVectorFactor(VF, VecElemBits,
750 VecElemBits * NumVecElems / 8, VecTy);
751 if (TargetVF != VF && TargetVF < NumVecElems) {
752 LLVM_DEBUG(
753 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
754 "because TargetVF="
755 << TargetVF << " != VF=" << VF
756 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
757 continue;
758 }
759
760 // Is a load/store with this alignment allowed by TTI and at least as fast
761 // as an unvectorized load/store?
762 //
763 // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
764 auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
765 &F = F](Align Alignment) {
766 if (Alignment.value() % SizeBytes == 0)
767 return true;
768 unsigned VectorizedSpeed = 0;
769 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
770 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
771 if (!AllowsMisaligned) {
773 << "LSV: Access of " << SizeBytes << "B in addrspace "
774 << AS << " with alignment " << Alignment.value()
775 << " is misaligned, and therefore can't be vectorized.\n");
776 return false;
777 }
778
779 unsigned ElementwiseSpeed = 0;
780 (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
781 Alignment, &ElementwiseSpeed);
782 if (VectorizedSpeed < ElementwiseSpeed) {
784 << "LSV: Access of " << SizeBytes << "B in addrspace "
785 << AS << " with alignment " << Alignment.value()
786 << " has relative speed " << VectorizedSpeed
787 << ", which is lower than the elementwise speed of "
788 << ElementwiseSpeed
789 << ". Therefore this access won't be vectorized.\n");
790 return false;
791 }
792 return true;
793 };
794
795 // If we're loading/storing from an alloca, align it if possible.
796 //
797 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
798 // tells us this is beneficial. This feels a bit odd, but it matches
799 // existing tests. This isn't *so* bad, because at most we align to 4
800 // bytes (current value of StackAdjustedAlignment).
801 //
802 // FIXME: We will upgrade the alignment of the alloca even if it turns out
803 // we can't vectorize for some other reason.
804 Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
805 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
806 isa<AllocaInst>(PtrOperand->stripPointerCasts());
807 Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
808 Align PrefAlign = Align(StackAdjustedAlignment);
809 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
810 IsAllowedAndFast(PrefAlign)) {
811 Align NewAlign = getOrEnforceKnownAlignment(
812 PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
813 if (NewAlign >= Alignment) {
815 << "LSV: splitByChain upgrading alloca alignment from "
816 << Alignment.value() << " to " << NewAlign.value()
817 << "\n");
818 Alignment = NewAlign;
819 }
820 }
821
822 if (!IsAllowedAndFast(Alignment)) {
823 LLVM_DEBUG(
824 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
825 "because its alignment is not AllowedAndFast: "
826 << Alignment.value() << "\n");
827 continue;
828 }
829
830 if ((IsLoadChain &&
831 !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
832 (!IsLoadChain &&
833 !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
834 LLVM_DEBUG(
835 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
836 "because !isLegalToVectorizeLoad/StoreChain.");
837 continue;
838 }
839
840 // Hooray, we can vectorize this chain!
841 Chain &NewChain = Ret.emplace_back();
842 for (unsigned I = CBegin; I <= CEnd; ++I)
843 NewChain.push_back(C[I]);
844 CBegin = CEnd; // Skip over the instructions we've added to the chain.
845 break;
846 }
847 }
848 return Ret;
849}
850
851bool Vectorizer::vectorizeChain(Chain &C) {
852 if (C.size() < 2)
853 return false;
854
855 sortChainInOffsetOrder(C);
856
857 LLVM_DEBUG({
858 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
859 dumpChain(C);
860 });
861
862 Type *VecElemTy = getChainElemTy(C);
863 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
864 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
865 unsigned ChainBytes = std::accumulate(
866 C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
867 return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
868 });
869 assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
870 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
871 // than 1 byte (e.g. VecTy == <32 x i1>).
872 Type *VecTy = FixedVectorType::get(
873 VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
874
875 Align Alignment = getLoadStoreAlignment(C[0].Inst);
876 // If this is a load/store of an alloca, we might have upgraded the alloca's
877 // alignment earlier. Get the new alignment.
878 if (AS == DL.getAllocaAddrSpace()) {
879 Alignment = std::max(
880 Alignment,
881 getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
882 MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
883 }
884
885 // All elements of the chain must have the same scalar-type size.
886#ifndef NDEBUG
887 for (const ChainElem &E : C)
888 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
889 DL.getTypeStoreSize(VecElemTy));
890#endif
891
892 Instruction *VecInst;
893 if (IsLoadChain) {
894 // Loads get hoisted to the location of the first load in the chain. We may
895 // also need to hoist the (transitive) operands of the loads.
896 Builder.SetInsertPoint(
897 llvm::min_element(C, [](const auto &A, const auto &B) {
898 return A.Inst->comesBefore(B.Inst);
899 })->Inst);
900
901 // Chain is in offset order, so C[0] is the instr with the lowest offset,
902 // i.e. the root of the vector.
903 VecInst = Builder.CreateAlignedLoad(VecTy,
904 getLoadStorePointerOperand(C[0].Inst),
905 Alignment);
906
907 unsigned VecIdx = 0;
908 for (const ChainElem &E : C) {
909 Instruction *I = E.Inst;
910 Value *V;
911 Type *T = getLoadStoreType(I);
912 if (auto *VT = dyn_cast<FixedVectorType>(T)) {
913 auto Mask = llvm::to_vector<8>(
914 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
915 V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
916 VecIdx += VT->getNumElements();
917 } else {
918 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
919 I->getName());
920 ++VecIdx;
921 }
922 if (V->getType() != I->getType())
923 V = Builder.CreateBitOrPointerCast(V, I->getType());
924 I->replaceAllUsesWith(V);
925 }
926
927 // Finally, we need to reorder the instrs in the BB so that the (transitive)
928 // operands of VecInst appear before it. To see why, suppose we have
929 // vectorized the following code:
930 //
931 // ptr1 = gep a, 1
932 // load1 = load i32 ptr1
933 // ptr0 = gep a, 0
934 // load0 = load i32 ptr0
935 //
936 // We will put the vectorized load at the location of the earliest load in
937 // the BB, i.e. load1. We get:
938 //
939 // ptr1 = gep a, 1
940 // loadv = load <2 x i32> ptr0
941 // load0 = extractelement loadv, 0
942 // load1 = extractelement loadv, 1
943 // ptr0 = gep a, 0
944 //
945 // Notice that loadv uses ptr0, which is defined *after* it!
946 reorder(VecInst);
947 } else {
948 // Stores get sunk to the location of the last store in the chain.
949 Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
950 return A.Inst->comesBefore(B.Inst);
951 })->Inst);
952
953 // Build the vector to store.
954 Value *Vec = PoisonValue::get(VecTy);
955 unsigned VecIdx = 0;
956 auto InsertElem = [&](Value *V) {
957 if (V->getType() != VecElemTy)
958 V = Builder.CreateBitOrPointerCast(V, VecElemTy);
959 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
960 };
961 for (const ChainElem &E : C) {
962 auto I = cast<StoreInst>(E.Inst);
963 if (FixedVectorType *VT =
964 dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
965 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
966 InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
967 Builder.getInt32(J)));
968 }
969 } else {
970 InsertElem(I->getValueOperand());
971 }
972 }
973
974 // Chain is in offset order, so C[0] is the instr with the lowest offset,
975 // i.e. the root of the vector.
976 VecInst = Builder.CreateAlignedStore(
977 Vec,
978 getLoadStorePointerOperand(C[0].Inst),
979 Alignment);
980 }
981
982 propagateMetadata(VecInst, C);
983
984 for (const ChainElem &E : C)
985 ToErase.push_back(E.Inst);
986
987 ++NumVectorInstructions;
988 NumScalarsVectorized += C.size();
989 return true;
990}
991
992template <bool IsLoadChain>
993bool Vectorizer::isSafeToMove(
994 Instruction *ChainElem, Instruction *ChainBegin,
995 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
996 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
997 << *ChainBegin << ")\n");
998
999 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
1000 if (ChainElem == ChainBegin)
1001 return true;
1002
1003 // Invariant loads can always be reordered; by definition they are not
1004 // clobbered by stores.
1005 if (isInvariantLoad(ChainElem))
1006 return true;
1007
1008 auto BBIt = std::next([&] {
1009 if constexpr (IsLoadChain)
1010 return BasicBlock::reverse_iterator(ChainElem);
1011 else
1012 return BasicBlock::iterator(ChainElem);
1013 }());
1014 auto BBItEnd = std::next([&] {
1015 if constexpr (IsLoadChain)
1016 return BasicBlock::reverse_iterator(ChainBegin);
1017 else
1018 return BasicBlock::iterator(ChainBegin);
1019 }());
1020
1021 const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1022 const unsigned ChainElemSize =
1023 DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1024
1025 for (; BBIt != BBItEnd; ++BBIt) {
1026 Instruction *I = &*BBIt;
1027
1028 if (!I->mayReadOrWriteMemory())
1029 continue;
1030
1031 // Loads can be reordered with other loads.
1032 if (IsLoadChain && isa<LoadInst>(I))
1033 continue;
1034
1035 // Stores can be sunk below invariant loads.
1036 if (!IsLoadChain && isInvariantLoad(I))
1037 continue;
1038
1039 // If I is in the chain, we can tell whether it aliases ChainElem by checking
1040 // what offset ChainElem accesses. This may be better than AA is able to do.
1041 //
1042 // We should really only have duplicate offsets for stores (the duplicate
1043 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1044 // split the chain so we don't have to handle this case specially.
1045 if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1046 // I and ChainElem overlap if:
1047 // - I and ChainElem have the same offset, OR
1048 // - I's offset is less than ChainElem's, but I touches past the
1049 // beginning of ChainElem, OR
1050 // - ChainElem's offset is less than I's, but ChainElem touches past the
1051 // beginning of I.
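 // E.g. (illustrative): a 4-byte access at offset 4 overlaps a 4-byte
 // ChainElem at offset 6 (since 4 + 4 > 6), but not one at offset 8.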
1052 const APInt &IOffset = OffsetIt->second;
1053 unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1054 if (IOffset == ChainElemOffset ||
1055 (IOffset.sle(ChainElemOffset) &&
1056 (IOffset + IElemSize).sgt(ChainElemOffset)) ||
1057 (ChainElemOffset.sle(IOffset) &&
1058 (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1059 LLVM_DEBUG({
1060 // Double check that AA also sees this alias. If not, we probably
1061 // have a bug.
1062 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1063 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1064 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1065 });
1066 return false; // We found an aliasing instruction; bail.
1067 }
1068
1069 continue; // We're confident there's no alias.
1070 }
1071
1072 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1073 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1074 if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
1075 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1076 << " Aliasing instruction:\n"
1077 << " " << *I << '\n'
1078 << " Aliased instruction and pointer:\n"
1079 << " " << *ChainElem << '\n'
1080 << " " << *getLoadStorePointerOperand(ChainElem)
1081 << '\n');
1082
1083 return false;
1084 }
1085 }
1086 return true;
1087}
1088
1089static bool checkNoWrapFlags(Instruction *I, bool Signed) {
1090 BinaryOperator *BinOpI = cast<BinaryOperator>(I);
1091 return (Signed && BinOpI->hasNoSignedWrap()) ||
1092 (!Signed && BinOpI->hasNoUnsignedWrap());
1093}
1094
1095static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1096 unsigned MatchingOpIdxA, Instruction *AddOpB,
1097 unsigned MatchingOpIdxB, bool Signed) {
1098 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1099 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1100 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1101 << ", MatchingOpIdxB=" << MatchingOpIdxB
1102 << ", Signed=" << Signed << "\n");
1103 // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1104 // being the same, we can guarantee that the transformation is safe if we can
1105 // prove that OpA won't overflow when Ret added to the other operand of OpA.
1106 // For example:
1107 // %tmp7 = add nsw i32 %tmp2, %v0
1108 // %tmp8 = sext i32 %tmp7 to i64
1109 // ...
1110 // %tmp11 = add nsw i32 %v0, 1
1111 // %tmp12 = add nsw i32 %tmp2, %tmp11
1112 // %tmp13 = sext i32 %tmp12 to i64
1113 //
1114 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1115 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1116 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1117 assert(AddOpA->getOpcode() == Instruction::Add &&
1118 AddOpB->getOpcode() == Instruction::Add &&
1119 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1120 if (AddOpA->getOperand(MatchingOpIdxA) ==
1121 AddOpB->getOperand(MatchingOpIdxB)) {
1122 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1123 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1124 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1125 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1126 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1127 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1128 checkNoWrapFlags(OtherInstrB, Signed) &&
1129 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1130 int64_t CstVal =
1131 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1132 if (OtherInstrB->getOperand(0) == OtherOperandA &&
1133 IdxDiff.getSExtValue() == CstVal)
1134 return true;
1135 }
1136 // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw y`.
1137 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1138 checkNoWrapFlags(OtherInstrA, Signed) &&
1139 isa<ConstantInt>(OtherInstrA->getOperand(1))) {
1140 int64_t CstVal =
1141 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1142 if (OtherInstrA->getOperand(0) == OtherOperandB &&
1143 IdxDiff.getSExtValue() == -CstVal)
1144 return true;
1145 }
1146 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1147 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1148 if (OtherInstrA && OtherInstrB &&
1149 OtherInstrA->getOpcode() == Instruction::Add &&
1150 OtherInstrB->getOpcode() == Instruction::Add &&
1151 checkNoWrapFlags(OtherInstrA, Signed) &&
1152 checkNoWrapFlags(OtherInstrB, Signed) &&
1153 isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
1154 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1155 int64_t CstValA =
1156 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1157 int64_t CstValB =
1158 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1159 if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
1160 IdxDiff.getSExtValue() == (CstValB - CstValA))
1161 return true;
1162 }
1163 }
1164 return false;
1165}
1166
1167std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1168 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1169 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1170 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1171 << " Depth=" << Depth << "\n");
1172 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1173 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
1174 if (!GEPA || !GEPB)
1175 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1176
1177 // Look through GEPs after checking they're the same except for the last
1178 // index.
1179 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1180 GEPA->getPointerOperand() != GEPB->getPointerOperand())
1181 return std::nullopt;
1182 gep_type_iterator GTIA = gep_type_begin(GEPA);
1183 gep_type_iterator GTIB = gep_type_begin(GEPB);
1184 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1185 if (GTIA.getOperand() != GTIB.getOperand())
1186 return std::nullopt;
1187 ++GTIA;
1188 ++GTIB;
1189 }
1190
1191 Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
1192 Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
1193 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1194 OpA->getType() != OpB->getType())
1195 return std::nullopt;
1196
1197 uint64_t Stride = GTIA.getSequentialElementStride(DL);
1198
1199 // Only look through a ZExt/SExt.
1200 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1201 return std::nullopt;
1202
1203 bool Signed = isa<SExtInst>(OpA);
1204
1205 // At this point A could be a function parameter, i.e. not an instruction
1206 Value *ValA = OpA->getOperand(0);
1207 OpB = dyn_cast<Instruction>(OpB->getOperand(0));
1208 if (!OpB || ValA->getType() != OpB->getType())
1209 return std::nullopt;
1210
1211 const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
1212 const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
1213 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1214 if (IdxDiffSCEV == SE.getCouldNotCompute())
1215 return std::nullopt;
1216
1217 ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
1218 if (!IdxDiffRange.isSingleElement())
1219 return std::nullopt;
1220 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1221
1222 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1223 << "\n");
1224
1225 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1226 bool Safe = false;
1227
1228 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1229 // ValA, we're okay.
1230 if (OpB->getOpcode() == Instruction::Add &&
1231 isa<ConstantInt>(OpB->getOperand(1)) &&
1232 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
1233 checkNoWrapFlags(OpB, Signed))
1234 Safe = true;
1235
1236 // Second attempt: check if we have eligible add NSW/NUW instruction
1237 // sequences.
1238 OpA = dyn_cast<Instruction>(ValA);
1239 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1240 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
1241 checkNoWrapFlags(OpB, Signed)) {
1242 // In the checks below a matching operand in OpA and OpB is an operand which
1243 // is the same in those two instructions. Below we account for possible
1244 // orders of the operands of these add instructions.
1245 for (unsigned MatchingOpIdxA : {0, 1})
1246 for (unsigned MatchingOpIdxB : {0, 1})
1247 if (!Safe)
1248 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
1249 MatchingOpIdxB, Signed);
1250 }
1251
1252 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1253
1254 // Third attempt:
1255 //
1256 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1257 // order bit other than the sign bit are known to be zero in ValA, we can add
1258 // Diff to it while guaranteeing no overflow of any sort.
1259 //
1260 // If IdxDiff is negative, do the same, but swap ValA and ValB.
1261 if (!Safe) {
1262 // When computing known bits, use the GEPs as context instructions, since
1263 // they likely are in the same BB as the load/store.
1264 KnownBits Known(BitWidth);
1265 computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
1266 ContextInst, &DT);
1267 APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
1268 if (Signed)
1269 BitsAllowedToBeSet.clearBit(BitWidth - 1);
1270 if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
1271 return std::nullopt;
1272 Safe = true;
1273 }
1274
1275 if (Safe)
1276 return IdxDiff * Stride;
1277 return std::nullopt;
1278}
1279
1280std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1281 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1282 if (Depth++ == MaxDepth)
1283 return std::nullopt;
1284
1285 if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1286 if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1287 if (SelectA->getCondition() != SelectB->getCondition())
1288 return std::nullopt;
1289 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1290 << ", PtrB=" << *PtrB << ", ContextInst="
1291 << *ContextInst << ", Depth=" << Depth << "\n");
1292 std::optional<APInt> TrueDiff = getConstantOffset(
1293 SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
1294 if (!TrueDiff.has_value())
1295 return std::nullopt;
1296 std::optional<APInt> FalseDiff =
1297 getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1298 ContextInst, Depth);
1299 if (TrueDiff == FalseDiff)
1300 return TrueDiff;
1301 }
1302 }
1303 return std::nullopt;
1304}
1305
1306EquivalenceClassMap
1307Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1308                                       BasicBlock::iterator End) {
1309 EquivalenceClassMap Ret;
1310
1311 auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {
1312 const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
1313 if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1314 // The selects themselves are distinct instructions even if they share
1315 // the same condition and evaluate to consecutive pointers for true and
1316 // false values of the condition. Therefore using the selects themselves
1317 // for grouping instructions would put consecutive accesses into different
1318 // lists and they won't even be checked for being consecutive, and won't
1319 // be vectorized.
1320 return Sel->getCondition();
1321 }
1322 return ObjPtr;
1323 };
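 // Illustrative example (editor's note): with
 //   %p = select i1 %c, ptr %a, ptr %b
 //   %q = select i1 %c, ptr %a1, ptr %b1
 // grouping on the shared condition %c keeps loads through %p and %q in one
 // equivalence class, so they can still be found to be consecutive.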
1324
1325 for (Instruction &I : make_range(Begin, End)) {
1326 auto *LI = dyn_cast<LoadInst>(&I);
1327 auto *SI = dyn_cast<StoreInst>(&I);
1328 if (!LI && !SI)
1329 continue;
1330
1331 if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1332 continue;
1333
1334 if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1335 (SI && !TTI.isLegalToVectorizeStore(SI)))
1336 continue;
1337
1338 Type *Ty = getLoadStoreType(&I);
1339 if (!VectorType::isValidElementType(Ty->getScalarType()))
1340 continue;
1341
1342 // Skip weird non-byte sizes. They probably aren't worth the effort of
1343 // handling correctly.
1344 unsigned TySize = DL.getTypeSizeInBits(Ty);
1345 if ((TySize % 8) != 0)
1346 continue;
1347
1348 // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1349 // functions are currently using an integer type for the vectorized
1350 // load/store, and do not support casting between the integer type and a
1351 // vector of pointers (e.g. i64 to <2 x i16*>)
1352 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1353 continue;
1354
1355 Value *Ptr = getLoadStorePointerOperand(&I);
1356 unsigned AS = Ptr->getType()->getPointerAddressSpace();
1357 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1358
1359 unsigned VF = VecRegSize / TySize;
1360 VectorType *VecTy = dyn_cast<VectorType>(Ty);
1361
1362 // Only handle power-of-two sized elements.
1363 if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
1364 (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
1365 continue;
1366
1367 // No point in looking at these if they're too big to vectorize.
1368 if (TySize > VecRegSize / 2 ||
1369 (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
1370 continue;
1371
1372 Ret[{getUnderlyingObject(Ptr), AS,
1373 DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
1374 /*IsLoad=*/LI != nullptr}]
1375 .push_back(&I);
1376 }
1377
1378 return Ret;
1379}
1380
1381std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1382 if (Instrs.empty())
1383 return {};
1384
1385 unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
1386 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1387
1388#ifndef NDEBUG
1389 // Check that Instrs is in BB order and all have the same addr space.
1390 for (size_t I = 1; I < Instrs.size(); ++I) {
1391 assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1392 assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1393 }
1394#endif
1395
1396 // Machinery to build an MRU-hashtable of Chains.
1397 //
1398 // (Ideally this could be done with MapVector, but as currently implemented,
1399 // moving an element to the front of a MapVector is O(n).)
1400 struct InstrListElem : ilist_node<InstrListElem>,
1401 std::pair<Instruction *, Chain> {
1402 explicit InstrListElem(Instruction *I)
1403 : std::pair<Instruction *, Chain>(I, {}) {}
1404 };
1405 struct InstrListElemDenseMapInfo {
1406 using PtrInfo = DenseMapInfo<InstrListElem *>;
1407 using IInfo = DenseMapInfo<Instruction *>;
1408 static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1409 static InstrListElem *getTombstoneKey() {
1410 return PtrInfo::getTombstoneKey();
1411 }
1412 static unsigned getHashValue(const InstrListElem *E) {
1413 return IInfo::getHashValue(E->first);
1414 }
1415 static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
1416 if (A == getEmptyKey() || B == getEmptyKey())
1417 return A == getEmptyKey() && B == getEmptyKey();
1418 if (A == getTombstoneKey() || B == getTombstoneKey())
1419 return A == getTombstoneKey() && B == getTombstoneKey();
1420 return IInfo::isEqual(A->first, B->first);
1421 }
1422 };
1423 SpecificBumpPtrAllocator<InstrListElem> Allocator;
1424 simple_ilist<InstrListElem> MRU; // Most recently used chains.
1425 DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
1426
1427 // Compare each instruction in `instrs` to leader of the N most recently-used
1428 // chains. This limits the O(n^2) behavior of this pass while also allowing
1429 // us to build arbitrarily long chains.
1430 for (Instruction *I : Instrs) {
1431 constexpr int MaxChainsToTry = 64;
1432
1433 bool MatchFound = false;
1434 auto ChainIter = MRU.begin();
1435 for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1436 ++J, ++ChainIter) {
1437 std::optional<APInt> Offset = getConstantOffset(
1438 getLoadStorePointerOperand(ChainIter->first),
1439 getLoadStorePointerOperand(I),
1440 /*ContextInst=*/
1441 (ChainIter->first->comesBefore(I) ? I : ChainIter->first));
1442 if (Offset.has_value()) {
1443 // `Offset` might not have the expected number of bits, if e.g. AS has a
1444 // different number of bits than opaque pointers.
1445 ChainIter->second.push_back(ChainElem{I, Offset.value()});
1446 // Move ChainIter to the front of the MRU list.
1447 MRU.remove(*ChainIter);
1448 MRU.push_front(*ChainIter);
1449 MatchFound = true;
1450 break;
1451 }
1452 }
1453
1454 if (!MatchFound) {
1455 APInt ZeroOffset(ASPtrBits, 0);
1456 InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1457 E->second.push_back(ChainElem{I, ZeroOffset});
1458 MRU.push_front(*E);
1459 Chains.insert(E);
1460 }
1461 }
1462
1463 std::vector<Chain> Ret;
1464 Ret.reserve(Chains.size());
1465 // Iterate over MRU rather than Chains so the order is deterministic.
1466 for (auto &E : MRU)
1467 if (E.second.size() > 1)
1468 Ret.push_back(std::move(E.second));
1469 return Ret;
1470}
1471
1472std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1473 Instruction *ContextInst,
1474 unsigned Depth) {
1475 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1476 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
1477 << ", Depth=" << Depth << "\n");
1478 // We'll ultimately return a value of this bit width, even if computations
1479 // happen in a different width.
1480 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
1481 APInt OffsetA(OrigBitWidth, 0);
1482 APInt OffsetB(OrigBitWidth, 0);
1483 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1484 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1485 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
1486 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
1487 return std::nullopt;
1488
1489 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1490 // should properly handle a possible overflow and the value should fit into
1491 // the smallest data type used in the cast/gep chain.
1492 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1493 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1494
1495 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1496 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
1497 if (PtrA == PtrB)
1498 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1499
1500 // Try to compute B - A.
1501 const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
1502 if (DistScev != SE.getCouldNotCompute()) {
1503 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1504 ConstantRange DistRange = SE.getSignedRange(DistScev);
1505 if (DistRange.isSingleElement()) {
1506 // Handle index width (the width of Dist) != pointer width (the width of
1507 // the Offset*s at this point).
1508 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
1509 return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
1510 }
1511 }
1512 std::optional<APInt> Diff =
1513 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth);
1514 if (Diff.has_value())
1515 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1516 .sextOrTrunc(OrigBitWidth);
1517 return std::nullopt;
1518}
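
// Illustrative example (editor's note): for
//   %a = getelementptr inbounds i8, ptr %base, i64 4
//   %b = getelementptr inbounds i8, ptr %base, i64 12
// getConstantOffset(%a, %b, ...) strips both GEPs down to %base and returns
// OffsetB - OffsetA = 8, the byte distance from %a to %b.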
Definition: DataLayout.h:103
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:680
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
Legacy wrapper pass to provide the GlobalsAAResult object.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2674
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:363
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
An instruction for reading from memory.
Definition: Instructions.h:174
bool isSimple() const
Definition: Instructions.h:245
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1852
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
Legacy wrapper pass to provide the SCEVAAResult object.
This class represents an analyzed expression in the program.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getCouldNotCompute()
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:436
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:368
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:503
bool empty() const
Definition: SmallVector.h:95
size_t size() const
Definition: SmallVector.h:92
void push_back(const T &Elt)
Definition: SmallVector.h:427
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:389
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool isLegalToVectorizeLoad(LoadInst *LI) const
bool isLegalToVectorizeStore(StoreInst *SI) const
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace=0, Align Alignment=Align(1), unsigned *Fast=nullptr) const
Determine if the target supports unaligned memory accesses.
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:261
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:251
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:258
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:343
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:736
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:694
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Definition: Type.cpp:671
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
TypeSize getSequentialElementStride(const DataLayout &DL) const
const ParentTy * getParent() const
Definition: ilist_node.h:32
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A simple intrusive list implementation.
Definition: simple_ilist.h:81
void push_front(reference Node)
Insert a node at the front; never copies.
Definition: simple_ilist.h:150
void remove(reference N)
Remove a node by reference; never deletes.
Definition: simple_ilist.h:189
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
auto min_element(R &&Range)
Definition: STLExtras.h:1978
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:540
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< po_iterator< T > > post_order(const T &G)
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
Align getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to ensure that the alignment of V is at least PrefAlign bytes.
Definition: Local.cpp:1541
bool isModSet(const ModRefInfo MRI)
Definition: ModRef.h:48
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
TargetTransformInfo TTI
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
auto max_element(R &&Range)
Definition: STLExtras.h:1986
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:293
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
gep_type_iterator gep_type_begin(const User *GEP)
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2057
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &)
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117