1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
40//
41// Overview of the algorithm and terminology in this pass:
42//
43// - Break up each basic block into pseudo-BBs, composed of instructions which
44// are guaranteed to transfer control to their successors.
45// - Within a single pseudo-BB, find all loads, and group them into
46// "equivalence classes" according to getUnderlyingObject() and loaded
47// element size. Do the same for stores.
48// - For each equivalence class, greedily build "chains". Each chain has a
49// leader instruction, and every other member of the chain has a known
50// constant offset from the first instr in the chain.
51// - Break up chains so that they contain only contiguous accesses of legal
52// size with no intervening may-alias instrs.
53// - Convert each chain to vector instructions.
54//
55// The O(n^2) behavior of this pass comes from initially building the chains.
56// In the worst case we have to compare each new instruction to all of those
57// that came before. To limit this, we only calculate the offset to the leaders
58// of the N most recently-used chains.
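//
// As a rough illustration (a simplified sketch; the names, offsets, and
// alignment here are made up, and the exact IR the pass emits depends on the
// target and on the GEP forms involved):
//
//   %x0 = load i32, ptr %p, align 8
//   %p1 = getelementptr i8, ptr %p, i64 4
//   %x1 = load i32, ptr %p1, align 4
//
// may become something like:
//
//   %vec = load <2 x i32>, ptr %p, align 8
//   %x0 = extractelement <2 x i32> %vec, i32 0
//   %x1 = extractelement <2 x i32> %vec, i32 1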
59
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/MapVector.h"
66#include "llvm/ADT/STLExtras.h"
67#include "llvm/ADT/Sequence.h"
70#include "llvm/ADT/Statistic.h"
79#include "llvm/IR/Attributes.h"
80#include "llvm/IR/BasicBlock.h"
82#include "llvm/IR/Constants.h"
83#include "llvm/IR/DataLayout.h"
85#include "llvm/IR/Dominators.h"
86#include "llvm/IR/Function.h"
88#include "llvm/IR/IRBuilder.h"
89#include "llvm/IR/InstrTypes.h"
90#include "llvm/IR/Instruction.h"
92#include "llvm/IR/LLVMContext.h"
93#include "llvm/IR/Module.h"
94#include "llvm/IR/Type.h"
95#include "llvm/IR/Value.h"
97#include "llvm/Pass.h"
100#include "llvm/Support/Debug.h"
103#include "llvm/Support/ModRef.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <numeric>
112#include <optional>
113#include <tuple>
114#include <type_traits>
115#include <utility>
116#include <vector>
117
118using namespace llvm;
119
120#define DEBUG_TYPE "load-store-vectorizer"
121
122STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125namespace {
126
127// Equivalence class key, the initial tuple by which we group loads/stores.
128// Loads/stores with different EqClassKeys are never merged.
129//
130// (We could in theory remove element-size from this tuple. We'd just need
131// to fix up the vector packing/unpacking code.)
132using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
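// For example (illustrative only): an i32 load and a <2 x i32> load through
// GEPs off the same alloca %a in addrspace 0 share the key
//   {%a, /*AddrSpace=*/0, /*ElementSize=*/32, /*IsLoad=*/1}
// whereas a store to the same address, or an i8 load from it, lands in a
// different equivalence class.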
138[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
139 const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144}
145
146// A Chain is a set of instructions such that:
147// - All instructions have the same equivalence class, so in particular all are
148// loads, or all are stores.
149// - We know the address accessed by the i'th chain elem relative to the
150// chain's leader instruction, which is the first instr of the chain in BB
151// order.
152//
153// Chains have two canonical orderings:
154// - BB order, sorted by Instr->comesBefore.
155// - Offset order, sorted by OffsetFromLeader.
156// This pass switches back and forth between these orders.
157struct ChainElem {
158 Instruction *Inst;
159 APInt OffsetFromLeader;
160 ChainElem(Instruction *Inst, APInt OffsetFromLeader)
161 : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
162};
163using Chain = SmallVector<ChainElem, 1>;
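// For example (illustrative): i32 loads at byte offsets 0, 4, and 12 from the
// same leader would be represented as the chain
//   { {%ld0, +0}, {%ld1, +4}, {%ld2, +12} }
// where %ld0, the first instruction in BB order, is the leader.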
164
165void sortChainInBBOrder(Chain &C) {
166 sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
167}
168
169void sortChainInOffsetOrder(Chain &C) {
170 sort(C, [](const auto &A, const auto &B) {
171 if (A.OffsetFromLeader != B.OffsetFromLeader)
172 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
173 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
174 });
175}
176
177[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
178 for (const auto &E : C) {
179 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
180 }
181}
182
183using EquivalenceClassMap =
184 MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
185
186// FIXME: Assuming stack alignment of 4 is always good enough
187constexpr unsigned StackAdjustedAlignment = 4;
188
189Instruction *propagateMetadata(Instruction *I, const Chain &C) {
190 SmallVector<Value *, 8> Values;
191 for (const ChainElem &E : C)
192 Values.emplace_back(E.Inst);
193 return propagateMetadata(I, Values);
194}
195
196bool isInvariantLoad(const Instruction *I) {
197 const LoadInst *LI = dyn_cast<LoadInst>(I);
198 return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
199}
200
201/// Reorders the instructions that I depends on (the instructions defining its
202/// operands), to ensure they dominate I.
203void reorder(Instruction *I) {
204 SmallPtrSet<Instruction *, 16> InstructionsToMove;
205 SmallVector<Instruction *, 16> Worklist;
206
207 Worklist.emplace_back(I);
208 while (!Worklist.empty()) {
209 Instruction *IW = Worklist.pop_back_val();
210 int NumOperands = IW->getNumOperands();
211 for (int Idx = 0; Idx < NumOperands; Idx++) {
212 Instruction *IM = dyn_cast<Instruction>(IW->getOperand(Idx));
213 if (!IM || IM->getOpcode() == Instruction::PHI)
214 continue;
215
216 // If IM is in another BB, no need to move it, because this pass only
217 // vectorizes instructions within one BB.
218 if (IM->getParent() != I->getParent())
219 continue;
220
221 assert(IM != I && "Unexpected cycle while re-ordering instructions");
222
223 if (!IM->comesBefore(I)) {
224 InstructionsToMove.insert(IM);
225 Worklist.emplace_back(IM);
226 }
227 }
228 }
229
230 // All instructions to move should follow I. Start from I, not from begin().
231 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
232 Instruction *IM = &*(BBI++);
233 if (!InstructionsToMove.contains(IM))
234 continue;
235 IM->moveBefore(I->getIterator());
236 }
237}
238
239class Vectorizer {
240 Function &F;
241 AliasAnalysis &AA;
242 AssumptionCache &AC;
243 DominatorTree &DT;
244 ScalarEvolution &SE;
245 TargetTransformInfo &TTI;
246 const DataLayout &DL;
247 IRBuilder<> Builder;
248
249 /// We could erase instrs right after vectorizing them, but that can mess up
250 /// our BB iterators, and also can make the equivalence class keys point to
251 /// freed memory. This is fixable, but it's simpler just to wait until we're
252 /// done with the BB and erase all at once.
253 SmallVector<Instruction *, 128> ToErase;
254
255 /// We insert load/store instructions and GEPs to fill gaps and extend chains
257 /// to enable vectorization. Keep track of them and delete them later.
257 DenseSet<Instruction *> ExtraElements;
258
259public:
260 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
261 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
262 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
263 DL(F.getDataLayout()), Builder(SE.getContext()) {}
264
265 bool run();
266
267private:
268 static const unsigned MaxDepth = 3;
269
270 /// Runs the vectorizer on a "pseudo basic block", which is a range of
271 /// instructions [Begin, End) within one BB all of which have
272 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
273 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
274
275 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
276 /// in the same BB with the same value for getUnderlyingObject() etc.
277 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
278 ArrayRef<Instruction *> EqClass);
279
280 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
281 /// where all instructions access a known, constant offset from the first
282 /// instruction.
283 bool runOnChain(Chain &C);
284
285 /// Splits the chain into subchains of instructions which read/write a
286 /// contiguous block of memory. Discards any length-1 subchains (because
287 /// there's nothing to vectorize in there). Also attempts to fill gaps with
288 /// "extra" elements to artificially make chains contiguous in some cases.
289 std::vector<Chain> splitChainByContiguity(Chain &C);
290
291 /// Splits the chain into subchains where it's safe to hoist loads up to the
292 /// beginning of the sub-chain and it's safe to sink loads up to the end of
293 /// the sub-chain. Discards any length-1 subchains. Also attempts to extend
294 /// non-power-of-two chains by adding "extra" elements in some cases.
295 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
296
297 /// Splits the chain into subchains that make legal, aligned accesses.
298 /// Discards any length-1 subchains.
299 std::vector<Chain> splitChainByAlignment(Chain &C);
300
301 /// Converts the instrs in the chain into a single vectorized load or store.
302 /// Adds the old scalar loads/stores to ToErase.
303 bool vectorizeChain(Chain &C);
304
305 /// Tries to compute the offset in bytes PtrB - PtrA.
306 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
307 Instruction *ContextInst,
308 unsigned Depth = 0);
309 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
310 Instruction *ContextInst,
311 unsigned Depth);
312 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
313 Instruction *ContextInst,
314 unsigned Depth);
315
316 /// Gets the element type of the vector that the chain will load or store.
317 /// This is nontrivial because the chain may contain elements of different
318 /// types; e.g. it's legal to have a chain that contains both i32 and float.
319 Type *getChainElemTy(const Chain &C);
320
321 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
322 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
323 /// instructions.
324 ///
325 /// The map ChainElemOffsets must contain all of the elements in
326 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
327 /// address. It's ok if it contains additional entries.
328 template <bool IsLoadChain>
329 bool isSafeToMove(
330 Instruction *ChainElem, Instruction *ChainBegin,
331 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
332 BatchAAResults &BatchAA);
333
334 /// Merges the equivalence classes if they have underlying objects that differ
335 /// by one level of indirection (i.e., one is a getelementptr and the other is
336 /// the base pointer in that getelementptr).
337 void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
338
339 /// Collects loads and stores grouped by "equivalence class", where:
340 /// - all elements in an eq class are a load or all are a store,
341 /// - they all load/store the same element size (it's OK to have e.g. i8 and
342 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
343 /// - they all have the same value for getUnderlyingObject().
344 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
345 BasicBlock::iterator End);
346
347 /// Partitions Instrs into "chains" where every instruction has a known
348 /// constant offset from the first instr in the chain.
349 ///
350 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
351 /// in the chain is the leader, and an instr touches distance 0 from itself.
352 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
353
354 /// Checks if a potential vector load/store with a given alignment is allowed
355 /// and fast. Aligned accesses are always allowed and fast, while misaligned
356 /// accesses depend on TTI checks to determine whether they can and should be
357 /// vectorized or kept as element-wise accesses.
358 bool accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS, Align Alignment,
359 unsigned VecElemBits) const;
360
361 /// Create a new GEP and a new Load/Store instruction such that the GEP
362 /// is pointing at PrevElem + Offset. In the case of stores, store poison.
363 /// Extra elements will either be combined into a masked load/store or
364 /// deleted before the end of the pass.
365 ChainElem createExtraElementAfter(const ChainElem &PrevElem, Type *Ty,
366 APInt Offset, StringRef Prefix,
367 Align Alignment = Align());
368
369 /// Create a mask that masks off the extra elements in the chain, to be used
370 /// for the creation of a masked load/store vector.
371 Value *createMaskForExtraElements(const ArrayRef<ChainElem> C,
372 FixedVectorType *VecTy);
373
374 /// Delete dead GEPs and extra Load/Store instructions created by
375 /// createExtraElementAfter
376 void deleteExtraElements();
377};
378
379class LoadStoreVectorizerLegacyPass : public FunctionPass {
380public:
381 static char ID;
382
383 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
384 initializeLoadStoreVectorizerLegacyPassPass(
385 *PassRegistry::getPassRegistry());
386 }
387
388 bool runOnFunction(Function &F) override;
389
390 StringRef getPassName() const override {
391 return "GPU Load and Store Vectorizer";
392 }
393
394 void getAnalysisUsage(AnalysisUsage &AU) const override {
395 AU.addRequired<AAResultsWrapperPass>();
396 AU.addRequired<AssumptionCacheTracker>();
397 AU.addRequired<ScalarEvolutionWrapperPass>();
398 AU.addRequired<DominatorTreeWrapperPass>();
399 AU.addRequired<TargetTransformInfoWrapperPass>();
400 AU.setPreservesCFG();
401 }
402};
403
404} // end anonymous namespace
405
406char LoadStoreVectorizerLegacyPass::ID = 0;
407
408INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
409 "Vectorize load and Store instructions", false, false)
410INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
411INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
412INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
413INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
414INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
415INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
416INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
417 "Vectorize load and store instructions", false, false)
418
419Pass *llvm::createLoadStoreVectorizerPass() {
420 return new LoadStoreVectorizerLegacyPass();
421}
422
423bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
424 // Don't vectorize when the attribute NoImplicitFloat is used.
425 if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
426 return false;
427
428 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
429 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
430 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
431 TargetTransformInfo &TTI =
432 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
433
434 AssumptionCache &AC =
435 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
436
437 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
438}
439
440PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
441 FunctionAnalysisManager &AM) {
442 // Don't vectorize when the attribute NoImplicitFloat is used.
443 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
444 return PreservedAnalyses::all();
445
446 AliasAnalysis &AA = AM.getResult<AAManager>(F);
447 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
448 ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
449 TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
450 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
451
452 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
453 PreservedAnalyses PA;
454 PA.preserveSet<CFGAnalyses>();
455 return Changed ? PA : PreservedAnalyses::all();
456}
457
458bool Vectorizer::run() {
459 bool Changed = false;
460 // Break up the BB if there are any instrs which aren't guaranteed to transfer
461 // execution to their successor.
462 //
463 // Consider, for example:
464 //
465 // def assert_arr_len(int n) { if (n < 2) exit(); }
466 //
467 // load arr[0]
468 // call assert_array_len(arr.length)
469 // load arr[1]
470 //
471 // Even though assert_arr_len does not read or write any memory, we can't
472 // speculate the second load before the call. More info at
473 // https://github.com/llvm/llvm-project/issues/52950.
474 for (BasicBlock *BB : post_order(&F)) {
475 // BB must at least have a terminator.
476 assert(!BB->empty());
477
478 SmallVector<BasicBlock::iterator> Barriers;
479 Barriers.emplace_back(BB->begin());
480 for (Instruction &I : *BB)
481 if (!isGuaranteedToTransferExecutionToSuccessor(&I))
482 Barriers.emplace_back(I.getIterator());
483 Barriers.emplace_back(BB->end());
484
485 for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
486 ++It)
487 Changed |= runOnPseudoBB(*It, *std::next(It));
488
489 for (Instruction *I : ToErase) {
490 // These will get deleted in deleteExtraElements.
491 // This is because ExtraElements will include both extra elements
492 // that *were* vectorized and extra elements that *were not*
493 // vectorized. ToErase will only include extra elements that *were*
494 // vectorized, so in order to avoid double deletion we skip them here and
495 // handle them in deleteExtraElements.
496 if (ExtraElements.contains(I))
497 continue;
498 auto *PtrOperand = getLoadStorePointerOperand(I);
499 if (I->use_empty())
500 I->eraseFromParent();
501 RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
502 }
503 ToErase.clear();
504 deleteExtraElements();
505 }
506
507 return Changed;
508}
509
510bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
511 BasicBlock::iterator End) {
512 LLVM_DEBUG({
513 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
514 if (End != Begin->getParent()->end())
515 dbgs() << *End;
516 else
517 dbgs() << "<BB end>";
518 dbgs() << ")\n";
519 });
520
521 bool Changed = false;
522 for (const auto &[EqClassKey, EqClass] :
523 collectEquivalenceClasses(Begin, End))
524 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
525
526 return Changed;
527}
528
529bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
530 ArrayRef<Instruction *> EqClass) {
531 bool Changed = false;
532
533 LLVM_DEBUG({
534 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
535 << " keyed on " << EqClassKey << ":\n";
536 for (Instruction *I : EqClass)
537 dbgs() << " " << *I << "\n";
538 });
539
540 std::vector<Chain> Chains = gatherChains(EqClass);
541 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
542 << " nontrivial chains.\n";);
543 for (Chain &C : Chains)
544 Changed |= runOnChain(C);
545 return Changed;
546}
547
548bool Vectorizer::runOnChain(Chain &C) {
549 LLVM_DEBUG({
550 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
551 dumpChain(C);
552 });
553
554 // Split up the chain into increasingly smaller chains, until we can finally
555 // vectorize the chains.
556 //
557 // (Don't be scared by the depth of the loop nest here. These operations are
558 // all at worst O(n lg n) in the number of instructions, and splitting chains
559 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
560 bool Changed = false;
561 for (auto &C : splitChainByMayAliasInstrs(C))
562 for (auto &C : splitChainByContiguity(C))
563 for (auto &C : splitChainByAlignment(C))
564 Changed |= vectorizeChain(C);
565 return Changed;
566}
567
568std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
569 if (C.empty())
570 return {};
571
572 sortChainInBBOrder(C);
573
574 LLVM_DEBUG({
575 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
576 dumpChain(C);
577 });
578
579 // We know that elements in the chain with non-overlapping offsets can't
580 // alias, but AA may not be smart enough to figure this out. Use a
581 // hashtable.
582 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
583 for (const auto &E : C)
584 ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
585
586 // Across a single invocation of this function the IR is not changing, so
587 // using a batched Alias Analysis is safe and can reduce compile time.
588 BatchAAResults BatchAA(AA);
589
590 // Loads get hoisted up to the first load in the chain. Stores get sunk
591 // down to the last store in the chain. Our algorithm for loads is:
592 //
593 // - Take the first element of the chain. This is the start of a new chain.
594 // - Take the next element of `Chain` and check for may-alias instructions
595 // up to the start of NewChain. If no may-alias instrs, add it to
596 // NewChain. Otherwise, start a new NewChain.
597 //
598 // For stores it's the same except in the reverse direction.
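  // For example (illustrative): in
  //   %a = load i32, ptr %p        ; chain elem at offset 0
  //   store i32 0, ptr %q          ; may alias %p+4
  //   %b = load i32, ptr %p.4      ; chain elem at offset 4
  // the store blocks hoisting %b up to %a, so the chain is split here into two
  // length-1 pieces, neither of which gets vectorized.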
599 //
600 // We expect IsLoad to be an std::bool_constant.
601 auto Impl = [&](auto IsLoad) {
602 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
603 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
604 if constexpr (IsLoad())
605 return std::make_pair(C.begin(), C.end());
606 else
607 return std::make_pair(C.rbegin(), C.rend());
608 }(IsLoad);
609 assert(ChainBegin != ChainEnd);
610
611 std::vector<Chain> Chains;
612 Chain NewChain;
613 NewChain.emplace_back(*ChainBegin);
614 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
615 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
616 ChainOffsets, BatchAA)) {
617 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
618 << *ChainIt->Inst << " into " << *ChainBegin->Inst
619 << "\n");
620 NewChain.emplace_back(*ChainIt);
621 } else {
623 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
624 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
625 if (NewChain.size() > 1) {
626 LLVM_DEBUG({
627 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
628 dumpChain(NewChain);
629 });
630 Chains.emplace_back(std::move(NewChain));
631 }
632
633 // Start a new chain.
634 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
635 }
636 }
637 if (NewChain.size() > 1) {
638 LLVM_DEBUG({
639 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
640 dumpChain(NewChain);
641 });
642 Chains.emplace_back(std::move(NewChain));
643 }
644 return Chains;
645 };
646
647 if (isa<LoadInst>(C[0].Inst))
648 return Impl(/*IsLoad=*/std::bool_constant<true>());
649
650 assert(isa<StoreInst>(C[0].Inst));
651 return Impl(/*IsLoad=*/std::bool_constant<false>());
652}
653
654std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
655 if (C.empty())
656 return {};
657
658 sortChainInOffsetOrder(C);
659
660 LLVM_DEBUG({
661 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
662 dumpChain(C);
663 });
664
665 // If the chain is not contiguous, we try to fill the gap with "extra"
666 // elements to artificially make it contiguous, to try to enable
667 // vectorization. We only fill gaps if there is potential to end up with a
668 // legal masked load/store given the target, address space, and element type.
669 // At this point, when querying the TTI, optimistically assume max alignment
670 // and max vector size, as splitChainByAlignment will ensure the final vector
671 // shape passes the legalization check.
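  // For example (illustrative): i32 accesses at offsets 0, 4, and 12 leave a
  // one-element hole at offset 8. If the target might accept a masked
  // load/store of the optimistic vector type, we insert an extra "GapFill"
  // element at offset 8 so the chain becomes contiguous; that lane is later
  // masked off (or the extra element deleted) rather than kept as a real
  // access to the skipped bytes.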
672 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
673 Type *ElementType = getLoadStoreType(C[0].Inst)->getScalarType();
674 unsigned MaxVecRegBits = TTI.getLoadStoreVecRegBitWidth(AS);
675 Align OptimisticAlign = Align(MaxVecRegBits / 8);
676 unsigned int MaxVectorNumElems =
677 MaxVecRegBits / DL.getTypeSizeInBits(ElementType);
678 // Note: This check decides whether to try to fill gaps based on the masked
679 // legality of the target's maximum vector size (getLoadStoreVecRegBitWidth).
680 // If a target *does not* support a masked load/store with this max vector
681 // size, but *does* support a masked load/store with a *smaller* vector size,
682 // that optimization will be missed. This does not occur in any of the targets
683 // that currently support this API.
684 FixedVectorType *OptimisticVectorType =
685 FixedVectorType::get(ElementType, MaxVectorNumElems);
686 bool TryFillGaps =
687 isa<LoadInst>(C[0].Inst)
688 ? TTI.isLegalMaskedLoad(OptimisticVectorType, OptimisticAlign, AS,
689 TTI::MaskKind::ConstantMask)
690 : TTI.isLegalMaskedStore(OptimisticVectorType, OptimisticAlign, AS,
691 TTI::MaskKind::ConstantMask);
692
693 // Cache the best aligned element in the chain for use when creating extra
694 // elements.
695 Align BestAlignedElemAlign = getLoadStoreAlignment(C[0].Inst);
696 APInt OffsetOfBestAlignedElemFromLeader = C[0].OffsetFromLeader;
697 for (const auto &E : C) {
698 Align ElementAlignment = getLoadStoreAlignment(E.Inst);
699 if (ElementAlignment > BestAlignedElemAlign) {
700 BestAlignedElemAlign = ElementAlignment;
701 OffsetOfBestAlignedElemFromLeader = E.OffsetFromLeader;
702 }
703 }
704
705 auto DeriveAlignFromBestAlignedElem = [&](APInt NewElemOffsetFromLeader) {
706 return commonAlignment(
707 BestAlignedElemAlign,
708 (NewElemOffsetFromLeader - OffsetOfBestAlignedElemFromLeader)
709 .abs()
710 .getLimitedValue());
711 };
712
713 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
714
715 std::vector<Chain> Ret;
716 Ret.push_back({C.front()});
717
718 unsigned ChainElemTyBits = DL.getTypeSizeInBits(getChainElemTy(C));
719 ChainElem &Prev = C[0];
720 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
721 auto &CurChain = Ret.back();
722
723 APInt PrevSzBytes =
724 APInt(ASPtrBits, DL.getTypeStoreSize(getLoadStoreType(Prev.Inst)));
725 APInt PrevReadEnd = Prev.OffsetFromLeader + PrevSzBytes;
726 unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(It->Inst));
727
728 // Add this instruction to the end of the current chain, or start a new one.
729 assert(
730 8 * SzBytes % ChainElemTyBits == 0 &&
731 "Every chain-element size must be a multiple of the element size after "
732 "vectorization.");
733 APInt ReadEnd = It->OffsetFromLeader + SzBytes;
734 // Allow redundancy: partial or full overlap counts as contiguous.
735 bool AreContiguous = false;
736 if (It->OffsetFromLeader.sle(PrevReadEnd)) {
737 // Check overlap is a multiple of the element size after vectorization.
738 uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader).getZExtValue();
739 if (8 * Overlap % ChainElemTyBits == 0)
740 AreContiguous = true;
741 }
742
743 LLVM_DEBUG(dbgs() << "LSV: Instruction is "
744 << (AreContiguous ? "contiguous" : "chain-breaker")
745 << *It->Inst << " (starts at offset "
746 << It->OffsetFromLeader << ")\n");
747
748 // If the chain is not contiguous, try to fill in gaps between Prev and
749 // Curr. For now, we aren't filling gaps between load/stores of different
750 // sizes. Additionally, as a conservative heuristic, we only fill gaps of
751 // 1-2 elements. Generating loads/stores with too many unused bytes has a
752 // side effect of increasing register pressure (on NVIDIA targets at least),
753 // which could cancel out the benefit of reducing the number of loads/stores.
754 bool GapFilled = false;
755 if (!AreContiguous && TryFillGaps && PrevSzBytes == SzBytes) {
756 APInt GapSzBytes = It->OffsetFromLeader - PrevReadEnd;
757 if (GapSzBytes == PrevSzBytes) {
758 // There is a single gap between Prev and Curr, create one extra element
759 ChainElem NewElem = createExtraElementAfter(
760 Prev, getLoadStoreType(Prev.Inst), PrevSzBytes, "GapFill",
761 DeriveAlignFromBestAlignedElem(PrevReadEnd));
762 CurChain.push_back(NewElem);
763 GapFilled = true;
764 }
765 // There are two gaps between Prev and Curr, only create two extra
766 // elements if Prev is the first element in a sequence of four.
767 // This has the highest chance of resulting in a beneficial vectorization.
768 if ((GapSzBytes == 2 * PrevSzBytes) && (CurChain.size() % 4 == 1)) {
769 ChainElem NewElem1 = createExtraElementAfter(
770 Prev, getLoadStoreType(Prev.Inst), PrevSzBytes, "GapFill",
771 DeriveAlignFromBestAlignedElem(PrevReadEnd));
772 ChainElem NewElem2 = createExtraElementAfter(
773 NewElem1, getLoadStoreType(Prev.Inst), PrevSzBytes, "GapFill",
774 DeriveAlignFromBestAlignedElem(PrevReadEnd + PrevSzBytes));
775 CurChain.push_back(NewElem1);
776 CurChain.push_back(NewElem2);
777 GapFilled = true;
778 }
779 }
780
781 if (AreContiguous || GapFilled)
782 CurChain.push_back(*It);
783 else
784 Ret.push_back({*It});
785 // In certain cases when handling redundant elements with partial overlaps,
786 // the previous element may still extend beyond the current element. Only
787 // update Prev if the current element is the new end of the chain.
788 if (ReadEnd.sge(PrevReadEnd))
789 Prev = *It;
790 }
791
792 // Filter out length-1 chains, these are uninteresting.
793 llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
794 return Ret;
795}
796
797Type *Vectorizer::getChainElemTy(const Chain &C) {
798 assert(!C.empty());
799 // The rules are:
800 // - If there are any pointer types in the chain, use an integer type.
801 // - Prefer an integer type if it appears in the chain.
802 // - Otherwise, use the first type in the chain.
803 //
804 // The rule about pointer types is a simplification when we merge e.g. a load
805 // of a ptr and a double. There's no direct conversion from a ptr to a
806 // double; it requires a ptrtoint followed by a bitcast.
807 //
808 // It's unclear to me if the other rules have any practical effect, but we do
809 // it to match this pass's previous behavior.
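  //
  // For example (illustrative): a chain mixing a ptr load and a double load
  // yields i64 (assuming 64-bit pointers); a chain mixing float and i32 yields
  // i32; a chain of float and <2 x float> accesses yields float, the scalar
  // type of the first element.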
810 if (any_of(C, [](const ChainElem &E) {
811 return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
812 })) {
813 return Type::getIntNTy(
814 F.getContext(),
815 DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
816 }
817
818 for (const ChainElem &E : C)
819 if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
820 return T;
821 return getLoadStoreType(C[0].Inst)->getScalarType();
822}
823
824std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
825 // We use a simple greedy algorithm.
826 // - Given a chain of length N, find all prefixes that
827 // (a) are not longer than the max register length, and
828 // (b) are a power of 2.
829 // - Starting from the longest prefix, try to create a vector of that length.
830 // - If one of them works, great. Repeat the algorithm on any remaining
831 // elements in the chain.
832 // - If none of them work, discard the first element and repeat on a chain
833 // of length N-1.
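  //
  // For example (illustrative): with 16-byte vector registers and six
  // contiguous i32 accesses (24 bytes), the longest candidate starting at
  // element 0 covers elements [0, 3] (16 bytes); if that candidate is
  // accepted, the scan resumes at element 4 and can still form a 2 x i32
  // vector from the tail.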
834 if (C.empty())
835 return {};
836
837 sortChainInOffsetOrder(C);
838
839 LLVM_DEBUG({
840 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
841 dumpChain(C);
842 });
843
844 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
845 auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
846 unsigned ChainSizeBytes, VectorType *VecTy) {
847 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
848 ChainSizeBytes, VecTy)
849 : TTI.getStoreVectorFactor(VF, LoadStoreSize,
850 ChainSizeBytes, VecTy);
851 };
852
853#ifndef NDEBUG
854 for (const auto &E : C) {
855 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
856 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
857 "Should have filtered out non-power-of-two elements in "
858 "collectEquivalenceClasses.");
859 }
860#endif
861
862 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
863 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
864
865 // For compile time reasons, we cache whether or not the superset
866 // of all candidate chains contains any extra loads/stores from earlier gap
867 // filling.
868 bool CandidateChainsMayContainExtraLoadsStores = any_of(
869 C, [this](const ChainElem &E) { return ExtraElements.contains(E.Inst); });
870
871 std::vector<Chain> Ret;
872 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
873 // Find candidate chains of size not greater than the largest vector reg.
874 // These chains are over the closed interval [CBegin, CEnd].
875 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
876 CandidateChains;
877 // Need to compute the size of every candidate chain from its beginning
878 // because of possible overlapping among chain elements.
879 unsigned Sz = DL.getTypeStoreSize(getLoadStoreType(C[CBegin].Inst));
880 APInt PrevReadEnd = C[CBegin].OffsetFromLeader + Sz;
881 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
882 APInt ReadEnd = C[CEnd].OffsetFromLeader +
883 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst));
884 unsigned BytesAdded =
885 PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
886 Sz += BytesAdded;
887 if (Sz > VecRegBytes)
888 break;
889 CandidateChains.emplace_back(CEnd, Sz);
890 PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
891 }
892
893 // Consider the longest chain first.
894 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
895 It != End; ++It) {
896 auto [CEnd, SizeBytes] = *It;
898 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
899 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
900
901 Type *VecElemTy = getChainElemTy(C);
902 // Note, VecElemTy is a power of 2, but might be less than one byte. For
903 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
904 // VecElemTy would be i4.
905 unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
906
907 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
908 assert((8 * SizeBytes) % VecElemBits == 0);
909 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
910 FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
911 unsigned VF = 8 * VecRegBytes / VecElemBits;
912
913 // Check that TTI is happy with this vectorization factor.
914 unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
915 VecElemBits * NumVecElems / 8, VecTy);
916 if (TargetVF != VF && TargetVF < NumVecElems) {
918 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
919 "because TargetVF="
920 << TargetVF << " != VF=" << VF
921 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
922 continue;
923 }
924
925 // If we're loading/storing from an alloca, align it if possible.
926 //
927 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
928 // tells us this is beneficial. This feels a bit odd, but it matches
929 // existing tests. This isn't *so* bad, because at most we align to 4
930 // bytes (current value of StackAdjustedAlignment).
931 //
932 // FIXME: We will upgrade the alignment of the alloca even if it turns out
933 // we can't vectorize for some other reason.
934 Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
935 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
936 isa<AllocaInst>(PtrOperand->stripPointerCasts());
937 Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
938 Align PrefAlign = Align(StackAdjustedAlignment);
939 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
940 accessIsAllowedAndFast(SizeBytes, AS, PrefAlign, VecElemBits)) {
941 Align NewAlign = getOrEnforceKnownAlignment(
942 PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
943 if (NewAlign >= Alignment) {
945 << "LSV: splitByChain upgrading alloca alignment from "
946 << Alignment.value() << " to " << NewAlign.value()
947 << "\n");
948 Alignment = NewAlign;
949 }
950 }
951
952 Chain ExtendingLoadsStores;
953 if (!accessIsAllowedAndFast(SizeBytes, AS, Alignment, VecElemBits)) {
954 // If we have a non-power-of-2 element count, attempt to extend the
955 // chain to the next power-of-2 if it makes the access allowed and
956 // fast.
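        // For example (illustrative): three i32 stores (12 bytes) may be
        // extended with one extra poison store to form a <4 x i32> candidate;
        // the extension is kept only if the target accepts the corresponding
        // masked store (mask <1,1,1,0> here) and the widened access is then
        // allowed and fast.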
957 bool AllowedAndFast = false;
958 if (NumVecElems < TargetVF && !isPowerOf2_32(NumVecElems) &&
959 VecElemBits >= 8) {
960 // TargetVF may be a lot higher than NumVecElems,
961 // so only extend to the next power of 2.
962 assert(VecElemBits % 8 == 0);
963 unsigned VecElemBytes = VecElemBits / 8;
964 unsigned NewNumVecElems = PowerOf2Ceil(NumVecElems);
965 unsigned NewSizeBytes = VecElemBytes * NewNumVecElems;
966
967 assert(isPowerOf2_32(TargetVF) &&
968 "TargetVF expected to be a power of 2");
969 assert(NewNumVecElems <= TargetVF &&
970 "Should not extend past TargetVF");
971
973 << "LSV: attempting to extend chain of " << NumVecElems
974 << " " << (IsLoadChain ? "loads" : "stores") << " to "
975 << NewNumVecElems << " elements\n");
976 bool IsLegalToExtend =
977 IsLoadChain ? TTI.isLegalMaskedLoad(
978 FixedVectorType::get(VecElemTy, NewNumVecElems),
979 Alignment, AS, TTI::MaskKind::ConstantMask)
980 : TTI.isLegalMaskedStore(
981 FixedVectorType::get(VecElemTy, NewNumVecElems),
982 Alignment, AS, TTI::MaskKind::ConstantMask);
983 // Only artificially increase the chain if it would be AllowedAndFast
984 // and if the resulting masked load/store will be legal for the
985 // target.
986 if (IsLegalToExtend &&
987 accessIsAllowedAndFast(NewSizeBytes, AS, Alignment,
988 VecElemBits)) {
990 << "LSV: extending " << (IsLoadChain ? "load" : "store")
991 << " chain of " << NumVecElems << " "
992 << (IsLoadChain ? "loads" : "stores")
993 << " with total byte size of " << SizeBytes << " to "
994 << NewNumVecElems << " "
995 << (IsLoadChain ? "loads" : "stores")
996 << " with total byte size of " << NewSizeBytes
997 << ", TargetVF=" << TargetVF << " \n");
998
999 // Create (NewNumVecElems - NumVecElems) extra elements.
1000 // We are basing each extra element on CBegin, which means the
1001 // offsets should be based on SizeBytes, which represents the offset
1002 // from CBegin to the current end of the chain.
1003 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1004 for (unsigned I = 0; I < (NewNumVecElems - NumVecElems); I++) {
1005 ChainElem NewElem = createExtraElementAfter(
1006 C[CBegin], VecElemTy,
1007 APInt(ASPtrBits, SizeBytes + I * VecElemBytes), "Extend");
1008 ExtendingLoadsStores.push_back(NewElem);
1009 }
1010
1011 // Update the size and number of elements for upcoming checks.
1012 SizeBytes = NewSizeBytes;
1013 NumVecElems = NewNumVecElems;
1014 AllowedAndFast = true;
1015 }
1016 }
1017 if (!AllowedAndFast) {
1018 // We were not able to achieve legality by extending the chain.
1020 << "LSV: splitChainByAlignment discarding candidate chain "
1021 "because its alignment is not AllowedAndFast: "
1022 << Alignment.value() << "\n");
1023 continue;
1024 }
1025 }
1026
1027 if ((IsLoadChain &&
1028 !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
1029 (!IsLoadChain &&
1030 !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
1031 LLVM_DEBUG(
1032 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
1033 "because !isLegalToVectorizeLoad/StoreChain.");
1034 continue;
1035 }
1036
1037 if (CandidateChainsMayContainExtraLoadsStores) {
1038 // If the candidate chain contains extra loads/stores from an earlier
1039 // optimization, confirm legality now. This filter is essential because
1040 // when filling gaps in splitChainByContiguity, we queried the API to
1041 // check that (for a given element type and address space) there *may*
1042 // have been a legal masked load/store we could possibly create. Now, we
1043 // need to check if the actual chain we ended up with is legal to turn
1044 // into a masked load/store. This is relevant for NVPTX, for example,
1045 // where a masked store is only legal if we have ended up with a 256-bit
1046 // vector.
1047 bool CurrCandContainsExtraLoadsStores = llvm::any_of(
1048 ArrayRef<ChainElem>(C).slice(CBegin, CEnd - CBegin + 1),
1049 [this](const ChainElem &E) {
1050 return ExtraElements.contains(E.Inst);
1051 });
1052
1053 if (CurrCandContainsExtraLoadsStores &&
1054 (IsLoadChain ? !TTI.isLegalMaskedLoad(
1055 FixedVectorType::get(VecElemTy, NumVecElems),
1056 Alignment, AS, TTI::MaskKind::ConstantMask)
1057 : !TTI.isLegalMaskedStore(
1058 FixedVectorType::get(VecElemTy, NumVecElems),
1059 Alignment, AS, TTI::MaskKind::ConstantMask))) {
1061 << "LSV: splitChainByAlignment discarding candidate chain "
1062 "because it contains extra loads/stores that we cannot "
1063 "legally vectorize into a masked load/store \n");
1064 continue;
1065 }
1066 }
1067
1068 // Hooray, we can vectorize this chain!
1069 Chain &NewChain = Ret.emplace_back();
1070 for (unsigned I = CBegin; I <= CEnd; ++I)
1071 NewChain.emplace_back(C[I]);
1072 for (ChainElem E : ExtendingLoadsStores)
1073 NewChain.emplace_back(E);
1074 CBegin = CEnd; // Skip over the instructions we've added to the chain.
1075 break;
1076 }
1077 }
1078 return Ret;
1079}
1080
1081bool Vectorizer::vectorizeChain(Chain &C) {
1082 if (C.size() < 2)
1083 return false;
1084
1085 bool ChainContainsExtraLoadsStores = llvm::any_of(
1086 C, [this](const ChainElem &E) { return ExtraElements.contains(E.Inst); });
1087
1088 // If we are left with a two-element chain, and one of the elements is an
1089 // extra element, we don't want to vectorize
1090 if (C.size() == 2 && ChainContainsExtraLoadsStores)
1091 return false;
1092
1093 sortChainInOffsetOrder(C);
1094
1095 LLVM_DEBUG({
1096 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
1097 dumpChain(C);
1098 });
1099
1100 Type *VecElemTy = getChainElemTy(C);
1101 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
1102 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
1103 unsigned BytesAdded = DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
1104 APInt PrevReadEnd = C[0].OffsetFromLeader + BytesAdded;
1105 unsigned ChainBytes = BytesAdded;
1106 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
1107 unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
1108 APInt ReadEnd = It->OffsetFromLeader + SzBytes;
1109 // Update ChainBytes considering possible overlap.
1110 BytesAdded =
1111 PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
1112 ChainBytes += BytesAdded;
1113 PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
1114 }
1115
1116 assert(8 * ChainBytes % DL.getTypeSizeInBits(VecElemTy) == 0);
1117 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
1118 // than 1 byte (e.g. VecTy == <32 x i1>).
1119 unsigned NumElem = 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy);
1120 Type *VecTy = FixedVectorType::get(VecElemTy, NumElem);
1121
1122 Align Alignment = getLoadStoreAlignment(C[0].Inst);
1123 // If this is a load/store of an alloca, we might have upgraded the alloca's
1124 // alignment earlier. Get the new alignment.
1125 if (AS == DL.getAllocaAddrSpace()) {
1126 Alignment = std::max(
1127 Alignment,
1128 getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
1129 MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
1130 }
1131
1132 // All elements of the chain must have the same scalar-type size.
1133#ifndef NDEBUG
1134 for (const ChainElem &E : C)
1135 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
1136 DL.getTypeStoreSize(VecElemTy));
1137#endif
1138
1139 Instruction *VecInst;
1140 if (IsLoadChain) {
1141 // Loads get hoisted to the location of the first load in the chain. We may
1142 // also need to hoist the (transitive) operands of the loads.
1143 Builder.SetInsertPoint(
1144 llvm::min_element(C, [](const auto &A, const auto &B) {
1145 return A.Inst->comesBefore(B.Inst);
1146 })->Inst);
1147
1148 // If the chain contains extra loads, we need to vectorize into a
1149 // masked load.
1150 if (ChainContainsExtraLoadsStores) {
1151 assert(TTI.isLegalMaskedLoad(VecTy, Alignment, AS,
1152 TTI::MaskKind::ConstantMask));
1153 Value *Mask = createMaskForExtraElements(C, cast<FixedVectorType>(VecTy));
1154 VecInst = Builder.CreateMaskedLoad(
1155 VecTy, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask);
1156 } else {
1157 // This can happen due to a chain of redundant loads.
1158 // In this case, just use the element-type, and avoid ExtractElement.
1159 if (NumElem == 1)
1160 VecTy = VecElemTy;
1161 // Chain is in offset order, so C[0] is the instr with the lowest offset,
1162 // i.e. the root of the vector.
1163 VecInst = Builder.CreateAlignedLoad(
1164 VecTy, getLoadStorePointerOperand(C[0].Inst), Alignment);
1165 }
1166
1167 for (const ChainElem &E : C) {
1168 Instruction *I = E.Inst;
1169 Value *V;
1170 Type *T = getLoadStoreType(I);
1171 unsigned EOffset =
1172 (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
1173 unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
1174 if (!VecTy->isVectorTy()) {
1175 V = VecInst;
1176 } else if (auto *VT = dyn_cast<FixedVectorType>(T)) {
1177 auto Mask = llvm::to_vector<8>(
1178 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
1179 V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
1180 } else {
1181 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
1182 I->getName());
1183 }
1184 if (V->getType() != I->getType())
1185 V = Builder.CreateBitOrPointerCast(V, I->getType());
1186 I->replaceAllUsesWith(V);
1187 }
1188
1189 // Finally, we need to reorder the instrs in the BB so that the (transitive)
1190 // operands of VecInst appear before it. To see why, suppose we have
1191 // vectorized the following code:
1192 //
1193 // ptr1 = gep a, 1
1194 // load1 = load i32 ptr1
1195 // ptr0 = gep a, 0
1196 // load0 = load i32 ptr0
1197 //
1198 // We will put the vectorized load at the location of the earliest load in
1199 // the BB, i.e. load1. We get:
1200 //
1201 // ptr1 = gep a, 1
1202 // loadv = load <2 x i32> ptr0
1203 // load0 = extractelement loadv, 0
1204 // load1 = extractelement loadv, 1
1205 // ptr0 = gep a, 0
1206 //
1207 // Notice that loadv uses ptr0, which is defined *after* it!
1208 reorder(VecInst);
1209 } else {
1210 // Stores get sunk to the location of the last store in the chain.
1211 Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
1212 return A.Inst->comesBefore(B.Inst);
1213 })->Inst);
1214
1215 // Build the vector to store.
1216 Value *Vec = PoisonValue::get(VecTy);
1217 auto InsertElem = [&](Value *V, unsigned VecIdx) {
1218 if (V->getType() != VecElemTy)
1219 V = Builder.CreateBitOrPointerCast(V, VecElemTy);
1220 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx));
1221 };
1222 for (const ChainElem &E : C) {
1223 auto *I = cast<StoreInst>(E.Inst);
1224 unsigned EOffset =
1225 (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
1226 unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
1227 if (FixedVectorType *VT =
1228 dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
1229 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
1230 InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
1231 Builder.getInt32(J)),
1232 VecIdx++);
1233 }
1234 } else {
1235 InsertElem(I->getValueOperand(), VecIdx);
1236 }
1237 }
1238
1239 // If the chain originates from extra stores, we need to vectorize into a
1240 // masked store.
1241 if (ChainContainsExtraLoadsStores) {
1242 assert(TTI.isLegalMaskedStore(Vec->getType(), Alignment, AS,
1243 TTI::MaskKind::ConstantMask));
1244 Value *Mask =
1245 createMaskForExtraElements(C, cast<FixedVectorType>(Vec->getType()));
1246 VecInst = Builder.CreateMaskedStore(
1247 Vec, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask);
1248 } else {
1249 // Chain is in offset order, so C[0] is the instr with the lowest offset,
1250 // i.e. the root of the vector.
1251 VecInst = Builder.CreateAlignedStore(
1252 Vec, getLoadStorePointerOperand(C[0].Inst), Alignment);
1253 }
1254 }
1255
1256 propagateMetadata(VecInst, C);
1257
1258 for (const ChainElem &E : C)
1259 ToErase.emplace_back(E.Inst);
1260
1261 ++NumVectorInstructions;
1262 NumScalarsVectorized += C.size();
1263 return true;
1264}
1265
1266template <bool IsLoadChain>
1267bool Vectorizer::isSafeToMove(
1268 Instruction *ChainElem, Instruction *ChainBegin,
1269 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
1270 BatchAAResults &BatchAA) {
1271 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
1272 << *ChainBegin << ")\n");
1273
1274 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
1275 if (ChainElem == ChainBegin)
1276 return true;
1277
1278 // Invariant loads can always be reordered; by definition they are not
1279 // clobbered by stores.
1280 if (isInvariantLoad(ChainElem))
1281 return true;
1282
1283 auto BBIt = std::next([&] {
1284 if constexpr (IsLoadChain)
1285 return BasicBlock::reverse_iterator(ChainElem);
1286 else
1287 return BasicBlock::iterator(ChainElem);
1288 }());
1289 auto BBItEnd = std::next([&] {
1290 if constexpr (IsLoadChain)
1291 return BasicBlock::reverse_iterator(ChainBegin);
1292 else
1293 return BasicBlock::iterator(ChainBegin);
1294 }());
1295
1296 const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1297 const unsigned ChainElemSize =
1298 DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1299
1300 for (; BBIt != BBItEnd; ++BBIt) {
1301 Instruction *I = &*BBIt;
1302
1303 if (!I->mayReadOrWriteMemory())
1304 continue;
1305
1306 // Loads can be reordered with other loads.
1307 if (IsLoadChain && isa<LoadInst>(I))
1308 continue;
1309
1310 // Stores can be sunk below invariant loads.
1311 if (!IsLoadChain && isInvariantLoad(I))
1312 continue;
1313
1314 // If I is in the chain, we can tell whether it aliases ChainElem by checking
1315 // what offset ChainElem accesses. This may be better than AA is able to do.
1316 //
1317 // We should really only have duplicate offsets for stores (the duplicate
1318 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1319 // split the chain so we don't have to handle this case specially.
1320 if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1321 // I and ChainElem overlap if:
1322 // - I and ChainElem have the same offset, OR
1323 // - I's offset is less than ChainElem's, but I touches past the
1324 // beginning of ChainElem, OR
1325 // - ChainElem's offset is less than I's, but ChainElem touches past the
1326 // beginning of I.
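      // For example (illustrative): a 4-byte chain member at offset 0 and a
      // 4-byte store I at offset 2 overlap, because 0 + 4 > 2, so ChainElem
      // cannot be moved across I.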
1327 const APInt &IOffset = OffsetIt->second;
1328 unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1329 if (IOffset == ChainElemOffset ||
1330 (IOffset.sle(ChainElemOffset) &&
1331 (IOffset + IElemSize).sgt(ChainElemOffset)) ||
1332 (ChainElemOffset.sle(IOffset) &&
1333 (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1334 LLVM_DEBUG({
1335 // Double check that AA also sees this alias. If not, we probably
1336 // have a bug.
1337 ModRefInfo MR =
1338 BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1339 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1340 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1341 });
1342 return false; // We found an aliasing instruction; bail.
1343 }
1344
1345 continue; // We're confident there's no alias.
1346 }
1347
1348 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1349 ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1350 if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
1351 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1352 << " Aliasing instruction:\n"
1353 << " " << *I << '\n'
1354 << " Aliased instruction and pointer:\n"
1355 << " " << *ChainElem << '\n'
1356 << " " << *getLoadStorePointerOperand(ChainElem)
1357 << '\n');
1358
1359 return false;
1360 }
1361 }
1362 return true;
1363}
1364
1365static bool checkNoWrapFlags(Instruction *I, bool Signed) {
1366 BinaryOperator *BinOpI = cast<BinaryOperator>(I);
1367 return (Signed && BinOpI->hasNoSignedWrap()) ||
1368 (!Signed && BinOpI->hasNoUnsignedWrap());
1369}
1370
1371static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1372 unsigned MatchingOpIdxA, Instruction *AddOpB,
1373 unsigned MatchingOpIdxB, bool Signed) {
1374 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1375 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1376 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1377 << ", MatchingOpIdxB=" << MatchingOpIdxB
1378 << ", Signed=" << Signed << "\n");
1379 // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1380 // being the same, we can guarantee that the transformation is safe if we can
1381 // prove that OpA won't overflow when IdxDiff is added to its other operand.
1382 // For example:
1383 // %tmp7 = add nsw i32 %tmp2, %v0
1384 // %tmp8 = sext i32 %tmp7 to i64
1385 // ...
1386 // %tmp11 = add nsw i32 %v0, 1
1387 // %tmp12 = add nsw i32 %tmp2, %tmp11
1388 // %tmp13 = sext i32 %tmp12 to i64
1389 //
1390 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1391 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1392 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1393 assert(AddOpA->getOpcode() == Instruction::Add &&
1394 AddOpB->getOpcode() == Instruction::Add &&
1395 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1396 if (AddOpA->getOperand(MatchingOpIdxA) ==
1397 AddOpB->getOperand(MatchingOpIdxB)) {
1398 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1399 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1400 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1401 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1402 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1403 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1404 checkNoWrapFlags(OtherInstrB, Signed) &&
1405 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1406 int64_t CstVal =
1407 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1408 if (OtherInstrB->getOperand(0) == OtherOperandA &&
1409 IdxDiff.getSExtValue() == CstVal)
1410 return true;
1411 }
1412 // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
1413 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1414 checkNoWrapFlags(OtherInstrA, Signed) &&
1415 isa<ConstantInt>(OtherInstrA->getOperand(1))) {
1416 int64_t CstVal =
1417 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1418 if (OtherInstrA->getOperand(0) == OtherOperandB &&
1419 IdxDiff.getSExtValue() == -CstVal)
1420 return true;
1421 }
1422 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1423 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1424 if (OtherInstrA && OtherInstrB &&
1425 OtherInstrA->getOpcode() == Instruction::Add &&
1426 OtherInstrB->getOpcode() == Instruction::Add &&
1427 checkNoWrapFlags(OtherInstrA, Signed) &&
1428 checkNoWrapFlags(OtherInstrB, Signed) &&
1429 isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
1430 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1431 int64_t CstValA =
1432 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1433 int64_t CstValB =
1434 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1435 if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
1436 IdxDiff.getSExtValue() == (CstValB - CstValA))
1437 return true;
1438 }
1439 }
1440 return false;
1441}
1442
1443std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1444 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1445 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1446 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1447 << " Depth=" << Depth << "\n");
1448 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1449 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
1450 if (!GEPA || !GEPB)
1451 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1452
1453 // Look through GEPs after checking they're the same except for the last
1454 // index.
1455 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1456 GEPA->getPointerOperand() != GEPB->getPointerOperand())
1457 return std::nullopt;
1458 gep_type_iterator GTIA = gep_type_begin(GEPA);
1459 gep_type_iterator GTIB = gep_type_begin(GEPB);
1460 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1461 if (GTIA.getOperand() != GTIB.getOperand())
1462 return std::nullopt;
1463 ++GTIA;
1464 ++GTIB;
1465 }
1466
1467 Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
1468 Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
1469 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1470 OpA->getType() != OpB->getType())
1471 return std::nullopt;
1472
1473 uint64_t Stride = GTIA.getSequentialElementStride(DL);
1474
1475 // Only look through a ZExt/SExt.
1476 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1477 return std::nullopt;
1478
1479 bool Signed = isa<SExtInst>(OpA);
1480
1481 // At this point A could be a function parameter, i.e. not an instruction
1482 Value *ValA = OpA->getOperand(0);
1483 OpB = dyn_cast<Instruction>(OpB->getOperand(0));
1484 if (!OpB || ValA->getType() != OpB->getType())
1485 return std::nullopt;
1486
1487 const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
1488 const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
1489 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1490 if (IdxDiffSCEV == SE.getCouldNotCompute())
1491 return std::nullopt;
1492
1493 ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
1494 if (!IdxDiffRange.isSingleElement())
1495 return std::nullopt;
1496 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1497
1498 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1499 << "\n");
1500
1501 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1502 bool Safe = false;
1503
1504 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1505 // ValA, we're okay.
1506 if (OpB->getOpcode() == Instruction::Add &&
1507 isa<ConstantInt>(OpB->getOperand(1)) &&
1508 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
1509       checkNoWrapFlags(OpB, Signed))
1510     Safe = true;
1511
1512 // Second attempt: check if we have eligible add NSW/NUW instruction
1513 // sequences.
1514 OpA = dyn_cast<Instruction>(ValA);
1515 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1516 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
1517 checkNoWrapFlags(OpB, Signed)) {
1518     // In the checks below, a "matching" operand is an operand that is the same
1519     // in both OpA and OpB. The two nested loops account for every possible
1520     // ordering of the operands of these add instructions.
1521 for (unsigned MatchingOpIdxA : {0, 1})
1522 for (unsigned MatchingOpIdxB : {0, 1})
1523 if (!Safe)
1524 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
1525 MatchingOpIdxB, Signed);
1526 }
1527
1528 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1529
1530 // Third attempt:
1531 //
1532   // Assuming IdxDiff is positive: if every bit that is set in IdxDiff, and
1533   // every higher-order bit other than the sign bit, is known to be zero in
1534   // ValA, then IdxDiff can be added to ValA with no overflow of any sort.
1535   //
1536   // If IdxDiff is negative, do the same check on OpB instead of ValA.
1537 if (!Safe) {
1538     // When computing known bits, use ContextInst as the context instruction,
1539     // since it is likely in the same BB as the loads/stores being compared.
1540 KnownBits Known(BitWidth);
1541 computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, &AC, ContextInst,
1542 &DT);
1543 APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
1544 if (Signed)
1545 BitsAllowedToBeSet.clearBit(BitWidth - 1);
1546 Safe = BitsAllowedToBeSet.uge(IdxDiff.abs());
1547 }
1548
1549 if (Safe)
1550 return IdxDiff * Stride;
1551 return std::nullopt;
1552}
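
A self-contained sketch (made-up known bits and a made-up IdxDiff; not the pass's code path) of the "third attempt" above, showing how known-zero bits of ValA bound the index differences that can be added without overflow:

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

// Mirrors the check above: bits that are known zero in the value are the only
// bits the addition is allowed to set.
static bool sketchSafeToAddIdxDiff(const llvm::KnownBits &Known,
                                   const llvm::APInt &IdxDiff, bool Signed) {
  llvm::APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
  if (Signed)
    BitsAllowedToBeSet.clearBit(Known.getBitWidth() - 1);
  return BitsAllowedToBeSet.uge(IdxDiff.abs());
}

static bool sketchKnownBitsExample() {
  llvm::KnownBits Known(32);
  Known.Zero.setBitsFrom(4);  // assume the value is known to be below 16
  llvm::APInt IdxDiff(64, 8); // adding 8 to such a value cannot overflow
  return sketchSafeToAddIdxDiff(Known, IdxDiff, /*Signed=*/true); // true
}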
1553
1554std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1555 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1556 if (Depth++ == MaxDepth)
1557 return std::nullopt;
1558
1559 if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1560 if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1561 if (SelectA->getCondition() != SelectB->getCondition())
1562 return std::nullopt;
1563 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1564 << ", PtrB=" << *PtrB << ", ContextInst="
1565 << *ContextInst << ", Depth=" << Depth << "\n");
1566 std::optional<APInt> TrueDiff = getConstantOffset(
1567 SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
1568 if (!TrueDiff)
1569 return std::nullopt;
1570 std::optional<APInt> FalseDiff =
1571 getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1572 ContextInst, Depth);
1573 if (TrueDiff == FalseDiff)
1574 return TrueDiff;
1575 }
1576 }
1577 return std::nullopt;
1578}
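
A small sketch (plain integers standing in for pointer values; the helper name is invented) of why the select handling above is sound: when both pointers select on the same condition and the true arms differ by the same constant as the false arms, that common difference holds no matter how the condition evaluates.

#include <cstdint>
#include <optional>

static std::optional<int64_t> sketchSelectDelta(bool Cond, int64_t TrueA,
                                                int64_t FalseA, int64_t TrueB,
                                                int64_t FalseB) {
  int64_t TrueDiff = TrueB - TrueA, FalseDiff = FalseB - FalseA;
  if (TrueDiff != FalseDiff)
    return std::nullopt;             // arms disagree: no constant offset
  int64_t A = Cond ? TrueA : FalseA; // PtrA = select(Cond, TrueA, FalseA)
  int64_t B = Cond ? TrueB : FalseB; // PtrB = select(Cond, TrueB, FalseB)
  return B - A;                      // always equals the common difference
}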
1579
1580void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
1581 if (EQClasses.size() < 2) // There is nothing to merge.
1582 return;
1583
1584   // The reduced key has all elements of the EqClassKey except the underlying
1585   // object. Check that EqClassKey has 4 elements and define the reduced key.
1586 static_assert(std::tuple_size_v<EqClassKey> == 4,
1587 "EqClassKey has changed - EqClassReducedKey needs changes too");
1588 using EqClassReducedKey =
1589 std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
1590 std::tuple_element_t<2, EqClassKey> /* Element size */,
1591 std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
1592 using ECReducedKeyToUnderlyingObjectMap =
1593 MapVector<EqClassReducedKey,
1594 SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;
1595
1596 // Form a map from the reduced key (without the underlying object) to the
1597 // underlying objects: 1 reduced key to many underlying objects, to form
1598 // groups of potentially merge-able equivalence classes.
1599 ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
1600 bool FoundPotentiallyOptimizableEC = false;
1601 for (const auto &EC : EQClasses) {
1602 const auto &Key = EC.first;
1603 EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
1604 std::get<3>(Key)};
1605 auto &UOMap = RedKeyToUOMap[RedKey];
1606 UOMap.insert(std::get<0>(Key));
1607 if (UOMap.size() > 1)
1608 FoundPotentiallyOptimizableEC = true;
1609 }
1610 if (!FoundPotentiallyOptimizableEC)
1611 return;
1612
1613 LLVM_DEBUG({
1614 dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
1615 for (const auto &EC : EQClasses) {
1616 dbgs() << " Key: {" << EC.first << "}\n";
1617 for (const auto &Inst : EC.second)
1618 dbgs() << " Inst: " << *Inst << '\n';
1619 }
1620 });
1621 LLVM_DEBUG({
1622 dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
1623 for (const auto &RedKeyToUO : RedKeyToUOMap) {
1624 dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
1625 << std::get<1>(RedKeyToUO.first) << ", "
1626 << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
1627 << RedKeyToUO.second.size() << " underlying objects:\n";
1628 for (auto UObject : RedKeyToUO.second)
1629 dbgs() << " " << *UObject << '\n';
1630 }
1631 });
1632
1633 using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;
1634
1635 // Compute the ultimate targets for a set of underlying objects.
1636 auto GetUltimateTargets =
1637 [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
1638 UObjectToUObjectMap IndirectionMap;
1639 for (const auto *UObject : UObjects) {
1640 const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
1641 const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
1642 if (UltimateTarget != UObject)
1643 IndirectionMap[UObject] = UltimateTarget;
1644 }
1645 UObjectToUObjectMap UltimateTargetsMap;
1646 for (const auto *UObject : UObjects) {
1647 auto Target = UObject;
1648 auto It = IndirectionMap.find(Target);
1649 for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
1650 Target = It->second;
1651 UltimateTargetsMap[UObject] = Target;
1652 }
1653 return UltimateTargetsMap;
1654 };
1655
1656 // For each item in RedKeyToUOMap, if it has more than one underlying object,
1657 // try to merge the equivalence classes.
1658 for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
1659 if (UObjects.size() < 2)
1660 continue;
1661 auto UTMap = GetUltimateTargets(UObjects);
1662 for (const auto &[UObject, UltimateTarget] : UTMap) {
1663 if (UObject == UltimateTarget)
1664 continue;
1665
1666 EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
1667 std::get<2>(RedKey)};
1668 EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
1669 std::get<2>(RedKey)};
1670       // The entry for KeyFrom is guaranteed to exist, unlike KeyTo. Thus,
1671       // request the reference to the instructions vector for KeyTo first.
1672 const auto &VecTo = EQClasses[KeyTo];
1673 const auto &VecFrom = EQClasses[KeyFrom];
1674 SmallVector<Instruction *, 8> MergedVec;
1675 std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
1676 std::back_inserter(MergedVec),
1677 [](Instruction *A, Instruction *B) {
1678 return A && B && A->comesBefore(B);
1679 });
1680 EQClasses[KeyTo] = std::move(MergedVec);
1681 EQClasses.erase(KeyFrom);
1682 }
1683 }
1684 LLVM_DEBUG({
1685 dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
1686 for (const auto &EC : EQClasses) {
1687 dbgs() << " Key: {" << EC.first << "}\n";
1688 for (const auto &Inst : EC.second)
1689 dbgs() << " Inst: " << *Inst << '\n';
1690 }
1691 });
1692}
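
A toy illustration (plain STL types and invented object names, not the pass's EqClassKey machinery) of the reduced-key grouping above: keys that agree on address space, element size, and load/store kind but differ in their underlying object fall into the same bucket, which marks their classes as candidates for merging.

#include <map>
#include <set>
#include <string>
#include <tuple>

static bool sketchReducedKeyGrouping() {
  using Key = std::tuple<std::string /*UnderlyingObj*/, unsigned /*AddrSpace*/,
                         unsigned /*ElemBits*/, bool /*IsLoad*/>;
  using ReducedKey = std::tuple<unsigned, unsigned, bool>;
  std::set<Key> Classes = {{"objA", 0, 32, true},
                           {"objB", 0, 32, true},
                           {"objC", 3, 64, false}};
  std::map<ReducedKey, std::set<std::string>> RedKeyToUO;
  for (const Key &K : Classes)
    RedKeyToUO[{std::get<1>(K), std::get<2>(K), std::get<3>(K)}].insert(
        std::get<0>(K));
  // {0, 32, true} now maps to {"objA", "objB"}: a group worth trying to merge.
  return RedKeyToUO[{0u, 32u, true}].size() > 1;
}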
1693
1694EquivalenceClassMap
1695Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1696                                       BasicBlock::iterator End) {
1697   EquivalenceClassMap Ret;
1698
1699 auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
1700 const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
1701 if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1702       // The selects themselves are distinct instructions even if they share
1703       // the same condition and evaluate to consecutive pointers for the true
1704       // and false values of the condition. Therefore, using the selects
1705       // themselves for grouping instructions would put consecutive accesses
1706       // into different lists; those accesses would then never even be checked
1707       // for being consecutive, and would not be vectorized.
1708 return Sel->getCondition();
1709 }
1710 return ObjPtr;
1711 };
1712
1713 for (Instruction &I : make_range(Begin, End)) {
1714 auto *LI = dyn_cast<LoadInst>(&I);
1715 auto *SI = dyn_cast<StoreInst>(&I);
1716 if (!LI && !SI)
1717 continue;
1718
1719 if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1720 continue;
1721
1722 if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1723 (SI && !TTI.isLegalToVectorizeStore(SI)))
1724 continue;
1725
1726 Type *Ty = getLoadStoreType(&I);
1727 if (!VectorType::isValidElementType(Ty->getScalarType()))
1728 continue;
1729
1730 // Skip weird non-byte sizes. They probably aren't worth the effort of
1731 // handling correctly.
1732 unsigned TySize = DL.getTypeSizeInBits(Ty);
1733 if ((TySize % 8) != 0)
1734 continue;
1735
1736     // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1737     // functions currently use an integer type for the vectorized load/store,
1738     // and do not support casting between the integer type and a vector of
1739     // pointers (e.g. i64 to <2 x i16*>).
1740 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1741 continue;
1742
1743     Value *Ptr = getLoadStorePointerOperand(&I);
1744     unsigned AS = Ptr->getType()->getPointerAddressSpace();
1745 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1746
1747 unsigned VF = VecRegSize / TySize;
1748 VectorType *VecTy = dyn_cast<VectorType>(Ty);
1749
1750 // Only handle power-of-two sized elements.
1751 if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
1752 (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
1753 continue;
1754
1755 // No point in looking at these if they're too big to vectorize.
1756 if (TySize > VecRegSize / 2 ||
1757 (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
1758 continue;
1759
1760 Ret[{GetUnderlyingObject(Ptr), AS,
1761 DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
1762 /*IsLoad=*/LI != nullptr}]
1763 .emplace_back(&I);
1764 }
1765
1766 mergeEquivalenceClasses(Ret);
1767 return Ret;
1768}
1769
1770std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1771 if (Instrs.empty())
1772 return {};
1773
1774 unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
1775 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1776
1777#ifndef NDEBUG
1778 // Check that Instrs is in BB order and all have the same addr space.
1779 for (size_t I = 1; I < Instrs.size(); ++I) {
1780 assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1781 assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1782 }
1783#endif
1784
1785 // Machinery to build an MRU-hashtable of Chains.
1786 //
1787 // (Ideally this could be done with MapVector, but as currently implemented,
1788 // moving an element to the front of a MapVector is O(n).)
1789 struct InstrListElem : ilist_node<InstrListElem>,
1790 std::pair<Instruction *, Chain> {
1791 explicit InstrListElem(Instruction *I)
1792 : std::pair<Instruction *, Chain>(I, {}) {}
1793 };
1794 struct InstrListElemDenseMapInfo {
1795 using PtrInfo = DenseMapInfo<InstrListElem *>;
1796 using IInfo = DenseMapInfo<Instruction *>;
1797 static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1798 static InstrListElem *getTombstoneKey() {
1799 return PtrInfo::getTombstoneKey();
1800 }
1801 static unsigned getHashValue(const InstrListElem *E) {
1802 return IInfo::getHashValue(E->first);
1803 }
1804 static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
1805 if (A == getEmptyKey() || B == getEmptyKey())
1806 return A == getEmptyKey() && B == getEmptyKey();
1807 if (A == getTombstoneKey() || B == getTombstoneKey())
1808 return A == getTombstoneKey() && B == getTombstoneKey();
1809 return IInfo::isEqual(A->first, B->first);
1810 }
1811 };
1812 SpecificBumpPtrAllocator<InstrListElem> Allocator;
1813 simple_ilist<InstrListElem> MRU;
1814 DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
1815
1816   // Compare each instruction in `Instrs` to the leaders of the N most
1817   // recently-used chains. This limits the O(n^2) behavior of this pass while
1818   // still allowing us to build arbitrarily long chains.
1819 for (Instruction *I : Instrs) {
1820 constexpr int MaxChainsToTry = 64;
1821
1822 bool MatchFound = false;
1823 auto ChainIter = MRU.begin();
1824 for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1825 ++J, ++ChainIter) {
1826 if (std::optional<APInt> Offset = getConstantOffset(
1827 getLoadStorePointerOperand(ChainIter->first),
1828               getLoadStorePointerOperand(I),
1829               /*ContextInst=*/
1830 (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
1831         // `Offset` might not have the expected number of bits if, e.g., the
1832         // index width of AS differs from the width of its pointers.
1833 ChainIter->second.emplace_back(I, Offset.value());
1834 // Move ChainIter to the front of the MRU list.
1835 MRU.remove(*ChainIter);
1836 MRU.push_front(*ChainIter);
1837 MatchFound = true;
1838 break;
1839 }
1840 }
1841
1842 if (!MatchFound) {
1843 APInt ZeroOffset(ASPtrBits, 0);
1844 InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1845 E->second.emplace_back(I, ZeroOffset);
1846 MRU.push_front(*E);
1847 Chains.insert(E);
1848 }
1849 }
1850
1851 std::vector<Chain> Ret;
1852 Ret.reserve(Chains.size());
1853 // Iterate over MRU rather than Chains so the order is deterministic.
1854 for (auto &E : MRU)
1855 if (E.second.size() > 1)
1856 Ret.emplace_back(std::move(E.second));
1857 return Ret;
1858}
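
A toy sketch (standard containers only; the Leader equality test is a stand-in for the real "constant offset to the chain leader" query) of the MRU discipline used above: scan at most MaxChainsToTry of the most recently used chains, move a chain to the front when it absorbs an instruction, and otherwise start a new chain.

#include <list>
#include <utility>
#include <vector>

template <typename Key, typename Elem>
static void sketchMruInsert(std::list<std::pair<Key, std::vector<Elem>>> &MRU,
                            const Key &Leader, const Elem &E,
                            unsigned MaxChainsToTry) {
  unsigned Tried = 0;
  for (auto It = MRU.begin(); It != MRU.end() && Tried < MaxChainsToTry;
       ++It, ++Tried) {
    if (It->first == Leader) {          // stand-in for getConstantOffset()
      It->second.push_back(E);          // extend the existing chain
      MRU.splice(MRU.begin(), MRU, It); // move the hit chain to the front
      return;
    }
  }
  MRU.push_front({Leader, {E}});        // no match found: start a new chain
}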
1859
1860std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1861 Instruction *ContextInst,
1862 unsigned Depth) {
1863 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1864 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
1865 << ", Depth=" << Depth << "\n");
1866 // We'll ultimately return a value of this bit width, even if computations
1867 // happen in a different width.
1868 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
1869 APInt OffsetA(OrigBitWidth, 0);
1870 APInt OffsetB(OrigBitWidth, 0);
1871 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1872 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1873 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
1874 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
1875 return std::nullopt;
1876
1877 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1878 // should properly handle a possible overflow and the value should fit into
1879 // the smallest data type used in the cast/gep chain.
1880 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1881 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1882
1883 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1884 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
1885 if (PtrA == PtrB)
1886 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1887
1888 // Try to compute B - A.
1889 const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
1890 if (DistScev != SE.getCouldNotCompute()) {
1891 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1892 ConstantRange DistRange = SE.getSignedRange(DistScev);
1893 if (DistRange.isSingleElement()) {
1894 // Handle index width (the width of Dist) != pointer width (the width of
1895 // the Offset*s at this point).
1896 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
1897 return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
1898 }
1899 }
1900 if (std::optional<APInt> Diff =
1901 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
1902 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1903 .sextOrTrunc(OrigBitWidth);
1904 return std::nullopt;
1905}
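
A minimal worked example (made-up widths and offsets) of the bit-width handling above: offsets are compared in the store-size width of the stripped pointers, and the resulting byte distance is reported back in the original index width.

#include "llvm/ADT/APInt.h"

static llvm::APInt sketchSameBaseDelta() {
  const unsigned OrigBitWidth = 32;   // index width of the address space
  const unsigned NewPtrBitWidth = 64; // store size of the stripped pointers
  llvm::APInt OffsetA(OrigBitWidth, 4), OffsetB(OrigBitWidth, 20);
  llvm::APInt A = OffsetA.sextOrTrunc(NewPtrBitWidth);
  llvm::APInt B = OffsetB.sextOrTrunc(NewPtrBitWidth);
  // PtrA == PtrB case: the distance is simply OffsetB - OffsetA.
  return (B - A).sextOrTrunc(OrigBitWidth); // 16, in the 32-bit index width
}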
1906
1907bool Vectorizer::accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS,
1908 Align Alignment,
1909 unsigned VecElemBits) const {
1910 // Aligned vector accesses are ALWAYS faster than element-wise accesses.
1911 if (Alignment.value() % SizeBytes == 0)
1912 return true;
1913
1914 // Ask TTI whether misaligned accesses are faster as vector or element-wise.
1915 unsigned VectorizedSpeed = 0;
1916 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
1917 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
1918 if (!AllowsMisaligned) {
1919 LLVM_DEBUG(
1920 dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace " << AS
1921 << " with alignment " << Alignment.value()
1922 << " is misaligned, and therefore can't be vectorized.\n");
1923 return false;
1924 }
1925
1926 unsigned ElementwiseSpeed = 0;
1927   TTI.allowsMisalignedMemoryAccesses(F.getContext(), VecElemBits, AS,
1928                                      Alignment, &ElementwiseSpeed);
1929 if (VectorizedSpeed < ElementwiseSpeed) {
1930 LLVM_DEBUG(dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace "
1931 << AS << " with alignment " << Alignment.value()
1932 << " has relative speed " << VectorizedSpeed
1933 << ", which is lower than the elementwise speed of "
1934 << ElementwiseSpeed
1935 << ". Therefore this access won't be vectorized.\n");
1936 return false;
1937 }
1938 return true;
1939}
1940
1941ChainElem Vectorizer::createExtraElementAfter(const ChainElem &Prev, Type *Ty,
1942 APInt Offset, StringRef Prefix,
1943 Align Alignment) {
1944 Instruction *NewElement = nullptr;
1945 Builder.SetInsertPoint(Prev.Inst->getNextNode());
1946 if (LoadInst *PrevLoad = dyn_cast<LoadInst>(Prev.Inst)) {
1947 Value *NewGep = Builder.CreatePtrAdd(
1948 PrevLoad->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
1949 LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
1950 NewElement = Builder.CreateAlignedLoad(Ty, NewGep, Alignment, Prefix);
1951 } else {
1952 StoreInst *PrevStore = cast<StoreInst>(Prev.Inst);
1953
1954 Value *NewGep = Builder.CreatePtrAdd(
1955 PrevStore->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
1956 LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
1957 NewElement =
1958 Builder.CreateAlignedStore(PoisonValue::get(Ty), NewGep, Alignment);
1959 }
1960
1961 // Attach all metadata to the new element.
1962 // propagateMetadata will fold it into the final vector when applicable.
1963 NewElement->copyMetadata(*Prev.Inst);
1964
1965 // Cache created elements for tracking and cleanup
1966 ExtraElements.insert(NewElement);
1967
1968 APInt NewOffsetFromLeader = Prev.OffsetFromLeader + Offset;
1969 LLVM_DEBUG(dbgs() << "LSV: Extra Element Created: \n"
1970 << *NewElement
1971 << " OffsetFromLeader: " << NewOffsetFromLeader << "\n");
1972 return ChainElem{NewElement, NewOffsetFromLeader};
1973}
1974
1975Value *Vectorizer::createMaskForExtraElements(const ArrayRef<ChainElem> C,
1976 FixedVectorType *VecTy) {
1977 // Start each mask element as false
1978   SmallVector<Constant *, 8> MaskElts(VecTy->getNumElements(),
1979                                       Builder.getInt1(false));
1980 // Iterate over the chain and set the corresponding mask element to true for
1981 // each element that is not an extra element.
1982 for (const ChainElem &E : C) {
1983 if (ExtraElements.contains(E.Inst))
1984 continue;
1985 unsigned EOffset =
1986 (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
1987 unsigned VecIdx =
1988 8 * EOffset / DL.getTypeSizeInBits(VecTy->getScalarType());
1989 if (FixedVectorType *VT =
1990             dyn_cast<FixedVectorType>(getLoadStoreType(E.Inst)))
1991       for (unsigned J = 0; J < VT->getNumElements(); ++J)
1992 MaskElts[VecIdx + J] = Builder.getInt1(true);
1993 else
1994 MaskElts[VecIdx] = Builder.getInt1(true);
1995 }
1996 return ConstantVector::get(MaskElts);
1997}
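
A worked example (a made-up chain; lane arithmetic only) of the mask layout computed above: for a chain of i32 accesses vectorized as <4 x i32>, real elements at byte offsets 0, 4, and 12 from the leader enable lanes 0, 1, and 3, while the extra element at offset 8 leaves lane 2 masked off.

#include <array>

static std::array<bool, 4> sketchMaskForGap() {
  std::array<bool, 4> Mask{};            // every lane starts out false
  const unsigned ElemBits = 32;          // scalar element width of the vector
  for (unsigned ByteOff : {0u, 4u, 12u}) // real elements; offset 8 is "extra"
    Mask[ByteOff * 8 / ElemBits] = true; // lane index = bit offset / elem bits
  return Mask;                           // {true, true, false, true}
}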
1998
1999void Vectorizer::deleteExtraElements() {
2000 for (auto *ExtraElement : ExtraElements) {
2001 if (isa<LoadInst>(ExtraElement)) {
2002 [[maybe_unused]] bool Deleted =
2003           RecursivelyDeleteTriviallyDeadInstructions(ExtraElement);
2004       assert(Deleted && "Extra Load should always be trivially dead");
2005 } else {
2006 // Unlike Extra Loads, Extra Stores won't be "dead", but should all be
2007 // deleted regardless. They will have either been combined into a masked
2008 // store, or will be left behind and need to be cleaned up.
2009 auto *PtrOperand = getLoadStorePointerOperand(ExtraElement);
2010 ExtraElement->eraseFromParent();
2011       RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
2012     }
2013 }
2014
2015 ExtraElements.clear();
2016}