LoadStoreVectorizer.cpp
1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
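//
// As an illustrative sketch (hypothetical IR, not taken from an actual test),
// on a target whose vector registers are wide enough and which tolerates the
// resulting alignment, a chain of consecutive scalar loads such as
//
//   %p1 = getelementptr inbounds i32, ptr %p, i64 1
//   %p2 = getelementptr inbounds i32, ptr %p, i64 2
//   %p3 = getelementptr inbounds i32, ptr %p, i64 3
//   %x0 = load i32, ptr %p, align 16
//   %x1 = load i32, ptr %p1, align 4
//   %x2 = load i32, ptr %p2, align 4
//   %x3 = load i32, ptr %p3, align 4
//
// is rewritten into a single wide load whose elements are then extracted:
//
//   %vec = load <4 x i32>, ptr %p, align 16
//   %x0 = extractelement <4 x i32> %vec, i32 0
//   %x1 = extractelement <4 x i32> %vec, i32 1
//   %x2 = extractelement <4 x i32> %vec, i32 2
//   %x3 = extractelement <4 x i32> %vec, i32 3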
40
41#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
42#include "llvm/ADT/APInt.h"
43#include "llvm/ADT/ArrayRef.h"
44#include "llvm/ADT/MapVector.h"
46#include "llvm/ADT/STLExtras.h"
49#include "llvm/ADT/Statistic.h"
58#include "llvm/IR/Attributes.h"
59#include "llvm/IR/BasicBlock.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
63#include "llvm/IR/Dominators.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/IRBuilder.h"
67#include "llvm/IR/InstrTypes.h"
68#include "llvm/IR/Instruction.h"
70#include "llvm/IR/Module.h"
71#include "llvm/IR/Type.h"
72#include "llvm/IR/Value.h"
74#include "llvm/Pass.h"
76#include "llvm/Support/Debug.h"
82#include <algorithm>
83#include <cassert>
84#include <cstdlib>
85#include <tuple>
86#include <utility>
87
88using namespace llvm;
89
90#define DEBUG_TYPE "load-store-vectorizer"
91
92STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
93STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
94
95// FIXME: Assuming stack alignment of 4 is always good enough
96static const unsigned StackAdjustedAlignment = 4;
97
98namespace {
99
100/// ChainID is an arbitrary token that is allowed to be different only for the
101/// accesses that are guaranteed to be considered non-consecutive by
102/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
103/// together and reducing the number of instructions the main search operates on
104/// at a time, i.e. this is to reduce compile time and nothing else as the main
105/// search has O(n^2) time complexity. The underlying type of ChainID should not
106/// be relied upon.
107using ChainID = const Value *;
108using InstrList = SmallVector<Instruction *, 8>;
109using InstrListMap = MapVector<ChainID, InstrList>;
110
111class Vectorizer {
112 Function &F;
113 AliasAnalysis &AA;
114 AssumptionCache &AC;
115 DominatorTree &DT;
116 ScalarEvolution &SE;
117 TargetTransformInfo &TTI;
118 const DataLayout &DL;
119 IRBuilder<> Builder;
120
121public:
122 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
123 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
124 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
125 DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
126
127 bool run();
128
129private:
130 unsigned getPointerAddressSpace(Value *I);
131
132 static const unsigned MaxDepth = 3;
133
134 bool isConsecutiveAccess(Value *A, Value *B);
135 bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
136 unsigned Depth = 0) const;
137 bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
138 unsigned Depth) const;
139 bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
140 unsigned Depth) const;
141
142 /// After vectorization, reorder the instructions that I depends on
143 /// (the instructions defining its operands), to ensure they dominate I.
144 void reorder(Instruction *I);
145
146 /// Returns the first and the last instructions in Chain.
147 std::pair<BasicBlock::iterator, BasicBlock::iterator>
148 getBoundaryInstrs(ArrayRef<Instruction *> Chain);
149
150 /// Erases the original instructions after vectorizing.
151 void eraseInstructions(ArrayRef<Instruction *> Chain);
152
153 /// "Legalize" the vector type that would be produced by combining \p
154 /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
155 /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
156 /// expected to have more than 4 elements.
157 std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
158 splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
159
160 /// Finds the largest prefix of Chain that's vectorizable, checking for
161 /// intervening instructions which may affect the memory accessed by the
162 /// instructions within Chain.
163 ///
164 /// The elements of \p Chain must be all loads or all stores and must be in
165 /// address order.
166 ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
167
168 /// Collects load and store instructions to vectorize.
169 std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
170
171 /// Processes the collected instructions, the \p Map. The values of \p Map
172 /// should be all loads or all stores.
173 bool vectorizeChains(InstrListMap &Map);
174
175 /// Finds loads/stores to consecutive memory addresses and vectorizes them.
176 bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
177
178 /// Vectorizes the load instructions in Chain.
179 bool
180 vectorizeLoadChain(ArrayRef<Instruction *> Chain,
181 SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
182
183 /// Vectorizes the store instructions in Chain.
184 bool
185 vectorizeStoreChain(ArrayRef<Instruction *> Chain,
186 SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
187
188 /// Checks whether a load/store access of \p SzInBytes at \p Alignment in
189 /// \p AddressSpace is misaligned. Also returns the \p RelativeSpeed of the
190 /// operation if it is allowed; that value is only meaningful for comparison
191 /// against another result for the same \p AddressSpace and potentially
192 /// different \p Alignment and \p SzInBytes.
192 bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
193 Align Alignment, unsigned &RelativeSpeed);
194};
195
196class LoadStoreVectorizerLegacyPass : public FunctionPass {
197public:
198 static char ID;
199
200 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
201 initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
202 }
203
204 bool runOnFunction(Function &F) override;
205
206 StringRef getPassName() const override {
207 return "GPU Load and Store Vectorizer";
208 }
209
210 void getAnalysisUsage(AnalysisUsage &AU) const override {
211 AU.addRequired<AAResultsWrapperPass>();
212 AU.addRequired<AssumptionCacheTracker>();
213 AU.addRequired<ScalarEvolutionWrapperPass>();
214 AU.addRequired<DominatorTreeWrapperPass>();
215 AU.addRequired<TargetTransformInfoWrapperPass>();
216 AU.setPreservesCFG();
217 }
218};
219
220} // end anonymous namespace
221
222char LoadStoreVectorizerLegacyPass::ID = 0;
223
224INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
225 "Vectorize load and Store instructions", false, false)
226 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
227 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
228 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
229 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
230 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
231 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
232 INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
233 "Vectorize load and store instructions", false, false)
234
235 Pass *llvm::createLoadStoreVectorizerPass() {
236 return new LoadStoreVectorizerLegacyPass();
237}
238
239bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
240 // Don't vectorize when the attribute NoImplicitFloat is used.
241 if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
242 return false;
243
244 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
245 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
246 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
247 TargetTransformInfo &TTI =
248 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
249
250 AssumptionCache &AC =
251 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
252
253 Vectorizer V(F, AA, AC, DT, SE, TTI);
254 return V.run();
255}
256
257 PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
258 // Don't vectorize when the attribute NoImplicitFloat is used.
259 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
260 return PreservedAnalyses::all();
261
262 AliasAnalysis &AA = AM.getResult<AAManager>(F);
263 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
264 ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
265 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
266 TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
267
268 Vectorizer V(F, AA, AC, DT, SE, TTI);
269 bool Changed = V.run();
270 PreservedAnalyses PA;
271 PA.preserveSet<CFGAnalyses>();
272 return Changed ? PA : PreservedAnalyses::all();
273}
274
275// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
276// vectors of Instructions.
277 static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
278 SmallVector<Value *, 8> VL(IL.begin(), IL.end());
279 propagateMetadata(I, VL);
280}
281
282// Vectorizer Implementation
283bool Vectorizer::run() {
284 bool Changed = false;
285
286 // Scan the blocks in the function in post order.
287 for (BasicBlock *BB : post_order(&F)) {
288 InstrListMap LoadRefs, StoreRefs;
289 std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
290 Changed |= vectorizeChains(LoadRefs);
291 Changed |= vectorizeChains(StoreRefs);
292 }
293
294 return Changed;
295}
296
297unsigned Vectorizer::getPointerAddressSpace(Value *I) {
298 if (LoadInst *L = dyn_cast<LoadInst>(I))
299 return L->getPointerAddressSpace();
300 if (StoreInst *S = dyn_cast<StoreInst>(I))
301 return S->getPointerAddressSpace();
302 return -1;
303}
304
305// FIXME: Merge with llvm::isConsecutiveAccess
306bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
307 Value *PtrA = getLoadStorePointerOperand(A);
308 Value *PtrB = getLoadStorePointerOperand(B);
309 unsigned ASA = getPointerAddressSpace(A);
310 unsigned ASB = getPointerAddressSpace(B);
311
312 // Check that the address spaces match and that the pointers are valid.
313 if (!PtrA || !PtrB || (ASA != ASB))
314 return false;
315
316 // Make sure that A and B are different pointers of the same size type.
317 Type *PtrATy = getLoadStoreType(A);
318 Type *PtrBTy = getLoadStoreType(B);
319 if (PtrA == PtrB ||
320 PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
321 DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
322 DL.getTypeStoreSize(PtrATy->getScalarType()) !=
323 DL.getTypeStoreSize(PtrBTy->getScalarType()))
324 return false;
325
326 unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
327 APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
328
329 return areConsecutivePointers(PtrA, PtrB, Size);
330}
331
332bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
333 APInt PtrDelta, unsigned Depth) const {
334 unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
335 APInt OffsetA(PtrBitWidth, 0);
336 APInt OffsetB(PtrBitWidth, 0);
337 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
338 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
339
340 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
341
342 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
343 return false;
344
345 // If we had to shrink the pointer,
346 // stripAndAccumulateInBoundsConstantOffsets should have properly handled
347 // any possible overflow, and the values should fit into the smallest data
348 // type used in the cast/GEP chain.
349 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
350 OffsetB.getSignificantBits() <= NewPtrBitWidth);
351
352 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
353 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
354 PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
355
356 APInt OffsetDelta = OffsetB - OffsetA;
357
358 // Check if they are based on the same pointer. That makes the offsets
359 // sufficient.
360 if (PtrA == PtrB)
361 return OffsetDelta == PtrDelta;
362
363 // Compute the base pointer delta needed so that the final delta equals the
364 // requested pointer delta.
365 APInt BaseDelta = PtrDelta - OffsetDelta;
366
367 // Compute the distance with SCEV between the base pointers.
368 const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
369 const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
370 const SCEV *C = SE.getConstant(BaseDelta);
371 const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
372 if (X == PtrSCEVB)
373 return true;
374
375 // The above check will not catch the cases where one of the pointers is
376 // factorized but the other one is not, such as (C + (S * (A + B))) vs
377 // (AS + BS). Get the minus SCEV. That will allow re-combining the
378 // expressions and getting the simplified difference.
379 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
380 if (C == Dist)
381 return true;
382
383 // Sometimes even this doesn't work, because SCEV can't always see through
384 // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
385 // things the hard way.
386 return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
387}
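
// Illustrative sketch of the simple case above (hypothetical IR, not from a
// test): for two i32 loads
//   %a   = load i32, ptr %base
//   %gep = getelementptr inbounds i8, ptr %base, i64 4
//   %b   = load i32, ptr %gep
// stripAndAccumulateInBoundsConstantOffsets reduces both pointers to %base
// with OffsetA = 0 and OffsetB = 4, so OffsetDelta (4) equals the requested
// PtrDelta (the 4-byte store size of i32) and the accesses are consecutive
// without ever consulting SCEV.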
388
389 static bool checkNoWrapFlags(Instruction *I, bool Signed) {
390 BinaryOperator *BinOpI = cast<BinaryOperator>(I);
391 return (Signed && BinOpI->hasNoSignedWrap()) ||
392 (!Signed && BinOpI->hasNoUnsignedWrap());
393}
394
395static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
396 unsigned MatchingOpIdxA, Instruction *AddOpB,
397 unsigned MatchingOpIdxB, bool Signed) {
398 // If both OpA and OpB are adds with NSW/NUW and one of their operands
399 // is the same, we can guarantee that the transformation is safe if we can
400 // prove that OpA won't overflow when IdxDiff is added to the other operand
401 // of OpA.
402 // For example:
403 // %tmp7 = add nsw i32 %tmp2, %v0
404 // %tmp8 = sext i32 %tmp7 to i64
405 // ...
406 // %tmp11 = add nsw i32 %v0, 1
407 // %tmp12 = add nsw i32 %tmp2, %tmp11
408 // %tmp13 = sext i32 %tmp12 to i64
409 //
410 // Both %tmp7 and %tmp12 have the nsw flag, and their first operand
411 // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
412 // because %tmp11 adds 1 to %v0, and both %tmp11 and %tmp12 have the
413 // nsw flag.
414 assert(AddOpA->getOpcode() == Instruction::Add &&
415 AddOpB->getOpcode() == Instruction::Add &&
416 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
417 if (AddOpA->getOperand(MatchingOpIdxA) ==
418 AddOpB->getOperand(MatchingOpIdxB)) {
419 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
420 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
421 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
422 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
423 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
424 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
425 checkNoWrapFlags(OtherInstrB, Signed) &&
426 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
427 int64_t CstVal =
428 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
429 if (OtherInstrB->getOperand(0) == OtherOperandA &&
430 IdxDiff.getSExtValue() == CstVal)
431 return true;
432 }
433 // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
434 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
435 checkNoWrapFlags(OtherInstrA, Signed) &&
436 isa<ConstantInt>(OtherInstrA->getOperand(1))) {
437 int64_t CstVal =
438 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
439 if (OtherInstrA->getOperand(0) == OtherOperandB &&
440 IdxDiff.getSExtValue() == -CstVal)
441 return true;
442 }
443 // Match `x +nsw/nuw (y +nsw/nuw c)` and
444 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
445 if (OtherInstrA && OtherInstrB &&
446 OtherInstrA->getOpcode() == Instruction::Add &&
447 OtherInstrB->getOpcode() == Instruction::Add &&
448 checkNoWrapFlags(OtherInstrA, Signed) &&
449 checkNoWrapFlags(OtherInstrB, Signed) &&
450 isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
451 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
452 int64_t CstValA =
453 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
454 int64_t CstValB =
455 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
456 if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
457 IdxDiff.getSExtValue() == (CstValB - CstValA))
458 return true;
459 }
460 }
461 return false;
462}
463
464bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
465 APInt PtrDelta,
466 unsigned Depth) const {
467 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
468 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
469 if (!GEPA || !GEPB)
470 return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
471
472 // Look through GEPs after checking they're the same except for the last
473 // index.
474 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
475 GEPA->getPointerOperand() != GEPB->getPointerOperand())
476 return false;
477 gep_type_iterator GTIA = gep_type_begin(GEPA);
478 gep_type_iterator GTIB = gep_type_begin(GEPB);
479 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
480 if (GTIA.getOperand() != GTIB.getOperand())
481 return false;
482 ++GTIA;
483 ++GTIB;
484 }
485
486 Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
487 Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
488 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
489 OpA->getType() != OpB->getType())
490 return false;
491
492 if (PtrDelta.isNegative()) {
493 if (PtrDelta.isMinSignedValue())
494 return false;
495 PtrDelta.negate();
496 std::swap(OpA, OpB);
497 }
498 uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
499 if (PtrDelta.urem(Stride) != 0)
500 return false;
501 unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
502 APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth);
503
504 // Only look through a ZExt/SExt.
505 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
506 return false;
507
508 bool Signed = isa<SExtInst>(OpA);
509
510 // At this point A could be a function parameter, i.e. not an instruction
511 Value *ValA = OpA->getOperand(0);
512 OpB = dyn_cast<Instruction>(OpB->getOperand(0));
513 if (!OpB || ValA->getType() != OpB->getType())
514 return false;
515
516 // Now we need to prove that adding IdxDiff to ValA won't overflow.
517 bool Safe = false;
518
519 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
520 // ValA, we're okay.
521 if (OpB->getOpcode() == Instruction::Add &&
522 isa<ConstantInt>(OpB->getOperand(1)) &&
523 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
524 checkNoWrapFlags(OpB, Signed))
525 Safe = true;
526
527 // Second attempt: check if we have eligible add NSW/NUW instruction
528 // sequences.
529 OpA = dyn_cast<Instruction>(ValA);
530 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
531 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
532 checkNoWrapFlags(OpB, Signed)) {
533 // In the checks below a matching operand in OpA and OpB is
534 // an operand which is the same in those two instructions.
535 // Below we account for possible orders of the operands of
536 // these add instructions.
537 for (unsigned MatchingOpIdxA : {0, 1})
538 for (unsigned MatchingOpIdxB : {0, 1})
539 if (!Safe)
540 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
541 MatchingOpIdxB, Signed);
542 }
543
544 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
545
546 // Third attempt:
547 // If all set bits of IdxDiff or any higher order bit other than the sign bit
548 // are known to be zero in ValA, we can add Diff to it while guaranteeing no
549 // overflow of any sort.
550 if (!Safe) {
551 KnownBits Known(BitWidth);
552 computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT);
553 APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
554 if (Signed)
555 BitsAllowedToBeSet.clearBit(BitWidth - 1);
556 if (BitsAllowedToBeSet.ult(IdxDiff))
557 return false;
558 }
559
560 const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
561 const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
562 const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
563 const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
564 return X == OffsetSCEVB;
565}
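
// Illustrative sketch of the pattern handled above (hypothetical IR, not from
// a test):
//   %e0   = sext i32 %i to i64
//   %gep0 = getelementptr inbounds float, ptr %p, i64 %e0
//   %i1   = add nsw i32 %i, 1
//   %e1   = sext i32 %i1 to i64
//   %gep1 = getelementptr inbounds float, ptr %p, i64 %e1
// The two GEPs differ only in their last index, Stride is 4 (the float alloc
// size), IdxDiff is 1, and the nsw flag on %i1 shows that adding IdxDiff to
// %i cannot overflow, so %gep1 is treated as consecutive to %gep0 even when
// SCEV cannot see through the sext.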
566
567bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
568 const APInt &PtrDelta,
569 unsigned Depth) const {
570 if (Depth++ == MaxDepth)
571 return false;
572
573 if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
574 if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
575 return SelectA->getCondition() == SelectB->getCondition() &&
576 areConsecutivePointers(SelectA->getTrueValue(),
577 SelectB->getTrueValue(), PtrDelta, Depth) &&
578 areConsecutivePointers(SelectA->getFalseValue(),
579 SelectB->getFalseValue(), PtrDelta, Depth);
580 }
581 }
582 return false;
583}
584
585void Vectorizer::reorder(Instruction *I) {
586 SmallPtrSet<Instruction *, 16> InstructionsToMove;
587 SmallVector<Instruction *, 16> Worklist;
588
589 Worklist.push_back(I);
590 while (!Worklist.empty()) {
591 Instruction *IW = Worklist.pop_back_val();
592 int NumOperands = IW->getNumOperands();
593 for (int i = 0; i < NumOperands; i++) {
594 Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
595 if (!IM || IM->getOpcode() == Instruction::PHI)
596 continue;
597
598 // If IM is in another BB, no need to move it, because this pass only
599 // vectorizes instructions within one BB.
600 if (IM->getParent() != I->getParent())
601 continue;
602
603 if (!IM->comesBefore(I)) {
604 InstructionsToMove.insert(IM);
605 Worklist.push_back(IM);
606 }
607 }
608 }
609
610 // All instructions to move should follow I. Start from I, not from begin().
611 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
612 ++BBI) {
613 if (!InstructionsToMove.count(&*BBI))
614 continue;
615 Instruction *IM = &*BBI;
616 --BBI;
617 IM->removeFromParent();
618 IM->insertBefore(I);
619 }
620}
621
622std::pair<BasicBlock::iterator, BasicBlock::iterator>
623Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
624 Instruction *C0 = Chain[0];
625 BasicBlock::iterator FirstInstr = C0->getIterator();
626 BasicBlock::iterator LastInstr = C0->getIterator();
627
628 BasicBlock *BB = C0->getParent();
629 unsigned NumFound = 0;
630 for (Instruction &I : *BB) {
631 if (!is_contained(Chain, &I))
632 continue;
633
634 ++NumFound;
635 if (NumFound == 1) {
636 FirstInstr = I.getIterator();
637 }
638 if (NumFound == Chain.size()) {
639 LastInstr = I.getIterator();
640 break;
641 }
642 }
643
644 // Range is [first, last).
645 return std::make_pair(FirstInstr, ++LastInstr);
646}
647
648void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
649 SmallVector<Instruction *, 16> Instrs;
650 for (Instruction *I : Chain) {
651 Value *PtrOperand = getLoadStorePointerOperand(I);
652 assert(PtrOperand && "Instruction must have a pointer operand.");
653 Instrs.push_back(I);
654 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
655 Instrs.push_back(GEP);
656 }
657
658 // Erase instructions.
659 for (Instruction *I : Instrs)
660 if (I->use_empty())
661 I->eraseFromParent();
662}
663
664std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
665Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
666 unsigned ElementSizeBits) {
667 unsigned ElementSizeBytes = ElementSizeBits / 8;
668 unsigned SizeBytes = ElementSizeBytes * Chain.size();
669 unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
670 if (NumLeft == Chain.size()) {
671 if ((NumLeft & 1) == 0)
672 NumLeft /= 2; // Split even in half
673 else
674 --NumLeft; // Split off last element
675 } else if (NumLeft == 0)
676 NumLeft = 1;
677 return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
678}
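
// Worked example (illustrative): for a chain of 5 i32 elements,
// ElementSizeBytes = 4 and SizeBytes = 20, so NumLeft = 20 / 4 = 5. NumLeft
// equals Chain.size() and is odd, so the last element is peeled off, giving
// the split (4, 1). For a chain of 3 i8 elements, SizeBytes = 3 and NumLeft
// rounds down to 0, which is bumped to 1, giving the split (1, 2).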
679
680 ArrayRef<Instruction *>
681 Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
682 // These are in BB order, unlike Chain, which is in address order.
683 SmallVector<Instruction *, 16> MemoryInstrs;
684 SmallVector<Instruction *, 16> ChainInstrs;
685
686 bool IsLoadChain = isa<LoadInst>(Chain[0]);
687 LLVM_DEBUG({
688 for (Instruction *I : Chain) {
689 if (IsLoadChain)
690 assert(isa<LoadInst>(I) &&
691 "All elements of Chain must be loads, or all must be stores.");
692 else
693 assert(isa<StoreInst>(I) &&
694 "All elements of Chain must be loads, or all must be stores.");
695 }
696 });
697
698 for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
699 if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) {
700 ChainInstrs.push_back(&I);
701 continue;
702 }
703 if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
704 LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: "
705 << I << '\n');
706 break;
707 }
708 if (I.mayReadOrWriteMemory())
709 MemoryInstrs.push_back(&I);
710 }
711
712 // Loop until we find an instruction in ChainInstrs that we can't vectorize.
713 unsigned ChainInstrIdx = 0;
714 Instruction *BarrierMemoryInstr = nullptr;
715
716 for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
717 Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
718
719 // If a barrier memory instruction was found, chain instructions that follow
720 // will not be added to the valid prefix.
721 if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
722 break;
723
724 // Check (in BB order) if any instruction prevents ChainInstr from being
725 // vectorized. Find and store the first such "conflicting" instruction.
726 for (Instruction *MemInstr : MemoryInstrs) {
727 // If a barrier memory instruction was found, do not check past it.
728 if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
729 break;
730
731 auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
732 auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
733 if (MemLoad && ChainLoad)
734 continue;
735
736 // We can ignore the alias if we have a load/store pair and the load
737 // is known to be invariant. The load cannot be clobbered by the store.
738 auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
739 return LI->hasMetadata(LLVMContext::MD_invariant_load);
740 };
741
742 if (IsLoadChain) {
743 // We can ignore the alias as long as the load comes before the store,
744 // because that means we won't be moving the load past the store to
745 // vectorize it (the vectorized load is inserted at the location of the
746 // first load in the chain).
747 if (ChainInstr->comesBefore(MemInstr) ||
748 (ChainLoad && IsInvariantLoad(ChainLoad)))
749 continue;
750 } else {
751 // Same case, but in reverse.
752 if (MemInstr->comesBefore(ChainInstr) ||
753 (MemLoad && IsInvariantLoad(MemLoad)))
754 continue;
755 }
756
757 ModRefInfo MR =
758 AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr));
759 if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
760 LLVM_DEBUG({
761 dbgs() << "LSV: Found alias:\n"
762 " Aliasing instruction:\n"
763 << " " << *MemInstr << '\n'
764 << " Aliased instruction and pointer:\n"
765 << " " << *ChainInstr << '\n'
766 << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
767 });
768 // Save this aliasing memory instruction as a barrier, but allow other
769 // instructions that precede the barrier to be vectorized with this one.
770 BarrierMemoryInstr = MemInstr;
771 break;
772 }
773 }
774 // Continue the search only for store chains, since vectorizing stores that
775 // precede an aliasing load is valid. Conversely, vectorizing loads is valid
776 // up to an aliasing store, but should not pull loads from further down in
777 // the basic block.
778 if (IsLoadChain && BarrierMemoryInstr) {
779 // The BarrierMemoryInstr is a store that precedes ChainInstr.
780 assert(BarrierMemoryInstr->comesBefore(ChainInstr));
781 break;
782 }
783 }
784
785 // Find the largest prefix of Chain whose elements are all in
786 // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
787 // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
788 // order.)
789 SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
790 ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
791 unsigned ChainIdx = 0;
792 for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
793 if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
794 break;
795 }
796 return Chain.slice(0, ChainIdx);
797}
798
799static ChainID getChainID(const Value *Ptr) {
800 const Value *ObjPtr = getUnderlyingObject(Ptr);
801 if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
802 // The selects themselves are distinct instructions even if they share the
803 // same condition and evaluate to consecutive pointers for the true and false
804 // values of the condition. Therefore, using the selects themselves for
805 // grouping instructions would put consecutive accesses into different lists;
806 // they would never even be checked for being consecutive, and so would not
807 // be vectorized.
808 return Sel->getCondition();
809 }
810 return ObjPtr;
811}
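
// Illustrative sketch (hypothetical IR, not from a test): accesses through
//   %sel0 = select i1 %c, ptr %p, ptr %q
//   %sel1 = select i1 %c, ptr %p1, ptr %q1   ; where %p1 = %p+4, %q1 = %q+4
// go through two distinct select instructions, so keying chains on the
// underlying objects would separate them; keying on the shared condition %c
// keeps them in one list, and isConsecutiveAccess can then look through the
// selects to match them.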
812
813std::pair<InstrListMap, InstrListMap>
814Vectorizer::collectInstructions(BasicBlock *BB) {
815 InstrListMap LoadRefs;
816 InstrListMap StoreRefs;
817
818 for (Instruction &I : *BB) {
819 if (!I.mayReadOrWriteMemory())
820 continue;
821
822 if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
823 if (!LI->isSimple())
824 continue;
825
826 // Skip if it's not legal.
827 if (!TTI.isLegalToVectorizeLoad(LI))
828 continue;
829
830 Type *Ty = LI->getType();
831 if (!VectorType::isValidElementType(Ty->getScalarType()))
832 continue;
833
834 // Skip weird non-byte sizes. They probably aren't worth the effort of
835 // handling correctly.
836 unsigned TySize = DL.getTypeSizeInBits(Ty);
837 if ((TySize % 8) != 0)
838 continue;
839
840 // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
841 // functions are currently using an integer type for the vectorized
842 // load/store, and do not support casting between the integer type and a
843 // vector of pointers (e.g. i64 to <2 x i16*>).
844 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
845 continue;
846
847 Value *Ptr = LI->getPointerOperand();
848 unsigned AS = Ptr->getType()->getPointerAddressSpace();
849 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
850
851 unsigned VF = VecRegSize / TySize;
852 VectorType *VecTy = dyn_cast<VectorType>(Ty);
853
854 // No point in looking at these if they're too big to vectorize.
855 if (TySize > VecRegSize / 2 ||
856 (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
857 continue;
858
859 // Save the load locations.
860 const ChainID ID = getChainID(Ptr);
861 LoadRefs[ID].push_back(LI);
862 } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
863 if (!SI->isSimple())
864 continue;
865
866 // Skip if it's not legal.
867 if (!TTI.isLegalToVectorizeStore(SI))
868 continue;
869
870 Type *Ty = SI->getValueOperand()->getType();
871 if (!VectorType::isValidElementType(Ty->getScalarType()))
872 continue;
873
874 // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
875 // functions are currently using an integer type for the vectorized
876 // load/store, and do not support casting between the integer type and a
877 // vector of pointers (e.g. i64 to <2 x i16*>).
878 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
879 continue;
880
881 // Skip weird non-byte sizes. They probably aren't worth the effort of
882 // handling correctly.
883 unsigned TySize = DL.getTypeSizeInBits(Ty);
884 if ((TySize % 8) != 0)
885 continue;
886
887 Value *Ptr = SI->getPointerOperand();
888 unsigned AS = Ptr->getType()->getPointerAddressSpace();
889 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
890
891 unsigned VF = VecRegSize / TySize;
892 VectorType *VecTy = dyn_cast<VectorType>(Ty);
893
894 // No point in looking at these if they're too big to vectorize.
895 if (TySize > VecRegSize / 2 ||
896 (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
897 continue;
898
899 // Save store location.
900 const ChainID ID = getChainID(Ptr);
901 StoreRefs[ID].push_back(SI);
902 }
903 }
904
905 return {LoadRefs, StoreRefs};
906}
907
908bool Vectorizer::vectorizeChains(InstrListMap &Map) {
909 bool Changed = false;
910
911 for (const std::pair<ChainID, InstrList> &Chain : Map) {
912 unsigned Size = Chain.second.size();
913 if (Size < 2)
914 continue;
915
916 LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
917
918 // Process the accesses in chunks of 64.
919 for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
920 unsigned Len = std::min<unsigned>(CE - CI, 64);
921 ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
922 Changed |= vectorizeInstructions(Chunk);
923 }
924 }
925
926 return Changed;
927}
928
929bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
930 LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
931 << " instructions.\n");
932 SmallVector<int, 16> Heads, Tails;
933 int ConsecutiveChain[64];
934
935 // Do a quadratic search on all of the given loads/stores and find all of the
936 // pairs of loads/stores that follow each other.
937 for (int i = 0, e = Instrs.size(); i < e; ++i) {
938 ConsecutiveChain[i] = -1;
939 for (int j = e - 1; j >= 0; --j) {
940 if (i == j)
941 continue;
942
943 if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
944 if (ConsecutiveChain[i] != -1) {
945 int CurDistance = std::abs(ConsecutiveChain[i] - i);
946 int NewDistance = std::abs(ConsecutiveChain[i] - j);
947 if (j < i || NewDistance > CurDistance)
948 continue; // Should not insert.
949 }
950
951 Tails.push_back(j);
952 Heads.push_back(i);
953 ConsecutiveChain[i] = j;
954 }
955 }
956 }
957
958 bool Changed = false;
959 SmallPtrSet<Instruction *, 16> InstructionsProcessed;
960
961 for (int Head : Heads) {
962 if (InstructionsProcessed.count(Instrs[Head]))
963 continue;
964 bool LongerChainExists = false;
965 for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
966 if (Head == Tails[TIt] &&
967 !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
968 LongerChainExists = true;
969 break;
970 }
971 if (LongerChainExists)
972 continue;
973
974 // We found an instr that starts a chain. Now follow the chain and try to
975 // vectorize it.
976 SmallVector<Instruction *, 16> Operands;
977 int I = Head;
978 while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
979 if (InstructionsProcessed.count(Instrs[I]))
980 break;
981
982 Operands.push_back(Instrs[I]);
983 I = ConsecutiveChain[I];
984 }
985
986 bool Vectorized = false;
987 if (isa<LoadInst>(*Operands.begin()))
988 Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
989 else
990 Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
991
992 Changed |= Vectorized;
993 }
994
995 return Changed;
996}
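
// Worked example (illustrative): for three i32 accesses A0, A1, A2 at
// addresses p, p+4, p+8 (in that order in Instrs), the quadratic search above
// yields ConsecutiveChain = {1, 2, -1}, Heads = {0, 1} and Tails = {1, 2}.
// Head 0 is visited first and is not the tail of any other head, so it starts
// a chain; the walk along ConsecutiveChain collects Operands = {A0, A1, A2},
// and once those are vectorized and marked processed, head 1 is skipped.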
997
998bool Vectorizer::vectorizeStoreChain(
999 ArrayRef<Instruction *> Chain,
1000 SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
1001 StoreInst *S0 = cast<StoreInst>(Chain[0]);
1002
1003 // If the vector has an int element, default to int for the whole store.
1004 Type *StoreTy = nullptr;
1005 for (Instruction *I : Chain) {
1006 StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
1007 if (StoreTy->isIntOrIntVectorTy())
1008 break;
1009
1010 if (StoreTy->isPtrOrPtrVectorTy()) {
1011 StoreTy = Type::getIntNTy(F.getParent()->getContext(),
1012 DL.getTypeSizeInBits(StoreTy));
1013 break;
1014 }
1015 }
1016 assert(StoreTy && "Failed to find store type");
1017
1018 unsigned Sz = DL.getTypeSizeInBits(StoreTy);
1019 unsigned AS = S0->getPointerAddressSpace();
1020 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1021 unsigned VF = VecRegSize / Sz;
1022 unsigned ChainSize = Chain.size();
1023 Align Alignment = S0->getAlign();
1024
1025 if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1026 InstructionsProcessed->insert(Chain.begin(), Chain.end());
1027 return false;
1028 }
1029
1030 ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1031 if (NewChain.empty()) {
1032 // No vectorization possible.
1033 InstructionsProcessed->insert(Chain.begin(), Chain.end());
1034 return false;
1035 }
1036 if (NewChain.size() == 1) {
1037 // Failed after the first instruction. Discard it and try the smaller chain.
1038 InstructionsProcessed->insert(NewChain.front());
1039 return false;
1040 }
1041
1042 // Update Chain to the valid vectorizable subchain.
1043 Chain = NewChain;
1044 ChainSize = Chain.size();
1045
1046 // Check if it's legal to vectorize this chain. If not, split the chain and
1047 // try again.
1048 unsigned EltSzInBytes = Sz / 8;
1049 unsigned SzInBytes = EltSzInBytes * ChainSize;
1050
1051 FixedVectorType *VecTy;
1052 auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
1053 if (VecStoreTy)
1054 VecTy = FixedVectorType::get(StoreTy->getScalarType(),
1055 Chain.size() * VecStoreTy->getNumElements());
1056 else
1057 VecTy = FixedVectorType::get(StoreTy, Chain.size());
1058
1059 // If it's more than the max vector size or the target has a better
1060 // vector factor, break it into two pieces.
1061 unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
1062 if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1063 LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1064 " Creating two separate arrays.\n");
1065 bool Vectorized = false;
1066 Vectorized |=
1067 vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed);
1068 Vectorized |=
1069 vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
1070 return Vectorized;
1071 }
1072
1073 LLVM_DEBUG({
1074 dbgs() << "LSV: Stores to vectorize:\n";
1075 for (Instruction *I : Chain)
1076 dbgs() << " " << *I << "\n";
1077 });
1078
1079 // We won't try again to vectorize the elements of the chain, regardless of
1080 // whether we succeed below.
1081 InstructionsProcessed->insert(Chain.begin(), Chain.end());
1082
1083 // If the store is going to be misaligned, don't vectorize it.
1084 unsigned RelativeSpeed;
1085 if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {
1086 if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1087 unsigned SpeedBefore;
1088 accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);
1089 if (SpeedBefore > RelativeSpeed)
1090 return false;
1091
1092 auto Chains = splitOddVectorElts(Chain, Sz);
1093 bool Vectorized = false;
1094 Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
1095 Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
1096 return Vectorized;
1097 }
1098
1099 Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
1100 Align(StackAdjustedAlignment),
1101 DL, S0, nullptr, &DT);
1102 if (NewAlign >= Alignment)
1103 Alignment = NewAlign;
1104 else
1105 return false;
1106 }
1107
1108 if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
1109 auto Chains = splitOddVectorElts(Chain, Sz);
1110 bool Vectorized = false;
1111 Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
1112 Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
1113 return Vectorized;
1114 }
1115
1116 BasicBlock::iterator First, Last;
1117 std::tie(First, Last) = getBoundaryInstrs(Chain);
1118 Builder.SetInsertPoint(&*Last);
1119
1120 Value *Vec = PoisonValue::get(VecTy);
1121
1122 if (VecStoreTy) {
1123 unsigned VecWidth = VecStoreTy->getNumElements();
1124 for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1125 StoreInst *Store = cast<StoreInst>(Chain[I]);
1126 for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
1127 unsigned NewIdx = J + I * VecWidth;
1128 Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
1129 Builder.getInt32(J));
1130 if (Extract->getType() != StoreTy->getScalarType())
1131 Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
1132
1133 Value *Insert =
1134 Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
1135 Vec = Insert;
1136 }
1137 }
1138 } else {
1139 for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1140 StoreInst *Store = cast<StoreInst>(Chain[I]);
1141 Value *Extract = Store->getValueOperand();
1142 if (Extract->getType() != StoreTy->getScalarType())
1143 Extract =
1144 Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
1145
1146 Value *Insert =
1147 Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
1148 Vec = Insert;
1149 }
1150 }
1151
1152 StoreInst *SI = Builder.CreateAlignedStore(
1153 Vec,
1154 Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
1155 Alignment);
1156 propagateMetadata(SI, Chain);
1157
1158 eraseInstructions(Chain);
1159 ++NumVectorInstructions;
1160 NumScalarsVectorized += Chain.size();
1161 return true;
1162}
1163
1164bool Vectorizer::vectorizeLoadChain(
1165 ArrayRef<Instruction *> Chain,
1166 SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
1167 LoadInst *L0 = cast<LoadInst>(Chain[0]);
1168
1169 // If the vector has an int element, default to int for the whole load.
1170 Type *LoadTy = nullptr;
1171 for (const auto &V : Chain) {
1172 LoadTy = cast<LoadInst>(V)->getType();
1173 if (LoadTy->isIntOrIntVectorTy())
1174 break;
1175
1176 if (LoadTy->isPtrOrPtrVectorTy()) {
1177 LoadTy = Type::getIntNTy(F.getParent()->getContext(),
1178 DL.getTypeSizeInBits(LoadTy));
1179 break;
1180 }
1181 }
1182 assert(LoadTy && "Can't determine LoadInst type from chain");
1183
1184 unsigned Sz = DL.getTypeSizeInBits(LoadTy);
1185 unsigned AS = L0->getPointerAddressSpace();
1186 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1187 unsigned VF = VecRegSize / Sz;
1188 unsigned ChainSize = Chain.size();
1189 Align Alignment = L0->getAlign();
1190
1191 if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1192 InstructionsProcessed->insert(Chain.begin(), Chain.end());
1193 return false;
1194 }
1195
1196 ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1197 if (NewChain.empty()) {
1198 // No vectorization possible.
1199 InstructionsProcessed->insert(Chain.begin(), Chain.end());
1200 return false;
1201 }
1202 if (NewChain.size() == 1) {
1203 // Failed after the first instruction. Discard it and try the smaller chain.
1204 InstructionsProcessed->insert(NewChain.front());
1205 return false;
1206 }
1207
1208 // Update Chain to the valid vectorizable subchain.
1209 Chain = NewChain;
1210 ChainSize = Chain.size();
1211
1212 // Check if it's legal to vectorize this chain. If not, split the chain and
1213 // try again.
1214 unsigned EltSzInBytes = Sz / 8;
1215 unsigned SzInBytes = EltSzInBytes * ChainSize;
1216 VectorType *VecTy;
1217 auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
1218 if (VecLoadTy)
1219 VecTy = FixedVectorType::get(LoadTy->getScalarType(),
1220 Chain.size() * VecLoadTy->getNumElements());
1221 else
1222 VecTy = FixedVectorType::get(LoadTy, Chain.size());
1223
1224 // If it's more than the max vector size or the target has a better
1225 // vector factor, break it into two pieces.
1226 unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
1227 if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1228 LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1229 " Creating two separate arrays.\n");
1230 bool Vectorized = false;
1231 Vectorized |=
1232 vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed);
1233 Vectorized |=
1234 vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
1235 return Vectorized;
1236 }
1237
1238 // We won't try again to vectorize the elements of the chain, regardless of
1239 // whether we succeed below.
1240 InstructionsProcessed->insert(Chain.begin(), Chain.end());
1241
1242 // If the load is going to be misaligned, don't vectorize it.
1243 unsigned RelativeSpeed;
1244 if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {
1245 if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1246 unsigned SpeedBefore;
1247 accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);
1248 if (SpeedBefore > RelativeSpeed)
1249 return false;
1250
1251 auto Chains = splitOddVectorElts(Chain, Sz);
1252 bool Vectorized = false;
1253 Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
1254 Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
1255 return Vectorized;
1256 }
1257
1258 Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
1259 Align(StackAdjustedAlignment),
1260 DL, L0, nullptr, &DT);
1261 if (NewAlign >= Alignment)
1262 Alignment = NewAlign;
1263 else
1264 return false;
1265 }
1266
1267 if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
1268 auto Chains = splitOddVectorElts(Chain, Sz);
1269 bool Vectorized = false;
1270 Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
1271 Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
1272 return Vectorized;
1273 }
1274
1275 LLVM_DEBUG({
1276 dbgs() << "LSV: Loads to vectorize:\n";
1277 for (Instruction *I : Chain)
1278 I->dump();
1279 });
1280
1281 // getVectorizablePrefix already computed getBoundaryInstrs. The value of
1282 // Last may have changed since then, but the value of First won't have. If it
1283 // matters, we could compute getBoundaryInstrs only once and reuse it here.
1284 BasicBlock::iterator First, Last;
1285 std::tie(First, Last) = getBoundaryInstrs(Chain);
1286 Builder.SetInsertPoint(&*First);
1287
1288 Value *Bitcast =
1289 Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1290 LoadInst *LI =
1291 Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
1292 propagateMetadata(LI, Chain);
1293
1294 for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1295 Value *CV = Chain[I];
1296 Value *V;
1297 if (VecLoadTy) {
1298 // Extract a subvector using shufflevector.
1299 unsigned VecWidth = VecLoadTy->getNumElements();
1300 auto Mask =
1301 llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));
1302 V = Builder.CreateShuffleVector(LI, Mask, CV->getName());
1303 } else {
1304 V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
1305 }
1306
1307 if (V->getType() != CV->getType()) {
1308 V = Builder.CreateBitOrPointerCast(V, CV->getType());
1309 }
1310
1311 // Replace the old instruction.
1312 CV->replaceAllUsesWith(V);
1313 }
1314
1315 // Since we might have opaque pointers we might end up using the pointer
1316 // operand of the first load (wrt. memory loaded) for the vector load. Since
1317 // this first load might not be the first in the block we potentially need to
1318 // reorder the pointer operand (and its operands). If we have a bitcast though
1319 // it might be before the load and should be the reorder start instruction.
1320 // "Might" because for opaque pointers the "bitcast" is just the first loads
1321 // pointer operand, as opposed to something we inserted at the right position
1322 // ourselves.
1323 Instruction *BCInst = dyn_cast<Instruction>(Bitcast);
1324 reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI);
1325
1326 eraseInstructions(Chain);
1327
1328 ++NumVectorInstructions;
1329 NumScalarsVectorized += Chain.size();
1330 return true;
1331}
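
// Illustrative sketch of the vector-element case above (hypothetical IR, not
// from a test): for a chain of two consecutive <2 x i16> loads, VecTy becomes
// <4 x i16>. After emitting
//   %vec = load <4 x i16>, ptr %p, align 8
// each original load is replaced by a subvector extracted with shufflevector:
//   %v0 = shufflevector <4 x i16> %vec, <4 x i16> poison, <2 x i32> <i32 0, i32 1>
//   %v1 = shufflevector <4 x i16> %vec, <4 x i16> poison, <2 x i32> <i32 2, i32 3>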
1332
1333bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
1334 Align Alignment, unsigned &RelativeSpeed) {
1335 RelativeSpeed = 0;
1336 if (Alignment.value() % SzInBytes == 0)
1337 return false;
1338
1339 bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
1340 SzInBytes * 8, AddressSpace,
1341 Alignment, &RelativeSpeed);
1342 LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
1343 << " with relative speed = " << RelativeSpeed << '\n';);
1344 return !Allows || !RelativeSpeed;
1345}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
assume Assume Builder
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Common GEP
static const unsigned MaxDepth
static bool checkNoWrapFlags(Instruction *I, bool Signed)
static ChainID getChainID(const Value *Ptr)
static const unsigned StackAdjustedAlignment
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, unsigned MatchingOpIdxA, Instruction *AddOpB, unsigned MatchingOpIdxB, bool Signed)
#define DEBUG_TYPE
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
print must be executed print the must be executed context for all instructions
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
A manager for alias analyses.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Class for arbitrary precision integers.
Definition: APInt.h:75
APInt udiv(const APInt &RHS) const
Unsigned division operation.
Definition: APInt.cpp:1571
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1385
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:973
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:415
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:898
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1664
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1089
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:312
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1144
void negate()
Negate this APInt in place.
Definition: APInt.h:1421
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1002
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1516
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:265
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:166
iterator end() const
Definition: ArrayRef.h:152
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:163
iterator begin() const
Definition: ArrayRef.h:151
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:158
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:193
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
Represents analyses that only rely on functions' control flow.
Definition: PassManager.h:113
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:314
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:704
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:940
Legacy wrapper pass to provide the GlobalsAAResult object.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2558
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:78
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:88
const BasicBlock * getParent() const
Definition: Instruction.h:90
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:168
An instruction for reading from memory.
Definition: Instructions.h:177
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:270
Value * getPointerOperand()
Definition: Instructions.h:264
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:220
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1750
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
void preserveSet()
Mark an analysis set as preserved.
Definition: PassManager.h:188
Legacy wrapper pass to provide the SCEVAAResult object.
This class represents an analyzed expression in the program.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVMContext & getContext() const
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
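A sketch of the worklist pattern these two containers are usually combined into: the set answers "seen before?", the vector preserves visit order. Names are illustrative only.

  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  static void enqueue(Instruction *I, SmallVectorImpl<Instruction *> &Worklist,
                      SmallPtrSetImpl<Instruction *> &Seen) {
    if (Seen.insert(I).second) // true only the first time I is inserted
      Worklist.push_back(I);
  }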
An instruction for storing to memory.
Definition: Instructions.h:301
Align getAlign() const
Definition: Instructions.h:345
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:399
Value * getPointerOperand()
Definition: Instructions.h:393
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool isLegalToVectorizeLoad(LoadInst *LI) const
bool isLegalToVectorizeStore(StoreInst *SI) const
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace=0, Align Alignment=Align(1), unsigned *Fast=nullptr) const
Determine if the target supports unaligned memory accesses.
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
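A sketch of the two target queries behind chain splitting: how wide a single access may be in the given address space, and whether a chain of the proposed size and alignment is legal there. All parameters and the helper name are assumptions.

  #include "llvm/Analysis/TargetTransformInfo.h"
  using namespace llvm;

  static bool loadChainFits(const TargetTransformInfo &TTI, unsigned ChainBytes,
                            Align Alignment, unsigned AddrSpace) {
    return ChainBytes * 8 <= TTI.getLoadStoreVecRegBitWidth(AddrSpace) &&
           TTI.isLegalToVectorizeLoadChain(ChainBytes, Alignment, AddrSpace);
  }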
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:267
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:237
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:264
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:350
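A sketch of how these Type queries compose: reduce a (possibly vector) integer or FP type to an integer of the same scalar width, a common normalization when chains mix element types. Ty is an assumed non-pointer type; pointer scalars would need DataLayout to get a width.

  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  static IntegerType *sameWidthIntType(Type *Ty, LLVMContext &Ctx) {
    return Type::getIntNTy(Ctx, Ty->getScalarType()->getScalarSizeInBits());
  }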
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the AllowNonInbounds argument set to false, so only in-bounds constant offsets are looked through and accumulated.
Definition: Value.h:724
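A sketch of the usual call pattern: peel constant, in-bounds GEP offsets off a pointer and report how many bytes they add, so that two accesses can be compared by (base, offset). The helper name is ours; the APInt must be sized to the pointer's index width.

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;

  static const Value *splitBaseAndOffset(const Value *Ptr, const DataLayout &DL,
                                         APInt &OffsetBytes) {
    OffsetBytes = APInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
    return Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetBytes);
  }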
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:308
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:695
self_iterator getIterator()
Definition: ilist_node.h:82
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:119
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ Bitcast
Perform the operation on a different, but equivalently sized type.
@ CE
Windows NT (Windows on ARM)
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
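A sketch of how a legacy pipeline would schedule this pass (the new pass manager uses LoadStoreVectorizerPass instead). The helper name is ours; the declaration is assumed to live in llvm/Transforms/Vectorize/LoadStoreVectorizer.h.

  #include "llvm/IR/LegacyPassManager.h"
  #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
  using namespace llvm;

  static void addLoadStoreVectorizer(legacy::FunctionPassManager &FPM) {
    FPM.add(createLoadStoreVectorizerPass());
  }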
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
AddressSpace
Definition: NVPTXBaseInfo.h:21
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value, returning the original object being addressed.
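A sketch combining the two helpers above: from any load or store, grab the accessed pointer and the object it is ultimately derived from, so accesses rooted in different objects can be separated early. The function name is illustrative.

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  static const Value *accessedObject(const Instruction *I) {
    const Value *Ptr = getLoadStorePointerOperand(I); // null if not a load/store
    return Ptr ? getUnderlyingObject(Ptr) : nullptr;
  }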
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
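A sketch of the intended use: Scalars is an assumed list of the original scalar accesses, and their intersected metadata (TBAA, alias scopes, nontemporal, ...) is re-attached to the wide replacement so aliasing facts survive the merge.

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/Analysis/VectorUtils.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  static void keepAccessMetadata(Instruction *Wide, ArrayRef<Value *> Scalars) {
    propagateMetadata(Wide, Scalars);
  }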
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
Align getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to ensure that the alignment of V is at least PrefAlign bytes.
Definition: Local.cpp:1439
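A sketch of the typical call: ask for (and, where the base is an alloca, possibly enforce) a preferred alignment of the chain's base pointer; the return value is the alignment that can actually be proven. All parameters and the helper name are assumptions.

  #include "llvm/IR/DataLayout.h"
  #include "llvm/Transforms/Utils/Local.h"
  using namespace llvm;

  static Align provenAlignment(Value *Ptr, Align Preferred, const DataLayout &DL,
                               const Instruction *CtxI) {
    return getOrEnforceKnownAlignment(Ptr, Preferred, DL, CtxI);
  }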
bool isModSet(const ModRefInfo MRI)
Definition: ModRef.h:48
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
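A sketch of how these predicates are read: classify what an instruction may do to the memory a load reads; isModSet() means "may write it", isModOrRefSet() means "may touch it at all". The helper name is ours.

  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  static bool mayClobberLoad(AAResults &AA, Instruction *I, LoadInst *LI) {
    ModRefInfo MRI = AA.getModRefInfo(I, MemoryLocation::get(LI));
    return isModSet(MRI);
  }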
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
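A sketch of a typical alignment-style query, using the overload of computeKnownBits that returns the KnownBits object: prove an integer index is a multiple of 2^Log2Stride by showing its low bits are known zero. V and the helper name are assumptions.

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  static bool isKnownMultipleOfPow2(const Value *V, unsigned Log2Stride,
                                    const DataLayout &DL) {
    KnownBits Known = computeKnownBits(V, DL);
    return Known.countMinTrailingZeros() >= Log2Stride;
  }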
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
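A sketch of the call as it would look for stores (loads work the same way); A and B are the access instructions themselves, and the declaration is assumed to come from llvm/Analysis/LoopAccessAnalysis.h.

  #include "llvm/Analysis/LoopAccessAnalysis.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  static bool areAdjacentStores(StoreInst *A, StoreInst *B, const DataLayout &DL,
                                ScalarEvolution &SE) {
    return isConsecutiveAccess(A, B, DL, SE);
  }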
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:184
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
gep_type_iterator gep_type_begin(const User *GEP)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1976
void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &)
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
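A final one-line sketch contrasting the two alignment types: Align is always a valid power of two, MaybeAlign may be unset, and the conventional fallback when nothing is known is 1-byte alignment. The helper name is illustrative.

  #include "llvm/Support/Alignment.h"
  using namespace llvm;

  static Align orOne(MaybeAlign MA) { return MA.valueOrOne(); }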