1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass merges loads/stores to/from sequential memory addresses into vector
10 // loads/stores. Although there's nothing GPU-specific in here, this pass is
11 // motivated by the microarchitectural quirks of NVIDIA and AMD GPUs.
12 //
13 // (For simplicity below we talk about loads only, but everything also applies
14 // to stores.)
15 //
16 // This pass is intended to be run late in the pipeline, after other
17 // vectorization opportunities have been exploited. So the assumption here is
18 // that immediately following our new vector load we'll need to extract out the
19 // individual elements of the load, so we can operate on them individually.
20 //
21 // On CPUs this transformation is usually not beneficial, because extracting the
22 // elements of a vector register is expensive on most architectures. It's
23 // usually better just to load each element individually into its own scalar
24 // register.
25 //
26 // However, NVIDIA and AMD GPUs don't have proper vector registers. Instead, a
27 // "vector load" loads directly into a series of scalar registers. In effect,
28 // extracting the elements of the vector is free. It's therefore always
29 // beneficial to vectorize a sequence of loads on these architectures.
30 //
31 // Vectorizing (perhaps a better name might be "coalescing") loads can have
32 // large performance impacts on GPU kernels, and opportunities for vectorizing
33 // are common in GPU code. This pass tries very hard to find such
34 // opportunities; its runtime is quadratic in the number of loads in a BB.
35 //
36 // Some CPU architectures, such as ARM, have instructions that load into
37 // multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38 // could use this pass (with some modifications), but currently it implements
39 // its own pass to do something similar to what we do here.
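//
// For illustration only (a minimal sketch, not taken from any particular test),
// the rewrite this pass performs looks roughly like:
//
//   %p1 = getelementptr inbounds float, float* %p, i64 1
//   %x0 = load float, float* %p, align 4
//   %x1 = load float, float* %p1, align 4
//
// becoming, alignment and legality permitting,
//
//   %vp = bitcast float* %p to <2 x float>*
//   %v  = load <2 x float>, <2 x float>* %vp, align 4
//   %x0 = extractelement <2 x float> %v, i32 0
//   %x1 = extractelement <2 x float> %v, i32 1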
40 
41 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
42 #include "llvm/ADT/APInt.h"
43 #include "llvm/ADT/ArrayRef.h"
44 #include "llvm/ADT/MapVector.h"
45 #include "llvm/ADT/PostOrderIterator.h"
46 #include "llvm/ADT/STLExtras.h"
47 #include "llvm/ADT/SmallPtrSet.h"
48 #include "llvm/ADT/SmallVector.h"
49 #include "llvm/ADT/Statistic.h"
50 #include "llvm/Analysis/AliasAnalysis.h"
51 #include "llvm/Analysis/AssumptionCache.h"
52 #include "llvm/Analysis/GlobalsModRef.h"
53 #include "llvm/Analysis/MemoryLocation.h"
54 #include "llvm/Analysis/ScalarEvolution.h"
55 #include "llvm/Analysis/TargetTransformInfo.h"
56 #include "llvm/Analysis/ValueTracking.h"
57 #include "llvm/Analysis/VectorUtils.h"
58 #include "llvm/IR/Attributes.h"
59 #include "llvm/IR/BasicBlock.h"
60 #include "llvm/IR/Constants.h"
61 #include "llvm/IR/DataLayout.h"
62 #include "llvm/IR/DerivedTypes.h"
63 #include "llvm/IR/Dominators.h"
64 #include "llvm/IR/Function.h"
65 #include "llvm/IR/IRBuilder.h"
66 #include "llvm/IR/InstrTypes.h"
67 #include "llvm/IR/Instruction.h"
68 #include "llvm/IR/Instructions.h"
69 #include "llvm/IR/IntrinsicInst.h"
70 #include "llvm/IR/Module.h"
71 #include "llvm/IR/Type.h"
72 #include "llvm/IR/User.h"
73 #include "llvm/IR/Value.h"
74 #include "llvm/InitializePasses.h"
75 #include "llvm/Pass.h"
76 #include "llvm/Support/Casting.h"
77 #include "llvm/Support/Debug.h"
78 #include "llvm/Support/KnownBits.h"
79 #include "llvm/Support/MathExtras.h"
80 #include "llvm/Support/raw_ostream.h"
81 #include "llvm/Transforms/Utils/Local.h"
82 #include "llvm/Transforms/Vectorize.h"
83 #include <algorithm>
84 #include <cassert>
85 #include <cstdlib>
86 #include <tuple>
87 #include <utility>
88 
89 using namespace llvm;
90 
91 #define DEBUG_TYPE "load-store-vectorizer"
92 
93 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
94 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
95 
96 // FIXME: Assuming stack alignment of 4 is always good enough
97 static const unsigned StackAdjustedAlignment = 4;
98 
99 namespace {
100 
101 /// ChainID is an arbitrary token that is allowed to be different only for the
102 /// accesses that are guaranteed to be considered non-consecutive by
103 /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
104 /// together and reducing the number of instructions the main search operates on
105 /// at a time, i.e. it exists purely to reduce compile time, as the main
106 /// search has O(n^2) time complexity. The underlying type of ChainID should
107 /// not be relied upon.
108 using ChainID = const Value *;
109 using InstrList = SmallVector<Instruction *, 8>;
110 using InstrListMap = MapVector<ChainID, InstrList>;
111 
112 class Vectorizer {
113  Function &F;
114  AliasAnalysis &AA;
115  AssumptionCache &AC;
116  DominatorTree &DT;
117  ScalarEvolution &SE;
118  TargetTransformInfo &TTI;
119  const DataLayout &DL;
120  IRBuilder<> Builder;
121 
122 public:
123  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
124             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
125  : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
126  DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
127 
128  bool run();
129 
130 private:
131  unsigned getPointerAddressSpace(Value *I);
132 
133  static const unsigned MaxDepth = 3;
134 
135  bool isConsecutiveAccess(Value *A, Value *B);
136  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
137  unsigned Depth = 0) const;
138  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
139  unsigned Depth) const;
140  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
141  unsigned Depth) const;
142 
143  /// After vectorization, reorder the instructions that I depends on
144  /// (the instructions defining its operands), to ensure they dominate I.
145  void reorder(Instruction *I);
146 
147  /// Returns the first and the last instructions in Chain.
148  std::pair<BasicBlock::iterator, BasicBlock::iterator>
149  getBoundaryInstrs(ArrayRef<Instruction *> Chain);
150 
151  /// Erases the original instructions after vectorizing.
152  void eraseInstructions(ArrayRef<Instruction *> Chain);
153 
154  /// "Legalize" the vector type that would be produced by combining the
155  /// \p ElementSizeBits-bit elements in \p Chain. Break it into two pieces so
156  /// that the total size of each piece is 1, 2 or a multiple of 4 bytes. \p
157  /// Chain is expected to have more than 4 elements.
158  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
159  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
160 
161  /// Finds the largest prefix of Chain that's vectorizable, checking for
162  /// intervening instructions which may affect the memory accessed by the
163  /// instructions within Chain.
164  ///
165  /// The elements of \p Chain must be all loads or all stores and must be in
166  /// address order.
167  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
168 
169  /// Collects load and store instructions to vectorize.
170  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
171 
172  /// Processes the collected instructions in \p Map. The values of \p Map
173  /// must be all loads or all stores.
174  bool vectorizeChains(InstrListMap &Map);
175 
176  /// Finds loads/stores to consecutive memory addresses and vectorizes them.
177  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
178 
179  /// Vectorizes the load instructions in Chain.
180  bool
181  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
182  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
183 
184  /// Vectorizes the store instructions in Chain.
185  bool
186  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
187  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
188 
189  /// Check if this load/store access is misaligned.
190  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
191  Align Alignment);
192 };
193 
194 class LoadStoreVectorizerLegacyPass : public FunctionPass {
195 public:
196  static char ID;
197 
198  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
199    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
200  }
201 
202  bool runOnFunction(Function &F) override;
203 
204  StringRef getPassName() const override {
205  return "GPU Load and Store Vectorizer";
206  }
207 
208  void getAnalysisUsage(AnalysisUsage &AU) const override {
209    AU.addRequired<AAResultsWrapperPass>();
210    AU.addRequired<AssumptionCacheTracker>();
211    AU.addRequired<ScalarEvolutionWrapperPass>();
212    AU.addRequired<DominatorTreeWrapperPass>();
213    AU.addRequired<TargetTransformInfoWrapperPass>();
214  AU.setPreservesCFG();
215  }
216 };
217 
218 } // end anonymous namespace
219 
220 char LoadStoreVectorizerLegacyPass::ID = 0;
221 
222 INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
223  "Vectorize load and Store instructions", false, false)
224 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
225 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
226 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
227 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
228 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
229 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
230 INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
231  "Vectorize load and store instructions", false, false)
232 
233 Pass *llvm::createLoadStoreVectorizerPass() {
234   return new LoadStoreVectorizerLegacyPass();
235 }
236 
237 bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
238   // Don't vectorize when the attribute NoImplicitFloat is used.
239  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
240  return false;
241 
242  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
243  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
244  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
245   TargetTransformInfo &TTI =
246       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
247 
248  AssumptionCache &AC =
249  getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
250 
251  Vectorizer V(F, AA, AC, DT, SE, TTI);
252  return V.run();
253 }
254 
255 PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
256   // Don't vectorize when the attribute NoImplicitFloat is used.
257  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
258  return PreservedAnalyses::all();
259 
260  AliasAnalysis &AA = AM.getResult<AAManager>(F);
261   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
262   ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
263   AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
264   TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
265 
266  Vectorizer V(F, AA, AC, DT, SE, TTI);
267  bool Changed = V.run();
268   PreservedAnalyses PA;
269   PA.preserveSet<CFGAnalyses>();
270  return Changed ? PA : PreservedAnalyses::all();
271 }
272 
273 // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
274 // vectors of Instructions.
275 static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
276   SmallVector<Value *, 8> VL(IL.begin(), IL.end());
277  propagateMetadata(I, VL);
278 }
279 
280 // Vectorizer Implementation
281 bool Vectorizer::run() {
282  bool Changed = false;
283 
284  // Scan the blocks in the function in post order.
285  for (BasicBlock *BB : post_order(&F)) {
286  InstrListMap LoadRefs, StoreRefs;
287  std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
288  Changed |= vectorizeChains(LoadRefs);
289  Changed |= vectorizeChains(StoreRefs);
290  }
291 
292  return Changed;
293 }
294 
295 unsigned Vectorizer::getPointerAddressSpace(Value *I) {
296  if (LoadInst *L = dyn_cast<LoadInst>(I))
297  return L->getPointerAddressSpace();
298  if (StoreInst *S = dyn_cast<StoreInst>(I))
299  return S->getPointerAddressSpace();
300  return -1;
301 }
302 
303 // FIXME: Merge with llvm::isConsecutiveAccess
304 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
305   Value *PtrA = getLoadStorePointerOperand(A);
306   Value *PtrB = getLoadStorePointerOperand(B);
307   unsigned ASA = getPointerAddressSpace(A);
308  unsigned ASB = getPointerAddressSpace(B);
309 
310  // Check that the address spaces match and that the pointers are valid.
311  if (!PtrA || !PtrB || (ASA != ASB))
312  return false;
313 
314  // Make sure that A and B are different pointers of the same size type.
315  Type *PtrATy = PtrA->getType()->getPointerElementType();
316  Type *PtrBTy = PtrB->getType()->getPointerElementType();
317  if (PtrA == PtrB ||
318  PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
319  DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
320  DL.getTypeStoreSize(PtrATy->getScalarType()) !=
321  DL.getTypeStoreSize(PtrBTy->getScalarType()))
322  return false;
323 
324  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
325  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
326 
327  return areConsecutivePointers(PtrA, PtrB, Size);
328 }
329 
330 bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
331  APInt PtrDelta, unsigned Depth) const {
332  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
333  APInt OffsetA(PtrBitWidth, 0);
334  APInt OffsetB(PtrBitWidth, 0);
335  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
336  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
337 
338  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
339 
340  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
341  return false;
342 
343   // In case we have to shrink the pointer,
344   // stripAndAccumulateInBoundsConstantOffsets should properly handle a
345   // possible overflow, and the value should fit into the smallest data type
346   // used in the cast/gep chain.
347  assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
348  OffsetB.getMinSignedBits() <= NewPtrBitWidth);
349 
350  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
351  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
352  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
353 
354  APInt OffsetDelta = OffsetB - OffsetA;
355 
356  // Check if they are based on the same pointer. That makes the offsets
357  // sufficient.
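  // For example (an illustrative sketch): with
  //   PtrA = gep inbounds i8, i8* %p, i64 4
  //   PtrB = gep inbounds i8, i8* %p, i64 8
  // stripping the in-bounds constant offsets leaves both pointers at %p with
  // OffsetA = 4 and OffsetB = 8, so a requested PtrDelta of 4 is matched here
  // directly.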
358  if (PtrA == PtrB)
359  return OffsetDelta == PtrDelta;
360 
361  // Compute the necessary base pointer delta to have the necessary final delta
362  // equal to the pointer delta requested.
363  APInt BaseDelta = PtrDelta - OffsetDelta;
364 
365  // Compute the distance with SCEV between the base pointers.
366  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
367  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
368  const SCEV *C = SE.getConstant(BaseDelta);
369  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
370  if (X == PtrSCEVB)
371  return true;
372 
373   // The above check will not catch the cases where one of the pointers is
374   // factorized but the other one is not, such as (C + (S * (A + B))) vs
375   // (AS + BS). Get the minus SCEV. That will allow re-combining the
376   // expressions and getting the simplified difference.
377  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
378  if (C == Dist)
379  return true;
380 
381  // Sometimes even this doesn't work, because SCEV can't always see through
382  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
383  // things the hard way.
384  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
385 }
386 
387 bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
388  APInt PtrDelta,
389  unsigned Depth) const {
390  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
391  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
392  if (!GEPA || !GEPB)
393  return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
394 
395  // Look through GEPs after checking they're the same except for the last
396  // index.
397  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
398  GEPA->getPointerOperand() != GEPB->getPointerOperand())
399  return false;
400  gep_type_iterator GTIA = gep_type_begin(GEPA);
401  gep_type_iterator GTIB = gep_type_begin(GEPB);
402  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
403  if (GTIA.getOperand() != GTIB.getOperand())
404  return false;
405  ++GTIA;
406  ++GTIB;
407  }
408 
409  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
410  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
411  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
412  OpA->getType() != OpB->getType())
413  return false;
414 
415  if (PtrDelta.isNegative()) {
416  if (PtrDelta.isMinSignedValue())
417  return false;
418  PtrDelta.negate();
419  std::swap(OpA, OpB);
420  }
421  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
422  if (PtrDelta.urem(Stride) != 0)
423  return false;
424  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
425  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
426 
427  // Only look through a ZExt/SExt.
428  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
429  return false;
430 
431  bool Signed = isa<SExtInst>(OpA);
432 
433   // At this point ValA could be a function parameter, i.e. not an instruction.
434  Value *ValA = OpA->getOperand(0);
435  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
436  if (!OpB || ValA->getType() != OpB->getType())
437  return false;
438 
439  // Now we need to prove that adding IdxDiff to ValA won't overflow.
440  bool Safe = false;
441  auto CheckFlags = [](Instruction *I, bool Signed) {
442  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
443  return (Signed && BinOpI->hasNoSignedWrap()) ||
444  (!Signed && BinOpI->hasNoUnsignedWrap());
445  };
446 
447  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
448  // ValA, we're okay.
449  if (OpB->getOpcode() == Instruction::Add &&
450  isa<ConstantInt>(OpB->getOperand(1)) &&
451  IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
452  CheckFlags(OpB, Signed))
453  Safe = true;
454 
455   // Second attempt: If both OpA and OpB are adds with NSW/NUW and with
456   // the same LHS operand, we can guarantee that the transformation is safe
457   // if we can prove that OpA won't overflow when IdxDiff is added to the
458   // RHS of OpA.
459  // For example:
460  // %tmp7 = add nsw i32 %tmp2, %v0
461  // %tmp8 = sext i32 %tmp7 to i64
462  // ...
463  // %tmp11 = add nsw i32 %v0, 1
464  // %tmp12 = add nsw i32 %tmp2, %tmp11
465  // %tmp13 = sext i32 %tmp12 to i64
466  //
467   // Both %tmp7 and %tmp12 have the nsw flag and their first operand
468   // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
469   // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 have the
470   // nsw flag.
471  OpA = dyn_cast<Instruction>(ValA);
472  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
473  OpB->getOpcode() == Instruction::Add &&
474  OpA->getOperand(0) == OpB->getOperand(0) && CheckFlags(OpA, Signed) &&
475  CheckFlags(OpB, Signed)) {
476  Value *RHSA = OpA->getOperand(1);
477  Value *RHSB = OpB->getOperand(1);
478  Instruction *OpRHSA = dyn_cast<Instruction>(RHSA);
479  Instruction *OpRHSB = dyn_cast<Instruction>(RHSB);
480  // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
481  if (OpRHSB && OpRHSB->getOpcode() == Instruction::Add &&
482  CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSB->getOperand(1))) {
483  int64_t CstVal = cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
484  if (OpRHSB->getOperand(0) == RHSA && IdxDiff.getSExtValue() == CstVal)
485  Safe = true;
486  }
487   // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
488  if (OpRHSA && OpRHSA->getOpcode() == Instruction::Add &&
489  CheckFlags(OpRHSA, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1))) {
490  int64_t CstVal = cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
491  if (OpRHSA->getOperand(0) == RHSB && IdxDiff.getSExtValue() == -CstVal)
492  Safe = true;
493  }
494  // Match `x +nsw/nuw (y +nsw/nuw c)` and
495  // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
496  if (OpRHSA && OpRHSB && OpRHSA->getOpcode() == Instruction::Add &&
497  OpRHSB->getOpcode() == Instruction::Add && CheckFlags(OpRHSA, Signed) &&
498  CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1)) &&
499  isa<ConstantInt>(OpRHSB->getOperand(1))) {
500  int64_t CstValA =
501  cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
502  int64_t CstValB =
503  cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
504  if (OpRHSA->getOperand(0) == OpRHSB->getOperand(0) &&
505  IdxDiff.getSExtValue() == (CstValB - CstValA))
506  Safe = true;
507  }
508  }
509 
510  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
511 
512   // Third attempt:
513   // If all bits set in IdxDiff, and any higher-order bits other than the sign
514   // bit, are known to be zero in ValA, we can add IdxDiff to ValA while
515   // guaranteeing no overflow of any sort.
516  if (!Safe) {
517  OpA = dyn_cast<Instruction>(ValA);
518  if (!OpA)
519  return false;
520  KnownBits Known(BitWidth);
521  computeKnownBits(OpA, Known, DL, 0, &AC, OpA, &DT);
522  APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
523  if (Signed)
524  BitsAllowedToBeSet.clearBit(BitWidth - 1);
525  if (BitsAllowedToBeSet.ult(IdxDiff))
526  return false;
527  }
528 
529  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
530  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
531  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
532  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
533  return X == OffsetSCEVB;
534 }
535 
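// Illustrative sketch of the case handled below: both pointers are selects
// over the same condition, e.g.
//   %pA = select i1 %c, i32* %x, i32* %y
//   %pB = select i1 %c, i32* %x1, i32* %y1
// They are consecutive (by PtrDelta) iff the true values and the false values
// are each consecutive by that same delta.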
536 bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
537  const APInt &PtrDelta,
538  unsigned Depth) const {
539  if (Depth++ == MaxDepth)
540  return false;
541 
542  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
543  if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
544  return SelectA->getCondition() == SelectB->getCondition() &&
545  areConsecutivePointers(SelectA->getTrueValue(),
546  SelectB->getTrueValue(), PtrDelta, Depth) &&
547  areConsecutivePointers(SelectA->getFalseValue(),
548  SelectB->getFalseValue(), PtrDelta, Depth);
549  }
550  }
551  return false;
552 }
553 
554 void Vectorizer::reorder(Instruction *I) {
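  // The vectorized access is created at the position of the first scalar
  // access in the chain, so instructions that define its operands (e.g. the
  // bitcast of the pointer operand) may still sit later in the block; they
  // are hoisted here so that they dominate I.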
555   SmallPtrSet<Instruction *, 16> InstructionsToMove;
556   SmallVector<Instruction *, 16> Worklist;
557 
558  Worklist.push_back(I);
559  while (!Worklist.empty()) {
560  Instruction *IW = Worklist.pop_back_val();
561  int NumOperands = IW->getNumOperands();
562  for (int i = 0; i < NumOperands; i++) {
563  Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
564  if (!IM || IM->getOpcode() == Instruction::PHI)
565  continue;
566 
567  // If IM is in another BB, no need to move it, because this pass only
568  // vectorizes instructions within one BB.
569  if (IM->getParent() != I->getParent())
570  continue;
571 
572  if (!IM->comesBefore(I)) {
573  InstructionsToMove.insert(IM);
574  Worklist.push_back(IM);
575  }
576  }
577  }
578 
579  // All instructions to move should follow I. Start from I, not from begin().
580  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
581  ++BBI) {
582  if (!InstructionsToMove.count(&*BBI))
583  continue;
584  Instruction *IM = &*BBI;
585  --BBI;
586  IM->removeFromParent();
587  IM->insertBefore(I);
588  }
589 }
590 
591 std::pair<BasicBlock::iterator, BasicBlock::iterator>
592 Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
593  Instruction *C0 = Chain[0];
594  BasicBlock::iterator FirstInstr = C0->getIterator();
595  BasicBlock::iterator LastInstr = C0->getIterator();
596 
597  BasicBlock *BB = C0->getParent();
598  unsigned NumFound = 0;
599  for (Instruction &I : *BB) {
600  if (!is_contained(Chain, &I))
601  continue;
602 
603  ++NumFound;
604  if (NumFound == 1) {
605  FirstInstr = I.getIterator();
606  }
607  if (NumFound == Chain.size()) {
608  LastInstr = I.getIterator();
609  break;
610  }
611  }
612 
613  // Range is [first, last).
614  return std::make_pair(FirstInstr, ++LastInstr);
615 }
616 
617 void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
618   SmallVector<Instruction *, 16> Instrs;
619   for (Instruction *I : Chain) {
620  Value *PtrOperand = getLoadStorePointerOperand(I);
621  assert(PtrOperand && "Instruction must have a pointer operand.");
622  Instrs.push_back(I);
623  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
624  Instrs.push_back(GEP);
625  }
626 
627  // Erase instructions.
628  for (Instruction *I : Instrs)
629  if (I->use_empty())
630  I->eraseFromParent();
631 }
632 
633 std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
634 Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
635  unsigned ElementSizeBits) {
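  // Worked example (illustrative): a chain of 7 accesses of i16 has
  // ElementSizeBytes = 2 and SizeBytes = 14, so NumLeft = (14 - 14 % 4) / 2 = 6
  // and the chain is split into a 12-byte piece and the single trailing
  // element.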
636  unsigned ElementSizeBytes = ElementSizeBits / 8;
637  unsigned SizeBytes = ElementSizeBytes * Chain.size();
638  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
639  if (NumLeft == Chain.size()) {
640  if ((NumLeft & 1) == 0)
641  NumLeft /= 2; // Split even in half
642  else
643  --NumLeft; // Split off last element
644  } else if (NumLeft == 0)
645  NumLeft = 1;
646  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
647 }
648 
649 ArrayRef<Instruction *>
650 Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
651  // These are in BB order, unlike Chain, which is in address order.
652  SmallVector<Instruction *, 16> MemoryInstrs;
653  SmallVector<Instruction *, 16> ChainInstrs;
654 
655  bool IsLoadChain = isa<LoadInst>(Chain[0]);
656  LLVM_DEBUG({
657  for (Instruction *I : Chain) {
658  if (IsLoadChain)
659  assert(isa<LoadInst>(I) &&
660  "All elements of Chain must be loads, or all must be stores.");
661  else
662  assert(isa<StoreInst>(I) &&
663  "All elements of Chain must be loads, or all must be stores.");
664  }
665  });
666 
667  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
668  if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
669  if (!is_contained(Chain, &I))
670  MemoryInstrs.push_back(&I);
671  else
672  ChainInstrs.push_back(&I);
673  } else if (isa<IntrinsicInst>(&I) &&
674  cast<IntrinsicInst>(&I)->getIntrinsicID() ==
675  Intrinsic::sideeffect) {
676  // Ignore llvm.sideeffect calls.
677  } else if (isa<IntrinsicInst>(&I) &&
678  cast<IntrinsicInst>(&I)->getIntrinsicID() ==
679  Intrinsic::pseudoprobe) {
680  // Ignore llvm.pseudoprobe calls.
681  } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
682  LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
683  << '\n');
684  break;
685  } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
686  LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
687  << '\n');
688  break;
689  }
690  }
691 
692  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
693  unsigned ChainInstrIdx = 0;
694  Instruction *BarrierMemoryInstr = nullptr;
695 
696  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
697  Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
698 
699  // If a barrier memory instruction was found, chain instructions that follow
700  // will not be added to the valid prefix.
701  if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
702  break;
703 
704  // Check (in BB order) if any instruction prevents ChainInstr from being
705  // vectorized. Find and store the first such "conflicting" instruction.
706  for (Instruction *MemInstr : MemoryInstrs) {
707  // If a barrier memory instruction was found, do not check past it.
708  if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
709  break;
710 
711  auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
712  auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
713  if (MemLoad && ChainLoad)
714  continue;
715 
716       // We can ignore the alias if we have a load/store pair and the load
717       // is known to be invariant. The load cannot be clobbered by the store.
718  auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
719  return LI->hasMetadata(LLVMContext::MD_invariant_load);
720  };
721 
722  // We can ignore the alias as long as the load comes before the store,
723  // because that means we won't be moving the load past the store to
724  // vectorize it (the vectorized load is inserted at the location of the
725  // first load in the chain).
726  if (isa<StoreInst>(MemInstr) && ChainLoad &&
727  (IsInvariantLoad(ChainLoad) || ChainLoad->comesBefore(MemInstr)))
728  continue;
729 
730  // Same case, but in reverse.
731  if (MemLoad && isa<StoreInst>(ChainInstr) &&
732  (IsInvariantLoad(MemLoad) || MemLoad->comesBefore(ChainInstr)))
733  continue;
734 
735  if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
736  MemoryLocation::get(ChainInstr))) {
737  LLVM_DEBUG({
738  dbgs() << "LSV: Found alias:\n"
739  " Aliasing instruction and pointer:\n"
740  << " " << *MemInstr << '\n'
741  << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
742  << " Aliased instruction and pointer:\n"
743  << " " << *ChainInstr << '\n'
744  << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
745  });
746  // Save this aliasing memory instruction as a barrier, but allow other
747  // instructions that precede the barrier to be vectorized with this one.
748  BarrierMemoryInstr = MemInstr;
749  break;
750  }
751  }
752  // Continue the search only for store chains, since vectorizing stores that
753  // precede an aliasing load is valid. Conversely, vectorizing loads is valid
754  // up to an aliasing store, but should not pull loads from further down in
755  // the basic block.
756  if (IsLoadChain && BarrierMemoryInstr) {
757  // The BarrierMemoryInstr is a store that precedes ChainInstr.
758  assert(BarrierMemoryInstr->comesBefore(ChainInstr));
759  break;
760  }
761  }
762 
763  // Find the largest prefix of Chain whose elements are all in
764  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
765  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
766  // order.)
767  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
768  ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
769  unsigned ChainIdx = 0;
770  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
771  if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
772  break;
773  }
774  return Chain.slice(0, ChainIdx);
775 }
776 
777 static ChainID getChainID(const Value *Ptr) {
778  const Value *ObjPtr = getUnderlyingObject(Ptr);
779  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
780     // The selects themselves are distinct instructions even if they share the
781     // same condition and evaluate to consecutive pointers for the true and
782     // false values of the condition. Therefore, using the selects themselves
783     // for grouping instructions would put consecutive accesses into different
784     // lists; they would not even be checked for being consecutive, and would
785     // not be vectorized.
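    // For example (an illustrative sketch), given
    //   %pA = select i1 %c, i32* %a0, i32* %b0
    //   %pB = select i1 %c, i32* %a1, i32* %b1
    // accesses through %pA and %pB are grouped under %c, so they can still be
    // recognized as consecutive by lookThroughSelects.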
786  return Sel->getCondition();
787  }
788  return ObjPtr;
789 }
790 
791 std::pair<InstrListMap, InstrListMap>
792 Vectorizer::collectInstructions(BasicBlock *BB) {
793  InstrListMap LoadRefs;
794  InstrListMap StoreRefs;
795 
796  for (Instruction &I : *BB) {
797  if (!I.mayReadOrWriteMemory())
798  continue;
799 
800  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
801  if (!LI->isSimple())
802  continue;
803 
804  // Skip if it's not legal.
805  if (!TTI.isLegalToVectorizeLoad(LI))
806  continue;
807 
808  Type *Ty = LI->getType();
809       if (!VectorType::isValidElementType(Ty->getScalarType()))
810         continue;
811 
812  // Skip weird non-byte sizes. They probably aren't worth the effort of
813  // handling correctly.
814  unsigned TySize = DL.getTypeSizeInBits(Ty);
815  if ((TySize % 8) != 0)
816  continue;
817 
818       // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
819       // functions currently use an integer type for the vectorized
820       // load/store, and do not support casting between the integer type and a
821       // vector of pointers (e.g. i64 to <2 x i16*>).
822  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
823  continue;
824 
825  Value *Ptr = LI->getPointerOperand();
826  unsigned AS = Ptr->getType()->getPointerAddressSpace();
827  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
828 
829  unsigned VF = VecRegSize / TySize;
830  VectorType *VecTy = dyn_cast<VectorType>(Ty);
831 
832  // No point in looking at these if they're too big to vectorize.
833  if (TySize > VecRegSize / 2 ||
834  (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
835  continue;
836 
837  // Make sure all the users of a vector are constant-index extracts.
838  if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
839  const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
840  return EEI && isa<ConstantInt>(EEI->getOperand(1));
841  }))
842  continue;
843 
844  // Save the load locations.
845  const ChainID ID = getChainID(Ptr);
846  LoadRefs[ID].push_back(LI);
847  } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
848  if (!SI->isSimple())
849  continue;
850 
851  // Skip if it's not legal.
852       if (!TTI.isLegalToVectorizeStore(SI))
853         continue;
854 
855  Type *Ty = SI->getValueOperand()->getType();
856       if (!VectorType::isValidElementType(Ty->getScalarType()))
857         continue;
858 
859       // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
860       // functions currently use an integer type for the vectorized
861       // load/store, and do not support casting between the integer type and a
862       // vector of pointers (e.g. i64 to <2 x i16*>).
863  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
864  continue;
865 
866  // Skip weird non-byte sizes. They probably aren't worth the effort of
867  // handling correctly.
868  unsigned TySize = DL.getTypeSizeInBits(Ty);
869  if ((TySize % 8) != 0)
870  continue;
871 
872  Value *Ptr = SI->getPointerOperand();
873  unsigned AS = Ptr->getType()->getPointerAddressSpace();
874  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
875 
876  unsigned VF = VecRegSize / TySize;
877  VectorType *VecTy = dyn_cast<VectorType>(Ty);
878 
879  // No point in looking at these if they're too big to vectorize.
880  if (TySize > VecRegSize / 2 ||
881  (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
882  continue;
883 
884  if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
885  const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
886  return EEI && isa<ConstantInt>(EEI->getOperand(1));
887  }))
888  continue;
889 
890  // Save store location.
891  const ChainID ID = getChainID(Ptr);
892  StoreRefs[ID].push_back(SI);
893  }
894  }
895 
896  return {LoadRefs, StoreRefs};
897 }
898 
899 bool Vectorizer::vectorizeChains(InstrListMap &Map) {
900  bool Changed = false;
901 
902  for (const std::pair<ChainID, InstrList> &Chain : Map) {
903  unsigned Size = Chain.second.size();
904  if (Size < 2)
905  continue;
906 
907  LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
908 
909     // Process the accesses in chunks of 64.
910  for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
911  unsigned Len = std::min<unsigned>(CE - CI, 64);
912  ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
913  Changed |= vectorizeInstructions(Chunk);
914  }
915  }
916 
917  return Changed;
918 }
919 
920 bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
921  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
922  << " instructions.\n");
923  SmallVector<int, 16> Heads, Tails;
924  int ConsecutiveChain[64];
925 
926  // Do a quadratic search on all of the given loads/stores and find all of the
927  // pairs of loads/stores that follow each other.
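  // For example (illustrative): for three 4-byte loads at offsets 0, 4 and 8
  // (in that order in Instrs), ConsecutiveChain becomes {0 -> 1, 1 -> 2,
  // 2 -> -1}, Heads = {0, 1} and Tails = {1, 2}; the loop below then starts
  // only from head 0 and follows 0 -> 1 -> 2 as a single chain.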
928  for (int i = 0, e = Instrs.size(); i < e; ++i) {
929  ConsecutiveChain[i] = -1;
930  for (int j = e - 1; j >= 0; --j) {
931  if (i == j)
932  continue;
933 
934  if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
935  if (ConsecutiveChain[i] != -1) {
936  int CurDistance = std::abs(ConsecutiveChain[i] - i);
937  int NewDistance = std::abs(ConsecutiveChain[i] - j);
938  if (j < i || NewDistance > CurDistance)
939  continue; // Should not insert.
940  }
941 
942  Tails.push_back(j);
943  Heads.push_back(i);
944  ConsecutiveChain[i] = j;
945  }
946  }
947  }
948 
949  bool Changed = false;
950  SmallPtrSet<Instruction *, 16> InstructionsProcessed;
951 
952  for (int Head : Heads) {
953  if (InstructionsProcessed.count(Instrs[Head]))
954  continue;
955  bool LongerChainExists = false;
956  for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
957  if (Head == Tails[TIt] &&
958  !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
959  LongerChainExists = true;
960  break;
961  }
962  if (LongerChainExists)
963  continue;
964 
965  // We found an instr that starts a chain. Now follow the chain and try to
966  // vectorize it.
967     SmallVector<Instruction *, 16> Operands;
968     int I = Head;
969  while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
970  if (InstructionsProcessed.count(Instrs[I]))
971  break;
972 
973  Operands.push_back(Instrs[I]);
974  I = ConsecutiveChain[I];
975  }
976 
977  bool Vectorized = false;
978  if (isa<LoadInst>(*Operands.begin()))
979  Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
980  else
981  Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
982 
983  Changed |= Vectorized;
984  }
985 
986  return Changed;
987 }
988 
989 bool Vectorizer::vectorizeStoreChain(
990     ArrayRef<Instruction *> Chain,
991     SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
992  StoreInst *S0 = cast<StoreInst>(Chain[0]);
993 
994  // If the vector has an int element, default to int for the whole store.
995  Type *StoreTy = nullptr;
996  for (Instruction *I : Chain) {
997  StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
998  if (StoreTy->isIntOrIntVectorTy())
999  break;
1000 
1001  if (StoreTy->isPtrOrPtrVectorTy()) {
1002  StoreTy = Type::getIntNTy(F.getParent()->getContext(),
1003  DL.getTypeSizeInBits(StoreTy));
1004  break;
1005  }
1006  }
1007  assert(StoreTy && "Failed to find store type");
1008 
1009  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
1010  unsigned AS = S0->getPointerAddressSpace();
1011  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1012  unsigned VF = VecRegSize / Sz;
1013  unsigned ChainSize = Chain.size();
1014  Align Alignment = S0->getAlign();
1015 
1016  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1017  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1018  return false;
1019  }
1020 
1021  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1022  if (NewChain.empty()) {
1023  // No vectorization possible.
1024  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1025  return false;
1026  }
1027  if (NewChain.size() == 1) {
1028  // Failed after the first instruction. Discard it and try the smaller chain.
1029  InstructionsProcessed->insert(NewChain.front());
1030  return false;
1031  }
1032 
1033  // Update Chain to the valid vectorizable subchain.
1034  Chain = NewChain;
1035  ChainSize = Chain.size();
1036 
1037  // Check if it's legal to vectorize this chain. If not, split the chain and
1038  // try again.
1039  unsigned EltSzInBytes = Sz / 8;
1040  unsigned SzInBytes = EltSzInBytes * ChainSize;
1041 
1042  FixedVectorType *VecTy;
1043  auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
1044  if (VecStoreTy)
1045  VecTy = FixedVectorType::get(StoreTy->getScalarType(),
1046  Chain.size() * VecStoreTy->getNumElements());
1047  else
1048  VecTy = FixedVectorType::get(StoreTy, Chain.size());
1049 
1050  // If it's more than the max vector size or the target has a better
1051  // vector factor, break it into two pieces.
1052  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
1053  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1054  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1055  " Creating two separate arrays.\n");
1056  return vectorizeStoreChain(Chain.slice(0, TargetVF),
1057  InstructionsProcessed) |
1058  vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
1059  }
1060 
1061  LLVM_DEBUG({
1062  dbgs() << "LSV: Stores to vectorize:\n";
1063  for (Instruction *I : Chain)
1064  dbgs() << " " << *I << "\n";
1065  });
1066 
1067  // We won't try again to vectorize the elements of the chain, regardless of
1068  // whether we succeed below.
1069  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1070 
1071  // If the store is going to be misaligned, don't vectorize it.
1072  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1073  if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1074  auto Chains = splitOddVectorElts(Chain, Sz);
1075  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1076  vectorizeStoreChain(Chains.second, InstructionsProcessed);
1077  }
1078 
1079     Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
1080                                                 Align(StackAdjustedAlignment),
1081                                                 DL, S0, nullptr, &DT);
1082  if (NewAlign >= Alignment)
1083  Alignment = NewAlign;
1084  else
1085  return false;
1086  }
1087 
1088  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
1089  auto Chains = splitOddVectorElts(Chain, Sz);
1090  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1091  vectorizeStoreChain(Chains.second, InstructionsProcessed);
1092  }
1093 
1094   BasicBlock::iterator First, Last;
1095   std::tie(First, Last) = getBoundaryInstrs(Chain);
1096  Builder.SetInsertPoint(&*Last);
1097 
1098  Value *Vec = UndefValue::get(VecTy);
1099 
1100  if (VecStoreTy) {
1101  unsigned VecWidth = VecStoreTy->getNumElements();
1102  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1103  StoreInst *Store = cast<StoreInst>(Chain[I]);
1104  for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
1105  unsigned NewIdx = J + I * VecWidth;
1106  Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
1107  Builder.getInt32(J));
1108  if (Extract->getType() != StoreTy->getScalarType())
1109  Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
1110 
1111  Value *Insert =
1112  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
1113  Vec = Insert;
1114  }
1115  }
1116  } else {
1117  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1118  StoreInst *Store = cast<StoreInst>(Chain[I]);
1119  Value *Extract = Store->getValueOperand();
1120  if (Extract->getType() != StoreTy->getScalarType())
1121  Extract =
1122  Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
1123 
1124  Value *Insert =
1125  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
1126  Vec = Insert;
1127  }
1128  }
1129 
1130  StoreInst *SI = Builder.CreateAlignedStore(
1131  Vec,
1132  Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
1133  Alignment);
1134  propagateMetadata(SI, Chain);
1135 
1136  eraseInstructions(Chain);
1137  ++NumVectorInstructions;
1138  NumScalarsVectorized += Chain.size();
1139  return true;
1140 }
1141 
1142 bool Vectorizer::vectorizeLoadChain(
1143     ArrayRef<Instruction *> Chain,
1144     SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
1145  LoadInst *L0 = cast<LoadInst>(Chain[0]);
1146 
1147  // If the vector has an int element, default to int for the whole load.
1148  Type *LoadTy = nullptr;
1149  for (const auto &V : Chain) {
1150  LoadTy = cast<LoadInst>(V)->getType();
1151  if (LoadTy->isIntOrIntVectorTy())
1152  break;
1153 
1154  if (LoadTy->isPtrOrPtrVectorTy()) {
1155  LoadTy = Type::getIntNTy(F.getParent()->getContext(),
1156  DL.getTypeSizeInBits(LoadTy));
1157  break;
1158  }
1159  }
1160  assert(LoadTy && "Can't determine LoadInst type from chain");
1161 
1162  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
1163  unsigned AS = L0->getPointerAddressSpace();
1164  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1165  unsigned VF = VecRegSize / Sz;
1166  unsigned ChainSize = Chain.size();
1167  Align Alignment = L0->getAlign();
1168 
1169  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1170  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1171  return false;
1172  }
1173 
1174  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1175  if (NewChain.empty()) {
1176  // No vectorization possible.
1177  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1178  return false;
1179  }
1180  if (NewChain.size() == 1) {
1181  // Failed after the first instruction. Discard it and try the smaller chain.
1182  InstructionsProcessed->insert(NewChain.front());
1183  return false;
1184  }
1185 
1186  // Update Chain to the valid vectorizable subchain.
1187  Chain = NewChain;
1188  ChainSize = Chain.size();
1189 
1190  // Check if it's legal to vectorize this chain. If not, split the chain and
1191  // try again.
1192  unsigned EltSzInBytes = Sz / 8;
1193  unsigned SzInBytes = EltSzInBytes * ChainSize;
1194  VectorType *VecTy;
1195  auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
1196  if (VecLoadTy)
1197  VecTy = FixedVectorType::get(LoadTy->getScalarType(),
1198  Chain.size() * VecLoadTy->getNumElements());
1199  else
1200  VecTy = FixedVectorType::get(LoadTy, Chain.size());
1201 
1202  // If it's more than the max vector size or the target has a better
1203  // vector factor, break it into two pieces.
1204  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
1205  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1206  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1207  " Creating two separate arrays.\n");
1208  return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
1209  vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
1210  }
1211 
1212  // We won't try again to vectorize the elements of the chain, regardless of
1213  // whether we succeed below.
1214  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1215 
1216  // If the load is going to be misaligned, don't vectorize it.
1217  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1218  if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1219  auto Chains = splitOddVectorElts(Chain, Sz);
1220  return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1221  vectorizeLoadChain(Chains.second, InstructionsProcessed);
1222  }
1223 
1224     Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
1225                                                 Align(StackAdjustedAlignment),
1226                                                 DL, L0, nullptr, &DT);
1227  if (NewAlign >= Alignment)
1228  Alignment = NewAlign;
1229  else
1230  return false;
1231  }
1232 
1233  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
1234  auto Chains = splitOddVectorElts(Chain, Sz);
1235  return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1236  vectorizeLoadChain(Chains.second, InstructionsProcessed);
1237  }
1238 
1239  LLVM_DEBUG({
1240  dbgs() << "LSV: Loads to vectorize:\n";
1241  for (Instruction *I : Chain)
1242  I->dump();
1243  });
1244 
1245  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
1246  // Last may have changed since then, but the value of First won't have. If it
1247  // matters, we could compute getBoundaryInstrs only once and reuse it here.
1248   BasicBlock::iterator First, Last;
1249   std::tie(First, Last) = getBoundaryInstrs(Chain);
1250  Builder.SetInsertPoint(&*First);
1251 
1252  Value *Bitcast =
1253  Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1254  LoadInst *LI =
1255  Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
1256  propagateMetadata(LI, Chain);
1257 
1258  if (VecLoadTy) {
1259  SmallVector<Instruction *, 16> InstrsToErase;
1260 
1261  unsigned VecWidth = VecLoadTy->getNumElements();
1262  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1263  for (auto Use : Chain[I]->users()) {
1264  // All users of vector loads are ExtractElement instructions with
1265  // constant indices, otherwise we would have bailed before now.
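        // For example (illustrative): when two <2 x i32> loads are combined
        // into one <4 x i32> load, an extractelement of index 1 from the
        // second original load (I == 1) is rewritten to extract index
        // 1 + 1 * 2 == 3 from the wide load.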
1266  Instruction *UI = cast<Instruction>(Use);
1267  unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
1268  unsigned NewIdx = Idx + I * VecWidth;
1269  Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
1270  UI->getName());
1271  if (V->getType() != UI->getType())
1272  V = Builder.CreateBitCast(V, UI->getType());
1273 
1274  // Replace the old instruction.
1275  UI->replaceAllUsesWith(V);
1276  InstrsToErase.push_back(UI);
1277  }
1278  }
1279 
1280     // Bitcast might not be an Instruction if the pointer operand is a
1281     // constant; in that case, there is nothing to reorder.
1282  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1283  reorder(BitcastInst);
1284 
1285  for (auto I : InstrsToErase)
1286  I->eraseFromParent();
1287  } else {
1288  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1289  Value *CV = Chain[I];
1290  Value *V =
1291  Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
1292  if (V->getType() != CV->getType()) {
1293  V = Builder.CreateBitOrPointerCast(V, CV->getType());
1294  }
1295 
1296  // Replace the old instruction.
1297  CV->replaceAllUsesWith(V);
1298  }
1299 
1300  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1301  reorder(BitcastInst);
1302  }
1303 
1304  eraseInstructions(Chain);
1305 
1306  ++NumVectorInstructions;
1307  NumScalarsVectorized += Chain.size();
1308  return true;
1309 }
1310 
1311 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
1312  Align Alignment) {
1313  if (Alignment.value() % SzInBytes == 0)
1314  return false;
1315 
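  // For example (illustrative): a 16-byte vectorized access whose chain only
  // guarantees 4-byte alignment reaches this point, and the decision is
  // deferred to the target, which must report the access as both allowed and
  // fast for it to be treated as sufficiently aligned.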
1316  bool Fast = false;
1317  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
1318  SzInBytes * 8, AddressSpace,
1319  Alignment, &Fast);
1320  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
1321  << " and fast? " << Fast << "\n";);
1322  return !Allows || !Fast;
1323 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
i
i
Definition: README.txt:29
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:155
llvm::ScalarEvolution::getContext
LLVMContext & getContext() const
Definition: ScalarEvolution.h:481
AssumptionCache.h
llvm::AAManager
A manager for alias analyses.
Definition: AliasAnalysis.h:1221
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:2319
Signed
@ Signed
Definition: NVPTXISelLowering.cpp:4543
llvm::ScalarEvolutionAnalysis
Analysis pass that exposes the ScalarEvolution for a function.
Definition: ScalarEvolution.h:2074
MathExtras.h
llvm::MemoryLocation::get
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
Definition: MemoryLocation.cpp:37
llvm
Definition: AllocatorList.h:23
llvm::initializeLoadStoreVectorizerLegacyPassPass
void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &)
llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition: iterator_range.h:53
llvm::AArch64CC::NE
@ NE
Definition: AArch64BaseInfo.h:237
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::generic_gep_type_iterator
Definition: GetElementPtrTypeIterator.h:31
Insert
Vector Rotate Left Mask Mask Insert
Definition: README_P9.txt:112
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
IntrinsicInst.h
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:785
llvm::Function
Definition: Function.h:61
Pass.h
store
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM ID Predecessors according to mbb< bb27, 0x8b0a7c0 > Note ADDri is not a two address instruction its result reg1037 is an operand of the PHI node in bb76 and its operand reg1039 is the result of the PHI node We should treat it as a two address code and make sure the ADDri is scheduled after any node that reads reg1039 Use info(i.e. register scavenger) to assign it a free register to allow reuse the collector could move the objects and invalidate the derived pointer This is bad enough in the first but safe points can crop up unpredictably **array_addr i32 n y store obj obj **nth_el If the i64 division is lowered to a then a safe point array and nth_el no longer point into the correct object The fix for this is to copy address calculations so that dependent pointers are never live across safe point boundaries But the loads cannot be copied like this if there was an intervening store
Definition: README.txt:133
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:317
llvm::SmallVector< Instruction *, 8 >
Statistic.h
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1643
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:167
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:693
llvm::IRBuilder<>
MapVector.h
llvm::AAResults::isNoAlias
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Definition: AliasAnalysis.h:510
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:443
llvm::Instruction::hasNoUnsignedWrap
bool hasNoUnsignedWrap() const
Determine whether the no unsigned wrap flag is set.
Definition: Instruction.cpp:132
ValueTracking.h
Local.h
llvm::Instruction::insertBefore
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:84
llvm::LoadStoreVectorizerPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition: LoadStoreVectorizer.cpp:255
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
APInt.h
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:34
ScalarEvolution.h
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, "Vectorize load and Store instructions", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1581
Module.h
llvm::getOrEnforceKnownAlignment
Align getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to ensure that the alignment of V is at least PrefAlign bytes.
Definition: Local.cpp:1337
llvm::Instruction::comesBefore
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
Definition: Instruction.cpp:111
llvm::isConsecutiveAccess
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
Definition: LoopAccessAnalysis.cpp:1235
llvm::MapVector
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
llvm::SmallPtrSet< Instruction *, 16 >
STLExtras.h
llvm::SmallVectorImpl::pop_back_val
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:635
llvm::propagateMetadata
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
Definition: VectorUtils.cpp:710
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:266
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::gep_type_begin
gep_type_iterator gep_type_begin(const User *GEP)
Definition: GetElementPtrTypeIterator.h:139
and
We currently generate a but we really shouldn eax ecx xorl edx divl ecx eax divl ecx movl eax ret A similar code sequence works for division We currently compile i32 v2 eax eax jo LBB1_2 and
Definition: README.txt:1271
llvm::CallingConv::Fast
@ Fast
Fast - This calling convention attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:42
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:527
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:222
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:158
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
F
#define F(x, y, z)
Definition: MD5.cpp:56
KnownBits.h
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
AliasAnalysis.h
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
Instruction.h
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:160
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1482
llvm::APInt::isNegative
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:364
llvm::TargetTransformInfo::isLegalToVectorizeLoad
bool isLegalToVectorizeLoad(LoadInst *LI) const
Definition: TargetTransformInfo.cpp:980
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
Constants.h
llvm::AAResults
Definition: AliasAnalysis.h:456
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::User
Definition: User.h:44
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
InstrTypes.h
SI
@ SI
Definition: SIInstrInfo.cpp:7463
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::generic_gep_type_iterator::getOperand
Value * getOperand() const
Definition: GetElementPtrTypeIterator.h:78
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:235
false
Definition: StackSlotColoring.cpp:142
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:147
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:281
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1770
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value, returning the original object being addressed.
Definition: ValueTracking.cpp:4314
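A small sketch of how this is typically used when grouping accesses; sameUnderlyingObject is a hypothetical helper name.
#include "llvm/Analysis/ValueTracking.h"
using namespace llvm;
// Two pointers can only be proven to address adjacent memory if they resolve
// to the same underlying object (alloca, global, argument, ...).
static bool sameUnderlyingObject(const Value *PtrA, const Value *PtrB) {
  return getUnderlyingObject(PtrA) == getUnderlyingObject(PtrB);
}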
llvm::ScalarEvolutionWrapperPass
Definition: ScalarEvolution.h:2104
llvm::createLoadStoreVectorizerPass
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
SmallPtrSet.h
llvm::Instruction::removeFromParent
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:74
llvm::APInt::sle
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1259
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:644
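An illustrative sketch of building the wide type for a chain; chainVectorType is an invented name.
#include "llvm/IR/DerivedTypes.h"
#include <cassert>
using namespace llvm;
// Build the vector type that ChainLen scalar accesses of type EltTy would be
// widened to, e.g. (i32, 4) -> <4 x i32>.
static FixedVectorType *chainVectorType(Type *EltTy, unsigned ChainLen) {
  assert(VectorType::isValidElementType(EltTy) && "type cannot be an element");
  return FixedVectorType::get(EltTy, ChainLen);
}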
llvm::StoreInst::getAlign
Align getAlign() const
Definition: Instructions.h:353
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::ArrayRef::slice
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:193
Type.h
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:78
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:3893
getChainID
static ChainID getChainID(const Value *Ptr)
Definition: LoadStoreVectorizer.cpp:777
llvm::VectorType::isValidElementType
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a vector element type.
Definition: Type.cpp:635
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:391
VectorUtils.h
BasicBlock.h
LoadStoreVectorizer.h
llvm::LoadInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:272
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:78
llvm::instructions
inst_range instructions(Function *F)
Definition: InstIterator.h:133
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:303
llvm::SCEVAAWrapperPass
Legacy wrapper pass to provide the SCEVAAResult object.
Definition: ScalarEvolutionAliasAnalysis.h:52
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2375
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition: AssumptionCache.h:169
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
StackAdjustedAlignment
static const unsigned StackAdjustedAlignment
Definition: LoadStoreVectorizer.cpp:97
llvm::LegalizeActions::Bitcast
@ Bitcast
Perform the operation on a different, but equivalently sized type.
Definition: LegalizerInfo.h:72
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:57
MemoryLocation.h
llvm::APInt::negate
void negate()
Negate this APInt in place.
Definition: APInt.h:1563
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::GetElementPtrInst
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:905
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1547
ArrayRef.h
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
Definition: ValueTracking.cpp:212
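A hedged sketch of one common use of known bits when reasoning about addresses; knownTrailingZeroBits is an invented helper.
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
// Count how many low bits of V are provably zero; for a pointer, N trailing
// zeros imply the address is a multiple of 2^N bytes.
static unsigned knownTrailingZeroBits(const Value *V, const DataLayout &DL) {
  KnownBits Known(DL.getTypeSizeInBits(V->getType()));
  computeKnownBits(V, Known, DL);
  return Known.countMinTrailingZeros();
}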
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:840
llvm::IndexedInstrProf::HashT::Last
@ Last
llvm::TargetTransformInfo::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: TargetTransformInfo.cpp:988
llvm::generic_gep_type_iterator::getIndexedType
Type * getIndexedType() const
Definition: GetElementPtrTypeIterator.h:72
iterator_range.h
llvm::TTI
TargetTransformInfo TTI
Definition: TargetTransformInfo.h:162
llvm::WinEH::EncodingType::CE
@ CE
Windows NT (Windows on ARM)
llvm::TargetTransformInfo::allowsMisalignedMemoryAccesses
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace=0, Align Alignment=Align(1), bool *Fast=nullptr) const
Determine if the target supports unaligned memory accesses.
Definition: TargetTransformInfo.cpp:511
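A minimal sketch of the usual legality/profitability check around this query; misalignedAccessIsFast and its parameters are illustrative only.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;
// Accept a misaligned wide access only if the target both allows it and
// reports that it is fast; otherwise the chain should be split or realigned.
static bool misalignedAccessIsFast(const TargetTransformInfo &TTI,
                                   LLVMContext &Ctx, unsigned SizeInBits,
                                   unsigned AddrSpace, Align Alignment) {
  bool Fast = false;
  return TTI.allowsMisalignedMemoryAccesses(Ctx, SizeInBits, AddrSpace,
                                            Alignment, &Fast) &&
         Fast;
}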
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
llvm::APInt::urem
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1693
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:70
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:200
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: APInt.h:32
llvm::BinaryOperator
Definition: InstrTypes.h:190
DataLayout.h
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass if, and only if, it does not: (1) add or remove basic blocks from the function, or (2) modify terminator instructions in any way.
Definition: Pass.cpp:253
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:57
llvm::APInt::zextOrSelf
APInt zextOrSelf(unsigned width) const
Zero extend to width if it is wider than the current bit width; otherwise return the value unchanged.
Definition: APInt.cpp:970
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:41
llvm::ScalarEvolution::getConstant
const SCEV * getConstant(ConstantInt *V)
Definition: ScalarEvolution.cpp:444
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::CFGAnalyses
Represents analyses that only rely on functions' control flow.
Definition: PassManager.h:116
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:527
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:81
llvm::Type::isPtrOrPtrVectorTy
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:232
llvm::APInt::ult
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1205
llvm::TargetTransformInfo::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: TargetTransformInfo.cpp:1005
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:299
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::ArrayRef::front
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:166
users
iv users
Definition: IVUsers.cpp:52
llvm::APInt::udiv
APInt udiv(const APInt &RHS) const
Unsigned division operation.
Definition: APInt.cpp:1600
llvm::APInt::clearBit
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1525
llvm::TargetTransformInfo::isLegalToVectorizeStore
bool isLegalToVectorizeStore(StoreInst *SI) const
Definition: TargetTransformInfo.cpp:984
MaxDepth
static const unsigned MaxDepth
Definition: InstCombineMulDivRem.cpp:853
llvm::Value::stripAndAccumulateInBoundsConstantOffsets
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets that only looks through in-bounds GEPs (AllowNonInbounds is set to false).
Definition: Value.h:731
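A hedged sketch of how constant in-bounds offsets are typically compared; constantByteDistance is an invented helper, and both pointers are assumed to be in the same address space.
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Optional.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"
using namespace llvm;
// If PtrA and PtrB are constant in-bounds offsets from the same base object,
// return the byte distance PtrB - PtrA; otherwise return None.
static Optional<int64_t> constantByteDistance(const Value *PtrA,
                                              const Value *PtrB,
                                              const DataLayout &DL) {
  unsigned IdxWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
  APInt OffA(IdxWidth, 0), OffB(IdxWidth, 0);
  const Value *BaseA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffA);
  const Value *BaseB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffB);
  if (BaseA != BaseB)
    return None;
  return (OffB - OffA).getSExtValue();
}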
llvm::APInt::zext
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:930
DEBUG_TYPE
#define DEBUG_TYPE
Definition: LoadStoreVectorizer.cpp:91
Attributes.h
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
llvm::APInt::trunc
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:858
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::getIntNTy
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:201
llvm::ScalarEvolution::getMinusSCEV
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS. Minus is represented in SCEV as A+B*-1.
Definition: ScalarEvolution.cpp:3994
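An illustrative sketch of expressing a pointer distance with SCEV when plain offset stripping is inconclusive; pointerDistance is an invented name.
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Value.h"
using namespace llvm;
// Return the SCEV expression for PtrB - PtrA; if the result is a constant
// expression, the byte distance between the two pointers is known statically.
static const SCEV *pointerDistance(ScalarEvolution &SE, Value *PtrA,
                                   Value *PtrB) {
  return SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
}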
llvm::APInt::isMinSignedValue
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:448
llvm::StoreInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:407
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:161
llvm::ArrayRef::begin
iterator begin() const
Definition: ArrayRef.h:151
llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:208
llvm::APInt::sextOrTrunc
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:956
llvm::TargetTransformInfo::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: TargetTransformInfo.cpp:976
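A small sketch of sizing a chain against the target's widest vector register; maxChainElements is an invented helper.
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;
// Upper bound on how many elements of width EltBits fit into the widest
// load/store vector register the target offers for this address space.
static unsigned maxChainElements(const TargetTransformInfo &TTI,
                                 unsigned AddrSpace, unsigned EltBits) {
  unsigned VecRegBits = TTI.getLoadStoreVecRegBitWidth(AddrSpace);
  return EltBits ? VecRegBits / EltBits : 0;
}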
Casting.h
llvm::post_order
iterator_range< po_iterator< T > > post_order(const T &G)
Definition: PostOrderIterator.h:188
Function.h
llvm::BitWidth
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:147
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:709
INITIALIZE_PASS_END
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, "Vectorize load and store instructions", false, false)
Definition: LoadStoreVectorizer.cpp:230
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::DominatorTreeAnalysis
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:252
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
Instructions.h
PostOrderIterator.h
llvm::PreservedAnalyses::preserveSet
void preserveSet()
Mark an analysis set as preserved.
Definition: PassManager.h:191
llvm::User::getNumOperands
unsigned getNumOperands() const
Definition: User.h:191
SmallVector.h
User.h
llvm::StoreInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:401
llvm::Instruction::hasNoSignedWrap
bool hasNoSignedWrap() const
Determine whether the no signed wrap flag is set.
Definition: Instruction.cpp:136
Dominators.h
llvm::Type::getPointerElementType
Type * getPointerElementType() const
Definition: Type.h:378
llvm::AAResultsWrapperPass
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Definition: AliasAnalysis.h:1269
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::GlobalsAAWrapperPass
Legacy wrapper pass to provide the GlobalsAAResult object.
Definition: GlobalsModRef.h:143
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:163
TargetTransformInfo.h
llvm::TargetTransformInfo::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: TargetTransformInfo.cpp:1012
llvm::TargetTransformInfo::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: TargetTransformInfo.cpp:994
Vectorize.h
DerivedTypes.h
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:44
llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition: Instructions.h:5239
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2313
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:171
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
raw_ostream.h
Value.h
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1272
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
Debug.h
llvm::ArrayRef::end
iterator end() const
Definition: ArrayRef.h:152
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:44
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38