//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores. Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited. So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures. It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
// "vector load" loads directly into a series of scalar registers. In effect,
// extracting the elements of the vector is free. It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code. This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
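//
// For illustration (a sketch, not drawn from an actual test case), four
// adjacent i32 loads such as
//
//   %l0 = load i32, i32* %p0
//   %l1 = load i32, i32* %p1   ; %p1 == %p0 + 4 bytes
//   %l2 = load i32, i32* %p2   ; %p2 == %p0 + 8 bytes
//   %l3 = load i32, i32* %p3   ; %p3 == %p0 + 12 bytes
//
// are rewritten into one vector load plus per-element extracts:
//
//   %vp = bitcast i32* %p0 to <4 x i32>*
//   %v  = load <4 x i32>, <4 x i32>* %vp
//   %l0 = extractelement <4 x i32> %v, i32 0
//   %l1 = extractelement <4 x i32> %v, i32 1
//   ...
//
// which on a GPU issues a single wide memory transaction.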

#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <tuple>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;

namespace {

/// ChainID is an arbitrary token that is allowed to be different only for the
/// accesses that are guaranteed to be considered non-consecutive by
/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
/// together and reducing the number of instructions the main search operates on
/// at a time, i.e. this is to reduce compile time and nothing else as the main
/// search has O(n^2) time complexity. The underlying type of ChainID should not
/// be relied upon.
using ChainID = const Value *;
using InstrList = SmallVector<Instruction *, 8>;
using InstrListMap = MapVector<ChainID, InstrList>;

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  AssumptionCache &AC;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  unsigned getPointerAddressSpace(Value *I);

  static const unsigned MaxDepth = 3;

  bool isConsecutiveAccess(Value *A, Value *B);
  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
                              unsigned Depth = 0) const;
  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
                                   unsigned Depth) const;
  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
                          unsigned Depth) const;

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions, the \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds loads/stores to consecutive memory addresses and vectorizes them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Checks whether a load/store access of \p SzInBytes at \p Alignment is
  /// misaligned. Sets \p RelativeSpeed, when the access is allowed, so that
  /// results for the same \p AddressSpace but potentially different \p
  /// Alignment and \p SzInBytes can be compared against one another.
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          Align Alignment, unsigned &RelativeSpeed);
};

class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}

bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  Vectorizer V(F, AA, AC, DT, SE, TTI);
  return V.run();
}

PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);

  Vectorizer V(F, AA, AC, DT, SE, TTI);
  bool Changed = V.run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}

// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
  propagateMetadata(I, VL);
}

// Vectorizer Implementation
bool Vectorizer::run() {
  bool Changed = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    Changed |= vectorizeChains(LoadRefs);
    Changed |= vectorizeChains(StoreRefs);
  }

  return Changed;
}

unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}

// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getLoadStorePointerOperand(A);
  Value *PtrB = getLoadStorePointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  Type *PtrATy = getLoadStoreType(A);
  Type *PtrBTy = getLoadStoreType(B);
  if (PtrA == PtrB ||
      PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  return areConsecutivePointers(PtrA, PtrB, Size);
}
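
// For example (a sketch): if A and B are i32 loads from %p and from a pointer
// that provably equals %p plus 4 bytes, the store size of i32 is 4 bytes, so
// isConsecutiveAccess(A, B) returns true and B can extend a chain begun by A.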

bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
                                        APInt PtrDelta, unsigned Depth) const {
  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
  APInt OffsetA(PtrBitWidth, 0);
  APInt OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());

  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return false;

  // In case we have to shrink the pointer,
  // stripAndAccumulateInBoundsConstantOffsets should properly handle a
  // possible overflow and the value should fit into the smallest data type
  // used in the cast/gep chain.
  assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
         OffsetB.getMinSignedBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. In that case, the offsets
  // are sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == PtrDelta;

  // Compute the necessary base pointer delta to have the necessary final delta
  // equal to the pointer delta requested.
  APInt BaseDelta = PtrDelta - OffsetDelta;

  // Compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  const SCEV *C = SE.getConstant(BaseDelta);
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
  if (X == PtrSCEVB)
    return true;

  // The above check will not catch the cases where one of the pointers is
  // factorized but the other one is not, such as (C + (S * (A + B))) vs
  // (AS + BS). Get the minus SCEV. That will allow re-combining the
  // expressions and getting the simplified difference.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
  if (C == Dist)
    return true;

  // Sometimes even this doesn't work, because SCEV can't always see through
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
  // things the hard way.
  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
}
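
// For example (a sketch): with PtrA = gep i8, i8* %base, i64 4 and
// PtrB = gep i8, i8* %base, i64 8 and PtrDelta = 4, stripping the constant
// offsets leaves both pointers equal to %base, and OffsetDelta = 8 - 4 = 4
// matches PtrDelta, so the two accesses are consecutive.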

static bool checkNoWrapFlags(Instruction *I, bool Signed) {
  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
  return (Signed && BinOpI->hasNoSignedWrap()) ||
         (!Signed && BinOpI->hasNoUnsignedWrap());
}

static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                   unsigned MatchingOpIdxA, Instruction *AddOpB,
                                   unsigned MatchingOpIdxB, bool Signed) {
  // If both AddOpA and AddOpB are adds with NSW/NUW and one of their operands
  // is the same, we can guarantee that the transformation is safe if we can
  // prove that adding IdxDiff to the other operand of AddOpA won't overflow.
  // For example:
  //  %tmp7 = add nsw i32 %tmp2, %v0
  //  %tmp8 = sext i32 %tmp7 to i64
  //  ...
  //  %tmp11 = add nsw i32 %v0, 1
  //  %tmp12 = add nsw i32 %tmp2, %tmp11
  //  %tmp13 = sext i32 %tmp12 to i64
  //
  // Both %tmp7 and %tmp12 have the nsw flag and share the first operand
  // %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow because
  // %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
  assert(AddOpA->getOpcode() == Instruction::Add &&
         AddOpB->getOpcode() == Instruction::Add &&
         checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
  if (AddOpA->getOperand(MatchingOpIdxA) ==
      AddOpB->getOperand(MatchingOpIdxB)) {
    Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
    Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
    Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
    Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
    // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
    if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrB->getOperand(0) == OtherOperandA &&
          IdxDiff.getSExtValue() == CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
    if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherOperandB &&
          IdxDiff.getSExtValue() == -CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and
    // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
    if (OtherInstrA && OtherInstrB &&
        OtherInstrA->getOpcode() == Instruction::Add &&
        OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstValA =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      int64_t CstValB =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
          IdxDiff.getSExtValue() == (CstValB - CstValA))
        return true;
    }
  }
  return false;
}

bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
                                             APInt PtrDelta,
                                             unsigned Depth) const {
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return false;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return false;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  if (PtrDelta.isNegative()) {
    if (PtrDelta.isMinSignedValue())
      return false;
    PtrDelta.negate();
    std::swap(OpA, OpB);
  }
  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
  if (PtrDelta.urem(Stride) != 0)
    return false;
  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
  APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;

  // First attempt: if OpB is an add with NSW/NUW, and OpB is ValA plus a
  // constant no smaller than IdxDiff, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
      checkNoWrapFlags(OpB, Signed))
    Safe = true;

  // Second attempt: check if we have eligible add NSW/NUW instruction
  // sequences.
  OpA = dyn_cast<Instruction>(ValA);
  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
      OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
      checkNoWrapFlags(OpB, Signed)) {
    // In the checks below a matching operand in OpA and OpB is
    // an operand which is the same in those two instructions.
    // Below we account for possible orders of the operands of
    // these add instructions.
    for (unsigned MatchingOpIdxA : {0, 1})
      for (unsigned MatchingOpIdxB : {0, 1})
        if (!Safe)
          Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
                                        MatchingOpIdxB, Signed);
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Third attempt:
  // If all set bits of IdxDiff or any higher order bit other than the sign bit
  // are known to be zero in ValA, we can add IdxDiff to it while guaranteeing
  // no overflow of any sort.
  if (!Safe) {
    KnownBits Known(BitWidth);
    computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff))
      return false;
  }

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
  return X == OffsetSCEVB;
}

bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
                                    const APInt &PtrDelta,
                                    unsigned Depth) const {
  if (Depth++ == MaxDepth)
    return false;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      return SelectA->getCondition() == SelectB->getCondition() &&
             areConsecutivePointers(SelectA->getTrueValue(),
                                    SelectB->getTrueValue(), PtrDelta, Depth) &&
             areConsecutivePointers(SelectA->getFalseValue(),
                                    SelectB->getFalseValue(), PtrDelta, Depth);
    }
  }
  return false;
}
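
// For example (a sketch): two pointers
//   %pA = select i1 %cond, i32* %x, i32* %y
//   %pB = select i1 %cond, i32* %x4, i32* %y4
// are PtrDelta apart if %x/%x4 and %y/%y4 are each PtrDelta apart, since the
// same condition picks matching arms on both sides.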

void Vectorizer::reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!IM->comesBefore(I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}

std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
  Instruction *C0 = Chain[0];
  BasicBlock::iterator FirstInstr = C0->getIterator();
  BasicBlock::iterator LastInstr = C0->getIterator();

  BasicBlock *BB = C0->getParent();
  unsigned NumFound = 0;
  for (Instruction &I : *BB) {
    if (!is_contained(Chain, &I))
      continue;

    ++NumFound;
    if (NumFound == 1) {
      FirstInstr = I.getIterator();
    }
    if (NumFound == Chain.size()) {
      LastInstr = I.getIterator();
      break;
    }
  }

  // Range is [first, last).
  return std::make_pair(FirstInstr, ++LastInstr);
}

void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
  SmallVector<Instruction *, 16> Instrs;
  for (Instruction *I : Chain) {
    Value *PtrOperand = getLoadStorePointerOperand(I);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Instrs.push_back(I);
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Instrs.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *I : Instrs)
    if (I->use_empty())
      I->eraseFromParent();
}

std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
                               unsigned ElementSizeBits) {
  unsigned ElementSizeBytes = ElementSizeBits / 8;
  unsigned SizeBytes = ElementSizeBytes * Chain.size();
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == Chain.size()) {
    if ((NumLeft & 1) == 0)
      NumLeft /= 2; // Split even in half
    else
      --NumLeft;    // Split off last element
  } else if (NumLeft == 0)
    NumLeft = 1;
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}
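
// Worked example (a sketch): a chain of 7 i16 accesses is 14 bytes.
// NumLeft = (14 - 14 % 4) / 2 = 6, so the split is a 12-byte front piece of
// 6 elements and a 2-byte tail of 1 element, both of which satisfy the
// 1-, 2-, or multiple-of-4-bytes rule.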

ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  LLVM_DEBUG({
    for (Instruction *I : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) {
      ChainInstrs.push_back(&I);
      continue;
    }
    if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
      LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: "
                        << I << '\n');
      break;
    }
    if (I.mayReadOrWriteMemory())
      MemoryInstrs.push_back(&I);
  }

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  Instruction *BarrierMemoryInstr = nullptr;

  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

    // If a barrier memory instruction was found, chain instructions that follow
    // will not be added to the valid prefix.
    if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
      break;

    // Check (in BB order) if any instruction prevents ChainInstr from being
    // vectorized. Find and store the first such "conflicting" instruction.
    for (Instruction *MemInstr : MemoryInstrs) {
      // If a barrier memory instruction was found, do not check past it.
      if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
        break;

      auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
      auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
      if (MemLoad && ChainLoad)
        continue;

      // We can ignore the alias if we have a load/store pair and the load
      // is known to be invariant. The load cannot be clobbered by the store.
      auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
        return LI->hasMetadata(LLVMContext::MD_invariant_load);
      };

      if (IsLoadChain) {
        // We can ignore the alias as long as the load comes before the store,
        // because that means we won't be moving the load past the store to
        // vectorize it (the vectorized load is inserted at the location of the
        // first load in the chain).
        if (ChainInstr->comesBefore(MemInstr) ||
            (ChainLoad && IsInvariantLoad(ChainLoad)))
          continue;
      } else {
        // Same case, but in reverse.
        if (MemInstr->comesBefore(ChainInstr) ||
            (MemLoad && IsInvariantLoad(MemLoad)))
          continue;
      }

      ModRefInfo MR =
          AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr));
      if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
        LLVM_DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    "  Aliasing instruction:\n"
                 << "  " << *MemInstr << '\n'
                 << "  Aliased instruction and pointer:\n"
                 << "  " << *ChainInstr << '\n'
                 << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
        });
        // Save this aliasing memory instruction as a barrier, but allow other
        // instructions that precede the barrier to be vectorized with this one.
        BarrierMemoryInstr = MemInstr;
        break;
      }
    }
    // Continue the search only for store chains, since vectorizing stores that
    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
    // up to an aliasing store, but should not pull loads from further down in
    // the basic block.
    if (IsLoadChain && BarrierMemoryInstr) {
      // The BarrierMemoryInstr is a store that precedes ChainInstr.
      assert(BarrierMemoryInstr->comesBefore(ChainInstr));
      break;
    }
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
      break;
  }
  return Chain.slice(0, ChainIdx);
}

static ChainID getChainID(const Value *Ptr) {
  const Value *ObjPtr = getUnderlyingObject(Ptr);
  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
    // The selects themselves are distinct instructions even if they share the
    // same condition and evaluate to consecutive pointers for the true and
    // false values of the condition. Therefore using the selects themselves
    // for grouping instructions would put consecutive accesses into different
    // lists, and they wouldn't even be checked for being consecutive, and so
    // wouldn't be vectorized.
    return Sel->getCondition();
  }
  return ObjPtr;
}
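
// For example (a sketch): two accesses whose pointers are selects over the
// same condition, such as select(%cond, %x, %y) and select(%cond, %x4, %y4),
// both map to the ChainID %cond and therefore end up in the same list, where
// isConsecutiveAccess can compare them.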

std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
  InstrListMap LoadRefs;
  InstrListMap StoreRefs;

  for (Instruction &I : *BB) {
    if (!I.mayReadOrWriteMemory())
      continue;

    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
      if (!LI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeLoad(LI))
        continue;

      Type *Ty = LI->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and do not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>)
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      Value *Ptr = LI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      // Save the load locations.
      const ChainID ID = getChainID(Ptr);
      LoadRefs[ID].push_back(LI);
    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeStore(SI))
        continue;

      Type *Ty = SI->getValueOperand()->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and do not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>)
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      Value *Ptr = SI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      // Save store location.
      const ChainID ID = getChainID(Ptr);
      StoreRefs[ID].push_back(SI);
    }
  }

  return {LoadRefs, StoreRefs};
}

bool Vectorizer::vectorizeChains(InstrListMap &Map) {
  bool Changed = false;

  for (const std::pair<ChainID, InstrList> &Chain : Map) {
    unsigned Size = Chain.second.size();
    if (Size < 2)
      continue;

    LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

    // Process the instructions in chunks of 64.
    for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
      unsigned Len = std::min<unsigned>(CE - CI, 64);
      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
      Changed |= vectorizeInstructions(Chunk);
    }
  }

  return Changed;
}

bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
                    << " instructions.\n");
  SmallVector<int, 16> Heads, Tails;
  int ConsecutiveChain[64];

  // Do a quadratic search on all of the given loads/stores and find all of the
  // pairs of loads/stores that follow each other.
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (i == j)
        continue;

      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
        if (ConsecutiveChain[i] != -1) {
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
          if (j < i || NewDistance > CurDistance)
            continue; // Should not insert.
        }

        Tails.push_back(j);
        Heads.push_back(i);
        ConsecutiveChain[i] = j;
      }
    }
  }
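
  // For example (a sketch): if Instrs[0..2] access addresses 0, 4, and 8 with
  // 4-byte elements, then ConsecutiveChain is {1, 2, -1}: Instrs[0] is
  // followed by Instrs[1], Instrs[1] by Instrs[2], and Instrs[2] follows
  // nothing. Heads = {0, 1} and Tails = {1, 2}, so only the chain starting at
  // Instrs[0] is followed below; the sub-chain headed by Instrs[1] is skipped.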

  bool Changed = false;
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;
    bool LongerChainExists = false;
    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
      if (Head == Tails[TIt] &&
          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
        LongerChainExists = true;
        break;
      }
    if (LongerChainExists)
      continue;

    // We found an instr that starts a chain. Now follow the chain and try to
    // vectorize it.
    SmallVector<Instruction *, 16> Operands;
    int I = Head;
    while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
      if (InstructionsProcessed.count(Instrs[I]))
        break;

      Operands.push_back(Instrs[I]);
      I = ConsecutiveChain[I];
    }

    bool Vectorized = false;
    if (isa<LoadInst>(*Operands.begin()))
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    else
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

    Changed |= Vectorized;
  }

  return Changed;
}

bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
  Type *StoreTy = nullptr;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }
  assert(StoreTy && "Failed to find store type");

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  Align Alignment = S0->getAlign();

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;

  FixedVectorType *VecTy;
  auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = FixedVectorType::get(StoreTy->getScalarType(),
                                 Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = FixedVectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    bool Vectorized = false;
    Vectorized |=
        vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed);
    Vectorized |=
        vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
    return Vectorized;
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Stores to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << "  " << *I << "\n";
  });

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the store is going to be misaligned, don't vectorize it.
  unsigned RelativeSpeed;
  if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {
    if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      unsigned SpeedBefore;
      accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);
      if (SpeedBefore > RelativeSpeed)
        return false;

      auto Chains = splitOddVectorElts(Chain, Sz);
      bool Vectorized = false;
      Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
      Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
      return Vectorized;
    }

    Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                Align(StackAdjustedAlignment),
                                                DL, S0, nullptr, &DT);
    if (NewAlign >= Alignment)
      Alignment = NewAlign;
    else
      return false;
  }

  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    bool Vectorized = false;
    Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
    Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
    return Vectorized;
  }

  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*Last);

  Value *Vec = PoisonValue::get(VecTy);

  if (VecStoreTy) {
    unsigned VecWidth = VecStoreTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
        unsigned NewIdx = J + I * VecWidth;
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
                                                      Builder.getInt32(J));
        if (Extract->getType() != StoreTy->getScalarType())
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

        Value *Insert =
            Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
        Vec = Insert;
      }
    }
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      Value *Extract = Store->getValueOperand();
      if (Extract->getType() != StoreTy->getScalarType())
        Extract =
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

      Value *Insert =
          Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
      Vec = Insert;
    }
  }

  StoreInst *SI = Builder.CreateAlignedStore(
      Vec,
      Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
      Alignment);
  propagateMetadata(SI, Chain);

  eraseInstructions(Chain);
  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::vectorizeLoadChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  LoadInst *L0 = cast<LoadInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole load.
  Type *LoadTy = nullptr;
  for (const auto &V : Chain) {
    LoadTy = cast<LoadInst>(V)->getType();
    if (LoadTy->isIntOrIntVectorTy())
      break;

    if (LoadTy->isPtrOrPtrVectorTy()) {
      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
                               DL.getTypeSizeInBits(LoadTy));
      break;
    }
  }
  assert(LoadTy && "Can't determine LoadInst type from chain");

  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
  unsigned AS = L0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  Align Alignment = L0->getAlign();

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;
  VectorType *VecTy;
  auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
  if (VecLoadTy)
    VecTy = FixedVectorType::get(LoadTy->getScalarType(),
                                 Chain.size() * VecLoadTy->getNumElements());
  else
    VecTy = FixedVectorType::get(LoadTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    bool Vectorized = false;
    Vectorized |=
        vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed);
    Vectorized |=
        vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
    return Vectorized;
  }

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the load is going to be misaligned, don't vectorize it.
  unsigned RelativeSpeed;
  if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {
    if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      unsigned SpeedBefore;
      accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);
      if (SpeedBefore > RelativeSpeed)
        return false;

      auto Chains = splitOddVectorElts(Chain, Sz);
      bool Vectorized = false;
      Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
      Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
      return Vectorized;
    }

    Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
                                                Align(StackAdjustedAlignment),
                                                DL, L0, nullptr, &DT);
    if (NewAlign >= Alignment)
      Alignment = NewAlign;
    else
      return false;
  }

  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    bool Vectorized = false;
    Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
    Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
    return Vectorized;
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Loads to vectorize:\n";
    for (Instruction *I : Chain)
      I->dump();
  });

  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
  // Last may have changed since then, but the value of First won't have. If it
  // matters, we could compute getBoundaryInstrs only once and reuse it here.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*First);

  Value *Bitcast =
      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
  LoadInst *LI =
      Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
  propagateMetadata(LI, Chain);

  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
    Value *CV = Chain[I];
    Value *V;
    if (VecLoadTy) {
      // Extract a subvector using shufflevector.
      unsigned VecWidth = VecLoadTy->getNumElements();
      auto Mask =
          llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));
      V = Builder.CreateShuffleVector(LI, Mask, CV->getName());
    } else {
      V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
    }

    if (V->getType() != CV->getType()) {
      V = Builder.CreateBitOrPointerCast(V, CV->getType());
    }

    // Replace the old instruction.
    CV->replaceAllUsesWith(V);
  }

  // Since we might have opaque pointers we might end up using the pointer
  // operand of the first load (wrt. memory loaded) for the vector load. Since
  // this first load might not be the first in the block we potentially need to
  // reorder the pointer operand (and its operands). If we have a bitcast though
  // it might be before the load and should be the reorder start instruction.
  // "Might" because for opaque pointers the "bitcast" is just the first load's
  // pointer operand, as opposed to something we inserted at the right position
  // ourselves.
  Instruction *BCInst = dyn_cast<Instruction>(Bitcast);
  reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI);

  eraseInstructions(Chain);

  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                    Align Alignment, unsigned &RelativeSpeed) {
  RelativeSpeed = 0;
  if (Alignment.value() % SzInBytes == 0)
    return false;

  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
                                                   SzInBytes * 8, AddressSpace,
                                                   Alignment, &RelativeSpeed);
  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
                    << " with relative speed = " << RelativeSpeed << '\n';);
  return !Allows || !RelativeSpeed;
}
llvm::Check::Size
@ Size
Definition: FileCheck.h:77
i
i
Definition: README.txt:29
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
llvm::ScalarEvolution::getContext
LLVMContext & getContext() const
Definition: ScalarEvolution.h:491
AssumptionCache.h
llvm::AAManager
A manager for alias analyses.
Definition: AliasAnalysis.h:881
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:2592
Signed
@ Signed
Definition: NVPTXISelLowering.cpp:4714
llvm::ScalarEvolutionAnalysis
Analysis pass that exposes the ScalarEvolution for a function.
Definition: ScalarEvolution.h:2164
MathExtras.h
llvm::MemoryLocation::get
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
Definition: MemoryLocation.cpp:36
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
llvm::initializeLoadStoreVectorizerLegacyPassPass
void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &)
llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition: iterator_range.h:53
llvm::AArch64CC::NE
@ NE
Definition: AArch64BaseInfo.h:256
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::AArch64PACKey::ID
ID
Definition: AArch64BaseInfo.h:818
llvm::generic_gep_type_iterator
Definition: GetElementPtrTypeIterator.h:31
Insert
Vector Rotate Left Mask Mask Insert
Definition: README_P9.txt:112
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
PHI
Rewrite undef for PHI
Definition: AMDGPURewriteUndefForPHI.cpp:101
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
checkNoWrapFlags
static bool checkNoWrapFlags(Instruction *I, bool Signed)
Definition: LoadStoreVectorizer.cpp:389
llvm::Function
Definition: Function.h:60
llvm::PseudoProbeReservedId::Last
@ Last
Pass.h
GetElementPtrTypeIterator.h
store
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM ID Predecessors according to mbb< bb27, 0x8b0a7c0 > Note ADDri is not a two address instruction its result reg1037 is an operand of the PHI node in bb76 and its operand reg1039 is the result of the PHI node We should treat it as a two address code and make sure the ADDri is scheduled after any node that reads reg1039 Use info(i.e. register scavenger) to assign it a free register to allow reuse the collector could move the objects and invalidate the derived pointer This is bad enough in the first but safe points can crop up unpredictably **array_addr i32 n y store obj obj **nth_el If the i64 division is lowered to a then a safe point array and nth_el no longer point into the correct object The fix for this is to copy address calculations so that dependent pointers are never live across safe point boundaries But the loads cannot be copied like this if there was an intervening store
Definition: README.txt:133
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:328
llvm::SmallVector< Instruction *, 8 >
Statistic.h
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1499
llvm::LegacyLegalizeActions::Bitcast
@ Bitcast
Perform the operation on a different, but equivalently sized type.
Definition: LegacyLegalizerInfo.h:54
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:173
llvm::IRBuilder<>
MapVector.h
llvm::isModSet
bool isModSet(const ModRefInfo MRI)
Definition: ModRef.h:48
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
ValueTracking.h
Local.h
llvm::Instruction::insertBefore
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:87
llvm::LoadStoreVectorizerPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition: LoadStoreVectorizer.cpp:257
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
llvm::SPIRV::InstrList
SmallVector< MachineInstr * > InstrList
Definition: SPIRVModuleAnalysis.h:115
APInt.h
llvm::getLoadStoreType
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Definition: Instructions.h:5403
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:36
ScalarEvolution.h
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, "Vectorize load and Store instructions", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1432
Module.h
llvm::getOrEnforceKnownAlignment
Align getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to ensure that the alignment of V is at least PrefAlign bytes.
Definition: Local.cpp:1434
llvm::Instruction::comesBefore
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
Definition: Instruction.cpp:121
llvm::isModOrRefSet
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
llvm::isConsecutiveAccess
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
Definition: LoopAccessAnalysis.cpp:1591
llvm::MapVector
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
STLExtras.h
llvm::propagateMetadata
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
Definition: VectorUtils.cpp:877
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:261
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:458
llvm::gep_type_begin
gep_type_iterator gep_type_begin(const User *GEP)
Definition: GetElementPtrTypeIterator.h:123
and
We currently generate a but we really shouldn eax ecx xorl edx divl ecx eax divl ecx movl eax ret A similar code sequence works for division We currently compile i32 v2 eax eax jo LBB1_2 and
Definition: README.txt:1271
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:217
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:159
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
KnownBits.h
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
AliasAnalysis.h
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
checkIfSafeAddSequence
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, unsigned MatchingOpIdxA, Instruction *AddOpB, unsigned MatchingOpIdxB, bool Signed)
Definition: LoadStoreVectorizer.cpp:395
Instruction.h
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:169
llvm::APInt::isNegative
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:313
llvm::TargetTransformInfo::isLegalToVectorizeLoad
bool isLegalToVectorizeLoad(LoadInst *LI) const
Definition: TargetTransformInfo.cpp:1101
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at application startup.
Definition: PassRegistry.cpp:24
Constants.h
llvm::AAResults
Definition: AliasAnalysis.h:294
InstrTypes.h
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::generic_gep_type_iterator::getOperand
Value * getOperand() const
Definition: GetElementPtrTypeIterator.h:76
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:246
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:188
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:306
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value, returning the original object being addressed.
Definition: ValueTracking.cpp:4506
llvm::ScalarEvolutionWrapperPass
Definition: ScalarEvolution.h:2194
llvm::createLoadStoreVectorizerPass
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
SmallPtrSet.h
llvm::Instruction::removeFromParent
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:77
llvm::APInt::sle
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1137
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:683
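For example, a minimal sketch of building the <4 x i32> type that a four-element i32 chain would vectorize to:

  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  // Build the <4 x i32> vector type in the given context.
  static FixedVectorType *getI32x4Ty(LLVMContext &Ctx) {
    return FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  }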
llvm::StoreInst::getAlign
Align getAlign() const
Definition: Instructions.h:342
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::ArrayRef::slice
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:194
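A hedged sketch of the usual pattern (the splitChain name is hypothetical): halving an over-wide chain with two slice calls:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/Instruction.h"
  #include <utility>
  using namespace llvm;

  // Hypothetical: split a chain in half when its combined width is too large.
  static std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitChain(ArrayRef<Instruction *> Chain) {
    size_t Half = Chain.size() / 2;
    return {Chain.slice(0, Half), Chain.slice(Half, Chain.size() - Half)};
  }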
Type.h
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4441
getChainID
static ChainID getChainID(const Value *Ptr)
Definition: LoadStoreVectorizer.cpp:799
llvm::VectorType::isValidElementType
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:674
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
VectorUtils.h
BasicBlock.h
LoadStoreVectorizer.h
llvm::LoadInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:267
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:75
llvm::instructions
inst_range instructions(Function *F)
Definition: InstIterator.h:133
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:298
llvm::Instruction::hasNoSignedWrap
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
Definition: Instruction.cpp:176
llvm::SCEVAAWrapperPass
Legacy wrapper pass to provide the SCEVAAResult object.
Definition: ScalarEvolutionAliasAnalysis.h:56
uint64_t
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2648
llvm::Instruction::hasNoUnsignedWrap
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
Definition: Instruction.cpp:172
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition: AssumptionCache.h:173
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
StackAdjustedAlignment
static const unsigned StackAdjustedAlignment
Definition: LoadStoreVectorizer.cpp:96
MemoryLocation.h
llvm::APInt::negate
void negate()
Negate this APInt in place.
Definition: APInt.h:1414
llvm::AAResults::getModRefInfo
ModRefInfo getModRefInfo(const Instruction *I, const Optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Definition: AliasAnalysis.h:488
llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:929
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1868
ArrayRef.h
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
Definition: ValueTracking.cpp:196
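As an illustrative sketch (the helper name is hypothetical, and V is assumed to be an address-space-0 pointer), known-bits can prove an address's low bits are zero, i.e. a minimum alignment:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  // Hypothetical: true if V is provably 16-byte aligned, i.e. its low four
  // bits are known to be zero.
  static bool provablyAligned16(const Value *V, const DataLayout &DL) {
    KnownBits Known(DL.getPointerSizeInBits());
    computeKnownBits(V, Known, DL);
    return Known.countMinTrailingZeros() >= 4;
  }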
llvm::ModRefInfo
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
llvm::TargetTransformInfo::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: TargetTransformInfo.cpp:1109
llvm::generic_gep_type_iterator::getIndexedType
Type * getIndexedType() const
Definition: GetElementPtrTypeIterator.h:70
iterator_range.h
llvm::TTI
TargetTransformInfo TTI
Definition: TargetTransformInfo.h:168
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
llvm::APInt::urem
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1665
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:76
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: APInt.h:33
llvm::BinaryOperator
Definition: InstrTypes.h:188
DataLayout.h
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass iff it does not: (1) add or remove basic blocks from the function, or (2) modify terminator instructions in any way.
Definition: Pass.cpp:265
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::ScalarEvolution::getConstant
const SCEV * getConstant(ConstantInt *V)
Definition: ScalarEvolution.cpp:471
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::CFGAnalyses
Represents analyses that only rely on functions' control flow.
Definition: PassManager.h:113
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
llvm::Type::isPtrOrPtrVectorTy
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:243
llvm::APInt::ult
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1082
llvm::TargetTransformInfo::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: TargetTransformInfo.cpp:1130
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:308
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
llvm::ArrayRef::front
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:167
llvm::APInt::udiv
APInt udiv(const APInt &RHS) const
Unsigned division operation.
Definition: APInt.cpp:1572
llvm::APInt::clearBit
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1378
llvm::TargetTransformInfo::isLegalToVectorizeStore
bool isLegalToVectorizeStore(StoreInst *SI) const
Definition: TargetTransformInfo.cpp:1105
llvm::Value::stripAndAccumulateInBoundsConstantOffsets
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the AllowNonInbounds flag set to false, i.e. only in-bounds constant offsets are accumulated.
Definition: Value.h:724
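A minimal sketch, assuming both pointers are in-bounds GEP chains in the same address space (the helper name is hypothetical): reduce each pointer to a (base, constant byte offset) pair and compare:

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;

  // Hypothetical: if A and B share a base object, set Dist to the byte
  // distance offset(B) - offset(A) and return true.
  static bool constantByteDistance(const Value *A, const Value *B,
                                   const DataLayout &DL, APInt &Dist) {
    unsigned BW = DL.getIndexTypeSizeInBits(A->getType());
    APInt OffA(BW, 0), OffB(BW, 0);
    const Value *BaseA = A->stripAndAccumulateInBoundsConstantOffsets(DL, OffA);
    const Value *BaseB = B->stripAndAccumulateInBoundsConstantOffsets(DL, OffB);
    if (BaseA != BaseB)
      return false; // different underlying objects; distance unknown here
    Dist = OffB - OffA;
    return true;
  }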
llvm::APInt::zext
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:974
DEBUG_TYPE
#define DEBUG_TYPE
Definition: LoadStoreVectorizer.cpp:90
Attributes.h
llvm::APInt::trunc
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:899
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::getIntNTy
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:242
llvm::ScalarEvolution::getMinusSCEV
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
Definition: ScalarEvolution.cpp:4549
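A hedged sketch of the classic distance query (the wrapper is hypothetical): ask SCEV whether PtrB - PtrA folds to a known constant size in bytes:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  using namespace llvm;

  // Hypothetical: true iff SCEV can fold (PtrB - PtrA) to the constant Size.
  static bool scevDistanceEquals(ScalarEvolution &SE, Value *PtrA, Value *PtrB,
                                 uint64_t Size) {
    const SCEV *Dist = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
    if (const auto *C = dyn_cast<SCEVConstant>(Dist))
      return C->getAPInt() == Size;
    return false;
  }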
llvm::APInt::isMinSignedValue
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:409
llvm::StoreInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:396
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
llvm::ArrayRef::begin
iterator begin() const
Definition: ArrayRef.h:152
llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:216
llvm::APInt::sextOrTrunc
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1003
llvm::isGuaranteedToTransferExecutionToSuccessor
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successor blocks.
Definition: ValueTracking.cpp:5495
llvm::TargetTransformInfo::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: TargetTransformInfo.cpp:1097
Casting.h
llvm::post_order
iterator_range< po_iterator< T > > post_order(const T &G)
Definition: PostOrderIterator.h:189
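For illustration, the standard traversal idiom this helper enables; the loop body is left as a placeholder:

  #include "llvm/ADT/PostOrderIterator.h"
  #include "llvm/IR/CFG.h"
  #include "llvm/IR/Function.h"
  using namespace llvm;

  // Visit every basic block of F in post order.
  static void forEachBlockPostOrder(Function &F) {
    for (BasicBlock *BB : post_order(&F)) {
      (void)BB; // process the block's load/store chains here
    }
  }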
Function.h
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:773
INITIALIZE_PASS_END
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, "Vectorize load and store instructions", false, false)
Definition: LoadStoreVectorizer.cpp:232
llvm::DominatorTreeAnalysis
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:271
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
Instructions.h
PostOrderIterator.h
llvm::PreservedAnalyses::preserveSet
void preserveSet()
Mark an analysis set as preserved.
Definition: PassManager.h:188
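As a sketch of the common epilogue for a CFG-preserving transform (the wrapper is hypothetical):

  #include "llvm/IR/PassManager.h"
  using namespace llvm;

  // Hypothetical: report analysis preservation after a run that may have
  // rewritten instructions but never touched the CFG.
  static PreservedAnalyses reportResult(bool Changed) {
    if (!Changed)
      return PreservedAnalyses::all();
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }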
llvm::User::getNumOperands
unsigned getNumOperands() const
Definition: User.h:191
SmallVector.h
llvm::StoreInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:390
Dominators.h
llvm::AAResultsWrapperPass
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Definition: AliasAnalysis.h:929
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
llvm::SmallVectorImpl::pop_back_val
T pop_back_val()
Definition: SmallVector.h:677
llvm::GlobalsAAWrapperPass
Legacy wrapper pass to provide the GlobalsAAResult object.
Definition: GlobalsModRef.h:142
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:164
TargetTransformInfo.h
llvm::TargetTransformInfo::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: TargetTransformInfo.cpp:1137
llvm::TargetTransformInfo::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: TargetTransformInfo.cpp:1115
Vectorize.h
DerivedTypes.h
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:42
llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition: Instructions.h:5358
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2492
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
llvm::TargetTransformInfo::allowsMisalignedMemoryAccesses
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace=0, Align Alignment=Align(1), unsigned *Fast=nullptr) const
Determine if the target supports unaligned memory accesses.
Definition: TargetTransformInfo.cpp:567
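A small sketch of the usual query (names hypothetical): is a misaligned 128-bit access in a given address space both legal and fast on the target?

  #include "llvm/Analysis/TargetTransformInfo.h"
  using namespace llvm;

  // Hypothetical: check a 128-bit access at Alignment in address space AS.
  static bool fastMisaligned128(const TargetTransformInfo &TTI,
                                LLVMContext &Ctx, unsigned AS, Align Alignment) {
    unsigned Fast = 0;
    return TTI.allowsMisalignedMemoryAccesses(Ctx, /*BitWidth=*/128, AS,
                                              Alignment, &Fast) &&
           Fast != 0;
  }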
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
raw_ostream.h
Value.h
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1297
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
Debug.h
llvm::ArrayRef::end
iterator end() const
Definition: ArrayRef.h:153
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
llvm::PoisonValue::get
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1732
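Finally, a minimal sketch of the cleanup pattern this enables (the helper is hypothetical): detach an instruction's remaining uses before erasing it:

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Hypothetical: replace all uses of I with poison of the same type, then
  // erase I from its basic block.
  static void dropWithPoison(Instruction *I) {
    I->replaceAllUsesWith(PoisonValue::get(I->getType()));
    I->eraseFromParent();
  }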