LoadStoreVectorizer.cpp
1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass merges loads/stores to/from sequential memory addresses into vector
10 // loads/stores. Although there's nothing GPU-specific in here, this pass is
11 // motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12 //
13 // (For simplicity below we talk about loads only, but everything also applies
14 // to stores.)
15 //
16 // This pass is intended to be run late in the pipeline, after other
17 // vectorization opportunities have been exploited. So the assumption here is
18 // that immediately following our new vector load we'll need to extract out the
19 // individual elements of the load, so we can operate on them individually.
20 //
21 // On CPUs this transformation is usually not beneficial, because extracting the
22 // elements of a vector register is expensive on most architectures. It's
23 // usually better just to load each element individually into its own scalar
24 // register.
25 //
26 // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27 // "vector load" loads directly into a series of scalar registers. In effect,
28 // extracting the elements of the vector is free. It's therefore always
29 // beneficial to vectorize a sequence of loads on these architectures.
30 //
31 // Vectorizing (perhaps a better name might be "coalescing") loads can have
32 // large performance impacts on GPU kernels, and opportunities for vectorizing
33 // are common in GPU code. This pass tries very hard to find such
34 // opportunities; its runtime is quadratic in the number of loads in a BB.
35 //
36 // Some CPU architectures, such as ARM, have instructions that load into
37 // multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38 // could use this pass (with some modifications), but currently it implements
39 // its own pass to do something similar to what we do here.
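//
// For illustration only (this example is not part of the upstream file
// comment; the IR names are invented): a run of adjacent scalar loads such as
//
//   %l0 = load i32, i32* %p, align 16
//   %p1 = getelementptr i32, i32* %p, i64 1
//   %l1 = load i32, i32* %p1, align 4
//   ...
//
// is rewritten into one wide load plus cheap per-element extracts, roughly:
//
//   %vp  = bitcast i32* %p to <4 x i32>*
//   %vec = load <4 x i32>, <4 x i32>* %vp, align 16
//   %l0  = extractelement <4 x i32> %vec, i32 0
//   %l1  = extractelement <4 x i32> %vec, i32 1
//   ...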
40 
41 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
42 #include "llvm/ADT/APInt.h"
43 #include "llvm/ADT/ArrayRef.h"
44 #include "llvm/ADT/MapVector.h"
45 #include "llvm/ADT/PostOrderIterator.h"
46 #include "llvm/ADT/STLExtras.h"
47 #include "llvm/ADT/SmallPtrSet.h"
48 #include "llvm/ADT/SmallVector.h"
49 #include "llvm/ADT/Statistic.h"
50 #include "llvm/ADT/iterator_range.h"
51 #include "llvm/Analysis/AliasAnalysis.h"
52 #include "llvm/Analysis/AssumptionCache.h"
53 #include "llvm/Analysis/MemoryLocation.h"
54 #include "llvm/Analysis/ScalarEvolution.h"
55 #include "llvm/Analysis/TargetTransformInfo.h"
56 #include "llvm/Analysis/ValueTracking.h"
57 #include "llvm/Analysis/VectorUtils.h"
58 #include "llvm/IR/Attributes.h"
59 #include "llvm/IR/BasicBlock.h"
60 #include "llvm/IR/Constants.h"
61 #include "llvm/IR/DataLayout.h"
62 #include "llvm/IR/DerivedTypes.h"
63 #include "llvm/IR/Dominators.h"
64 #include "llvm/IR/Function.h"
65 #include "llvm/IR/GetElementPtrTypeIterator.h"
66 #include "llvm/IR/IRBuilder.h"
67 #include "llvm/IR/InstrTypes.h"
68 #include "llvm/IR/Instruction.h"
69 #include "llvm/IR/Instructions.h"
70 #include "llvm/IR/Module.h"
71 #include "llvm/IR/Type.h"
72 #include "llvm/IR/Value.h"
73 #include "llvm/InitializePasses.h"
74 #include "llvm/Pass.h"
75 #include "llvm/Support/Casting.h"
76 #include "llvm/Support/Debug.h"
77 #include "llvm/Support/KnownBits.h"
78 #include "llvm/Support/MathExtras.h"
79 #include "llvm/Transforms/Utils/Local.h"
80 #include "llvm/Transforms/Vectorize.h"
82 #include <algorithm>
83 #include <cassert>
84 #include <cstdlib>
85 #include <tuple>
86 #include <utility>
87 
88 using namespace llvm;
89 
90 #define DEBUG_TYPE "load-store-vectorizer"
91 
92 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
93 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
94 
95 // FIXME: Assuming stack alignment of 4 is always good enough
96 static const unsigned StackAdjustedAlignment = 4;
97 
98 namespace {
99 
100 /// ChainID is an arbitrary token that is allowed to be different only for the
101 /// accesses that are guaranteed to be considered non-consecutive by
102 /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
103 /// together and reducing the number of instructions the main search operates on
104 /// at a time, i.e. this is to reduce compile time and nothing else as the main
105 /// search has O(n^2) time complexity. The underlying type of ChainID should not
106 /// be relied upon.
107 using ChainID = const Value *;
108 using InstrList = SmallVector<Instruction *, 8>;
109 using InstrListMap = MapVector<ChainID, InstrList>;
110 
111 class Vectorizer {
112  Function &F;
113  AliasAnalysis &AA;
114  AssumptionCache &AC;
115  DominatorTree &DT;
116  ScalarEvolution &SE;
117  TargetTransformInfo &TTI;
118  const DataLayout &DL;
119  IRBuilder<> Builder;
120 
121 public:
122  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
123  DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
124  : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
125  DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
126 
127  bool run();
128 
129 private:
130  unsigned getPointerAddressSpace(Value *I);
131 
132  static const unsigned MaxDepth = 3;
133 
134  bool isConsecutiveAccess(Value *A, Value *B);
135  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
136  unsigned Depth = 0) const;
137  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
138  unsigned Depth) const;
139  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
140  unsigned Depth) const;
141 
142  /// After vectorization, reorder the instructions that I depends on
143  /// (the instructions defining its operands), to ensure they dominate I.
144  void reorder(Instruction *I);
145 
146  /// Returns the first and the last instructions in Chain.
147  std::pair<BasicBlock::iterator, BasicBlock::iterator>
148  getBoundaryInstrs(ArrayRef<Instruction *> Chain);
149 
150  /// Erases the original instructions after vectorizing.
151  void eraseInstructions(ArrayRef<Instruction *> Chain);
152 
153  /// "Legalize" the vector type that would be produced by combining \p
154  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
155  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
156  /// expected to have more than 4 elements.
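 ///
 /// For example (an illustrative case, not from the upstream comment): a
 /// chain of five i32 elements is 20 bytes, so it is split into a
 /// four-element (16-byte) prefix and a one-element (4-byte) remainder.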
157  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
158  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
159 
160  /// Finds the largest prefix of Chain that's vectorizable, checking for
161  /// intervening instructions which may affect the memory accessed by the
162  /// instructions within Chain.
163  ///
164  /// The elements of \p Chain must be all loads or all stores and must be in
165  /// address order.
166  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
167 
168  /// Collects load and store instructions to vectorize.
169  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
170 
171  /// Processes the collected instructions, the \p Map. The values of \p Map
172  /// should be all loads or all stores.
173  bool vectorizeChains(InstrListMap &Map);
174 
175  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
176  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
177 
178  /// Vectorizes the load instructions in Chain.
179  bool
180  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
181  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
182 
183  /// Vectorizes the store instructions in Chain.
184  bool
185  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
186  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
187 
188  /// Check if this load/store access is misaligned.
189  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
190  Align Alignment);
191 };
192 
193 class LoadStoreVectorizerLegacyPass : public FunctionPass {
194 public:
195  static char ID;
196 
197  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
198  initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
199  }
200 
201  bool runOnFunction(Function &F) override;
202 
203  StringRef getPassName() const override {
204  return "GPU Load and Store Vectorizer";
205  }
206 
207  void getAnalysisUsage(AnalysisUsage &AU) const override {
208  AU.addRequired<AAResultsWrapperPass>();
209  AU.addRequired<AssumptionCacheTracker>();
210  AU.addRequired<ScalarEvolutionWrapperPass>();
211  AU.addRequired<DominatorTreeWrapperPass>();
212  AU.addRequired<TargetTransformInfoWrapperPass>();
213  AU.setPreservesCFG();
214  }
215 };
216 
217 } // end anonymous namespace
218 
219 char LoadStoreVectorizerLegacyPass::ID = 0;
220 
221 INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
222  "Vectorize load and Store instructions", false, false)
229 INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
230  "Vectorize load and store instructions", false, false)
231 
232 Pass *llvm::createLoadStoreVectorizerPass() {
233  return new LoadStoreVectorizerLegacyPass();
234 }
235 
236 bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
237  // Don't vectorize when the attribute NoImplicitFloat is used.
238  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
239  return false;
240 
241  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
242  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
243  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
244  TargetTransformInfo &TTI =
245  getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
246 
247  AssumptionCache &AC =
248  getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
249 
250  Vectorizer V(F, AA, AC, DT, SE, TTI);
251  return V.run();
252 }
253 
254 PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
255  // Don't vectorize when the attribute NoImplicitFloat is used.
256  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
257  return PreservedAnalyses::all();
258 
259  AliasAnalysis &AA = AM.getResult<AAManager>(F);
260  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
261  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
262  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
263  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
264 
265  Vectorizer V(F, AA, AC, DT, SE, TTI);
266  bool Changed = V.run();
267  PreservedAnalyses PA;
268  PA.preserveSet<CFGAnalyses>();
269  return Changed ? PA : PreservedAnalyses::all();
270 }
271 
272 // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
273 // vectors of Instructions.
274 static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
275  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
276  propagateMetadata(I, VL);
277 }
278 
279 // Vectorizer Implementation
280 bool Vectorizer::run() {
281  bool Changed = false;
282 
283  // Scan the blocks in the function in post order.
284  for (BasicBlock *BB : post_order(&F)) {
285  InstrListMap LoadRefs, StoreRefs;
286  std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
287  Changed |= vectorizeChains(LoadRefs);
288  Changed |= vectorizeChains(StoreRefs);
289  }
290 
291  return Changed;
292 }
293 
294 unsigned Vectorizer::getPointerAddressSpace(Value *I) {
295  if (LoadInst *L = dyn_cast<LoadInst>(I))
296  return L->getPointerAddressSpace();
297  if (StoreInst *S = dyn_cast<StoreInst>(I))
298  return S->getPointerAddressSpace();
299  return -1;
300 }
301 
302 // FIXME: Merge with llvm::isConsecutiveAccess
303 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
304  Value *PtrA = getLoadStorePointerOperand(A);
305  Value *PtrB = getLoadStorePointerOperand(B);
306  unsigned ASA = getPointerAddressSpace(A);
307  unsigned ASB = getPointerAddressSpace(B);
308 
309  // Check that the address spaces match and that the pointers are valid.
310  if (!PtrA || !PtrB || (ASA != ASB))
311  return false;
312 
313  // Make sure that A and B are different pointers of the same size type.
314  Type *PtrATy = getLoadStoreType(A);
315  Type *PtrBTy = getLoadStoreType(B);
316  if (PtrA == PtrB ||
317  PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
318  DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
319  DL.getTypeStoreSize(PtrATy->getScalarType()) !=
320  DL.getTypeStoreSize(PtrBTy->getScalarType()))
321  return false;
322 
323  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
324  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
325 
326  return areConsecutivePointers(PtrA, PtrB, Size);
327 }
328 
329 bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
330  APInt PtrDelta, unsigned Depth) const {
331  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
332  APInt OffsetA(PtrBitWidth, 0);
333  APInt OffsetB(PtrBitWidth, 0);
334  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
335  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
336 
337  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
338 
339  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
340  return false;
341 
342  // In case we have to shrink the pointer,
343  // stripAndAccumulateInBoundsConstantOffsets should properly handle a
344  // possible overflow and the value should fit into the smallest data type
345  // used in the cast/gep chain.
346  assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
347  OffsetB.getMinSignedBits() <= NewPtrBitWidth);
348 
349  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
350  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
351  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
352 
353  APInt OffsetDelta = OffsetB - OffsetA;
354 
355  // Check if they are based on the same pointer. That makes the offsets
356  // sufficient.
357  if (PtrA == PtrB)
358  return OffsetDelta == PtrDelta;
359 
360  // Compute the necessary base pointer delta to have the necessary final delta
361  // equal to the pointer delta requested.
362  APInt BaseDelta = PtrDelta - OffsetDelta;
363 
364  // Compute the distance with SCEV between the base pointers.
365  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
366  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
367  const SCEV *C = SE.getConstant(BaseDelta);
368  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
369  if (X == PtrSCEVB)
370  return true;
371 
372  // The above check will not catch the cases where one of the pointers is
373  // factorized but the other one is not, such as (C + (S * (A + B))) vs
374  // (A*S + B*S). Get the minus SCEV. That will allow re-combining the expressions
375  // and getting the simplified difference.
376  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
377  if (C == Dist)
378  return true;
379 
380  // Sometimes even this doesn't work, because SCEV can't always see through
381  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
382  // things the hard way.
383  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
384 }
385 
386 static bool checkNoWrapFlags(Instruction *I, bool Signed) {
387  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
388  return (Signed && BinOpI->hasNoSignedWrap()) ||
389  (!Signed && BinOpI->hasNoUnsignedWrap());
390 }
391 
392 static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
393  unsigned MatchingOpIdxA, Instruction *AddOpB,
394  unsigned MatchingOpIdxB, bool Signed) {
395  // If both OpA and OpB are adds with NSW/NUW and with
396  // one of the operands being the same, we can guarantee that the
397  // transformation is safe if we can prove that OpA won't overflow when
398  // IdxDiff is added to the other operand of OpA.
399  // For example:
400  // %tmp7 = add nsw i32 %tmp2, %v0
401  // %tmp8 = sext i32 %tmp7 to i64
402  // ...
403  // %tmp11 = add nsw i32 %v0, 1
404  // %tmp12 = add nsw i32 %tmp2, %tmp11
405  // %tmp13 = sext i32 %tmp12 to i64
406  //
407  // Both %tmp7 and %tmp12 have the nsw flag and share the first operand
408  // %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
409  // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 have the
410  // nsw flag.
411  assert(AddOpA->getOpcode() == Instruction::Add &&
412  AddOpB->getOpcode() == Instruction::Add &&
413  checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
414  if (AddOpA->getOperand(MatchingOpIdxA) ==
415  AddOpB->getOperand(MatchingOpIdxB)) {
416  Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
417  Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
418  Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
419  Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
420  // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
421  if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
422  checkNoWrapFlags(OtherInstrB, Signed) &&
423  isa<ConstantInt>(OtherInstrB->getOperand(1))) {
424  int64_t CstVal =
425  cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
426  if (OtherInstrB->getOperand(0) == OtherOperandA &&
427  IdxDiff.getSExtValue() == CstVal)
428  return true;
429  }
430  // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` (OpA) and `x +nsw/nuw y` (OpB).
431  if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
432  checkNoWrapFlags(OtherInstrA, Signed) &&
433  isa<ConstantInt>(OtherInstrA->getOperand(1))) {
434  int64_t CstVal =
435  cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
436  if (OtherInstrA->getOperand(0) == OtherOperandB &&
437  IdxDiff.getSExtValue() == -CstVal)
438  return true;
439  }
440  // Match `x +nsw/nuw (y +nsw/nuw c)` and
441  // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
442  if (OtherInstrA && OtherInstrB &&
443  OtherInstrA->getOpcode() == Instruction::Add &&
444  OtherInstrB->getOpcode() == Instruction::Add &&
445  checkNoWrapFlags(OtherInstrA, Signed) &&
446  checkNoWrapFlags(OtherInstrB, Signed) &&
447  isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
448  isa<ConstantInt>(OtherInstrB->getOperand(1))) {
449  int64_t CstValA =
450  cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
451  int64_t CstValB =
452  cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
453  if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
454  IdxDiff.getSExtValue() == (CstValB - CstValA))
455  return true;
456  }
457  }
458  return false;
459 }
460 
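// An illustrative case this function handles (hypothetical IR, names invented
// for this sketch):
//   %xA = sext i32 %i to i64
//   %gA = getelementptr i32, i32* %base, i64 %xA
//   %j  = add nsw i32 %i, 1
//   %xB = sext i32 %j to i64
//   %gB = getelementptr i32, i32* %base, i64 %xB
// With PtrDelta == 4, the two GEPs differ only in their last index, the
// stride is 4 bytes, so IdxDiff == 1; the `add nsw` proves %j == %i + 1
// without overflow, and the pointers are treated as consecutive.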
461 bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
462  APInt PtrDelta,
463  unsigned Depth) const {
464  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
465  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
466  if (!GEPA || !GEPB)
467  return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
468 
469  // Look through GEPs after checking they're the same except for the last
470  // index.
471  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
472  GEPA->getPointerOperand() != GEPB->getPointerOperand())
473  return false;
474  gep_type_iterator GTIA = gep_type_begin(GEPA);
475  gep_type_iterator GTIB = gep_type_begin(GEPB);
476  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
477  if (GTIA.getOperand() != GTIB.getOperand())
478  return false;
479  ++GTIA;
480  ++GTIB;
481  }
482 
483  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
484  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
485  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
486  OpA->getType() != OpB->getType())
487  return false;
488 
489  if (PtrDelta.isNegative()) {
490  if (PtrDelta.isMinSignedValue())
491  return false;
492  PtrDelta.negate();
493  std::swap(OpA, OpB);
494  }
495  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
496  if (PtrDelta.urem(Stride) != 0)
497  return false;
498  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
499  APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth);
500 
501  // Only look through a ZExt/SExt.
502  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
503  return false;
504 
505  bool Signed = isa<SExtInst>(OpA);
506 
507  // At this point A could be a function parameter, i.e. not an instruction
508  Value *ValA = OpA->getOperand(0);
509  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
510  if (!OpB || ValA->getType() != OpB->getType())
511  return false;
512 
513  // Now we need to prove that adding IdxDiff to ValA won't overflow.
514  bool Safe = false;
515 
516  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
517  // ValA, we're okay.
518  if (OpB->getOpcode() == Instruction::Add &&
519  isa<ConstantInt>(OpB->getOperand(1)) &&
520  IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
521  checkNoWrapFlags(OpB, Signed))
522  Safe = true;
523 
524  // Second attempt: check if we have eligible add NSW/NUW instruction
525  // sequences.
526  OpA = dyn_cast<Instruction>(ValA);
527  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
528  OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
529  checkNoWrapFlags(OpB, Signed)) {
530  // In the checks below a matching operand in OpA and OpB is
531  // an operand which is the same in those two instructions.
532  // Below we account for possible orders of the operands of
533  // these add instructions.
534  for (unsigned MatchingOpIdxA : {0, 1})
535  for (unsigned MatchingOpIdxB : {0, 1})
536  if (!Safe)
537  Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
538  MatchingOpIdxB, Signed);
539  }
540 
541  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
542 
543  // Third attempt:
544  // If all set bits of IdxDiff or any higher order bit other than the sign bit
545  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
546  // overflow of any sort.
547  if (!Safe) {
548  KnownBits Known(BitWidth);
549  computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT);
550  APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
551  if (Signed)
552  BitsAllowedToBeSet.clearBit(BitWidth - 1);
553  if (BitsAllowedToBeSet.ult(IdxDiff))
554  return false;
555  }
556 
557  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
558  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
559  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
560  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
561  return X == OffsetSCEVB;
562 }
563 
564 bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
565  const APInt &PtrDelta,
566  unsigned Depth) const {
567  if (Depth++ == MaxDepth)
568  return false;
569 
570  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
571  if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
572  return SelectA->getCondition() == SelectB->getCondition() &&
573  areConsecutivePointers(SelectA->getTrueValue(),
574  SelectB->getTrueValue(), PtrDelta, Depth) &&
575  areConsecutivePointers(SelectA->getFalseValue(),
576  SelectB->getFalseValue(), PtrDelta, Depth);
577  }
578  }
579  return false;
580 }
581 
582 void Vectorizer::reorder(Instruction *I) {
583  SmallPtrSet<Instruction *, 16> InstructionsToMove;
584  SmallVector<Instruction *, 16> Worklist;
585 
586  Worklist.push_back(I);
587  while (!Worklist.empty()) {
588  Instruction *IW = Worklist.pop_back_val();
589  int NumOperands = IW->getNumOperands();
590  for (int i = 0; i < NumOperands; i++) {
591  Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
592  if (!IM || IM->getOpcode() == Instruction::PHI)
593  continue;
594 
595  // If IM is in another BB, no need to move it, because this pass only
596  // vectorizes instructions within one BB.
597  if (IM->getParent() != I->getParent())
598  continue;
599 
600  if (!IM->comesBefore(I)) {
601  InstructionsToMove.insert(IM);
602  Worklist.push_back(IM);
603  }
604  }
605  }
606 
607  // All instructions to move should follow I. Start from I, not from begin().
608  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
609  ++BBI) {
610  if (!InstructionsToMove.count(&*BBI))
611  continue;
612  Instruction *IM = &*BBI;
613  --BBI;
614  IM->removeFromParent();
615  IM->insertBefore(I);
616  }
617 }
618 
619 std::pair<BasicBlock::iterator, BasicBlock::iterator>
620 Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
621  Instruction *C0 = Chain[0];
622  BasicBlock::iterator FirstInstr = C0->getIterator();
623  BasicBlock::iterator LastInstr = C0->getIterator();
624 
625  BasicBlock *BB = C0->getParent();
626  unsigned NumFound = 0;
627  for (Instruction &I : *BB) {
628  if (!is_contained(Chain, &I))
629  continue;
630 
631  ++NumFound;
632  if (NumFound == 1) {
633  FirstInstr = I.getIterator();
634  }
635  if (NumFound == Chain.size()) {
636  LastInstr = I.getIterator();
637  break;
638  }
639  }
640 
641  // Range is [first, last).
642  return std::make_pair(FirstInstr, ++LastInstr);
643 }
644 
645 void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
646  SmallVector<Instruction *, 16> Instrs;
647  for (Instruction *I : Chain) {
648  Value *PtrOperand = getLoadStorePointerOperand(I);
649  assert(PtrOperand && "Instruction must have a pointer operand.");
650  Instrs.push_back(I);
651  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
652  Instrs.push_back(GEP);
653  }
654 
655  // Erase instructions.
656  for (Instruction *I : Instrs)
657  if (I->use_empty())
658  I->eraseFromParent();
659 }
660 
661 std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
662 Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
663  unsigned ElementSizeBits) {
664  unsigned ElementSizeBytes = ElementSizeBits / 8;
665  unsigned SizeBytes = ElementSizeBytes * Chain.size();
666  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
667  if (NumLeft == Chain.size()) {
668  if ((NumLeft & 1) == 0)
669  NumLeft /= 2; // Split even in half
670  else
671  --NumLeft; // Split off last element
672  } else if (NumLeft == 0)
673  NumLeft = 1;
674  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
675 }
676 
677 ArrayRef<Instruction *>
678 Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
679  // These are in BB order, unlike Chain, which is in address order.
680  SmallVector<Instruction *, 16> MemoryInstrs;
681  SmallVector<Instruction *, 16> ChainInstrs;
682 
683  bool IsLoadChain = isa<LoadInst>(Chain[0]);
684  LLVM_DEBUG({
685  for (Instruction *I : Chain) {
686  if (IsLoadChain)
687  assert(isa<LoadInst>(I) &&
688  "All elements of Chain must be loads, or all must be stores.");
689  else
690  assert(isa<StoreInst>(I) &&
691  "All elements of Chain must be loads, or all must be stores.");
692  }
693  });
694 
695  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
696  if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) {
697  ChainInstrs.push_back(&I);
698  continue;
699  }
700  if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
701  LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: "
702  << I << '\n');
703  break;
704  }
705  if (I.mayReadOrWriteMemory())
706  MemoryInstrs.push_back(&I);
707  }
708 
709  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
710  unsigned ChainInstrIdx = 0;
711  Instruction *BarrierMemoryInstr = nullptr;
712 
713  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
714  Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
715 
716  // If a barrier memory instruction was found, chain instructions that follow
717  // will not be added to the valid prefix.
718  if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
719  break;
720 
721  // Check (in BB order) if any instruction prevents ChainInstr from being
722  // vectorized. Find and store the first such "conflicting" instruction.
723  for (Instruction *MemInstr : MemoryInstrs) {
724  // If a barrier memory instruction was found, do not check past it.
725  if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
726  break;
727 
728  auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
729  auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
730  if (MemLoad && ChainLoad)
731  continue;
732 
733  // We can ignore the alias if we have a load/store pair and the load
734  // is known to be invariant. The load cannot be clobbered by the store.
735  auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
736  return LI->hasMetadata(LLVMContext::MD_invariant_load);
737  };
738 
739  if (IsLoadChain) {
740  // We can ignore the alias as long as the load comes before the store,
741  // because that means we won't be moving the load past the store to
742  // vectorize it (the vectorized load is inserted at the location of the
743  // first load in the chain).
744  if (ChainInstr->comesBefore(MemInstr) ||
745  (ChainLoad && IsInvariantLoad(ChainLoad)))
746  continue;
747  } else {
748  // Same case, but in reverse.
749  if (MemInstr->comesBefore(ChainInstr) ||
750  (MemLoad && IsInvariantLoad(MemLoad)))
751  continue;
752  }
753 
754  ModRefInfo MR =
755  AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr));
756  if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
757  LLVM_DEBUG({
758  dbgs() << "LSV: Found alias:\n"
759  " Aliasing instruction:\n"
760  << " " << *MemInstr << '\n'
761  << " Aliased instruction and pointer:\n"
762  << " " << *ChainInstr << '\n'
763  << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
764  });
765  // Save this aliasing memory instruction as a barrier, but allow other
766  // instructions that precede the barrier to be vectorized with this one.
767  BarrierMemoryInstr = MemInstr;
768  break;
769  }
770  }
771  // Continue the search only for store chains, since vectorizing stores that
772  // precede an aliasing load is valid. Conversely, vectorizing loads is valid
773  // up to an aliasing store, but should not pull loads from further down in
774  // the basic block.
775  if (IsLoadChain && BarrierMemoryInstr) {
776  // The BarrierMemoryInstr is a store that precedes ChainInstr.
777  assert(BarrierMemoryInstr->comesBefore(ChainInstr));
778  break;
779  }
780  }
781 
782  // Find the largest prefix of Chain whose elements are all in
783  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
784  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
785  // order.)
786  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
787  ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
788  unsigned ChainIdx = 0;
789  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
790  if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
791  break;
792  }
793  return Chain.slice(0, ChainIdx);
794 }
795 
796 static ChainID getChainID(const Value *Ptr) {
797  const Value *ObjPtr = getUnderlyingObject(Ptr);
798  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
799  // The selects themselves are distinct instructions even if they share the
800  // same condition and evaluate to consecutive pointers for the true and false
801  // values of the condition. Therefore, using the selects themselves for
802  // grouping instructions would put consecutive accesses into different lists,
803  // and they wouldn't even be checked for being consecutive, and wouldn't be
804  // vectorized.
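 //
 // For example (an illustrative sketch; these names are invented here): two
 // accesses through
 //   %pA = select i1 %cond, i32* %a, i32* %b
 //   %pB = select i1 %cond, i32* %a1, i32* %b1
 // both get ChainID %cond, so they land in the same list and
 // lookThroughSelects can still prove them consecutive when %a1/%b1 sit one
 // element past %a/%b.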
805  return Sel->getCondition();
806  }
807  return ObjPtr;
808 }
809 
810 std::pair<InstrListMap, InstrListMap>
811 Vectorizer::collectInstructions(BasicBlock *BB) {
812  InstrListMap LoadRefs;
813  InstrListMap StoreRefs;
814 
815  for (Instruction &I : *BB) {
816  if (!I.mayReadOrWriteMemory())
817  continue;
818 
819  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
820  if (!LI->isSimple())
821  continue;
822 
823  // Skip if it's not legal.
824  if (!TTI.isLegalToVectorizeLoad(LI))
825  continue;
826 
827  Type *Ty = LI->getType();
828  if (!VectorType::isValidElementType(Ty->getScalarType()))
829  continue;
830 
831  // Skip weird non-byte sizes. They probably aren't worth the effort of
832  // handling correctly.
833  unsigned TySize = DL.getTypeSizeInBits(Ty);
834  if ((TySize % 8) != 0)
835  continue;
836 
837  // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
838  // functions are currently using an integer type for the vectorized
839  // load/store, and do not support casting between the integer type and a
840  // vector of pointers (e.g. i64 to <2 x i16*>)
841  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
842  continue;
843 
844  Value *Ptr = LI->getPointerOperand();
845  unsigned AS = Ptr->getType()->getPointerAddressSpace();
846  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
847 
848  unsigned VF = VecRegSize / TySize;
849  VectorType *VecTy = dyn_cast<VectorType>(Ty);
850 
851  // No point in looking at these if they're too big to vectorize.
852  if (TySize > VecRegSize / 2 ||
853  (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
854  continue;
855 
856  // Save the load locations.
857  const ChainID ID = getChainID(Ptr);
858  LoadRefs[ID].push_back(LI);
859  } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
860  if (!SI->isSimple())
861  continue;
862 
863  // Skip if it's not legal.
864  if (!TTI.isLegalToVectorizeStore(SI))
865  continue;
866 
867  Type *Ty = SI->getValueOperand()->getType();
868  if (!VectorType::isValidElementType(Ty->getScalarType()))
869  continue;
870 
871  // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
872  // functions are currently using an integer type for the vectorized
873  // load/store, and do not support casting between the integer type and a
874  // vector of pointers (e.g. i64 to <2 x i16*>)
875  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
876  continue;
877 
878  // Skip weird non-byte sizes. They probably aren't worth the effort of
879  // handling correctly.
880  unsigned TySize = DL.getTypeSizeInBits(Ty);
881  if ((TySize % 8) != 0)
882  continue;
883 
884  Value *Ptr = SI->getPointerOperand();
885  unsigned AS = Ptr->getType()->getPointerAddressSpace();
886  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
887 
888  unsigned VF = VecRegSize / TySize;
889  VectorType *VecTy = dyn_cast<VectorType>(Ty);
890 
891  // No point in looking at these if they're too big to vectorize.
892  if (TySize > VecRegSize / 2 ||
893  (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
894  continue;
895 
896  // Save store location.
897  const ChainID ID = getChainID(Ptr);
898  StoreRefs[ID].push_back(SI);
899  }
900  }
901 
902  return {LoadRefs, StoreRefs};
903 }
904 
905 bool Vectorizer::vectorizeChains(InstrListMap &Map) {
906  bool Changed = false;
907 
908  for (const std::pair<ChainID, InstrList> &Chain : Map) {
909  unsigned Size = Chain.second.size();
910  if (Size < 2)
911  continue;
912 
913  LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
914 
915  // Process the instructions in chunks of 64.
916  for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
917  unsigned Len = std::min<unsigned>(CE - CI, 64);
918  ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
919  Changed |= vectorizeInstructions(Chunk);
920  }
921  }
922 
923  return Changed;
924 }
925 
926 bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
927  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
928  << " instructions.\n");
929  SmallVector<int, 16> Heads, Tails;
930  int ConsecutiveChain[64];
931 
932  // Do a quadratic search on all of the given loads/stores and find all of the
933  // pairs of loads/stores that follow each other.
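 //
 // As an illustrative sketch (not part of the original comment): if Instrs
 // holds three i32 loads from p, p+4 and p+8, in that order, this search
 // records ConsecutiveChain[0] = 1 and ConsecutiveChain[1] = 2, with
 // Heads = {0, 1} and Tails = {1, 2}. Below, only the head that is not also
 // an unprocessed tail (index 0) is followed, yielding the full three-element
 // chain.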
934  for (int i = 0, e = Instrs.size(); i < e; ++i) {
935  ConsecutiveChain[i] = -1;
936  for (int j = e - 1; j >= 0; --j) {
937  if (i == j)
938  continue;
939 
940  if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
941  if (ConsecutiveChain[i] != -1) {
942  int CurDistance = std::abs(ConsecutiveChain[i] - i);
943  int NewDistance = std::abs(ConsecutiveChain[i] - j);
944  if (j < i || NewDistance > CurDistance)
945  continue; // Should not insert.
946  }
947 
948  Tails.push_back(j);
949  Heads.push_back(i);
950  ConsecutiveChain[i] = j;
951  }
952  }
953  }
954 
955  bool Changed = false;
956  SmallPtrSet<Instruction *, 16> InstructionsProcessed;
957 
958  for (int Head : Heads) {
959  if (InstructionsProcessed.count(Instrs[Head]))
960  continue;
961  bool LongerChainExists = false;
962  for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
963  if (Head == Tails[TIt] &&
964  !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
965  LongerChainExists = true;
966  break;
967  }
968  if (LongerChainExists)
969  continue;
970 
971  // We found an instr that starts a chain. Now follow the chain and try to
972  // vectorize it.
973  SmallVector<Instruction *, 16> Operands;
974  int I = Head;
975  while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
976  if (InstructionsProcessed.count(Instrs[I]))
977  break;
978 
979  Operands.push_back(Instrs[I]);
980  I = ConsecutiveChain[I];
981  }
982 
983  bool Vectorized = false;
984  if (isa<LoadInst>(*Operands.begin()))
985  Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
986  else
987  Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
988 
989  Changed |= Vectorized;
990  }
991 
992  return Changed;
993 }
994 
995 bool Vectorizer::vectorizeStoreChain(
996  ArrayRef<Instruction *> Chain,
997  SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
998  StoreInst *S0 = cast<StoreInst>(Chain[0]);
999 
1000  // If the vector has an int element, default to int for the whole store.
1001  Type *StoreTy = nullptr;
1002  for (Instruction *I : Chain) {
1003  StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
1004  if (StoreTy->isIntOrIntVectorTy())
1005  break;
1006 
1007  if (StoreTy->isPtrOrPtrVectorTy()) {
1008  StoreTy = Type::getIntNTy(F.getParent()->getContext(),
1009  DL.getTypeSizeInBits(StoreTy));
1010  break;
1011  }
1012  }
1013  assert(StoreTy && "Failed to find store type");
1014 
1015  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
1016  unsigned AS = S0->getPointerAddressSpace();
1017  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1018  unsigned VF = VecRegSize / Sz;
1019  unsigned ChainSize = Chain.size();
1020  Align Alignment = S0->getAlign();
1021 
1022  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1023  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1024  return false;
1025  }
1026 
1027  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1028  if (NewChain.empty()) {
1029  // No vectorization possible.
1030  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1031  return false;
1032  }
1033  if (NewChain.size() == 1) {
1034  // Failed after the first instruction. Discard it and try the smaller chain.
1035  InstructionsProcessed->insert(NewChain.front());
1036  return false;
1037  }
1038 
1039  // Update Chain to the valid vectorizable subchain.
1040  Chain = NewChain;
1041  ChainSize = Chain.size();
1042 
1043  // Check if it's legal to vectorize this chain. If not, split the chain and
1044  // try again.
1045  unsigned EltSzInBytes = Sz / 8;
1046  unsigned SzInBytes = EltSzInBytes * ChainSize;
1047 
1048  FixedVectorType *VecTy;
1049  auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
1050  if (VecStoreTy)
1051  VecTy = FixedVectorType::get(StoreTy->getScalarType(),
1052  Chain.size() * VecStoreTy->getNumElements());
1053  else
1054  VecTy = FixedVectorType::get(StoreTy, Chain.size());
1055 
1056  // If it's more than the max vector size or the target has a better
1057  // vector factor, break it into two pieces.
1058  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
1059  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1060  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1061  " Creating two separate arrays.\n");
1062  bool Vectorized = false;
1063  Vectorized |=
1064  vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed);
1065  Vectorized |=
1066  vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
1067  return Vectorized;
1068  }
1069 
1070  LLVM_DEBUG({
1071  dbgs() << "LSV: Stores to vectorize:\n";
1072  for (Instruction *I : Chain)
1073  dbgs() << " " << *I << "\n";
1074  });
1075 
1076  // We won't try again to vectorize the elements of the chain, regardless of
1077  // whether we succeed below.
1078  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1079 
1080  // If the store is going to be misaligned, don't vectorize it.
1081  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1082  if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1083  auto Chains = splitOddVectorElts(Chain, Sz);
1084  bool Vectorized = false;
1085  Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
1086  Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
1087  return Vectorized;
1088  }
1089 
1090  Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
1091  Align(StackAdjustedAlignment),
1092  DL, S0, nullptr, &DT);
1093  if (NewAlign >= Alignment)
1094  Alignment = NewAlign;
1095  else
1096  return false;
1097  }
1098 
1099  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
1100  auto Chains = splitOddVectorElts(Chain, Sz);
1101  bool Vectorized = false;
1102  Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
1103  Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
1104  return Vectorized;
1105  }
1106 
1107  BasicBlock::iterator First, Last;
1108  std::tie(First, Last) = getBoundaryInstrs(Chain);
1109  Builder.SetInsertPoint(&*Last);
1110 
1111  Value *Vec = PoisonValue::get(VecTy);
1112 
1113  if (VecStoreTy) {
1114  unsigned VecWidth = VecStoreTy->getNumElements();
1115  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1116  StoreInst *Store = cast<StoreInst>(Chain[I]);
1117  for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
1118  unsigned NewIdx = J + I * VecWidth;
1119  Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
1120  Builder.getInt32(J));
1121  if (Extract->getType() != StoreTy->getScalarType())
1122  Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
1123 
1124  Value *Insert =
1125  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
1126  Vec = Insert;
1127  }
1128  }
1129  } else {
1130  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1131  StoreInst *Store = cast<StoreInst>(Chain[I]);
1132  Value *Extract = Store->getValueOperand();
1133  if (Extract->getType() != StoreTy->getScalarType())
1134  Extract =
1135  Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
1136 
1137  Value *Insert =
1138  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
1139  Vec = Insert;
1140  }
1141  }
1142 
1143  StoreInst *SI = Builder.CreateAlignedStore(
1144  Vec,
1145  Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
1146  Alignment);
1147  propagateMetadata(SI, Chain);
1148 
1149  eraseInstructions(Chain);
1150  ++NumVectorInstructions;
1151  NumScalarsVectorized += Chain.size();
1152  return true;
1153 }
1154 
1155 bool Vectorizer::vectorizeLoadChain(
1156  ArrayRef<Instruction *> Chain,
1157  SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
1158  LoadInst *L0 = cast<LoadInst>(Chain[0]);
1159 
1160  // If the vector has an int element, default to int for the whole load.
1161  Type *LoadTy = nullptr;
1162  for (const auto &V : Chain) {
1163  LoadTy = cast<LoadInst>(V)->getType();
1164  if (LoadTy->isIntOrIntVectorTy())
1165  break;
1166 
1167  if (LoadTy->isPtrOrPtrVectorTy()) {
1168  LoadTy = Type::getIntNTy(F.getParent()->getContext(),
1169  DL.getTypeSizeInBits(LoadTy));
1170  break;
1171  }
1172  }
1173  assert(LoadTy && "Can't determine LoadInst type from chain");
1174 
1175  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
1176  unsigned AS = L0->getPointerAddressSpace();
1177  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1178  unsigned VF = VecRegSize / Sz;
1179  unsigned ChainSize = Chain.size();
1180  Align Alignment = L0->getAlign();
1181 
1182  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1183  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1184  return false;
1185  }
1186 
1187  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1188  if (NewChain.empty()) {
1189  // No vectorization possible.
1190  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1191  return false;
1192  }
1193  if (NewChain.size() == 1) {
1194  // Failed after the first instruction. Discard it and try the smaller chain.
1195  InstructionsProcessed->insert(NewChain.front());
1196  return false;
1197  }
1198 
1199  // Update Chain to the valid vectorizable subchain.
1200  Chain = NewChain;
1201  ChainSize = Chain.size();
1202 
1203  // Check if it's legal to vectorize this chain. If not, split the chain and
1204  // try again.
1205  unsigned EltSzInBytes = Sz / 8;
1206  unsigned SzInBytes = EltSzInBytes * ChainSize;
1207  VectorType *VecTy;
1208  auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
1209  if (VecLoadTy)
1210  VecTy = FixedVectorType::get(LoadTy->getScalarType(),
1211  Chain.size() * VecLoadTy->getNumElements());
1212  else
1213  VecTy = FixedVectorType::get(LoadTy, Chain.size());
1214 
1215  // If it's more than the max vector size or the target has a better
1216  // vector factor, break it into two pieces.
1217  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
1218  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1219  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1220  " Creating two separate arrays.\n");
1221  bool Vectorized = false;
1222  Vectorized |=
1223  vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed);
1224  Vectorized |=
1225  vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
1226  return Vectorized;
1227  }
1228 
1229  // We won't try again to vectorize the elements of the chain, regardless of
1230  // whether we succeed below.
1231  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1232 
1233  // If the load is going to be misaligned, don't vectorize it.
1234  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1235  if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1236  auto Chains = splitOddVectorElts(Chain, Sz);
1237  bool Vectorized = false;
1238  Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
1239  Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
1240  return Vectorized;
1241  }
1242 
1243  Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
1244  Align(StackAdjustedAlignment),
1245  DL, L0, nullptr, &DT);
1246  if (NewAlign >= Alignment)
1247  Alignment = NewAlign;
1248  else
1249  return false;
1250  }
1251 
1252  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
1253  auto Chains = splitOddVectorElts(Chain, Sz);
1254  bool Vectorized = false;
1255  Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
1256  Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
1257  return Vectorized;
1258  }
1259 
1260  LLVM_DEBUG({
1261  dbgs() << "LSV: Loads to vectorize:\n";
1262  for (Instruction *I : Chain)
1263  I->dump();
1264  });
1265 
1266  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
1267  // Last may have changed since then, but the value of First won't have. If it
1268  // matters, we could compute getBoundaryInstrs only once and reuse it here.
1269  BasicBlock::iterator First, Last;
1270  std::tie(First, Last) = getBoundaryInstrs(Chain);
1271  Builder.SetInsertPoint(&*First);
1272 
1273  Value *Bitcast =
1274  Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1275  LoadInst *LI =
1276  Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
1277  propagateMetadata(LI, Chain);
1278 
1279  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1280  Value *CV = Chain[I];
1281  Value *V;
1282  if (VecLoadTy) {
1283  // Extract a subvector using shufflevector.
1284  unsigned VecWidth = VecLoadTy->getNumElements();
1285  auto Mask =
1286  llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));
1287  V = Builder.CreateShuffleVector(LI, Mask, CV->getName());
1288  } else {
1289  V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
1290  }
1291 
1292  if (V->getType() != CV->getType()) {
1293  V = Builder.CreateBitOrPointerCast(V, CV->getType());
1294  }
1295 
1296  // Replace the old instruction.
1297  CV->replaceAllUsesWith(V);
1298  }
1299 
1300  // Since we might have opaque pointers we might end up using the pointer
1301  // operand of the first load (wrt. memory loaded) for the vector load. Since
1302  // this first load might not be the first in the block we potentially need to
1303  // reorder the pointer operand (and its operands). If we have a bitcast though
1304  // it might be before the load and should be the reorder start instruction.
1305  // "Might" because for opaque pointers the "bitcast" is just the first loads
1306 // "Might" because for opaque pointers the "bitcast" is just the first load's
1307 // pointer operand, as opposed to something we inserted at the right position
1308  Instruction *BCInst = dyn_cast<Instruction>(Bitcast);
1309  reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI);
1310 
1311  eraseInstructions(Chain);
1312 
1313  ++NumVectorInstructions;
1314  NumScalarsVectorized += Chain.size();
1315  return true;
1316 }
1317 
1318 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
1319  Align Alignment) {
1320  if (Alignment.value() % SzInBytes == 0)
1321  return false;
1322 
1323  bool Fast = false;
1324  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
1325  SzInBytes * 8, AddressSpace,
1326  Alignment, &Fast);
1327  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
1328  << " and fast? " << Fast << "\n";);
1329  return !Allows || !Fast;
1330 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:76
i
i
Definition: README.txt:29
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
llvm::ScalarEvolution::getContext
LLVMContext & getContext() const
Definition: ScalarEvolution.h:491
AssumptionCache.h
llvm::AAManager
A manager for alias analyses.
Definition: AliasAnalysis.h:1299
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:2458
Signed
@ Signed
Definition: NVPTXISelLowering.cpp:4637
llvm::ScalarEvolutionAnalysis
Analysis pass that exposes the ScalarEvolution for a function.
Definition: ScalarEvolution.h:2115
MathExtras.h
llvm::MemoryLocation::get
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
Definition: MemoryLocation.cpp:35
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::initializeLoadStoreVectorizerLegacyPassPass
void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &)
llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition: iterator_range.h:53
llvm::AArch64CC::NE
@ NE
Definition: AArch64BaseInfo.h:256
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::generic_gep_type_iterator
Definition: GetElementPtrTypeIterator.h:31
Insert
Vector Rotate Left Mask Mask Insert
Definition: README_P9.txt:112
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
llvm::isModOrRefSet
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
Definition: AliasAnalysis.h:189
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:780
checkNoWrapFlags
static bool checkNoWrapFlags(Instruction *I, bool Signed)
Definition: LoadStoreVectorizer.cpp:386
llvm::Function
Definition: Function.h:60
Pass.h
GetElementPtrTypeIterator.h
store
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM ID Predecessors according to mbb< bb27, 0x8b0a7c0 > Note ADDri is not a two address instruction its result reg1037 is an operand of the PHI node in bb76 and its operand reg1039 is the result of the PHI node We should treat it as a two address code and make sure the ADDri is scheduled after any node that reads reg1039 Use info(i.e. register scavenger) to assign it a free register to allow reuse the collector could move the objects and invalidate the derived pointer This is bad enough in the first but safe points can crop up unpredictably **array_addr i32 n y store obj obj **nth_el If the i64 division is lowered to a then a safe point array and nth_el no longer point into the correct object The fix for this is to copy address calculations so that dependent pointers are never live across safe point boundaries But the loads cannot be copied like this if there was an intervening store
Definition: README.txt:133
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:309
llvm::SmallVector< Instruction *, 8 >
Statistic.h
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1490
llvm::LegacyLegalizeActions::Bitcast
@ Bitcast
Perform the operation on a different, but equivalently sized type.
Definition: LegacyLegalizerInfo.h:54
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:167
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:729
llvm::IRBuilder<>
MapVector.h
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::Instruction::hasNoUnsignedWrap
bool hasNoUnsignedWrap() const
Determine whether the no unsigned wrap flag is set.
Definition: Instruction.cpp:135
ValueTracking.h
Local.h
llvm::Instruction::insertBefore
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:83
llvm::LoadStoreVectorizerPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition: LoadStoreVectorizer.cpp:254
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
llvm::SPIRV::InstrList
SmallVector< MachineInstr * > InstrList
Definition: SPIRVModuleAnalysis.h:41
APInt.h
llvm::getLoadStoreType
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Definition: Instructions.h:5375
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:36
ScalarEvolution.h
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, "Vectorize load and Store instructions", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1423
Module.h
llvm::getOrEnforceKnownAlignment
Align getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to ensure that the alignment of V is at least PrefAlign bytes.
Definition: Local.cpp:1356
llvm::Instruction::comesBefore
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
Definition: Instruction.cpp:110
llvm::isConsecutiveAccess
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
Definition: LoopAccessAnalysis.cpp:1392
llvm::MapVector
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
STLExtras.h
llvm::SmallVectorImpl::pop_back_val
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:654
llvm::propagateMetadata
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
Definition: VectorUtils.cpp:841
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:268
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::gep_type_begin
gep_type_iterator gep_type_begin(const User *GEP)
Definition: GetElementPtrTypeIterator.h:123
llvm::Sched::Fast
@ Fast
Definition: TargetLowering.h:104
and
We currently generate a but we really shouldn eax ecx xorl edx divl ecx eax divl ecx movl eax ret A similar code sequence works for division We currently compile i32 v2 eax eax jo LBB1_2 and
Definition: README.txt:1271
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:224
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:159
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
KnownBits.h
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
AliasAnalysis.h
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
checkIfSafeAddSequence
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, unsigned MatchingOpIdxA, Instruction *AddOpB, unsigned MatchingOpIdxB, bool Signed)
Definition: LoadStoreVectorizer.cpp:392
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
Instruction.h
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:157
llvm::APInt::isNegative
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:312
llvm::TargetTransformInfo::isLegalToVectorizeLoad
bool isLegalToVectorizeLoad(LoadInst *LI) const
Definition: TargetTransformInfo.cpp:1025
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at application startup.
Definition: PassRegistry.cpp:31
Constants.h
llvm::AAResults
Definition: AliasAnalysis.h:511
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
InstrTypes.h
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::generic_gep_type_iterator::getOperand
Value * getOperand() const
Definition: GetElementPtrTypeIterator.h:76
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:227
false
Definition: StackSlotColoring.cpp:141
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:302
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value, returning the original object being addressed.
Definition: ValueTracking.cpp:4343
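A sketch showing how getUnderlyingObject can be used to bucket accesses by base object before pairwise comparison (illustrative helper; MapVector keeps iteration order deterministic):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/MapVector.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  // Group loads by the underlying object of their pointer operand; only
  // loads within one bucket can possibly be consecutive.
  static MapVector<const Value *, SmallVector<LoadInst *, 8>>
  bucketByUnderlyingObject(ArrayRef<LoadInst *> Loads) {
    MapVector<const Value *, SmallVector<LoadInst *, 8>> Buckets;
    for (LoadInst *LI : Loads)
      Buckets[getUnderlyingObject(LI->getPointerOperand())].push_back(LI);
    return Buckets;
  }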
llvm::ScalarEvolutionWrapperPass
Definition: ScalarEvolution.h:2145
llvm::createLoadStoreVectorizerPass
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
SmallPtrSet.h
llvm::Instruction::removeFromParent
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:73
llvm::APInt::sle
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1116
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
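A sketch of building the vector type for a chain and emitting one aligned vector load with IRBuilder (illustrative; legality and alignment are assumed to have been checked already):

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  static LoadInst *emitVectorLoad(LoadInst *FirstInChain, unsigned NumElems,
                                  Align Alignment) {
    IRBuilder<> Builder(FirstInChain);
    auto *VecTy = FixedVectorType::get(FirstInChain->getType(), NumElems);
    // With typed pointers the pointer operand must be recast to a vector
    // pointer; with opaque pointers the bitcast folds away.
    Value *VecPtr = Builder.CreateBitCast(
        FirstInChain->getPointerOperand(),
        VecTy->getPointerTo(FirstInChain->getPointerAddressSpace()));
    return Builder.CreateAlignedLoad(VecTy, VecPtr, Alignment, "vec.load");
  }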
llvm::StoreInst::getAlign
Align getAlign() const
Definition: Instructions.h:354
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::isModSet
LLVM_NODISCARD bool isModSet(const ModRefInfo MRI)
Definition: AliasAnalysis.h:196
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::ArrayRef::slice
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:194
llvm::CallingConv::ID
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
Type.h
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
llvm::dxil::PointerTypeAnalysis::run
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
Definition: PointerTypeAnalysis.cpp:101
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4407
getChainID
static ChainID getChainID(const Value *Ptr)
Definition: LoadStoreVectorizer.cpp:796
llvm::VectorType::isValidElementType
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:675
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
VectorUtils.h
llvm::SPIRV::Decoration::Alignment
@ Alignment
BasicBlock.h
LoadStoreVectorizer.h
llvm::LoadInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:274
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:75
llvm::instructions
inst_range instructions(Function *F)
Definition: InstIterator.h:133
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:305
llvm::SCEVAAWrapperPass
Legacy wrapper pass to provide the SCEVAAResult object.
Definition: ScalarEvolutionAliasAnalysis.h:56
uint64_t
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2514
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition: AssumptionCache.h:173
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
StackAdjustedAlignment
static const unsigned StackAdjustedAlignment
Definition: LoadStoreVectorizer.cpp:96
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:57
MemoryLocation.h
llvm::APInt::negate
void negate()
Negate this APInt in place.
Definition: APInt.h:1405
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:929
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1670
ArrayRef.h
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
Definition: ValueTracking.cpp:222
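A small sketch using computeKnownBits and KnownBits (illustrative), e.g. to prove an index is non-negative before reasoning about wrapping:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Type.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/KnownBits.h"

  using namespace llvm;

  static bool isKnownNonNegative(const Value *V, const DataLayout &DL) {
    KnownBits Known(V->getType()->getScalarSizeInBits());
    computeKnownBits(V, Known, DL);
    return Known.isNonNegative();
  }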
llvm::ModRefInfo
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: AliasAnalysis.h:148
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::IndexedInstrProf::HashT::Last
@ Last
llvm::TargetTransformInfo::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: TargetTransformInfo.cpp:1033
llvm::generic_gep_type_iterator::getIndexedType
Type * getIndexedType() const
Definition: GetElementPtrTypeIterator.h:70
iterator_range.h
llvm::TTI
TargetTransformInfo TTI
Definition: TargetTransformInfo.h:162
llvm::WinEH::EncodingType::CE
@ CE
Windows NT (Windows on ARM)
llvm::TargetTransformInfo::allowsMisalignedMemoryAccesses
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace=0, Align Alignment=Align(1), bool *Fast=nullptr) const
Determine if the target supports unaligned memory accesses.
Definition: TargetTransformInfo.cpp:535
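A sketch of the typical query (illustrative): ask whether the target accepts a misaligned access of the chain's width, and whether it is also fast:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/Support/Alignment.h"

  using namespace llvm;

  static bool misalignedAccessIsFast(const TargetTransformInfo &TTI,
                                     LLVMContext &Ctx, unsigned SizeInBits,
                                     unsigned AddrSpace, Align Alignment) {
    bool Fast = false;
    bool Allows = TTI.allowsMisalignedMemoryAccesses(Ctx, SizeInBits,
                                                     AddrSpace, Alignment,
                                                     &Fast);
    return Allows && Fast;
  }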
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
llvm::APInt::urem
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1682
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: APInt.h:32
llvm::BinaryOperator
Definition: InstrTypes.h:188
DataLayout.h
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass if and only if it does not add or remove basic blocks from the function or modify terminator instructions.
Definition: Pass.cpp:263
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:58
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::ScalarEvolution::getConstant
const SCEV * getConstant(ConstantInt *V)
Definition: ScalarEvolution.cpp:462
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::CFGAnalyses
Represents analyses that only rely on functions' control flow.
Definition: PassManager.h:113
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:529
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
llvm::Type::isPtrOrPtrVectorTy
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:224
llvm::APInt::ult
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1061
llvm::TargetTransformInfo::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: TargetTransformInfo.cpp:1054
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:305
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:176
llvm::ArrayRef::front
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:167
llvm::APInt::udiv
APInt udiv(const APInt &RHS) const
Unsigned division operation.
Definition: APInt.cpp:1589
llvm::APInt::clearBit
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1369
llvm::TargetTransformInfo::isLegalToVectorizeStore
bool isLegalToVectorizeStore(StoreInst *SI) const
Definition: TargetTransformInfo.cpp:1029
MaxDepth
static const unsigned MaxDepth
Definition: InstCombineMulDivRem.cpp:900
llvm::Value::stripAndAccumulateInBoundsConstantOffsets
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to false.
Definition: Value.h:727
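A sketch of computing the constant byte distance between two pointers with this API (illustrative; both pointers are assumed to be in the same address space):

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Value.h"

  using namespace llvm;

  // Returns true and sets Dist = PtrB - PtrA (in bytes) when both pointers
  // are constant-offset walks off the same base value.
  static bool constantByteDistance(const Value *PtrA, const Value *PtrB,
                                   const DataLayout &DL, APInt &Dist) {
    unsigned IdxWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
    APInt OffA(IdxWidth, 0), OffB(IdxWidth, 0);
    const Value *BaseA =
        PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffA);
    const Value *BaseB =
        PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffB);
    if (BaseA != BaseB)
      return false;
    Dist = OffB - OffA;
    return true;
  }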
llvm::APInt::zext
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:973
DEBUG_TYPE
#define DEBUG_TYPE
Definition: LoadStoreVectorizer.cpp:90
Attributes.h
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
llvm::APInt::trunc
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:898
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::getIntNTy
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:243
llvm::ScalarEvolution::getMinusSCEV
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
Definition: ScalarEvolution.cpp:4524
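A sketch of the SCEV fallback (illustrative): prove PtrB is exactly Size bytes past PtrA when constant-offset stripping alone is not enough:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  #include "llvm/Support/Casting.h"

  using namespace llvm;

  static bool scevDistanceIs(Value *PtrA, Value *PtrB, ScalarEvolution &SE,
                             int64_t Size) {
    const SCEV *A = SE.getSCEV(PtrA);
    const SCEV *B = SE.getSCEV(PtrB);
    // getMinusSCEV returns CouldNotCompute for pointers with different bases,
    // so the dyn_cast below fails in that case.
    const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(B, A));
    return Diff && Diff->getAPInt().getSExtValue() == Size;
  }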
llvm::APInt::isMinSignedValue
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:408
llvm::StoreInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:408
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
llvm::ArrayRef::begin
iterator begin() const
Definition: ArrayRef.h:152
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:197
llvm::APInt::sextOrTrunc
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1002
llvm::isGuaranteedToTransferExecutionToSuccessor
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
Definition: ValueTracking.cpp:5277
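A sketch of a typical safety scan with this predicate (illustrative): check that nothing strictly between two chain members can halt or throw, so the later access may be moved next to the earlier one:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Instruction.h"

  using namespace llvm;

  // First and Last are assumed to be in the same basic block, First earlier.
  static bool regionTransfersExecution(Instruction *First, Instruction *Last) {
    for (Instruction *I = First->getNextNode(); I && I != Last;
         I = I->getNextNode())
      if (!isGuaranteedToTransferExecutionToSuccessor(I))
        return false;
    return true;
  }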
llvm::TargetTransformInfo::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: TargetTransformInfo.cpp:1021
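A sketch combining getLoadStoreVecRegBitWidth with isLegalToVectorizeLoadChain (illustrative): size the chain to the target's vector register and confirm the result is legal:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/Support/Alignment.h"

  using namespace llvm;

  // ElemBits is the element size in bits and is assumed to divide the
  // register width; returns 1 when the full-width chain is not legal.
  static unsigned maxChainElems(const TargetTransformInfo &TTI,
                                unsigned AddrSpace, unsigned ElemBits,
                                Align Alignment) {
    unsigned RegBits = TTI.getLoadStoreVecRegBitWidth(AddrSpace);
    unsigned Elems = RegBits / ElemBits;
    unsigned ChainBytes = (Elems * ElemBits) / 8;
    if (Elems < 2 ||
        !TTI.isLegalToVectorizeLoadChain(ChainBytes, Alignment, AddrSpace))
      return 1;
    return Elems;
  }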
Casting.h
llvm::post_order
iterator_range< po_iterator< T > > post_order(const T &G)
Definition: PostOrderIterator.h:189
Function.h
llvm::BitWidth
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:147
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:774
INITIALIZE_PASS_END
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, "Vectorize load and store instructions", false, false) Pass *llvm
Definition: LoadStoreVectorizer.cpp:229
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
AA
llvm::DominatorTreeAnalysis
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:267
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
Instructions.h
PostOrderIterator.h
llvm::PreservedAnalyses::preserveSet
void preserveSet()
Mark an analysis set as preserved.
Definition: PassManager.h:188
llvm::User::getNumOperands
unsigned getNumOperands() const
Definition: User.h:191
SmallVector.h
llvm::StoreInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:402
llvm::Instruction::hasNoSignedWrap
bool hasNoSignedWrap() const
Determine whether the no signed wrap flag is set.
Definition: Instruction.cpp:139
Dominators.h
llvm::AAResultsWrapperPass
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Definition: AliasAnalysis.h:1347
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
llvm::GlobalsAAWrapperPass
Legacy wrapper pass to provide the GlobalsAAResult object.
Definition: GlobalsModRef.h:148
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:164
TargetTransformInfo.h
llvm::TargetTransformInfo::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: TargetTransformInfo.cpp:1061
llvm::TargetTransformInfo::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: TargetTransformInfo.cpp:1039
Vectorize.h
DerivedTypes.h
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:42
llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition: Instructions.h:5330
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2454
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:172
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
raw_ostream.h
Value.h
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1281
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
Debug.h
llvm::ArrayRef::end
iterator end() const
Definition: ArrayRef.h:153
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37
llvm::PoisonValue::get
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1788