1 //===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This transformation implements the well-known scalar replacement of
10 /// aggregates transformation. It tries to identify promotable elements of an
11 /// aggregate alloca, and promote them to registers. It will also try to
12 /// convert uses of an element (or set of elements) of an alloca into a vector
13 /// or bitfield-style integer scalar if appropriate.
14 ///
15 /// It works to do this with minimal slicing of the alloca so that regions
16 /// which are merely transferred in and out of external memory remain unchanged
17 /// and are not decomposed to scalar code.
18 ///
19 /// Because this also performs alloca promotion, it can be thought of as also
20 /// serving the purpose of SSA formation. The algorithm iterates on the
21 /// function until all opportunities for promotion have been realized.
22 ///
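/// As a rough illustration (a hand-written sketch, not a case taken from the
/// LLVM test suite): given IR along the lines of
///
///   %a  = alloca { i32, i32 }
///   %f0 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %a, i32 0, i32 0
///   store i32 1, i32* %f0
///   %v  = load i32, i32* %f0
///
/// the pass can split the aggregate alloca into per-field values and promote
/// them to SSA form, so %v simply becomes the constant 1 and the alloca, the
/// store, and the load all disappear.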
23 //===----------------------------------------------------------------------===//
24 
25 #include "llvm/Transforms/Scalar/SROA.h"
26 #include "llvm/ADT/APInt.h"
27 #include "llvm/ADT/ArrayRef.h"
28 #include "llvm/ADT/DenseMap.h"
29 #include "llvm/ADT/PointerIntPair.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
33 #include "llvm/ADT/SmallPtrSet.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/Statistic.h"
36 #include "llvm/ADT/StringRef.h"
37 #include "llvm/ADT/Twine.h"
38 #include "llvm/ADT/iterator.h"
39 #include "llvm/ADT/iterator_range.h"
40 #include "llvm/Analysis/AssumptionCache.h"
41 #include "llvm/Analysis/GlobalsModRef.h"
42 #include "llvm/Analysis/Loads.h"
43 #include "llvm/Analysis/PtrUseVisitor.h"
44 #include "llvm/Config/llvm-config.h"
45 #include "llvm/IR/BasicBlock.h"
46 #include "llvm/IR/Constant.h"
47 #include "llvm/IR/ConstantFolder.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DIBuilder.h"
50 #include "llvm/IR/DataLayout.h"
51 #include "llvm/IR/DebugInfoMetadata.h"
52 #include "llvm/IR/DerivedTypes.h"
53 #include "llvm/IR/Dominators.h"
54 #include "llvm/IR/Function.h"
55 #include "llvm/IR/GetElementPtrTypeIterator.h"
56 #include "llvm/IR/GlobalAlias.h"
57 #include "llvm/IR/IRBuilder.h"
58 #include "llvm/IR/InstVisitor.h"
59 #include "llvm/IR/InstrTypes.h"
60 #include "llvm/IR/Instruction.h"
61 #include "llvm/IR/Instructions.h"
62 #include "llvm/IR/IntrinsicInst.h"
63 #include "llvm/IR/Intrinsics.h"
64 #include "llvm/IR/LLVMContext.h"
65 #include "llvm/IR/Metadata.h"
66 #include "llvm/IR/Module.h"
67 #include "llvm/IR/Operator.h"
68 #include "llvm/IR/PassManager.h"
69 #include "llvm/IR/Type.h"
70 #include "llvm/IR/Use.h"
71 #include "llvm/IR/User.h"
72 #include "llvm/IR/Value.h"
73 #include "llvm/InitializePasses.h"
74 #include "llvm/Pass.h"
75 #include "llvm/Support/Casting.h"
76 #include "llvm/Support/CommandLine.h"
77 #include "llvm/Support/Compiler.h"
78 #include "llvm/Support/Debug.h"
79 #include "llvm/Support/ErrorHandling.h"
80 #include "llvm/Support/MathExtras.h"
81 #include "llvm/Support/raw_ostream.h"
82 #include "llvm/Transforms/Scalar.h"
83 #include "llvm/Transforms/Utils/Local.h"
84 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
85 #include <algorithm>
86 #include <cassert>
87 #include <chrono>
88 #include <cstddef>
89 #include <cstdint>
90 #include <cstring>
91 #include <iterator>
92 #include <string>
93 #include <tuple>
94 #include <utility>
95 #include <vector>
96 
97 using namespace llvm;
98 using namespace llvm::sroa;
99 
100 #define DEBUG_TYPE "sroa"
101 
102 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
103 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
104 STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
105 STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
106 STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
107 STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
108 STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
109 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
110 STATISTIC(NumDeleted, "Number of instructions deleted");
111 STATISTIC(NumVectorized, "Number of vectorized aggregates");
112 
113 /// Hidden option to experiment with completely strict handling of inbounds
114 /// GEPs.
115 static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
116  cl::Hidden);
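// Note: because this is an ordinary cl::opt, the flag can be toggled from the
// opt command line (e.g. "opt -passes=sroa -sroa-strict-inbounds ..."); the
// cl::Hidden attribute only keeps it out of the default -help output.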
117 
118 namespace {
119 
120 /// A custom IRBuilder inserter which prefixes all names, but only in
121 /// Assert builds.
122 class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
123  std::string Prefix;
124 
125  const Twine getNameWithPrefix(const Twine &Name) const {
126  return Name.isTriviallyEmpty() ? Name : Prefix + Name;
127  }
128 
129 public:
130  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
131 
132  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
133  BasicBlock::iterator InsertPt) const override {
134  IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
135  InsertPt);
136  }
137 };
138 
139 /// Provide a type for IRBuilder that drops names in release builds.
140 using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
141 
142 /// A used slice of an alloca.
143 ///
144 /// This structure represents a slice of an alloca used by some instruction. It
145 /// stores both the begin and end offsets of this use, a pointer to the use
146 /// itself, and a flag indicating whether we can classify the use as splittable
147 /// or not when forming partitions of the alloca.
148 class Slice {
149  /// The beginning offset of the range.
150  uint64_t BeginOffset = 0;
151 
152  /// The ending offset, not included in the range.
153  uint64_t EndOffset = 0;
154 
155  /// Storage for both the use of this slice and whether it can be
156  /// split.
157  PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
158 
159 public:
160  Slice() = default;
161 
162  Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
163  : BeginOffset(BeginOffset), EndOffset(EndOffset),
164  UseAndIsSplittable(U, IsSplittable) {}
165 
166  uint64_t beginOffset() const { return BeginOffset; }
167  uint64_t endOffset() const { return EndOffset; }
168 
169  bool isSplittable() const { return UseAndIsSplittable.getInt(); }
170  void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
171 
172  Use *getUse() const { return UseAndIsSplittable.getPointer(); }
173 
174  bool isDead() const { return getUse() == nullptr; }
175  void kill() { UseAndIsSplittable.setPointer(nullptr); }
176 
177  /// Support for ordering ranges.
178  ///
179  /// This provides an ordering over ranges such that start offsets are
180  /// always increasing, and within equal start offsets, the end offsets are
181  /// decreasing. Thus the spanning range comes first in a cluster with the
182  /// same start position.
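 ///
 /// For example (illustrative offsets only): three equally-splittable slices
 /// [0,16), [0,4) and [4,8) sort as [0,16), [0,4), [4,8); a tie on the begin
 /// offset is broken in favor of the larger end offset.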
183  bool operator<(const Slice &RHS) const {
184  if (beginOffset() < RHS.beginOffset())
185  return true;
186  if (beginOffset() > RHS.beginOffset())
187  return false;
188  if (isSplittable() != RHS.isSplittable())
189  return !isSplittable();
190  if (endOffset() > RHS.endOffset())
191  return true;
192  return false;
193  }
194 
195  /// Support comparison with a single offset to allow binary searches.
196  friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
197  uint64_t RHSOffset) {
198  return LHS.beginOffset() < RHSOffset;
199  }
200  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
201  const Slice &RHS) {
202  return LHSOffset < RHS.beginOffset();
203  }
204 
205  bool operator==(const Slice &RHS) const {
206  return isSplittable() == RHS.isSplittable() &&
207  beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
208  }
209  bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
210 };
211 
212 } // end anonymous namespace
213 
214 /// Representation of the alloca slices.
215 ///
216 /// This class represents the slices of an alloca which are formed by its
217 /// various uses. If a pointer escapes, we can't fully build a representation
218 /// for the slices used and we reflect that in this structure. The uses are
219 /// stored, sorted by increasing beginning offset and with unsplittable slices
220 /// starting at a particular offset before splittable slices.
221 class llvm::sroa::AllocaSlices {
222 public:
223  /// Construct the slices of a particular alloca.
224  AllocaSlices(const DataLayout &DL, AllocaInst &AI);
225 
226  /// Test whether a pointer to the allocation escapes our analysis.
227  ///
228  /// If this is true, the slices are never fully built and should be
229  /// ignored.
230  bool isEscaped() const { return PointerEscapingInstr; }
231 
232  /// Support for iterating over the slices.
233  /// @{
234  using iterator = SmallVectorImpl<Slice>::iterator;
235  using range = iterator_range<iterator>;
236 
237  iterator begin() { return Slices.begin(); }
238  iterator end() { return Slices.end(); }
239 
240  using const_iterator = SmallVectorImpl<Slice>::const_iterator;
241  using const_range = iterator_range<const_iterator>;
242 
243  const_iterator begin() const { return Slices.begin(); }
244  const_iterator end() const { return Slices.end(); }
245  /// @}
246 
247  /// Erase a range of slices.
248  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
249 
250  /// Insert new slices for this alloca.
251  ///
252  /// This moves the slices into the alloca's slices collection, and re-sorts
253  /// everything so that the usual ordering properties of the alloca's slices
254  /// hold.
255  void insert(ArrayRef<Slice> NewSlices) {
256  int OldSize = Slices.size();
257  Slices.append(NewSlices.begin(), NewSlices.end());
258  auto SliceI = Slices.begin() + OldSize;
259  llvm::sort(SliceI, Slices.end());
260  std::inplace_merge(Slices.begin(), SliceI, Slices.end());
261  }
262 
263  // Forward declare the iterator and range accessor for walking the
264  // partitions.
265  class partition_iterator;
266  iterator_range<partition_iterator> partitions();
267 
268  /// Access the dead users for this alloca.
269  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
270 
271  /// Access Uses that should be dropped if the alloca is promotable.
272  ArrayRef<Use *> getDeadUsesIfPromotable() const {
273  return DeadUseIfPromotable;
274  }
275 
276  /// Access the dead operands referring to this alloca.
277  ///
278  /// These are operands which cannot actually be used to refer to the
279  /// alloca as they are outside its range and the user doesn't correct for
280  /// that. These mostly consist of PHI node inputs and the like which we just
281  /// need to replace with undef.
282  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
283 
284 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
285  void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
286  void printSlice(raw_ostream &OS, const_iterator I,
287                  StringRef Indent = "  ") const;
288  void printUse(raw_ostream &OS, const_iterator I,
289                StringRef Indent = "  ") const;
290  void print(raw_ostream &OS) const;
291  void dump(const_iterator I) const;
292  void dump() const;
293 #endif
294 
295 private:
296  template <typename DerivedT, typename RetT = void> class BuilderBase;
297  class SliceBuilder;
298 
299  friend class AllocaSlices::SliceBuilder;
300 
301 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
302  /// Handle to alloca instruction to simplify method interfaces.
303  AllocaInst &AI;
304 #endif
305 
306  /// The instruction responsible for this alloca not having a known set
307  /// of slices.
308  ///
309  /// When an instruction (potentially) escapes the pointer to the alloca, we
310  /// store a pointer to that here and abort trying to form slices of the
311  /// alloca. This will be null if the alloca slices are analyzed successfully.
312  Instruction *PointerEscapingInstr;
313 
314  /// The slices of the alloca.
315  ///
316  /// We store a vector of the slices formed by uses of the alloca here. This
317  /// vector is sorted by increasing begin offset, and then the unsplittable
318  /// slices before the splittable ones. See the Slice inner class for more
319  /// details.
320  SmallVector<Slice, 8> Slices;
321 
322  /// Instructions which will become dead if we rewrite the alloca.
323  ///
324  /// Note that these are not separated by slice. This is because we expect an
325  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
326  /// all these instructions can simply be removed and replaced with undef as
327  /// they come from outside of the allocated space.
328  SmallVector<Instruction *, 8> DeadUsers;
329 
330  /// Uses which will become dead if we can promote the alloca.
331  SmallVector<Use *, 8> DeadUseIfPromotable;
332 
333  /// Operands which will become dead if we rewrite the alloca.
334  ///
335  /// These are operands that in their particular use can be replaced with
336  /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
337  /// to PHI nodes and the like. They aren't entirely dead (there might be
338  /// a GEP back into the bounds using it elsewhere) and neither is the PHI, but we
339  /// want to swap this particular input for undef to simplify the use lists of
340  /// the alloca.
341  SmallVector<Use *, 8> DeadOperands;
342 };
343 
344 /// A partition of the slices.
345 ///
346 /// An ephemeral representation for a range of slices which can be viewed as
347 /// a partition of the alloca. This range represents a span of the alloca's
348 /// memory which cannot be split, and provides access to all of the slices
349 /// overlapping some part of the partition.
350 ///
351 /// Objects of this type are produced by traversing the alloca's slices, but
352 /// are only ephemeral and not persistent.
353 class llvm::sroa::Partition {
354 private:
355  friend class AllocaSlices;
356  friend class AllocaSlices::partition_iterator;
357 
358  using iterator = AllocaSlices::iterator;
359 
360  /// The beginning and ending offsets of the alloca for this
361  /// partition.
362  uint64_t BeginOffset = 0, EndOffset = 0;
363 
364  /// The start and end iterators of this partition.
365  iterator SI, SJ;
366 
367  /// A collection of split slice tails overlapping the partition.
368  SmallVector<Slice *, 4> SplitTails;
369 
370  /// Raw constructor builds an empty partition starting and ending at
371  /// the given iterator.
372  Partition(iterator SI) : SI(SI), SJ(SI) {}
373 
374 public:
375  /// The start offset of this partition.
376  ///
377  /// All of the contained slices start at or after this offset.
378  uint64_t beginOffset() const { return BeginOffset; }
379 
380  /// The end offset of this partition.
381  ///
382  /// All of the contained slices end at or before this offset.
383  uint64_t endOffset() const { return EndOffset; }
384 
385  /// The size of the partition.
386  ///
387  /// Note that this can never be zero.
388  uint64_t size() const {
389  assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
390  return EndOffset - BeginOffset;
391  }
392 
393  /// Test whether this partition contains no slices, and merely spans
394  /// a region occupied by split slices.
395  bool empty() const { return SI == SJ; }
396 
397  /// \name Iterate slices that start within the partition.
398  /// These may be splittable or unsplittable. They have a begin offset >= the
399  /// partition begin offset.
400  /// @{
401  // FIXME: We should probably define a "concat_iterator" helper and use that
402  // to stitch together pointee_iterators over the split tails and the
403  // contiguous iterators of the partition. That would give a much nicer
404  // interface here. We could then additionally expose filtered iterators for
405  // split, unsplit, and unsplittable slices based on the usage patterns.
406  iterator begin() const { return SI; }
407  iterator end() const { return SJ; }
408  /// @}
409 
410  /// Get the sequence of split slice tails.
411  ///
412  /// These tails are of slices which start before this partition but are
413  /// split and overlap into the partition. We accumulate these while forming
414  /// partitions.
415  ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
416 };
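// A small worked example of how the iterator below forms partitions (the
// offsets are illustrative, not taken from any particular test): given one
// unsplittable slice [0,8) and one splittable slice [4,12), the first
// partition is [0,8) and contains both slices; the splittable slice's tail
// then carries over, so the second partition is the "empty" span [8,12)
// whose only content is that split tail.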
417 
418 /// An iterator over partitions of the alloca's slices.
419 ///
420 /// This iterator implements the core algorithm for partitioning the alloca's
421 /// slices. It is a forward iterator as we don't support backtracking for
422 /// efficiency reasons, and re-use a single storage area to maintain the
423 /// current set of split slices.
424 ///
425 /// It is templated on the slice iterator type to use so that it can operate
426 /// with either const or non-const slice iterators.
427 class AllocaSlices::partition_iterator
428  : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
429  Partition> {
430  friend class AllocaSlices;
431 
432  /// Most of the state for walking the partitions is held in a class
433  /// with a nice interface for examining them.
434  Partition P;
435 
436  /// We need to keep the end of the slices to know when to stop.
437  AllocaSlices::iterator SE;
438 
439  /// We also need to keep track of the maximum split end offset seen.
440  /// FIXME: Do we really?
441  uint64_t MaxSplitSliceEndOffset = 0;
442 
443  /// Sets the partition to be empty at given iterator, and sets the
444  /// end iterator.
445  partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
446  : P(SI), SE(SE) {
447  // If not already at the end, advance our state to form the initial
448  // partition.
449  if (SI != SE)
450  advance();
451  }
452 
453  /// Advance the iterator to the next partition.
454  ///
455  /// Requires that the iterator not be at the end of the slices.
456  void advance() {
457  assert((P.SI != SE || !P.SplitTails.empty()) &&
458  "Cannot advance past the end of the slices!");
459 
460  // Clear out any split uses which have ended.
461  if (!P.SplitTails.empty()) {
462  if (P.EndOffset >= MaxSplitSliceEndOffset) {
463  // If we've finished all splits, this is easy.
464  P.SplitTails.clear();
465  MaxSplitSliceEndOffset = 0;
466  } else {
467  // Remove the uses which have ended in the prior partition. This
468  // cannot change the max split slice end because we just checked that
469  // the prior partition ended prior to that max.
470  llvm::erase_if(P.SplitTails,
471  [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
472  assert(llvm::any_of(P.SplitTails,
473  [&](Slice *S) {
474  return S->endOffset() == MaxSplitSliceEndOffset;
475  }) &&
476  "Could not find the current max split slice offset!");
477  assert(llvm::all_of(P.SplitTails,
478  [&](Slice *S) {
479  return S->endOffset() <= MaxSplitSliceEndOffset;
480  }) &&
481  "Max split slice end offset is not actually the max!");
482  }
483  }
484 
485  // If P.SI is already at the end, then we've cleared the split tail and
486  // now have an end iterator.
487  if (P.SI == SE) {
488  assert(P.SplitTails.empty() && "Failed to clear the split slices!");
489  return;
490  }
491 
492  // If we had a non-empty partition previously, set up the state for
493  // subsequent partitions.
494  if (P.SI != P.SJ) {
495  // Accumulate all the splittable slices which started in the old
496  // partition into the split list.
497  for (Slice &S : P)
498  if (S.isSplittable() && S.endOffset() > P.EndOffset) {
499  P.SplitTails.push_back(&S);
500  MaxSplitSliceEndOffset =
501  std::max(S.endOffset(), MaxSplitSliceEndOffset);
502  }
503 
504  // Start from the end of the previous partition.
505  P.SI = P.SJ;
506 
507  // If P.SI is now at the end, we at most have a tail of split slices.
508  if (P.SI == SE) {
509  P.BeginOffset = P.EndOffset;
510  P.EndOffset = MaxSplitSliceEndOffset;
511  return;
512  }
513 
514  // If we have split slices and the next slice is after a gap and is not
515  // splittable, immediately form an empty partition for the split slices
516  // up until the next slice begins.
517  if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
518  !P.SI->isSplittable()) {
519  P.BeginOffset = P.EndOffset;
520  P.EndOffset = P.SI->beginOffset();
521  return;
522  }
523  }
524 
525  // OK, we need to consume new slices. Set the end offset based on the
526  // current slice, and step SJ past it. The beginning offset of the
527  // partition is the beginning offset of the next slice unless we have
528  // pre-existing split slices that are continuing, in which case we begin
529  // at the prior end offset.
530  P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
531  P.EndOffset = P.SI->endOffset();
532  ++P.SJ;
533 
534  // There are two strategies to form a partition based on whether the
535  // partition starts with an unsplittable slice or a splittable slice.
536  if (!P.SI->isSplittable()) {
537  // When we're forming an unsplittable region, it must always start at
538  // the first slice and will extend through its end.
539  assert(P.BeginOffset == P.SI->beginOffset());
540 
541  // Form a partition including all of the overlapping slices with this
542  // unsplittable slice.
543  while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
544  if (!P.SJ->isSplittable())
545  P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
546  ++P.SJ;
547  }
548 
549  // We have a partition across a set of overlapping unsplittable
550  // slices.
551  return;
552  }
553 
554  // If we're starting with a splittable slice, then we need to form
555  // a synthetic partition spanning it and any other overlapping splittable
556  // slices.
557  assert(P.SI->isSplittable() && "Forming a splittable partition!");
558 
559  // Collect all of the overlapping splittable slices.
560  while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
561  P.SJ->isSplittable()) {
562  P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
563  ++P.SJ;
564  }
565 
566  // Back up P.EndOffset if we ended the span early when encountering an
567  // unsplittable slice. This synthesizes the early end offset of
568  // a partition spanning only splittable slices.
569  if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
570  assert(!P.SJ->isSplittable());
571  P.EndOffset = P.SJ->beginOffset();
572  }
573  }
574 
575 public:
576  bool operator==(const partition_iterator &RHS) const {
577  assert(SE == RHS.SE &&
578  "End iterators don't match between compared partition iterators!");
579 
580  // The observed position of a partition is marked by the P.SI iterator and
581  // the emptiness of the split slices. The latter is only relevant when
582  // P.SI == SE, as the end iterator will additionally have an empty split
583  // slices list, but the prior may have the same P.SI and a tail of split
584  // slices.
585  if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
586  assert(P.SJ == RHS.P.SJ &&
587  "Same set of slices formed two different sized partitions!");
588  assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
589  "Same slice position with differently sized non-empty split "
590  "slice tails!");
591  return true;
592  }
593  return false;
594  }
595 
596  partition_iterator &operator++() {
597  advance();
598  return *this;
599  }
600 
601  Partition &operator*() { return P; }
602 };
603 
604 /// A forward range over the partitions of the alloca's slices.
605 ///
606 /// This accesses an iterator range over the partitions of the alloca's
607 /// slices. It computes these partitions on the fly based on the overlapping
608 /// offsets of the slices and the ability to split them. It will visit "empty"
609 /// partitions to cover regions of the alloca only accessed via split
610 /// slices.
611 iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
612  return make_range(partition_iterator(begin(), end()),
613  partition_iterator(end(), end()));
614 }
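// A sketch of how this range is typically consumed by the rewriting logic
// later in this file (shown only for orientation, not an additional API):
//
//   AllocaSlices AS(DL, AI);
//   if (!AS.isEscaped())
//     for (Partition &P : AS.partitions()) {
//       // P covers [P.beginOffset(), P.endOffset()) and exposes both the
//       // slices beginning in that span and any overlapping split tails.
//     }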
615 
616 static Value *foldSelectInst(SelectInst &SI) {
617  // If the condition being selected on is a constant or the same value is
618  // being selected between, fold the select. Yes this does (rarely) happen
619  // early on.
620  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
621  return SI.getOperand(1 + CI->isZero());
622  if (SI.getOperand(1) == SI.getOperand(2))
623  return SI.getOperand(1);
624 
625  return nullptr;
626 }
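// For instance, "select i1 true, i32* %a, i32* %b" folds to %a, and
// "select i1 %c, i32* %p, i32* %p" folds to %p; anything else yields null.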
627 
628 /// A helper that folds a PHI node or a select.
629 static Value *foldPHINodeOrSelectInst(Instruction &I) {
630  if (PHINode *PN = dyn_cast<PHINode>(&I)) {
631  // If PN merges together the same value, return that value.
632  return PN->hasConstantValue();
633  }
634  return foldSelectInst(cast<SelectInst>(I));
635 }
636 
637 /// Builder for the alloca slices.
638 ///
639 /// This class builds a set of alloca slices by recursively visiting the uses
640 /// of an alloca and making a slice for each load and store at each offset.
641 class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
642  friend class PtrUseVisitor<SliceBuilder>;
643  friend class InstVisitor<SliceBuilder>;
644 
645  using Base = PtrUseVisitor<SliceBuilder>;
646 
647  const uint64_t AllocSize;
648  AllocaSlices &AS;
649 
650  SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
651  SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
652 
653  /// Set to de-duplicate dead instructions found in the use walk.
654  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
655 
656 public:
657  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
658      : PtrUseVisitor<SliceBuilder>(DL),
659  AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()),
660  AS(AS) {}
661 
662 private:
663  void markAsDead(Instruction &I) {
664  if (VisitedDeadInsts.insert(&I).second)
665  AS.DeadUsers.push_back(&I);
666  }
667 
668  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
669  bool IsSplittable = false) {
670  // Completely skip uses which have a zero size or start either before or
671  // past the end of the allocation.
672  if (Size == 0 || Offset.uge(AllocSize)) {
673  LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
674  << Offset
675  << " which has zero size or starts outside of the "
676  << AllocSize << " byte alloca:\n"
677  << " alloca: " << AS.AI << "\n"
678  << " use: " << I << "\n");
679  return markAsDead(I);
680  }
681 
682  uint64_t BeginOffset = Offset.getZExtValue();
683  uint64_t EndOffset = BeginOffset + Size;
684 
685  // Clamp the end offset to the end of the allocation. Note that this is
686  // formulated to handle even the case where "BeginOffset + Size" overflows.
687  // This may appear superficially to be something we could ignore entirely,
688  // but that is not so! There may be widened loads or PHI-node uses where
689  // some instructions are dead but not others. We can't completely ignore
690  // them, and so have to record at least the information here.
691  assert(AllocSize >= BeginOffset); // Established above.
692  if (Size > AllocSize - BeginOffset) {
693  LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
694  << Offset << " to remain within the " << AllocSize
695  << " byte alloca:\n"
696  << " alloca: " << AS.AI << "\n"
697  << " use: " << I << "\n");
698  EndOffset = AllocSize;
699  }
700 
701  AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
702  }
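  // For example (sizes made up for illustration): with a 16-byte alloca, an
  // 8-byte use starting at offset 12 is recorded as the clamped slice
  // [12,16), while a use starting at or past offset 16 is marked dead.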
703 
704  void visitBitCastInst(BitCastInst &BC) {
705  if (BC.use_empty())
706  return markAsDead(BC);
707 
708  return Base::visitBitCastInst(BC);
709  }
710 
711  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
712  if (ASC.use_empty())
713  return markAsDead(ASC);
714 
715  return Base::visitAddrSpaceCastInst(ASC);
716  }
717 
718  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
719  if (GEPI.use_empty())
720  return markAsDead(GEPI);
721 
722  if (SROAStrictInbounds && GEPI.isInBounds()) {
723  // FIXME: This is a manually un-factored variant of the basic code inside
724  // of GEPs with checking of the inbounds invariant specified in the
725  // langref in a very strict sense. If we ever want to enable
726  // SROAStrictInbounds, this code should be factored cleanly into
727  // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
728  // by writing out the code here where we have the underlying allocation
729  // size readily available.
730  APInt GEPOffset = Offset;
731  const DataLayout &DL = GEPI.getModule()->getDataLayout();
732  for (gep_type_iterator GTI = gep_type_begin(GEPI),
733  GTE = gep_type_end(GEPI);
734  GTI != GTE; ++GTI) {
735  ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
736  if (!OpC)
737  break;
738 
739  // Handle a struct index, which adds its field offset to the pointer.
740  if (StructType *STy = GTI.getStructTypeOrNull()) {
741  unsigned ElementIdx = OpC->getZExtValue();
742  const StructLayout *SL = DL.getStructLayout(STy);
743  GEPOffset +=
744  APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
745  } else {
746  // For array or vector indices, scale the index by the size of the
747  // type.
748  APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
749  GEPOffset +=
750  Index *
751  APInt(Offset.getBitWidth(),
752  DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
753  }
754 
755  // If this index has computed an intermediate pointer which is not
756  // inbounds, then the result of the GEP is a poison value and we can
757  // delete it and all uses.
758  if (GEPOffset.ugt(AllocSize))
759  return markAsDead(GEPI);
760  }
761  }
762 
763  return Base::visitGetElementPtrInst(GEPI);
764  }
765 
766  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
767  uint64_t Size, bool IsVolatile) {
768  // We allow splitting of non-volatile loads and stores where the type is an
769  // integer type. These may be used to implement 'memcpy' or other "transfer
770  // of bits" patterns.
771  bool IsSplittable =
772  Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
773 
774  insertUse(I, Offset, Size, IsSplittable);
775  }
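  // Concretely (assumed examples): a plain "load i64" yields a splittable
  // slice, whereas a volatile load, a "load <2 x i32>", or an i1 load (whose
  // store size is a full byte but whose type size is a single bit) is
  // recorded as unsplittable.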
776 
777  void visitLoadInst(LoadInst &LI) {
778  assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
779  "All simple FCA loads should have been pre-split");
780 
781  if (!IsOffsetKnown)
782  return PI.setAborted(&LI);
783 
784  if (LI.isVolatile() &&
785  LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
786  return PI.setAborted(&LI);
787 
788  if (isa<ScalableVectorType>(LI.getType()))
789  return PI.setAborted(&LI);
790 
791  uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize();
792  return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
793  }
794 
795  void visitStoreInst(StoreInst &SI) {
796  Value *ValOp = SI.getValueOperand();
797  if (ValOp == *U)
798  return PI.setEscapedAndAborted(&SI);
799  if (!IsOffsetKnown)
800  return PI.setAborted(&SI);
801 
802  if (SI.isVolatile() &&
803  SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
804  return PI.setAborted(&SI);
805 
806  if (isa<ScalableVectorType>(ValOp->getType()))
807  return PI.setAborted(&SI);
808 
809  uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize();
810 
811  // If this memory access can be shown to *statically* extend outside the
812  // bounds of the allocation, its behavior is undefined, so simply
813  // ignore it. Note that this is more strict than the generic clamping
814  // behavior of insertUse. We also try to handle cases which might run the
815  // risk of overflow.
816  // FIXME: We should instead consider the pointer to have escaped if this
817  // function is being instrumented for addressing bugs or race conditions.
818  if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
819  LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
820  << Offset << " which extends past the end of the "
821  << AllocSize << " byte alloca:\n"
822  << " alloca: " << AS.AI << "\n"
823  << " use: " << SI << "\n");
824  return markAsDead(SI);
825  }
826 
827  assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
828  "All simple FCA stores should have been pre-split");
829  handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
830  }
831 
832  void visitMemSetInst(MemSetInst &II) {
833  assert(II.getRawDest() == *U && "Pointer use is not the destination?");
834  ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
835  if ((Length && Length->getValue() == 0) ||
836  (IsOffsetKnown && Offset.uge(AllocSize)))
837  // Zero-length mem transfer intrinsics can be ignored entirely.
838  return markAsDead(II);
839 
840  if (!IsOffsetKnown)
841  return PI.setAborted(&II);
842 
843  // Don't replace this with a store with a different address space. TODO:
844  // Use a store with the casted new alloca?
845  if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace())
846  return PI.setAborted(&II);
847 
848  insertUse(II, Offset, Length ? Length->getLimitedValue()
849  : AllocSize - Offset.getLimitedValue(),
850  (bool)Length);
851  }
852 
853  void visitMemTransferInst(MemTransferInst &II) {
854  ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
855  if (Length && Length->getValue() == 0)
856  // Zero-length mem transfer intrinsics can be ignored entirely.
857  return markAsDead(II);
858 
859  // Because we can visit these intrinsics twice, also check to see if the
860  // first time marked this instruction as dead. If so, skip it.
861  if (VisitedDeadInsts.count(&II))
862  return;
863 
864  if (!IsOffsetKnown)
865  return PI.setAborted(&II);
866 
867  // Don't replace this with a load/store with a different address space.
868  // TODO: Use a store with the casted new alloca?
869  if (II.isVolatile() &&
870  (II.getDestAddressSpace() != DL.getAllocaAddrSpace() ||
871  II.getSourceAddressSpace() != DL.getAllocaAddrSpace()))
872  return PI.setAborted(&II);
873 
874  // This side of the transfer is completely out-of-bounds, and so we can
875  // nuke the entire transfer. However, we also need to nuke the other side
876  // if already added to our partitions.
877  // FIXME: Yet another place we really should bypass this when
878  // instrumenting for ASan.
879  if (Offset.uge(AllocSize)) {
880  SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
881  MemTransferSliceMap.find(&II);
882  if (MTPI != MemTransferSliceMap.end())
883  AS.Slices[MTPI->second].kill();
884  return markAsDead(II);
885  }
886 
887  uint64_t RawOffset = Offset.getLimitedValue();
888  uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
889 
890  // Check for the special case where the same exact value is used for both
891  // source and dest.
892  if (*U == II.getRawDest() && *U == II.getRawSource()) {
893  // For non-volatile transfers this is a no-op.
894  if (!II.isVolatile())
895  return markAsDead(II);
896 
897  return insertUse(II, Offset, Size, /*IsSplittable=*/false);
898  }
899 
900  // If we have seen both source and destination for a mem transfer, then
901  // they both point to the same alloca.
902  bool Inserted;
903  SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
904  std::tie(MTPI, Inserted) =
905  MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
906  unsigned PrevIdx = MTPI->second;
907  if (!Inserted) {
908  Slice &PrevP = AS.Slices[PrevIdx];
909 
910  // Check if the begin offsets match and this is a non-volatile transfer.
911  // In that case, we can completely elide the transfer.
912  if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
913  PrevP.kill();
914  return markAsDead(II);
915  }
916 
917  // Otherwise we have an offset transfer within the same alloca. We can't
918  // split those.
919  PrevP.makeUnsplittable();
920  }
921 
922  // Insert the use now that we've fixed up the splittable nature.
923  insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
924 
925  // Check that we ended up with a valid index in the map.
926  assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
927  "Map index doesn't point back to a slice with this user.");
928  }
929 
930  // Disable SROA for any intrinsics except for lifetime intrinsics and the
931  // invariant group intrinsics.
932  // FIXME: What about debug intrinsics? This matches old behavior, but
933  // doesn't make sense.
934  void visitIntrinsicInst(IntrinsicInst &II) {
935  if (II.isDroppable()) {
936  AS.DeadUseIfPromotable.push_back(U);
937  return;
938  }
939 
940  if (!IsOffsetKnown)
941  return PI.setAborted(&II);
942 
943  if (II.isLifetimeStartOrEnd()) {
944  ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
945  uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
946  Length->getLimitedValue());
947  insertUse(II, Offset, Size, true);
948  return;
949  }
950 
951  if (II.isLaunderOrStripInvariantGroup()) {
952  enqueueUsers(II);
953  return;
954  }
955 
956  Base::visitIntrinsicInst(II);
957  }
958 
959  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
960  // We consider any PHI or select that results in a direct load or store of
961  // the same offset to be a viable use for slicing purposes. These uses
962  // are considered unsplittable and the size is the maximum loaded or stored
963  // size.
964  SmallPtrSet<Instruction *, 4> Visited;
965  SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
966  Visited.insert(Root);
967  Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
968  const DataLayout &DL = Root->getModule()->getDataLayout();
969  // If there are no loads or stores, the access is dead. We mark that as
970  // a size zero access.
971  Size = 0;
972  do {
973  Instruction *I, *UsedI;
974  std::tie(UsedI, I) = Uses.pop_back_val();
975 
976  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
977  Size = std::max(Size,
978  DL.getTypeStoreSize(LI->getType()).getFixedSize());
979  continue;
980  }
981  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
982  Value *Op = SI->getOperand(0);
983  if (Op == UsedI)
984  return SI;
985  Size = std::max(Size,
986  DL.getTypeStoreSize(Op->getType()).getFixedSize());
987  continue;
988  }
989 
990  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
991  if (!GEP->hasAllZeroIndices())
992  return GEP;
993  } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
994  !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
995  return I;
996  }
997 
998  for (User *U : I->users())
999  if (Visited.insert(cast<Instruction>(U)).second)
1000  Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1001  } while (!Uses.empty());
1002 
1003  return nullptr;
1004  }
1005 
1006  void visitPHINodeOrSelectInst(Instruction &I) {
1007  assert(isa<PHINode>(I) || isa<SelectInst>(I));
1008  if (I.use_empty())
1009  return markAsDead(I);
1010 
1011  // TODO: We could use SimplifyInstruction here to fold PHINodes and
1012  // SelectInsts. However, doing so requires to change the current
1013  // dead-operand-tracking mechanism. For instance, suppose neither loading
1014  // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1015  // trap either. However, if we simply replace %U with undef using the
1016  // current dead-operand-tracking mechanism, "load (select undef, undef,
1017  // %other)" may trap because the select may return the first operand
1018  // "undef".
1019  if (Value *Result = foldPHINodeOrSelectInst(I)) {
1020  if (Result == *U)
1021  // If the result of the constant fold will be the pointer, recurse
1022  // through the PHI/select as if we had RAUW'ed it.
1023  enqueueUsers(I);
1024  else
1025  // Otherwise the operand to the PHI/select is dead, and we can replace
1026  // it with undef.
1027  AS.DeadOperands.push_back(U);
1028 
1029  return;
1030  }
1031 
1032  if (!IsOffsetKnown)
1033  return PI.setAborted(&I);
1034 
1035  // See if we already have computed info on this node.
1036  uint64_t &Size = PHIOrSelectSizes[&I];
1037  if (!Size) {
1038  // This is a new PHI/Select, check for an unsafe use of it.
1039  if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1040  return PI.setAborted(UnsafeI);
1041  }
1042 
1043  // For PHI and select operands outside the alloca, we can't nuke the entire
1044  // phi or select -- the other side might still be relevant, so we special
1045  // case them here and use a separate structure to track the operands
1046  // themselves which should be replaced with undef.
1047  // FIXME: This should instead be escaped in the event we're instrumenting
1048  // for address sanitization.
1049  if (Offset.uge(AllocSize)) {
1050  AS.DeadOperands.push_back(U);
1051  return;
1052  }
1053 
1054  insertUse(I, Offset, Size);
1055  }
1056 
1057  void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1058 
1059  void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1060 
1061  /// Disable SROA entirely if there are unhandled users of the alloca.
1062  void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1063 };
1064 
1065 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1066  :
1067 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1068  AI(AI),
1069 #endif
1070  PointerEscapingInstr(nullptr) {
1071  SliceBuilder PB(DL, AI, *this);
1072  SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1073  if (PtrI.isEscaped() || PtrI.isAborted()) {
1074  // FIXME: We should sink the escape vs. abort info into the caller nicely,
1075  // possibly by just storing the PtrInfo in the AllocaSlices.
1076  PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1077  : PtrI.getAbortingInst();
1078  assert(PointerEscapingInstr && "Did not track a bad instruction");
1079  return;
1080  }
1081 
1082  llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1083 
1084  // Sort the uses. This arranges for the offsets to be in ascending order,
1085  // and the sizes to be in descending order.
1086  llvm::stable_sort(Slices);
1087 }
1088 
1089 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1090 
1091 void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1092  StringRef Indent) const {
1093  printSlice(OS, I, Indent);
1094  OS << "\n";
1095  printUse(OS, I, Indent);
1096 }
1097 
1098 void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1099  StringRef Indent) const {
1100  OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1101  << " slice #" << (I - begin())
1102  << (I->isSplittable() ? " (splittable)" : "");
1103 }
1104 
1105 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1106  StringRef Indent) const {
1107  OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1108 }
1109 
1110 void AllocaSlices::print(raw_ostream &OS) const {
1111  if (PointerEscapingInstr) {
1112  OS << "Can't analyze slices for alloca: " << AI << "\n"
1113  << " A pointer to this alloca escaped by:\n"
1114  << " " << *PointerEscapingInstr << "\n";
1115  return;
1116  }
1117 
1118  OS << "Slices of alloca: " << AI << "\n";
1119  for (const_iterator I = begin(), E = end(); I != E; ++I)
1120  print(OS, I);
1121 }
1122 
1123 LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1124  print(dbgs(), I);
1125 }
1126 LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1127 
1128 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1129 
1130 /// Walk the range of a partitioning looking for a common type to cover this
1131 /// sequence of slices.
1132 static std::pair<Type *, IntegerType *>
1133 findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1134  uint64_t EndOffset) {
1135  Type *Ty = nullptr;
1136  bool TyIsCommon = true;
1137  IntegerType *ITy = nullptr;
1138 
1139  // Note that we need to look at *every* alloca slice's Use to ensure we
1140  // always get consistent results regardless of the order of slices.
1141  for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1142  Use *U = I->getUse();
1143  if (isa<IntrinsicInst>(*U->getUser()))
1144  continue;
1145  if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1146  continue;
1147 
1148  Type *UserTy = nullptr;
1149  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1150  UserTy = LI->getType();
1151  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1152  UserTy = SI->getValueOperand()->getType();
1153  }
1154 
1155  if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1156  // If the type is larger than the partition, skip it. We only encounter
1157  // this for split integer operations where we want to use the type of the
1158  // entity causing the split. Also skip if the type is not a byte width
1159  // multiple.
1160  if (UserITy->getBitWidth() % 8 != 0 ||
1161  UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1162  continue;
1163 
1164  // Track the largest bitwidth integer type used in this way in case there
1165  // is no common type.
1166  if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1167  ITy = UserITy;
1168  }
1169 
1170  // To avoid depending on the order of slices, Ty and TyIsCommon must not
1171  // depend on types skipped above.
1172  if (!UserTy || (Ty && Ty != UserTy))
1173  TyIsCommon = false; // Give up on anything but an iN type.
1174  else
1175  Ty = UserTy;
1176  }
1177 
1178  return {TyIsCommon ? Ty : nullptr, ITy};
1179 }
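// As an illustration (a hypothetical partition, not drawn from a test): if
// the slices covering the whole partition are an i32 load, an i32 store and
// a float load, no common type is returned, but i32 is still reported as the
// widest usable integer type for a bitfield-style rewrite.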
1180 
1181 /// PHI instructions that use an alloca and are subsequently loaded can be
1182 /// rewritten to load both input pointers in the pred blocks and then PHI the
1183 /// results, allowing the load of the alloca to be promoted.
1184 /// From this:
1185 /// %P2 = phi [i32* %Alloca, i32* %Other]
1186 /// %V = load i32* %P2
1187 /// to:
1188 /// %V1 = load i32* %Alloca -> will be mem2reg'd
1189 /// ...
1190 /// %V2 = load i32* %Other
1191 /// ...
1192 /// %V = phi [i32 %V1, i32 %V2]
1193 ///
1194 /// We can do this to a select if its only uses are loads and if the operands
1195 /// to the select can be loaded unconditionally.
1196 ///
1197 /// FIXME: This should be hoisted into a generic utility, likely in
1198 /// Transforms/Util/Local.h
1199 static bool isSafePHIToSpeculate(PHINode &PN) {
1200  const DataLayout &DL = PN.getModule()->getDataLayout();
1201 
1202  // For now, we can only do this promotion if the load is in the same block
1203  // as the PHI, and if there are no stores between the phi and load.
1204  // TODO: Allow recursive phi users.
1205  // TODO: Allow stores.
1206  BasicBlock *BB = PN.getParent();
1207  Align MaxAlign;
1208  uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1209  APInt MaxSize(APWidth, 0);
1210  bool HaveLoad = false;
1211  for (User *U : PN.users()) {
1212  LoadInst *LI = dyn_cast<LoadInst>(U);
1213  if (!LI || !LI->isSimple())
1214  return false;
1215 
1216  // For now we only allow loads in the same block as the PHI. This is
1217  // a common case that happens when instcombine merges two loads through
1218  // a PHI.
1219  if (LI->getParent() != BB)
1220  return false;
1221 
1222  // Ensure that there are no instructions between the PHI and the load that
1223  // could store.
1224  for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1225  if (BBI->mayWriteToMemory())
1226  return false;
1227 
1228  uint64_t Size = DL.getTypeStoreSize(LI->getType()).getFixedSize();
1229  MaxAlign = std::max(MaxAlign, LI->getAlign());
1230  MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize;
1231  HaveLoad = true;
1232  }
1233 
1234  if (!HaveLoad)
1235  return false;
1236 
1237  // We can only transform this if it is safe to push the loads into the
1238  // predecessor blocks. The only thing to watch out for is that we can't put
1239  // a possibly trapping load in the predecessor if it is a critical edge.
1240  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1241  Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1242  Value *InVal = PN.getIncomingValue(Idx);
1243 
1244  // If the value is produced by the terminator of the predecessor (an
1245  // invoke) or it has side-effects, there is no valid place to put a load
1246  // in the predecessor.
1247  if (TI == InVal || TI->mayHaveSideEffects())
1248  return false;
1249 
1250  // If the predecessor has a single successor, then the edge isn't
1251  // critical.
1252  if (TI->getNumSuccessors() == 1)
1253  continue;
1254 
1255  // If this pointer is always safe to load, or if we can prove that there
1256  // is already a load in the block, then we can move the load to the pred
1257  // block.
1258  if (isSafeToLoadUnconditionally(InVal, MaxAlign, MaxSize, DL, TI))
1259  continue;
1260 
1261  return false;
1262  }
1263 
1264  return true;
1265 }
1266 
1267 static void speculatePHINodeLoads(PHINode &PN) {
1268  LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1269 
1270  LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1271  Type *LoadTy = SomeLoad->getType();
1272  IRBuilderTy PHIBuilder(&PN);
1273  PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1274  PN.getName() + ".sroa.speculated");
1275 
1276  // Get the AA tags and alignment to use from one of the loads. It does not
1277  // matter which one we get or whether any of them differ.
1278  AAMDNodes AATags;
1279  SomeLoad->getAAMetadata(AATags);
1280  Align Alignment = SomeLoad->getAlign();
1281 
1282  // Rewrite all loads of the PN to use the new PHI.
1283  while (!PN.use_empty()) {
1284  LoadInst *LI = cast<LoadInst>(PN.user_back());
1285  LI->replaceAllUsesWith(NewPN);
1286  LI->eraseFromParent();
1287  }
1288 
1289  // Inject loads into all of the pred blocks.
1290  DenseMap<BasicBlock*, Value*> InjectedLoads;
1291  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1292  BasicBlock *Pred = PN.getIncomingBlock(Idx);
1293  Value *InVal = PN.getIncomingValue(Idx);
1294 
1295  // A PHI node is allowed to have multiple (duplicated) entries for the same
1296  // basic block, as long as the value is the same. So if we already injected
1297  // a load in the predecessor, then we should reuse the same load for all
1298  // duplicated entries.
1299  if (Value* V = InjectedLoads.lookup(Pred)) {
1300  NewPN->addIncoming(V, Pred);
1301  continue;
1302  }
1303 
1304  Instruction *TI = Pred->getTerminator();
1305  IRBuilderTy PredBuilder(TI);
1306 
1307  LoadInst *Load = PredBuilder.CreateAlignedLoad(
1308  LoadTy, InVal, Alignment,
1309  (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1310  ++NumLoadsSpeculated;
1311  if (AATags)
1312  Load->setAAMetadata(AATags);
1313  NewPN->addIncoming(Load, Pred);
1314  InjectedLoads[Pred] = Load;
1315  }
1316 
1317  LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1318  PN.eraseFromParent();
1319 }
1320 
1321 /// Select instructions that use an alloca and are subsequently loaded can be
1322 /// rewritten to load both input pointers and then select between the result,
1323 /// allowing the load of the alloca to be promoted.
1324 /// From this:
1325 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
1326 /// %V = load i32* %P2
1327 /// to:
1328 /// %V1 = load i32* %Alloca -> will be mem2reg'd
1329 /// %V2 = load i32* %Other
1330 /// %V = select i1 %cond, i32 %V1, i32 %V2
1331 ///
1332 /// We can do this to a select if its only uses are loads and if the operands
1333 /// to the select can be loaded unconditionally.
1334 static bool isSafeSelectToSpeculate(SelectInst &SI) {
1335  Value *TValue = SI.getTrueValue();
1336  Value *FValue = SI.getFalseValue();
1337  const DataLayout &DL = SI.getModule()->getDataLayout();
1338 
1339  for (User *U : SI.users()) {
1340  LoadInst *LI = dyn_cast<LoadInst>(U);
1341  if (!LI || !LI->isSimple())
1342  return false;
1343 
1344  // Both operands to the select need to be dereferenceable, either
1345  // absolutely (e.g. allocas) or at this point because we can see other
1346  // accesses to it.
1347  if (!isSafeToLoadUnconditionally(TValue, LI->getType(),
1348  LI->getAlign(), DL, LI))
1349  return false;
1350  if (!isSafeToLoadUnconditionally(FValue, LI->getType(),
1351  LI->getAlign(), DL, LI))
1352  return false;
1353  }
1354 
1355  return true;
1356 }
1357 
1358 static void speculateSelectInstLoads(SelectInst &SI) {
1359  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
1360 
1361  IRBuilderTy IRB(&SI);
1362  Value *TV = SI.getTrueValue();
1363  Value *FV = SI.getFalseValue();
1364  // Replace the loads of the select with a select of two loads.
1365  while (!SI.use_empty()) {
1366  LoadInst *LI = cast<LoadInst>(SI.user_back());
1367  assert(LI->isSimple() && "We only speculate simple loads");
1368 
1369  IRB.SetInsertPoint(LI);
1370  LoadInst *TL = IRB.CreateLoad(LI->getType(), TV,
1371  LI->getName() + ".sroa.speculate.load.true");
1372  LoadInst *FL = IRB.CreateLoad(LI->getType(), FV,
1373  LI->getName() + ".sroa.speculate.load.false");
1374  NumLoadsSpeculated += 2;
1375 
1376  // Transfer alignment and AA info if present.
1377  TL->setAlignment(LI->getAlign());
1378  FL->setAlignment(LI->getAlign());
1379 
1380  AAMDNodes Tags;
1381  LI->getAAMetadata(Tags);
1382  if (Tags) {
1383  TL->setAAMetadata(Tags);
1384  FL->setAAMetadata(Tags);
1385  }
1386 
1387  Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1388  LI->getName() + ".sroa.speculated");
1389 
1390  LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1391  LI->replaceAllUsesWith(V);
1392  LI->eraseFromParent();
1393  }
1394  SI.eraseFromParent();
1395 }
1396 
1397 /// Build a GEP out of a base pointer and indices.
1398 ///
1399 /// This will return the BasePtr if that is valid, or build a new GEP
1400 /// instruction using the IRBuilder if GEP-ing is needed.
1401 static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
1402  SmallVectorImpl<Value *> &Indices,
1403  const Twine &NamePrefix) {
1404  if (Indices.empty())
1405  return BasePtr;
1406 
1407  // A single zero index is a no-op, so check for this and avoid building a GEP
1408  // in that case.
1409  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
1410  return BasePtr;
1411 
1412  return IRB.CreateInBoundsGEP(BasePtr->getType()->getPointerElementType(),
1413  BasePtr, Indices, NamePrefix + "sroa_idx");
1414 }
1415 
1416 /// Get a natural GEP off of the BasePtr walking through Ty toward
1417 /// TargetTy without changing the offset of the pointer.
1418 ///
1419 /// This routine assumes we've already established a properly offset GEP with
1420 /// Indices, and arrived at the Ty type. The goal is to continue to GEP with
1421 /// zero-indices down through type layers until we find one the same as
1422 /// TargetTy. If we can't find one with the same type, we at least try to use
1423 /// one with the same size. If none of that works, we just produce the GEP as
1424 /// indicated by Indices to have the correct offset.
1425 static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
1426  Value *BasePtr, Type *Ty, Type *TargetTy,
1427  SmallVectorImpl<Value *> &Indices,
1428  const Twine &NamePrefix) {
1429  if (Ty == TargetTy)
1430  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
1431 
1432  // Offset size to use for the indices.
1433  unsigned OffsetSize = DL.getIndexTypeSizeInBits(BasePtr->getType());
1434 
1435  // See if we can descend into a struct and locate a field with the correct
1436  // type.
1437  unsigned NumLayers = 0;
1438  Type *ElementTy = Ty;
1439  do {
1440  if (ElementTy->isPointerTy())
1441  break;
1442 
1443  if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
1444  ElementTy = ArrayTy->getElementType();
1445  Indices.push_back(IRB.getIntN(OffsetSize, 0));
1446  } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
1447  ElementTy = VectorTy->getElementType();
1448  Indices.push_back(IRB.getInt32(0));
1449  } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
1450  if (STy->element_begin() == STy->element_end())
1451  break; // Nothing left to descend into.
1452  ElementTy = *STy->element_begin();
1453  Indices.push_back(IRB.getInt32(0));
1454  } else {
1455  break;
1456  }
1457  ++NumLayers;
1458  } while (ElementTy != TargetTy);
1459  if (ElementTy != TargetTy)
1460  Indices.erase(Indices.end() - NumLayers, Indices.end());
1461 
1462  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
1463 }
1464 
1465 /// Recursively compute indices for a natural GEP.
1466 ///
1467 /// This is the recursive step for getNaturalGEPWithOffset that walks down the
1468 /// element types adding appropriate indices for the GEP.
1469 static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
1470  Value *Ptr, Type *Ty, APInt &Offset,
1471  Type *TargetTy,
1472  SmallVectorImpl<Value *> &Indices,
1473  const Twine &NamePrefix) {
1474  if (Offset == 0)
1475  return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
1476  NamePrefix);
1477 
1478  // We can't recurse through pointer types.
1479  if (Ty->isPointerTy())
1480  return nullptr;
1481 
1482  // We try to analyze GEPs over vectors here, but note that these GEPs are
1483  // extremely poorly defined currently. The long-term goal is to remove GEPing
1484  // over a vector from the IR completely.
1485  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
1486  unsigned ElementSizeInBits =
1487  DL.getTypeSizeInBits(VecTy->getScalarType()).getFixedSize();
1488  if (ElementSizeInBits % 8 != 0) {
1489  // GEPs over non-multiple of 8 size vector elements are invalid.
1490  return nullptr;
1491  }
1492  APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
1493  APInt NumSkippedElements = Offset.sdiv(ElementSize);
1494  if (NumSkippedElements.ugt(cast<FixedVectorType>(VecTy)->getNumElements()))
1495  return nullptr;
1496  Offset -= NumSkippedElements * ElementSize;
1497  Indices.push_back(IRB.getInt(NumSkippedElements));
1498  return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
1499  Offset, TargetTy, Indices, NamePrefix);
1500  }
1501 
1502  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
1503  Type *ElementTy = ArrTy->getElementType();
1504  APInt ElementSize(Offset.getBitWidth(),
1505  DL.getTypeAllocSize(ElementTy).getFixedSize());
1506  APInt NumSkippedElements = Offset.sdiv(ElementSize);
1507  if (NumSkippedElements.ugt(ArrTy->getNumElements()))
1508  return nullptr;
1509 
1510  Offset -= NumSkippedElements * ElementSize;
1511  Indices.push_back(IRB.getInt(NumSkippedElements));
1512  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
1513  Indices, NamePrefix);
1514  }
1515 
1516  StructType *STy = dyn_cast<StructType>(Ty);
1517  if (!STy)
1518  return nullptr;
1519 
1520  const StructLayout *SL = DL.getStructLayout(STy);
1521  uint64_t StructOffset = Offset.getZExtValue();
1522  if (StructOffset >= SL->getSizeInBytes())
1523  return nullptr;
1524  unsigned Index = SL->getElementContainingOffset(StructOffset);
1525  Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
1526  Type *ElementTy = STy->getElementType(Index);
1527  if (Offset.uge(DL.getTypeAllocSize(ElementTy).getFixedSize()))
1528  return nullptr; // The offset points into alignment padding.
1529 
1530  Indices.push_back(IRB.getInt32(Index));
1531  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
1532  Indices, NamePrefix);
1533 }
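// A small worked example of the offset arithmetic above: recursing into a
// hypothetical [8 x i32] with a byte Offset of 20 gives ElementSize == 4 and
// NumSkippedElements == 5, which is pushed as an index while Offset drops to
// 0; the next recursion then succeeds as soon as i32 matches TargetTy. With
// Offset == 22 the remainder of 2 lands in the middle of an i32, no further
// type can absorb it, and the function returns null.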
1534 
1535 /// Get a natural GEP from a base pointer to a particular offset and
1536 /// resulting in a particular type.
1537 ///
1538 /// The goal is to produce a "natural" looking GEP that works with the existing
1539 /// composite types to arrive at the appropriate offset and element type for
1540 /// a pointer. TargetTy is the element type the returned GEP should point-to if
1541 /// possible. We recurse by decreasing Offset, adding the appropriate index to
1542 /// Indices, and setting Ty to the result subtype.
1543 ///
1544 /// If no natural GEP can be constructed, this function returns null.
1545 static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
1546  Value *Ptr, APInt Offset, Type *TargetTy,
1547  SmallVectorImpl<Value *> &Indices,
1548  const Twine &NamePrefix) {
1549  PointerType *Ty = cast<PointerType>(Ptr->getType());
1550 
1551  // Don't consider any GEPs through an i8* as natural unless the TargetTy is
1552  // an i8.
1553  if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
1554  return nullptr;
1555 
1556  Type *ElementTy = Ty->getElementType();
1557  if (!ElementTy->isSized())
1558  return nullptr; // We can't GEP through an unsized element.
1559  if (isa<ScalableVectorType>(ElementTy))
1560  return nullptr;
1561  APInt ElementSize(Offset.getBitWidth(),
1562  DL.getTypeAllocSize(ElementTy).getFixedSize());
1563  if (ElementSize == 0)
1564  return nullptr; // Zero-length arrays can't help us build a natural GEP.
1565  APInt NumSkippedElements = Offset.sdiv(ElementSize);
1566 
1567  Offset -= NumSkippedElements * ElementSize;
1568  Indices.push_back(IRB.getInt(NumSkippedElements));
1569  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
1570  Indices, NamePrefix);
1571 }
1572 
1573 /// Compute an adjusted pointer from Ptr by Offset bytes where the
1574 /// resulting pointer has PointerTy.
1575 ///
1576 /// This tries very hard to compute a "natural" GEP which arrives at the offset
1577 /// and produces the pointer type desired. Where it cannot, it will try to use
1578 /// the natural GEP to arrive at the offset and bitcast to the type. Where that
1579 /// fails, it will try to use an existing i8* and GEP to the byte offset and
1580 /// bitcast to the type.
1581 ///
1582 /// The strategy for finding the more natural GEPs is to peel off layers of the
1583 /// pointer, walking back through bit casts and GEPs, searching for a base
1584 /// pointer from which we can compute a natural GEP with the desired
1585 /// properties. The algorithm tries to fold as many constant indices into
1586 /// a single GEP as possible, thus making each GEP more independent of the
1587 /// surrounding code.
1588 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1589  APInt Offset, Type *PointerTy,
1590  const Twine &NamePrefix) {
1591  // Even though we don't look through PHI nodes, we could be called on an
1592  // instruction in an unreachable block, which may be on a cycle.
1593  SmallPtrSet<Value *, 4> Visited;
1594  Visited.insert(Ptr);
1595  SmallVector<Value *, 4> Indices;
1596 
1597  // We may end up computing an offset pointer that has the wrong type. If we
1598  // never are able to compute one directly that has the correct type, we'll
1599  // fall back to it, so keep it and the base it was computed from around here.
1600  Value *OffsetPtr = nullptr;
1601  Value *OffsetBasePtr;
1602 
1603  // Remember any i8 pointer we come across to re-use if we need to do a raw
1604  // byte offset.
1605  Value *Int8Ptr = nullptr;
1606  APInt Int8PtrOffset(Offset.getBitWidth(), 0);
1607 
1608  PointerType *TargetPtrTy = cast<PointerType>(PointerTy);
1609  Type *TargetTy = TargetPtrTy->getElementType();
1610 
1611  // Because of intervening `addrspacecast`s, `Ptr` (the storage pointer) may
1612  // have a different address space from the expected `PointerTy` (the pointer
1613  // to be used). Adjust the pointer type based on the original storage pointer.
1614  auto AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
1615  PointerTy = TargetTy->getPointerTo(AS);
1616 
1617  do {
1618  // First fold any existing GEPs into the offset.
1619  while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
1620  APInt GEPOffset(Offset.getBitWidth(), 0);
1621  if (!GEP->accumulateConstantOffset(DL, GEPOffset))
1622  break;
1623  Offset += GEPOffset;
1624  Ptr = GEP->getPointerOperand();
1625  if (!Visited.insert(Ptr).second)
1626  break;
1627  }
1628 
1629  // See if we can perform a natural GEP here.
1630  Indices.clear();
1631  if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
1632  Indices, NamePrefix)) {
1633  // If we have a new natural pointer at the offset, clear out any old
1634  // offset pointer we computed. Unless it is the base pointer or
1635  // a non-instruction, we built a GEP we don't need. Zap it.
1636  if (OffsetPtr && OffsetPtr != OffsetBasePtr)
1637  if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
1638  assert(I->use_empty() && "Built a GEP with uses somehow!");
1639  I->eraseFromParent();
1640  }
1641  OffsetPtr = P;
1642  OffsetBasePtr = Ptr;
1643  // If we also found a pointer of the right type, we're done.
1644  if (P->getType() == PointerTy)
1645  break;
1646  }
1647 
1648  // Stash this pointer if we've found an i8*.
1649  if (Ptr->getType()->isIntegerTy(8)) {
1650  Int8Ptr = Ptr;
1651  Int8PtrOffset = Offset;
1652  }
1653 
1654  // Peel off a layer of the pointer and update the offset appropriately.
1655  if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
1656  Ptr = cast<Operator>(Ptr)->getOperand(0);
1657  } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
1658  if (GA->isInterposable())
1659  break;
1660  Ptr = GA->getAliasee();
1661  } else {
1662  break;
1663  }
1664  assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
1665  } while (Visited.insert(Ptr).second);
1666 
1667  if (!OffsetPtr) {
1668  if (!Int8Ptr) {
1669  Int8Ptr = IRB.CreateBitCast(
1670  Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
1671  NamePrefix + "sroa_raw_cast");
1672  Int8PtrOffset = Offset;
1673  }
1674 
1675  OffsetPtr = Int8PtrOffset == 0
1676  ? Int8Ptr
1677  : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
1678  IRB.getInt(Int8PtrOffset),
1679  NamePrefix + "sroa_raw_idx");
1680  }
1681  Ptr = OffsetPtr;
1682 
1683  // On the off chance we were targeting i8*, guard the bitcast here.
1684  if (cast<PointerType>(Ptr->getType()) != TargetPtrTy) {
1685  Ptr = IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr,
1686  TargetPtrTy,
1687  NamePrefix + "sroa_cast");
1688  }
1689 
1690  return Ptr;
1691 }
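// A sketch of the raw-byte fallback above: if no natural GEP reaches the
// requested offset (say, byte offset 3 into an i32-typed allocation), the
// code emits roughly
//   %raw = bitcast i32* %ptr to i8*                      ; "...sroa_raw_cast"
//   %idx = getelementptr inbounds i8, i8* %raw, i64 3    ; "...sroa_raw_idx"
// and, when the requested pointer type still differs, a final pointer
// bitcast or addrspacecast carrying the "...sroa_cast" suffix.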
1692 
1693 /// Compute the adjusted alignment for a load or store from an offset.
1694 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1695  return commonAlignment(getLoadStoreAlignment(I), Offset);
1696 }
1697 
1698 /// Test whether we can convert a value from the old to the new type.
1699 ///
1700 /// This predicate should be used to guard calls to convertValue in order to
1701 /// ensure that we only try to convert viable values. The strategy is that we
1702 /// will peel off single element struct and array wrappings to get to an
1703 /// underlying value, and convert that value.
1704 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
1705  if (OldTy == NewTy)
1706  return true;
1707 
1708  // For integer types, we can't handle any bit-width differences. This would
1709  // break both vector conversions with extension and introduce endianness
1710  // issues when in conjunction with loads and stores.
1711  if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1712  assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1713  cast<IntegerType>(NewTy)->getBitWidth() &&
1714  "We can't have the same bitwidth for different int types");
1715  return false;
1716  }
1717 
1718  if (DL.getTypeSizeInBits(NewTy).getFixedSize() !=
1719  DL.getTypeSizeInBits(OldTy).getFixedSize())
1720  return false;
1721  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1722  return false;
1723 
1724  // We can convert pointers to integers and vice-versa. Same for vectors
1725  // of pointers and integers.
1726  OldTy = OldTy->getScalarType();
1727  NewTy = NewTy->getScalarType();
1728  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1729  if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1730  unsigned OldAS = OldTy->getPointerAddressSpace();
1731  unsigned NewAS = NewTy->getPointerAddressSpace();
1732  // Convert pointers if they are pointers from the same address space or
1733  // different integral (not non-integral) address spaces with the same
1734  // pointer size.
1735  return OldAS == NewAS ||
1736  (!DL.isNonIntegralAddressSpace(OldAS) &&
1737  !DL.isNonIntegralAddressSpace(NewAS) &&
1738  DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1739  }
1740 
1741  // We can convert integers to integral pointers, but not to non-integral
1742  // pointers.
1743  if (OldTy->isIntegerTy())
1744  return !DL.isNonIntegralPointerType(NewTy);
1745 
1746  // We can convert integral pointers to integers, but non-integral pointers
1747  // need to remain pointers.
1748  if (!DL.isNonIntegralPointerType(OldTy))
1749  return NewTy->isIntegerTy();
1750 
1751  return false;
1752  }
1753 
1754  return true;
1755 }
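// A few examples of what this admits: i64 <-> i8* (with 64-bit integral
// pointers) and <2 x i32> <-> i64 are convertible because the total bit
// widths match and both sides are single-value types; i32 -> i64 is rejected
// because the integer widths differ; and i64 -> i8 addrspace(N)* is rejected
// whenever address space N is declared non-integral in the datalayout.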
1756 
1757 /// Generic routine to convert an SSA value to a value of a different
1758 /// type.
1759 ///
1760 /// This will try various different casting techniques, such as bitcasts,
1761 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
1762 /// two types for viability with this routine.
1763 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
1764  Type *NewTy) {
1765  Type *OldTy = V->getType();
1766  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
1767 
1768  if (OldTy == NewTy)
1769  return V;
1770 
1771  assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
1772  "Integer types must be the exact same to convert.");
1773 
1774  // See if we need inttoptr for this type pair. May require additional bitcast.
1775  if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
1776  // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
1777  // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
1778  // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
1779  // Directly handle i64 to i8*
1780  return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
1781  NewTy);
1782  }
1783 
1784  // See if we need ptrtoint for this type pair. May require additional bitcast.
1785  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
1786  // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
1787  // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
1788  // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
1789  // Expand i8* to i64 --> i8* to i64 to i64
1790  return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
1791  NewTy);
1792  }
1793 
1794  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
1795  unsigned OldAS = OldTy->getPointerAddressSpace();
1796  unsigned NewAS = NewTy->getPointerAddressSpace();
1797  // To convert pointers with different address spaces (they have already been
1798  // checked to be convertible, i.e. they have the same pointer size), we
1799  // cannot use `bitcast` (which requires both operands to be in the same
1800  // address space) or `addrspacecast` (which is not always a no-op cast).
1801  // Instead, use a pair of no-op `ptrtoint`/`inttoptr` casts through an
1802  // integer with the same bit width.
1803  if (OldAS != NewAS) {
1804  assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1805  return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
1806  NewTy);
1807  }
1808  }
1809 
1810  return IRB.CreateBitCast(V, NewTy);
1811 }
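// As a concrete instance of the int-to-pointer case above, converting a
// <2 x i32> value V to i8* on a target with 64-bit pointers becomes two
// casts (Int8PtrTy here names the requested pointer type):
//   Value *AsInt = IRB.CreateBitCast(V, IRB.getInt64Ty());
//   Value *AsPtr = IRB.CreateIntToPtr(AsInt, Int8PtrTy);
// The pointer-to-integer direction is the mirror image: ptrtoint first,
// then a bitcast to the desired integer or integer-vector type.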
1812 
1813 /// Test whether the given slice use can be promoted to a vector.
1814 ///
1815 /// This function is called to test each entry in a partition which is slated
1816 /// for a single slice.
1817 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
1818  VectorType *Ty,
1819  uint64_t ElementSize,
1820  const DataLayout &DL) {
1821  // First validate the slice offsets.
1822  uint64_t BeginOffset =
1823  std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
1824  uint64_t BeginIndex = BeginOffset / ElementSize;
1825  if (BeginIndex * ElementSize != BeginOffset ||
1826  BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
1827  return false;
1828  uint64_t EndOffset =
1829  std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
1830  uint64_t EndIndex = EndOffset / ElementSize;
1831  if (EndIndex * ElementSize != EndOffset ||
1832  EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
1833  return false;
1834 
1835  assert(EndIndex > BeginIndex && "Empty vector!");
1836  uint64_t NumElements = EndIndex - BeginIndex;
1837  Type *SliceTy = (NumElements == 1)
1838  ? Ty->getElementType()
1839  : FixedVectorType::get(Ty->getElementType(), NumElements);
1840 
1841  Type *SplitIntTy =
1842  Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
1843 
1844  Use *U = S.getUse();
1845 
1846  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
1847  if (MI->isVolatile())
1848  return false;
1849  if (!S.isSplittable())
1850  return false; // Skip any unsplittable intrinsics.
1851  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
1852  if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
1853  return false;
1854  } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
1855  // Disable vector promotion when there are loads or stores of an FCA.
1856  return false;
1857  } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1858  if (LI->isVolatile())
1859  return false;
1860  Type *LTy = LI->getType();
1861  if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
1862  assert(LTy->isIntegerTy());
1863  LTy = SplitIntTy;
1864  }
1865  if (!canConvertValue(DL, SliceTy, LTy))
1866  return false;
1867  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1868  if (SI->isVolatile())
1869  return false;
1870  Type *STy = SI->getValueOperand()->getType();
1871  if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
1872  assert(STy->isIntegerTy());
1873  STy = SplitIntTy;
1874  }
1875  if (!canConvertValue(DL, STy, SliceTy))
1876  return false;
1877  } else {
1878  return false;
1879  }
1880 
1881  return true;
1882 }
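// To make the index math concrete: with the partition viewed as a
// hypothetical <4 x i32> (ElementSize == 4), a slice covering bytes [4, 12)
// of the partition yields BeginIndex == 1 and EndIndex == 3, so its load or
// store must be convertible to <2 x i32> (or to the 64-bit split integer
// type); a slice covering [4, 10) fails the EndIndex * ElementSize ==
// EndOffset check and blocks promotion with this vector type.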
1883 
1884 /// Test whether the given alloca partitioning and range of slices can be
1885 /// promoted to a vector.
1886 ///
1887 /// This is a quick test to check whether we can rewrite a particular alloca
1888 /// partition (and its newly formed alloca) into a vector alloca with only
1889 /// whole-vector loads and stores such that it could be promoted to a vector
1890 /// SSA value. We only can ensure this for a limited set of operations, and we
1891 /// don't want to do the rewrites unless we are confident that the result will
1892 /// be promotable, so we have an early test here.
1893 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
1894  // Collect the candidate types for vector-based promotion. Also track whether
1895  // we have different element types.
1896  SmallVector<VectorType *, 4> CandidateTys;
1897  Type *CommonEltTy = nullptr;
1898  bool HaveCommonEltTy = true;
1899  auto CheckCandidateType = [&](Type *Ty) {
1900  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1901  // Return if this candidate's total size in bits differs from the earlier candidates', since no bitcast between differently sized vectors is possible.
1902  if (!CandidateTys.empty()) {
1903  VectorType *V = CandidateTys[0];
1904  if (DL.getTypeSizeInBits(VTy).getFixedSize() !=
1905  DL.getTypeSizeInBits(V).getFixedSize()) {
1906  CandidateTys.clear();
1907  return;
1908  }
1909  }
1910  CandidateTys.push_back(VTy);
1911  if (!CommonEltTy)
1912  CommonEltTy = VTy->getElementType();
1913  else if (CommonEltTy != VTy->getElementType())
1914  HaveCommonEltTy = false;
1915  }
1916  };
1917  // Consider any loads or stores that are the exact size of the slice.
1918  for (const Slice &S : P)
1919  if (S.beginOffset() == P.beginOffset() &&
1920  S.endOffset() == P.endOffset()) {
1921  if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
1922  CheckCandidateType(LI->getType());
1923  else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
1924  CheckCandidateType(SI->getValueOperand()->getType());
1925  }
1926 
1927  // If we didn't find a vector type, nothing to do here.
1928  if (CandidateTys.empty())
1929  return nullptr;
1930 
1931  // Remove non-integer vector types if we had multiple common element types.
1932  // FIXME: It'd be nice to replace them with integer vector types, but we can't
1933  // do that until all the backends are known to produce good code for all
1934  // integer vector types.
1935  if (!HaveCommonEltTy) {
1936  llvm::erase_if(CandidateTys, [](VectorType *VTy) {
1937  return !VTy->getElementType()->isIntegerTy();
1938  });
1939 
1940  // If there were no integer vector types, give up.
1941  if (CandidateTys.empty())
1942  return nullptr;
1943 
1944  // Rank the remaining candidate vector types. This is easy because we know
1945  // they're all integer vectors. We sort by ascending number of elements.
1946  auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
1947  (void)DL;
1948  assert(DL.getTypeSizeInBits(RHSTy).getFixedSize() ==
1949  DL.getTypeSizeInBits(LHSTy).getFixedSize() &&
1950  "Cannot have vector types of different sizes!");
1951  assert(RHSTy->getElementType()->isIntegerTy() &&
1952  "All non-integer types eliminated!");
1953  assert(LHSTy->getElementType()->isIntegerTy() &&
1954  "All non-integer types eliminated!");
1955  return cast<FixedVectorType>(RHSTy)->getNumElements() <
1956  cast<FixedVectorType>(LHSTy)->getNumElements();
1957  };
1958  llvm::sort(CandidateTys, RankVectorTypes);
1959  CandidateTys.erase(
1960  std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
1961  CandidateTys.end());
1962  } else {
1963 // The only way to have the same element type in every vector type is to
1964 // have the same vector type. Check that and remove all but one.
1965 #ifndef NDEBUG
1966  for (VectorType *VTy : CandidateTys) {
1967  assert(VTy->getElementType() == CommonEltTy &&
1968  "Unaccounted for element type!");
1969  assert(VTy == CandidateTys[0] &&
1970  "Different vector types with the same element type!");
1971  }
1972 #endif
1973  CandidateTys.resize(1);
1974  }
1975 
1976  // Try each vector type, and return the one which works.
1977  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
1978  uint64_t ElementSize =
1979  DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
1980 
1981  // While the definition of LLVM vectors is bit-packed, we don't support
1982  // element sizes that aren't byte sized.
1983  if (ElementSize % 8)
1984  return false;
1985  assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
1986  "vector size not a multiple of element size?");
1987  ElementSize /= 8;
1988 
1989  for (const Slice &S : P)
1990  if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
1991  return false;
1992 
1993  for (const Slice *S : P.splitSliceTails())
1994  if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
1995  return false;
1996 
1997  return true;
1998  };
1999  for (VectorType *VTy : CandidateTys)
2000  if (CheckVectorTypeForPromotion(VTy))
2001  return VTy;
2002 
2003  return nullptr;
2004 }
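// For example, if one slice loads the whole 128-bit partition as <4 x i32>
// and another stores it as <2 x i64>, both types become candidates with
// differing (integer) element types; the candidates are then ranked by
// ascending element count and the first one accepted for every slice -- here
// <2 x i64>, since both whole-partition access types are convertible to it --
// is returned.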
2005 
2006 /// Test whether a slice of an alloca is valid for integer widening.
2007 ///
2008 /// This implements the necessary checking for the \c isIntegerWideningViable
2009 /// test below on a single slice of the alloca.
2010 static bool isIntegerWideningViableForSlice(const Slice &S,
2011  uint64_t AllocBeginOffset,
2012  Type *AllocaTy,
2013  const DataLayout &DL,
2014  bool &WholeAllocaOp) {
2015  uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
2016 
2017  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2018  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2019 
2020  // We can't reasonably handle cases where the load or store extends past
2021  // the end of the alloca's type and into its padding.
2022  if (RelEnd > Size)
2023  return false;
2024 
2025  Use *U = S.getUse();
2026 
2027  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2028  if (LI->isVolatile())
2029  return false;
2030  // We can't handle loads that extend past the allocated memory.
2031  if (DL.getTypeStoreSize(LI->getType()).getFixedSize() > Size)
2032  return false;
2033  // So far, AllocaSliceRewriter does not support widening split slice tails
2034  // in rewriteIntegerLoad.
2035  if (S.beginOffset() < AllocBeginOffset)
2036  return false;
2037  // Note that we don't count vector loads or stores as whole-alloca
2038  // operations which enable integer widening because we would prefer to use
2039  // vector widening instead.
2040  if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2041  WholeAllocaOp = true;
2042  if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2043  if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
2044  return false;
2045  } else if (RelBegin != 0 || RelEnd != Size ||
2046  !canConvertValue(DL, AllocaTy, LI->getType())) {
2047  // Non-integer loads need to be convertible from the alloca type so that
2048  // they are promotable.
2049  return false;
2050  }
2051  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2052  Type *ValueTy = SI->getValueOperand()->getType();
2053  if (SI->isVolatile())
2054  return false;
2055  // We can't handle stores that extend past the allocated memory.
2056  if (DL.getTypeStoreSize(ValueTy).getFixedSize() > Size)
2057  return false;
2058  // So far, AllocaSliceRewriter does not support widening split slice tails
2059  // in rewriteIntegerStore.
2060  if (S.beginOffset() < AllocBeginOffset)
2061  return false;
2062  // Note that we don't count vector loads or stores as whole-alloca
2063  // operations which enable integer widening because we would prefer to use
2064  // vector widening instead.
2065  if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2066  WholeAllocaOp = true;
2067  if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2068  if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
2069  return false;
2070  } else if (RelBegin != 0 || RelEnd != Size ||
2071  !canConvertValue(DL, ValueTy, AllocaTy)) {
2072  // Non-integer stores need to be convertible to the alloca type so that
2073  // they are promotable.
2074  return false;
2075  }
2076  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2077  if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2078  return false;
2079  if (!S.isSplittable())
2080  return false; // Skip any unsplittable intrinsics.
2081  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2082  if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2083  return false;
2084  } else {
2085  return false;
2086  }
2087 
2088  return true;
2089 }
2090 
2091 /// Test whether the given alloca partition's integer operations can be
2092 /// widened to promotable ones.
2093 ///
2094 /// This is a quick test to check whether we can rewrite the integer loads and
2095 /// stores to a particular alloca into wider loads and stores and be able to
2096 /// promote the resulting alloca.
2097 static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2098  const DataLayout &DL) {
2099  uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
2100  // Don't create integer types larger than the maximum bitwidth.
2101  if (SizeInBits > IntegerType::MAX_INT_BITS)
2102  return false;
2103 
2104  // Don't try to handle allocas with bit-padding.
2105  if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedSize())
2106  return false;
2107 
2108  // We need to ensure that an integer type with the appropriate bitwidth can
2109  // be converted to the alloca type, whatever that is. We don't want to force
2110  // the alloca itself to have an integer type if there is a more suitable one.
2111  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2112  if (!canConvertValue(DL, AllocaTy, IntTy) ||
2113  !canConvertValue(DL, IntTy, AllocaTy))
2114  return false;
2115 
2116  // While examining uses, we ensure that the alloca has a covering load or
2117  // store. We don't want to widen the integer operations only to fail to
2118  // promote due to some other unsplittable entry (which we may make splittable
2119  // later). However, if there are only splittable uses, go ahead and assume
2120  // that we cover the alloca.
2121  // FIXME: We shouldn't consider split slices that happen to start in the
2122  // partition here...
2123  bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2124 
2125  for (const Slice &S : P)
2126  if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2127  WholeAllocaOp))
2128  return false;
2129 
2130  for (const Slice *S : P.splitSliceTails())
2131  if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2132  WholeAllocaOp))
2133  return false;
2134 
2135  return WholeAllocaOp;
2136 }
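// As an example of what this accepts: an i32 alloca read once as a whole i32
// (the covering whole-alloca operation) and otherwise accessed through i16
// loads and stores at byte offsets 0 and 2 is viable; the narrow accesses are
// later rewritten as full-width i32 operations combined with the shift/mask
// helpers below. If no access ever covers all four bytes, WholeAllocaOp stays
// false and the partition is left to other strategies.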
2137 
2138 static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2139  IntegerType *Ty, uint64_t Offset,
2140  const Twine &Name) {
2141  LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2142  IntegerType *IntTy = cast<IntegerType>(V->getType());
2143  assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
2144  DL.getTypeStoreSize(IntTy).getFixedSize() &&
2145  "Element extends past full value");
2146  uint64_t ShAmt = 8 * Offset;
2147  if (DL.isBigEndian())
2148  ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
2149  DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
2150  if (ShAmt) {
2151  V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2152  LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2153  }
2154  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2155  "Cannot extract to a larger integer!");
2156  if (Ty != IntTy) {
2157  V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2158  LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2159  }
2160  return V;
2161 }
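// A worked instance: extracting an i16 at byte Offset 2 out of an i64 on a
// little-endian target uses ShAmt == 16, i.e. roughly
//   V = IRB.CreateLShr(V, 16, Name + ".shift");
//   V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
// while a big-endian target shifts by 8 * (8 - 2 - 2) == 32, so the same two
// bytes of storage are selected either way.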
2162 
2163 static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2164  Value *V, uint64_t Offset, const Twine &Name) {
2165  IntegerType *IntTy = cast<IntegerType>(Old->getType());
2166  IntegerType *Ty = cast<IntegerType>(V->getType());
2167  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2168  "Cannot insert a larger integer!");
2169  LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2170  if (Ty != IntTy) {
2171  V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2172  LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2173  }
2174  assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
2175  DL.getTypeStoreSize(IntTy).getFixedSize() &&
2176  "Element store outside of alloca store");
2177  uint64_t ShAmt = 8 * Offset;
2178  if (DL.isBigEndian())
2179  ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
2180  DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
2181  if (ShAmt) {
2182  V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2183  LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2184  }
2185 
2186  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2187  APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2188  Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2189  LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2190  V = IRB.CreateOr(Old, V, Name + ".insert");
2191  LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2192  }
2193  return V;
2194 }
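// Mirroring extractInteger: inserting an i16 at byte Offset 2 of an i64 on a
// little-endian target zero-extends the value, shifts it left by ShAmt == 16,
// masks the old value with ~(0xFFFFull << 16) == 0xFFFFFFFF0000FFFF to clear
// the destination bits, and ORs the two together -- the ".ext", ".shift",
// ".mask" and ".insert" steps built above.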
2195 
2196 static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2197  unsigned EndIndex, const Twine &Name) {
2198  auto *VecTy = cast<FixedVectorType>(V->getType());
2199  unsigned NumElements = EndIndex - BeginIndex;
2200  assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2201 
2202  if (NumElements == VecTy->getNumElements())
2203  return V;
2204 
2205  if (NumElements == 1) {
2206  V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2207  Name + ".extract");
2208  LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2209  return V;
2210  }
2211 
2212  SmallVector<int, 8> Mask;
2213  Mask.reserve(NumElements);
2214  for (unsigned i = BeginIndex; i != EndIndex; ++i)
2215  Mask.push_back(i);
2216  V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2217  LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2218  return V;
2219 }
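// For instance, extracting elements [1, 3) of a <4 x i32> builds the shuffle
// mask {1, 2} and yields a <2 x i32>; a single-element range degenerates to
// extractelement, and a range spanning the whole vector simply returns the
// input value unchanged.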
2220 
2221 static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2222  unsigned BeginIndex, const Twine &Name) {
2223  VectorType *VecTy = cast<VectorType>(Old->getType());
2224  assert(VecTy && "Can only insert a vector into a vector");
2225 
2226  VectorType *Ty = dyn_cast<VectorType>(V->getType());
2227  if (!Ty) {
2228  // Single element to insert.
2229  V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2230  Name + ".insert");
2231  LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2232  return V;
2233  }
2234 
2235  assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2236  cast<FixedVectorType>(VecTy)->getNumElements() &&
2237  "Too many elements!");
2238  if (cast<FixedVectorType>(Ty)->getNumElements() ==
2239  cast<FixedVectorType>(VecTy)->getNumElements()) {
2240  assert(V->getType() == VecTy && "Vector type mismatch");
2241  return V;
2242  }
2243  unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2244 
2245  // When inserting a smaller vector into the larger one, we first use
2246  // a shuffle vector to widen it with undef elements, and then use
2247  // a vector select to choose, lane by lane, between the loaded vector and
2248  // the incoming vector.
2249  SmallVector<int, 8> Mask;
2250  Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2251  for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2252  if (i >= BeginIndex && i < EndIndex)
2253  Mask.push_back(i - BeginIndex);
2254  else
2255  Mask.push_back(-1);
2256  V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2257  LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2258 
2259  SmallVector<Constant *, 8> Mask2;
2260  Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2261  for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2262  Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2263 
2264  V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
2265 
2266  LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2267  return V;
2268 }
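// Concretely, inserting a <2 x i32> at BeginIndex == 1 of a <4 x i32> first
// widens the small vector with the shuffle mask {-1, 0, 1, -1} (undef lanes
// outside [1, 3)), then selects lane by lane with the constant mask
// <false, true, true, false>, so elements 0 and 3 of the old vector are
// preserved.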
2269 
2270 /// Visitor to rewrite instructions using a particular slice of an alloca
2271 /// to use a new alloca.
2272 ///
2273 /// Also implements the rewriting to vector-based accesses when the partition
2274 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2275 /// lives here.
2276 class llvm::sroa::AllocaSliceRewriter
2277  : public InstVisitor<AllocaSliceRewriter, bool> {
2278  // Befriend the base class so it can delegate to private visit methods.
2279  friend class InstVisitor<AllocaSliceRewriter, bool>;
2280 
2281  using Base = InstVisitor<AllocaSliceRewriter, bool>;
2282 
2283  const DataLayout &DL;
2284  AllocaSlices &AS;
2285  SROA &Pass;
2286  AllocaInst &OldAI, &NewAI;
2287  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2288  Type *NewAllocaTy;
2289 
2290  // This is a convenience and flag variable that will be null unless the new
2291  // alloca's integer operations should be widened to this integer type due to
2292  // passing isIntegerWideningViable above. If it is non-null, the desired
2293  // integer type will be stored here for easy access during rewriting.
2294  IntegerType *IntTy;
2295 
2296  // If we are rewriting an alloca partition which can be written as pure
2297  // vector operations, we stash extra information here. When VecTy is
2298  // non-null, we have some strict guarantees about the rewritten alloca:
2299  // - The new alloca is exactly the size of the vector type here.
2300  // - The accesses all either map to the entire vector or to a single
2301  // element.
2302  // - The set of accessing instructions is only one of those handled above
2303  // in isVectorPromotionViable. Generally these are the same access kinds
2304  // which are promotable via mem2reg.
2305  VectorType *VecTy;
2306  Type *ElementTy;
2307  uint64_t ElementSize;
2308 
2309  // The original offset of the slice currently being rewritten relative to
2310  // the original alloca.
2311  uint64_t BeginOffset = 0;
2312  uint64_t EndOffset = 0;
2313 
2314  // The new offsets of the slice currently being rewritten relative to the
2315  // original alloca.
2316  uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2317 
2318  uint64_t SliceSize = 0;
2319  bool IsSplittable = false;
2320  bool IsSplit = false;
2321  Use *OldUse = nullptr;
2322  Instruction *OldPtr = nullptr;
2323 
2324  // Track post-rewrite users which are PHI nodes and Selects.
2325  SmallSetVector<PHINode *, 8> &PHIUsers;
2326  SmallSetVector<SelectInst *, 8> &SelectUsers;
2327 
2328  // Utility IR builder, whose name prefix is setup for each visited use, and
2329  // the insertion point is set to point to the user.
2330  IRBuilderTy IRB;
2331 
2332 public:
2333  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2334  AllocaInst &OldAI, AllocaInst &NewAI,
2335  uint64_t NewAllocaBeginOffset,
2336  uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2337  VectorType *PromotableVecTy,
2338  SmallSetVector<PHINode *, 8> &PHIUsers,
2339  SmallSetVector<SelectInst *, 8> &SelectUsers)
2340  : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2341  NewAllocaBeginOffset(NewAllocaBeginOffset),
2342  NewAllocaEndOffset(NewAllocaEndOffset),
2343  NewAllocaTy(NewAI.getAllocatedType()),
2344  IntTy(
2345  IsIntegerPromotable
2346  ? Type::getIntNTy(NewAI.getContext(),
2347  DL.getTypeSizeInBits(NewAI.getAllocatedType())
2348  .getFixedSize())
2349  : nullptr),
2350  VecTy(PromotableVecTy),
2351  ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2352  ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8
2353  : 0),
2354  PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2355  IRB(NewAI.getContext(), ConstantFolder()) {
2356  if (VecTy) {
2357  assert((DL.getTypeSizeInBits(ElementTy).getFixedSize() % 8) == 0 &&
2358  "Only multiple-of-8 sized vector elements are viable");
2359  ++NumVectorized;
2360  }
2361  assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2362  }
2363 
2364  bool visit(AllocaSlices::const_iterator I) {
2365  bool CanSROA = true;
2366  BeginOffset = I->beginOffset();
2367  EndOffset = I->endOffset();
2368  IsSplittable = I->isSplittable();
2369  IsSplit =
2370  BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2371  LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2372  LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2373  LLVM_DEBUG(dbgs() << "\n");
2374 
2375  // Compute the intersecting offset range.
2376  assert(BeginOffset < NewAllocaEndOffset);
2377  assert(EndOffset > NewAllocaBeginOffset);
2378  NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2379  NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2380 
2381  SliceSize = NewEndOffset - NewBeginOffset;
2382 
2383  OldUse = I->getUse();
2384  OldPtr = cast<Instruction>(OldUse->get());
2385 
2386  Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2387  IRB.SetInsertPoint(OldUserI);
2388  IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2389  IRB.getInserter().SetNamePrefix(
2390  Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
2391 
2392  CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2393  if (VecTy || IntTy)
2394  assert(CanSROA);
2395  return CanSROA;
2396  }
2397 
2398 private:
2399  // Make sure the other visit overloads are visible.
2400  using Base::visit;
2401 
2402  // Every instruction which can end up as a user must have a rewrite rule.
2403  bool visitInstruction(Instruction &I) {
2404  LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
2405  llvm_unreachable("No rewrite rule for this instruction!");
2406  }
2407 
2408  Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
2409  // Note that the offset computation can use BeginOffset or NewBeginOffset
2410  // interchangeably for unsplit slices.
2411  assert(IsSplit || BeginOffset == NewBeginOffset);
2412  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2413 
2414 #ifndef NDEBUG
2415  StringRef OldName = OldPtr->getName();
2416  // Skip through the last '.sroa.' component of the name.
2417  size_t LastSROAPrefix = OldName.rfind(".sroa.");
2418  if (LastSROAPrefix != StringRef::npos) {
2419  OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
2420  // Look for an SROA slice index.
2421  size_t IndexEnd = OldName.find_first_not_of("0123456789");
2422  if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
2423  // Strip the index and look for the offset.
2424  OldName = OldName.substr(IndexEnd + 1);
2425  size_t OffsetEnd = OldName.find_first_not_of("0123456789");
2426  if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
2427  // Strip the offset.
2428  OldName = OldName.substr(OffsetEnd + 1);
2429  }
2430  }
2431  // Strip any SROA suffixes as well.
2432  OldName = OldName.substr(0, OldName.find(".sroa_"));
2433 #endif
2434 
2435  return getAdjustedPtr(IRB, DL, &NewAI,
2436  APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
2437  PointerTy,
2438 #ifndef NDEBUG
2439  Twine(OldName) + "."
2440 #else
2441  Twine()
2442 #endif
2443  );
2444  }
2445 
2446  /// Compute suitable alignment to access this slice of the *new*
2447  /// alloca.
2448  ///
2449  /// The result is the new alloca's alignment reduced, via commonAlignment,
2450  /// by the slice's byte offset from the start of the new alloca.
2451  Align getSliceAlign() {
2452  return commonAlignment(NewAI.getAlign(),
2453  NewBeginOffset - NewAllocaBeginOffset);
2454  }
2455 
2456  unsigned getIndex(uint64_t Offset) {
2457  assert(VecTy && "Can only call getIndex when rewriting a vector");
2458  uint64_t RelOffset = Offset - NewAllocaBeginOffset;
2459  assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
2460  uint32_t Index = RelOffset / ElementSize;
2461  assert(Index * ElementSize == RelOffset);
2462  return Index;
2463  }
2464 
2465  void deleteIfTriviallyDead(Value *V) {
2466  Instruction *I = cast<Instruction>(V);
2467  if (isInstructionTriviallyDead(I))
2468  Pass.DeadInsts.push_back(I);
2469  }
2470 
2471  Value *rewriteVectorizedLoadInst(LoadInst &LI) {
2472  unsigned BeginIndex = getIndex(NewBeginOffset);
2473  unsigned EndIndex = getIndex(NewEndOffset);
2474  assert(EndIndex > BeginIndex && "Empty vector!");
2475 
2476  LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2477  NewAI.getAlign(), "load");
2478 
2479  Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
2480  LLVMContext::MD_access_group});
2481  return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
2482  }
2483 
2484  Value *rewriteIntegerLoad(LoadInst &LI) {
2485  assert(IntTy && "We cannot insert an integer to the alloca");
2486  assert(!LI.isVolatile());
2487  Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2488  NewAI.getAlign(), "load");
2489  V = convertValue(DL, IRB, V, IntTy);
2490  assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2491  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2492  if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
2493  IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
2494  V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
2495  }
2496  // It is possible that the extracted type is not the load type. This
2497  // happens if there is a load past the end of the alloca, and as
2498  // a consequence the slice is narrower but still a candidate for integer
2499  // lowering. To handle this case, we just zero extend the extracted
2500  // integer.
2501  assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
2502  "Can only handle an extract for an overly wide load");
2503  if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
2504  V = IRB.CreateZExt(V, LI.getType());
2505  return V;
2506  }
2507 
2508  bool visitLoadInst(LoadInst &LI) {
2509  LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
2510  Value *OldOp = LI.getOperand(0);
2511  assert(OldOp == OldPtr);
2512 
2513  AAMDNodes AATags;
2514  LI.getAAMetadata(AATags);
2515 
2516  unsigned AS = LI.getPointerAddressSpace();
2517 
2518  Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
2519  : LI.getType();
2520  const bool IsLoadPastEnd =
2521  DL.getTypeStoreSize(TargetTy).getFixedSize() > SliceSize;
2522  bool IsPtrAdjusted = false;
2523  Value *V;
2524  if (VecTy) {
2525  V = rewriteVectorizedLoadInst(LI);
2526  } else if (IntTy && LI.getType()->isIntegerTy()) {
2527  V = rewriteIntegerLoad(LI);
2528  } else if (NewBeginOffset == NewAllocaBeginOffset &&
2529  NewEndOffset == NewAllocaEndOffset &&
2530  (canConvertValue(DL, NewAllocaTy, TargetTy) ||
2531  (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
2532  TargetTy->isIntegerTy()))) {
2533  LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2534  NewAI.getAlign(), LI.isVolatile(),
2535  LI.getName());
2536  if (AATags)
2537  NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2538  if (LI.isVolatile())
2539  NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2540  if (NewLI->isAtomic())
2541  NewLI->setAlignment(LI.getAlign());
2542 
2543  // Any !nonnull metadata or !range metadata on the old load is also valid
2544  // on the new load. This is true in some cases even when the loads
2545  // are different types, for example by mapping !nonnull metadata to
2546  // !range metadata by modeling the null pointer constant converted to the
2547  // integer type.
2548  // FIXME: Add support for range metadata here. Currently the utilities
2549  // for this don't propagate range metadata in trivial cases from one
2550  // integer load to another, don't handle non-addrspace-0 null pointers
2551  // correctly, and don't have any support for mapping ranges as the
2552  // integer type becomes wider or narrower.
2553  if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
2554  copyNonnullMetadata(LI, N, *NewLI);
2555 
2556  // Try to preserve nonnull metadata
2557  V = NewLI;
2558 
2559  // If this is an integer load past the end of the slice (which means the
2560  // bytes outside the slice are undef or this load is dead) just forcibly
2561  // fix the integer size with correct handling of endianness.
2562  if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
2563  if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
2564  if (AITy->getBitWidth() < TITy->getBitWidth()) {
2565  V = IRB.CreateZExt(V, TITy, "load.ext");
2566  if (DL.isBigEndian())
2567  V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
2568  "endian_shift");
2569  }
2570  } else {
2571  Type *LTy = TargetTy->getPointerTo(AS);
2572  LoadInst *NewLI =
2573  IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
2574  getSliceAlign(), LI.isVolatile(), LI.getName());
2575  if (AATags)
2576  NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2577  if (LI.isVolatile())
2578  NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2579  NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
2580  LLVMContext::MD_access_group});
2581 
2582  V = NewLI;
2583  IsPtrAdjusted = true;
2584  }
2585  V = convertValue(DL, IRB, V, TargetTy);
2586 
2587  if (IsSplit) {
2588  assert(!LI.isVolatile());
2589  assert(LI.getType()->isIntegerTy() &&
2590  "Only integer type loads and stores are split");
2591  assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedSize() &&
2592  "Split load isn't smaller than original load");
2593  assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
2594  "Non-byte-multiple bit width");
2595  // Move the insertion point just past the load so that we can refer to it.
2596  IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
2597  // Create a placeholder value with the same type as LI to use as the
2598  // basis for the new value. This allows us to replace the uses of LI with
2599  // the computed value, and then replace the placeholder with LI, leaving
2600  // LI only used for this computation.
2601  Value *Placeholder = new LoadInst(
2602  LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)), "",
2603  false, Align(1));
2604  V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
2605  "insert");
2606  LI.replaceAllUsesWith(V);
2607  Placeholder->replaceAllUsesWith(&LI);
2608  Placeholder->deleteValue();
2609  } else {
2610  LI.replaceAllUsesWith(V);
2611  }
2612 
2613  Pass.DeadInsts.push_back(&LI);
2614  deleteIfTriviallyDead(OldOp);
2615  LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
2616  return !LI.isVolatile() && !IsPtrAdjusted;
2617  }
2618 
2619  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
2620  AAMDNodes AATags) {
2621  if (V->getType() != VecTy) {
2622  unsigned BeginIndex = getIndex(NewBeginOffset);
2623  unsigned EndIndex = getIndex(NewEndOffset);
2624  assert(EndIndex > BeginIndex && "Empty vector!");
2625  unsigned NumElements = EndIndex - BeginIndex;
2626  assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
2627  "Too many elements!");
2628  Type *SliceTy = (NumElements == 1)
2629  ? ElementTy
2630  : FixedVectorType::get(ElementTy, NumElements);
2631  if (V->getType() != SliceTy)
2632  V = convertValue(DL, IRB, V, SliceTy);
2633 
2634  // Mix in the existing elements.
2635  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2636  NewAI.getAlign(), "load");
2637  V = insertVector(IRB, Old, V, BeginIndex, "vec");
2638  }
2639  StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
2640  Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2641  LLVMContext::MD_access_group});
2642  if (AATags)
2643  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2644  Pass.DeadInsts.push_back(&SI);
2645 
2646  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
2647  return true;
2648  }
2649 
2650  bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
2651  assert(IntTy && "We cannot extract an integer from the alloca");
2652  assert(!SI.isVolatile());
2653  if (DL.getTypeSizeInBits(V->getType()).getFixedSize() !=
2654  IntTy->getBitWidth()) {
2655  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2656  NewAI.getAlign(), "oldload");
2657  Old = convertValue(DL, IRB, Old, IntTy);
2658  assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2659  uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
2660  V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
2661  }
2662  V = convertValue(DL, IRB, V, NewAllocaTy);
2663  StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
2664  Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2665  LLVMContext::MD_access_group});
2666  if (AATags)
2667  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2668  Pass.DeadInsts.push_back(&SI);
2669  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
2670  return true;
2671  }
2672 
2673  bool visitStoreInst(StoreInst &SI) {
2674  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
2675  Value *OldOp = SI.getOperand(1);
2676  assert(OldOp == OldPtr);
2677 
2678  AAMDNodes AATags;
2679  SI.getAAMetadata(AATags);
2680 
2681  Value *V = SI.getValueOperand();
2682 
2683  // Strip all inbounds GEPs and pointer casts to try to dig out any root
2684  // alloca that should be re-examined after promoting this alloca.
2685  if (V->getType()->isPointerTy())
2686  if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
2687  Pass.PostPromotionWorklist.insert(AI);
2688 
2689  if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedSize()) {
2690  assert(!SI.isVolatile());
2691  assert(V->getType()->isIntegerTy() &&
2692  "Only integer type loads and stores are split");
2693  assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
2694  "Non-byte-multiple bit width");
2695  IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
2696  V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
2697  "extract");
2698  }
2699 
2700  if (VecTy)
2701  return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
2702  if (IntTy && V->getType()->isIntegerTy())
2703  return rewriteIntegerStore(V, SI, AATags);
2704 
2705  const bool IsStorePastEnd =
2706  DL.getTypeStoreSize(V->getType()).getFixedSize() > SliceSize;
2707  StoreInst *NewSI;
2708  if (NewBeginOffset == NewAllocaBeginOffset &&
2709  NewEndOffset == NewAllocaEndOffset &&
2710  (canConvertValue(DL, V->getType(), NewAllocaTy) ||
2711  (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
2712  V->getType()->isIntegerTy()))) {
2713  // If this is an integer store past the end of slice (and thus the bytes
2714  // past that point are irrelevant or this is unreachable), truncate the
2715  // value prior to storing.
2716  if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
2717  if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
2718  if (VITy->getBitWidth() > AITy->getBitWidth()) {
2719  if (DL.isBigEndian())
2720  V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
2721  "endian_shift");
2722  V = IRB.CreateTrunc(V, AITy, "load.trunc");
2723  }
2724 
2725  V = convertValue(DL, IRB, V, NewAllocaTy);
2726  NewSI =
2727  IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
2728  } else {
2729  unsigned AS = SI.getPointerAddressSpace();
2730  Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
2731  NewSI =
2732  IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
2733  }
2734  NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2735  LLVMContext::MD_access_group});
2736  if (AATags)
2737  NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2738  if (SI.isVolatile())
2739  NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
2740  if (NewSI->isAtomic())
2741  NewSI->setAlignment(SI.getAlign());
2742  Pass.DeadInsts.push_back(&SI);
2743  deleteIfTriviallyDead(OldOp);
2744 
2745  LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
2746  return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
2747  }
2748 
2749  /// Compute an integer value from splatting an i8 across the given
2750  /// number of bytes.
2751  ///
2752  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
2753  /// call this routine.
2754  /// FIXME: Heed the advice above.
2755  ///
2756  /// \param V The i8 value to splat.
2757  /// \param Size The number of bytes in the output (assuming i8 is one byte)
2758  Value *getIntegerSplat(Value *V, unsigned Size) {
2759  assert(Size > 0 && "Expected a positive number of bytes.");
2760  IntegerType *VTy = cast<IntegerType>(V->getType());
2761  assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
2762  if (Size == 1)
2763  return V;
2764 
2765  Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
2766  V = IRB.CreateMul(
2767  IRB.CreateZExt(V, SplatIntTy, "zext"),
2768  ConstantExpr::getUDiv(
2769  Constant::getAllOnesValue(SplatIntTy),
2770  ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
2771  SplatIntTy)),
2772  "isplat");
2773  return V;
2774  }
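// The splat trick in numbers: for Size == 4 the divisor constant is
// 0xFFFFFFFF / 0xFF == 0x01010101, so an i8 value of 0xAB zero-extended to
// i32 and multiplied by it becomes 0xABABABAB -- the byte replicated across
// the wider integer, exactly what a memset of that byte writes to memory.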
2775 
2776  /// Compute a vector splat for a given element value.
2777  Value *getVectorSplat(Value *V, unsigned NumElements) {
2778  V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
2779  LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
2780  return V;
2781  }
2782 
2783  bool visitMemSetInst(MemSetInst &II) {
2784  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
2785  assert(II.getRawDest() == OldPtr);
2786 
2787  AAMDNodes AATags;
2788  II.getAAMetadata(AATags);
2789 
2790  // If the memset has a variable size, it cannot be split, just adjust the
2791  // pointer to the new alloca.
2792  if (!isa<ConstantInt>(II.getLength())) {
2793  assert(!IsSplit);
2794  assert(NewBeginOffset == BeginOffset);
2795  II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
2796  II.setDestAlignment(getSliceAlign());
2797 
2798  deleteIfTriviallyDead(OldPtr);
2799  return false;
2800  }
2801 
2802  // Record this instruction for deletion.
2803  Pass.DeadInsts.push_back(&II);
2804 
2805  Type *AllocaTy = NewAI.getAllocatedType();
2806  Type *ScalarTy = AllocaTy->getScalarType();
2807 
2808  const bool CanContinue = [&]() {
2809  if (VecTy || IntTy)
2810  return true;
2811  if (BeginOffset > NewAllocaBeginOffset ||
2812  EndOffset < NewAllocaEndOffset)
2813  return false;
2814  // Length must be in range for FixedVectorType.
2815  auto *C = cast<ConstantInt>(II.getLength());
2816  const uint64_t Len = C->getLimitedValue();
2817  if (Len > std::numeric_limits<unsigned>::max())
2818  return false;
2819  auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
2820  auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
2821  return canConvertValue(DL, SrcTy, AllocaTy) &&
2822  DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
2823  }();
2824 
2825  // If this doesn't map cleanly onto the alloca type, and that type isn't
2826  // a single value type, just emit a memset.
2827  if (!CanContinue) {
2828  Type *SizeTy = II.getLength()->getType();
2829  Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
2830  CallInst *New = IRB.CreateMemSet(
2831  getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
2832  MaybeAlign(getSliceAlign()), II.isVolatile());
2833  if (AATags)
2834  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2835  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
2836  return false;
2837  }
2838 
2839  // If we can represent this as a simple value, we have to build the actual
2840  // value to store, which requires expanding the byte present in memset to
2841  // a sensible representation for the alloca type. This is essentially
2842  // splatting the byte to a sufficiently wide integer, splatting it across
2843  // any desired vector width, and bitcasting to the final type.
2844  Value *V;
2845 
2846  if (VecTy) {
2847  // If this is a memset of a vectorized alloca, insert it.
2848  assert(ElementTy == ScalarTy);
2849 
2850  unsigned BeginIndex = getIndex(NewBeginOffset);
2851  unsigned EndIndex = getIndex(NewEndOffset);
2852  assert(EndIndex > BeginIndex && "Empty vector!");
2853  unsigned NumElements = EndIndex - BeginIndex;
2854  assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
2855  "Too many elements!");
2856 
2857  Value *Splat = getIntegerSplat(
2858  II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
2859  Splat = convertValue(DL, IRB, Splat, ElementTy);
2860  if (NumElements > 1)
2861  Splat = getVectorSplat(Splat, NumElements);
2862 
2863  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2864  NewAI.getAlign(), "oldload");
2865  V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
2866  } else if (IntTy) {
2867  // If this is a memset on an alloca where we can widen stores, insert the
2868  // set integer.
2869  assert(!II.isVolatile());
2870 
2871  uint64_t Size = NewEndOffset - NewBeginOffset;
2872  V = getIntegerSplat(II.getValue(), Size);
2873 
2874  if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
2875  EndOffset != NewAllocaEndOffset)) {
2876  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2877  NewAI.getAlign(), "oldload");
2878  Old = convertValue(DL, IRB, Old, IntTy);
2879  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2880  V = insertInteger(DL, IRB, Old, V, Offset, "insert");
2881  } else {
2882  assert(V->getType() == IntTy &&
2883  "Wrong type for an alloca wide integer!");
2884  }
2885  V = convertValue(DL, IRB, V, AllocaTy);
2886  } else {
2887  // Established these invariants above.
2888  assert(NewBeginOffset == NewAllocaBeginOffset);
2889  assert(NewEndOffset == NewAllocaEndOffset);
2890 
2891  V = getIntegerSplat(II.getValue(),
2892  DL.getTypeSizeInBits(ScalarTy).getFixedSize() / 8);
2893  if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
2894  V = getVectorSplat(
2895  V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
2896 
2897  V = convertValue(DL, IRB, V, AllocaTy);
2898  }
2899 
2900  StoreInst *New =
2901  IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
2902  New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
2903  LLVMContext::MD_access_group});
2904  if (AATags)
2905  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2906  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
2907  return !II.isVolatile();
2908  }
2909 
2910  bool visitMemTransferInst(MemTransferInst &II) {
2911  // Rewriting of memory transfer instructions can be a bit tricky. We break
2912  // them into two categories: split intrinsics and unsplit intrinsics.
2913 
2914  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
2915 
2916  AAMDNodes AATags;
2917  II.getAAMetadata(AATags);
2918 
2919  bool IsDest = &II.getRawDestUse() == OldUse;
2920  assert((IsDest && II.getRawDest() == OldPtr) ||
2921  (!IsDest && II.getRawSource() == OldPtr));
2922 
2923  MaybeAlign SliceAlign = getSliceAlign();
2924 
2925  // For unsplit intrinsics, we simply modify the source and destination
2926  // pointers in place. This isn't just an optimization, it is a matter of
2927  // correctness. With unsplit intrinsics we may be dealing with transfers
2928  // within a single alloca before SROA ran, or with transfers that have
2929  // a variable length. We may also be dealing with memmove instead of
2930  // memcpy, and so simply updating the pointers in place is all that is
2931  // needed to update both the source and dest of a single call.
2932  if (!IsSplittable) {
2933  Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
2934  if (IsDest) {
2935  II.setDest(AdjustedPtr);
2936  II.setDestAlignment(SliceAlign);
2937  }
2938  else {
2939  II.setSource(AdjustedPtr);
2940  II.setSourceAlignment(SliceAlign);
2941  }
2942 
2943  LLVM_DEBUG(dbgs() << " to: " << II << "\n");
2944  deleteIfTriviallyDead(OldPtr);
2945  return false;
2946  }
2947  // For split transfer intrinsics we have an incredibly useful assurance:
2948  // the source and destination do not reside within the same alloca, and at
2949  // least one of them does not escape. This means that we can replace
2950  // memmove with memcpy, and we don't need to worry about all manner of
2951  // downsides to splitting and transforming the operations.
2952 
2953  // If this doesn't map cleanly onto the alloca type, and that type isn't
2954  // a single value type, just emit a memcpy.
2955  bool EmitMemCpy =
2956  !VecTy && !IntTy &&
2957  (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
2958  SliceSize !=
2959  DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedSize() ||
2960  !NewAI.getAllocatedType()->isSingleValueType());
2961 
2962  // If we're just going to emit a memcpy, the alloca hasn't changed, and the
2963  // size hasn't been shrunk based on analysis of the viable range, then this is
2964  // a no-op.
2965  if (EmitMemCpy && &OldAI == &NewAI) {
2966  // Ensure the start lines up.
2967  assert(NewBeginOffset == BeginOffset);
2968 
2969  // Rewrite the size as needed.
2970  if (NewEndOffset != EndOffset)
2971  II.setLength(ConstantInt::get(II.getLength()->getType(),
2972  NewEndOffset - NewBeginOffset));
2973  return false;
2974  }
2975  // Record this instruction for deletion.
2976  Pass.DeadInsts.push_back(&II);
2977 
2978  // Strip all inbounds GEPs and pointer casts to try to dig out any root
2979  // alloca that should be re-examined after rewriting this instruction.
2980  Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
2981  if (AllocaInst *AI =
2982  dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
2983  assert(AI != &OldAI && AI != &NewAI &&
2984  "Splittable transfers cannot reach the same alloca on both ends.");
2985  Pass.Worklist.insert(AI);
2986  }
2987 
2988  Type *OtherPtrTy = OtherPtr->getType();
2989  unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
2990 
2991  // Compute the relative offset for the other pointer within the transfer.
2992  unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
2993  APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
2994  Align OtherAlign =
2995  (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
2996  OtherAlign =
2997  commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
2998 
2999  if (EmitMemCpy) {
3000  // Compute the other pointer, folding as much as possible to produce
3001  // a single, simple GEP in most cases.
3002  OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3003  OtherPtr->getName() + ".");
3004 
3005  Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3006  Type *SizeTy = II.getLength()->getType();
3007  Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3008 
3009  Value *DestPtr, *SrcPtr;
3010  MaybeAlign DestAlign, SrcAlign;
3011  // Note: IsDest is true iff we're copying into the new alloca slice
3012  if (IsDest) {
3013  DestPtr = OurPtr;
3014  DestAlign = SliceAlign;
3015  SrcPtr = OtherPtr;
3016  SrcAlign = OtherAlign;
3017  } else {
3018  DestPtr = OtherPtr;
3019  DestAlign = OtherAlign;
3020  SrcPtr = OurPtr;
3021  SrcAlign = SliceAlign;
3022  }
3023  CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3024  Size, II.isVolatile());
3025  if (AATags)
3026  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3027  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3028  return false;
3029  }
3030 
3031  bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3032  NewEndOffset == NewAllocaEndOffset;
3033  uint64_t Size = NewEndOffset - NewBeginOffset;
3034  unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3035  unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3036  unsigned NumElements = EndIndex - BeginIndex;
3037  IntegerType *SubIntTy =
3038  IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3039 
3040  // Reset the other pointer type to match the register type we're going to
3041  // use, but using the address space of the original other pointer.
3042  Type *OtherTy;
3043  if (VecTy && !IsWholeAlloca) {
3044  if (NumElements == 1)
3045  OtherTy = VecTy->getElementType();
3046  else
3047  OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3048  } else if (IntTy && !IsWholeAlloca) {
3049  OtherTy = SubIntTy;
3050  } else {
3051  OtherTy = NewAllocaTy;
3052  }
3053  OtherPtrTy = OtherTy->getPointerTo(OtherAS);
3054 
3055  Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3056  OtherPtr->getName() + ".");
3057  MaybeAlign SrcAlign = OtherAlign;
3058  Value *DstPtr = &NewAI;
3059  MaybeAlign DstAlign = SliceAlign;
3060  if (!IsDest) {
3061  std::swap(SrcPtr, DstPtr);
3062  std::swap(SrcAlign, DstAlign);
3063  }
3064 
3065  Value *Src;
3066  if (VecTy && !IsWholeAlloca && !IsDest) {
3067  Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3068  NewAI.getAlign(), "load");
3069  Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3070  } else if (IntTy && !IsWholeAlloca && !IsDest) {
3071  Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3072  NewAI.getAlign(), "load");
3073  Src = convertValue(DL, IRB, Src, IntTy);
3074  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3075  Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3076  } else {
3077  LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3078  II.isVolatile(), "copyload");
3079  Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3080  LLVMContext::MD_access_group});
3081  if (AATags)
3082  Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3083  Src = Load;
3084  }
3085 
3086  if (VecTy && !IsWholeAlloca && IsDest) {
3087  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3088  NewAI.getAlign(), "oldload");
3089  Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3090  } else if (IntTy && !IsWholeAlloca && IsDest) {
3091  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3092  NewAI.getAlign(), "oldload");
3093  Old = convertValue(DL, IRB, Old, IntTy);
3094  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3095  Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3096  Src = convertValue(DL, IRB, Src, NewAllocaTy);
3097  }
3098 
3099  StoreInst *Store = cast<StoreInst>(
3100  IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3101  Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3102  LLVMContext::MD_access_group});
3103  if (AATags)
3104  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3105  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3106  return !II.isVolatile();
3107  }
3108 
3109  bool visitIntrinsicInst(IntrinsicInst &II) {
3110  assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3111  "Unexpected intrinsic!");
3112  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3113 
3114  // Record this instruction for deletion.
3115  Pass.DeadInsts.push_back(&II);
3116 
3117  if (II.isDroppable()) {
3118  assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3119  // TODO For now we forget assumed information, this can be improved.
3120  OldPtr->dropDroppableUsesIn(II);
3121  return true;
3122  }
3123 
3124  assert(II.getArgOperand(1) == OldPtr);
3125  // Lifetime intrinsics are only promotable if they cover the whole alloca.
3126  // Therefore, we drop lifetime intrinsics which don't cover the whole
3127  // alloca.
3128  // (In theory, intrinsics which partially cover an alloca could be
3129  // promoted, but PromoteMemToReg doesn't handle that case.)
3130  // FIXME: Check whether the alloca is promotable before dropping the
3131  // lifetime intrinsics?
3132  if (NewBeginOffset != NewAllocaBeginOffset ||
3133  NewEndOffset != NewAllocaEndOffset)
3134  return true;
3135 
3136  ConstantInt *Size =
3137  ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
3138  NewEndOffset - NewBeginOffset);
3139  // Lifetime intrinsics always expect an i8* so directly get such a pointer
3140  // for the new alloca slice.
3141  Type *PointerTy = IRB.getInt8PtrTy(OldPtr->getType()->getPointerAddressSpace());
3142  Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3143  Value *New;
3144  if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3145  New = IRB.CreateLifetimeStart(Ptr, Size);
3146  else
3147  New = IRB.CreateLifetimeEnd(Ptr, Size);
3148 
3149  (void)New;
3150  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3151 
3152  return true;
3153  }
3154 
3155  void fixLoadStoreAlign(Instruction &Root) {
3156  // This algorithm implements the same visitor loop as
3157  // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3158  // or store found.
3159  SmallPtrSet<Instruction *, 4> Visited;
3160  SmallVector<Instruction *, 4> Uses;
3161  Visited.insert(&Root);
3162  Uses.push_back(&Root);
3163  do {
3164  Instruction *I = Uses.pop_back_val();
3165 
3166  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3167  LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3168  continue;
3169  }
3170  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3171  SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3172  continue;
3173  }
3174 
3175  assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3176  isa<PHINode>(I) || isa<SelectInst>(I) ||
3177  isa<GetElementPtrInst>(I));
3178  for (User *U : I->users())
3179  if (Visited.insert(cast<Instruction>(U)).second)
3180  Uses.push_back(cast<Instruction>(U));
3181  } while (!Uses.empty());
3182  }
3183 
3184  bool visitPHINode(PHINode &PN) {
3185  LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3186  assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3187  assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3188 
3189  // We would like to compute a new pointer in only one place, but have it be
3190  // as local as possible to the PHI. To do that, we re-use the location of
3191  // the old pointer, which necessarily must be in the right position to
3192  // dominate the PHI.
3193  IRBuilderBase::InsertPointGuard Guard(IRB);
3194  if (isa<PHINode>(OldPtr))
3195  IRB.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
3196  else
3197  IRB.SetInsertPoint(OldPtr);
3198  IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3199 
3200  Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3201  // Replace the operands which were using the old pointer.
3202  std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3203 
3204  LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3205  deleteIfTriviallyDead(OldPtr);
3206 
3207  // Fix the alignment of any loads or stores using this PHI node.
3208  fixLoadStoreAlign(PN);
3209 
3210  // PHIs can't be promoted on their own, but often can be speculated. We
3211  // check the speculation outside of the rewriter so that we see the
3212  // fully-rewritten alloca.
3213  PHIUsers.insert(&PN);
3214  return true;
3215  }
3216 
3217  bool visitSelectInst(SelectInst &SI) {
3218  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3219  assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3220  "Pointer isn't an operand!");
3221  assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3222  assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3223 
3224  Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3225  // Replace the operands which were using the old pointer.
3226  if (SI.getOperand(1) == OldPtr)
3227  SI.setOperand(1, NewPtr);
3228  if (SI.getOperand(2) == OldPtr)
3229  SI.setOperand(2, NewPtr);
3230 
3231  LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3232  deleteIfTriviallyDead(OldPtr);
3233 
3234  // Fix the alignment of any loads or stores using this select.
3235  fixLoadStoreAlign(SI);
3236 
3237  // Selects can't be promoted on their own, but often can be speculated. We
3238  // check the speculation outside of the rewriter so that we see the
3239  // fully-rewritten alloca.
3240  SelectUsers.insert(&SI);
3241  return true;
3242  }
3243 };
3244 
3245 namespace {
3246 
3247 /// Visitor to rewrite aggregate loads and stores as scalar.
3248 ///
3249 /// This pass aggressively rewrites all aggregate loads and stores on
3250 /// a particular pointer (or any pointer derived from it which we can identify)
3251 /// with scalar loads and stores.
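///
/// As a rough illustration (simplified; instruction names are not the exact
/// ones produced by the rewriter), an aggregate load such as:
///
///   %v = load { i32, float }, { i32, float }* %p
///
/// is rewritten into per-field scalar operations along the lines of:
///
///   %p.0 = getelementptr inbounds { i32, float }, { i32, float }* %p, i32 0, i32 0
///   %v.0 = load i32, i32* %p.0
///   %p.1 = getelementptr inbounds { i32, float }, { i32, float }* %p, i32 0, i32 1
///   %v.1 = load float, float* %p.1
///   %v.a = insertvalue { i32, float } undef, i32 %v.0, 0
///   %v   = insertvalue { i32, float } %v.a, float %v.1, 1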
3252 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3253  // Befriend the base class so it can delegate to private visit methods.
3254  friend class InstVisitor<AggLoadStoreRewriter, bool>;
3255 
3256  /// Queue of pointer uses to analyze and potentially rewrite.
3257  SmallVector<Use *, 8> Queue;
3258 
3259  /// Set to prevent us from cycling with phi nodes and loops.
3260  SmallPtrSet<User *, 8> Visited;
3261 
3262  /// The current pointer use being rewritten. This is used to dig up the used
3263  /// value (as opposed to the user).
3264  Use *U = nullptr;
3265 
3266  /// Used to calculate offsets, and hence alignment, of subobjects.
3267  const DataLayout &DL;
3268 
3269 public:
3270  AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}
3271 
3272  /// Rewrite loads and stores through a pointer and all pointers derived from
3273  /// it.
3274  bool rewrite(Instruction &I) {
3275  LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3276  enqueueUsers(I);
3277  bool Changed = false;
3278  while (!Queue.empty()) {
3279  U = Queue.pop_back_val();
3280  Changed |= visit(cast<Instruction>(U->getUser()));
3281  }
3282  return Changed;
3283  }
3284 
3285 private:
3286  /// Enqueue all the users of the given instruction for further processing.
3287  /// This uses a set to de-duplicate users.
3288  void enqueueUsers(Instruction &I) {
3289  for (Use &U : I.uses())
3290  if (Visited.insert(U.getUser()).second)
3291  Queue.push_back(&U);
3292  }
3293 
3294  // Conservative default is to not rewrite anything.
3295  bool visitInstruction(Instruction &I) { return false; }
3296 
3297  /// Generic recursive split emission class.
3298  template <typename Derived> class OpSplitter {
3299  protected:
3300  /// The builder used to form new instructions.
3301  IRBuilderTy IRB;
3302 
3303  /// The indices to be used with insert- or extractvalue to select the
3304  /// appropriate value within the aggregate.
3305  SmallVector<unsigned, 4> Indices;
3306 
3307  /// The indices to a GEP instruction which will move Ptr to the correct slot
3308  /// within the aggregate.
3309  SmallVector<Value *, 4> GEPIndices;
3310 
3311  /// The base pointer of the original op, used as a base for GEPing the
3312  /// split operations.
3313  Value *Ptr;
3314 
3315  /// The base pointee type being GEPed into.
3316  Type *BaseTy;
3317 
3318  /// Known alignment of the base pointer.
3319  Align BaseAlign;
3320 
3321  /// To calculate offset of each component so we can correctly deduce
3322  /// alignments.
3323  const DataLayout &DL;
3324 
3325  /// Initialize the splitter with an insertion point, Ptr and start with a
3326  /// single zero GEP index.
3327  OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3328  Align BaseAlign, const DataLayout &DL)
3329  : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr),
3330  BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {}
3331 
3332  public:
3333  /// Generic recursive split emission routine.
3334  ///
3335  /// This method recursively splits an aggregate op (load or store) into
3336  /// scalar or vector ops. It splits recursively until it hits a single value
3337  /// and emits that single value operation via the template argument.
3338  ///
3339  /// The logic of this routine relies on GEPs and insertvalue and
3340  /// extractvalue all operating with the same fundamental index list, merely
3341  /// formatted differently (GEPs need actual values).
3342  ///
3343  /// \param Ty The type being split recursively into smaller ops.
3344  /// \param Agg The aggregate value being built up or stored, depending on
3345  /// whether this is splitting a load or a store respectively.
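  ///
  /// For example (illustrative only), splitting a load of { i32, [2 x float] }
  /// visits the leaves with the index lists
  ///   {0}, {1, 0}, {1, 1}
  /// and the matching GEP index lists
  ///   {0, 0}, {0, 1, 0}, {0, 1, 1}
  /// (the extra leading 0 is the initial GEP index set up in the constructor),
  /// emitting one scalar load or store per leaf.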
3346  void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
3347  if (Ty->isSingleValueType()) {
3348  unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
3349  return static_cast<Derived *>(this)->emitFunc(
3350  Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
3351  }
3352 
3353  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
3354  unsigned OldSize = Indices.size();
3355  (void)OldSize;
3356  for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
3357  ++Idx) {
3358  assert(Indices.size() == OldSize && "Did not return to the old size");
3359  Indices.push_back(Idx);
3360  GEPIndices.push_back(IRB.getInt32(Idx));
3361  emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
3362  GEPIndices.pop_back();
3363  Indices.pop_back();
3364  }
3365  return;
3366  }
3367 
3368  if (StructType *STy = dyn_cast<StructType>(Ty)) {
3369  unsigned OldSize = Indices.size();
3370  (void)OldSize;
3371  for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
3372  ++Idx) {
3373  assert(Indices.size() == OldSize && "Did not return to the old size");
3374  Indices.push_back(Idx);
3375  GEPIndices.push_back(IRB.getInt32(Idx));
3376  emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
3377  GEPIndices.pop_back();
3378  Indices.pop_back();
3379  }
3380  return;
3381  }
3382 
3383  llvm_unreachable("Only arrays and structs are aggregate loadable types");
3384  }
3385  };
3386 
3387  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
3388  AAMDNodes AATags;
3389 
3390  LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3391  AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
3392  : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3393  DL),
3394  AATags(AATags) {}
3395 
3396  /// Emit a leaf load of a single value. This is called at the leaves of the
3397  /// recursive emission to actually load values.
3398  void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3399  assert(Ty->isSingleValueType());
3400  // Load the single value and insert it using the indices.
3401  Value *GEP =
3402  IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3403  LoadInst *Load =
3404  IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
3405 
3406  APInt Offset(
3407  DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3408  if (AATags &&
3409  GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
3410  Load->setAAMetadata(AATags.shift(Offset.getZExtValue()));
3411 
3412  Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
3413  LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
3414  }
3415  };
3416 
3417  bool visitLoadInst(LoadInst &LI) {
3418  assert(LI.getPointerOperand() == *U);
3419  if (!LI.isSimple() || LI.getType()->isSingleValueType())
3420  return false;
3421 
3422  // We have an aggregate being loaded, split it apart.
3423  LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3424  AAMDNodes AATags;
3425  LI.getAAMetadata(AATags);
3426  LoadOpSplitter Splitter(&LI, *U, LI.getType(), AATags,
3427  getAdjustedAlignment(&LI, 0), DL);
3428  Value *V = UndefValue::get(LI.getType());
3429  Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
3430  Visited.erase(&LI);
3431  LI.replaceAllUsesWith(V);
3432  LI.eraseFromParent();
3433  return true;
3434  }
3435 
3436  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
3437  StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3438  AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
3439  : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3440  DL),
3441  AATags(AATags) {}
3442  AAMDNodes AATags;
3443  /// Emit a leaf store of a single value. This is called at the leaves of the
3444  /// recursive emission to actually produce stores.
3445  void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3446  assert(Ty->isSingleValueType());
3447  // Extract the single value and store it using the indices.
3448  //
3449  // The gep and extractvalue values are factored out of the CreateStore
3450  // call to make the output independent of the argument evaluation order.
3451  Value *ExtractValue =
3452  IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
3453  Value *InBoundsGEP =
3454  IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3455  StoreInst *Store =
3456  IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
3457 
3458  APInt Offset(
3459  DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3460  if (AATags &&
3461  GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
3462  Store->setAAMetadata(AATags.shift(Offset.getZExtValue()));
3463 
3464  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3465  }
3466  };
3467 
3468  bool visitStoreInst(StoreInst &SI) {
3469  if (!SI.isSimple() || SI.getPointerOperand() != *U)
3470  return false;
3471  Value *V = SI.getValueOperand();
3472  if (V->getType()->isSingleValueType())
3473  return false;
3474 
3475  // We have an aggregate being stored, split it apart.
3476  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3477  AAMDNodes AATags;
3478  SI.getAAMetadata(AATags);
3479  StoreOpSplitter Splitter(&SI, *U, V->getType(), AATags,
3480  getAdjustedAlignment(&SI, 0), DL);
3481  Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
3482  Visited.erase(&SI);
3483  SI.eraseFromParent();
3484  return true;
3485  }
3486 
3487  bool visitBitCastInst(BitCastInst &BC) {
3488  enqueueUsers(BC);
3489  return false;
3490  }
3491 
3492  bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
3493  enqueueUsers(ASC);
3494  return false;
3495  }
3496 
3497  // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
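  // Illustrative sketch (names invented): a constant-index GEP such as
  //   %sel = select i1 %c, i32* %a, i32* %b
  //   %gep = getelementptr inbounds i32, i32* %sel, i64 1
  // becomes
  //   %a.sroa.gep   = getelementptr inbounds i32, i32* %a, i64 1
  //   %b.sroa.gep   = getelementptr inbounds i32, i32* %b, i64 1
  //   %sel.sroa.sel = select i1 %c, i32* %a.sroa.gep, i32* %b.sroa.gep
  // so that each arm can be traced back to its own alloca.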
3498  bool foldGEPSelect(GetElementPtrInst &GEPI) {
3499  if (!GEPI.hasAllConstantIndices())
3500  return false;
3501 
3502  SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());
3503 
3504  LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):"
3505  << "\n original: " << *Sel
3506  << "\n " << GEPI);
3507 
3508  IRBuilderTy Builder(&GEPI);
3509  SmallVector<Value *, 4> Index(GEPI.indices());
3510  bool IsInBounds = GEPI.isInBounds();
3511 
3512  Type *Ty = GEPI.getSourceElementType();
3513  Value *True = Sel->getTrueValue();
3514  Value *NTrue =
3515  IsInBounds
3516  ? Builder.CreateInBoundsGEP(Ty, True, Index,
3517  True->getName() + ".sroa.gep")
3518  : Builder.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep");
3519 
3520  Value *False = Sel->getFalseValue();
3521 
3522  Value *NFalse =
3523  IsInBounds
3524  ? Builder.CreateInBoundsGEP(Ty, False, Index,
3525  False->getName() + ".sroa.gep")
3526  : Builder.CreateGEP(Ty, False, Index,
3527  False->getName() + ".sroa.gep");
3528 
3529  Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse,
3530  Sel->getName() + ".sroa.sel");
3531  Visited.erase(&GEPI);
3532  GEPI.replaceAllUsesWith(NSel);
3533  GEPI.eraseFromParent();
3534  Instruction *NSelI = cast<Instruction>(NSel);
3535  Visited.insert(NSelI);
3536  enqueueUsers(*NSelI);
3537 
3538  LLVM_DEBUG(dbgs() << "\n to: " << *NTrue
3539  << "\n " << *NFalse
3540  << "\n " << *NSel << '\n');
3541 
3542  return true;
3543  }
3544 
3545  // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
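  // Illustrative sketch (names invented): with a pointer PHI in the same block,
  //   %phi = phi i32* [ %a, %bb1 ], [ %b, %bb2 ]
  //   %gep = getelementptr inbounds i32, i32* %phi, i64 2
  // each incoming block gets its own GEP of %a or %b, and a new PHI
  // (%phi.sroa.phi) merges the already-offset pointers in place of %gep.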
3546  bool foldGEPPhi(GetElementPtrInst &GEPI) {
3547  if (!GEPI.hasAllConstantIndices())
3548  return false;
3549 
3550  PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
3551  if (GEPI.getParent() != PHI->getParent() ||
3552  llvm::any_of(PHI->incoming_values(), [](Value *In)
3553  { Instruction *I = dyn_cast<Instruction>(In);
3554  return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
3555  succ_empty(I->getParent()) ||
3556  !I->getParent()->isLegalToHoistInto();
3557  }))
3558  return false;
3559 
3560  LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):"
3561  << "\n original: " << *PHI
3562  << "\n " << GEPI
3563  << "\n to: ");
3564 
3565  SmallVector<Value *, 4> Index(GEPI.indices());
3566  bool IsInBounds = GEPI.isInBounds();
3567  IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
3568  PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
3569  PHI->getNumIncomingValues(),
3570  PHI->getName() + ".sroa.phi");
3571  for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
3572  BasicBlock *B = PHI->getIncomingBlock(I);
3573  Value *NewVal = nullptr;
3574  int Idx = NewPN->getBasicBlockIndex(B);
3575  if (Idx >= 0) {
3576  NewVal = NewPN->getIncomingValue(Idx);
3577  } else {
3578  Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
3579 
3580  IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
3581  Type *Ty = GEPI.getSourceElementType();
3582  NewVal = IsInBounds
3583  ? B.CreateInBoundsGEP(Ty, In, Index, In->getName() + ".sroa.gep")
3584  : B.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep");
3585  }
3586  NewPN->addIncoming(NewVal, B);
3587  }
3588 
3589  Visited.erase(&GEPI);
3590  GEPI.replaceAllUsesWith(NewPN);
3591  GEPI.eraseFromParent();
3592  Visited.insert(NewPN);
3593  enqueueUsers(*NewPN);
3594 
3595  LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
3596  dbgs() << "\n " << *In;
3597  dbgs() << "\n " << *NewPN << '\n');
3598 
3599  return true;
3600  }
3601 
3602  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
3603  if (isa<SelectInst>(GEPI.getPointerOperand()) &&
3604  foldGEPSelect(GEPI))
3605  return true;
3606 
3607  if (isa<PHINode>(GEPI.getPointerOperand()) &&
3608  foldGEPPhi(GEPI))
3609  return true;
3610 
3611  enqueueUsers(GEPI);
3612  return false;
3613  }
3614 
3615  bool visitPHINode(PHINode &PN) {
3616  enqueueUsers(PN);
3617  return false;
3618  }
3619 
3620  bool visitSelectInst(SelectInst &SI) {
3621  enqueueUsers(SI);
3622  return false;
3623  }
3624 };
3625 
3626 } // end anonymous namespace
3627 
3628 /// Strip aggregate type wrapping.
3629 ///
3630 /// This removes no-op aggregate types wrapping an underlying type. It will
3631 /// strip as many layers of types as it can without changing either the type
3632 /// size or the allocated size.
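///
/// For example (illustrative), { { float } } and [1 x float] both strip down
/// to float, while { float, i32 } is returned unchanged because removing the
/// wrapper would change the overall size.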
3633 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
3634  if (Ty->isSingleValueType())
3635  return Ty;
3636 
3637  uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedSize();
3638  uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedSize();
3639 
3640  Type *InnerTy;
3641  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
3642  InnerTy = ArrTy->getElementType();
3643  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
3644  const StructLayout *SL = DL.getStructLayout(STy);
3645  unsigned Index = SL->getElementContainingOffset(0);
3646  InnerTy = STy->getElementType(Index);
3647  } else {
3648  return Ty;
3649  }
3650 
3651  if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedSize() ||
3652  TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedSize())
3653  return Ty;
3654 
3655  return stripAggregateTypeWrapping(DL, InnerTy);
3656 }
3657 
3658 /// Try to find a partition of the aggregate type passed in for a given
3659 /// offset and size.
3660 ///
3661 /// This recurses through the aggregate type and tries to compute a subtype
3662 /// based on the offset and size. When the offset and size span a sub-section
3663 /// of an array, it will even compute a new array type for that sub-section,
3664 /// and the same for structs.
3665 ///
3666 /// Note that this routine is very strict and tries to find a partition of the
3667 /// type which produces the *exact* right offset and size. It is not forgiving
3668 /// when the size or offset causes either end of the type-based partition to be off.
3669 /// Also, this is a best-effort routine. It is reasonable to give up and not
3670 /// return a type if necessary.
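///
/// For example (illustrative, assuming a typical 64-bit DataLayout), given the
/// type { i32, { i64, i64 } }: Offset = 8 and Size = 16 returns the inner
/// { i64, i64 }; Offset = 8 and Size = 8 returns i64; Offset = 8 and Size = 12
/// gives up and returns nullptr because no element boundary falls at byte 20.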
3671 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
3672  uint64_t Size) {
3673  if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedSize() == Size)
3674  return stripAggregateTypeWrapping(DL, Ty);
3675  if (Offset > DL.getTypeAllocSize(Ty).getFixedSize() ||
3676  (DL.getTypeAllocSize(Ty).getFixedSize() - Offset) < Size)
3677  return nullptr;
3678 
3679  if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
3680  Type *ElementTy;
3681  uint64_t TyNumElements;
3682  if (auto *AT = dyn_cast<ArrayType>(Ty)) {
3683  ElementTy = AT->getElementType();
3684  TyNumElements = AT->getNumElements();
3685  } else {
3686  // FIXME: This isn't right for vectors with non-byte-sized or
3687  // non-power-of-two sized elements.
3688  auto *VT = cast<FixedVectorType>(Ty);
3689  ElementTy = VT->getElementType();
3690  TyNumElements = VT->getNumElements();
3691  }
3692  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
3693  uint64_t NumSkippedElements = Offset / ElementSize;
3694  if (NumSkippedElements >= TyNumElements)
3695  return nullptr;
3696  Offset -= NumSkippedElements * ElementSize;
3697 
3698  // First check if we need to recurse.
3699  if (Offset > 0 || Size < ElementSize) {
3700  // Bail if the partition ends in a different array element.
3701  if ((Offset + Size) > ElementSize)
3702  return nullptr;
3703  // Recurse through the element type trying to peel off offset bytes.
3704  return getTypePartition(DL, ElementTy, Offset, Size);
3705  }
3706  assert(Offset == 0);
3707 
3708  if (Size == ElementSize)
3709  return stripAggregateTypeWrapping(DL, ElementTy);
3710  assert(Size > ElementSize);
3711  uint64_t NumElements = Size / ElementSize;
3712  if (NumElements * ElementSize != Size)
3713  return nullptr;
3714  return ArrayType::get(ElementTy, NumElements);
3715  }
3716 
3717  StructType *STy = dyn_cast<StructType>(Ty);
3718  if (!STy)
3719  return nullptr;
3720 
3721  const StructLayout *SL = DL.getStructLayout(STy);
3722  if (Offset >= SL->getSizeInBytes())
3723  return nullptr;
3724  uint64_t EndOffset = Offset + Size;
3725  if (EndOffset > SL->getSizeInBytes())
3726  return nullptr;
3727 
3728  unsigned Index = SL->getElementContainingOffset(Offset);
3729  Offset -= SL->getElementOffset(Index);
3730 
3731  Type *ElementTy = STy->getElementType(Index);
3732  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
3733  if (Offset >= ElementSize)
3734  return nullptr; // The offset points into alignment padding.
3735 
3736  // See if any partition must be contained by the element.
3737  if (Offset > 0 || Size < ElementSize) {
3738  if ((Offset + Size) > ElementSize)
3739  return nullptr;
3740  return getTypePartition(DL, ElementTy, Offset, Size);
3741  }
3742  assert(Offset == 0);
3743 
3744  if (Size == ElementSize)
3745  return stripAggregateTypeWrapping(DL, ElementTy);
3746 
3747  StructType::element_iterator EI = STy->element_begin() + Index,
3748  EE = STy->element_end();
3749  if (EndOffset < SL->getSizeInBytes()) {
3750  unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
3751  if (Index == EndIndex)
3752  return nullptr; // Within a single element and its padding.
3753 
3754  // Don't try to form "natural" types if the elements don't line up with the
3755  // expected size.
3756  // FIXME: We could potentially recurse down through the last element in the
3757  // sub-struct to find a natural end point.
3758  if (SL->getElementOffset(EndIndex) != EndOffset)
3759  return nullptr;
3760 
3761  assert(Index < EndIndex);
3762  EE = STy->element_begin() + EndIndex;
3763  }
3764 
3765  // Try to build up a sub-structure.
3766  StructType *SubTy =
3767  StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
3768  const StructLayout *SubSL = DL.getStructLayout(SubTy);
3769  if (Size != SubSL->getSizeInBytes())
3770  return nullptr; // The sub-struct doesn't have quite the size needed.
3771 
3772  return SubTy;
3773 }
3774 
3775 /// Pre-split loads and stores to simplify rewriting.
3776 ///
3777 /// We want to break up the splittable load+store pairs as much as
3778 /// possible. This is important to do as a preprocessing step, as once we
3779 /// start rewriting the accesses to partitions of the alloca we lose the
3780 /// necessary information to correctly split apart paired loads and stores
3781 /// which both point into this alloca. The case to consider is something like
3782 /// the following:
3783 ///
3784 /// %a = alloca [12 x i8]
3785 /// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
3786 /// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
3787 /// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
3788 /// %iptr1 = bitcast i8* %gep1 to i64*
3789 /// %iptr2 = bitcast i8* %gep2 to i64*
3790 /// %fptr1 = bitcast i8* %gep1 to float*
3791 /// %fptr2 = bitcast i8* %gep2 to float*
3792 /// %fptr3 = bitcast i8* %gep3 to float*
3793 /// store float 0.0, float* %fptr1
3794 /// store float 1.0, float* %fptr2
3795 /// %v = load i64* %iptr1
3796 /// store i64 %v, i64* %iptr2
3797 /// %f1 = load float* %fptr2
3798 /// %f2 = load float* %fptr3
3799 ///
3800 /// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
3801 /// promote everything so we recover the 2 SSA values that should have been
3802 /// there all along.
3803 ///
3804 /// \returns true if any changes are made.
3805 bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
3806  LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
3807 
3808  // Track the loads and stores which are candidates for pre-splitting here, in
3809  // the order they first appear during the partition scan. These give stable
3810  // iteration order and a basis for tracking which loads and stores we
3811  // actually split.
3812  SmallVector<LoadInst *, 4> Loads;
3813  SmallVector<StoreInst *, 4> Stores;
3814 
3815  // We need to accumulate the splits required of each load or store where we
3816  // can find them via a direct lookup. This is important to cross-check loads
3817  // and stores against each other. We also track the slice so that we can kill
3818  // all the slices that end up split.
3819  struct SplitOffsets {
3820  Slice *S;
3821  std::vector<uint64_t> Splits;
3822  };
3823  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
3824 
3825  // Track loads out of this alloca which cannot, for any reason, be pre-split.
3826  // This is important as we also cannot pre-split stores of those loads!
3827  // FIXME: This is all pretty gross. It means that we can be more aggressive
3828  // in pre-splitting when the load feeding the store happens to come from
3829  // a separate alloca. Put another way, the effectiveness of SROA would be
3830  // decreased by a frontend which just concatenated all of its local allocas
3831  // into one big flat alloca. But defeating such patterns is exactly the job
3832  // SROA is tasked with! Sadly, to avoid this discrepancy we would have to
3833  // change store pre-splitting to actually force pre-splitting of the load
3834  // that feeds it *and all stores*. That makes pre-splitting much harder, but
3835  // maybe it would make it more principled?
3836  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
3837 
3838  LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
3839  for (auto &P : AS.partitions()) {
3840  for (Slice &S : P) {
3841  Instruction *I = cast<Instruction>(S.getUse()->getUser());
3842  if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
3843  // If this is a load we have to track that it can't participate in any
3844  // pre-splitting. If this is a store of a load we have to track that
3845  // that load also can't participate in any pre-splitting.
3846  if (auto *LI = dyn_cast<LoadInst>(I))
3847  UnsplittableLoads.insert(LI);
3848  else if (auto *SI = dyn_cast<StoreInst>(I))
3849  if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
3850  UnsplittableLoads.insert(LI);
3851  continue;
3852  }
3853  assert(P.endOffset() > S.beginOffset() &&
3854  "Empty or backwards partition!");
3855 
3856  // Determine if this is a pre-splittable slice.
3857  if (auto *LI = dyn_cast<LoadInst>(I)) {
3858  assert(!LI->isVolatile() && "Cannot split volatile loads!");
3859 
3860  // The load must be used exclusively to store into other pointers for
3861  // us to be able to arbitrarily pre-split it. The stores must also be
3862  // simple to avoid changing semantics.
3863  auto IsLoadSimplyStored = [](LoadInst *LI) {
3864  for (User *LU : LI->users()) {
3865  auto *SI = dyn_cast<StoreInst>(LU);
3866  if (!SI || !SI->isSimple())
3867  return false;
3868  }
3869  return true;
3870  };
3871  if (!IsLoadSimplyStored(LI)) {
3872  UnsplittableLoads.insert(LI);
3873  continue;
3874  }
3875 
3876  Loads.push_back(LI);
3877  } else if (auto *SI = dyn_cast<StoreInst>(I)) {
3878  if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
3879  // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
3880  continue;
3881  auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
3882  if (!StoredLoad || !StoredLoad->isSimple())
3883  continue;
3884  assert(!SI->isVolatile() && "Cannot split volatile stores!");
3885 
3886  Stores.push_back(SI);
3887  } else {
3888  // Other uses cannot be pre-split.
3889  continue;
3890  }
3891 
3892  // Record the initial split.
3893  LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
3894  auto &Offsets = SplitOffsetsMap[I];
3895  assert(Offsets.Splits.empty() &&
3896  "Should not have splits the first time we see an instruction!");
3897  Offsets.S = &S;
3898  Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
3899  }
3900 
3901  // Now scan the already split slices, and add a split for any of them which
3902  // we're going to pre-split.
3903  for (Slice *S : P.splitSliceTails()) {
3904  auto SplitOffsetsMapI =
3905  SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
3906  if (SplitOffsetsMapI == SplitOffsetsMap.end())
3907  continue;
3908  auto &Offsets = SplitOffsetsMapI->second;
3909 
3910  assert(Offsets.S == S && "Found a mismatched slice!");
3911  assert(!Offsets.Splits.empty() &&
3912  "Cannot have an empty set of splits on the second partition!");
3913  assert(Offsets.Splits.back() ==
3914  P.beginOffset() - Offsets.S->beginOffset() &&
3915  "Previous split does not end where this one begins!");
3916 
3917  // Record each split. The last partition's end isn't needed as the size
3918  // of the slice dictates that.
3919  if (S->endOffset() > P.endOffset())
3920  Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
3921  }
3922  }
3923 
3924  // We may have split loads where some of their stores are split stores. For
3925  // such loads and stores, we can only pre-split them if their splits exactly
3926  // match relative to their starting offset. We have to verify this prior to
3927  // any rewriting.
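  // For example (made-up offsets): a load split at relative offsets {4, 8} can
  // feed a store split at {4, 8}, but not one split at {2, 6}; in the latter
  // case both the load and the store are dropped from the candidate lists.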
3928  llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
3929  // Lookup the load we are storing in our map of split
3930  // offsets.
3931  auto *LI = cast<LoadInst>(SI->getValueOperand());
3932  // If it was completely unsplittable, then we're done,
3933  // and this store can't be pre-split.
3934  if (UnsplittableLoads.count(LI))
3935  return true;
3936 
3937  auto LoadOffsetsI = SplitOffsetsMap.find(LI);
3938  if (LoadOffsetsI == SplitOffsetsMap.end())
3939  return false; // Unrelated loads are definitely safe.
3940  auto &LoadOffsets = LoadOffsetsI->second;
3941 
3942  // Now lookup the store's offsets.
3943  auto &StoreOffsets = SplitOffsetsMap[SI];
3944 
3945  // If the relative offsets of each split in the load and
3946  // store match exactly, then we can split them and we
3947  // don't need to remove them here.
3948  if (LoadOffsets.Splits == StoreOffsets.Splits)
3949  return false;
3950 
3951  LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
3952  << " " << *LI << "\n"
3953  << " " << *SI << "\n");
3954 
3955  // We've found a store and load that we need to split
3956  // with mismatched relative splits. Just give up on them
3957  // and remove both instructions from our list of
3958  // candidates.
3959  UnsplittableLoads.insert(LI);
3960  return true;
3961  });
3962  // Now we have to go *back* through all the stores, because a later store may
3963  // have caused an earlier store's load to become unsplittable and if it is
3964  // unsplittable for the later store, then we can't rely on it being split in
3965  // the earlier store either.
3966  llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
3967  auto *LI = cast<LoadInst>(SI->getValueOperand());
3968  return UnsplittableLoads.count(LI);
3969  });
3970  // Once we've established all the loads that can't be split for some reason,
3971  // filter any that made it into our list out.
3972  llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
3973  return UnsplittableLoads.count(LI);
3974  });
3975 
3976  // If no loads or stores are left, there is no pre-splitting to be done for
3977  // this alloca.
3978  if (Loads.empty() && Stores.empty())
3979  return false;
3980 
3981  // From here on, we can't fail and will be building new accesses, so rig up
3982  // an IR builder.
3983  IRBuilderTy IRB(&AI);
3984 
3985  // Collect the new slices which we will merge into the alloca slices.
3986  SmallVector<Slice, 4> NewSlices;
3987 
3988  // Track any allocas we end up splitting loads and stores for so we iterate
3989  // on them.
3990  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
3991 
3992  // At this point, we have collected all of the loads and stores we can
3993  // pre-split, and the specific splits needed for them. We actually do the
3994  // splitting in a specific order so that we can handle the case where one of
3995  // the loads is the value operand of one of the stores.
3996  //
3997  // First, we rewrite all of the split loads, and just accumulate each split
3998  // load in a parallel structure. We also build the slices for them and append
3999  // them to the alloca slices.
4000  SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4001  std::vector<LoadInst *> SplitLoads;
4002  const DataLayout &DL = AI.getModule()->getDataLayout();
4003  for (LoadInst *LI : Loads) {
4004  SplitLoads.clear();
4005 
4006  IntegerType *Ty = cast<IntegerType>(LI->getType());
4007  assert(Ty->getBitWidth() % 8 == 0);
4008  uint64_t LoadSize = Ty->getBitWidth() / 8;
4009  assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
4010 
4011  auto &Offsets = SplitOffsetsMap[LI];
4012  assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4013  "Slice size should always match load size exactly!");
4014  uint64_t BaseOffset = Offsets.S->beginOffset();
4015  assert(BaseOffset + LoadSize > BaseOffset &&
4016  "Cannot represent alloca access size using 64-bit integers!");
4017 
4018  Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
4019  IRB.SetInsertPoint(LI);
4020 
4021  LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4022 
4023  uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4024  int Idx = 0, Size = Offsets.Splits.size();
4025  for (;;) {
4026  auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
4027  auto AS = LI->getPointerAddressSpace();
4028  auto *PartPtrTy = PartTy->getPointerTo(AS);
4029  LoadInst *PLoad = IRB.CreateAlignedLoad(
4030  PartTy,
4031  getAdjustedPtr(IRB, DL, BasePtr,
4032  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4033  PartPtrTy, BasePtr->getName() + "."),
4034  getAdjustedAlignment(LI, PartOffset),
4035  /*IsVolatile*/ false, LI->getName());
4036  PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4037  LLVMContext::MD_access_group});
4038 
4039  // Append this load onto the list of split loads so we can find it later
4040  // to rewrite the stores.
4041  SplitLoads.push_back(PLoad);
4042 
4043  // Now build a new slice for the alloca.
4044  NewSlices.push_back(
4045  Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4046  &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4047  /*IsSplittable*/ false));
4048  LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4049  << ", " << NewSlices.back().endOffset()
4050  << "): " << *PLoad << "\n");
4051 
4052  // See if we've handled all the splits.
4053  if (Idx >= Size)
4054  break;
4055 
4056  // Setup the next partition.
4057  PartOffset = Offsets.Splits[Idx];
4058  ++Idx;
4059  PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
4060  }
4061 
4062  // Now that we have the split loads, do the slow walk over all uses of the
4063  // load and rewrite them as split stores, or save the split loads to use
4064  // below if the store is going to be split there anyways.
4065  bool DeferredStores = false;
4066  for (User *LU : LI->users()) {
4067  StoreInst *SI = cast<StoreInst>(LU);
4068  if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4069  DeferredStores = true;
4070  LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4071  << "\n");
4072  continue;
4073  }
4074 
4075  Value *StoreBasePtr = SI->getPointerOperand();
4076  IRB.SetInsertPoint(SI);
4077 
4078  LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4079 
4080  for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4081  LoadInst *PLoad = SplitLoads[Idx];
4082  uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4083  auto *PartPtrTy =
4084  PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
4085 
4086  auto AS = SI->getPointerAddressSpace();
4087  StoreInst *PStore = IRB.CreateAlignedStore(
4088  PLoad,
4089  getAdjustedPtr(IRB, DL, StoreBasePtr,
4090  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4091  PartPtrTy, StoreBasePtr->getName() + "."),
4092  getAdjustedAlignment(SI, PartOffset),
4093  /*IsVolatile*/ false);
4094  PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4095  LLVMContext::MD_access_group});
4096  LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4097  }
4098 
4099  // We want to immediately iterate on any allocas impacted by splitting
4100  // this store, and we have to track any promotable alloca (indicated by
4101  // a direct store) as needing to be resplit because it is no longer
4102  // promotable.
4103  if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
4104  ResplitPromotableAllocas.insert(OtherAI);
4105  Worklist.insert(OtherAI);
4106  } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4107  StoreBasePtr->stripInBoundsOffsets())) {
4108  Worklist.insert(OtherAI);
4109  }
4110 
4111  // Mark the original store as dead.
4112  DeadInsts.push_back(SI);
4113  }
4114 
4115  // Save the split loads if there are deferred stores among the users.
4116  if (DeferredStores)
4117  SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
4118 
4119  // Mark the original load as dead and kill the original slice.
4120  DeadInsts.push_back(LI);
4121  Offsets.S->kill();
4122  }
4123 
4124  // Second, we rewrite all of the split stores. At this point, we know that
4125  // all loads from this alloca have been split already. For stores of such
4126  // loads, we can simply look up the pre-existing split loads. For stores of
4127  // other loads, we split those loads first and then write split stores of
4128  // them.
4129  for (StoreInst *SI : Stores) {
4130  auto *LI = cast<LoadInst>(SI->getValueOperand());
4131  IntegerType *Ty = cast<IntegerType>(LI->getType());
4132  assert(Ty->getBitWidth() % 8 == 0);
4133  uint64_t StoreSize = Ty->getBitWidth() / 8;
4134  assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4135 
4136  auto &Offsets = SplitOffsetsMap[SI];
4137  assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4138  "Slice size should always match load size exactly!");
4139  uint64_t BaseOffset = Offsets.S->beginOffset();
4140  assert(BaseOffset + StoreSize > BaseOffset &&
4141  "Cannot represent alloca access size using 64-bit integers!");
4142 
4143  Value *LoadBasePtr = LI->getPointerOperand();
4144  Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
4145 
4146  LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4147 
4148  // Check whether we have an already split load.
4149  auto SplitLoadsMapI = SplitLoadsMap.find(LI);
4150  std::vector<LoadInst *> *SplitLoads = nullptr;
4151  if (SplitLoadsMapI != SplitLoadsMap.end()) {
4152  SplitLoads = &SplitLoadsMapI->second;
4153  assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
4154  "Too few split loads for the number of splits in the store!");
4155  } else {
4156  LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
4157  }
4158 
4159  uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4160  int Idx = 0, Size = Offsets.Splits.size();
4161  for (;;) {
4162  auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
4163  auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
4164  auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
4165 
4166  // Either lookup a split load or create one.
4167  LoadInst *PLoad;
4168  if (SplitLoads) {
4169  PLoad = (*SplitLoads)[Idx];
4170  } else {
4171  IRB.SetInsertPoint(LI);
4172  auto AS = LI->getPointerAddressSpace();
4173  PLoad = IRB.CreateAlignedLoad(
4174  PartTy,
4175  getAdjustedPtr(IRB, DL, LoadBasePtr,
4176  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4177  LoadPartPtrTy, LoadBasePtr->getName() + "."),
4178  getAdjustedAlignment(LI, PartOffset),
4179  /*IsVolatile*/ false, LI->getName());
4180  PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4181  LLVMContext::MD_access_group});
4182  }
4183 
4184  // And store this partition.
4185  IRB.SetInsertPoint(SI);
4186  auto AS = SI->getPointerAddressSpace();
4187  StoreInst *PStore = IRB.CreateAlignedStore(
4188  PLoad,
4189  getAdjustedPtr(IRB, DL, StoreBasePtr,
4190  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4191  StorePartPtrTy, StoreBasePtr->getName() + "."),
4192  getAdjustedAlignment(SI, PartOffset),
4193  /*IsVolatile*/ false);
4194  PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4195  LLVMContext::MD_access_group});
4196 
4197  // Now build a new slice for the alloca.
4198  NewSlices.push_back(
4199  Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4200  &PStore->getOperandUse(PStore->getPointerOperandIndex()),
4201  /*IsSplittable*/ false));
4202  LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4203  << ", " << NewSlices.back().endOffset()
4204  << "): " << *PStore << "\n");
4205  if (!SplitLoads) {
4206  LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
4207  }
4208 
4209  // See if we've finished all the splits.
4210  if (Idx >= Size)
4211  break;
4212 
4213  // Setup the next partition.
4214  PartOffset = Offsets.Splits[Idx];
4215  ++Idx;
4216  PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
4217  }
4218 
4219  // We want to immediately iterate on any allocas impacted by splitting
4220  // this load, which is only relevant if it isn't a load of this alloca and
4221  // thus we didn't already split the loads above. We also have to keep track
4222  // of any promotable allocas we split loads on as they can no longer be
4223  // promoted.
4224  if (!SplitLoads) {
4225  if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
4226  assert(OtherAI != &AI && "We can't re-split our own alloca!");
4227  ResplitPromotableAllocas.insert(OtherAI);
4228  Worklist.insert(OtherAI);
4229  } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4230  LoadBasePtr->stripInBoundsOffsets())) {
4231  assert(OtherAI != &AI && "We can't re-split our own alloca!");
4232  Worklist.insert(OtherAI);
4233  }
4234  }
4235 
4236  // Mark the original store as dead now that we've split it up and kill its
4237  // slice. Note that we leave the original load in place unless this store
4238  // was its only use. It may in turn be split up if it is an alloca load
4239  // for some other alloca, but it may be a normal load. This may introduce
4240  // redundant loads, but where those can be merged the rest of the optimizer
4241  // should handle the merging, and this uncovers SSA splits which is more
4242  // important. In practice, the original loads will almost always be fully
4243  // split and removed eventually, and the splits will be merged by any
4244  // trivial CSE, including instcombine.
4245  if (LI->hasOneUse()) {
4246  assert(*LI->user_begin() == SI && "Single use isn't this store!");
4247  DeadInsts.push_back(LI);
4248  }
4249  DeadInsts.push_back(SI);
4250  Offsets.S->kill();
4251  }
4252 
4253  // Remove the killed slices that have been pre-split.
4254  llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
4255 
4256  // Insert our new slices. This will sort and merge them into the sorted
4257  // sequence.
4258  AS.insert(NewSlices);
4259 
4260  LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
4261 #ifndef NDEBUG
4262  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
4263  LLVM_DEBUG(AS.print(dbgs(), I, " "));
4264 #endif
4265 
4266  // Finally, don't try to promote any allocas that now require re-splitting.
4267  // They have already been added to the worklist above.
4268  llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) {
4269  return ResplitPromotableAllocas.count(AI);
4270  });
4271 
4272  return true;
4273 }
4274 
4275 /// Rewrite an alloca partition's users.
4276 ///
4277 /// This routine drives both of the rewriting goals of the SROA pass. It tries
4278 /// to rewrite uses of an alloca partition to be conducive for SSA value
4279 /// promotion. If the partition needs a new, more refined alloca, this will
4280 /// build that new alloca, preserving as much type information as possible, and
4281 /// rewrite the uses of the old alloca to point at the new one and have the
4282 /// appropriate new offsets. It also evaluates how successful the rewrite was
4283 /// at enabling promotion and if it was successful queues the alloca to be
4284 /// promoted.
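///
/// As a rough example (not taken from a test case): a partition covering bytes
/// [0, 8) of a [16 x i8] alloca whose uses in that range all operate on i64
/// will typically get a fresh i64 alloca (named "<orig>.sroa.N"), with the
/// loads and stores in that range rewritten to target it.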
4285 AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
4286  Partition &P) {
4287  // Try to compute a friendly type for this partition of the alloca. This
4288  // won't always succeed, in which case we fall back to a legal integer type
4289  // or an i8 array of an appropriate size.
4290  Type *SliceTy = nullptr;
4291  const DataLayout &DL = AI.getModule()->getDataLayout();
4292  std::pair<Type *, IntegerType *> CommonUseTy =
4293  findCommonType(P.begin(), P.end(), P.endOffset());
4294  // Do all uses operate on the same type?
4295  if (CommonUseTy.first)
4296  if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size())
4297  SliceTy = CommonUseTy.first;
4298  // If not, can we find an appropriate subtype in the original allocated type?
4299  if (!SliceTy)
4300  if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
4301  P.beginOffset(), P.size()))
4302  SliceTy = TypePartitionTy;
4303  // If still not, can we use the largest bitwidth integer type used?
4304  if (!SliceTy && CommonUseTy.second)
4305  if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size())
4306  SliceTy = CommonUseTy.second;
4307  if ((!SliceTy || (SliceTy->isArrayTy() &&
4308  SliceTy->getArrayElementType()->isIntegerTy())) &&
4309  DL.isLegalInteger(P.size() * 8))
4310  SliceTy = Type::getIntNTy(*C, P.size() * 8);
4311  if (!SliceTy)
4312  SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
4313  assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
4314 
4315  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
4316 
4317  VectorType *VecTy =
4318  IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
4319  if (VecTy)
4320  SliceTy = VecTy;
4321 
4322  // Check for the case where we're going to rewrite to a new alloca of the
4323  // exact same type as the original, and with the same access offsets. In that
4324  // case, re-use the existing alloca, but still run through the rewriter to
4325  // perform phi and select speculation.
4326  // P.beginOffset() can be non-zero even with the same type in a case with
4327  // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
4328  AllocaInst *NewAI;
4329  if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
4330  NewAI = &AI;
4331  // FIXME: We should be able to bail at this point with "nothing changed".
4332  // FIXME: We might want to defer PHI speculation until after here.
4333  // FIXME: return nullptr;
4334  } else {
4335  // Make sure the alignment is compatible with P.beginOffset().
4336  const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
4337  // If we will get at least this much alignment from the type alone, leave
4338  // the alloca's alignment unconstrained.
4339  const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
4340  NewAI = new AllocaInst(
4341  SliceTy, AI.getType()->getAddressSpace(), nullptr,
4342  IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
4343  AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
4344  // Copy the old AI debug location over to the new one.
4345  NewAI->setDebugLoc(AI.getDebugLoc());
4346  ++NumNewAllocas;
4347  }
4348 
4349  LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
4350  << "[" << P.beginOffset() << "," << P.endOffset()
4351  << ") to: " << *NewAI << "\n");
4352 
4353  // Track the high watermark on the worklist as it is only relevant for
4354  // promoted allocas. We will reset it to this point if the alloca is not in
4355  // fact scheduled for promotion.
4356  unsigned PPWOldSize = PostPromotionWorklist.size();
4357  unsigned NumUses = 0;
4358  SmallSetVector<PHINode *, 8> PHIUsers;
4359  SmallSetVector<SelectInst *, 8> SelectUsers;
4360 
4361  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
4362  P.endOffset(), IsIntegerPromotable, VecTy,
4363  PHIUsers, SelectUsers);
4364  bool Promotable = true;
4365  for (Slice *S : P.splitSliceTails()) {
4366  Promotable &= Rewriter.visit(S);
4367  ++NumUses;
4368  }
4369  for (Slice &S : P) {
4370  Promotable &= Rewriter.visit(&S);
4371  ++NumUses;
4372  }
4373 
4374  NumAllocaPartitionUses += NumUses;
4375  MaxUsesPerAllocaPartition.updateMax(NumUses);
4376 
4377  // Now that we've processed all the slices in the new partition, check if any
4378  // PHIs or Selects would block promotion.
4379  for (PHINode *PHI : PHIUsers)
4380  if (!isSafePHIToSpeculate(*PHI)) {
4381  Promotable = false;
4382  PHIUsers.clear();
4383  SelectUsers.clear();
4384  break;
4385  }
4386 
4387  for (SelectInst *Sel : SelectUsers)
4388  if (!isSafeSelectToSpeculate(*Sel)) {
4389  Promotable = false;
4390  PHIUsers.clear();
4391  SelectUsers.clear();
4392  break;
4393  }
4394 
4395  if (Promotable) {
4396  for (Use *U : AS.getDeadUsesIfPromotable()) {
4397  auto *OldInst = dyn_cast<Instruction>(U->get());
4398  Value::dropDroppableUse(*U);
4399  if (OldInst)
4400  if (isInstructionTriviallyDead(OldInst))
4401  DeadInsts.push_back(OldInst);
4402  }
4403  if (PHIUsers.empty() && SelectUsers.empty()) {
4404  // Promote the alloca.
4405  PromotableAllocas.push_back(NewAI);
4406  } else {
4407  // If we have either PHIs or Selects to speculate, add them to those
4408  // worklists and re-queue the new alloca so that we promote it on the
4409  // next iteration.
4410  for (PHINode *PHIUser : PHIUsers)
4411  SpeculatablePHIs.insert(PHIUser);
4412  for (SelectInst *SelectUser : SelectUsers)
4413  SpeculatableSelects.insert(SelectUser);
4414  Worklist.insert(NewAI);
4415  }
4416  } else {
4417  // Drop any post-promotion work items if promotion didn't happen.
4418  while (PostPromotionWorklist.size() > PPWOldSize)
4419  PostPromotionWorklist.pop_back();
4420 
4421  // We couldn't promote and we didn't create a new partition; nothing
4422  // happened.
4423  if (NewAI == &AI)
4424  return nullptr;
4425 
4426  // If we can't promote the alloca, iterate on it to check for new
4427  // refinements exposed by splitting the current alloca. Don't iterate on an
4428  // alloca which didn't actually change and didn't get promoted.
4429  Worklist.insert(NewAI);
4430  }
4431 
4432  return NewAI;
4433 }
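As a hedged illustration of the result (hypothetical names, not guaranteed output of the pass), a two-field aggregate whose fields are only ever accessed separately is rewritten into one new alloca per partition, carrying the ".sroa.<n>" suffix built above from the index of the partition's first slice:

  %agg = alloca { i32, float }
  ; field 0 is only accessed as an i32 and field 1 only as a float, so the
  ; partitions [0,4) and [4,8) are rewritten to something like:
  %agg.sroa.0 = alloca i32
  %agg.sroa.2 = alloca float
  ; each of which then becomes a candidate for mem2reg promotion on a later
  ; iteration of the pass.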
4434 
4435 /// Walks the slices of an alloca and forms partitions based on them,
4436 /// rewriting each of their uses.
4437 bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
4438  if (AS.begin() == AS.end())
4439  return false;
4440 
4441  unsigned NumPartitions = 0;
4442  bool Changed = false;
4443  const DataLayout &DL = AI.getModule()->getDataLayout();
4444 
4445  // First try to pre-split loads and stores.
4446  Changed |= presplitLoadsAndStores(AI, AS);
4447 
4448  // Now that we have identified any pre-splitting opportunities,
4449  // mark loads and stores unsplittable except for the following case.
4450  // We leave a slice splittable if all other slices are disjoint or fully
4451  // included in the slice, such as whole-alloca loads and stores.
4452  // If we fail to split these during pre-splitting, we want to force them
4453  // to be rewritten into a partition.
4454  bool IsSorted = true;
4455 
4456  uint64_t AllocaSize =
4457  DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
4458  const uint64_t MaxBitVectorSize = 1024;
4459  if (AllocaSize <= MaxBitVectorSize) {
4460  // If a byte boundary is included in any load or store, a slice starting or
4461  // ending at the boundary is not splittable.
4462  SmallBitVector SplittableOffset(AllocaSize + 1, true);
4463  for (Slice &S : AS)
4464  for (unsigned O = S.beginOffset() + 1;
4465  O < S.endOffset() && O < AllocaSize; O++)
4466  SplittableOffset.reset(O);
4467 
4468  for (Slice &S : AS) {
4469  if (!S.isSplittable())
4470  continue;
4471 
4472  if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
4473  (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
4474  continue;
4475 
4476  if (isa<LoadInst>(S.getUse()->getUser()) ||
4477  isa<StoreInst>(S.getUse()->getUser())) {
4478  S.makeUnsplittable();
4479  IsSorted = false;
4480  }
4481  }
4482  }
4483  else {
4484  // We only allow whole-alloca splittable loads and stores
4485  // for a large alloca to avoid creating too large BitVector.
4486  for (Slice &S : AS) {
4487  if (!S.isSplittable())
4488  continue;
4489 
4490  if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
4491  continue;
4492 
4493  if (isa<LoadInst>(S.getUse()->getUser()) ||
4494  isa<StoreInst>(S.getUse()->getUser())) {
4495  S.makeUnsplittable();
4496  IsSorted = false;
4497  }
4498  }
4499  }
4500 
4501  if (!IsSorted)
4502  llvm::sort(AS);
4503 
4504  /// Describes the allocas introduced by rewritePartition in order to migrate
4505  /// the debug info.
4506  struct Fragment {
4507  AllocaInst *Alloca;
4508  uint64_t Offset;
4509  uint64_t Size;
4510  Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
4511  : Alloca(AI), Offset(O), Size(S) {}
4512  };
4513  SmallVector<Fragment, 4> Fragments;
4514 
4515  // Rewrite each partition.
4516  for (auto &P : AS.partitions()) {
4517  if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
4518  Changed = true;
4519  if (NewAI != &AI) {
4520  uint64_t SizeOfByte = 8;
4521  uint64_t AllocaSize =
4522  DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedSize();
4523  // Don't include any padding.
4524  uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
4525  Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
4526  }
4527  }
4528  ++NumPartitions;
4529  }
4530 
4531  NumAllocaPartitions += NumPartitions;
4532  MaxPartitionsPerAlloca.updateMax(NumPartitions);
4533 
4534  // Migrate debug information from the old alloca to the new alloca(s)
4535  // and the individual partitions.
4536  TinyPtrVector<DbgVariableIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
4537  for (DbgVariableIntrinsic *DbgDeclare : DbgDeclares) {
4538  auto *Expr = DbgDeclare->getExpression();
4539  DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
4540  uint64_t AllocaSize =
4541  DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedSize();
4542  for (auto Fragment : Fragments) {
4543  // Create a fragment expression describing the new partition or reuse AI's
4544  // expression if there is only one partition.
4545  auto *FragmentExpr = Expr;
4546  if (Fragment.Size < AllocaSize || Expr->isFragment()) {
4547  // If this alloca is already a scalar replacement of a larger aggregate,
4548  // Fragment.Offset describes the offset inside the scalar.
4549  auto ExprFragment = Expr->getFragmentInfo();
4550  uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
4551  uint64_t Start = Offset + Fragment.Offset;
4552  uint64_t Size = Fragment.Size;
4553  if (ExprFragment) {
4554  uint64_t AbsEnd =
4555  ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
4556  if (Start >= AbsEnd)
4557  // No need to describe a SROAed padding.
4558  continue;
4559  Size = std::min(Size, AbsEnd - Start);
4560  }
4561  // The new, smaller fragment is stenciled out from the old fragment.
4562  if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
4563  assert(Start >= OrigFragment->OffsetInBits &&
4564  "new fragment is outside of original fragment");
4565  Start -= OrigFragment->OffsetInBits;
4566  }
4567 
4568  // The alloca may be larger than the variable.
4569  auto VarSize = DbgDeclare->getVariable()->getSizeInBits();
4570  if (VarSize) {
4571  if (Size > *VarSize)
4572  Size = *VarSize;
4573  if (Size == 0 || Start + Size > *VarSize)
4574  continue;
4575  }
4576 
4577  // Avoid creating a fragment expression that covers the entire variable.
4578  if (!VarSize || *VarSize != Size) {
4579  if (auto E =
4580  DIExpression::createFragmentExpression(Expr, Start, Size))
4581  FragmentExpr = *E;
4582  else
4583  continue;
4584  }
4585  }
4586 
4587  // Remove any existing intrinsics on the new alloca describing
4588  // the variable fragment.
4589  for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca)) {
4590  auto SameVariableFragment = [](const DbgVariableIntrinsic *LHS,
4591  const DbgVariableIntrinsic *RHS) {
4592  return LHS->getVariable() == RHS->getVariable() &&
4593  LHS->getDebugLoc()->getInlinedAt() ==
4594  RHS->getDebugLoc()->getInlinedAt();
4595  };
4596  if (SameVariableFragment(OldDII, DbgDeclare))
4597  OldDII->eraseFromParent();
4598  }
4599 
4600  DIB.insertDeclare(Fragment.Alloca, DbgDeclare->getVariable(), FragmentExpr,
4601  DbgDeclare->getDebugLoc(), &AI);
4602  }
4603  }
4604  return Changed;
4605 }
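The debug-info migration above can be pictured with a small, hypothetical sketch: a dbg.declare covering a 64-bit variable stored in the original alloca is re-emitted once per new partition alloca, with a DW_OP_LLVM_fragment expression describing which bits of the variable that alloca now holds (!var stands in for the variable's DILocalVariable metadata):

  call void @llvm.dbg.declare(metadata { i32, i32 }* %agg, metadata !var,
                              metadata !DIExpression())
  ; after splitting into two 32-bit partition allocas, roughly:
  call void @llvm.dbg.declare(metadata i32* %agg.sroa.0, metadata !var,
                              metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32))
  call void @llvm.dbg.declare(metadata i32* %agg.sroa.2, metadata !var,
                              metadata !DIExpression(DW_OP_LLVM_fragment, 32, 32))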
4606 
4607 /// Clobber a use with undef, deleting the used value if it becomes dead.
4608 void SROA::clobberUse(Use &U) {
4609  Value *OldV = U;
4610  // Replace the use with an undef value.
4611  U = UndefValue::get(OldV->getType());
4612 
4613  // Check for this making an instruction dead. We have to garbage collect
4614  // all the dead instructions to ensure the uses of any alloca end up being
4615  // minimal.
4616  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
4617  if (isInstructionTriviallyDead(OldI)) {
4618  DeadInsts.push_back(OldI);
4619  }
4620 }
4621 
4622 /// Analyze an alloca for SROA.
4623 ///
4624 /// This analyzes the alloca to ensure we can reason about it, builds
4625 /// the slices of the alloca, and then hands it off to be split and
4626 /// rewritten as needed.
4627 bool SROA::runOnAlloca(AllocaInst &AI) {
4628  LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
4629  ++NumAllocasAnalyzed;
4630 
4631  // Special case dead allocas, as they're trivial.
4632  if (AI.use_empty()) {
4633  AI.eraseFromParent();
4634  return true;
4635  }
4636  const DataLayout &DL = AI.getModule()->getDataLayout();
4637 
4638  // Skip alloca forms that this analysis can't handle.
4639  auto *AT = AI.getAllocatedType();
4640  if (AI.isArrayAllocation() || !AT->isSized() || isa<ScalableVectorType>(AT) ||
4641  DL.getTypeAllocSize(AT).getFixedSize() == 0)
4642  return false;
4643 
4644  bool Changed = false;
4645 
4646  // First, split any FCA loads and stores touching this alloca to promote
4647  // better splitting and promotion opportunities.
4648  AggLoadStoreRewriter AggRewriter(DL);
4649  Changed |= AggRewriter.rewrite(AI);
4650 
4651  // Build the slices using a recursive instruction-visiting builder.
4652  AllocaSlices AS(DL, AI);
4653  LLVM_DEBUG(AS.print(dbgs()));
4654  if (AS.isEscaped())
4655  return Changed;
4656 
4657  // Delete all the dead users of this alloca before splitting and rewriting it.
4658  for (Instruction *DeadUser : AS.getDeadUsers()) {
4659  // Free up everything used by this instruction.
4660  for (Use &DeadOp : DeadUser->operands())
4661  clobberUse(DeadOp);
4662 
4663  // Now replace the uses of this instruction.
4664  DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
4665 
4666  // And mark it for deletion.
4667  DeadInsts.push_back(DeadUser);
4668  Changed = true;
4669  }
4670  for (Use *DeadOp : AS.getDeadOperands()) {
4671  clobberUse(*DeadOp);
4672  Changed = true;
4673  }
4674 
4675  // No slices to split. Leave the dead alloca for a later pass to clean up.
4676  if (AS.begin() == AS.end())
4677  return Changed;
4678 
4679  Changed |= splitAlloca(AI, AS);
4680 
4681  LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
4682  while (!SpeculatablePHIs.empty())
4683  speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
4684 
4685  LLVM_DEBUG(dbgs() << " Speculating Selects\n");
4686  while (!SpeculatableSelects.empty())
4687  speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
4688 
4689  return Changed;
4690 }
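The early bail-out above skips alloca forms the slicing analysis cannot reason about, for instance (illustrative IR only):

  %vla = alloca i32, i32 %n          ; runtime array allocation: skipped
  %sv  = alloca <vscale x 4 x i32>   ; scalable vector type: skipped
  %z   = alloca {}                   ; zero-sized allocation: skipped

Such allocas are left untouched by the slicing machinery; scalable-vector allocas can still be promoted wholesale in runImpl below when isAllocaPromotable agrees.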
4691 
4692 /// Delete the dead instructions accumulated in this run.
4693 ///
4694 /// Recursively deletes the dead instructions we've accumulated. This is done
4695 /// at the very end to maximize locality of the recursive delete and to
4696 /// minimize the problems of invalidated instruction pointers as such pointers
4697 /// are used heavily in the intermediate stages of the algorithm.
4698 ///
4699 /// We also record the alloca instructions deleted here so that they aren't
4700 /// subsequently handed to mem2reg to promote.
4701 bool SROA::deleteDeadInstructions(
4702  SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
4703  bool Changed = false;
4704  while (!DeadInsts.empty()) {
4705  Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
4706  if (!I) continue;
4707  LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
4708 
4709  // If the instruction is an alloca, find the possible dbg.declare connected
4710  // to it, and remove it too. We must do this before calling RAUW or we will
4711  // not be able to find it.
4712  if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
4713  DeletedAllocas.insert(AI);
4714  for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(AI))
4715  OldDII->eraseFromParent();
4716  }
4717 
4718  I->replaceAllUsesWith(UndefValue::get(I->getType()));
4719 
4720  for (Use &Operand : I->operands())
4721  if (Instruction *U = dyn_cast<Instruction>(Operand)) {
4722  // Zero out the operand and see if it becomes trivially dead.
4723  Operand = nullptr;
4724  if (isInstructionTriviallyDead(U))
4725  DeadInsts.push_back(U);
4726  }
4727 
4728  ++NumDeleted;
4729  I->eraseFromParent();
4730  Changed = true;
4731  }
4732  return Changed;
4733 }
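The operand-nulling loop above is what lets a single dead instruction take a whole address-computation chain down with it. A hypothetical cascade (sketch only):

  %p = getelementptr { i32, i32 }, { i32, i32 }* %agg, i32 0, i32 1
  %q = bitcast i32* %p to i8*
  store i8 0, i8* %q      ; queued in DeadInsts by an earlier rewrite

Erasing the store nulls out its use of %q, making the bitcast trivially dead; deleting the bitcast in turn makes the GEP dead, so all three are removed in this one loop rather than being left for a later cleanup pass.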
4734 
4735 /// Promote the allocas, using the best available technique.
4736 ///
4737 /// This attempts to promote whatever allocas have been identified as viable in
4738 /// the PromotableAllocas list. If that list is empty, there is nothing to do.
4739 /// This function returns whether any promotion occurred.
4740 bool SROA::promoteAllocas(Function &F) {
4741  if (PromotableAllocas.empty())
4742  return false;
4743 
4744  NumPromoted += PromotableAllocas.size();
4745 
4746  LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
4747  PromoteMemToReg(PromotableAllocas, *DT, AC);
4748  PromotableAllocas.clear();
4749  return true;
4750 }
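PromoteMemToReg performs standard SSA construction; a minimal sketch of its effect on one promotable alloca (hypothetical IR):

  entry:
    %a = alloca i32
    br i1 %c, label %t, label %f
  t:
    store i32 1, i32* %a
    br label %join
  f:
    store i32 2, i32* %a
    br label %join
  join:
    %v = load i32, i32* %a

After promotion the alloca, the stores, and the load are gone, and %v becomes
  %v = phi i32 [ 1, %t ], [ 2, %f ]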
4751 
4752 PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
4753  AssumptionCache &RunAC) {
4754  LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
4755  C = &F.getContext();
4756  DT = &RunDT;
4757  AC = &RunAC;
4758 
4759  BasicBlock &EntryBB = F.getEntryBlock();
4760  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
4761  I != E; ++I) {
4762  if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
4763  if (isa<ScalableVectorType>(AI->getAllocatedType())) {
4764  if (isAllocaPromotable(AI))
4765  PromotableAllocas.push_back(AI);
4766  } else {
4767  Worklist.insert(AI);
4768  }
4769  }
4770  }
4771 
4772  bool Changed = false;
4773  // A set of deleted alloca instruction pointers which should be removed from
4774  // the list of promotable allocas.
4775  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
4776 
4777  do {
4778  while (!Worklist.empty()) {
4779  Changed |= runOnAlloca(*Worklist.pop_back_val());
4780  Changed |= deleteDeadInstructions(DeletedAllocas);
4781 
4782  // Remove the deleted allocas from various lists so that we don't try to
4783  // continue processing them.
4784  if (!DeletedAllocas.empty()) {
4785  auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
4786  Worklist.remove_if(IsInSet);
4787  PostPromotionWorklist.remove_if(IsInSet);
4788  llvm::erase_if(PromotableAllocas, IsInSet);
4789  DeletedAllocas.clear();
4790  }
4791  }
4792 
4793  Changed |= promoteAllocas(F);
4794 
4795  Worklist = PostPromotionWorklist;
4796  PostPromotionWorklist.clear();
4797  } while (!Worklist.empty());
4798 
4799  if (!Changed)
4800  return PreservedAnalyses::all();
4801 
4802  PreservedAnalyses PA;
4803  PA.preserveSet<CFGAnalyses>();
4804  return PA;
4805 }
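Note that the initial scan above only walks the entry block, so dynamic allocas elsewhere in the function are never seeded; scalable-vector allocas go straight onto the promotable list when isAllocaPromotable accepts them, and everything else goes onto the worklist. A sketch of how the seeding behaves (illustrative only):

  entry:
    %i  = alloca i64                  ; added to the SROA worklist
    %sv = alloca <vscale x 4 x i32>   ; only considered for whole-alloca
                                      ; promotion via isAllocaPromotable
  loop:
    %dyn = alloca i32                 ; not in the entry block: never visited

The do/while loop then alternates between slicing and rewriting the worklist and handing the accumulated PromotableAllocas to mem2reg, re-seeding the worklist from PostPromotionWorklist until it empties.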
4806 
4807 PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
4808  return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
4809  AM.getResult<AssumptionAnalysis>(F));
4810 }
4811 
4812 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
4813 ///
4814 /// This is in the llvm namespace purely to allow it to be a friend of the \c
4815 /// SROA pass.
4816 class SROALegacyPass : public FunctionPass {
4817  /// The SROA implementation.
4818  SROA Impl;
4819 
4820 public:
4821  static char ID;
4822 
4823  SROALegacyPass() : FunctionPass(ID) {
4824  initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
4825  }
4826 
4827  bool runOnFunction(Function &F) override {
4828  if (skipFunction(F))
4829  return false;
4830 
4831  auto PA = Impl.runImpl(
4832  F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
4833  getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
4834  return !PA.areAllPreserved();
4835  }
4836 
4837  void getAnalysisUsage(AnalysisUsage &AU) const override {
4838  AU.addRequired<AssumptionCacheTracker>();
4839  AU.addRequired<DominatorTreeWrapperPass>();
4840  AU.addPreserved<GlobalsAAWrapperPass>();
4841  AU.setPreservesCFG();
4842  }
4843 
4844  StringRef getPassName() const override { return "SROA"; }
4845 };
4846 
4847 char SROALegacyPass::ID = 0;
4848 
4849 FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
4850 
4851 INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
4852  "Scalar Replacement Of Aggregates", false, false)
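Both entry points end up in the same runImpl: the new pass manager reaches it through SROA::run (pipeline name "sroa", e.g. opt -passes=sroa), while the legacy wrapper above reaches it through runOnFunction. A trivial end-to-end sketch of what either invocation does (hypothetical input):

  define i32 @sum(i32 %x, i32 %y) {
  entry:
    %tmp = alloca i32, align 4
    %s = add i32 %x, %y
    store i32 %s, i32* %tmp, align 4
    %r = load i32, i32* %tmp, align 4
    ret i32 %r
  }

After SROA the alloca, store, and load are gone and the function simply returns %s.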