SROA.cpp
1 //===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This transformation implements the well known scalar replacement of
10 /// aggregates transformation. It tries to identify promotable elements of an
11 /// aggregate alloca, and promote them to registers. It will also try to
12 /// convert uses of an element (or set of elements) of an alloca into a vector
13 /// or bitfield-style integer scalar if appropriate.
14 ///
15 /// It works to do this with minimal slicing of the alloca so that regions
16 /// which are merely transferred in and out of external memory remain unchanged
17 /// and are not decomposed to scalar code.
18 ///
19 /// Because this also performs alloca promotion, it can be thought of as also
20 /// serving the purpose of SSA formation. The algorithm iterates on the
21 /// function until all opportunities for promotion have been realized.
22 ///
23 //===----------------------------------------------------------------------===//
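//
// As an illustrative sketch (the IR below is a hypothetical example, not a
// test case from this file): given an aggregate alloca whose fields are only
// accessed through direct loads and stores, SROA slices it per field and
// promotes each piece to SSA values, e.g.
//
//   %pair = type { i32, i32 }
//   define i32 @example(i32 %x, i32 %y) {
//     %p = alloca %pair
//     %a = getelementptr inbounds %pair, %pair* %p, i32 0, i32 0
//     store i32 %x, i32* %a
//     %b = getelementptr inbounds %pair, %pair* %p, i32 0, i32 1
//     store i32 %y, i32* %b
//     %v = load i32, i32* %a
//     ret i32 %v
//   }
//
// after which the alloca, GEPs, loads, and stores are gone and the returned
// value is simply %x.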
24 
26 #include "llvm/ADT/APInt.h"
27 #include "llvm/ADT/ArrayRef.h"
28 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
33 #include "llvm/ADT/SmallPtrSet.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/Statistic.h"
36 #include "llvm/ADT/StringRef.h"
37 #include "llvm/ADT/Twine.h"
38 #include "llvm/ADT/iterator.h"
42 #include "llvm/Analysis/Loads.h"
44 #include "llvm/Config/llvm-config.h"
45 #include "llvm/IR/BasicBlock.h"
46 #include "llvm/IR/Constant.h"
47 #include "llvm/IR/ConstantFolder.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DIBuilder.h"
50 #include "llvm/IR/DataLayout.h"
52 #include "llvm/IR/DerivedTypes.h"
53 #include "llvm/IR/Dominators.h"
54 #include "llvm/IR/Function.h"
56 #include "llvm/IR/GlobalAlias.h"
57 #include "llvm/IR/IRBuilder.h"
58 #include "llvm/IR/InstVisitor.h"
59 #include "llvm/IR/InstrTypes.h"
60 #include "llvm/IR/Instruction.h"
61 #include "llvm/IR/Instructions.h"
62 #include "llvm/IR/IntrinsicInst.h"
63 #include "llvm/IR/Intrinsics.h"
64 #include "llvm/IR/LLVMContext.h"
65 #include "llvm/IR/Metadata.h"
66 #include "llvm/IR/Module.h"
67 #include "llvm/IR/Operator.h"
68 #include "llvm/IR/PassManager.h"
69 #include "llvm/IR/Type.h"
70 #include "llvm/IR/Use.h"
71 #include "llvm/IR/User.h"
72 #include "llvm/IR/Value.h"
73 #include "llvm/InitializePasses.h"
74 #include "llvm/Pass.h"
75 #include "llvm/Support/Casting.h"
77 #include "llvm/Support/Compiler.h"
78 #include "llvm/Support/Debug.h"
82 #include "llvm/Transforms/Scalar.h"
85 #include <algorithm>
86 #include <cassert>
87 #include <chrono>
88 #include <cstddef>
89 #include <cstdint>
90 #include <cstring>
91 #include <iterator>
92 #include <string>
93 #include <tuple>
94 #include <utility>
95 #include <vector>
96 
97 using namespace llvm;
98 using namespace llvm::sroa;
99 
100 #define DEBUG_TYPE "sroa"
101 
102 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
103 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
104 STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
105 STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
106 STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
107 STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
108 STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
109 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
110 STATISTIC(NumDeleted, "Number of instructions deleted");
111 STATISTIC(NumVectorized, "Number of vectorized aggregates");
112 
113 /// Hidden option to experiment with completely strict handling of inbounds
114 /// GEPs.
115 static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
116  cl::Hidden);
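// Being cl::Hidden, this flag does not appear in the default -help listing,
// but it can still be passed to opt; a hypothetical invocation (shown only as
// an illustration) would look like:
//   opt -passes=sroa -sroa-strict-inbounds -S input.ll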
117 
118 namespace {
119 
120 /// A custom IRBuilder inserter which prefixes all names, but only in
121 /// Assert builds.
122 class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
123  std::string Prefix;
124 
125  const Twine getNameWithPrefix(const Twine &Name) const {
126  return Name.isTriviallyEmpty() ? Name : Prefix + Name;
127  }
128 
129 public:
130  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
131 
132  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
133  BasicBlock::iterator InsertPt) const override {
134  IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
135  InsertPt);
136  }
137 };
138 
139 /// Provide a type for IRBuilder that drops names in release builds.
140 using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
141 
142 /// A used slice of an alloca.
143 ///
144 /// This structure represents a slice of an alloca used by some instruction. It
145 /// stores both the begin and end offsets of this use, a pointer to the use
146 /// itself, and a flag indicating whether we can classify the use as splittable
147 /// or not when forming partitions of the alloca.
148 class Slice {
149  /// The beginning offset of the range.
150  uint64_t BeginOffset = 0;
151 
152  /// The ending offset, not included in the range.
153  uint64_t EndOffset = 0;
154 
155  /// Storage for both the use of this slice and whether it can be
156  /// split.
157  PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
158 
159 public:
160  Slice() = default;
161 
162  Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
163  : BeginOffset(BeginOffset), EndOffset(EndOffset),
164  UseAndIsSplittable(U, IsSplittable) {}
165 
166  uint64_t beginOffset() const { return BeginOffset; }
167  uint64_t endOffset() const { return EndOffset; }
168 
169  bool isSplittable() const { return UseAndIsSplittable.getInt(); }
170  void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
171 
172  Use *getUse() const { return UseAndIsSplittable.getPointer(); }
173 
174  bool isDead() const { return getUse() == nullptr; }
175  void kill() { UseAndIsSplittable.setPointer(nullptr); }
176 
177  /// Support for ordering ranges.
178  ///
179  /// This provides an ordering over ranges such that start offsets are
180  /// always increasing, and within equal start offsets, the end offsets are
181  /// decreasing. Thus the spanning range comes first in a cluster with the
182  /// same start position.
183  bool operator<(const Slice &RHS) const {
184  if (beginOffset() < RHS.beginOffset())
185  return true;
186  if (beginOffset() > RHS.beginOffset())
187  return false;
188  if (isSplittable() != RHS.isSplittable())
189  return !isSplittable();
190  if (endOffset() > RHS.endOffset())
191  return true;
192  return false;
193  }
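 // As a small illustration of this ordering (offsets are made up): the slices
 // {[0,16) splittable, [0,4) unsplittable, [4,8) unsplittable} sort as
 //   [0,4) unsplittable, [0,16) splittable, [4,8) unsplittable
 // because equal begin offsets place the unsplittable slice first; among
 // slices with equal begin offset and splittability, the longer (spanning)
 // slice comes first.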
194 
195  /// Support comparison with a single offset to allow binary searches.
196  friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
197  uint64_t RHSOffset) {
198  return LHS.beginOffset() < RHSOffset;
199  }
200  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
201  const Slice &RHS) {
202  return LHSOffset < RHS.beginOffset();
203  }
204 
205  bool operator==(const Slice &RHS) const {
206  return isSplittable() == RHS.isSplittable() &&
207  beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
208  }
209  bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
210 };
211 
212 } // end anonymous namespace
213 
214 /// Representation of the alloca slices.
215 ///
216 /// This class represents the slices of an alloca which are formed by its
217 /// various uses. If a pointer escapes, we can't fully build a representation
218 /// for the slices used and we reflect that in this structure. The uses are
219 /// stored, sorted by increasing beginning offset and with unsplittable slices
220 /// starting at a particular offset before splittable slices.
221 class llvm::sroa::AllocaSlices {
222 public:
223  /// Construct the slices of a particular alloca.
224  AllocaSlices(const DataLayout &DL, AllocaInst &AI);
225 
226  /// Test whether a pointer to the allocation escapes our analysis.
227  ///
228  /// If this is true, the slices are never fully built and should be
229  /// ignored.
230  bool isEscaped() const { return PointerEscapingInstr; }
231 
232  /// Support for iterating over the slices.
233  /// @{
234  using iterator = SmallVectorImpl<Slice>::iterator;
235  using range = iterator_range<iterator>;
236 
237  iterator begin() { return Slices.begin(); }
238  iterator end() { return Slices.end(); }
239 
240  using const_iterator = SmallVectorImpl<Slice>::const_iterator;
241  using const_range = iterator_range<const_iterator>;
242 
243  const_iterator begin() const { return Slices.begin(); }
244  const_iterator end() const { return Slices.end(); }
245  /// @}
246 
247  /// Erase a range of slices.
248  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
249 
250  /// Insert new slices for this alloca.
251  ///
252  /// This moves the slices into the alloca's slices collection, and re-sorts
253  /// everything so that the usual ordering properties of the alloca's slices
254  /// hold.
255  void insert(ArrayRef<Slice> NewSlices) {
256  int OldSize = Slices.size();
257  Slices.append(NewSlices.begin(), NewSlices.end());
258  auto SliceI = Slices.begin() + OldSize;
259  llvm::sort(SliceI, Slices.end());
260  std::inplace_merge(Slices.begin(), SliceI, Slices.end());
261  }
262 
263  // Forward declare the iterator and range accessor for walking the
264  // partitions.
265  class partition_iterator;
266  iterator_range<partition_iterator> partitions();
267 
268  /// Access the dead users for this alloca.
269  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
270 
271  /// Access Uses that should be dropped if the alloca is promotable.
272  ArrayRef<Use *> getDeadUsesIfPromotable() const {
273  return DeadUseIfPromotable;
274  }
275 
276  /// Access the dead operands referring to this alloca.
277  ///
278  /// These are operands which cannot actually be used to refer to the
279  /// alloca as they are outside its range and the user doesn't correct for
280  /// that. These mostly consist of PHI node inputs and the like which we just
281  /// need to replace with undef.
282  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
283 
284 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
285  void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
286  void printSlice(raw_ostream &OS, const_iterator I,
287  StringRef Indent = " ") const;
288  void printUse(raw_ostream &OS, const_iterator I,
289  StringRef Indent = " ") const;
290  void print(raw_ostream &OS) const;
291  void dump(const_iterator I) const;
292  void dump() const;
293 #endif
294 
295 private:
296  template <typename DerivedT, typename RetT = void> class BuilderBase;
297  class SliceBuilder;
298 
299  friend class AllocaSlices::SliceBuilder;
300 
301 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
302  /// Handle to alloca instruction to simplify method interfaces.
303  AllocaInst &AI;
304 #endif
305 
306  /// The instruction responsible for this alloca not having a known set
307  /// of slices.
308  ///
309  /// When an instruction (potentially) escapes the pointer to the alloca, we
310  /// store a pointer to that here and abort trying to form slices of the
311  /// alloca. This will be null if the alloca slices are analyzed successfully.
312  Instruction *PointerEscapingInstr;
313 
314  /// The slices of the alloca.
315  ///
316  /// We store a vector of the slices formed by uses of the alloca here. This
317  /// vector is sorted by increasing begin offset, and then the unsplittable
318  /// slices before the splittable ones. See the Slice inner class for more
319  /// details.
320  SmallVector<Slice, 8> Slices;
321 
322  /// Instructions which will become dead if we rewrite the alloca.
323  ///
324  /// Note that these are not separated by slice. This is because we expect an
325  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
326  /// all these instructions can simply be removed and replaced with undef as
327  /// they come from outside of the allocated space.
328  SmallVector<Instruction *, 8> DeadUsers;
329 
330  /// Uses which will become dead if we can promote the alloca.
331  SmallVector<Use *, 8> DeadUseIfPromotable;
332 
333  /// Operands which will become dead if we rewrite the alloca.
334  ///
335  /// These are operands that in their particular use can be replaced with
336  /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
337  /// to PHI nodes and the like. They aren't entirely dead (there might be
338  /// a GEP back into the bounds using it elsewhere), and neither is the PHI, but we
339  /// want to swap this particular input for undef to simplify the use lists of
340  /// the alloca.
341  SmallVector<Use *, 8> DeadOperands;
342 };
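// A minimal usage sketch of the interface above (process(...) is a
// hypothetical callback, not a function defined in this file):
//
//   AllocaSlices AS(DL, AI);
//   if (AS.isEscaped())
//     return;                          // a pointer to the alloca escaped
//   for (Slice &S : AS)                // sorted by increasing begin offset
//     process(S.beginOffset(), S.endOffset(), S.isSplittable());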
343 
344 /// A partition of the slices.
345 ///
346 /// An ephemeral representation for a range of slices which can be viewed as
347 /// a partition of the alloca. This range represents a span of the alloca's
348 /// memory which cannot be split, and provides access to all of the slices
349 /// overlapping some part of the partition.
350 ///
351 /// Objects of this type are produced by traversing the alloca's slices, but
352 /// are only ephemeral and not persistent.
353 class llvm::sroa::Partition {
354 private:
355  friend class AllocaSlices;
356  friend class AllocaSlices::partition_iterator;
357 
358  using iterator = AllocaSlices::iterator;
359 
360  /// The beginning and ending offsets of the alloca for this
361  /// partition.
362  uint64_t BeginOffset = 0, EndOffset = 0;
363 
364  /// The start and end iterators of this partition.
365  iterator SI, SJ;
366 
367  /// A collection of split slice tails overlapping the partition.
368  SmallVector<Slice *, 4> SplitTails;
369 
370  /// Raw constructor builds an empty partition starting and ending at
371  /// the given iterator.
372  Partition(iterator SI) : SI(SI), SJ(SI) {}
373 
374 public:
375  /// The start offset of this partition.
376  ///
377  /// All of the contained slices start at or after this offset.
378  uint64_t beginOffset() const { return BeginOffset; }
379 
380  /// The end offset of this partition.
381  ///
382  /// All of the contained slices end at or before this offset.
383  uint64_t endOffset() const { return EndOffset; }
384 
385  /// The size of the partition.
386  ///
387  /// Note that this can never be zero.
388  uint64_t size() const {
389  assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
390  return EndOffset - BeginOffset;
391  }
392 
393  /// Test whether this partition contains no slices, and merely spans
394  /// a region occupied by split slices.
395  bool empty() const { return SI == SJ; }
396 
397  /// \name Iterate slices that start within the partition.
398  /// These may be splittable or unsplittable. They have a begin offset >= the
399  /// partition begin offset.
400  /// @{
401  // FIXME: We should probably define a "concat_iterator" helper and use that
402  // to stitch together pointee_iterators over the split tails and the
403  // contiguous iterators of the partition. That would give a much nicer
404  // interface here. We could then additionally expose filtered iterators for
405  // split, unsplit, and unsplittable slices based on the usage patterns.
406  iterator begin() const { return SI; }
407  iterator end() const { return SJ; }
408  /// @}
409 
410  /// Get the sequence of split slice tails.
411  ///
412  /// These tails are of slices which start before this partition but are
413  /// split and overlap into the partition. We accumulate these while forming
414  /// partitions.
415  ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
416 };
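// A worked example with made-up offsets: suppose an alloca's slices are an
// unsplittable i32 load at [0,4), a splittable 16-byte memcpy at [0,16), and
// another unsplittable i32 load at [8,12). Walking the partitions yields
//   [0,4)    the first load plus the head of the memcpy
//   [4,8)    "empty" - covered only by the split tail of the memcpy
//   [8,12)   the second load plus the continuing memcpy tail
//   [12,16)  "empty" - the remaining split tail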
417 
418 /// An iterator over partitions of the alloca's slices.
419 ///
420 /// This iterator implements the core algorithm for partitioning the alloca's
421 /// slices. It is a forward iterator as we don't support backtracking for
422 /// efficiency reasons, and re-use a single storage area to maintain the
423 /// current set of split slices.
424 ///
425 /// It is templated on the slice iterator type to use so that it can operate
426 /// with either const or non-const slice iterators.
427 class AllocaSlices::partition_iterator
428  : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
429  Partition> {
430  friend class AllocaSlices;
431 
432  /// Most of the state for walking the partitions is held in a class
433  /// with a nice interface for examining them.
434  Partition P;
435 
436  /// We need to keep the end of the slices to know when to stop.
437  AllocaSlices::iterator SE;
438 
439  /// We also need to keep track of the maximum split end offset seen.
440  /// FIXME: Do we really?
441  uint64_t MaxSplitSliceEndOffset = 0;
442 
443  /// Sets the partition to be empty at given iterator, and sets the
444  /// end iterator.
445  partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
446  : P(SI), SE(SE) {
447  // If not already at the end, advance our state to form the initial
448  // partition.
449  if (SI != SE)
450  advance();
451  }
452 
453  /// Advance the iterator to the next partition.
454  ///
455  /// Requires that the iterator not be at the end of the slices.
456  void advance() {
457  assert((P.SI != SE || !P.SplitTails.empty()) &&
458  "Cannot advance past the end of the slices!");
459 
460  // Clear out any split uses which have ended.
461  if (!P.SplitTails.empty()) {
462  if (P.EndOffset >= MaxSplitSliceEndOffset) {
463  // If we've finished all splits, this is easy.
464  P.SplitTails.clear();
465  MaxSplitSliceEndOffset = 0;
466  } else {
467  // Remove the uses which have ended in the prior partition. This
468  // cannot change the max split slice end because we just checked that
469  // the prior partition ended prior to that max.
470  llvm::erase_if(P.SplitTails,
471  [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
472  assert(llvm::any_of(P.SplitTails,
473  [&](Slice *S) {
474  return S->endOffset() == MaxSplitSliceEndOffset;
475  }) &&
476  "Could not find the current max split slice offset!");
477  assert(llvm::all_of(P.SplitTails,
478  [&](Slice *S) {
479  return S->endOffset() <= MaxSplitSliceEndOffset;
480  }) &&
481  "Max split slice end offset is not actually the max!");
482  }
483  }
484 
485  // If P.SI is already at the end, then we've cleared the split tail and
486  // now have an end iterator.
487  if (P.SI == SE) {
488  assert(P.SplitTails.empty() && "Failed to clear the split slices!");
489  return;
490  }
491 
492  // If we had a non-empty partition previously, set up the state for
493  // subsequent partitions.
494  if (P.SI != P.SJ) {
495  // Accumulate all the splittable slices which started in the old
496  // partition into the split list.
497  for (Slice &S : P)
498  if (S.isSplittable() && S.endOffset() > P.EndOffset) {
499  P.SplitTails.push_back(&S);
500  MaxSplitSliceEndOffset =
501  std::max(S.endOffset(), MaxSplitSliceEndOffset);
502  }
503 
504  // Start from the end of the previous partition.
505  P.SI = P.SJ;
506 
507  // If P.SI is now at the end, we at most have a tail of split slices.
508  if (P.SI == SE) {
509  P.BeginOffset = P.EndOffset;
510  P.EndOffset = MaxSplitSliceEndOffset;
511  return;
512  }
513 
514  // If we have split slices and the next slice is after a gap and is
515  // not splittable, immediately form an empty partition for the split
516  // slices up until the next slice begins.
517  if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
518  !P.SI->isSplittable()) {
519  P.BeginOffset = P.EndOffset;
520  P.EndOffset = P.SI->beginOffset();
521  return;
522  }
523  }
524 
525  // OK, we need to consume new slices. Set the end offset based on the
526  // current slice, and step SJ past it. The beginning offset of the
527  // partition is the beginning offset of the next slice unless we have
528  // pre-existing split slices that are continuing, in which case we begin
529  // at the prior end offset.
530  P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
531  P.EndOffset = P.SI->endOffset();
532  ++P.SJ;
533 
534  // There are two strategies to form a partition based on whether the
535  // partition starts with an unsplittable slice or a splittable slice.
536  if (!P.SI->isSplittable()) {
537  // When we're forming an unsplittable region, it must always start at
538  // the first slice and will extend through its end.
539  assert(P.BeginOffset == P.SI->beginOffset());
540 
541  // Form a partition including all of the overlapping slices with this
542  // unsplittable slice.
543  while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
544  if (!P.SJ->isSplittable())
545  P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
546  ++P.SJ;
547  }
548 
549  // We have a partition across a set of overlapping unsplittable
550  // slices.
551  return;
552  }
553 
554  // If we're starting with a splittable slice, then we need to form
555  // a synthetic partition spanning it and any other overlapping splittable
556  // slices.
557  assert(P.SI->isSplittable() && "Forming a splittable partition!");
558 
559  // Collect all of the overlapping splittable slices.
560  while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
561  P.SJ->isSplittable()) {
562  P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
563  ++P.SJ;
564  }
565 
566  // Back up P.EndOffset if we ended the span early when encountering an
567  // unsplittable slice. This synthesizes the early end offset of
568  // a partition spanning only splittable slices.
569  if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
570  assert(!P.SJ->isSplittable());
571  P.EndOffset = P.SJ->beginOffset();
572  }
573  }
574 
575 public:
576  bool operator==(const partition_iterator &RHS) const {
577  assert(SE == RHS.SE &&
578  "End iterators don't match between compared partition iterators!");
579 
580  // The observed positions of partitions are marked by the P.SI iterator and
581  // the emptiness of the split slices. The latter is only relevant when
582  // P.SI == SE, as the end iterator will additionally have an empty split
583  // slices list, but the prior may have the same P.SI and a tail of split
584  // slices.
585  if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
586  assert(P.SJ == RHS.P.SJ &&
587  "Same set of slices formed two different sized partitions!");
588  assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
589  "Same slice position with differently sized non-empty split "
590  "slice tails!");
591  return true;
592  }
593  return false;
594  }
595 
596  partition_iterator &operator++() {
597  advance();
598  return *this;
599  }
600 
601  Partition &operator*() { return P; }
602 };
603 
604 /// A forward range over the partitions of the alloca's slices.
605 ///
606 /// This accesses an iterator range over the partitions of the alloca's
607 /// slices. It computes these partitions on the fly based on the overlapping
608 /// offsets of the slices and the ability to split them. It will visit "empty"
609 /// partitions to cover regions of the alloca only accessed via split
610 /// slices.
611 iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
612  return make_range(partition_iterator(begin(), end()),
613  partition_iterator(end(), end()));
614 }
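// Typical traversal of this range, shown only as a sketch (rewritePartition
// here is a hypothetical helper, not the rewriter used later in this file):
//
//   for (Partition &P : AS.partitions())
//     Changed |= rewritePartition(AI, AS, P);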
615 
616 static Value *foldSelectInst(SelectInst &SI) {
617  // If the condition being selected on is a constant or the same value is
618  // being selected between, fold the select. Yes this does (rarely) happen
619  // early on.
620  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
621  return SI.getOperand(1 + CI->isZero());
622  if (SI.getOperand(1) == SI.getOperand(2))
623  return SI.getOperand(1);
624 
625  return nullptr;
626 }
627 
628 /// A helper that folds a PHI node or a select.
629 static Value *foldPHINodeOrSelectInst(Instruction &I) {
630  if (PHINode *PN = dyn_cast<PHINode>(&I)) {
631  // If PN merges together the same value, return that value.
632  return PN->hasConstantValue();
633  }
634  return foldSelectInst(cast<SelectInst>(I));
635 }
636 
637 /// Builder for the alloca slices.
638 ///
639 /// This class builds a set of alloca slices by recursively visiting the uses
640 /// of an alloca and making a slice for each load and store at each offset.
641 class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
642  friend class PtrUseVisitor<SliceBuilder>;
643  friend class InstVisitor<SliceBuilder>;
644 
645  using Base = PtrUseVisitor<SliceBuilder>;
646 
647  const uint64_t AllocSize;
648  AllocaSlices &AS;
649 
650  SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
651  SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
652 
653  /// Set to de-duplicate dead instructions found in the use walk.
654  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
655 
656 public:
657  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
658  : PtrUseVisitor<SliceBuilder>(DL),
659  AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()),
660  AS(AS) {}
661 
662 private:
663  void markAsDead(Instruction &I) {
664  if (VisitedDeadInsts.insert(&I).second)
665  AS.DeadUsers.push_back(&I);
666  }
667 
668  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
669  bool IsSplittable = false) {
670  // Completely skip uses which have a zero size or start either before or
671  // past the end of the allocation.
672  if (Size == 0 || Offset.uge(AllocSize)) {
673  LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
674  << Offset
675  << " which has zero size or starts outside of the "
676  << AllocSize << " byte alloca:\n"
677  << " alloca: " << AS.AI << "\n"
678  << " use: " << I << "\n");
679  return markAsDead(I);
680  }
681 
682  uint64_t BeginOffset = Offset.getZExtValue();
683  uint64_t EndOffset = BeginOffset + Size;
684 
685  // Clamp the end offset to the end of the allocation. Note that this is
686  // formulated to handle even the case where "BeginOffset + Size" overflows.
687  // This may appear superficially to be something we could ignore entirely,
688  // but that is not so! There may be widened loads or PHI-node uses where
689  // some instructions are dead but not others. We can't completely ignore
690  // them, and so have to record at least the information here.
691  assert(AllocSize >= BeginOffset); // Established above.
692  if (Size > AllocSize - BeginOffset) {
693  LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
694  << Offset << " to remain within the " << AllocSize
695  << " byte alloca:\n"
696  << " alloca: " << AS.AI << "\n"
697  << " use: " << I << "\n");
698  EndOffset = AllocSize;
699  }
700 
701  AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
702  }
703 
704  void visitBitCastInst(BitCastInst &BC) {
705  if (BC.use_empty())
706  return markAsDead(BC);
707 
708  return Base::visitBitCastInst(BC);
709  }
710 
711  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
712  if (ASC.use_empty())
713  return markAsDead(ASC);
714 
715  return Base::visitAddrSpaceCastInst(ASC);
716  }
717 
718  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
719  if (GEPI.use_empty())
720  return markAsDead(GEPI);
721 
722  if (SROAStrictInbounds && GEPI.isInBounds()) {
723  // FIXME: This is a manually un-factored variant of the basic code inside
724  // of GEPs with checking of the inbounds invariant specified in the
725  // langref in a very strict sense. If we ever want to enable
726  // SROAStrictInbounds, this code should be factored cleanly into
727  // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
728  // by writing out the code here where we have the underlying allocation
729  // size readily available.
730  APInt GEPOffset = Offset;
731  const DataLayout &DL = GEPI.getModule()->getDataLayout();
732  for (gep_type_iterator GTI = gep_type_begin(GEPI),
733  GTE = gep_type_end(GEPI);
734  GTI != GTE; ++GTI) {
735  ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
736  if (!OpC)
737  break;
738 
739  // Handle a struct index, which adds its field offset to the pointer.
740  if (StructType *STy = GTI.getStructTypeOrNull()) {
741  unsigned ElementIdx = OpC->getZExtValue();
742  const StructLayout *SL = DL.getStructLayout(STy);
743  GEPOffset +=
744  APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
745  } else {
746  // For array or vector indices, scale the index by the size of the
747  // type.
748  APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
749  GEPOffset +=
750  Index *
751  APInt(Offset.getBitWidth(),
752  DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
753  }
754 
755  // If this index has computed an intermediate pointer which is not
756  // inbounds, then the result of the GEP is a poison value and we can
757  // delete it and all uses.
758  if (GEPOffset.ugt(AllocSize))
759  return markAsDead(GEPI);
760  }
761  }
762 
763  return Base::visitGetElementPtrInst(GEPI);
764  }
765 
766  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
767  uint64_t Size, bool IsVolatile) {
768  // We allow splitting of non-volatile loads and stores where the type is an
769  // integer type. These may be used to implement 'memcpy' or other "transfer
770  // of bits" patterns.
771  bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
772 
773  insertUse(I, Offset, Size, IsSplittable);
774  }
775 
776  void visitLoadInst(LoadInst &LI) {
777  assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
778  "All simple FCA loads should have been pre-split");
779 
780  if (!IsOffsetKnown)
781  return PI.setAborted(&LI);
782 
783  if (LI.isVolatile() &&
784  LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
785  return PI.setAborted(&LI);
786 
787  if (isa<ScalableVectorType>(LI.getType()))
788  return PI.setAborted(&LI);
789 
790  uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize();
791  return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
792  }
793 
794  void visitStoreInst(StoreInst &SI) {
795  Value *ValOp = SI.getValueOperand();
796  if (ValOp == *U)
797  return PI.setEscapedAndAborted(&SI);
798  if (!IsOffsetKnown)
799  return PI.setAborted(&SI);
800 
801  if (SI.isVolatile() &&
802  SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
803  return PI.setAborted(&SI);
804 
805  if (isa<ScalableVectorType>(ValOp->getType()))
806  return PI.setAborted(&SI);
807 
808  uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize();
809 
810  // If this memory access can be shown to *statically* extend outside the
811  // bounds of the allocation, its behavior is undefined, so simply
812  // ignore it. Note that this is more strict than the generic clamping
813  // behavior of insertUse. We also try to handle cases which might run the
814  // risk of overflow.
815  // FIXME: We should instead consider the pointer to have escaped if this
816  // function is being instrumented for addressing bugs or race conditions.
817  if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
818  LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
819  << Offset << " which extends past the end of the "
820  << AllocSize << " byte alloca:\n"
821  << " alloca: " << AS.AI << "\n"
822  << " use: " << SI << "\n");
823  return markAsDead(SI);
824  }
825 
826  assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
827  "All simple FCA stores should have been pre-split");
828  handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
829  }
830 
831  void visitMemSetInst(MemSetInst &II) {
832  assert(II.getRawDest() == *U && "Pointer use is not the destination?");
833  ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
834  if ((Length && Length->getValue() == 0) ||
835  (IsOffsetKnown && Offset.uge(AllocSize)))
836  // Zero-length mem transfer intrinsics can be ignored entirely.
837  return markAsDead(II);
838 
839  if (!IsOffsetKnown)
840  return PI.setAborted(&II);
841 
842  // Don't replace this with a store with a different address space. TODO:
843  // Use a store with the casted new alloca?
844  if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace())
845  return PI.setAborted(&II);
846 
847  insertUse(II, Offset, Length ? Length->getLimitedValue()
848  : AllocSize - Offset.getLimitedValue(),
849  (bool)Length);
850  }
851 
852  void visitMemTransferInst(MemTransferInst &II) {
853  ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
854  if (Length && Length->getValue() == 0)
855  // Zero-length mem transfer intrinsics can be ignored entirely.
856  return markAsDead(II);
857 
858  // Because we can visit these intrinsics twice, also check to see if the
859  // first time marked this instruction as dead. If so, skip it.
860  if (VisitedDeadInsts.count(&II))
861  return;
862 
863  if (!IsOffsetKnown)
864  return PI.setAborted(&II);
865 
866  // Don't replace this with a load/store with a different address space.
867  // TODO: Use a store with the casted new alloca?
868  if (II.isVolatile() &&
869  (II.getDestAddressSpace() != DL.getAllocaAddrSpace() ||
870  II.getSourceAddressSpace() != DL.getAllocaAddrSpace()))
871  return PI.setAborted(&II);
872 
873  // This side of the transfer is completely out-of-bounds, and so we can
874  // nuke the entire transfer. However, we also need to nuke the other side
875  // if already added to our partitions.
876  // FIXME: Yet another place we really should bypass this when
877  // instrumenting for ASan.
878  if (Offset.uge(AllocSize)) {
879  SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
880  MemTransferSliceMap.find(&II);
881  if (MTPI != MemTransferSliceMap.end())
882  AS.Slices[MTPI->second].kill();
883  return markAsDead(II);
884  }
885 
886  uint64_t RawOffset = Offset.getLimitedValue();
887  uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
888 
889  // Check for the special case where the same exact value is used for both
890  // source and dest.
891  if (*U == II.getRawDest() && *U == II.getRawSource()) {
892  // For non-volatile transfers this is a no-op.
893  if (!II.isVolatile())
894  return markAsDead(II);
895 
896  return insertUse(II, Offset, Size, /*IsSplittable=*/false);
897  }
898 
899  // If we have seen both source and destination for a mem transfer, then
900  // they both point to the same alloca.
901  bool Inserted;
902  SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
903  std::tie(MTPI, Inserted) =
904  MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
905  unsigned PrevIdx = MTPI->second;
906  if (!Inserted) {
907  Slice &PrevP = AS.Slices[PrevIdx];
908 
909  // Check if the begin offsets match and this is a non-volatile transfer.
910  // In that case, we can completely elide the transfer.
911  if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
912  PrevP.kill();
913  return markAsDead(II);
914  }
915 
916  // Otherwise we have an offset transfer within the same alloca. We can't
917  // split those.
918  PrevP.makeUnsplittable();
919  }
920 
921  // Insert the use now that we've fixed up the splittable nature.
922  insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
923 
924  // Check that we ended up with a valid index in the map.
925  assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
926  "Map index doesn't point back to a slice with this user.");
927  }
928 
929  // Disable SRoA for any intrinsics except for lifetime invariants and
930  // invariant group.
931  // FIXME: What about debug intrinsics? This matches old behavior, but
932  // doesn't make sense.
933  void visitIntrinsicInst(IntrinsicInst &II) {
934  if (II.isDroppable()) {
935  AS.DeadUseIfPromotable.push_back(U);
936  return;
937  }
938 
939  if (!IsOffsetKnown)
940  return PI.setAborted(&II);
941 
942  if (II.isLifetimeStartOrEnd()) {
943  ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
944  uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
945  Length->getLimitedValue());
946  insertUse(II, Offset, Size, true);
947  return;
948  }
949 
950  if (II.isLaunderOrStripInvariantGroup()) {
951  enqueueUsers(II);
952  return;
953  }
954 
955  Base::visitIntrinsicInst(II);
956  }
957 
958  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
959  // We consider any PHI or select that results in a direct load or store of
960  // the same offset to be a viable use for slicing purposes. These uses
961  // are considered unsplittable and the size is the maximum loaded or stored
962  // size.
963  SmallPtrSet<Instruction *, 4> Visited;
964  SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
965  Visited.insert(Root);
966  Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
967  const DataLayout &DL = Root->getModule()->getDataLayout();
968  // If there are no loads or stores, the access is dead. We mark that as
969  // a size zero access.
970  Size = 0;
971  do {
972  Instruction *I, *UsedI;
973  std::tie(UsedI, I) = Uses.pop_back_val();
974 
975  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
976  Size = std::max(Size,
977  DL.getTypeStoreSize(LI->getType()).getFixedSize());
978  continue;
979  }
980  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
981  Value *Op = SI->getOperand(0);
982  if (Op == UsedI)
983  return SI;
984  Size = std::max(Size,
985  DL.getTypeStoreSize(Op->getType()).getFixedSize());
986  continue;
987  }
988 
989  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
990  if (!GEP->hasAllZeroIndices())
991  return GEP;
992  } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
993  !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
994  return I;
995  }
996 
997  for (User *U : I->users())
998  if (Visited.insert(cast<Instruction>(U)).second)
999  Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1000  } while (!Uses.empty());
1001 
1002  return nullptr;
1003  }
1004 
1005  void visitPHINodeOrSelectInst(Instruction &I) {
1006  assert(isa<PHINode>(I) || isa<SelectInst>(I));
1007  if (I.use_empty())
1008  return markAsDead(I);
1009 
1010  // TODO: We could use SimplifyInstruction here to fold PHINodes and
1011  // SelectInsts. However, doing so requires changing the current
1012  // dead-operand-tracking mechanism. For instance, suppose neither loading
1013  // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1014  // trap either. However, if we simply replace %U with undef using the
1015  // current dead-operand-tracking mechanism, "load (select undef, undef,
1016  // %other)" may trap because the select may return the first operand
1017  // "undef".
1018  if (Value *Result = foldPHINodeOrSelectInst(I)) {
1019  if (Result == *U)
1020  // If the result of the constant fold will be the pointer, recurse
1021  // through the PHI/select as if we had RAUW'ed it.
1022  enqueueUsers(I);
1023  else
1024  // Otherwise the operand to the PHI/select is dead, and we can replace
1025  // it with undef.
1026  AS.DeadOperands.push_back(U);
1027 
1028  return;
1029  }
1030 
1031  if (!IsOffsetKnown)
1032  return PI.setAborted(&I);
1033 
1034  // See if we already have computed info on this node.
1035  uint64_t &Size = PHIOrSelectSizes[&I];
1036  if (!Size) {
1037  // This is a new PHI/Select, check for an unsafe use of it.
1038  if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1039  return PI.setAborted(UnsafeI);
1040  }
1041 
1042  // For PHI and select operands outside the alloca, we can't nuke the entire
1043  // phi or select -- the other side might still be relevant, so we special
1044  // case them here and use a separate structure to track the operands
1045  // themselves which should be replaced with undef.
1046  // FIXME: This should instead be escaped in the event we're instrumenting
1047  // for address sanitization.
1048  if (Offset.uge(AllocSize)) {
1049  AS.DeadOperands.push_back(U);
1050  return;
1051  }
1052 
1053  insertUse(I, Offset, Size);
1054  }
1055 
1056  void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1057 
1058  void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1059 
1060  /// Disable SROA entirely if there are unhandled users of the alloca.
1061  void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1062 };
1063 
1064 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1065  :
1066 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1067  AI(AI),
1068 #endif
1069  PointerEscapingInstr(nullptr) {
1070  SliceBuilder PB(DL, AI, *this);
1071  SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1072  if (PtrI.isEscaped() || PtrI.isAborted()) {
1073  // FIXME: We should sink the escape vs. abort info into the caller nicely,
1074  // possibly by just storing the PtrInfo in the AllocaSlices.
1075  PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1076  : PtrI.getAbortingInst();
1077  assert(PointerEscapingInstr && "Did not track a bad instruction");
1078  return;
1079  }
1080 
1081  llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1082 
1083  // Sort the uses. This arranges for the offsets to be in ascending order,
1084  // and the sizes to be in descending order.
1085  llvm::stable_sort(Slices);
1086 }
1087 
1088 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1089 
1090 void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1091  StringRef Indent) const {
1092  printSlice(OS, I, Indent);
1093  OS << "\n";
1094  printUse(OS, I, Indent);
1095 }
1096 
1097 void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1098  StringRef Indent) const {
1099  OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1100  << " slice #" << (I - begin())
1101  << (I->isSplittable() ? " (splittable)" : "");
1102 }
1103 
1104 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1105  StringRef Indent) const {
1106  OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1107 }
1108 
1109 void AllocaSlices::print(raw_ostream &OS) const {
1110  if (PointerEscapingInstr) {
1111  OS << "Can't analyze slices for alloca: " << AI << "\n"
1112  << " A pointer to this alloca escaped by:\n"
1113  << " " << *PointerEscapingInstr << "\n";
1114  return;
1115  }
1116 
1117  OS << "Slices of alloca: " << AI << "\n";
1118  for (const_iterator I = begin(), E = end(); I != E; ++I)
1119  print(OS, I);
1120 }
1121 
1122 LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1123  print(dbgs(), I);
1124 }
1125 LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1126 
1127 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1128 
1129 /// Walk the range of a partitioning looking for a common type to cover this
1130 /// sequence of slices.
1131 static std::pair<Type *, IntegerType *>
1132 findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1133  uint64_t EndOffset) {
1134  Type *Ty = nullptr;
1135  bool TyIsCommon = true;
1136  IntegerType *ITy = nullptr;
1137 
1138  // Note that we need to look at *every* alloca slice's Use to ensure we
1139  // always get consistent results regardless of the order of slices.
1140  for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1141  Use *U = I->getUse();
1142  if (isa<IntrinsicInst>(*U->getUser()))
1143  continue;
1144  if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1145  continue;
1146 
1147  Type *UserTy = nullptr;
1148  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1149  UserTy = LI->getType();
1150  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1151  UserTy = SI->getValueOperand()->getType();
1152  }
1153 
1154  if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1155  // If the type is larger than the partition, skip it. We only encounter
1156  // this for split integer operations where we want to use the type of the
1157  // entity causing the split. Also skip if the type is not a byte width
1158  // multiple.
1159  if (UserITy->getBitWidth() % 8 != 0 ||
1160  UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1161  continue;
1162 
1163  // Track the largest bitwidth integer type used in this way in case there
1164  // is no common type.
1165  if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1166  ITy = UserITy;
1167  }
1168 
1169  // To avoid depending on the order of slices, Ty and TyIsCommon must not
1170  // depend on types skipped above.
1171  if (!UserTy || (Ty && Ty != UserTy))
1172  TyIsCommon = false; // Give up on anything but an iN type.
1173  else
1174  Ty = UserTy;
1175  }
1176 
1177  return {TyIsCommon ? Ty : nullptr, ITy};
1178 }
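// For example (illustrative, not from a test): if every slice spanning the
// whole partition is loaded and stored as i32, the result is {i32, i32}. If
// the same bytes are accessed both as float and as i32, no common type exists
// and the result is {nullptr, i32}, leaving the caller to fall back on the
// widest byte-width integer seen (the "bitfield-style" scalar mentioned in
// the file comment).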
1179 
1180 /// PHI instructions that use an alloca and are subsequently loaded can be
1181 /// rewritten to load both input pointers in the pred blocks and then PHI the
1182 /// results, allowing the load of the alloca to be promoted.
1183 /// From this:
1184 /// %P2 = phi [i32* %Alloca, i32* %Other]
1185 /// %V = load i32* %P2
1186 /// to:
1187 /// %V1 = load i32* %Alloca -> will be mem2reg'd
1188 /// ...
1189 /// %V2 = load i32* %Other
1190 /// ...
1191 /// %V = phi [i32 %V1, i32 %V2]
1192 ///
1193 /// We can do this to a select if its only uses are loads and if the operands
1194 /// to the select can be loaded unconditionally.
1195 ///
1196 /// FIXME: This should be hoisted into a generic utility, likely in
1197 /// Transforms/Util/Local.h
1198 static bool isSafePHIToSpeculate(PHINode &PN) {
1199  const DataLayout &DL = PN.getModule()->getDataLayout();
1200 
1201  // For now, we can only do this promotion if the load is in the same block
1202  // as the PHI, and if there are no stores between the phi and load.
1203  // TODO: Allow recursive phi users.
1204  // TODO: Allow stores.
1205  BasicBlock *BB = PN.getParent();
1206  Align MaxAlign;
1207  uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1208  APInt MaxSize(APWidth, 0);
1209  bool HaveLoad = false;
1210  for (User *U : PN.users()) {
1211  LoadInst *LI = dyn_cast<LoadInst>(U);
1212  if (!LI || !LI->isSimple())
1213  return false;
1214 
1215  // For now we only allow loads in the same block as the PHI. This is
1216  // a common case that happens when instcombine merges two loads through
1217  // a PHI.
1218  if (LI->getParent() != BB)
1219  return false;
1220 
1221  // Ensure that there are no instructions between the PHI and the load that
1222  // could store.
1223  for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1224  if (BBI->mayWriteToMemory())
1225  return false;
1226 
1227  uint64_t Size = DL.getTypeStoreSize(LI->getType()).getFixedSize();
1228  MaxAlign = std::max(MaxAlign, LI->getAlign());
1229  MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize;
1230  HaveLoad = true;
1231  }
1232 
1233  if (!HaveLoad)
1234  return false;
1235 
1236  // We can only transform this if it is safe to push the loads into the
1237  // predecessor blocks. The only thing to watch out for is that we can't put
1238  // a possibly trapping load in the predecessor if it is a critical edge.
1239  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1240  Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1241  Value *InVal = PN.getIncomingValue(Idx);
1242 
1243  // If the value is produced by the terminator of the predecessor (an
1244  // invoke) or it has side-effects, there is no valid place to put a load
1245  // in the predecessor.
1246  if (TI == InVal || TI->mayHaveSideEffects())
1247  return false;
1248 
1249  // If the predecessor has a single successor, then the edge isn't
1250  // critical.
1251  if (TI->getNumSuccessors() == 1)
1252  continue;
1253 
1254  // If this pointer is always safe to load, or if we can prove that there
1255  // is already a load in the block, then we can move the load to the pred
1256  // block.
1257  if (isSafeToLoadUnconditionally(InVal, MaxAlign, MaxSize, DL, TI))
1258  continue;
1259 
1260  return false;
1261  }
1262 
1263  return true;
1264 }
1265 
1266 static void speculatePHINodeLoads(PHINode &PN) {
1267  LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1268 
1269  LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1270  Type *LoadTy = SomeLoad->getType();
1271  IRBuilderTy PHIBuilder(&PN);
1272  PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1273  PN.getName() + ".sroa.speculated");
1274 
1275  // Get the AA tags and alignment to use from one of the loads. It does not
1276  // matter which one we get, even if they differ.
1277  AAMDNodes AATags;
1278  SomeLoad->getAAMetadata(AATags);
1279  Align Alignment = SomeLoad->getAlign();
1280 
1281  // Rewrite all loads of the PN to use the new PHI.
1282  while (!PN.use_empty()) {
1283  LoadInst *LI = cast<LoadInst>(PN.user_back());
1284  LI->replaceAllUsesWith(NewPN);
1285  LI->eraseFromParent();
1286  }
1287 
1288  // Inject loads into all of the pred blocks.
1289  DenseMap<BasicBlock*, Value*> InjectedLoads;
1290  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1291  BasicBlock *Pred = PN.getIncomingBlock(Idx);
1292  Value *InVal = PN.getIncomingValue(Idx);
1293 
1294  // A PHI node is allowed to have multiple (duplicated) entries for the same
1295  // basic block, as long as the value is the same. So if we already injected
1296  // a load in the predecessor, then we should reuse the same load for all
1297  // duplicated entries.
1298  if (Value* V = InjectedLoads.lookup(Pred)) {
1299  NewPN->addIncoming(V, Pred);
1300  continue;
1301  }
1302 
1303  Instruction *TI = Pred->getTerminator();
1304  IRBuilderTy PredBuilder(TI);
1305 
1306  LoadInst *Load = PredBuilder.CreateAlignedLoad(
1307  LoadTy, InVal, Alignment,
1308  (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1309  ++NumLoadsSpeculated;
1310  if (AATags)
1311  Load->setAAMetadata(AATags);
1312  NewPN->addIncoming(Load, Pred);
1313  InjectedLoads[Pred] = Load;
1314  }
1315 
1316  LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1317  PN.eraseFromParent();
1318 }
1319 
1320 /// Select instructions that use an alloca and are subsequently loaded can be
1321 /// rewritten to load both input pointers and then select between the result,
1322 /// allowing the load of the alloca to be promoted.
1323 /// From this:
1324 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
1325 /// %V = load i32* %P2
1326 /// to:
1327 /// %V1 = load i32* %Alloca -> will be mem2reg'd
1328 /// %V2 = load i32* %Other
1329 /// %V = select i1 %cond, i32 %V1, i32 %V2
1330 ///
1331 /// We can do this to a select if its only uses are loads and if the operand
1332 /// to the select can be loaded unconditionally.
1333 static bool isSafeSelectToSpeculate(SelectInst &SI) {
1334  Value *TValue = SI.getTrueValue();
1335  Value *FValue = SI.getFalseValue();
1336  const DataLayout &DL = SI.getModule()->getDataLayout();
1337 
1338  for (User *U : SI.users()) {
1339  LoadInst *LI = dyn_cast<LoadInst>(U);
1340  if (!LI || !LI->isSimple())
1341  return false;
1342 
1343  // Both operands to the select need to be dereferenceable, either
1344  // absolutely (e.g. allocas) or at this point because we can see other
1345  // accesses to it.
1346  if (!isSafeToLoadUnconditionally(TValue, LI->getType(),
1347  LI->getAlign(), DL, LI))
1348  return false;
1349  if (!isSafeToLoadUnconditionally(FValue, LI->getType(),
1350  LI->getAlign(), DL, LI))
1351  return false;
1352  }
1353 
1354  return true;
1355 }
1356 
1357 static void speculateSelectInstLoads(SelectInst &SI) {
1358  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
1359 
1360  IRBuilderTy IRB(&SI);
1361  Value *TV = SI.getTrueValue();
1362  Value *FV = SI.getFalseValue();
1363  // Replace the loads of the select with a select of two loads.
1364  while (!SI.use_empty()) {
1365  LoadInst *LI = cast<LoadInst>(SI.user_back());
1366  assert(LI->isSimple() && "We only speculate simple loads");
1367 
1368  IRB.SetInsertPoint(LI);
1369  LoadInst *TL = IRB.CreateLoad(LI->getType(), TV,
1370  LI->getName() + ".sroa.speculate.load.true");
1371  LoadInst *FL = IRB.CreateLoad(LI->getType(), FV,
1372  LI->getName() + ".sroa.speculate.load.false");
1373  NumLoadsSpeculated += 2;
1374 
1375  // Transfer alignment and AA info if present.
1376  TL->setAlignment(LI->getAlign());
1377  FL->setAlignment(LI->getAlign());
1378 
1379  AAMDNodes Tags;
1380  LI->getAAMetadata(Tags);
1381  if (Tags) {
1382  TL->setAAMetadata(Tags);
1383  FL->setAAMetadata(Tags);
1384  }
1385 
1386  Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1387  LI->getName() + ".sroa.speculated");
1388 
1389  LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1390  LI->replaceAllUsesWith(V);
1391  LI->eraseFromParent();
1392  }
1393  SI.eraseFromParent();
1394 }
1395 
1396 /// Build a GEP out of a base pointer and indices.
1397 ///
1398 /// This will return the BasePtr if that is valid, or build a new GEP
1399 /// instruction using the IRBuilder if GEP-ing is needed.
1400 static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
1401  SmallVectorImpl<Value *> &Indices,
1402  const Twine &NamePrefix) {
1403  if (Indices.empty())
1404  return BasePtr;
1405 
1406  // A single zero index is a no-op, so check for this and avoid building a GEP
1407  // in that case.
1408  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
1409  return BasePtr;
1410 
1411  return IRB.CreateInBoundsGEP(BasePtr->getType()->getPointerElementType(),
1412  BasePtr, Indices, NamePrefix + "sroa_idx");
1413 }
1414 
1415 /// Get a natural GEP off of the BasePtr walking through Ty toward
1416 /// TargetTy without changing the offset of the pointer.
1417 ///
1418 /// This routine assumes we've already established a properly offset GEP with
1419 /// Indices, and arrived at the Ty type. The goal is to continue to GEP with
1420 /// zero-indices down through type layers until we find one the same as
1421 /// TargetTy. If we can't find one with the same type, we at least try to use
1422 /// one with the same size. If none of that works, we just produce the GEP as
1423 /// indicated by Indices to have the correct offset.
1424 static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
1425  Value *BasePtr, Type *Ty, Type *TargetTy,
1426  SmallVectorImpl<Value *> &Indices,
1427  const Twine &NamePrefix) {
1428  if (Ty == TargetTy)
1429  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
1430 
1431  // Offset size to use for the indices.
1432  unsigned OffsetSize = DL.getIndexTypeSizeInBits(BasePtr->getType());
1433 
1434  // See if we can descend into a struct and locate a field with the correct
1435  // type.
1436  unsigned NumLayers = 0;
1437  Type *ElementTy = Ty;
1438  do {
1439  if (ElementTy->isPointerTy())
1440  break;
1441 
1442  if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
1443  ElementTy = ArrayTy->getElementType();
1444  Indices.push_back(IRB.getIntN(OffsetSize, 0));
1445  } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
1446  ElementTy = VectorTy->getElementType();
1447  Indices.push_back(IRB.getInt32(0));
1448  } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
1449  if (STy->element_begin() == STy->element_end())
1450  break; // Nothing left to descend into.
1451  ElementTy = *STy->element_begin();
1452  Indices.push_back(IRB.getInt32(0));
1453  } else {
1454  break;
1455  }
1456  ++NumLayers;
1457  } while (ElementTy != TargetTy);
1458  if (ElementTy != TargetTy)
1459  Indices.erase(Indices.end() - NumLayers, Indices.end());
1460 
1461  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
1462 }
1463 
1464 /// Recursively compute indices for a natural GEP.
1465 ///
1466 /// This is the recursive step for getNaturalGEPWithOffset that walks down the
1467 /// element types adding appropriate indices for the GEP.
1468 static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
1469  Value *Ptr, Type *Ty, APInt &Offset,
1470  Type *TargetTy,
1471  SmallVectorImpl<Value *> &Indices,
1472  const Twine &NamePrefix) {
1473  if (Offset == 0)
1474  return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
1475  NamePrefix);
1476 
1477  // We can't recurse through pointer types.
1478  if (Ty->isPointerTy())
1479  return nullptr;
1480 
1481  // We try to analyze GEPs over vectors here, but note that these GEPs are
1482  // extremely poorly defined currently. The long-term goal is to remove GEPing
1483  // over a vector from the IR completely.
1484  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
1485  unsigned ElementSizeInBits =
1486  DL.getTypeSizeInBits(VecTy->getScalarType()).getFixedSize();
1487  if (ElementSizeInBits % 8 != 0) {
1488  // GEPs over non-multiple of 8 size vector elements are invalid.
1489  return nullptr;
1490  }
1491  APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
1492  APInt NumSkippedElements = Offset.sdiv(ElementSize);
1493  if (NumSkippedElements.ugt(cast<FixedVectorType>(VecTy)->getNumElements()))
1494  return nullptr;
1495  Offset -= NumSkippedElements * ElementSize;
1496  Indices.push_back(IRB.getInt(NumSkippedElements));
1497  return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
1498  Offset, TargetTy, Indices, NamePrefix);
1499  }
1500 
1501  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
1502  Type *ElementTy = ArrTy->getElementType();
1503  APInt ElementSize(Offset.getBitWidth(),
1504  DL.getTypeAllocSize(ElementTy).getFixedSize());
1505  APInt NumSkippedElements = Offset.sdiv(ElementSize);
1506  if (NumSkippedElements.ugt(ArrTy->getNumElements()))
1507  return nullptr;
1508 
1509  Offset -= NumSkippedElements * ElementSize;
1510  Indices.push_back(IRB.getInt(NumSkippedElements));
1511  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
1512  Indices, NamePrefix);
1513  }
1514 
1515  StructType *STy = dyn_cast<StructType>(Ty);
1516  if (!STy)
1517  return nullptr;
1518 
1519  const StructLayout *SL = DL.getStructLayout(STy);
1520  uint64_t StructOffset = Offset.getZExtValue();
1521  if (StructOffset >= SL->getSizeInBytes())
1522  return nullptr;
1523  unsigned Index = SL->getElementContainingOffset(StructOffset);
1524  Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
1525  Type *ElementTy = STy->getElementType(Index);
1526  if (Offset.uge(DL.getTypeAllocSize(ElementTy).getFixedSize()))
1527  return nullptr; // The offset points into alignment padding.
1528 
1529  Indices.push_back(IRB.getInt32(Index));
1530  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
1531  Indices, NamePrefix);
1532 }
1533 
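// A worked example of the recursion above, assuming a hypothetical [8 x i32]
// element type and a byte offset of 20: ElementSize = 4, so
// NumSkippedElements = 20 sdiv 4 = 5 with 0 bytes remaining; index 5 is
// appended and the recursion finishes once the i32 element matches (or can be
// descended to) TargetTy.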
1534 /// Get a natural GEP from a base pointer to a particular offset and
1535 /// resulting in a particular type.
1536 ///
1537 /// The goal is to produce a "natural" looking GEP that works with the existing
1538 /// composite types to arrive at the appropriate offset and element type for
1539 /// a pointer. TargetTy is the element type the returned GEP should point-to if
1540 /// possible. We recurse by decreasing Offset, adding the appropriate index to
1541 /// Indices, and setting Ty to the result subtype.
1542 ///
1543 /// If no natural GEP can be constructed, this function returns null.
1544 static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
1545  Value *Ptr, APInt Offset, Type *TargetTy,
1546  SmallVectorImpl<Value *> &Indices,
1547  const Twine &NamePrefix) {
1548  PointerType *Ty = cast<PointerType>(Ptr->getType());
1549 
1550  // Don't consider any GEPs through an i8* as natural unless the TargetTy is
1551  // an i8.
1552  if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
1553  return nullptr;
1554 
1555  Type *ElementTy = Ty->getElementType();
1556  if (!ElementTy->isSized())
1557  return nullptr; // We can't GEP through an unsized element.
1558  if (isa<ScalableVectorType>(ElementTy))
1559  return nullptr;
1560  APInt ElementSize(Offset.getBitWidth(),
1561  DL.getTypeAllocSize(ElementTy).getFixedSize());
1562  if (ElementSize == 0)
1563  return nullptr; // Zero-length arrays can't help us build a natural GEP.
1564  APInt NumSkippedElements = Offset.sdiv(ElementSize);
1565 
1566  Offset -= NumSkippedElements * ElementSize;
1567  Indices.push_back(IRB.getInt(NumSkippedElements));
1568  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
1569  Indices, NamePrefix);
1570 }
1571 
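// A brief worked example of the function above, assuming a hypothetical base
// pointer of type [4 x i32]* and a byte offset of 20: the pointee allocation
// size is 16, so one whole element is skipped (index 1, 4 bytes remaining),
// and the recursion then selects element 1 of the array, yielding indices
// equivalent to
//   getelementptr [4 x i32], [4 x i32]* %p, i64 1, i64 1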
1572 /// Compute an adjusted pointer from Ptr by Offset bytes where the
1573 /// resulting pointer has PointerTy.
1574 ///
1575 /// This tries very hard to compute a "natural" GEP which arrives at the offset
1576 /// and produces the pointer type desired. Where it cannot, it will try to use
1577 /// the natural GEP to arrive at the offset and bitcast to the type. Where that
1578 /// fails, it will try to use an existing i8* and GEP to the byte offset and
1579 /// bitcast to the type.
1580 ///
1581 /// The strategy for finding the more natural GEPs is to peel off layers of the
1582 /// pointer, walking back through bit casts and GEPs, searching for a base
1583 /// pointer from which we can compute a natural GEP with the desired
1584 /// properties. The algorithm tries to fold as many constant indices into
1585 /// a single GEP as possible, thus making each GEP more independent of the
1586 /// surrounding code.
1587 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1588  APInt Offset, Type *PointerTy,
1589  const Twine &NamePrefix) {
1590  // Even though we don't look through PHI nodes, we could be called on an
1591  // instruction in an unreachable block, which may be on a cycle.
1592  SmallPtrSet<Value *, 4> Visited;
1593  Visited.insert(Ptr);
1594  SmallVector<Value *, 4> Indices;
1595 
1596  // We may end up computing an offset pointer that has the wrong type. If we
1597  // never are able to compute one directly that has the correct type, we'll
1598  // fall back to it, so keep it and the base it was computed from around here.
1599  Value *OffsetPtr = nullptr;
1600  Value *OffsetBasePtr;
1601 
1602  // Remember any i8 pointer we come across to re-use if we need to do a raw
1603  // byte offset.
1604  Value *Int8Ptr = nullptr;
1605  APInt Int8PtrOffset(Offset.getBitWidth(), 0);
1606 
1607  PointerType *TargetPtrTy = cast<PointerType>(PointerTy);
1608  Type *TargetTy = TargetPtrTy->getElementType();
1609 
1610  // As an `addrspacecast` may be involved, `Ptr` (the storage pointer) may have
1611  // a different address space from the expected `PointerTy` (the pointer to be
1612  // used). Adjust the pointer type based on the original storage pointer.
1613  auto AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
1614  PointerTy = TargetTy->getPointerTo(AS);
1615 
1616  do {
1617  // First fold any existing GEPs into the offset.
1618  while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
1619  APInt GEPOffset(Offset.getBitWidth(), 0);
1620  if (!GEP->accumulateConstantOffset(DL, GEPOffset))
1621  break;
1622  Offset += GEPOffset;
1623  Ptr = GEP->getPointerOperand();
1624  if (!Visited.insert(Ptr).second)
1625  break;
1626  }
1627 
1628  // See if we can perform a natural GEP here.
1629  Indices.clear();
1630  if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
1631  Indices, NamePrefix)) {
1632  // If we have a new natural pointer at the offset, clear out any old
1633  // offset pointer we computed. Unless it is the base pointer or
1634  // a non-instruction, we built a GEP we don't need. Zap it.
1635  if (OffsetPtr && OffsetPtr != OffsetBasePtr)
1636  if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
1637  assert(I->use_empty() && "Built a GEP with uses somehow!");
1638  I->eraseFromParent();
1639  }
1640  OffsetPtr = P;
1641  OffsetBasePtr = Ptr;
1642  // If we also found a pointer of the right type, we're done.
1643  if (P->getType() == PointerTy)
1644  break;
1645  }
1646 
1647  // Stash this pointer if we've found an i8*.
1648  if (Ptr->getType()->isIntegerTy(8)) {
1649  Int8Ptr = Ptr;
1650  Int8PtrOffset = Offset;
1651  }
1652 
1653  // Peel off a layer of the pointer and update the offset appropriately.
1654  if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
1655  Ptr = cast<Operator>(Ptr)->getOperand(0);
1656  } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
1657  if (GA->isInterposable())
1658  break;
1659  Ptr = GA->getAliasee();
1660  } else {
1661  break;
1662  }
1663  assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
1664  } while (Visited.insert(Ptr).second);
1665 
1666  if (!OffsetPtr) {
1667  if (!Int8Ptr) {
1668  Int8Ptr = IRB.CreateBitCast(
1669  Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
1670  NamePrefix + "sroa_raw_cast");
1671  Int8PtrOffset = Offset;
1672  }
1673 
1674  OffsetPtr = Int8PtrOffset == 0
1675  ? Int8Ptr
1676  : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
1677  IRB.getInt(Int8PtrOffset),
1678  NamePrefix + "sroa_raw_idx");
1679  }
1680  Ptr = OffsetPtr;
1681 
1682  // On the off chance we were targeting i8*, guard the bitcast here.
1683  if (cast<PointerType>(Ptr->getType()) != TargetPtrTy) {
1684  Ptr = IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr,
1685  TargetPtrTy,
1686  NamePrefix + "sroa_cast");
1687  }
1688 
1689  return Ptr;
1690 }
1691 
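// When no natural GEP is found above, the fallback path emits IR along these
// lines (a sketch, assuming a hypothetical pointer %a and a 7-byte offset):
//   %raw  = bitcast <ty>* %a to i8*                       ; ...sroa_raw_cast
//   %idx  = getelementptr inbounds i8, i8* %raw, i64 7    ; ...sroa_raw_idx
//   %cast = bitcast i8* %idx to <target ty>*              ; ...sroa_cast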
1692 /// Compute the adjusted alignment for a load or store from an offset.
1693 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1694  return commonAlignment(getLoadStoreAlignment(I), Offset);
1695 }
1696 
1697 /// Test whether we can convert a value from the old to the new type.
1698 ///
1699 /// This predicate should be used to guard calls to convertValue in order to
1700 /// ensure that we only try to convert viable values. The strategy is that we
1701 /// will peel off single element struct and array wrappings to get to an
1702 /// underlying value, and convert that value.
1703 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
1704  if (OldTy == NewTy)
1705  return true;
1706 
1707  // For integer types, we can't handle any bit-width differences. This would
1708  // break both vector conversions with extension and introduce endianness
1709  // issues when in conjunction with loads and stores.
1710  if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1711  assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1712  cast<IntegerType>(NewTy)->getBitWidth() &&
1713  "We can't have the same bitwidth for different int types");
1714  return false;
1715  }
1716 
1717  if (DL.getTypeSizeInBits(NewTy).getFixedSize() !=
1718  DL.getTypeSizeInBits(OldTy).getFixedSize())
1719  return false;
1720  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1721  return false;
1722 
1723  // We can convert pointers to integers and vice-versa. Same for vectors
1724  // of pointers and integers.
1725  OldTy = OldTy->getScalarType();
1726  NewTy = NewTy->getScalarType();
1727  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1728  if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1729  unsigned OldAS = OldTy->getPointerAddressSpace();
1730  unsigned NewAS = NewTy->getPointerAddressSpace();
1731  // Convert pointers if they are pointers from the same address space or
1732  // different integral (not non-integral) address spaces with the same
1733  // pointer size.
1734  return OldAS == NewAS ||
1735  (!DL.isNonIntegralAddressSpace(OldAS) &&
1736  !DL.isNonIntegralAddressSpace(NewAS) &&
1737  DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1738  }
1739 
1740  // We can convert integers to integral pointers, but not to non-integral
1741  // pointers.
1742  if (OldTy->isIntegerTy())
1743  return !DL.isNonIntegralPointerType(NewTy);
1744 
1745  // We can convert integral pointers to integers, but non-integral pointers
1746  // need to remain pointers.
1747  if (!DL.isNonIntegralPointerType(OldTy))
1748  return NewTy->isIntegerTy();
1749 
1750  return false;
1751  }
1752 
1753  return true;
1754 }
1755 
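// A few concrete consequences of the rules above (assuming default, integral
// address spaces and 64-bit pointers): i64 <-> i8* is convertible,
// <2 x i32> <-> i64 is convertible (same 64-bit size, both single-value types),
// but i32 <-> i64 is rejected because integer bit widths must match exactly.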
1756 /// Generic routine to convert an SSA value to a value of a different
1757 /// type.
1758 ///
1759 /// This will try various different casting techniques, such as bitcasts,
1760 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
1761 /// two types for viability with this routine.
1762 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
1763  Type *NewTy) {
1764  Type *OldTy = V->getType();
1765  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
1766 
1767  if (OldTy == NewTy)
1768  return V;
1769 
1770  assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
1771  "Integer types must be the exact same to convert.");
1772 
1773  // See if we need inttoptr for this type pair. May require additional bitcast.
1774  if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
1775  // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
1776  // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
1777  // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
1778  // Directly handle i64 to i8*
1779  return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
1780  NewTy);
1781  }
1782 
1783  // See if we need ptrtoint for this type pair. May require additional bitcast.
1784  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
1785  // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
1786  // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
1787  // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
1788  // Expand i8* to i64 --> i8* to i64 to i64
1789  return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
1790  NewTy);
1791  }
1792 
1793  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
1794  unsigned OldAS = OldTy->getPointerAddressSpace();
1795  unsigned NewAS = NewTy->getPointerAddressSpace();
1796  // To convert pointers in different address spaces (they have already been
1797  // checked to be convertible, i.e. they have the same pointer size), we
1798  // cannot use `bitcast` (which requires the same address space) or
1799  // `addrspacecast` (which is not always a no-op cast). Instead, use a pair
1800  // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
1801  // size.
1802  if (OldAS != NewAS) {
1803  assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1804  return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
1805  NewTy);
1806  }
1807  }
1808 
1809  return IRB.CreateBitCast(V, NewTy);
1810 }
1811 
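// For the mixed-address-space case above, the emitted IR looks roughly like
// this (a sketch, assuming hypothetical integral address spaces 1 and 2 with
// equal pointer sizes):
//   %i = ptrtoint i32 addrspace(1)* %p to i64
//   %q = inttoptr i64 %i to i32 addrspace(2)*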
1812 /// Test whether the given slice use can be promoted to a vector.
1813 ///
1814 /// This function is called to test each entry in a partition which is slated
1815 /// for a single slice.
1816 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
1817  VectorType *Ty,
1818  uint64_t ElementSize,
1819  const DataLayout &DL) {
1820  // First validate the slice offsets.
1821  uint64_t BeginOffset =
1822  std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
1823  uint64_t BeginIndex = BeginOffset / ElementSize;
1824  if (BeginIndex * ElementSize != BeginOffset ||
1825  BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
1826  return false;
1827  uint64_t EndOffset =
1828  std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
1829  uint64_t EndIndex = EndOffset / ElementSize;
1830  if (EndIndex * ElementSize != EndOffset ||
1831  EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
1832  return false;
1833 
1834  assert(EndIndex > BeginIndex && "Empty vector!");
1835  uint64_t NumElements = EndIndex - BeginIndex;
1836  Type *SliceTy = (NumElements == 1)
1837  ? Ty->getElementType()
1838  : FixedVectorType::get(Ty->getElementType(), NumElements);
1839 
1840  Type *SplitIntTy =
1841  Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
1842 
1843  Use *U = S.getUse();
1844 
1845  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
1846  if (MI->isVolatile())
1847  return false;
1848  if (!S.isSplittable())
1849  return false; // Skip any unsplittable intrinsics.
1850  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
1851  if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
1852  return false;
1853  } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
1854  // Disable vector promotion when there are loads or stores of an FCA.
1855  return false;
1856  } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1857  if (LI->isVolatile())
1858  return false;
1859  Type *LTy = LI->getType();
1860  if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
1861  assert(LTy->isIntegerTy());
1862  LTy = SplitIntTy;
1863  }
1864  if (!canConvertValue(DL, SliceTy, LTy))
1865  return false;
1866  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1867  if (SI->isVolatile())
1868  return false;
1869  Type *STy = SI->getValueOperand()->getType();
1870  if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
1871  assert(STy->isIntegerTy());
1872  STy = SplitIntTy;
1873  }
1874  if (!canConvertValue(DL, STy, SliceTy))
1875  return false;
1876  } else {
1877  return false;
1878  }
1879 
1880  return true;
1881 }
1882 
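// The index arithmetic above, illustrated with an assumed <4 x i32> candidate
// (ElementSize = 4): a slice covering partition-relative bytes [4, 12) gives
// BeginIndex = 1 and EndIndex = 3, so SliceTy becomes <2 x i32> and the slice
// is viable only if its load or store type can be converted to or from that.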
1883 /// Test whether the given alloca partitioning and range of slices can be
1884 /// promoted to a vector.
1885 ///
1886 /// This is a quick test to check whether we can rewrite a particular alloca
1887 /// partition (and its newly formed alloca) into a vector alloca with only
1888 /// whole-vector loads and stores such that it could be promoted to a vector
1889 /// SSA value. We only can ensure this for a limited set of operations, and we
1890 /// don't want to do the rewrites unless we are confident that the result will
1891 /// be promotable, so we have an early test here.
1892 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
1893  // Collect the candidate types for vector-based promotion. Also track whether
1894  // we have different element types.
1895  SmallVector<VectorType *, 4> CandidateTys;
1896  Type *CommonEltTy = nullptr;
1897  bool HaveCommonEltTy = true;
1898  auto CheckCandidateType = [&](Type *Ty) {
1899  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1900  // Bail out if this vector type differs from the others in total size in bits.
1901  if (!CandidateTys.empty()) {
1902  VectorType *V = CandidateTys[0];
1903  if (DL.getTypeSizeInBits(VTy).getFixedSize() !=
1904  DL.getTypeSizeInBits(V).getFixedSize()) {
1905  CandidateTys.clear();
1906  return;
1907  }
1908  }
1909  CandidateTys.push_back(VTy);
1910  if (!CommonEltTy)
1911  CommonEltTy = VTy->getElementType();
1912  else if (CommonEltTy != VTy->getElementType())
1913  HaveCommonEltTy = false;
1914  }
1915  };
1916  // Consider any loads or stores that are the exact size of the slice.
1917  for (const Slice &S : P)
1918  if (S.beginOffset() == P.beginOffset() &&
1919  S.endOffset() == P.endOffset()) {
1920  if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
1921  CheckCandidateType(LI->getType());
1922  else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
1923  CheckCandidateType(SI->getValueOperand()->getType());
1924  }
1925 
1926  // If we didn't find a vector type, nothing to do here.
1927  if (CandidateTys.empty())
1928  return nullptr;
1929 
1930  // Remove non-integer vector types if we had multiple common element types.
1931  // FIXME: It'd be nice to replace them with integer vector types, but we can't
1932  // do that until all the backends are known to produce good code for all
1933  // integer vector types.
1934  if (!HaveCommonEltTy) {
1935  llvm::erase_if(CandidateTys, [](VectorType *VTy) {
1936  return !VTy->getElementType()->isIntegerTy();
1937  });
1938 
1939  // If there were no integer vector types, give up.
1940  if (CandidateTys.empty())
1941  return nullptr;
1942 
1943  // Rank the remaining candidate vector types. This is easy because we know
1944  // they're all integer vectors. We sort by ascending number of elements.
1945  auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
1946  (void)DL;
1947  assert(DL.getTypeSizeInBits(RHSTy).getFixedSize() ==
1948  DL.getTypeSizeInBits(LHSTy).getFixedSize() &&
1949  "Cannot have vector types of different sizes!");
1950  assert(RHSTy->getElementType()->isIntegerTy() &&
1951  "All non-integer types eliminated!");
1952  assert(LHSTy->getElementType()->isIntegerTy() &&
1953  "All non-integer types eliminated!");
1954  return cast<FixedVectorType>(RHSTy)->getNumElements() <
1955  cast<FixedVectorType>(LHSTy)->getNumElements();
1956  };
1957  llvm::sort(CandidateTys, RankVectorTypes);
1958  CandidateTys.erase(
1959  std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
1960  CandidateTys.end());
1961  } else {
1962 // The only way to have the same element type in every vector type is to
1963 // have the same vector type. Check that and remove all but one.
1964 #ifndef NDEBUG
1965  for (VectorType *VTy : CandidateTys) {
1966  assert(VTy->getElementType() == CommonEltTy &&
1967  "Unaccounted for element type!");
1968  assert(VTy == CandidateTys[0] &&
1969  "Different vector types with the same element type!");
1970  }
1971 #endif
1972  CandidateTys.resize(1);
1973  }
1974 
1975  // Try each vector type, and return the one which works.
1976  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
1977  uint64_t ElementSize =
1978  DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
1979 
1980  // While the definition of LLVM vectors is bitpacked, we don't support sizes
1981  // that aren't byte sized.
1982  if (ElementSize % 8)
1983  return false;
1984  assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
1985  "vector size not a multiple of element size?");
1986  ElementSize /= 8;
1987 
1988  for (const Slice &S : P)
1989  if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
1990  return false;
1991 
1992  for (const Slice *S : P.splitSliceTails())
1993  if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
1994  return false;
1995 
1996  return true;
1997  };
1998  for (VectorType *VTy : CandidateTys)
1999  if (CheckVectorTypeForPromotion(VTy))
2000  return VTy;
2001 
2002  return nullptr;
2003 }
2004 
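// For instance, if an assumed partition is accessed both as <4 x i32> and as
// <16 x i8> (equal total size but different element types), both survive the
// integer-element filter, and the ascending sort by element count means
// <4 x i32> is tried for promotion before <16 x i8>.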
2005 /// Test whether a slice of an alloca is valid for integer widening.
2006 ///
2007 /// This implements the necessary checking for the \c isIntegerWideningViable
2008 /// test below on a single slice of the alloca.
2009 static bool isIntegerWideningViableForSlice(const Slice &S,
2010  uint64_t AllocBeginOffset,
2011  Type *AllocaTy,
2012  const DataLayout &DL,
2013  bool &WholeAllocaOp) {
2014  uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
2015 
2016  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2017  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2018 
2019  // We can't reasonably handle cases where the load or store extends past
2020  // the end of the alloca's type and into its padding.
2021  if (RelEnd > Size)
2022  return false;
2023 
2024  Use *U = S.getUse();
2025 
2026  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2027  if (LI->isVolatile())
2028  return false;
2029  // We can't handle loads that extend past the allocated memory.
2030  if (DL.getTypeStoreSize(LI->getType()).getFixedSize() > Size)
2031  return false;
2032  // So far, AllocaSliceRewriter does not support widening split slice tails
2033  // in rewriteIntegerLoad.
2034  if (S.beginOffset() < AllocBeginOffset)
2035  return false;
2036  // Note that we don't count vector loads or stores as whole-alloca
2037  // operations which enable integer widening because we would prefer to use
2038  // vector widening instead.
2039  if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2040  WholeAllocaOp = true;
2041  if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2042  if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
2043  return false;
2044  } else if (RelBegin != 0 || RelEnd != Size ||
2045  !canConvertValue(DL, AllocaTy, LI->getType())) {
2046  // Non-integer loads need to be convertible from the alloca type so that
2047  // they are promotable.
2048  return false;
2049  }
2050  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2051  Type *ValueTy = SI->getValueOperand()->getType();
2052  if (SI->isVolatile())
2053  return false;
2054  // We can't handle stores that extend past the allocated memory.
2055  if (DL.getTypeStoreSize(ValueTy).getFixedSize() > Size)
2056  return false;
2057  // So far, AllocaSliceRewriter does not support widening split slice tails
2058  // in rewriteIntegerStore.
2059  if (S.beginOffset() < AllocBeginOffset)
2060  return false;
2061  // Note that we don't count vector loads or stores as whole-alloca
2062  // operations which enable integer widening because we would prefer to use
2063  // vector widening instead.
2064  if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2065  WholeAllocaOp = true;
2066  if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2067  if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
2068  return false;
2069  } else if (RelBegin != 0 || RelEnd != Size ||
2070  !canConvertValue(DL, ValueTy, AllocaTy)) {
2071  // Non-integer stores need to be convertible to the alloca type so that
2072  // they are promotable.
2073  return false;
2074  }
2075  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2076  if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2077  return false;
2078  if (!S.isSplittable())
2079  return false; // Skip any unsplittable intrinsics.
2080  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2081  if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2082  return false;
2083  } else {
2084  return false;
2085  }
2086 
2087  return true;
2088 }
2089 
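// Concretely, for an assumed alloca of type i64 (Size = 8): an i32 load at
// relative bytes [4, 8) is acceptable (it neither extends past the alloca nor
// is volatile), but it does not set WholeAllocaOp; any access whose store size
// exceeds 8 bytes, or any volatile access, rejects the slice outright.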
2090 /// Test whether the given alloca partition's integer operations can be
2091 /// widened to promotable ones.
2092 ///
2093 /// This is a quick test to check whether we can rewrite the integer loads and
2094 /// stores to a particular alloca into wider loads and stores and be able to
2095 /// promote the resulting alloca.
2096 static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2097  const DataLayout &DL) {
2098  uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
2099  // Don't create integer types larger than the maximum bitwidth.
2100  if (SizeInBits > IntegerType::MAX_INT_BITS)
2101  return false;
2102 
2103  // Don't try to handle allocas with bit-padding.
2104  if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedSize())
2105  return false;
2106 
2107  // We need to ensure that an integer type with the appropriate bitwidth can
2108  // be converted to the alloca type, whatever that is. We don't want to force
2109  // the alloca itself to have an integer type if there is a more suitable one.
2110  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2111  if (!canConvertValue(DL, AllocaTy, IntTy) ||
2112  !canConvertValue(DL, IntTy, AllocaTy))
2113  return false;
2114 
2115  // While examining uses, we ensure that the alloca has a covering load or
2116  // store. We don't want to widen the integer operations only to fail to
2117  // promote due to some other unsplittable entry (which we may make splittable
2118  // later). However, if there are only splittable uses, go ahead and assume
2119  // that we cover the alloca.
2120  // FIXME: We shouldn't consider split slices that happen to start in the
2121  // partition here...
2122  bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2123 
2124  for (const Slice &S : P)
2125  if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2126  WholeAllocaOp))
2127  return false;
2128 
2129  for (const Slice *S : P.splitSliceTails())
2130  if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2131  WholeAllocaOp))
2132  return false;
2133 
2134  return WholeAllocaOp;
2135 }
2136 
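// As an example of the covering requirement above: an assumed i64 alloca
// touched only by two i32 loads (at offsets 0 and 4) fails this test because
// no single operation covers the whole alloca, whereas adding one i64 load or
// store over the full 8 bytes sets WholeAllocaOp and makes widening viable.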
2137 static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2138  IntegerType *Ty, uint64_t Offset,
2139  const Twine &Name) {
2140  LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2141  IntegerType *IntTy = cast<IntegerType>(V->getType());
2142  assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
2143  DL.getTypeStoreSize(IntTy).getFixedSize() &&
2144  "Element extends past full value");
2145  uint64_t ShAmt = 8 * Offset;
2146  if (DL.isBigEndian())
2147  ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
2148  DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
2149  if (ShAmt) {
2150  V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2151  LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2152  }
2153  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2154  "Cannot extract to a larger integer!");
2155  if (Ty != IntTy) {
2156  V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2157  LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2158  }
2159  return V;
2160 }
2161 
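// A worked example of the shift computation above, assuming V is an i64 and we
// extract an i16 at byte Offset 2: on a little-endian target ShAmt = 8 * 2 = 16
// and the result is trunc(lshr(V, 16)); on a big-endian target
// ShAmt = 8 * (8 - 2 - 2) = 32 and the result is trunc(lshr(V, 32)).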
2162 static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2163  Value *V, uint64_t Offset, const Twine &Name) {
2164  IntegerType *IntTy = cast<IntegerType>(Old->getType());
2165  IntegerType *Ty = cast<IntegerType>(V->getType());
2166  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2167  "Cannot insert a larger integer!");
2168  LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2169  if (Ty != IntTy) {
2170  V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2171  LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2172  }
2173  assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
2174  DL.getTypeStoreSize(IntTy).getFixedSize() &&
2175  "Element store outside of alloca store");
2176  uint64_t ShAmt = 8 * Offset;
2177  if (DL.isBigEndian())
2178  ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
2179  DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
2180  if (ShAmt) {
2181  V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2182  LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2183  }
2184 
2185  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2186  APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2187  Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2188  LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2189  V = IRB.CreateOr(Old, V, Name + ".insert");
2190  LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2191  }
2192  return V;
2193 }
2194 
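// Continuing the same assumed example (inserting an i16 into an i64 at byte
// Offset 2, little-endian): V is zero-extended to i64 and shifted left by 16,
// the old value is masked with ~(0xFFFF << 16) to clear those bits, and the
// two are OR'd together to form the final integer.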
2195 static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2196  unsigned EndIndex, const Twine &Name) {
2197  auto *VecTy = cast<FixedVectorType>(V->getType());
2198  unsigned NumElements = EndIndex - BeginIndex;
2199  assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2200 
2201  if (NumElements == VecTy->getNumElements())
2202  return V;
2203 
2204  if (NumElements == 1) {
2205  V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2206  Name + ".extract");
2207  LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2208  return V;
2209  }
2210 
2211  SmallVector<int, 8> Mask;
2212  Mask.reserve(NumElements);
2213  for (unsigned i = BeginIndex; i != EndIndex; ++i)
2214  Mask.push_back(i);
2215  V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2216  LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2217  return V;
2218 }
2219 
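// For example (assuming V is a <4 x i32> with BeginIndex = 1 and EndIndex = 3),
// the shuffle above uses the mask <1, 2> and produces a <2 x i32> holding
// elements 1 and 2 of V.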
2220 static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2221  unsigned BeginIndex, const Twine &Name) {
2222  VectorType *VecTy = cast<VectorType>(Old->getType());
2223  assert(VecTy && "Can only insert a vector into a vector");
2224 
2225  VectorType *Ty = dyn_cast<VectorType>(V->getType());
2226  if (!Ty) {
2227  // Single element to insert.
2228  V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2229  Name + ".insert");
2230  LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2231  return V;
2232  }
2233 
2234  assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2235  cast<FixedVectorType>(VecTy)->getNumElements() &&
2236  "Too many elements!");
2237  if (cast<FixedVectorType>(Ty)->getNumElements() ==
2238  cast<FixedVectorType>(VecTy)->getNumElements()) {
2239  assert(V->getType() == VecTy && "Vector type mismatch");
2240  return V;
2241  }
2242  unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2243 
2244  // When inserting a smaller vector into the larger one to store, we first
2245  // use a shuffle vector to widen it with undef elements, and then
2246  // a second shuffle vector to select between the loaded vector and the
2247  // incoming vector.
2248  SmallVector<int, 8> Mask;
2249  Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2250  for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2251  if (i >= BeginIndex && i < EndIndex)
2252  Mask.push_back(i - BeginIndex);
2253  else
2254  Mask.push_back(-1);
2255  V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2256  LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2257 
2258  SmallVector<Constant *, 8> Mask2;
2259  Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2260  for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2261  Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2262 
2263  V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
2264 
2265  LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2266  return V;
2267 }
2268 
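// An example of the two-step insert above, assuming a <2 x i32> value inserted
// at BeginIndex = 1 of a <4 x i32> old value: the first shuffle widens V with
// the mask <-1, 0, 1, -1> (undef lanes outside [1, 3)), and the select then
// uses the condition <i1 false, true, true, false> to take lanes 1 and 2 from
// V and lanes 0 and 3 from the old value.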
2269 /// Visitor to rewrite instructions using a particular slice of an alloca
2270 /// to use a new alloca.
2271 ///
2272 /// Also implements the rewriting to vector-based accesses when the partition
2273 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2274 /// lives here.
2275 class llvm::sroa::AllocaSliceRewriter
2276  : public InstVisitor<AllocaSliceRewriter, bool> {
2277  // Befriend the base class so it can delegate to private visit methods.
2278  friend class InstVisitor<AllocaSliceRewriter, bool>;
2279 
2280  using Base = InstVisitor<AllocaSliceRewriter, bool>;
2281 
2282  const DataLayout &DL;
2283  AllocaSlices &AS;
2284  SROA &Pass;
2285  AllocaInst &OldAI, &NewAI;
2286  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2287  Type *NewAllocaTy;
2288 
2289  // This is a convenience and flag variable that will be null unless the new
2290  // alloca's integer operations should be widened to this integer type due to
2291  // passing isIntegerWideningViable above. If it is non-null, the desired
2292  // integer type will be stored here for easy access during rewriting.
2293  IntegerType *IntTy;
2294 
2295  // If we are rewriting an alloca partition which can be written as pure
2296  // vector operations, we stash extra information here. When VecTy is
2297  // non-null, we have some strict guarantees about the rewritten alloca:
2298  // - The new alloca is exactly the size of the vector type here.
2299  // - The accesses all either map to the entire vector or to a single
2300  // element.
2301  // - The set of accessing instructions is only one of those handled above
2302  // in isVectorPromotionViable. Generally these are the same access kinds
2303  // which are promotable via mem2reg.
2304  VectorType *VecTy;
2305  Type *ElementTy;
2306  uint64_t ElementSize;
2307 
2308  // The original offset of the slice currently being rewritten relative to
2309  // the original alloca.
2310  uint64_t BeginOffset = 0;
2311  uint64_t EndOffset = 0;
2312 
2313  // The new offsets of the slice currently being rewritten relative to the
2314  // original alloca.
2315  uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2316 
2317  uint64_t SliceSize = 0;
2318  bool IsSplittable = false;
2319  bool IsSplit = false;
2320  Use *OldUse = nullptr;
2321  Instruction *OldPtr = nullptr;
2322 
2323  // Track post-rewrite users which are PHI nodes and Selects.
2324  SmallSetVector<PHINode *, 8> &PHIUsers;
2325  SmallSetVector<SelectInst *, 8> &SelectUsers;
2326 
2327  // Utility IR builder, whose name prefix is set up for each visited use, and
2328  // the insertion point is set to point to the user.
2329  IRBuilderTy IRB;
2330 
2331 public:
2332  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2333  AllocaInst &OldAI, AllocaInst &NewAI,
2334  uint64_t NewAllocaBeginOffset,
2335  uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2336  VectorType *PromotableVecTy,
2337  SmallSetVector<PHINode *, 8> &PHIUsers,
2338  SmallSetVector<SelectInst *, 8> &SelectUsers)
2339  : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2340  NewAllocaBeginOffset(NewAllocaBeginOffset),
2341  NewAllocaEndOffset(NewAllocaEndOffset),
2342  NewAllocaTy(NewAI.getAllocatedType()),
2343  IntTy(
2344  IsIntegerPromotable
2345  ? Type::getIntNTy(NewAI.getContext(),
2346  DL.getTypeSizeInBits(NewAI.getAllocatedType())
2347  .getFixedSize())
2348  : nullptr),
2349  VecTy(PromotableVecTy),
2350  ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2351  ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8
2352  : 0),
2353  PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2354  IRB(NewAI.getContext(), ConstantFolder()) {
2355  if (VecTy) {
2356  assert((DL.getTypeSizeInBits(ElementTy).getFixedSize() % 8) == 0 &&
2357  "Only multiple-of-8 sized vector elements are viable");
2358  ++NumVectorized;
2359  }
2360  assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2361  }
2362 
2363  bool visit(AllocaSlices::const_iterator I) {
2364  bool CanSROA = true;
2365  BeginOffset = I->beginOffset();
2366  EndOffset = I->endOffset();
2367  IsSplittable = I->isSplittable();
2368  IsSplit =
2369  BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2370  LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2371  LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2372  LLVM_DEBUG(dbgs() << "\n");
2373 
2374  // Compute the intersecting offset range.
2375  assert(BeginOffset < NewAllocaEndOffset);
2376  assert(EndOffset > NewAllocaBeginOffset);
2377  NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2378  NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2379 
2380  SliceSize = NewEndOffset - NewBeginOffset;
2381 
2382  OldUse = I->getUse();
2383  OldPtr = cast<Instruction>(OldUse->get());
2384 
2385  Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2386  IRB.SetInsertPoint(OldUserI);
2387  IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2388  IRB.getInserter().SetNamePrefix(
2389  Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
2390 
2391  CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2392  if (VecTy || IntTy)
2393  assert(CanSROA);
2394  return CanSROA;
2395  }
2396 
2397 private:
2398  // Make sure the other visit overloads are visible.
2399  using Base::visit;
2400 
2401  // Every instruction which can end up as a user must have a rewrite rule.
2402  bool visitInstruction(Instruction &I) {
2403  LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
2404  llvm_unreachable("No rewrite rule for this instruction!");
2405  }
2406 
2407  Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
2408  // Note that the offset computation can use BeginOffset or NewBeginOffset
2409  // interchangeably for unsplit slices.
2410  assert(IsSplit || BeginOffset == NewBeginOffset);
2411  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2412 
2413 #ifndef NDEBUG
2414  StringRef OldName = OldPtr->getName();
2415  // Skip through the last '.sroa.' component of the name.
2416  size_t LastSROAPrefix = OldName.rfind(".sroa.");
2417  if (LastSROAPrefix != StringRef::npos) {
2418  OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
2419  // Look for an SROA slice index.
2420  size_t IndexEnd = OldName.find_first_not_of("0123456789");
2421  if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
2422  // Strip the index and look for the offset.
2423  OldName = OldName.substr(IndexEnd + 1);
2424  size_t OffsetEnd = OldName.find_first_not_of("0123456789");
2425  if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
2426  // Strip the offset.
2427  OldName = OldName.substr(OffsetEnd + 1);
2428  }
2429  }
2430  // Strip any SROA suffixes as well.
2431  OldName = OldName.substr(0, OldName.find(".sroa_"));
2432 #endif
2433 
2434  return getAdjustedPtr(IRB, DL, &NewAI,
2435  APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
2436  PointerTy,
2437 #ifndef NDEBUG
2438  Twine(OldName) + "."
2439 #else
2440  Twine()
2441 #endif
2442  );
2443  }
2444 
2445  /// Compute suitable alignment to access this slice of the *new*
2446  /// alloca.
2447  ///
2448  /// This is the new alloca's alignment reduced (via commonAlignment) by the
2449  /// slice's byte offset from the start of the new alloca.
2450  Align getSliceAlign() {
2451  return commonAlignment(NewAI.getAlign(),
2452  NewBeginOffset - NewAllocaBeginOffset);
2453  }
2454 
2455  unsigned getIndex(uint64_t Offset) {
2456  assert(VecTy && "Can only call getIndex when rewriting a vector");
2457  uint64_t RelOffset = Offset - NewAllocaBeginOffset;
2458  assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
2459  uint32_t Index = RelOffset / ElementSize;
2460  assert(Index * ElementSize == RelOffset);
2461  return Index;
2462  }
2463 
2464  void deleteIfTriviallyDead(Value *V) {
2465  Instruction *I = cast<Instruction>(V);
2466  if (isInstructionTriviallyDead(I))
2467  Pass.DeadInsts.push_back(I);
2468  }
2469 
2470  Value *rewriteVectorizedLoadInst() {
2471  unsigned BeginIndex = getIndex(NewBeginOffset);
2472  unsigned EndIndex = getIndex(NewEndOffset);
2473  assert(EndIndex > BeginIndex && "Empty vector!");
2474 
2475  Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2476  NewAI.getAlign(), "load");
2477  return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
2478  }
2479 
2480  Value *rewriteIntegerLoad(LoadInst &LI) {
2481  assert(IntTy && "We cannot insert an integer to the alloca");
2482  assert(!LI.isVolatile());
2483  Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2484  NewAI.getAlign(), "load");
2485  V = convertValue(DL, IRB, V, IntTy);
2486  assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2487  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2488  if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
2489  IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
2490  V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
2491  }
2492  // It is possible that the extracted type is not the load type. This
2493  // happens if there is a load past the end of the alloca, and as
2494  // a consequence the slice is narrower but still a candidate for integer
2495  // lowering. To handle this case, we just zero extend the extracted
2496  // integer.
2497  assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
2498  "Can only handle an extract for an overly wide load");
2499  if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
2500  V = IRB.CreateZExt(V, LI.getType());
2501  return V;
2502  }
2503 
2504  bool visitLoadInst(LoadInst &LI) {
2505  LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
2506  Value *OldOp = LI.getOperand(0);
2507  assert(OldOp == OldPtr);
2508 
2509  AAMDNodes AATags;
2510  LI.getAAMetadata(AATags);
2511 
2512  unsigned AS = LI.getPointerAddressSpace();
2513 
2514  Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
2515  : LI.getType();
2516  const bool IsLoadPastEnd =
2517  DL.getTypeStoreSize(TargetTy).getFixedSize() > SliceSize;
2518  bool IsPtrAdjusted = false;
2519  Value *V;
2520  if (VecTy) {
2521  V = rewriteVectorizedLoadInst();
2522  } else if (IntTy && LI.getType()->isIntegerTy()) {
2523  V = rewriteIntegerLoad(LI);
2524  } else if (NewBeginOffset == NewAllocaBeginOffset &&
2525  NewEndOffset == NewAllocaEndOffset &&
2526  (canConvertValue(DL, NewAllocaTy, TargetTy) ||
2527  (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
2528  TargetTy->isIntegerTy()))) {
2529  LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2530  NewAI.getAlign(), LI.isVolatile(),
2531  LI.getName());
2532  if (AATags)
2533  NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2534  if (LI.isVolatile())
2535  NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2536  if (NewLI->isAtomic())
2537  NewLI->setAlignment(LI.getAlign());
2538 
2539  // Any !nonnull metadata or !range metadata on the old load is also valid
2540  // on the new load. This is true in some cases even when the loads
2541  // are different types, for example by mapping !nonnull metadata to
2542  // !range metadata by modeling the null pointer constant converted to the
2543  // integer type.
2544  // FIXME: Add support for range metadata here. Currently the utilities
2545  // for this don't propagate range metadata in trivial cases from one
2546  // integer load to another, don't handle non-addrspace-0 null pointers
2547  // correctly, and don't have any support for mapping ranges as the
2548  // integer type becomes wider or narrower.
2549  if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
2550  copyNonnullMetadata(LI, N, *NewLI);
2551 
2552  // Try to preserve nonnull metadata
2553  V = NewLI;
2554 
2555  // If this is an integer load past the end of the slice (which means the
2556  // bytes outside the slice are undef or this load is dead) just forcibly
2557  // fix the integer size with correct handling of endianness.
2558  if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
2559  if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
2560  if (AITy->getBitWidth() < TITy->getBitWidth()) {
2561  V = IRB.CreateZExt(V, TITy, "load.ext");
2562  if (DL.isBigEndian())
2563  V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
2564  "endian_shift");
2565  }
2566  } else {
2567  Type *LTy = TargetTy->getPointerTo(AS);
2568  LoadInst *NewLI =
2569  IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
2570  getSliceAlign(), LI.isVolatile(), LI.getName());
2571  if (AATags)
2572  NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2573  if (LI.isVolatile())
2574  NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2575 
2576  V = NewLI;
2577  IsPtrAdjusted = true;
2578  }
2579  V = convertValue(DL, IRB, V, TargetTy);
2580 
2581  if (IsSplit) {
2582  assert(!LI.isVolatile());
2583  assert(LI.getType()->isIntegerTy() &&
2584  "Only integer type loads and stores are split");
2585  assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedSize() &&
2586  "Split load isn't smaller than original load");
2587  assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
2588  "Non-byte-multiple bit width");
2589  // Move the insertion point just past the load so that we can refer to it.
2590  IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
2591  // Create a placeholder value with the same type as LI to use as the
2592  // basis for the new value. This allows us to replace the uses of LI with
2593  // the computed value, and then replace the placeholder with LI, leaving
2594  // LI only used for this computation.
2595  Value *Placeholder = new LoadInst(
2596  LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)), "",
2597  false, Align(1));
2598  V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
2599  "insert");
2600  LI.replaceAllUsesWith(V);
2601  Placeholder->replaceAllUsesWith(&LI);
2602  Placeholder->deleteValue();
2603  } else {
2604  LI.replaceAllUsesWith(V);
2605  }
2606 
2607  Pass.DeadInsts.push_back(&LI);
2608  deleteIfTriviallyDead(OldOp);
2609  LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
2610  return !LI.isVolatile() && !IsPtrAdjusted;
2611  }
2612 
2613  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
2614  AAMDNodes AATags) {
2615  if (V->getType() != VecTy) {
2616  unsigned BeginIndex = getIndex(NewBeginOffset);
2617  unsigned EndIndex = getIndex(NewEndOffset);
2618  assert(EndIndex > BeginIndex && "Empty vector!");
2619  unsigned NumElements = EndIndex - BeginIndex;
2620  assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
2621  "Too many elements!");
2622  Type *SliceTy = (NumElements == 1)
2623  ? ElementTy
2624  : FixedVectorType::get(ElementTy, NumElements);
2625  if (V->getType() != SliceTy)
2626  V = convertValue(DL, IRB, V, SliceTy);
2627 
2628  // Mix in the existing elements.
2629  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2630  NewAI.getAlign(), "load");
2631  V = insertVector(IRB, Old, V, BeginIndex, "vec");
2632  }
2633  StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
2634  if (AATags)
2635  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2636  Pass.DeadInsts.push_back(&SI);
2637 
2638  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
2639  return true;
2640  }
2641 
2642  bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
2643  assert(IntTy && "We cannot extract an integer from the alloca");
2644  assert(!SI.isVolatile());
2645  if (DL.getTypeSizeInBits(V->getType()).getFixedSize() !=
2646  IntTy->getBitWidth()) {
2647  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2648  NewAI.getAlign(), "oldload");
2649  Old = convertValue(DL, IRB, Old, IntTy);
2650  assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2651  uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
2652  V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
2653  }
2654  V = convertValue(DL, IRB, V, NewAllocaTy);
2655  StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
2656  Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2657  LLVMContext::MD_access_group});
2658  if (AATags)
2659  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2660  Pass.DeadInsts.push_back(&SI);
2661  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
2662  return true;
2663  }
2664 
2665  bool visitStoreInst(StoreInst &SI) {
2666  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
2667  Value *OldOp = SI.getOperand(1);
2668  assert(OldOp == OldPtr);
2669 
2670  AAMDNodes AATags;
2671  SI.getAAMetadata(AATags);
2672 
2673  Value *V = SI.getValueOperand();
2674 
2675  // Strip all inbounds GEPs and pointer casts to try to dig out any root
2676  // alloca that should be re-examined after promoting this alloca.
2677  if (V->getType()->isPointerTy())
2678  if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
2679  Pass.PostPromotionWorklist.insert(AI);
2680 
2681  if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedSize()) {
2682  assert(!SI.isVolatile());
2683  assert(V->getType()->isIntegerTy() &&
2684  "Only integer type loads and stores are split");
2685  assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
2686  "Non-byte-multiple bit width");
2687  IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
2688  V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
2689  "extract");
2690  }
2691 
2692  if (VecTy)
2693  return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
2694  if (IntTy && V->getType()->isIntegerTy())
2695  return rewriteIntegerStore(V, SI, AATags);
2696 
2697  const bool IsStorePastEnd =
2698  DL.getTypeStoreSize(V->getType()).getFixedSize() > SliceSize;
2699  StoreInst *NewSI;
2700  if (NewBeginOffset == NewAllocaBeginOffset &&
2701  NewEndOffset == NewAllocaEndOffset &&
2702  (canConvertValue(DL, V->getType(), NewAllocaTy) ||
2703  (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
2704  V->getType()->isIntegerTy()))) {
2705  // If this is an integer store past the end of the slice (and thus the bytes
2706  // past that point are irrelevant or this is unreachable), truncate the
2707  // value prior to storing.
2708  if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
2709  if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
2710  if (VITy->getBitWidth() > AITy->getBitWidth()) {
2711  if (DL.isBigEndian())
2712  V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
2713  "endian_shift");
2714  V = IRB.CreateTrunc(V, AITy, "load.trunc");
2715  }
2716 
2717  V = convertValue(DL, IRB, V, NewAllocaTy);
2718  NewSI =
2719  IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
2720  } else {
2721  unsigned AS = SI.getPointerAddressSpace();
2722  Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
2723  NewSI =
2724  IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
2725  }
2726  NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2727  LLVMContext::MD_access_group});
2728  if (AATags)
2729  NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2730  if (SI.isVolatile())
2731  NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
2732  if (NewSI->isAtomic())
2733  NewSI->setAlignment(SI.getAlign());
2734  Pass.DeadInsts.push_back(&SI);
2735  deleteIfTriviallyDead(OldOp);
2736 
2737  LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
2738  return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
2739  }
2740 
2741  /// Compute an integer value from splatting an i8 across the given
2742  /// number of bytes.
2743  ///
2744  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
2745  /// call this routine.
2746  /// FIXME: Heed the advice above.
2747  ///
2748  /// \param V The i8 value to splat.
2749  /// \param Size The number of bytes in the output (assuming i8 is one byte)
2750  Value *getIntegerSplat(Value *V, unsigned Size) {
2751  assert(Size > 0 && "Expected a positive number of bytes.");
2752  IntegerType *VTy = cast<IntegerType>(V->getType());
2753  assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
2754  if (Size == 1)
2755  return V;
2756 
2757  Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
2758  V = IRB.CreateMul(
2759  IRB.CreateZExt(V, SplatIntTy, "zext"),
2760  ConstantExpr::getUDiv(
2761  Constant::getAllOnesValue(SplatIntTy),
2762  ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
2763  SplatIntTy)),
2764  "isplat");
2765  return V;
2766  }
2767 
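// A quick sanity check of the splat arithmetic above, assuming V = 0xAB and
// Size = 4: the divisor is 0xFFFFFFFF udiv 0xFF = 0x01010101, and
// 0xAB * 0x01010101 = 0xABABABAB, i.e. the byte replicated into all four byte
// positions.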
2768  /// Compute a vector splat for a given element value.
2769  Value *getVectorSplat(Value *V, unsigned NumElements) {
2770  V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
2771  LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
2772  return V;
2773  }
2774 
2775  bool visitMemSetInst(MemSetInst &II) {
2776  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
2777  assert(II.getRawDest() == OldPtr);
2778 
2779  AAMDNodes AATags;
2780  II.getAAMetadata(AATags);
2781 
2782  // If the memset has a variable size, it cannot be split, just adjust the
2783  // pointer to the new alloca.
2784  if (!isa<Constant>(II.getLength())) {
2785  assert(!IsSplit);
2786  assert(NewBeginOffset == BeginOffset);
2787  II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
2788  II.setDestAlignment(getSliceAlign());
2789 
2790  deleteIfTriviallyDead(OldPtr);
2791  return false;
2792  }
2793 
2794  // Record this instruction for deletion.
2795  Pass.DeadInsts.push_back(&II);
2796 
2797  Type *AllocaTy = NewAI.getAllocatedType();
2798  Type *ScalarTy = AllocaTy->getScalarType();
2799 
2800  const bool CanContinue = [&]() {
2801  if (VecTy || IntTy)
2802  return true;
2803  if (BeginOffset > NewAllocaBeginOffset ||
2804  EndOffset < NewAllocaEndOffset)
2805  return false;
2806  auto *C = cast<ConstantInt>(II.getLength());
2807  if (C->getBitWidth() > 64)
2808  return false;
2809  const auto Len = C->getZExtValue();
2810  auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
2811  auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
2812  return canConvertValue(DL, SrcTy, AllocaTy) &&
2813  DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
2814  }();
2815 
2816  // If this doesn't map cleanly onto the alloca type, and that type isn't
2817  // a single value type, just emit a memset.
2818  if (!CanContinue) {
2819  Type *SizeTy = II.getLength()->getType();
2820  Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
2821  CallInst *New = IRB.CreateMemSet(
2822  getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
2823  MaybeAlign(getSliceAlign()), II.isVolatile());
2824  if (AATags)
2825  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2826  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
2827  return false;
2828  }
2829 
2830  // If we can represent this as a simple value, we have to build the actual
2831  // value to store, which requires expanding the byte present in memset to
2832  // a sensible representation for the alloca type. This is essentially
2833  // splatting the byte to a sufficiently wide integer, splatting it across
2834  // any desired vector width, and bitcasting to the final type.
2835  Value *V;
2836 
2837  if (VecTy) {
2838  // If this is a memset of a vectorized alloca, insert it.
2839  assert(ElementTy == ScalarTy);
2840 
2841  unsigned BeginIndex = getIndex(NewBeginOffset);
2842  unsigned EndIndex = getIndex(NewEndOffset);
2843  assert(EndIndex > BeginIndex && "Empty vector!");
2844  unsigned NumElements = EndIndex - BeginIndex;
2845  assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
2846  "Too many elements!");
2847 
2848  Value *Splat = getIntegerSplat(
2849  II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
2850  Splat = convertValue(DL, IRB, Splat, ElementTy);
2851  if (NumElements > 1)
2852  Splat = getVectorSplat(Splat, NumElements);
2853 
2854  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2855  NewAI.getAlign(), "oldload");
2856  V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
2857  } else if (IntTy) {
2858  // If this is a memset on an alloca where we can widen stores, insert the
2859  // set integer.
2860  assert(!II.isVolatile());
2861 
2862  uint64_t Size = NewEndOffset - NewBeginOffset;
2863  V = getIntegerSplat(II.getValue(), Size);
2864 
2865  if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
2866  EndOffset != NewAllocaBeginOffset)) {
2867  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2868  NewAI.getAlign(), "oldload");
2869  Old = convertValue(DL, IRB, Old, IntTy);
2870  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2871  V = insertInteger(DL, IRB, Old, V, Offset, "insert");
2872  } else {
2873  assert(V->getType() == IntTy &&
2874  "Wrong type for an alloca wide integer!");
2875  }
2876  V = convertValue(DL, IRB, V, AllocaTy);
2877  } else {
2878  // Established these invariants above.
2879  assert(NewBeginOffset == NewAllocaBeginOffset);
2880  assert(NewEndOffset == NewAllocaEndOffset);
2881 
2882  V = getIntegerSplat(II.getValue(),
2883  DL.getTypeSizeInBits(ScalarTy).getFixedSize() / 8);
2884  if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
2885  V = getVectorSplat(
2886  V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
2887 
2888  V = convertValue(DL, IRB, V, AllocaTy);
2889  }
2890 
2891  StoreInst *New =
2892  IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
2893  if (AATags)
2894  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2895  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
2896  return !II.isVolatile();
2897  }
2898 
2899  bool visitMemTransferInst(MemTransferInst &II) {
2900  // Rewriting of memory transfer instructions can be a bit tricky. We break
2901  // them into two categories: split intrinsics and unsplit intrinsics.
2902 
2903  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
2904 
2905  AAMDNodes AATags;
2906  II.getAAMetadata(AATags);
2907 
2908  bool IsDest = &II.getRawDestUse() == OldUse;
2909  assert((IsDest && II.getRawDest() == OldPtr) ||
2910  (!IsDest && II.getRawSource() == OldPtr));
2911 
2912  MaybeAlign SliceAlign = getSliceAlign();
2913 
2914  // For unsplit intrinsics, we simply modify the source and destination
2915  // pointers in place. This isn't just an optimization, it is a matter of
2916  // correctness. With unsplit intrinsics we may be dealing with transfers
2917  // within a single alloca before SROA ran, or with transfers that have
2918  // a variable length. We may also be dealing with memmove instead of
 2919  // memcpy, and so simply updating the pointers is necessary for us to
2920  // update both source and dest of a single call.
2921  if (!IsSplittable) {
2922  Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
2923  if (IsDest) {
2924  II.setDest(AdjustedPtr);
2925  II.setDestAlignment(SliceAlign);
2926  }
2927  else {
2928  II.setSource(AdjustedPtr);
2929  II.setSourceAlignment(SliceAlign);
2930  }
2931 
2932  LLVM_DEBUG(dbgs() << " to: " << II << "\n");
2933  deleteIfTriviallyDead(OldPtr);
2934  return false;
2935  }
2936  // For split transfer intrinsics we have an incredibly useful assurance:
2937  // the source and destination do not reside within the same alloca, and at
2938  // least one of them does not escape. This means that we can replace
2939  // memmove with memcpy, and we don't need to worry about all manner of
2940  // downsides to splitting and transforming the operations.
2941 
2942  // If this doesn't map cleanly onto the alloca type, and that type isn't
2943  // a single value type, just emit a memcpy.
2944  bool EmitMemCpy =
2945  !VecTy && !IntTy &&
2946  (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
2947  SliceSize !=
2948  DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedSize() ||
2949  !NewAI.getAllocatedType()->isSingleValueType());
2950 
 2951  // If we're just going to emit a memcpy, the alloca hasn't changed, and the
 2952  // size hasn't been shrunk based on analysis of the viable range, then this
 2953  // is a no-op.
2954  if (EmitMemCpy && &OldAI == &NewAI) {
2955  // Ensure the start lines up.
2956  assert(NewBeginOffset == BeginOffset);
2957 
2958  // Rewrite the size as needed.
 2959  if (NewEndOffset != EndOffset)
 2960  II.setLength(ConstantInt::get(II.getLength()->getType(),
 2961  NewEndOffset - NewBeginOffset));
2962  return false;
2963  }
2964  // Record this instruction for deletion.
2965  Pass.DeadInsts.push_back(&II);
2966 
2967  // Strip all inbounds GEPs and pointer casts to try to dig out any root
2968  // alloca that should be re-examined after rewriting this instruction.
2969  Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
2970  if (AllocaInst *AI =
2971  dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
2972  assert(AI != &OldAI && AI != &NewAI &&
2973  "Splittable transfers cannot reach the same alloca on both ends.");
2974  Pass.Worklist.insert(AI);
2975  }
2976 
2977  Type *OtherPtrTy = OtherPtr->getType();
2978  unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
2979 
2980  // Compute the relative offset for the other pointer within the transfer.
2981  unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
2982  APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
2983  Align OtherAlign =
2984  (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
2985  OtherAlign =
2986  commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
2987 
2988  if (EmitMemCpy) {
2989  // Compute the other pointer, folding as much as possible to produce
2990  // a single, simple GEP in most cases.
2991  OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
2992  OtherPtr->getName() + ".");
2993 
2994  Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
2995  Type *SizeTy = II.getLength()->getType();
2996  Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
2997 
2998  Value *DestPtr, *SrcPtr;
2999  MaybeAlign DestAlign, SrcAlign;
3000  // Note: IsDest is true iff we're copying into the new alloca slice
3001  if (IsDest) {
3002  DestPtr = OurPtr;
3003  DestAlign = SliceAlign;
3004  SrcPtr = OtherPtr;
3005  SrcAlign = OtherAlign;
3006  } else {
3007  DestPtr = OtherPtr;
3008  DestAlign = OtherAlign;
3009  SrcPtr = OurPtr;
3010  SrcAlign = SliceAlign;
3011  }
3012  CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3013  Size, II.isVolatile());
3014  if (AATags)
3015  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3016  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3017  return false;
3018  }
3019 
3020  bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3021  NewEndOffset == NewAllocaEndOffset;
3022  uint64_t Size = NewEndOffset - NewBeginOffset;
3023  unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3024  unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3025  unsigned NumElements = EndIndex - BeginIndex;
3026  IntegerType *SubIntTy =
3027  IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3028 
3029  // Reset the other pointer type to match the register type we're going to
3030  // use, but using the address space of the original other pointer.
3031  Type *OtherTy;
3032  if (VecTy && !IsWholeAlloca) {
3033  if (NumElements == 1)
3034  OtherTy = VecTy->getElementType();
3035  else
3036  OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3037  } else if (IntTy && !IsWholeAlloca) {
3038  OtherTy = SubIntTy;
3039  } else {
3040  OtherTy = NewAllocaTy;
3041  }
3042  OtherPtrTy = OtherTy->getPointerTo(OtherAS);
3043 
3044  Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3045  OtherPtr->getName() + ".");
3046  MaybeAlign SrcAlign = OtherAlign;
3047  Value *DstPtr = &NewAI;
3048  MaybeAlign DstAlign = SliceAlign;
3049  if (!IsDest) {
3050  std::swap(SrcPtr, DstPtr);
3051  std::swap(SrcAlign, DstAlign);
3052  }
3053 
3054  Value *Src;
3055  if (VecTy && !IsWholeAlloca && !IsDest) {
3056  Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3057  NewAI.getAlign(), "load");
3058  Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3059  } else if (IntTy && !IsWholeAlloca && !IsDest) {
3060  Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3061  NewAI.getAlign(), "load");
3062  Src = convertValue(DL, IRB, Src, IntTy);
3063  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3064  Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3065  } else {
3066  LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3067  II.isVolatile(), "copyload");
3068  if (AATags)
3069  Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3070  Src = Load;
3071  }
3072 
3073  if (VecTy && !IsWholeAlloca && IsDest) {
3074  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3075  NewAI.getAlign(), "oldload");
3076  Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3077  } else if (IntTy && !IsWholeAlloca && IsDest) {
3078  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3079  NewAI.getAlign(), "oldload");
3080  Old = convertValue(DL, IRB, Old, IntTy);
3081  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3082  Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3083  Src = convertValue(DL, IRB, Src, NewAllocaTy);
3084  }
3085 
3086  StoreInst *Store = cast<StoreInst>(
3087  IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3088  if (AATags)
3089  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3090  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3091  return !II.isVolatile();
3092  }
3093 
3094  bool visitIntrinsicInst(IntrinsicInst &II) {
3095  assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3096  "Unexpected intrinsic!");
3097  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3098 
3099  // Record this instruction for deletion.
3100  Pass.DeadInsts.push_back(&II);
3101 
3102  if (II.isDroppable()) {
3103  assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3104  // TODO For now we forget assumed information, this can be improved.
3105  OldPtr->dropDroppableUsesIn(II);
3106  return true;
3107  }
3108 
3109  assert(II.getArgOperand(1) == OldPtr);
3110  // Lifetime intrinsics are only promotable if they cover the whole alloca.
3111  // Therefore, we drop lifetime intrinsics which don't cover the whole
3112  // alloca.
3113  // (In theory, intrinsics which partially cover an alloca could be
3114  // promoted, but PromoteMemToReg doesn't handle that case.)
3115  // FIXME: Check whether the alloca is promotable before dropping the
3116  // lifetime intrinsics?
3117  if (NewBeginOffset != NewAllocaBeginOffset ||
3118  NewEndOffset != NewAllocaEndOffset)
3119  return true;
3120 
3121  ConstantInt *Size =
3122  ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
3123  NewEndOffset - NewBeginOffset);
3124  // Lifetime intrinsics always expect an i8* so directly get such a pointer
3125  // for the new alloca slice.
 3126  Type *PointerTy = IRB.getInt8PtrTy(OldPtr->getType()->getPointerAddressSpace());
 3127  Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3128  Value *New;
3129  if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3130  New = IRB.CreateLifetimeStart(Ptr, Size);
3131  else
3132  New = IRB.CreateLifetimeEnd(Ptr, Size);
3133 
3134  (void)New;
3135  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3136 
3137  return true;
3138  }
3139 
3140  void fixLoadStoreAlign(Instruction &Root) {
3141  // This algorithm implements the same visitor loop as
3142  // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3143  // or store found.
 3144  SmallPtrSet<Instruction *, 4> Visited;
 3145  SmallVector<Instruction *, 4> Uses;
 3146  Visited.insert(&Root);
3147  Uses.push_back(&Root);
3148  do {
3149  Instruction *I = Uses.pop_back_val();
3150 
3151  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3152  LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3153  continue;
3154  }
3155  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3156  SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3157  continue;
3158  }
3159 
3160  assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3161  isa<PHINode>(I) || isa<SelectInst>(I) ||
3162  isa<GetElementPtrInst>(I));
3163  for (User *U : I->users())
3164  if (Visited.insert(cast<Instruction>(U)).second)
3165  Uses.push_back(cast<Instruction>(U));
3166  } while (!Uses.empty());
3167  }
3168 
3169  bool visitPHINode(PHINode &PN) {
3170  LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3171  assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3172  assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3173 
3174  // We would like to compute a new pointer in only one place, but have it be
3175  // as local as possible to the PHI. To do that, we re-use the location of
3176  // the old pointer, which necessarily must be in the right position to
3177  // dominate the PHI.
 3178  IRBuilderBase::InsertPointGuard Guard(IRB);
 3179  if (isa<PHINode>(OldPtr))
3180  IRB.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
3181  else
3182  IRB.SetInsertPoint(OldPtr);
3183  IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3184 
3185  Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3186  // Replace the operands which were using the old pointer.
3187  std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3188 
3189  LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3190  deleteIfTriviallyDead(OldPtr);
3191 
3192  // Fix the alignment of any loads or stores using this PHI node.
3193  fixLoadStoreAlign(PN);
3194 
3195  // PHIs can't be promoted on their own, but often can be speculated. We
3196  // check the speculation outside of the rewriter so that we see the
3197  // fully-rewritten alloca.
3198  PHIUsers.insert(&PN);
3199  return true;
3200  }
3201 
3202  bool visitSelectInst(SelectInst &SI) {
3203  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3204  assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3205  "Pointer isn't an operand!");
3206  assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3207  assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3208 
3209  Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3210  // Replace the operands which were using the old pointer.
3211  if (SI.getOperand(1) == OldPtr)
3212  SI.setOperand(1, NewPtr);
3213  if (SI.getOperand(2) == OldPtr)
3214  SI.setOperand(2, NewPtr);
3215 
3216  LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3217  deleteIfTriviallyDead(OldPtr);
3218 
3219  // Fix the alignment of any loads or stores using this select.
3220  fixLoadStoreAlign(SI);
3221 
3222  // Selects can't be promoted on their own, but often can be speculated. We
3223  // check the speculation outside of the rewriter so that we see the
3224  // fully-rewritten alloca.
3225  SelectUsers.insert(&SI);
3226  return true;
3227  }
3228 };
3229 
3230 namespace {
3231 
3232 /// Visitor to rewrite aggregate loads and stores as scalar.
3233 ///
3234 /// This pass aggressively rewrites all aggregate loads and stores on
3235 /// a particular pointer (or any pointer derived from it which we can identify)
3236 /// with scalar loads and stores.
3237 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3238  // Befriend the base class so it can delegate to private visit methods.
3239  friend class InstVisitor<AggLoadStoreRewriter, bool>;
3240 
3241  /// Queue of pointer uses to analyze and potentially rewrite.
 3242  SmallVector<Use *, 8> Queue;
 3243 
3244  /// Set to prevent us from cycling with phi nodes and loops.
3245  SmallPtrSet<User *, 8> Visited;
3246 
3247  /// The current pointer use being rewritten. This is used to dig up the used
3248  /// value (as opposed to the user).
3249  Use *U = nullptr;
3250 
3251  /// Used to calculate offsets, and hence alignment, of subobjects.
3252  const DataLayout &DL;
3253 
3254 public:
3255  AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}
3256 
3257  /// Rewrite loads and stores through a pointer and all pointers derived from
3258  /// it.
3259  bool rewrite(Instruction &I) {
3260  LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3261  enqueueUsers(I);
3262  bool Changed = false;
3263  while (!Queue.empty()) {
3264  U = Queue.pop_back_val();
3265  Changed |= visit(cast<Instruction>(U->getUser()));
3266  }
3267  return Changed;
3268  }
3269 
3270 private:
3271  /// Enqueue all the users of the given instruction for further processing.
3272  /// This uses a set to de-duplicate users.
3273  void enqueueUsers(Instruction &I) {
3274  for (Use &U : I.uses())
3275  if (Visited.insert(U.getUser()).second)
3276  Queue.push_back(&U);
3277  }
3278 
3279  // Conservative default is to not rewrite anything.
3280  bool visitInstruction(Instruction &I) { return false; }
3281 
3282  /// Generic recursive split emission class.
3283  template <typename Derived> class OpSplitter {
3284  protected:
3285  /// The builder used to form new instructions.
3286  IRBuilderTy IRB;
3287 
3288  /// The indices which to be used with insert- or extractvalue to select the
3289  /// appropriate value within the aggregate.
3290  SmallVector<unsigned, 4> Indices;
3291 
3292  /// The indices to a GEP instruction which will move Ptr to the correct slot
3293  /// within the aggregate.
3294  SmallVector<Value *, 4> GEPIndices;
3295 
3296  /// The base pointer of the original op, used as a base for GEPing the
3297  /// split operations.
3298  Value *Ptr;
3299 
3300  /// The base pointee type being GEPed into.
3301  Type *BaseTy;
3302 
3303  /// Known alignment of the base pointer.
3304  Align BaseAlign;
3305 
3306  /// To calculate offset of each component so we can correctly deduce
3307  /// alignments.
3308  const DataLayout &DL;
3309 
3310  /// Initialize the splitter with an insertion point, Ptr and start with a
3311  /// single zero GEP index.
3312  OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3313  Align BaseAlign, const DataLayout &DL)
3314  : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr),
3315  BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {}
3316 
3317  public:
3318  /// Generic recursive split emission routine.
3319  ///
3320  /// This method recursively splits an aggregate op (load or store) into
3321  /// scalar or vector ops. It splits recursively until it hits a single value
3322  /// and emits that single value operation via the template argument.
3323  ///
3324  /// The logic of this routine relies on GEPs and insertvalue and
3325  /// extractvalue all operating with the same fundamental index list, merely
3326  /// formatted differently (GEPs need actual values).
3327  ///
3328  /// \param Ty The type being split recursively into smaller ops.
3329  /// \param Agg The aggregate value being built up or stored, depending on
3330  /// whether this is splitting a load or a store respectively.
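 /// For example (illustrative only): splitting a load of {i32, [2 x float]}
 /// emits leaf operations for the index lists {0}, {1, 0} and {1, 1}, using
 /// the same indices both for the inbounds GEPs that address each slot and
 /// for the insertvalue/extractvalue instructions that rebuild or decompose
 /// the aggregate value.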
3331  void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
3332  if (Ty->isSingleValueType()) {
3333  unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
3334  return static_cast<Derived *>(this)->emitFunc(
3335  Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
3336  }
3337 
3338  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
3339  unsigned OldSize = Indices.size();
3340  (void)OldSize;
3341  for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
3342  ++Idx) {
3343  assert(Indices.size() == OldSize && "Did not return to the old size");
3344  Indices.push_back(Idx);
3345  GEPIndices.push_back(IRB.getInt32(Idx));
3346  emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
3347  GEPIndices.pop_back();
3348  Indices.pop_back();
3349  }
3350  return;
3351  }
3352 
3353  if (StructType *STy = dyn_cast<StructType>(Ty)) {
3354  unsigned OldSize = Indices.size();
3355  (void)OldSize;
3356  for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
3357  ++Idx) {
3358  assert(Indices.size() == OldSize && "Did not return to the old size");
3359  Indices.push_back(Idx);
3360  GEPIndices.push_back(IRB.getInt32(Idx));
3361  emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
3362  GEPIndices.pop_back();
3363  Indices.pop_back();
3364  }
3365  return;
3366  }
3367 
3368  llvm_unreachable("Only arrays and structs are aggregate loadable types");
3369  }
3370  };
3371 
3372  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
3373  AAMDNodes AATags;
3374 
3375  LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3376  AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
3377  : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3378  DL),
3379  AATags(AATags) {}
3380 
3381  /// Emit a leaf load of a single value. This is called at the leaves of the
3382  /// recursive emission to actually load values.
3383  void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3384  assert(Ty->isSingleValueType());
3385  // Load the single value and insert it using the indices.
3386  Value *GEP =
3387  IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3388  LoadInst *Load =
3389  IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
3390 
3391  APInt Offset(
3392  DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3393  if (AATags &&
3394  GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
3395  Load->setAAMetadata(AATags.shift(Offset.getZExtValue()));
3396 
3397  Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
3398  LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
3399  }
3400  };
3401 
3402  bool visitLoadInst(LoadInst &LI) {
3403  assert(LI.getPointerOperand() == *U);
3404  if (!LI.isSimple() || LI.getType()->isSingleValueType())
3405  return false;
3406 
3407  // We have an aggregate being loaded, split it apart.
3408  LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3409  AAMDNodes AATags;
3410  LI.getAAMetadata(AATags);
3411  LoadOpSplitter Splitter(&LI, *U, LI.getType(), AATags,
3412  getAdjustedAlignment(&LI, 0), DL);
3413  Value *V = UndefValue::get(LI.getType());
3414  Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
3415  Visited.erase(&LI);
3416  LI.replaceAllUsesWith(V);
3417  LI.eraseFromParent();
3418  return true;
3419  }
3420 
3421  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
3422  StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3423  AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
3424  : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3425  DL),
3426  AATags(AATags) {}
3427  AAMDNodes AATags;
3428  /// Emit a leaf store of a single value. This is called at the leaves of the
3429  /// recursive emission to actually produce stores.
3430  void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3431  assert(Ty->isSingleValueType());
3432  // Extract the single value and store it using the indices.
3433  //
3434  // The gep and extractvalue values are factored out of the CreateStore
3435  // call to make the output independent of the argument evaluation order.
3436  Value *ExtractValue =
3437  IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
3438  Value *InBoundsGEP =
3439  IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3440  StoreInst *Store =
3441  IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
3442 
3443  APInt Offset(
3444  DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3445  if (AATags &&
3446  GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
3447  Store->setAAMetadata(AATags.shift(Offset.getZExtValue()));
3448 
3449  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3450  }
3451  };
3452 
3453  bool visitStoreInst(StoreInst &SI) {
3454  if (!SI.isSimple() || SI.getPointerOperand() != *U)
3455  return false;
3456  Value *V = SI.getValueOperand();
3457  if (V->getType()->isSingleValueType())
3458  return false;
3459 
3460  // We have an aggregate being stored, split it apart.
3461  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3462  AAMDNodes AATags;
3463  SI.getAAMetadata(AATags);
3464  StoreOpSplitter Splitter(&SI, *U, V->getType(), AATags,
3465  getAdjustedAlignment(&SI, 0), DL);
3466  Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
3467  Visited.erase(&SI);
3468  SI.eraseFromParent();
3469  return true;
3470  }
3471 
3472  bool visitBitCastInst(BitCastInst &BC) {
3473  enqueueUsers(BC);
3474  return false;
3475  }
3476 
3477  bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
3478  enqueueUsers(ASC);
3479  return false;
3480  }
3481 
3482  // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
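 // An illustrative IR sketch of this rewrite (the value names here are
 // hypothetical):
 //   %sel = select i1 %c, [4 x i32]* %a, [4 x i32]* %b
 //   %gep = getelementptr [4 x i32], [4 x i32]* %sel, i64 0, i64 1
 // becomes
 //   %a.sroa.gep = getelementptr [4 x i32], [4 x i32]* %a, i64 0, i64 1
 //   %b.sroa.gep = getelementptr [4 x i32], [4 x i32]* %b, i64 0, i64 1
 //   %sel.sroa.sel = select i1 %c, i32* %a.sroa.gep, i32* %b.sroa.gep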
3483  bool foldGEPSelect(GetElementPtrInst &GEPI) {
3484  if (!GEPI.hasAllConstantIndices())
3485  return false;
3486 
3487  SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());
3488 
3489  LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):"
3490  << "\n original: " << *Sel
3491  << "\n " << GEPI);
3492 
3493  IRBuilderTy Builder(&GEPI);
 3494  SmallVector<Value *, 4> Index(GEPI.indices());
 3495  bool IsInBounds = GEPI.isInBounds();
3496 
3497  Type *Ty = GEPI.getSourceElementType();
3498  Value *True = Sel->getTrueValue();
3499  Value *NTrue =
3500  IsInBounds
3501  ? Builder.CreateInBoundsGEP(Ty, True, Index,
3502  True->getName() + ".sroa.gep")
3503  : Builder.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep");
3504 
3505  Value *False = Sel->getFalseValue();
3506 
3507  Value *NFalse =
3508  IsInBounds
3509  ? Builder.CreateInBoundsGEP(Ty, False, Index,
3510  False->getName() + ".sroa.gep")
3511  : Builder.CreateGEP(Ty, False, Index,
3512  False->getName() + ".sroa.gep");
3513 
3514  Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse,
3515  Sel->getName() + ".sroa.sel");
3516  Visited.erase(&GEPI);
3517  GEPI.replaceAllUsesWith(NSel);
3518  GEPI.eraseFromParent();
3519  Instruction *NSelI = cast<Instruction>(NSel);
3520  Visited.insert(NSelI);
3521  enqueueUsers(*NSelI);
3522 
3523  LLVM_DEBUG(dbgs() << "\n to: " << *NTrue
3524  << "\n " << *NFalse
3525  << "\n " << *NSel << '\n');
3526 
3527  return true;
3528  }
3529 
3530  // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
3531  bool foldGEPPhi(GetElementPtrInst &GEPI) {
3532  if (!GEPI.hasAllConstantIndices())
3533  return false;
3534 
3535  PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
3536  if (GEPI.getParent() != PHI->getParent() ||
3537  llvm::any_of(PHI->incoming_values(), [](Value *In)
3538  { Instruction *I = dyn_cast<Instruction>(In);
3539  return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
3540  succ_empty(I->getParent()) ||
3541  !I->getParent()->isLegalToHoistInto();
3542  }))
3543  return false;
3544 
3545  LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):"
3546  << "\n original: " << *PHI
3547  << "\n " << GEPI
3548  << "\n to: ");
3549 
 3550  SmallVector<Value *, 4> Index(GEPI.indices());
 3551  bool IsInBounds = GEPI.isInBounds();
3552  IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
3553  PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
3554  PHI->getNumIncomingValues(),
3555  PHI->getName() + ".sroa.phi");
3556  for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
3557  BasicBlock *B = PHI->getIncomingBlock(I);
3558  Value *NewVal = nullptr;
3559  int Idx = NewPN->getBasicBlockIndex(B);
3560  if (Idx >= 0) {
3561  NewVal = NewPN->getIncomingValue(Idx);
3562  } else {
3563  Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
3564 
3565  IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
3566  Type *Ty = GEPI.getSourceElementType();
3567  NewVal = IsInBounds
3568  ? B.CreateInBoundsGEP(Ty, In, Index, In->getName() + ".sroa.gep")
3569  : B.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep");
3570  }
3571  NewPN->addIncoming(NewVal, B);
3572  }
3573 
3574  Visited.erase(&GEPI);
3575  GEPI.replaceAllUsesWith(NewPN);
3576  GEPI.eraseFromParent();
3577  Visited.insert(NewPN);
3578  enqueueUsers(*NewPN);
3579 
3580  LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
3581  dbgs() << "\n " << *In;
3582  dbgs() << "\n " << *NewPN << '\n');
3583 
3584  return true;
3585  }
3586 
3587  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
3588  if (isa<SelectInst>(GEPI.getPointerOperand()) &&
3589  foldGEPSelect(GEPI))
3590  return true;
3591 
3592  if (isa<PHINode>(GEPI.getPointerOperand()) &&
3593  foldGEPPhi(GEPI))
3594  return true;
3595 
3596  enqueueUsers(GEPI);
3597  return false;
3598  }
3599 
3600  bool visitPHINode(PHINode &PN) {
3601  enqueueUsers(PN);
3602  return false;
3603  }
3604 
3605  bool visitSelectInst(SelectInst &SI) {
3606  enqueueUsers(SI);
3607  return false;
3608  }
3609 };
3610 
3611 } // end anonymous namespace
3612 
3613 /// Strip aggregate type wrapping.
3614 ///
3615 /// This removes no-op aggregate types wrapping an underlying type. It will
3616 /// strip as many layers of types as it can without changing either the type
3617 /// size or the allocated size.
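 /// For example (illustrative only): { { [1 x i32] } } is stripped down to
 /// i32, because each wrapper has the same size as its single element, while
 /// { i32, i32 } is returned unchanged since unwrapping it would shrink the
 /// allocated size.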
 3618 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
 3619  if (Ty->isSingleValueType())
3620  return Ty;
3621 
3622  uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedSize();
3623  uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedSize();
3624 
3625  Type *InnerTy;
3626  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
3627  InnerTy = ArrTy->getElementType();
3628  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
3629  const StructLayout *SL = DL.getStructLayout(STy);
3630  unsigned Index = SL->getElementContainingOffset(0);
3631  InnerTy = STy->getElementType(Index);
3632  } else {
3633  return Ty;
3634  }
3635 
3636  if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedSize() ||
3637  TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedSize())
3638  return Ty;
3639 
3640  return stripAggregateTypeWrapping(DL, InnerTy);
3641 }
3642 
3643 /// Try to find a partition of the aggregate type passed in for a given
3644 /// offset and size.
3645 ///
3646 /// This recurses through the aggregate type and tries to compute a subtype
3647 /// based on the offset and size. When the offset and size span a sub-section
3648 /// of an array, it will even compute a new array type for that sub-section,
3649 /// and the same for structs.
3650 ///
3651 /// Note that this routine is very strict and tries to find a partition of the
3652 /// type which produces the *exact* right offset and size. It is not forgiving
 3653 /// when the size or offset causes either end of the type-based partition to be off.
3654 /// Also, this is a best-effort routine. It is reasonable to give up and not
3655 /// return a type if necessary.
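 /// For example (illustrative only): within { i64, [4 x i8] }, an offset of 8
 /// with a size of 4 yields [4 x i8], whereas an offset of 2 with a size of 4
 /// yields no type because the range falls strictly inside the i64 element.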
3656 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
3657  uint64_t Size) {
3658  if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedSize() == Size)
3659  return stripAggregateTypeWrapping(DL, Ty);
3660  if (Offset > DL.getTypeAllocSize(Ty).getFixedSize() ||
3661  (DL.getTypeAllocSize(Ty).getFixedSize() - Offset) < Size)
3662  return nullptr;
3663 
3664  if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
3665  Type *ElementTy;
3666  uint64_t TyNumElements;
3667  if (auto *AT = dyn_cast<ArrayType>(Ty)) {
3668  ElementTy = AT->getElementType();
3669  TyNumElements = AT->getNumElements();
3670  } else {
3671  // FIXME: This isn't right for vectors with non-byte-sized or
3672  // non-power-of-two sized elements.
3673  auto *VT = cast<FixedVectorType>(Ty);
3674  ElementTy = VT->getElementType();
3675  TyNumElements = VT->getNumElements();
3676  }
3677  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
3678  uint64_t NumSkippedElements = Offset / ElementSize;
3679  if (NumSkippedElements >= TyNumElements)
3680  return nullptr;
3681  Offset -= NumSkippedElements * ElementSize;
3682 
3683  // First check if we need to recurse.
3684  if (Offset > 0 || Size < ElementSize) {
3685  // Bail if the partition ends in a different array element.
3686  if ((Offset + Size) > ElementSize)
3687  return nullptr;
3688  // Recurse through the element type trying to peel off offset bytes.
3689  return getTypePartition(DL, ElementTy, Offset, Size);
3690  }
3691  assert(Offset == 0);
3692 
3693  if (Size == ElementSize)
3694  return stripAggregateTypeWrapping(DL, ElementTy);
3695  assert(Size > ElementSize);
3696  uint64_t NumElements = Size / ElementSize;
3697  if (NumElements * ElementSize != Size)
3698  return nullptr;
3699  return ArrayType::get(ElementTy, NumElements);
3700  }
3701 
3702  StructType *STy = dyn_cast<StructType>(Ty);
3703  if (!STy)
3704  return nullptr;
3705 
3706  const StructLayout *SL = DL.getStructLayout(STy);
3707  if (Offset >= SL->getSizeInBytes())
3708  return nullptr;
3709  uint64_t EndOffset = Offset + Size;
3710  if (EndOffset > SL->getSizeInBytes())
3711  return nullptr;
3712 
3713  unsigned Index = SL->getElementContainingOffset(Offset);
3714  Offset -= SL->getElementOffset(Index);
3715 
3716  Type *ElementTy = STy->getElementType(Index);
3717  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
3718  if (Offset >= ElementSize)
3719  return nullptr; // The offset points into alignment padding.
3720 
3721  // See if any partition must be contained by the element.
3722  if (Offset > 0 || Size < ElementSize) {
3723  if ((Offset + Size) > ElementSize)
3724  return nullptr;
3725  return getTypePartition(DL, ElementTy, Offset, Size);
3726  }
3727  assert(Offset == 0);
3728 
3729  if (Size == ElementSize)
3730  return stripAggregateTypeWrapping(DL, ElementTy);
3731 
 3732  StructType::element_iterator EI = STy->element_begin() + Index,
 3733  EE = STy->element_end();
3734  if (EndOffset < SL->getSizeInBytes()) {
3735  unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
3736  if (Index == EndIndex)
3737  return nullptr; // Within a single element and its padding.
3738 
3739  // Don't try to form "natural" types if the elements don't line up with the
3740  // expected size.
3741  // FIXME: We could potentially recurse down through the last element in the
3742  // sub-struct to find a natural end point.
3743  if (SL->getElementOffset(EndIndex) != EndOffset)
3744  return nullptr;
3745 
3746  assert(Index < EndIndex);
3747  EE = STy->element_begin() + EndIndex;
3748  }
3749 
3750  // Try to build up a sub-structure.
3751  StructType *SubTy =
3752  StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
3753  const StructLayout *SubSL = DL.getStructLayout(SubTy);
3754  if (Size != SubSL->getSizeInBytes())
3755  return nullptr; // The sub-struct doesn't have quite the size needed.
3756 
3757  return SubTy;
3758 }
3759 
3760 /// Pre-split loads and stores to simplify rewriting.
3761 ///
3762 /// We want to break up the splittable load+store pairs as much as
3763 /// possible. This is important to do as a preprocessing step, as once we
3764 /// start rewriting the accesses to partitions of the alloca we lose the
3765 /// necessary information to correctly split apart paired loads and stores
3766 /// which both point into this alloca. The case to consider is something like
3767 /// the following:
3768 ///
3769 /// %a = alloca [12 x i8]
3770 /// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
3771 /// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
3772 /// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
3773 /// %iptr1 = bitcast i8* %gep1 to i64*
3774 /// %iptr2 = bitcast i8* %gep2 to i64*
3775 /// %fptr1 = bitcast i8* %gep1 to float*
3776 /// %fptr2 = bitcast i8* %gep2 to float*
3777 /// %fptr3 = bitcast i8* %gep3 to float*
3778 /// store float 0.0, float* %fptr1
3779 /// store float 1.0, float* %fptr2
3780 /// %v = load i64* %iptr1
3781 /// store i64 %v, i64* %iptr2
3782 /// %f1 = load float* %fptr2
3783 /// %f2 = load float* %fptr3
3784 ///
3785 /// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
3786 /// promote everything so we recover the 2 SSA values that should have been
3787 /// there all along.
3788 ///
3789 /// \returns true if any changes are made.
3790 bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
3791  LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
3792 
3793  // Track the loads and stores which are candidates for pre-splitting here, in
3794  // the order they first appear during the partition scan. These give stable
3795  // iteration order and a basis for tracking which loads and stores we
3796  // actually split.
 3797  SmallVector<LoadInst *, 4> Loads;
 3798  SmallVector<StoreInst *, 4> Stores;
 3799 
3800  // We need to accumulate the splits required of each load or store where we
3801  // can find them via a direct lookup. This is important to cross-check loads
3802  // and stores against each other. We also track the slice so that we can kill
3803  // all the slices that end up split.
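 // For example (illustrative only): a 16-byte load slice beginning at offset
 // 0 that is covered by partitions [0, 8) and [8, 16) records a single split
 // at offset 8; the end of the final partition is implied by the slice's own
 // size.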
3804  struct SplitOffsets {
3805  Slice *S;
3806  std::vector<uint64_t> Splits;
3807  };
 3808  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
 3809 
3810  // Track loads out of this alloca which cannot, for any reason, be pre-split.
3811  // This is important as we also cannot pre-split stores of those loads!
3812  // FIXME: This is all pretty gross. It means that we can be more aggressive
3813  // in pre-splitting when the load feeding the store happens to come from
3814  // a separate alloca. Put another way, the effectiveness of SROA would be
3815  // decreased by a frontend which just concatenated all of its local allocas
3816  // into one big flat alloca. But defeating such patterns is exactly the job
3817  // SROA is tasked with! Sadly, to not have this discrepancy we would have
 3818  // to change store pre-splitting to actually force pre-splitting of the load
3819  // that feeds it *and all stores*. That makes pre-splitting much harder, but
3820  // maybe it would make it more principled?
3821  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
3822 
3823  LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
3824  for (auto &P : AS.partitions()) {
3825  for (Slice &S : P) {
3826  Instruction *I = cast<Instruction>(S.getUse()->getUser());
3827  if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
3828  // If this is a load we have to track that it can't participate in any
3829  // pre-splitting. If this is a store of a load we have to track that
3830  // that load also can't participate in any pre-splitting.
3831  if (auto *LI = dyn_cast<LoadInst>(I))
3832  UnsplittableLoads.insert(LI);
3833  else if (auto *SI = dyn_cast<StoreInst>(I))
3834  if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
3835  UnsplittableLoads.insert(LI);
3836  continue;
3837  }
3838  assert(P.endOffset() > S.beginOffset() &&
3839  "Empty or backwards partition!");
3840 
3841  // Determine if this is a pre-splittable slice.
3842  if (auto *LI = dyn_cast<LoadInst>(I)) {
3843  assert(!LI->isVolatile() && "Cannot split volatile loads!");
3844 
3845  // The load must be used exclusively to store into other pointers for
3846  // us to be able to arbitrarily pre-split it. The stores must also be
3847  // simple to avoid changing semantics.
3848  auto IsLoadSimplyStored = [](LoadInst *LI) {
3849  for (User *LU : LI->users()) {
3850  auto *SI = dyn_cast<StoreInst>(LU);
3851  if (!SI || !SI->isSimple())
3852  return false;
3853  }
3854  return true;
3855  };
3856  if (!IsLoadSimplyStored(LI)) {
3857  UnsplittableLoads.insert(LI);
3858  continue;
3859  }
3860 
3861  Loads.push_back(LI);
3862  } else if (auto *SI = dyn_cast<StoreInst>(I)) {
3863  if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
3864  // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
3865  continue;
3866  auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
3867  if (!StoredLoad || !StoredLoad->isSimple())
3868  continue;
3869  assert(!SI->isVolatile() && "Cannot split volatile stores!");
3870 
3871  Stores.push_back(SI);
3872  } else {
3873  // Other uses cannot be pre-split.
3874  continue;
3875  }
3876 
3877  // Record the initial split.
3878  LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
3879  auto &Offsets = SplitOffsetsMap[I];
3880  assert(Offsets.Splits.empty() &&
3881  "Should not have splits the first time we see an instruction!");
3882  Offsets.S = &S;
3883  Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
3884  }
3885 
3886  // Now scan the already split slices, and add a split for any of them which
3887  // we're going to pre-split.
3888  for (Slice *S : P.splitSliceTails()) {
3889  auto SplitOffsetsMapI =
3890  SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
3891  if (SplitOffsetsMapI == SplitOffsetsMap.end())
3892  continue;
3893  auto &Offsets = SplitOffsetsMapI->second;
3894 
3895  assert(Offsets.S == S && "Found a mismatched slice!");
3896  assert(!Offsets.Splits.empty() &&
3897  "Cannot have an empty set of splits on the second partition!");
3898  assert(Offsets.Splits.back() ==
3899  P.beginOffset() - Offsets.S->beginOffset() &&
3900  "Previous split does not end where this one begins!");
3901 
3902  // Record each split. The last partition's end isn't needed as the size
3903  // of the slice dictates that.
3904  if (S->endOffset() > P.endOffset())
3905  Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
3906  }
3907  }
3908 
3909  // We may have split loads where some of their stores are split stores. For
3910  // such loads and stores, we can only pre-split them if their splits exactly
3911  // match relative to their starting offset. We have to verify this prior to
3912  // any rewriting.
3913  llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
3914  // Lookup the load we are storing in our map of split
3915  // offsets.
3916  auto *LI = cast<LoadInst>(SI->getValueOperand());
3917  // If it was completely unsplittable, then we're done,
3918  // and this store can't be pre-split.
3919  if (UnsplittableLoads.count(LI))
3920  return true;
3921 
3922  auto LoadOffsetsI = SplitOffsetsMap.find(LI);
3923  if (LoadOffsetsI == SplitOffsetsMap.end())
3924  return false; // Unrelated loads are definitely safe.
3925  auto &LoadOffsets = LoadOffsetsI->second;
3926 
3927  // Now lookup the store's offsets.
3928  auto &StoreOffsets = SplitOffsetsMap[SI];
3929 
3930  // If the relative offsets of each split in the load and
3931  // store match exactly, then we can split them and we
3932  // don't need to remove them here.
3933  if (LoadOffsets.Splits == StoreOffsets.Splits)
3934  return false;
3935 
3936  LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
3937  << " " << *LI << "\n"
3938  << " " << *SI << "\n");
3939 
3940  // We've found a store and load that we need to split
3941  // with mismatched relative splits. Just give up on them
3942  // and remove both instructions from our list of
3943  // candidates.
3944  UnsplittableLoads.insert(LI);
3945  return true;
3946  });
3947  // Now we have to go *back* through all the stores, because a later store may
3948  // have caused an earlier store's load to become unsplittable and if it is
3949  // unsplittable for the later store, then we can't rely on it being split in
3950  // the earlier store either.
3951  llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
3952  auto *LI = cast<LoadInst>(SI->getValueOperand());
3953  return UnsplittableLoads.count(LI);
3954  });
3955  // Once we've established all the loads that can't be split for some reason,
3956  // filter any that made it into our list out.
3957  llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
3958  return UnsplittableLoads.count(LI);
3959  });
3960 
3961  // If no loads or stores are left, there is no pre-splitting to be done for
3962  // this alloca.
3963  if (Loads.empty() && Stores.empty())
3964  return false;
3965 
3966  // From here on, we can't fail and will be building new accesses, so rig up
3967  // an IR builder.
3968  IRBuilderTy IRB(&AI);
3969 
3970  // Collect the new slices which we will merge into the alloca slices.
3971  SmallVector<Slice, 4> NewSlices;
3972 
3973  // Track any allocas we end up splitting loads and stores for so we iterate
3974  // on them.
3975  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
3976 
3977  // At this point, we have collected all of the loads and stores we can
3978  // pre-split, and the specific splits needed for them. We actually do the
3979  // splitting in a specific order in order to handle when one of the loads in
3980  // the value operand to one of the stores.
3981  //
3982  // First, we rewrite all of the split loads, and just accumulate each split
3983  // load in a parallel structure. We also build the slices for them and append
3984  // them to the alloca slices.
 3985  SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
 3986  std::vector<LoadInst *> SplitLoads;
3987  const DataLayout &DL = AI.getModule()->getDataLayout();
3988  for (LoadInst *LI : Loads) {
3989  SplitLoads.clear();
3990 
3991  IntegerType *Ty = cast<IntegerType>(LI->getType());
3992  uint64_t LoadSize = Ty->getBitWidth() / 8;
3993  assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
3994 
3995  auto &Offsets = SplitOffsetsMap[LI];
3996  assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
3997  "Slice size should always match load size exactly!");
3998  uint64_t BaseOffset = Offsets.S->beginOffset();
3999  assert(BaseOffset + LoadSize > BaseOffset &&
4000  "Cannot represent alloca access size using 64-bit integers!");
4001 
4002  Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
4003  IRB.SetInsertPoint(LI);
4004 
4005  LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4006 
4007  uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4008  int Idx = 0, Size = Offsets.Splits.size();
4009  for (;;) {
4010  auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
4011  auto AS = LI->getPointerAddressSpace();
4012  auto *PartPtrTy = PartTy->getPointerTo(AS);
4013  LoadInst *PLoad = IRB.CreateAlignedLoad(
4014  PartTy,
4015  getAdjustedPtr(IRB, DL, BasePtr,
4016  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4017  PartPtrTy, BasePtr->getName() + "."),
4018  getAdjustedAlignment(LI, PartOffset),
4019  /*IsVolatile*/ false, LI->getName());
4020  PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4021  LLVMContext::MD_access_group});
4022 
4023  // Append this load onto the list of split loads so we can find it later
4024  // to rewrite the stores.
4025  SplitLoads.push_back(PLoad);
4026 
4027  // Now build a new slice for the alloca.
4028  NewSlices.push_back(
4029  Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4030  &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4031  /*IsSplittable*/ false));
4032  LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4033  << ", " << NewSlices.back().endOffset()
4034  << "): " << *PLoad << "\n");
4035 
4036  // See if we've handled all the splits.
4037  if (Idx >= Size)
4038  break;
4039 
4040  // Setup the next partition.
4041  PartOffset = Offsets.Splits[Idx];
4042  ++Idx;
4043  PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
4044  }
4045 
4046  // Now that we have the split loads, do the slow walk over all uses of the
4047  // load and rewrite them as split stores, or save the split loads to use
4048  // below if the store is going to be split there anyways.
4049  bool DeferredStores = false;
4050  for (User *LU : LI->users()) {
4051  StoreInst *SI = cast<StoreInst>(LU);
4052  if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4053  DeferredStores = true;
4054  LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4055  << "\n");
4056  continue;
4057  }
4058 
4059  Value *StoreBasePtr = SI->getPointerOperand();
4060  IRB.SetInsertPoint(SI);
4061 
4062  LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4063 
4064  for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4065  LoadInst *PLoad = SplitLoads[Idx];
4066  uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4067  auto *PartPtrTy =
4068  PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
4069 
4070  auto AS = SI->getPointerAddressSpace();
4071  StoreInst *PStore = IRB.CreateAlignedStore(
4072  PLoad,
4073  getAdjustedPtr(IRB, DL, StoreBasePtr,
4074  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4075  PartPtrTy, StoreBasePtr->getName() + "."),
4076  getAdjustedAlignment(SI, PartOffset),
4077  /*IsVolatile*/ false);
4078  PStore->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4079  LLVMContext::MD_access_group});
4080  LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4081  }
4082 
4083  // We want to immediately iterate on any allocas impacted by splitting
4084  // this store, and we have to track any promotable alloca (indicated by
4085  // a direct store) as needing to be resplit because it is no longer
4086  // promotable.
4087  if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
4088  ResplitPromotableAllocas.insert(OtherAI);
4089  Worklist.insert(OtherAI);
4090  } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4091  StoreBasePtr->stripInBoundsOffsets())) {
4092  Worklist.insert(OtherAI);
4093  }
4094 
4095  // Mark the original store as dead.
4096  DeadInsts.push_back(SI);
4097  }
4098 
4099  // Save the split loads if there are deferred stores among the users.
4100  if (DeferredStores)
4101  SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
4102 
4103  // Mark the original load as dead and kill the original slice.
4104  DeadInsts.push_back(LI);
4105  Offsets.S->kill();
4106  }
4107 
4108  // Second, we rewrite all of the split stores. At this point, we know that
4109  // all loads from this alloca have been split already. For stores of such
4110  // loads, we can simply look up the pre-existing split loads. For stores of
4111  // other loads, we split those loads first and then write split stores of
4112  // them.
4113  for (StoreInst *SI : Stores) {
4114  auto *LI = cast<LoadInst>(SI->getValueOperand());
4115  IntegerType *Ty = cast<IntegerType>(LI->getType());
4116  uint64_t StoreSize = Ty->getBitWidth() / 8;
4117  assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4118 
4119  auto &Offsets = SplitOffsetsMap[SI];
4120  assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4121  "Slice size should always match load size exactly!");
4122  uint64_t BaseOffset = Offsets.S->beginOffset();
4123  assert(BaseOffset + StoreSize > BaseOffset &&
4124  "Cannot represent alloca access size using 64-bit integers!");
4125 
4126  Value *LoadBasePtr = LI->getPointerOperand();
4127  Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
4128 
4129  LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4130 
4131  // Check whether we have an already split load.
4132  auto SplitLoadsMapI = SplitLoadsMap.find(LI);
4133  std::vector<LoadInst *> *SplitLoads = nullptr;
4134  if (SplitLoadsMapI != SplitLoadsMap.end()) {
4135  SplitLoads = &SplitLoadsMapI->second;
4136  assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
4137  "Too few split loads for the number of splits in the store!");
4138  } else {
4139  LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
4140  }
4141 
4142  uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4143  int Idx = 0, Size = Offsets.Splits.size();
4144  for (;;) {
4145  auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
4146  auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
4147  auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
4148 
4149  // Either lookup a split load or create one.
4150  LoadInst *PLoad;
4151  if (SplitLoads) {
4152  PLoad = (*SplitLoads)[Idx];
4153  } else {
4154  IRB.SetInsertPoint(LI);
4155  auto AS = LI->getPointerAddressSpace();
4156  PLoad = IRB.CreateAlignedLoad(
4157  PartTy,
4158  getAdjustedPtr(IRB, DL, LoadBasePtr,
4159  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4160  LoadPartPtrTy, LoadBasePtr->getName() + "."),
4161  getAdjustedAlignment(LI, PartOffset),
4162  /*IsVolatile*/ false, LI->getName());
4163  }
4164 
4165  // And store this partition.
4166  IRB.SetInsertPoint(SI);
4167  auto AS = SI->getPointerAddressSpace();
4168  StoreInst *PStore = IRB.CreateAlignedStore(
4169  PLoad,
4170  getAdjustedPtr(IRB, DL, StoreBasePtr,
4171  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4172  StorePartPtrTy, StoreBasePtr->getName() + "."),
4173  getAdjustedAlignment(SI, PartOffset),
4174  /*IsVolatile*/ false);
4175 
4176  // Now build a new slice for the alloca.
4177  NewSlices.push_back(
4178  Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4179  &PStore->getOperandUse(PStore->getPointerOperandIndex()),
4180  /*IsSplittable*/ false));
4181  LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4182  << ", " << NewSlices.back().endOffset()
4183  << "): " << *PStore << "\n");
4184  if (!SplitLoads) {
4185  LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
4186  }
4187 
4188  // See if we've finished all the splits.
4189  if (Idx >= Size)
4190  break;
4191 
4192  // Setup the next partition.
4193  PartOffset = Offsets.Splits[Idx];
4194  ++Idx;
4195  PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
4196  }
4197 
4198  // We want to immediately iterate on any allocas impacted by splitting
4199  // this load, which is only relevant if it isn't a load of this alloca and
4200  // thus we didn't already split the loads above. We also have to keep track
4201  // of any promotable allocas we split loads on as they can no longer be
4202  // promoted.
4203  if (!SplitLoads) {
4204  if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
4205  assert(OtherAI != &AI && "We can't re-split our own alloca!");
4206  ResplitPromotableAllocas.insert(OtherAI);
4207  Worklist.insert(OtherAI);
4208  } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4209  LoadBasePtr->stripInBoundsOffsets())) {
4210  assert(OtherAI != &AI && "We can't re-split our own alloca!");
4211  Worklist.insert(OtherAI);
4212  }
4213  }
4214 
4215  // Mark the original store as dead now that we've split it up and kill its
4216  // slice. Note that we leave the original load in place unless this store
4217  // was its only use. It may in turn be split up if it is an alloca load
4218  // for some other alloca, but it may be a normal load. This may introduce
4219  // redundant loads, but where those can be merged the rest of the optimizer
4220  // should handle the merging, and this uncovers SSA splits which is more
4221  // important. In practice, the original loads will almost always be fully
4222  // split and removed eventually, and the splits will be merged by any
4223  // trivial CSE, including instcombine.
4224  if (LI->hasOneUse()) {
4225  assert(*LI->user_begin() == SI && "Single use isn't this store!");
4226  DeadInsts.push_back(LI);
4227  }
4228  DeadInsts.push_back(SI);
4229  Offsets.S->kill();
4230  }
4231 
 4232  // Remove the killed slices that have been pre-split.
4233  llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
4234 
4235  // Insert our new slices. This will sort and merge them into the sorted
4236  // sequence.
4237  AS.insert(NewSlices);
4238 
4239  LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
4240 #ifndef NDEBUG
4241  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
4242  LLVM_DEBUG(AS.print(dbgs(), I, " "));
4243 #endif
4244 
 4245  // Finally, don't try to promote any allocas that now require re-splitting.
4246  // They have already been added to the worklist above.
4247  llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) {
4248  return ResplitPromotableAllocas.count(AI);
4249  });
4250 
4251  return true;
4252 }
4253 
4254 /// Rewrite an alloca partition's users.
4255 ///
4256 /// This routine drives both of the rewriting goals of the SROA pass. It tries
4257 /// to rewrite uses of an alloca partition to be conducive for SSA value
4258 /// promotion. If the partition needs a new, more refined alloca, this will
4259 /// build that new alloca, preserving as much type information as possible, and
4260 /// rewrite the uses of the old alloca to point at the new one and have the
4261 /// appropriate new offsets. It also evaluates how successful the rewrite was
4262 /// at enabling promotion and if it was successful queues the alloca to be
4263 /// promoted.
4264 AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
4265  Partition &P) {
4266  // Try to compute a friendly type for this partition of the alloca. This
4267  // won't always succeed, in which case we fall back to a legal integer type
4268  // or an i8 array of an appropriate size.
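 // For example (illustrative only): a 4-byte partition accessed only through
 // float loads and stores gets SliceTy float; an 8-byte partition with mixed
 // access types typically falls back to i64 on a target where i64 is legal;
 // and a 3-byte partition falls back to [3 x i8].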
4269  Type *SliceTy = nullptr;
4270  const DataLayout &DL = AI.getModule()->getDataLayout();
4271  std::pair<Type *, IntegerType *> CommonUseTy =
4272  findCommonType(P.begin(), P.end(), P.endOffset());
4273  // Do all uses operate on the same type?
4274  if (CommonUseTy.first)
4275  if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size())
4276  SliceTy = CommonUseTy.first;
4277  // If not, can we find an appropriate subtype in the original allocated type?
4278  if (!SliceTy)
4279  if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
4280  P.beginOffset(), P.size()))
4281  SliceTy = TypePartitionTy;
4282  // If still not, can we use the largest bitwidth integer type used?
4283  if (!SliceTy && CommonUseTy.second)
4284  if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size())
4285  SliceTy = CommonUseTy.second;
4286  if ((!SliceTy || (SliceTy->isArrayTy() &&
4287  SliceTy->getArrayElementType()->isIntegerTy())) &&
4288  DL.isLegalInteger(P.size() * 8))
4289  SliceTy = Type::getIntNTy(*C, P.size() * 8);
4290  if (!SliceTy)
4291  SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
4292  assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
4293 
4294  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
4295 
4296  VectorType *VecTy =
4297  IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
4298  if (VecTy)
4299  SliceTy = VecTy;
4300 
4301  // Check for the case where we're going to rewrite to a new alloca of the
4302  // exact same type as the original, and with the same access offsets. In that
4303  // case, re-use the existing alloca, but still run through the rewriter to
4304  // perform phi and select speculation.
4305  // P.beginOffset() can be non-zero even with the same type in a case with
4306  // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
4307  AllocaInst *NewAI;
4308  if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
4309  NewAI = &AI;
4310  // FIXME: We should be able to bail at this point with "nothing changed".
4311  // FIXME: We might want to defer PHI speculation until after here.
4312  // FIXME: return nullptr;
4313  } else {
4314  // Make sure the alignment is compatible with P.beginOffset().
4315  const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
4316  // If we will get at least this much alignment from the type alone, leave
4317  // the alloca's alignment unconstrained.
4318  const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
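    // For example (hypothetical values): a 16-byte-aligned alloca with a
    // partition starting at offset 4 can only promise 4-byte alignment for
    // the new alloca; if SliceTy's ABI alignment already guarantees that,
    // the preferred alignment is used instead of an explicit constraint.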
4319  NewAI = new AllocaInst(
4320  SliceTy, AI.getType()->getAddressSpace(), nullptr,
4321  IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
4322  AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
4323  // Copy the old AI debug location over to the new one.
4324  NewAI->setDebugLoc(AI.getDebugLoc());
4325  ++NumNewAllocas;
4326  }
4327 
4328  LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
4329  << "[" << P.beginOffset() << "," << P.endOffset()
4330  << ") to: " << *NewAI << "\n");
4331 
4332  // Track the high watermark on the worklist as it is only relevant for
4333  // promoted allocas. We will reset it to this point if the alloca is not in
4334  // fact scheduled for promotion.
4335  unsigned PPWOldSize = PostPromotionWorklist.size();
4336  unsigned NumUses = 0;
4337  SmallSetVector<PHINode *, 8> PHIUsers;
4338  SmallSetVector<SelectInst *, 8> SelectUsers;
4339 
4340  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
4341  P.endOffset(), IsIntegerPromotable, VecTy,
4342  PHIUsers, SelectUsers);
4343  bool Promotable = true;
4344  for (Slice *S : P.splitSliceTails()) {
4345  Promotable &= Rewriter.visit(S);
4346  ++NumUses;
4347  }
4348  for (Slice &S : P) {
4349  Promotable &= Rewriter.visit(&S);
4350  ++NumUses;
4351  }
4352 
4353  NumAllocaPartitionUses += NumUses;
4354  MaxUsesPerAllocaPartition.updateMax(NumUses);
4355 
4356  // Now that we've processed all the slices in the new partition, check if any
4357  // PHIs or Selects would block promotion.
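  // For illustration (hypothetical IR): a load through a PHI of two
  // otherwise-promotable allocas, e.g.
  //   %p = phi i32* [ %a, %then ], [ %b, %else ]
  //   %v = load i32, i32* %p
  // only stops blocking promotion if the load can be speculated into the
  // predecessors and replaced with a PHI of the loaded values, which is what
  // the checks below establish.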
4358  for (PHINode *PHI : PHIUsers)
4359  if (!isSafePHIToSpeculate(*PHI)) {
4360  Promotable = false;
4361  PHIUsers.clear();
4362  SelectUsers.clear();
4363  break;
4364  }
4365 
4366  for (SelectInst *Sel : SelectUsers)
4367  if (!isSafeSelectToSpeculate(*Sel)) {
4368  Promotable = false;
4369  PHIUsers.clear();
4370  SelectUsers.clear();
4371  break;
4372  }
4373 
4374  if (Promotable) {
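    // These are droppable uses (for example, operands of llvm.assume) that
    // are only meaningful while the alloca stays in memory; dropping them
    // here keeps them from standing in the way of promotion.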
4375  for (Use *U : AS.getDeadUsesIfPromotable()) {
4376  auto *OldInst = dyn_cast<Instruction>(U->get());
4377  Value::dropDroppableUse(*U);
4378  if (OldInst)
4379  if (isInstructionTriviallyDead(OldInst))
4380  DeadInsts.push_back(OldInst);
4381  }
4382  if (PHIUsers.empty() && SelectUsers.empty()) {
4383  // Promote the alloca.
4384  PromotableAllocas.push_back(NewAI);
4385  } else {
4386  // If we have either PHIs or Selects to speculate, add them to those
4387  // worklists and re-queue the new alloca so that we promote it on the
4388  // next iteration.
4389  for (PHINode *PHIUser : PHIUsers)
4390  SpeculatablePHIs.insert(PHIUser);
4391  for (SelectInst *SelectUser : SelectUsers)
4392  SpeculatableSelects.insert(SelectUser);
4393  Worklist.insert(NewAI);
4394  }
4395  } else {
4396  // Drop any post-promotion work items if promotion didn't happen.
4397  while (PostPromotionWorklist.size() > PPWOldSize)
4398  PostPromotionWorklist.pop_back();
4399 
4400  // We couldn't promote and we didn't create a new partition, so nothing
4401  // happened.
4402  if (NewAI == &AI)
4403  return nullptr;
4404 
4405  // If we can't promote the alloca, iterate on it to check for new
4406  // refinements exposed by splitting the current alloca. Don't iterate on an
4407  // alloca which didn't actually change and didn't get promoted.
4408  Worklist.insert(NewAI);
4409  }
4410 
4411  return NewAI;
4412 }
4413 
4414 /// Walks the slices of an alloca and forms partitions based on them,
4415 /// rewriting each of their uses.
4416 bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
4417  if (AS.begin() == AS.end())
4418  return false;
4419 
4420  unsigned NumPartitions = 0;
4421  bool Changed = false;
4422  const DataLayout &DL = AI.getModule()->getDataLayout();
4423 
4424  // First try to pre-split loads and stores.
4425  Changed |= presplitLoadsAndStores(AI, AS);
4426 
4427  // Now that we have identified any pre-splitting opportunities,
4428  // mark loads and stores unsplittable except for the following case.
4429  // We leave a slice splittable if all other slices are disjoint or fully
4430  // included in the slice, such as whole-alloca loads and stores.
4431  // If we fail to split these during pre-splitting, we want to force them
4432  // to be rewritten into a partition.
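  // Illustrative example (hypothetical offsets): with slices
  //   [0,16) load  - whole-alloca, splittable
  //   [0,8)  store
  //   [8,16) store
  // the whole-alloca load stays splittable because no other slice straddles
  // its end points, while a splittable access over [4,12) would be marked
  // unsplittable below, since offsets 4 and 12 fall strictly inside other
  // slices.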
4433  bool IsSorted = true;
4434 
4435  uint64_t AllocaSize =
4436  DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
4437  const uint64_t MaxBitVectorSize = 1024;
4438  if (AllocaSize <= MaxBitVectorSize) {
4439  // If a byte offset falls strictly inside any load or store, a splittable
4440  // slice starting or ending at that offset is marked unsplittable.
4441  SmallBitVector SplittableOffset(AllocaSize + 1, true);
4442  for (Slice &S : AS)
4443  for (unsigned O = S.beginOffset() + 1;
4444  O < S.endOffset() && O < AllocaSize; O++)
4445  SplittableOffset.reset(O);
4446 
4447  for (Slice &S : AS) {
4448  if (!S.isSplittable())
4449  continue;
4450 
4451  if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
4452  (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
4453  continue;
4454 
4455  if (isa<LoadInst>(S.getUse()->getUser()) ||
4456  isa<StoreInst>(S.getUse()->getUser())) {
4457  S.makeUnsplittable();
4458  IsSorted = false;
4459  }
4460  }
4461  }
4462  else {
4463  // We only allow whole-alloca splittable loads and stores
4464  // for a large alloca, to avoid creating an overly large BitVector.
4465  for (Slice &S : AS) {
4466  if (!S.isSplittable())
4467  continue;
4468 
4469  if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
4470  continue;
4471 
4472  if (isa<LoadInst>(S.getUse()->getUser()) ||
4473  isa<StoreInst>(S.getUse()->getUser())) {
4474  S.makeUnsplittable();
4475  IsSorted = false;
4476  }
4477  }
4478  }
4479 
4480  if (!IsSorted)
4481  llvm::sort(AS);
4482 
4483  /// Describes the allocas introduced by rewritePartition in order to migrate
4484  /// the debug info.
4485  struct Fragment {
4486  AllocaInst *Alloca;
4487  uint64_t Offset;
4488  uint64_t Size;
4489  Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
4490  : Alloca(AI), Offset(O), Size(S) {}
4491  };
4492  SmallVector<Fragment, 4> Fragments;
4493 
4494  // Rewrite each partition.
4495  for (auto &P : AS.partitions()) {
4496  if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
4497  Changed = true;
4498  if (NewAI != &AI) {
4499  uint64_t SizeOfByte = 8;
4500  uint64_t AllocaSize =
4501  DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedSize();
4502  // Don't include any padding.
4503  uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
4504  Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
4505  }
4506  }
4507  ++NumPartitions;
4508  }
4509 
4510  NumAllocaPartitions += NumPartitions;
4511  MaxPartitionsPerAlloca.updateMax(NumPartitions);
4512 
4513  // Migrate debug information from the old alloca to the new alloca(s)
4514  // and the individual partitions.
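  // As a sketch of the intended result (hypothetical sizes): for a 128-bit
  // variable that the old alloca covered completely, a new partition spanning
  // bytes [8,16) ends up with a dbg.declare whose expression carries a
  // DW_OP_LLVM_fragment at bit offset 64 with a bit size of 64, so the
  // debugger still finds that part of the variable in the new alloca.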
4515  TinyPtrVector<DbgVariableIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
4516  for (DbgVariableIntrinsic *DbgDeclare : DbgDeclares) {
4517  auto *Expr = DbgDeclare->getExpression();
4518  DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
4519  uint64_t AllocaSize =
4520  DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedSize();
4521  for (auto Fragment : Fragments) {
4522  // Create a fragment expression describing the new partition or reuse AI's
4523  // expression if there is only one partition.
4524  auto *FragmentExpr = Expr;
4525  if (Fragment.Size < AllocaSize || Expr->isFragment()) {
4526  // If this alloca is already a scalar replacement of a larger aggregate,
4527  // Fragment.Offset describes the offset inside the scalar.
4528  auto ExprFragment = Expr->getFragmentInfo();
4529  uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
4530  uint64_t Start = Offset + Fragment.Offset;
4531  uint64_t Size = Fragment.Size;
4532  if (ExprFragment) {
4533  uint64_t AbsEnd =
4534  ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
4535  if (Start >= AbsEnd)
4536  // No need to describe a SROAed padding.
4537  continue;
4538  Size = std::min(Size, AbsEnd - Start);
4539  }
4540  // The new, smaller fragment is stenciled out from the old fragment.
4541  if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
4542  assert(Start >= OrigFragment->OffsetInBits &&
4543  "new fragment is outside of original fragment");
4544  Start -= OrigFragment->OffsetInBits;
4545  }
4546 
4547  // The alloca may be larger than the variable.
4548  auto VarSize = DbgDeclare->getVariable()->getSizeInBits();
4549  if (VarSize) {
4550  if (Size > *VarSize)
4551  Size = *VarSize;
4552  if (Size == 0 || Start + Size > *VarSize)
4553  continue;
4554  }
4555 
4556  // Avoid creating a fragment expression that covers the entire variable.
4557  if (!VarSize || *VarSize != Size) {
4558  if (auto E =
4559  DIExpression::createFragmentExpression(Expr, Start, Size))
4560  FragmentExpr = *E;
4561  else
4562  continue;
4563  }
4564  }
4565 
4566  // Remove any existing intrinsics on the new alloca describing
4567  // the variable fragment.
4568  for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca)) {
4569  auto SameVariableFragment = [](const DbgVariableIntrinsic *LHS,
4570  const DbgVariableIntrinsic *RHS) {
4571  return LHS->getVariable() == RHS->getVariable() &&
4572  LHS->getDebugLoc()->getInlinedAt() ==
4573  RHS->getDebugLoc()->getInlinedAt();
4574  };
4575  if (SameVariableFragment(OldDII, DbgDeclare))
4576  OldDII->eraseFromParent();
4577  }
4578 
4579  DIB.insertDeclare(Fragment.Alloca, DbgDeclare->getVariable(), FragmentExpr,
4580  DbgDeclare->getDebugLoc(), &AI);
4581  }
4582  }
4583  return Changed;
4584 }
4585 
4586 /// Clobber a use with undef, deleting the used value if it becomes dead.
4587 void SROA::clobberUse(Use &U) {
4588  Value *OldV = U;
4589  // Replace the use with an undef value.
4590  U = UndefValue::get(OldV->getType());
4591 
4592  // Check for this making an instruction dead. We have to garbage collect
4593  // all the dead instructions to ensure the uses of any alloca end up being
4594  // minimal.
4595  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
4596  if (isInstructionTriviallyDead(OldI)) {
4597  DeadInsts.push_back(OldI);
4598  }
4599 }
4600 
4601 /// Analyze an alloca for SROA.
4602 ///
4603 /// This analyzes the alloca to ensure we can reason about it, builds
4604 /// the slices of the alloca, and then hands it off to be split and
4605 /// rewritten as needed.
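/// For illustration (hypothetical IR), alloca forms that are skipped below
/// include
/// \code
///   %a = alloca i32, i32 %n         ; runtime array allocation
///   %b = alloca <vscale x 4 x i32>  ; scalable vector type
/// \endcode
/// while an ordinary alloca of a sized struct type proceeds through slicing
/// and splitting.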
4606 bool SROA::runOnAlloca(AllocaInst &AI) {
4607  LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
4608  ++NumAllocasAnalyzed;
4609 
4610  // Special case dead allocas, as they're trivial.
4611  if (AI.use_empty()) {
4612  AI.eraseFromParent();
4613  return true;
4614  }
4615  const DataLayout &DL = AI.getModule()->getDataLayout();
4616 
4617  // Skip alloca forms that this analysis can't handle.
4618  auto *AT = AI.getAllocatedType();
4619  if (AI.isArrayAllocation() || !AT->isSized() || isa<ScalableVectorType>(AT) ||
4620  DL.getTypeAllocSize(AT).getFixedSize() == 0)
4621  return false;
4622 
4623  bool Changed = false;
4624 
4625  // First, split any FCA loads and stores touching this alloca to expose
4626  // better splitting and promotion opportunities.
4627  AggLoadStoreRewriter AggRewriter(DL);
4628  Changed |= AggRewriter.rewrite(AI);
4629 
4630  // Build the slices using a recursive instruction-visiting builder.
4631  AllocaSlices AS(DL, AI);
4632  LLVM_DEBUG(AS.print(dbgs()));
4633  if (AS.isEscaped())
4634  return Changed;
4635 
4636  // Delete all the dead users of this alloca before splitting and rewriting it.
4637  for (Instruction *DeadUser : AS.getDeadUsers()) {
4638  // Free up everything used by this instruction.
4639  for (Use &DeadOp : DeadUser->operands())
4640  clobberUse(DeadOp);
4641 
4642  // Now replace the uses of this instruction.
4643  DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
4644 
4645  // And mark it for deletion.
4646  DeadInsts.push_back(DeadUser);
4647  Changed = true;
4648  }
4649  for (Use *DeadOp : AS.getDeadOperands()) {
4650  clobberUse(*DeadOp);
4651  Changed = true;
4652  }
4653 
4654  // No slices to split. Leave the dead alloca for a later pass to clean up.
4655  if (AS.begin() == AS.end())
4656  return Changed;
4657 
4658  Changed |= splitAlloca(AI, AS);
4659 
4660  LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
4661  while (!SpeculatablePHIs.empty())
4662  speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
4663 
4664  LLVM_DEBUG(dbgs() << " Speculating Selects\n");
4665  while (!SpeculatableSelects.empty())
4666  speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
4667 
4668  return Changed;
4669 }
4670 
4671 /// Delete the dead instructions accumulated in this run.
4672 ///
4673 /// Recursively deletes the dead instructions we've accumulated. This is done
4674 /// at the very end to maximize locality of the recursive delete and to
4675 /// minimize the problems of invalidated instruction pointers as such pointers
4676 /// are used heavily in the intermediate stages of the algorithm.
4677 ///
4678 /// We also record the alloca instructions deleted here so that they aren't
4679 /// subsequently handed to mem2reg to promote.
4680 bool SROA::deleteDeadInstructions(
4681  SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
4682  bool Changed = false;
4683  while (!DeadInsts.empty()) {
4684  Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
4685  if (!I) continue;
4686  LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
4687 
4688  // If the instruction is an alloca, find the possible dbg.declare connected
4689  // to it, and remove it too. We must do this before calling RAUW or we will
4690  // not be able to find it.
4691  if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
4692  DeletedAllocas.insert(AI);
4693  for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(AI))
4694  OldDII->eraseFromParent();
4695  }
4696 
4697  I->replaceAllUsesWith(UndefValue::get(I->getType()));
4698 
4699  for (Use &Operand : I->operands())
4700  if (Instruction *U = dyn_cast<Instruction>(Operand)) {
4701  // Zero out the operand and see if it becomes trivially dead.
4702  Operand = nullptr;
4703  if (isInstructionTriviallyDead(U))
4704  DeadInsts.push_back(U);
4705  }
4706 
4707  ++NumDeleted;
4708  I->eraseFromParent();
4709  Changed = true;
4710  }
4711  return Changed;
4712 }
4713 
4714 /// Promote the allocas, using the best available technique.
4715 ///
4716 /// This attempts to promote whatever allocas have been identified as viable in
4717 /// the PromotableAllocas list. If that list is empty, there is nothing to do.
4718 /// This function returns whether any promotion occurred.
4719 bool SROA::promoteAllocas(Function &F) {
4720  if (PromotableAllocas.empty())
4721  return false;
4722 
4723  NumPromoted += PromotableAllocas.size();
4724 
4725  LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
4726  PromoteMemToReg(PromotableAllocas, *DT, AC);
4727  PromotableAllocas.clear();
4728  return true;
4729 }
4730 
4731 PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
4732  AssumptionCache &RunAC) {
4733  LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
4734  C = &F.getContext();
4735  DT = &RunDT;
4736  AC = &RunAC;
4737 
4738  BasicBlock &EntryBB = F.getEntryBlock();
4739  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
4740  I != E; ++I) {
4741  if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
4742  if (isa<ScalableVectorType>(AI->getAllocatedType())) {
4743  if (isAllocaPromotable(AI))
4744  PromotableAllocas.push_back(AI);
4745  } else {
4746  Worklist.insert(AI);
4747  }
4748  }
4749  }
4750 
4751  bool Changed = false;
4752  // A set of deleted alloca instruction pointers which should be removed from
4753  // the list of promotable allocas.
4754  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
4755 
4756  do {
4757  while (!Worklist.empty()) {
4758  Changed |= runOnAlloca(*Worklist.pop_back_val());
4759  Changed |= deleteDeadInstructions(DeletedAllocas);
4760 
4761  // Remove the deleted allocas from various lists so that we don't try to
4762  // continue processing them.
4763  if (!DeletedAllocas.empty()) {
4764  auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
4765  Worklist.remove_if(IsInSet);
4766  PostPromotionWorklist.remove_if(IsInSet);
4767  llvm::erase_if(PromotableAllocas, IsInSet);
4768  DeletedAllocas.clear();
4769  }
4770  }
4771 
4772  Changed |= promoteAllocas(F);
4773 
4774  Worklist = PostPromotionWorklist;
4775  PostPromotionWorklist.clear();
4776  } while (!Worklist.empty());
4777 
4778  if (!Changed)
4779  return PreservedAnalyses::all();
4780 
4781  PreservedAnalyses PA;
4782  PA.preserveSet<CFGAnalyses>();
4783  PA.preserve<GlobalsAA>();
4784  return PA;
4785 }
4786 
4787 PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
4788  return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
4789  AM.getResult<AssumptionAnalysis>(F));
4790 }
4791 
4792 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
4793 ///
4794 /// This is in the llvm namespace purely to allow it to be a friend of the \c
4795 /// SROA pass.
4796 class llvm::sroa::SROALegacyPass : public FunctionPass {
4797  /// The SROA implementation.
4798  SROA Impl;
4799 
4800 public:
4801  static char ID;
4802 
4803  SROALegacyPass() : FunctionPass(ID) {
4804  initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
4805  }
4806 
4807  bool runOnFunction(Function &F) override {
4808  if (skipFunction(F))
4809  return false;
4810 
4811  auto PA = Impl.runImpl(
4812  F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
4813  getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
4814  return !PA.areAllPreserved();
4815  }
4816 
4817  void getAnalysisUsage(AnalysisUsage &AU) const override {
4818  AU.addRequired<AssumptionCacheTracker>();
4819  AU.addRequired<DominatorTreeWrapperPass>();
4820  AU.addPreserved<GlobalsAAWrapperPass>();
4821  AU.setPreservesCFG();
4822  }
4823 
4824  StringRef getPassName() const override { return "SROA"; }
4825 };
4826 
4827 char SROALegacyPass::ID = 0;
4828 
4829 FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
4830 
4831 INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
4832  "Scalar Replacement Of Aggregates", false, false)
4833 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
4834 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
4835 INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
4836  false, false)
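// A minimal usage sketch (file names are placeholders): the pass can be run
// on its own with the new pass manager via
//
//   opt -passes=sroa -S input.ll -o output.ll
//
// which exercises the SROA::run entry point defined above.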