1 //===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This transformation implements the well known scalar replacement of
10 /// aggregates transformation. It tries to identify promotable elements of an
11 /// aggregate alloca, and promote them to registers. It will also try to
12 /// convert uses of an element (or set of elements) of an alloca into a vector
13 /// or bitfield-style integer scalar if appropriate.
14 ///
15 /// It works to do this with minimal slicing of the alloca so that regions
16 /// which are merely transferred in and out of external memory remain unchanged
17 /// and are not decomposed to scalar code.
18 ///
19 /// Because this also performs alloca promotion, it can be thought of as also
20 /// serving the purpose of SSA formation. The algorithm iterates on the
21 /// function until all opportunities for promotion have been realized.
22 ///
23 //===----------------------------------------------------------------------===//
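//
// Purely as an illustration of the transformation described above (a
// hypothetical example, not taken from the test suite), an aggregate alloca
// such as:
//
//   %pair = alloca { i32, i32 }
//   %x.ptr = getelementptr inbounds { i32, i32 }, { i32, i32 }* %pair, i32 0, i32 0
//   store i32 1, i32* %x.ptr
//   %x = load i32, i32* %x.ptr
//
// can, once every use has been rewritten onto per-element allocas, be
// promoted so that %x becomes the stored SSA value (here the constant 1) and
// the alloca disappears entirely.
//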
24 
25 #include "llvm/Transforms/Scalar/SROA.h"
26 #include "llvm/ADT/APInt.h"
27 #include "llvm/ADT/ArrayRef.h"
28 #include "llvm/ADT/DenseMap.h"
29 #include "llvm/ADT/PointerIntPair.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
32 #include "llvm/ADT/SmallBitVector.h"
33 #include "llvm/ADT/SmallPtrSet.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/Statistic.h"
36 #include "llvm/ADT/StringRef.h"
37 #include "llvm/ADT/Twine.h"
38 #include "llvm/ADT/iterator.h"
39 #include "llvm/ADT/iterator_range.h"
40 #include "llvm/Analysis/AssumptionCache.h"
41 #include "llvm/Analysis/GlobalsModRef.h"
42 #include "llvm/Analysis/Loads.h"
43 #include "llvm/Analysis/PtrUseVisitor.h"
44 #include "llvm/Config/llvm-config.h"
45 #include "llvm/IR/BasicBlock.h"
46 #include "llvm/IR/Constant.h"
47 #include "llvm/IR/ConstantFolder.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DIBuilder.h"
50 #include "llvm/IR/DataLayout.h"
52 #include "llvm/IR/DerivedTypes.h"
53 #include "llvm/IR/Dominators.h"
54 #include "llvm/IR/Function.h"
55 #include "llvm/IR/GetElementPtrTypeIterator.h"
56 #include "llvm/IR/GlobalAlias.h"
57 #include "llvm/IR/IRBuilder.h"
58 #include "llvm/IR/InstVisitor.h"
59 #include "llvm/IR/InstrTypes.h"
60 #include "llvm/IR/Instruction.h"
61 #include "llvm/IR/Instructions.h"
62 #include "llvm/IR/IntrinsicInst.h"
63 #include "llvm/IR/Intrinsics.h"
64 #include "llvm/IR/LLVMContext.h"
65 #include "llvm/IR/Metadata.h"
66 #include "llvm/IR/Module.h"
67 #include "llvm/IR/Operator.h"
68 #include "llvm/IR/PassManager.h"
69 #include "llvm/IR/Type.h"
70 #include "llvm/IR/Use.h"
71 #include "llvm/IR/User.h"
72 #include "llvm/IR/Value.h"
73 #include "llvm/InitializePasses.h"
74 #include "llvm/Pass.h"
75 #include "llvm/Support/Casting.h"
76 #include "llvm/Support/CommandLine.h"
77 #include "llvm/Support/Compiler.h"
78 #include "llvm/Support/Debug.h"
79 #include "llvm/Support/ErrorHandling.h"
80 #include "llvm/Support/MathExtras.h"
81 #include "llvm/Support/raw_ostream.h"
82 #include "llvm/Transforms/Scalar.h"
83 #include "llvm/Transforms/Utils/Local.h"
84 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
85 #include <algorithm>
86 #include <cassert>
87 #include <chrono>
88 #include <cstddef>
89 #include <cstdint>
90 #include <cstring>
91 #include <iterator>
92 #include <string>
93 #include <tuple>
94 #include <utility>
95 #include <vector>
96 
97 using namespace llvm;
98 using namespace llvm::sroa;
99 
100 #define DEBUG_TYPE "sroa"
101 
102 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
103 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
104 STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
105 STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
106 STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
107 STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
108 STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
109 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
110 STATISTIC(NumDeleted, "Number of instructions deleted");
111 STATISTIC(NumVectorized, "Number of vectorized aggregates");
112 
113 /// Hidden option to experiment with completely strict handling of inbounds
114 /// GEPs.
115 static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
116  cl::Hidden);
117 
118 namespace {
119 
120 /// A custom IRBuilder inserter which prefixes all names, but only in
121 /// Assert builds.
122 class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
123  std::string Prefix;
124 
125  Twine getNameWithPrefix(const Twine &Name) const {
126  return Name.isTriviallyEmpty() ? Name : Prefix + Name;
127  }
128 
129 public:
130  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
131 
132  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
133  BasicBlock::iterator InsertPt) const override {
134  IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
135  InsertPt);
136  }
137 };
138 
139 /// Provide a type for IRBuilder that drops names in release builds.
140 using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
141 
142 /// A used slice of an alloca.
143 ///
144 /// This structure represents a slice of an alloca used by some instruction. It
145 /// stores both the begin and end offsets of this use, a pointer to the use
146 /// itself, and a flag indicating whether we can classify the use as splittable
147 /// or not when forming partitions of the alloca.
148 class Slice {
149  /// The beginning offset of the range.
150  uint64_t BeginOffset = 0;
151 
152  /// The ending offset, not included in the range.
153  uint64_t EndOffset = 0;
154 
155  /// Storage for both the use of this slice and whether it can be
156  /// split.
157  PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
158 
159 public:
160  Slice() = default;
161 
162  Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
163  : BeginOffset(BeginOffset), EndOffset(EndOffset),
164  UseAndIsSplittable(U, IsSplittable) {}
165 
166  uint64_t beginOffset() const { return BeginOffset; }
167  uint64_t endOffset() const { return EndOffset; }
168 
169  bool isSplittable() const { return UseAndIsSplittable.getInt(); }
170  void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
171 
172  Use *getUse() const { return UseAndIsSplittable.getPointer(); }
173 
174  bool isDead() const { return getUse() == nullptr; }
175  void kill() { UseAndIsSplittable.setPointer(nullptr); }
176 
177  /// Support for ordering ranges.
178  ///
179  /// This provides an ordering over ranges such that start offsets are
180  /// always increasing, and within equal start offsets, the end offsets are
181  /// decreasing. Thus the spanning range comes first in a cluster with the
182  /// same start position.
183  bool operator<(const Slice &RHS) const {
184  if (beginOffset() < RHS.beginOffset())
185  return true;
186  if (beginOffset() > RHS.beginOffset())
187  return false;
188  if (isSplittable() != RHS.isSplittable())
189  return !isSplittable();
190  if (endOffset() > RHS.endOffset())
191  return true;
192  return false;
193  }
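  // For illustration (hypothetical offsets): the slices (0,8) unsplittable,
  // (0,16) splittable, (0,4) splittable and (8,12) unsplittable sort as
  //   (0,8)U, (0,16)S, (0,4)S, (8,12)U
  // i.e. increasing begin offset first, then unsplittable before splittable
  // at the same begin offset, then decreasing end offset.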
194 
195  /// Support comparison with a single offset to allow binary searches.
196  friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
197  uint64_t RHSOffset) {
198  return LHS.beginOffset() < RHSOffset;
199  }
200  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
201  const Slice &RHS) {
202  return LHSOffset < RHS.beginOffset();
203  }
204 
205  bool operator==(const Slice &RHS) const {
206  return isSplittable() == RHS.isSplittable() &&
207  beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
208  }
209  bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
210 };
211 
212 } // end anonymous namespace
213 
214 /// Representation of the alloca slices.
215 ///
216 /// This class represents the slices of an alloca which are formed by its
217 /// various uses. If a pointer escapes, we can't fully build a representation
218 /// for the slices used and we reflect that in this structure. The uses are
219 /// stored, sorted by increasing beginning offset and with unsplittable slices
220 /// starting at a particular offset before splittable slices.
221 class llvm::sroa::AllocaSlices {
222 public:
223  /// Construct the slices of a particular alloca.
224  AllocaSlices(const DataLayout &DL, AllocaInst &AI);
225 
226  /// Test whether a pointer to the allocation escapes our analysis.
227  ///
228  /// If this is true, the slices are never fully built and should be
229  /// ignored.
230  bool isEscaped() const { return PointerEscapingInstr; }
231 
232  /// Support for iterating over the slices.
233  /// @{
234   using iterator = SmallVectorImpl<Slice>::iterator;
235   using range = iterator_range<iterator>;
236 
237  iterator begin() { return Slices.begin(); }
238  iterator end() { return Slices.end(); }
239 
240   using const_iterator = SmallVectorImpl<Slice>::const_iterator;
241   using const_range = iterator_range<const_iterator>;
242 
243  const_iterator begin() const { return Slices.begin(); }
244  const_iterator end() const { return Slices.end(); }
245  /// @}
246 
247  /// Erase a range of slices.
248  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
249 
250  /// Insert new slices for this alloca.
251  ///
252  /// This moves the slices into the alloca's slices collection, and re-sorts
253  /// everything so that the usual ordering properties of the alloca's slices
254  /// hold.
255  void insert(ArrayRef<Slice> NewSlices) {
256  int OldSize = Slices.size();
257  Slices.append(NewSlices.begin(), NewSlices.end());
258  auto SliceI = Slices.begin() + OldSize;
259  llvm::sort(SliceI, Slices.end());
260  std::inplace_merge(Slices.begin(), SliceI, Slices.end());
261  }
262 
263  // Forward declare the iterator and range accessor for walking the
264  // partitions.
265  class partition_iterator;
266   iterator_range<partition_iterator> partitions();
267 
268  /// Access the dead users for this alloca.
269  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
270 
271  /// Access Uses that should be dropped if the alloca is promotable.
272   ArrayRef<Use *> getDeadUsesIfPromotable() const {
273     return DeadUseIfPromotable;
274  }
275 
276  /// Access the dead operands referring to this alloca.
277  ///
278   /// These are operands which cannot actually be used to refer to the
279  /// alloca as they are outside its range and the user doesn't correct for
280  /// that. These mostly consist of PHI node inputs and the like which we just
281  /// need to replace with undef.
282  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
283 
284 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
285   void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
286   void printSlice(raw_ostream &OS, const_iterator I,
287                   StringRef Indent = "  ") const;
288   void printUse(raw_ostream &OS, const_iterator I,
289                 StringRef Indent = "  ") const;
290  void print(raw_ostream &OS) const;
291  void dump(const_iterator I) const;
292  void dump() const;
293 #endif
294 
295 private:
296  template <typename DerivedT, typename RetT = void> class BuilderBase;
297  class SliceBuilder;
298 
299  friend class AllocaSlices::SliceBuilder;
300 
301 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
302  /// Handle to alloca instruction to simplify method interfaces.
303  AllocaInst &AI;
304 #endif
305 
306  /// The instruction responsible for this alloca not having a known set
307  /// of slices.
308  ///
309  /// When an instruction (potentially) escapes the pointer to the alloca, we
310  /// store a pointer to that here and abort trying to form slices of the
311  /// alloca. This will be null if the alloca slices are analyzed successfully.
312  Instruction *PointerEscapingInstr;
313 
314  /// The slices of the alloca.
315  ///
316  /// We store a vector of the slices formed by uses of the alloca here. This
317  /// vector is sorted by increasing begin offset, and then the unsplittable
318  /// slices before the splittable ones. See the Slice inner class for more
319  /// details.
320  SmallVector<Slice, 8> Slices;
321 
322  /// Instructions which will become dead if we rewrite the alloca.
323  ///
324  /// Note that these are not separated by slice. This is because we expect an
325  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
326  /// all these instructions can simply be removed and replaced with undef as
327  /// they come from outside of the allocated space.
328   SmallVector<Instruction *, 8> DeadUsers;
329 
330   /// Uses which will become dead if we can promote the alloca.
331  SmallVector<Use *, 8> DeadUseIfPromotable;
332 
333  /// Operands which will become dead if we rewrite the alloca.
334  ///
335  /// These are operands that in their particular use can be replaced with
336  /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
337  /// to PHI nodes and the like. They aren't entirely dead (there might be
338  /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
339  /// want to swap this particular input for undef to simplify the use lists of
340  /// the alloca.
341  SmallVector<Use *, 8> DeadOperands;
342 };
343 
344 /// A partition of the slices.
345 ///
346 /// An ephemeral representation for a range of slices which can be viewed as
347 /// a partition of the alloca. This range represents a span of the alloca's
348 /// memory which cannot be split, and provides access to all of the slices
349 /// overlapping some part of the partition.
350 ///
351 /// Objects of this type are produced by traversing the alloca's slices, but
352 /// are only ephemeral and not persistent.
353 class llvm::sroa::Partition {
354 private:
355  friend class AllocaSlices;
356   friend class AllocaSlices::partition_iterator;
357 
358  using iterator = AllocaSlices::iterator;
359 
360  /// The beginning and ending offsets of the alloca for this
361  /// partition.
362  uint64_t BeginOffset = 0, EndOffset = 0;
363 
364  /// The start and end iterators of this partition.
365  iterator SI, SJ;
366 
367  /// A collection of split slice tails overlapping the partition.
368  SmallVector<Slice *, 4> SplitTails;
369 
370  /// Raw constructor builds an empty partition starting and ending at
371  /// the given iterator.
372  Partition(iterator SI) : SI(SI), SJ(SI) {}
373 
374 public:
375  /// The start offset of this partition.
376  ///
377  /// All of the contained slices start at or after this offset.
378  uint64_t beginOffset() const { return BeginOffset; }
379 
380  /// The end offset of this partition.
381  ///
382  /// All of the contained slices end at or before this offset.
383  uint64_t endOffset() const { return EndOffset; }
384 
385  /// The size of the partition.
386  ///
387  /// Note that this can never be zero.
388  uint64_t size() const {
389  assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
390  return EndOffset - BeginOffset;
391  }
392 
393  /// Test whether this partition contains no slices, and merely spans
394  /// a region occupied by split slices.
395  bool empty() const { return SI == SJ; }
396 
397  /// \name Iterate slices that start within the partition.
398  /// These may be splittable or unsplittable. They have a begin offset >= the
399  /// partition begin offset.
400  /// @{
401  // FIXME: We should probably define a "concat_iterator" helper and use that
402  // to stitch together pointee_iterators over the split tails and the
403  // contiguous iterators of the partition. That would give a much nicer
404  // interface here. We could then additionally expose filtered iterators for
405   // split, unsplit, and unsplittable slices based on the usage patterns.
406  iterator begin() const { return SI; }
407  iterator end() const { return SJ; }
408  /// @}
409 
410  /// Get the sequence of split slice tails.
411  ///
412  /// These tails are of slices which start before this partition but are
413  /// split and overlap into the partition. We accumulate these while forming
414  /// partitions.
415  ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
416 };
417 
418 /// An iterator over partitions of the alloca's slices.
419 ///
420 /// This iterator implements the core algorithm for partitioning the alloca's
421 /// slices. It is a forward iterator as we don't support backtracking for
422 /// efficiency reasons, and re-use a single storage area to maintain the
423 /// current set of split slices.
424 ///
425 /// It is templated on the slice iterator type to use so that it can operate
426 /// with either const or non-const slice iterators.
427 class AllocaSlices::partition_iterator
428     : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
429  Partition> {
430  friend class AllocaSlices;
431 
432  /// Most of the state for walking the partitions is held in a class
433  /// with a nice interface for examining them.
434  Partition P;
435 
436  /// We need to keep the end of the slices to know when to stop.
437   AllocaSlices::iterator SE;
438 
439  /// We also need to keep track of the maximum split end offset seen.
440  /// FIXME: Do we really?
441  uint64_t MaxSplitSliceEndOffset = 0;
442 
443  /// Sets the partition to be empty at given iterator, and sets the
444  /// end iterator.
445   partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
446       : P(SI), SE(SE) {
447  // If not already at the end, advance our state to form the initial
448  // partition.
449  if (SI != SE)
450  advance();
451  }
452 
453  /// Advance the iterator to the next partition.
454  ///
455  /// Requires that the iterator not be at the end of the slices.
456  void advance() {
457  assert((P.SI != SE || !P.SplitTails.empty()) &&
458  "Cannot advance past the end of the slices!");
459 
460  // Clear out any split uses which have ended.
461  if (!P.SplitTails.empty()) {
462  if (P.EndOffset >= MaxSplitSliceEndOffset) {
463  // If we've finished all splits, this is easy.
464  P.SplitTails.clear();
465  MaxSplitSliceEndOffset = 0;
466  } else {
467  // Remove the uses which have ended in the prior partition. This
468  // cannot change the max split slice end because we just checked that
469  // the prior partition ended prior to that max.
470  llvm::erase_if(P.SplitTails,
471  [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
472  assert(llvm::any_of(P.SplitTails,
473  [&](Slice *S) {
474  return S->endOffset() == MaxSplitSliceEndOffset;
475  }) &&
476  "Could not find the current max split slice offset!");
477  assert(llvm::all_of(P.SplitTails,
478  [&](Slice *S) {
479  return S->endOffset() <= MaxSplitSliceEndOffset;
480  }) &&
481  "Max split slice end offset is not actually the max!");
482  }
483  }
484 
485  // If P.SI is already at the end, then we've cleared the split tail and
486  // now have an end iterator.
487  if (P.SI == SE) {
488  assert(P.SplitTails.empty() && "Failed to clear the split slices!");
489  return;
490  }
491 
492  // If we had a non-empty partition previously, set up the state for
493  // subsequent partitions.
494  if (P.SI != P.SJ) {
495  // Accumulate all the splittable slices which started in the old
496  // partition into the split list.
497  for (Slice &S : P)
498  if (S.isSplittable() && S.endOffset() > P.EndOffset) {
499  P.SplitTails.push_back(&S);
500  MaxSplitSliceEndOffset =
501  std::max(S.endOffset(), MaxSplitSliceEndOffset);
502  }
503 
504  // Start from the end of the previous partition.
505  P.SI = P.SJ;
506 
507  // If P.SI is now at the end, we at most have a tail of split slices.
508  if (P.SI == SE) {
509  P.BeginOffset = P.EndOffset;
510  P.EndOffset = MaxSplitSliceEndOffset;
511  return;
512  }
513 
514       // If we have split slices and the next slice is after a gap and is
515       // not splittable, immediately form an empty partition for the split
516       // slices up until the next slice begins.
517  if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
518  !P.SI->isSplittable()) {
519  P.BeginOffset = P.EndOffset;
520  P.EndOffset = P.SI->beginOffset();
521  return;
522  }
523  }
524 
525  // OK, we need to consume new slices. Set the end offset based on the
526  // current slice, and step SJ past it. The beginning offset of the
527  // partition is the beginning offset of the next slice unless we have
528  // pre-existing split slices that are continuing, in which case we begin
529  // at the prior end offset.
530  P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
531  P.EndOffset = P.SI->endOffset();
532  ++P.SJ;
533 
534  // There are two strategies to form a partition based on whether the
535  // partition starts with an unsplittable slice or a splittable slice.
536  if (!P.SI->isSplittable()) {
537  // When we're forming an unsplittable region, it must always start at
538  // the first slice and will extend through its end.
539  assert(P.BeginOffset == P.SI->beginOffset());
540 
541  // Form a partition including all of the overlapping slices with this
542  // unsplittable slice.
543  while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
544  if (!P.SJ->isSplittable())
545  P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
546  ++P.SJ;
547  }
548 
549  // We have a partition across a set of overlapping unsplittable
550       // slices.
551  return;
552  }
553 
554  // If we're starting with a splittable slice, then we need to form
555  // a synthetic partition spanning it and any other overlapping splittable
556     // slices.
557  assert(P.SI->isSplittable() && "Forming a splittable partition!");
558 
559  // Collect all of the overlapping splittable slices.
560  while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
561  P.SJ->isSplittable()) {
562  P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
563  ++P.SJ;
564  }
565 
566     // Back up P.EndOffset if we ended the span early when encountering an
567  // unsplittable slice. This synthesizes the early end offset of
568  // a partition spanning only splittable slices.
569  if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
570  assert(!P.SJ->isSplittable());
571  P.EndOffset = P.SJ->beginOffset();
572  }
573  }
574 
575 public:
576  bool operator==(const partition_iterator &RHS) const {
577  assert(SE == RHS.SE &&
578  "End iterators don't match between compared partition iterators!");
579 
580     // The observed positions of partitions are marked by the P.SI iterator and
581  // the emptiness of the split slices. The latter is only relevant when
582  // P.SI == SE, as the end iterator will additionally have an empty split
583  // slices list, but the prior may have the same P.SI and a tail of split
584  // slices.
585  if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
586  assert(P.SJ == RHS.P.SJ &&
587  "Same set of slices formed two different sized partitions!");
588  assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
589  "Same slice position with differently sized non-empty split "
590  "slice tails!");
591  return true;
592  }
593  return false;
594  }
595 
596   partition_iterator &operator++() {
597     advance();
598  return *this;
599  }
600 
601  Partition &operator*() { return P; }
602 };
603 
604 /// A forward range over the partitions of the alloca's slices.
605 ///
606 /// This accesses an iterator range over the partitions of the alloca's
607 /// slices. It computes these partitions on the fly based on the overlapping
608 /// offsets of the slices and the ability to split them. It will visit "empty"
609 /// partitions to cover regions of the alloca only accessed via split
610 /// slices.
611 iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
612   return make_range(partition_iterator(begin(), end()),
613  partition_iterator(end(), end()));
614 }
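// As an illustration (hypothetical slices of a 16-byte alloca): an
// unsplittable slice [0,8) together with a splittable slice [0,16) produces
// a partition [0,8) containing both slices, followed by a partition [8,16)
// that contains no slices of its own and is covered only by the split tail
// of the [0,16) slice.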
615 
616 static Value *foldSelectInst(SelectInst &SI) {
617   // If the condition being selected on is a constant or the same value is
618  // being selected between, fold the select. Yes this does (rarely) happen
619  // early on.
620  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
621  return SI.getOperand(1 + CI->isZero());
622  if (SI.getOperand(1) == SI.getOperand(2))
623  return SI.getOperand(1);
624 
625  return nullptr;
626 }
627 
628 /// A helper that folds a PHI node or a select.
629 static Value *foldPHINodeOrSelectInst(Instruction &I) {
630   if (PHINode *PN = dyn_cast<PHINode>(&I)) {
631  // If PN merges together the same value, return that value.
632  return PN->hasConstantValue();
633  }
634  return foldSelectInst(cast<SelectInst>(I));
635 }
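// For example (illustrative): "select i1 true, i32* %a, i32* %b" folds to %a,
// as does a select whose two operands are the same value; for a PHI node,
// hasConstantValue() returns %a when every incoming value is %a.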
636 
637 /// Builder for the alloca slices.
638 ///
639 /// This class builds a set of alloca slices by recursively visiting the uses
640 /// of an alloca and making a slice for each load and store at each offset.
641 class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
642   friend class PtrUseVisitor<SliceBuilder>;
643   friend class InstVisitor<SliceBuilder>;
644 
645   using Base = PtrUseVisitor<SliceBuilder>;
646 
647  const uint64_t AllocSize;
648  AllocaSlices &AS;
649 
650  SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
651   SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
652 
653  /// Set to de-duplicate dead instructions found in the use walk.
654  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
655 
656 public:
657   SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
658       : PtrUseVisitor<SliceBuilder>(DL),
659         AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()),
660  AS(AS) {}
661 
662 private:
663  void markAsDead(Instruction &I) {
664  if (VisitedDeadInsts.insert(&I).second)
665  AS.DeadUsers.push_back(&I);
666  }
667 
668  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
669  bool IsSplittable = false) {
670  // Completely skip uses which have a zero size or start either before or
671  // past the end of the allocation.
672  if (Size == 0 || Offset.uge(AllocSize)) {
673  LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
674  << Offset
675  << " which has zero size or starts outside of the "
676  << AllocSize << " byte alloca:\n"
677  << " alloca: " << AS.AI << "\n"
678  << " use: " << I << "\n");
679  return markAsDead(I);
680  }
681 
682  uint64_t BeginOffset = Offset.getZExtValue();
683  uint64_t EndOffset = BeginOffset + Size;
684 
685  // Clamp the end offset to the end of the allocation. Note that this is
686  // formulated to handle even the case where "BeginOffset + Size" overflows.
687  // This may appear superficially to be something we could ignore entirely,
688  // but that is not so! There may be widened loads or PHI-node uses where
689  // some instructions are dead but not others. We can't completely ignore
690  // them, and so have to record at least the information here.
691  assert(AllocSize >= BeginOffset); // Established above.
692  if (Size > AllocSize - BeginOffset) {
693  LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
694  << Offset << " to remain within the " << AllocSize
695  << " byte alloca:\n"
696  << " alloca: " << AS.AI << "\n"
697  << " use: " << I << "\n");
698  EndOffset = AllocSize;
699  }
700 
701  AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
702  }
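  // For example (illustrative): an 8-byte use at offset 12 of a 16-byte
  // alloca is clamped to the slice [12,16), while a use starting at or past
  // offset 16 is ignored entirely and its instruction recorded as dead.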
703 
704  void visitBitCastInst(BitCastInst &BC) {
705  if (BC.use_empty())
706  return markAsDead(BC);
707 
708  return Base::visitBitCastInst(BC);
709  }
710 
711  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
712  if (ASC.use_empty())
713  return markAsDead(ASC);
714 
715  return Base::visitAddrSpaceCastInst(ASC);
716  }
717 
718  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
719  if (GEPI.use_empty())
720  return markAsDead(GEPI);
721 
722  if (SROAStrictInbounds && GEPI.isInBounds()) {
723  // FIXME: This is a manually un-factored variant of the basic code inside
724  // of GEPs with checking of the inbounds invariant specified in the
725  // langref in a very strict sense. If we ever want to enable
726  // SROAStrictInbounds, this code should be factored cleanly into
727  // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
728  // by writing out the code here where we have the underlying allocation
729  // size readily available.
730  APInt GEPOffset = Offset;
731  const DataLayout &DL = GEPI.getModule()->getDataLayout();
732  for (gep_type_iterator GTI = gep_type_begin(GEPI),
733  GTE = gep_type_end(GEPI);
734  GTI != GTE; ++GTI) {
735  ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
736  if (!OpC)
737  break;
738 
739  // Handle a struct index, which adds its field offset to the pointer.
740  if (StructType *STy = GTI.getStructTypeOrNull()) {
741  unsigned ElementIdx = OpC->getZExtValue();
742  const StructLayout *SL = DL.getStructLayout(STy);
743  GEPOffset +=
744  APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
745  } else {
746  // For array or vector indices, scale the index by the size of the
747  // type.
748  APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
749  GEPOffset +=
750  Index *
751  APInt(Offset.getBitWidth(),
752  DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
753  }
754 
755  // If this index has computed an intermediate pointer which is not
756  // inbounds, then the result of the GEP is a poison value and we can
757  // delete it and all uses.
758  if (GEPOffset.ugt(AllocSize))
759  return markAsDead(GEPI);
760  }
761  }
762 
763  return Base::visitGetElementPtrInst(GEPI);
764  }
765 
766  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
767  uint64_t Size, bool IsVolatile) {
768  // We allow splitting of non-volatile loads and stores where the type is an
769  // integer type. These may be used to implement 'memcpy' or other "transfer
770  // of bits" patterns.
771  bool IsSplittable =
772  Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
773 
774  insertUse(I, Offset, Size, IsSplittable);
775  }
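  // For example (illustrative): a non-volatile "load i64" whose type size
  // equals its store size yields a splittable slice, whereas a volatile load,
  // a "load float", or a load of an oddly sized integer such as i20 yields an
  // unsplittable one.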
776 
777  void visitLoadInst(LoadInst &LI) {
778  assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
779  "All simple FCA loads should have been pre-split");
780 
781  if (!IsOffsetKnown)
782  return PI.setAborted(&LI);
783 
784  if (LI.isVolatile() &&
785  LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
786  return PI.setAborted(&LI);
787 
788  if (isa<ScalableVectorType>(LI.getType()))
789  return PI.setAborted(&LI);
790 
791  uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize();
792  return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
793  }
794 
795  void visitStoreInst(StoreInst &SI) {
796  Value *ValOp = SI.getValueOperand();
797  if (ValOp == *U)
798  return PI.setEscapedAndAborted(&SI);
799  if (!IsOffsetKnown)
800  return PI.setAborted(&SI);
801 
802  if (SI.isVolatile() &&
803  SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
804  return PI.setAborted(&SI);
805 
806  if (isa<ScalableVectorType>(ValOp->getType()))
807  return PI.setAborted(&SI);
808 
809  uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize();
810 
811  // If this memory access can be shown to *statically* extend outside the
812     // bounds of the allocation, its behavior is undefined, so simply
813  // ignore it. Note that this is more strict than the generic clamping
814  // behavior of insertUse. We also try to handle cases which might run the
815  // risk of overflow.
816  // FIXME: We should instead consider the pointer to have escaped if this
817  // function is being instrumented for addressing bugs or race conditions.
818  if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
819  LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
820  << Offset << " which extends past the end of the "
821  << AllocSize << " byte alloca:\n"
822  << " alloca: " << AS.AI << "\n"
823  << " use: " << SI << "\n");
824  return markAsDead(SI);
825  }
826 
827  assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
828  "All simple FCA stores should have been pre-split");
829  handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
830  }
831 
832  void visitMemSetInst(MemSetInst &II) {
833  assert(II.getRawDest() == *U && "Pointer use is not the destination?");
834  ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
835  if ((Length && Length->getValue() == 0) ||
836  (IsOffsetKnown && Offset.uge(AllocSize)))
837  // Zero-length mem transfer intrinsics can be ignored entirely.
838  return markAsDead(II);
839 
840  if (!IsOffsetKnown)
841  return PI.setAborted(&II);
842 
843  // Don't replace this with a store with a different address space. TODO:
844  // Use a store with the casted new alloca?
845  if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace())
846  return PI.setAborted(&II);
847 
848  insertUse(II, Offset, Length ? Length->getLimitedValue()
849  : AllocSize - Offset.getLimitedValue(),
850  (bool)Length);
851  }
852 
853  void visitMemTransferInst(MemTransferInst &II) {
854  ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
855  if (Length && Length->getValue() == 0)
856  // Zero-length mem transfer intrinsics can be ignored entirely.
857  return markAsDead(II);
858 
859  // Because we can visit these intrinsics twice, also check to see if the
860  // first time marked this instruction as dead. If so, skip it.
861  if (VisitedDeadInsts.count(&II))
862  return;
863 
864  if (!IsOffsetKnown)
865  return PI.setAborted(&II);
866 
867  // Don't replace this with a load/store with a different address space.
868  // TODO: Use a store with the casted new alloca?
869  if (II.isVolatile() &&
870  (II.getDestAddressSpace() != DL.getAllocaAddrSpace() ||
871  II.getSourceAddressSpace() != DL.getAllocaAddrSpace()))
872  return PI.setAborted(&II);
873 
874  // This side of the transfer is completely out-of-bounds, and so we can
875  // nuke the entire transfer. However, we also need to nuke the other side
876  // if already added to our partitions.
877  // FIXME: Yet another place we really should bypass this when
878  // instrumenting for ASan.
879  if (Offset.uge(AllocSize)) {
880       SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
881           MemTransferSliceMap.find(&II);
882  if (MTPI != MemTransferSliceMap.end())
883  AS.Slices[MTPI->second].kill();
884  return markAsDead(II);
885  }
886 
887  uint64_t RawOffset = Offset.getLimitedValue();
888  uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
889 
890  // Check for the special case where the same exact value is used for both
891  // source and dest.
892  if (*U == II.getRawDest() && *U == II.getRawSource()) {
893  // For non-volatile transfers this is a no-op.
894  if (!II.isVolatile())
895  return markAsDead(II);
896 
897  return insertUse(II, Offset, Size, /*IsSplittable=*/false);
898  }
899 
900  // If we have seen both source and destination for a mem transfer, then
901  // they both point to the same alloca.
902  bool Inserted;
903     SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
904     std::tie(MTPI, Inserted) =
905  MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
906  unsigned PrevIdx = MTPI->second;
907  if (!Inserted) {
908  Slice &PrevP = AS.Slices[PrevIdx];
909 
910  // Check if the begin offsets match and this is a non-volatile transfer.
911  // In that case, we can completely elide the transfer.
912  if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
913  PrevP.kill();
914  return markAsDead(II);
915  }
916 
917  // Otherwise we have an offset transfer within the same alloca. We can't
918  // split those.
919  PrevP.makeUnsplittable();
920  }
921 
922  // Insert the use now that we've fixed up the splittable nature.
923  insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
924 
925  // Check that we ended up with a valid index in the map.
926  assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
927  "Map index doesn't point back to a slice with this user.");
928  }
929 
930  // Disable SRoA for any intrinsics except for lifetime invariants and
931  // invariant group.
932  // FIXME: What about debug intrinsics? This matches old behavior, but
933  // doesn't make sense.
934  void visitIntrinsicInst(IntrinsicInst &II) {
935  if (II.isDroppable()) {
936  AS.DeadUseIfPromotable.push_back(U);
937  return;
938  }
939 
940  if (!IsOffsetKnown)
941  return PI.setAborted(&II);
942 
943  if (II.isLifetimeStartOrEnd()) {
944  ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
945  uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
946  Length->getLimitedValue());
947  insertUse(II, Offset, Size, true);
948  return;
949  }
950 
951     if (II.isLaunderOrStripInvariantGroup()) {
952       enqueueUsers(II);
953  return;
954  }
955 
956  Base::visitIntrinsicInst(II);
957  }
958 
959  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
960  // We consider any PHI or select that results in a direct load or store of
961  // the same offset to be a viable use for slicing purposes. These uses
962  // are considered unsplittable and the size is the maximum loaded or stored
963  // size.
964     SmallPtrSet<Instruction *, 4> Visited;
965     SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
966     Visited.insert(Root);
967  Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
968  const DataLayout &DL = Root->getModule()->getDataLayout();
969  // If there are no loads or stores, the access is dead. We mark that as
970  // a size zero access.
971  Size = 0;
972  do {
973  Instruction *I, *UsedI;
974  std::tie(UsedI, I) = Uses.pop_back_val();
975 
976  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
977  Size = std::max(Size,
978  DL.getTypeStoreSize(LI->getType()).getFixedSize());
979  continue;
980  }
981  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
982  Value *Op = SI->getOperand(0);
983  if (Op == UsedI)
984  return SI;
985  Size = std::max(Size,
986  DL.getTypeStoreSize(Op->getType()).getFixedSize());
987  continue;
988  }
989 
990  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
991  if (!GEP->hasAllZeroIndices())
992  return GEP;
993  } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
994  !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
995  return I;
996  }
997 
998  for (User *U : I->users())
999  if (Visited.insert(cast<Instruction>(U)).second)
1000  Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1001  } while (!Uses.empty());
1002 
1003  return nullptr;
1004  }
1005 
1006  void visitPHINodeOrSelectInst(Instruction &I) {
1007  assert(isa<PHINode>(I) || isa<SelectInst>(I));
1008  if (I.use_empty())
1009  return markAsDead(I);
1010 
1011  // TODO: We could use SimplifyInstruction here to fold PHINodes and
1012  // SelectInsts. However, doing so requires to change the current
1013  // dead-operand-tracking mechanism. For instance, suppose neither loading
1014  // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1015  // trap either. However, if we simply replace %U with undef using the
1016  // current dead-operand-tracking mechanism, "load (select undef, undef,
1017  // %other)" may trap because the select may return the first operand
1018  // "undef".
1019  if (Value *Result = foldPHINodeOrSelectInst(I)) {
1020  if (Result == *U)
1021  // If the result of the constant fold will be the pointer, recurse
1022  // through the PHI/select as if we had RAUW'ed it.
1023  enqueueUsers(I);
1024  else
1025  // Otherwise the operand to the PHI/select is dead, and we can replace
1026  // it with undef.
1027  AS.DeadOperands.push_back(U);
1028 
1029  return;
1030  }
1031 
1032  if (!IsOffsetKnown)
1033  return PI.setAborted(&I);
1034 
1035  // See if we already have computed info on this node.
1036  uint64_t &Size = PHIOrSelectSizes[&I];
1037  if (!Size) {
1038  // This is a new PHI/Select, check for an unsafe use of it.
1039  if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1040  return PI.setAborted(UnsafeI);
1041  }
1042 
1043  // For PHI and select operands outside the alloca, we can't nuke the entire
1044  // phi or select -- the other side might still be relevant, so we special
1045  // case them here and use a separate structure to track the operands
1046  // themselves which should be replaced with undef.
1047  // FIXME: This should instead be escaped in the event we're instrumenting
1048  // for address sanitization.
1049  if (Offset.uge(AllocSize)) {
1050  AS.DeadOperands.push_back(U);
1051  return;
1052  }
1053 
1054  insertUse(I, Offset, Size);
1055  }
1056 
1057  void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1058 
1059  void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1060 
1061  /// Disable SROA entirely if there are unhandled users of the alloca.
1062  void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1063 };
1064 
1065 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1066     :
1067 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1068  AI(AI),
1069 #endif
1070  PointerEscapingInstr(nullptr) {
1071  SliceBuilder PB(DL, AI, *this);
1072  SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1073  if (PtrI.isEscaped() || PtrI.isAborted()) {
1074  // FIXME: We should sink the escape vs. abort info into the caller nicely,
1075  // possibly by just storing the PtrInfo in the AllocaSlices.
1076  PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1077  : PtrI.getAbortingInst();
1078  assert(PointerEscapingInstr && "Did not track a bad instruction");
1079  return;
1080  }
1081 
1082  llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1083 
1084  // Sort the uses. This arranges for the offsets to be in ascending order,
1085  // and the sizes to be in descending order.
1086  llvm::stable_sort(Slices);
1087 }
1088 
1089 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1090 
1091 void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1092  StringRef Indent) const {
1093  printSlice(OS, I, Indent);
1094  OS << "\n";
1095  printUse(OS, I, Indent);
1096 }
1097 
1098 void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1099  StringRef Indent) const {
1100  OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1101  << " slice #" << (I - begin())
1102  << (I->isSplittable() ? " (splittable)" : "");
1103 }
1104 
1105 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1106  StringRef Indent) const {
1107  OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1108 }
1109 
1110 void AllocaSlices::print(raw_ostream &OS) const {
1111  if (PointerEscapingInstr) {
1112  OS << "Can't analyze slices for alloca: " << AI << "\n"
1113  << " A pointer to this alloca escaped by:\n"
1114  << " " << *PointerEscapingInstr << "\n";
1115  return;
1116  }
1117 
1118  OS << "Slices of alloca: " << AI << "\n";
1119  for (const_iterator I = begin(), E = end(); I != E; ++I)
1120  print(OS, I);
1121 }
1122 
1123 LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1124  print(dbgs(), I);
1125 }
1126 LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1127 
1128 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1129 
1130 /// Walk the range of a partitioning looking for a common type to cover this
1131 /// sequence of slices.
1132 static std::pair<Type *, IntegerType *>
1133 findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1134  uint64_t EndOffset) {
1135  Type *Ty = nullptr;
1136  bool TyIsCommon = true;
1137  IntegerType *ITy = nullptr;
1138 
1139  // Note that we need to look at *every* alloca slice's Use to ensure we
1140  // always get consistent results regardless of the order of slices.
1141  for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1142  Use *U = I->getUse();
1143  if (isa<IntrinsicInst>(*U->getUser()))
1144  continue;
1145  if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1146  continue;
1147 
1148  Type *UserTy = nullptr;
1149  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1150  UserTy = LI->getType();
1151  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1152  UserTy = SI->getValueOperand()->getType();
1153  }
1154 
1155  if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1156  // If the type is larger than the partition, skip it. We only encounter
1157  // this for split integer operations where we want to use the type of the
1158  // entity causing the split. Also skip if the type is not a byte width
1159  // multiple.
1160  if (UserITy->getBitWidth() % 8 != 0 ||
1161  UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1162  continue;
1163 
1164  // Track the largest bitwidth integer type used in this way in case there
1165  // is no common type.
1166  if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1167  ITy = UserITy;
1168  }
1169 
1170  // To avoid depending on the order of slices, Ty and TyIsCommon must not
1171  // depend on types skipped above.
1172  if (!UserTy || (Ty && Ty != UserTy))
1173  TyIsCommon = false; // Give up on anything but an iN type.
1174  else
1175  Ty = UserTy;
1176  }
1177 
1178  return {TyIsCommon ? Ty : nullptr, ITy};
1179 }
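// For example (illustrative): a partition covered by a "load i32" and a
// "store i32" yields i32 as the common type and also as the integer type.
// If the same bytes are accessed as both float and i32, no common type is
// returned, but i32 is still reported as the largest viable integer type for
// a potential integer widening of the partition.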
1180 
1181 /// PHI instructions that use an alloca and are subsequently loaded can be
1182 /// rewritten to load both input pointers in the pred blocks and then PHI the
1183 /// results, allowing the load of the alloca to be promoted.
1184 /// From this:
1185 /// %P2 = phi [i32* %Alloca, i32* %Other]
1186 /// %V = load i32* %P2
1187 /// to:
1188 /// %V1 = load i32* %Alloca -> will be mem2reg'd
1189 /// ...
1190 /// %V2 = load i32* %Other
1191 /// ...
1192 /// %V = phi [i32 %V1, i32 %V2]
1193 ///
1194 /// We can do this to a select if its only uses are loads and if the operands
1195 /// to the select can be loaded unconditionally.
1196 ///
1197 /// FIXME: This should be hoisted into a generic utility, likely in
1198 /// Transforms/Util/Local.h
1199 static bool isSafePHIToSpeculate(PHINode &PN) {
1200  const DataLayout &DL = PN.getModule()->getDataLayout();
1201 
1202  // For now, we can only do this promotion if the load is in the same block
1203  // as the PHI, and if there are no stores between the phi and load.
1204  // TODO: Allow recursive phi users.
1205  // TODO: Allow stores.
1206  BasicBlock *BB = PN.getParent();
1207  Align MaxAlign;
1208  uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1209  APInt MaxSize(APWidth, 0);
1210  bool HaveLoad = false;
1211  for (User *U : PN.users()) {
1212  LoadInst *LI = dyn_cast<LoadInst>(U);
1213  if (!LI || !LI->isSimple())
1214  return false;
1215 
1216  // For now we only allow loads in the same block as the PHI. This is
1217  // a common case that happens when instcombine merges two loads through
1218  // a PHI.
1219  if (LI->getParent() != BB)
1220  return false;
1221 
1222  // Ensure that there are no instructions between the PHI and the load that
1223  // could store.
1224  for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1225  if (BBI->mayWriteToMemory())
1226  return false;
1227 
1228  uint64_t Size = DL.getTypeStoreSize(LI->getType()).getFixedSize();
1229  MaxAlign = std::max(MaxAlign, LI->getAlign());
1230  MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize;
1231  HaveLoad = true;
1232  }
1233 
1234  if (!HaveLoad)
1235  return false;
1236 
1237  // We can only transform this if it is safe to push the loads into the
1238  // predecessor blocks. The only thing to watch out for is that we can't put
1239  // a possibly trapping load in the predecessor if it is a critical edge.
1240  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1241  Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1242  Value *InVal = PN.getIncomingValue(Idx);
1243 
1244  // If the value is produced by the terminator of the predecessor (an
1245  // invoke) or it has side-effects, there is no valid place to put a load
1246  // in the predecessor.
1247  if (TI == InVal || TI->mayHaveSideEffects())
1248  return false;
1249 
1250  // If the predecessor has a single successor, then the edge isn't
1251  // critical.
1252  if (TI->getNumSuccessors() == 1)
1253  continue;
1254 
1255  // If this pointer is always safe to load, or if we can prove that there
1256  // is already a load in the block, then we can move the load to the pred
1257  // block.
1258  if (isSafeToLoadUnconditionally(InVal, MaxAlign, MaxSize, DL, TI))
1259  continue;
1260 
1261  return false;
1262  }
1263 
1264  return true;
1265 }
1266 
1267 static void speculatePHINodeLoads(PHINode &PN) {
1268  LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1269 
1270  LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1271  Type *LoadTy = SomeLoad->getType();
1272  IRBuilderTy PHIBuilder(&PN);
1273  PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1274  PN.getName() + ".sroa.speculated");
1275 
1276  // Get the AA tags and alignment to use from one of the loads. It does not
1277  // matter which one we get and if any differ.
1278  AAMDNodes AATags = SomeLoad->getAAMetadata();
1279  Align Alignment = SomeLoad->getAlign();
1280 
1281  // Rewrite all loads of the PN to use the new PHI.
1282  while (!PN.use_empty()) {
1283  LoadInst *LI = cast<LoadInst>(PN.user_back());
1284  LI->replaceAllUsesWith(NewPN);
1285  LI->eraseFromParent();
1286  }
1287 
1288  // Inject loads into all of the pred blocks.
1289  DenseMap<BasicBlock*, Value*> InjectedLoads;
1290  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1291  BasicBlock *Pred = PN.getIncomingBlock(Idx);
1292  Value *InVal = PN.getIncomingValue(Idx);
1293 
1294  // A PHI node is allowed to have multiple (duplicated) entries for the same
1295  // basic block, as long as the value is the same. So if we already injected
1296  // a load in the predecessor, then we should reuse the same load for all
1297  // duplicated entries.
1298  if (Value* V = InjectedLoads.lookup(Pred)) {
1299  NewPN->addIncoming(V, Pred);
1300  continue;
1301  }
1302 
1303  Instruction *TI = Pred->getTerminator();
1304  IRBuilderTy PredBuilder(TI);
1305 
1306  LoadInst *Load = PredBuilder.CreateAlignedLoad(
1307  LoadTy, InVal, Alignment,
1308  (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1309  ++NumLoadsSpeculated;
1310  if (AATags)
1311  Load->setAAMetadata(AATags);
1312  NewPN->addIncoming(Load, Pred);
1313  InjectedLoads[Pred] = Load;
1314  }
1315 
1316  LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1317  PN.eraseFromParent();
1318 }
1319 
1320 /// Select instructions that use an alloca and are subsequently loaded can be
1321 /// rewritten to load both input pointers and then select between the result,
1322 /// allowing the load of the alloca to be promoted.
1323 /// From this:
1324 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
1325 /// %V = load i32* %P2
1326 /// to:
1327 /// %V1 = load i32* %Alloca -> will be mem2reg'd
1328 /// %V2 = load i32* %Other
1329 /// %V = select i1 %cond, i32 %V1, i32 %V2
1330 ///
1331 /// We can do this to a select if its only uses are loads and if the operands
1332 /// to the select can be loaded unconditionally. If there is an intervening
1333 /// bitcast with a single use of the load, the promotion is still allowed.
1334 static bool isSafeSelectToSpeculate(SelectInst &SI) {
1335   Value *TValue = SI.getTrueValue();
1336  Value *FValue = SI.getFalseValue();
1337  const DataLayout &DL = SI.getModule()->getDataLayout();
1338 
1339  for (User *U : SI.users()) {
1340  LoadInst *LI;
1341  BitCastInst *BC = dyn_cast<BitCastInst>(U);
1342  if (BC && BC->hasOneUse())
1343  LI = dyn_cast<LoadInst>(*BC->user_begin());
1344  else
1345  LI = dyn_cast<LoadInst>(U);
1346 
1347  if (!LI || !LI->isSimple())
1348  return false;
1349 
1350  // Both operands to the select need to be dereferenceable, either
1351  // absolutely (e.g. allocas) or at this point because we can see other
1352  // accesses to it.
1353  if (!isSafeToLoadUnconditionally(TValue, LI->getType(),
1354  LI->getAlign(), DL, LI))
1355  return false;
1356  if (!isSafeToLoadUnconditionally(FValue, LI->getType(),
1357  LI->getAlign(), DL, LI))
1358  return false;
1359  }
1360 
1361  return true;
1362 }
1363 
1364 static void speculateSelectInstLoads(SelectInst &SI) {
1365   LLVM_DEBUG(dbgs() << "    original: " << SI << "\n");
1366 
1367  IRBuilderTy IRB(&SI);
1368  Value *TV = SI.getTrueValue();
1369  Value *FV = SI.getFalseValue();
1370  // Replace the loads of the select with a select of two loads.
1371  while (!SI.use_empty()) {
1372  LoadInst *LI;
1373  BitCastInst *BC = dyn_cast<BitCastInst>(SI.user_back());
1374  if (BC) {
1375  assert(BC->hasOneUse() && "Bitcast should have a single use.");
1376  LI = cast<LoadInst>(BC->user_back());
1377  } else {
1378  LI = cast<LoadInst>(SI.user_back());
1379  }
1380 
1381  assert(LI->isSimple() && "We only speculate simple loads");
1382 
1383  IRB.SetInsertPoint(LI);
1384  Value *NewTV =
1385  BC ? IRB.CreateBitCast(TV, BC->getType(), TV->getName() + ".sroa.cast")
1386  : TV;
1387  Value *NewFV =
1388  BC ? IRB.CreateBitCast(FV, BC->getType(), FV->getName() + ".sroa.cast")
1389  : FV;
1390  LoadInst *TL = IRB.CreateLoad(LI->getType(), NewTV,
1391  LI->getName() + ".sroa.speculate.load.true");
1392  LoadInst *FL = IRB.CreateLoad(LI->getType(), NewFV,
1393  LI->getName() + ".sroa.speculate.load.false");
1394  NumLoadsSpeculated += 2;
1395 
1396  // Transfer alignment and AA info if present.
1397  TL->setAlignment(LI->getAlign());
1398  FL->setAlignment(LI->getAlign());
1399 
1400  AAMDNodes Tags = LI->getAAMetadata();
1401  if (Tags) {
1402  TL->setAAMetadata(Tags);
1403  FL->setAAMetadata(Tags);
1404  }
1405 
1406  Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1407  LI->getName() + ".sroa.speculated");
1408 
1409  LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1410  LI->replaceAllUsesWith(V);
1411  LI->eraseFromParent();
1412  if (BC)
1413  BC->eraseFromParent();
1414  }
1415  SI.eraseFromParent();
1416 }
1417 
1418 /// Build a GEP out of a base pointer and indices.
1419 ///
1420 /// This will return the BasePtr if that is valid, or build a new GEP
1421 /// instruction using the IRBuilder if GEP-ing is needed.
1422 static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
1423  SmallVectorImpl<Value *> &Indices,
1424  const Twine &NamePrefix) {
1425  if (Indices.empty())
1426  return BasePtr;
1427 
1428  // A single zero index is a no-op, so check for this and avoid building a GEP
1429  // in that case.
1430  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
1431  return BasePtr;
1432 
1433  return IRB.CreateInBoundsGEP(BasePtr->getType()->getPointerElementType(),
1434  BasePtr, Indices, NamePrefix + "sroa_idx");
1435 }
1436 
1437 /// Get a natural GEP off of the BasePtr walking through Ty toward
1438 /// TargetTy without changing the offset of the pointer.
1439 ///
1440 /// This routine assumes we've already established a properly offset GEP with
1441 /// Indices, and arrived at the Ty type. The goal is to continue to GEP with
1442 /// zero-indices down through type layers until we find one the same as
1443 /// TargetTy. If we can't find one with the same type, we at least try to use
1444 /// one with the same size. If none of that works, we just produce the GEP as
1445 /// indicated by Indices to have the correct offset.
1446 static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
1447  Value *BasePtr, Type *Ty, Type *TargetTy,
1448  SmallVectorImpl<Value *> &Indices,
1449  const Twine &NamePrefix) {
1450  if (Ty == TargetTy)
1451  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
1452 
1453  // Offset size to use for the indices.
1454  unsigned OffsetSize = DL.getIndexTypeSizeInBits(BasePtr->getType());
1455 
1456  // See if we can descend into a struct and locate a field with the correct
1457  // type.
1458  unsigned NumLayers = 0;
1459  Type *ElementTy = Ty;
1460  do {
1461  if (ElementTy->isPointerTy())
1462  break;
1463 
1464  if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
1465  ElementTy = ArrayTy->getElementType();
1466  Indices.push_back(IRB.getIntN(OffsetSize, 0));
1467  } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
1468  ElementTy = VectorTy->getElementType();
1469  Indices.push_back(IRB.getInt32(0));
1470  } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
1471  if (STy->element_begin() == STy->element_end())
1472  break; // Nothing left to descend into.
1473  ElementTy = *STy->element_begin();
1474  Indices.push_back(IRB.getInt32(0));
1475  } else {
1476  break;
1477  }
1478  ++NumLayers;
1479  } while (ElementTy != TargetTy);
1480  if (ElementTy != TargetTy)
1481  Indices.erase(Indices.end() - NumLayers, Indices.end());
1482 
1483  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
1484 }
1485 
1486 /// Get a natural GEP from a base pointer to a particular offset and
1487 /// resulting in a particular type.
1488 ///
1489 /// The goal is to produce a "natural" looking GEP that works with the existing
1490 /// composite types to arrive at the appropriate offset and element type for
1491 /// a pointer. TargetTy is the element type the returned GEP should point-to if
1492 /// possible. We recurse by decreasing Offset, adding the appropriate index to
1493 /// Indices, and setting Ty to the result subtype.
1494 ///
1495 /// If no natural GEP can be constructed, this function returns null.
1496 static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
1497  Value *Ptr, APInt Offset, Type *TargetTy,
1498  SmallVectorImpl<Value *> &Indices,
1499  const Twine &NamePrefix) {
1500  PointerType *Ty = cast<PointerType>(Ptr->getType());
1501 
1502  // Don't consider any GEPs through an i8* as natural unless the TargetTy is
1503  // an i8.
1504  if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
1505  return nullptr;
1506 
1507  Type *ElementTy = Ty->getElementType();
1508  if (!ElementTy->isSized())
1509  return nullptr; // We can't GEP through an unsized element.
1510 
1511  SmallVector<APInt> IntIndices = DL.getGEPIndicesForOffset(ElementTy, Offset);
1512  if (Offset != 0)
1513  return nullptr;
1514 
1515  for (const APInt &Index : IntIndices)
1516  Indices.push_back(IRB.getInt(Index));
1517  return getNaturalGEPWithType(IRB, DL, Ptr, ElementTy, TargetTy, Indices,
1518  NamePrefix);
1519 }
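// For example (illustrative, assuming the usual layout with no padding
// surprises): for a pointer to { i32, [4 x i16] } and a byte offset of 6,
// getGEPIndicesForOffset produces the indices (0, 1, 1), addressing the
// second i16 element of the array member, and getNaturalGEPWithType then
// tries to descend further with zero indices until the element type matches
// TargetTy.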
1520 
1521 /// Compute an adjusted pointer from Ptr by Offset bytes where the
1522 /// resulting pointer has PointerTy.
1523 ///
1524 /// This tries very hard to compute a "natural" GEP which arrives at the offset
1525 /// and produces the pointer type desired. Where it cannot, it will try to use
1526 /// the natural GEP to arrive at the offset and bitcast to the type. Where that
1527 /// fails, it will try to use an existing i8* and GEP to the byte offset and
1528 /// bitcast to the type.
1529 ///
1530 /// The strategy for finding the more natural GEPs is to peel off layers of the
1531 /// pointer, walking back through bit casts and GEPs, searching for a base
1532 /// pointer from which we can compute a natural GEP with the desired
1533 /// properties. The algorithm tries to fold as many constant indices into
1534 /// a single GEP as possible, thus making each GEP more independent of the
1535 /// surrounding code.
1536 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1537  APInt Offset, Type *PointerTy,
1538  const Twine &NamePrefix) {
1539  // Create i8 GEP for opaque pointers.
1540  if (Ptr->getType()->isOpaquePointerTy()) {
1541  if (Offset != 0)
1542  Ptr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(Offset),
1543  NamePrefix + "sroa_idx");
1544  return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1545  NamePrefix + "sroa_cast");
1546  }
1547 
1548  // Even though we don't look through PHI nodes, we could be called on an
1549  // instruction in an unreachable block, which may be on a cycle.
1550  SmallPtrSet<Value *, 4> Visited;
1551  Visited.insert(Ptr);
1552  SmallVector<Value *, 4> Indices;
1553 
1554  // We may end up computing an offset pointer that has the wrong type. If we
1555  // never are able to compute one directly that has the correct type, we'll
1556  // fall back to it, so keep it and the base it was computed from around here.
1557  Value *OffsetPtr = nullptr;
1558  Value *OffsetBasePtr;
1559 
1560  // Remember any i8 pointer we come across to re-use if we need to do a raw
1561  // byte offset.
1562  Value *Int8Ptr = nullptr;
1563  APInt Int8PtrOffset(Offset.getBitWidth(), 0);
1564 
1565  PointerType *TargetPtrTy = cast<PointerType>(PointerTy);
1566  Type *TargetTy = TargetPtrTy->getElementType();
1567 
1568  // `Ptr` (the storage pointer) may be in a different address space than the
1569  // expected `PointerTy` (the pointer to be used); as `addrspacecast` is not
1570  // always a no-op cast, adjust the pointer type based on the original storage pointer.
1571  auto AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
1572  PointerTy = TargetTy->getPointerTo(AS);
1573 
1574  do {
1575  // First fold any existing GEPs into the offset.
1576  while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
1577  APInt GEPOffset(Offset.getBitWidth(), 0);
1578  if (!GEP->accumulateConstantOffset(DL, GEPOffset))
1579  break;
1580  Offset += GEPOffset;
1581  Ptr = GEP->getPointerOperand();
1582  if (!Visited.insert(Ptr).second)
1583  break;
1584  }
1585 
1586  // See if we can perform a natural GEP here.
1587  Indices.clear();
1588  if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
1589  Indices, NamePrefix)) {
1590  // If we have a new natural pointer at the offset, clear out any old
1591  // offset pointer we computed. Unless it is the base pointer or
1592  // a non-instruction, we built a GEP we don't need. Zap it.
1593  if (OffsetPtr && OffsetPtr != OffsetBasePtr)
1594  if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
1595  assert(I->use_empty() && "Built a GEP with uses somehow!");
1596  I->eraseFromParent();
1597  }
1598  OffsetPtr = P;
1599  OffsetBasePtr = Ptr;
1600  // If we also found a pointer of the right type, we're done.
1601  if (P->getType() == PointerTy)
1602  break;
1603  }
1604 
1605  // Stash this pointer if we've found an i8*.
1606  if (Ptr->getType()->isIntegerTy(8)) {
1607  Int8Ptr = Ptr;
1608  Int8PtrOffset = Offset;
1609  }
1610 
1611  // Peel off a layer of the pointer and update the offset appropriately.
1612  if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
1613  Ptr = cast<Operator>(Ptr)->getOperand(0);
1614  } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
1615  if (GA->isInterposable())
1616  break;
1617  Ptr = GA->getAliasee();
1618  } else {
1619  break;
1620  }
1621  assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
1622  } while (Visited.insert(Ptr).second);
1623 
1624  if (!OffsetPtr) {
1625  if (!Int8Ptr) {
1626  Int8Ptr = IRB.CreateBitCast(
1627  Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
1628  NamePrefix + "sroa_raw_cast");
1629  Int8PtrOffset = Offset;
1630  }
1631 
1632  OffsetPtr = Int8PtrOffset == 0
1633  ? Int8Ptr
1634  : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
1635  IRB.getInt(Int8PtrOffset),
1636  NamePrefix + "sroa_raw_idx");
1637  }
1638  Ptr = OffsetPtr;
1639 
1640  // On the off chance we were targeting i8*, guard the bitcast here.
1641  if (cast<PointerType>(Ptr->getType()) != TargetPtrTy) {
1642  Ptr = IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr,
1643  TargetPtrTy,
1644  NamePrefix + "sroa_cast");
1645  }
1646 
1647  return Ptr;
1648 }
1649 
1650 /// Compute the adjusted alignment for a load or store from an offset.
1651 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1652  return commonAlignment(getLoadStoreAlignment(I), Offset);
1653 }
1654 
1655 /// Test whether we can convert a value from the old to the new type.
1656 ///
1657 /// This predicate should be used to guard calls to convertValue in order to
1658 /// ensure that we only try to convert viable values. The strategy is that we
1659 /// will peel off single element struct and array wrappings to get to an
1660 /// underlying value, and convert that value.
1661 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
1662  if (OldTy == NewTy)
1663  return true;
1664 
1665  // For integer types, we can't handle any bit-width differences. This would
1666  // break both vector conversions with extension and introduce endianness
1667  // issues when in conjunction with loads and stores.
1668  if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1669  assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1670  cast<IntegerType>(NewTy)->getBitWidth() &&
1671  "We can't have the same bitwidth for different int types");
1672  return false;
1673  }
1674 
1675  if (DL.getTypeSizeInBits(NewTy).getFixedSize() !=
1676  DL.getTypeSizeInBits(OldTy).getFixedSize())
1677  return false;
1678  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1679  return false;
1680 
1681  // We can convert pointers to integers and vice-versa. Same for vectors
1682  // of pointers and integers.
1683  OldTy = OldTy->getScalarType();
1684  NewTy = NewTy->getScalarType();
1685  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1686  if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1687  unsigned OldAS = OldTy->getPointerAddressSpace();
1688  unsigned NewAS = NewTy->getPointerAddressSpace();
1689  // Convert pointers if they are pointers from the same address space or
1690  // different integral (not non-integral) address spaces with the same
1691  // pointer size.
1692  return OldAS == NewAS ||
1693  (!DL.isNonIntegralAddressSpace(OldAS) &&
1694  !DL.isNonIntegralAddressSpace(NewAS) &&
1695  DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1696  }
1697 
1698  // We can convert integers to integral pointers, but not to non-integral
1699  // pointers.
1700  if (OldTy->isIntegerTy())
1701  return !DL.isNonIntegralPointerType(NewTy);
1702 
1703  // We can convert integral pointers to integers, but non-integral pointers
1704  // need to remain pointers.
1705  if (!DL.isNonIntegralPointerType(OldTy))
1706  return NewTy->isIntegerTy();
1707 
1708  return false;
1709  }
1710 
1711  return true;
1712 }
1713 
1714 /// Generic routine to convert an SSA value to a value of a different
1715 /// type.
1716 ///
1717 /// This will try various different casting techniques, such as bitcasts,
1718 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
1719 /// two types for viability with this routine.
1720 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
1721  Type *NewTy) {
1722  Type *OldTy = V->getType();
1723  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
1724 
1725  if (OldTy == NewTy)
1726  return V;
1727 
1728  assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
1729  "Integer types must be the exact same to convert.");
1730 
1731  // See if we need inttoptr for this type pair. May require additional bitcast.
1732  if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
1733  // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
1734  // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
1735  // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
1736  // Directly handle i64 to i8*
1737  return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
1738  NewTy);
1739  }
1740 
1741  // See if we need ptrtoint for this type pair. May require additional bitcast.
1742  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
1743  // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
1744  // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
1745  // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
1746  // Expand i8* to i64 --> i8* to i64 to i64
1747  return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
1748  NewTy);
1749  }
1750 
1751  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
1752  unsigned OldAS = OldTy->getPointerAddressSpace();
1753  unsigned NewAS = NewTy->getPointerAddressSpace();
1754  // To convert pointers with different address spaces (already checked to be
1755  // convertible, i.e. they have the same pointer size), we cannot use
1756  // `bitcast` (which is restricted to pointers in the same address space) or
1757  // `addrspacecast` (which is not always a no-op cast). Instead, use a pair
1758  // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
1759  // size.
1760  if (OldAS != NewAS) {
1761  assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1762  return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
1763  NewTy);
1764  }
1765  }
1766 
1767  return IRB.CreateBitCast(V, NewTy);
1768 }
1769 
1770 /// Test whether the given slice use can be promoted to a vector.
1771 ///
1772 /// This function is called to test each entry in a partition which is slated
1773 /// for a single slice.
1774 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
1775  VectorType *Ty,
1776  uint64_t ElementSize,
1777  const DataLayout &DL) {
1778  // First validate the slice offsets.
1779  uint64_t BeginOffset =
1780  std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
1781  uint64_t BeginIndex = BeginOffset / ElementSize;
1782  if (BeginIndex * ElementSize != BeginOffset ||
1783  BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
1784  return false;
1785  uint64_t EndOffset =
1786  std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
1787  uint64_t EndIndex = EndOffset / ElementSize;
1788  if (EndIndex * ElementSize != EndOffset ||
1789  EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
1790  return false;
1791 
1792  assert(EndIndex > BeginIndex && "Empty vector!");
1793  uint64_t NumElements = EndIndex - BeginIndex;
1794  Type *SliceTy = (NumElements == 1)
1795  ? Ty->getElementType()
1796  : FixedVectorType::get(Ty->getElementType(), NumElements);
1797 
1798  Type *SplitIntTy =
1799  Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
1800 
1801  Use *U = S.getUse();
1802 
1803  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
1804  if (MI->isVolatile())
1805  return false;
1806  if (!S.isSplittable())
1807  return false; // Skip any unsplittable intrinsics.
1808  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
1809  if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
1810  return false;
1811  } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1812  if (LI->isVolatile())
1813  return false;
1814  Type *LTy = LI->getType();
1815  // Disable vector promotion when there are loads or stores of an FCA.
1816  if (LTy->isStructTy())
1817  return false;
1818  if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
1819  assert(LTy->isIntegerTy());
1820  LTy = SplitIntTy;
1821  }
1822  if (!canConvertValue(DL, SliceTy, LTy))
1823  return false;
1824  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1825  if (SI->isVolatile())
1826  return false;
1827  Type *STy = SI->getValueOperand()->getType();
1828  // Disable vector promotion when there are loads or stores of an FCA.
1829  if (STy->isStructTy())
1830  return false;
1831  if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
1832  assert(STy->isIntegerTy());
1833  STy = SplitIntTy;
1834  }
1835  if (!canConvertValue(DL, STy, SliceTy))
1836  return false;
1837  } else {
1838  return false;
1839  }
1840 
1841  return true;
1842 }
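// A standalone sketch (not part of SROA) of the boundary arithmetic above: a
// slice can be rewritten as whole vector lanes only if its clamped begin and
// end offsets fall exactly on element boundaries inside the candidate vector.
// All quantities are in bytes; NumElements is the vector's lane count.
static bool sketchSliceMapsToLanes(uint64_t Begin, uint64_t End,
                                   uint64_t ElementSize, uint64_t NumElements) {
  if (Begin % ElementSize != 0 || End % ElementSize != 0)
    return false;                          // not lane-aligned
  uint64_t BeginIndex = Begin / ElementSize;
  uint64_t EndIndex = End / ElementSize;
  return BeginIndex < NumElements && EndIndex <= NumElements &&
         EndIndex > BeginIndex;            // non-empty, in-bounds lane range
}
// E.g. a slice covering bytes [4, 12) of a <4 x i32> alloca maps to lanes
// [1, 3); a slice covering [2, 6) does not.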
1843 
1844 /// Test whether the given alloca partitioning and range of slices can be
1845 /// promoted to a vector.
1846 ///
1847 /// This is a quick test to check whether we can rewrite a particular alloca
1848 /// partition (and its newly formed alloca) into a vector alloca with only
1849 /// whole-vector loads and stores such that it could be promoted to a vector
1850 /// SSA value. We only can ensure this for a limited set of operations, and we
1851 /// don't want to do the rewrites unless we are confident that the result will
1852 /// be promotable, so we have an early test here.
1853 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
1854  // Collect the candidate types for vector-based promotion. Also track whether
1855  // we have different element types.
1856  SmallVector<VectorType *, 4> CandidateTys;
1857  Type *CommonEltTy = nullptr;
1858  bool HaveCommonEltTy = true;
1859  auto CheckCandidateType = [&](Type *Ty) {
1860  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1861  // Bail out if this vector type's total size in bits differs from the first candidate's.
1862  if (!CandidateTys.empty()) {
1863  VectorType *V = CandidateTys[0];
1864  if (DL.getTypeSizeInBits(VTy).getFixedSize() !=
1865  DL.getTypeSizeInBits(V).getFixedSize()) {
1866  CandidateTys.clear();
1867  return;
1868  }
1869  }
1870  CandidateTys.push_back(VTy);
1871  if (!CommonEltTy)
1872  CommonEltTy = VTy->getElementType();
1873  else if (CommonEltTy != VTy->getElementType())
1874  HaveCommonEltTy = false;
1875  }
1876  };
1877  // Consider any loads or stores that are the exact size of the slice.
1878  for (const Slice &S : P)
1879  if (S.beginOffset() == P.beginOffset() &&
1880  S.endOffset() == P.endOffset()) {
1881  if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
1882  CheckCandidateType(LI->getType());
1883  else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
1884  CheckCandidateType(SI->getValueOperand()->getType());
1885  }
1886 
1887  // If we didn't find a vector type, nothing to do here.
1888  if (CandidateTys.empty())
1889  return nullptr;
1890 
1891  // Remove non-integer vector types if we had multiple common element types.
1892  // FIXME: It'd be nice to replace them with integer vector types, but we can't
1893  // do that until all the backends are known to produce good code for all
1894  // integer vector types.
1895  if (!HaveCommonEltTy) {
1896  llvm::erase_if(CandidateTys, [](VectorType *VTy) {
1897  return !VTy->getElementType()->isIntegerTy();
1898  });
1899 
1900  // If there were no integer vector types, give up.
1901  if (CandidateTys.empty())
1902  return nullptr;
1903 
1904  // Rank the remaining candidate vector types. This is easy because we know
1905  // they're all integer vectors. We sort by ascending number of elements.
1906  auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
1907  (void)DL;
1908  assert(DL.getTypeSizeInBits(RHSTy).getFixedSize() ==
1909  DL.getTypeSizeInBits(LHSTy).getFixedSize() &&
1910  "Cannot have vector types of different sizes!");
1911  assert(RHSTy->getElementType()->isIntegerTy() &&
1912  "All non-integer types eliminated!");
1913  assert(LHSTy->getElementType()->isIntegerTy() &&
1914  "All non-integer types eliminated!");
1915  return cast<FixedVectorType>(RHSTy)->getNumElements() <
1916  cast<FixedVectorType>(LHSTy)->getNumElements();
1917  };
1918  llvm::sort(CandidateTys, RankVectorTypes);
1919  CandidateTys.erase(
1920  std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
1921  CandidateTys.end());
1922  } else {
1923 // The only way to have the same element type in every vector type is to
1924 // have the same vector type. Check that and remove all but one.
1925 #ifndef NDEBUG
1926  for (VectorType *VTy : CandidateTys) {
1927  assert(VTy->getElementType() == CommonEltTy &&
1928  "Unaccounted for element type!");
1929  assert(VTy == CandidateTys[0] &&
1930  "Different vector types with the same element type!");
1931  }
1932 #endif
1933  CandidateTys.resize(1);
1934  }
1935 
1936  // Try each vector type, and return the one which works.
1937  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
1938  uint64_t ElementSize =
1939  DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
1940 
1941  // While the definition of LLVM vectors is bitpacked, we don't support sizes
1942  // that aren't byte sized.
1943  if (ElementSize % 8)
1944  return false;
1945  assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
1946  "vector size not a multiple of element size?");
1947  ElementSize /= 8;
1948 
1949  for (const Slice &S : P)
1950  if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
1951  return false;
1952 
1953  for (const Slice *S : P.splitSliceTails())
1954  if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
1955  return false;
1956 
1957  return true;
1958  };
1959  for (VectorType *VTy : CandidateTys)
1960  if (CheckVectorTypeForPromotion(VTy))
1961  return VTy;
1962 
1963  return nullptr;
1964 }
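// Illustrative only: the ranking above, modeled on plain lane counts instead
// of VectorType pointers. Candidates are sorted by ascending lane count and
// deduplicated; the first one whose lanes line up with every slice wins.
static std::vector<unsigned> sketchRankLaneCounts(std::vector<unsigned> Lanes) {
  llvm::sort(Lanes);
  Lanes.erase(std::unique(Lanes.begin(), Lanes.end()), Lanes.end());
  return Lanes; // e.g. {4, 2, 4, 8} becomes {2, 4, 8}
}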
1965 
1966 /// Test whether a slice of an alloca is valid for integer widening.
1967 ///
1968 /// This implements the necessary checking for the \c isIntegerWideningViable
1969 /// test below on a single slice of the alloca.
1970 static bool isIntegerWideningViableForSlice(const Slice &S,
1971  uint64_t AllocBeginOffset,
1972  Type *AllocaTy,
1973  const DataLayout &DL,
1974  bool &WholeAllocaOp) {
1975  uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
1976 
1977  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
1978  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
1979 
1980  // We can't reasonably handle cases where the load or store extends past
1981  // the end of the alloca's type and into its padding.
1982  if (RelEnd > Size)
1983  return false;
1984 
1985  Use *U = S.getUse();
1986 
1987  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1988  if (LI->isVolatile())
1989  return false;
1990  // We can't handle loads that extend past the allocated memory.
1991  if (DL.getTypeStoreSize(LI->getType()).getFixedSize() > Size)
1992  return false;
1993  // So far, AllocaSliceRewriter does not support widening split slice tails
1994  // in rewriteIntegerLoad.
1995  if (S.beginOffset() < AllocBeginOffset)
1996  return false;
1997  // Note that we don't count vector loads or stores as whole-alloca
1998  // operations which enable integer widening because we would prefer to use
1999  // vector widening instead.
2000  if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2001  WholeAllocaOp = true;
2002  if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2003  if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
2004  return false;
2005  } else if (RelBegin != 0 || RelEnd != Size ||
2006  !canConvertValue(DL, AllocaTy, LI->getType())) {
2007  // Non-integer loads need to be convertible from the alloca type so that
2008  // they are promotable.
2009  return false;
2010  }
2011  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2012  Type *ValueTy = SI->getValueOperand()->getType();
2013  if (SI->isVolatile())
2014  return false;
2015  // We can't handle stores that extend past the allocated memory.
2016  if (DL.getTypeStoreSize(ValueTy).getFixedSize() > Size)
2017  return false;
2018  // So far, AllocaSliceRewriter does not support widening split slice tails
2019  // in rewriteIntegerStore.
2020  if (S.beginOffset() < AllocBeginOffset)
2021  return false;
2022  // Note that we don't count vector loads or stores as whole-alloca
2023  // operations which enable integer widening because we would prefer to use
2024  // vector widening instead.
2025  if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2026  WholeAllocaOp = true;
2027  if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2028  if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
2029  return false;
2030  } else if (RelBegin != 0 || RelEnd != Size ||
2031  !canConvertValue(DL, ValueTy, AllocaTy)) {
2032  // Non-integer stores need to be convertible to the alloca type so that
2033  // they are promotable.
2034  return false;
2035  }
2036  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2037  if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2038  return false;
2039  if (!S.isSplittable())
2040  return false; // Skip any unsplittable intrinsics.
2041  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2042  if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2043  return false;
2044  } else {
2045  return false;
2046  }
2047 
2048  return true;
2049 }
2050 
2051 /// Test whether the given alloca partition's integer operations can be
2052 /// widened to promotable ones.
2053 ///
2054 /// This is a quick test to check whether we can rewrite the integer loads and
2055 /// stores to a particular alloca into wider loads and stores and be able to
2056 /// promote the resulting alloca.
2057 static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2058  const DataLayout &DL) {
2059  uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
2060  // Don't create integer types larger than the maximum bitwidth.
2061  if (SizeInBits > IntegerType::MAX_INT_BITS)
2062  return false;
2063 
2064  // Don't try to handle allocas with bit-padding.
2065  if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedSize())
2066  return false;
2067 
2068  // We need to ensure that an integer type with the appropriate bitwidth can
2069  // be converted to the alloca type, whatever that is. We don't want to force
2070  // the alloca itself to have an integer type if there is a more suitable one.
2071  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2072  if (!canConvertValue(DL, AllocaTy, IntTy) ||
2073  !canConvertValue(DL, IntTy, AllocaTy))
2074  return false;
2075 
2076  // While examining uses, we ensure that the alloca has a covering load or
2077  // store. We don't want to widen the integer operations only to fail to
2078  // promote due to some other unsplittable entry (which we may make splittable
2079  // later). However, if there are only splittable uses, go ahead and assume
2080  // that we cover the alloca.
2081  // FIXME: We shouldn't consider split slices that happen to start in the
2082  // partition here...
2083  bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2084 
2085  for (const Slice &S : P)
2086  if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2087  WholeAllocaOp))
2088  return false;
2089 
2090  for (const Slice *S : P.splitSliceTails())
2091  if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2092  WholeAllocaOp))
2093  return false;
2094 
2095  return WholeAllocaOp;
2096 }
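// Illustrative numbers for the two size gates above, modeled without
// DataLayout: the alloca's type size in bits must not exceed the maximum
// integer width (passed in here; the real code uses IntegerType::MAX_INT_BITS)
// and must equal its store size, i.e. no bit padding. An i1 alloca (type size
// 1 bit, store size 8 bits) fails the second gate; an i32 (32 vs. 32) passes.
static bool sketchSizeGates(uint64_t TypeSizeInBits, uint64_t StoreSizeInBits,
                            uint64_t MaxIntBits) {
  return TypeSizeInBits <= MaxIntBits && TypeSizeInBits == StoreSizeInBits;
}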
2097 
2098 static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2099  IntegerType *Ty, uint64_t Offset,
2100  const Twine &Name) {
2101  LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2102  IntegerType *IntTy = cast<IntegerType>(V->getType());
2103  assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
2104  DL.getTypeStoreSize(IntTy).getFixedSize() &&
2105  "Element extends past full value");
2106  uint64_t ShAmt = 8 * Offset;
2107  if (DL.isBigEndian())
2108  ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
2109  DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
2110  if (ShAmt) {
2111  V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2112  LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2113  }
2114  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2115  "Cannot extract to a larger integer!");
2116  if (Ty != IntTy) {
2117  V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2118  LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2119  }
2120  return V;
2121 }
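// A plain-integer model (not the IR-building code above) of the same
// extraction for IntTy = i64 and Ty = i16, with Offset in bytes. It mirrors
// the little/big-endian shift-amount selection followed by lshr + trunc.
static uint16_t sketchExtractI16(uint64_t Whole, uint64_t Offset,
                                 bool BigEndian) {
  uint64_t ShAmt = BigEndian ? 8 * (sizeof(uint64_t) - sizeof(uint16_t) - Offset)
                             : 8 * Offset;
  return static_cast<uint16_t>(Whole >> ShAmt); // lshr then trunc
}
// E.g. sketchExtractI16(0x1122334455667788, /*Offset=*/2, /*BigEndian=*/false)
// returns 0x5566, while the big-endian variant returns 0x3344.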
2122 
2123 static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2124  Value *V, uint64_t Offset, const Twine &Name) {
2125  IntegerType *IntTy = cast<IntegerType>(Old->getType());
2126  IntegerType *Ty = cast<IntegerType>(V->getType());
2127  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2128  "Cannot insert a larger integer!");
2129  LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2130  if (Ty != IntTy) {
2131  V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2132  LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2133  }
2134  assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
2135  DL.getTypeStoreSize(IntTy).getFixedSize() &&
2136  "Element store outside of alloca store");
2137  uint64_t ShAmt = 8 * Offset;
2138  if (DL.isBigEndian())
2139  ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
2140  DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
2141  if (ShAmt) {
2142  V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2143  LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2144  }
2145 
2146  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2147  APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2148  Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2149  LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2150  V = IRB.CreateOr(Old, V, Name + ".insert");
2151  LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2152  }
2153  return V;
2154 }
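// The matching plain-integer model for the insertion above (i16 into i64):
// zero-extend, shift into place, clear the destination bits with a mask, and
// OR the value in. Offsets are in bytes; illustrative only.
static uint64_t sketchInsertI16(uint64_t Old, uint16_t V, uint64_t Offset,
                                bool BigEndian) {
  uint64_t ShAmt = BigEndian ? 8 * (sizeof(uint64_t) - sizeof(uint16_t) - Offset)
                             : 8 * Offset;
  uint64_t Mask = ~(uint64_t(0xFFFF) << ShAmt); // zeroes where V will live
  return (Old & Mask) | (uint64_t(V) << ShAmt); // and + or, as above
}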
2155 
2156 static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2157  unsigned EndIndex, const Twine &Name) {
2158  auto *VecTy = cast<FixedVectorType>(V->getType());
2159  unsigned NumElements = EndIndex - BeginIndex;
2160  assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2161 
2162  if (NumElements == VecTy->getNumElements())
2163  return V;
2164 
2165  if (NumElements == 1) {
2166  V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2167  Name + ".extract");
2168  LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2169  return V;
2170  }
2171 
2172  SmallVector<int, 8> Mask;
2173  Mask.reserve(NumElements);
2174  for (unsigned i = BeginIndex; i != EndIndex; ++i)
2175  Mask.push_back(i);
2176  V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2177  LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2178  return V;
2179 }
2180 
2181 static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2182  unsigned BeginIndex, const Twine &Name) {
2183  VectorType *VecTy = cast<VectorType>(Old->getType());
2184  assert(VecTy && "Can only insert a vector into a vector");
2185 
2186  VectorType *Ty = dyn_cast<VectorType>(V->getType());
2187  if (!Ty) {
2188  // Single element to insert.
2189  V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2190  Name + ".insert");
2191  LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2192  return V;
2193  }
2194 
2195  assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2196  cast<FixedVectorType>(VecTy)->getNumElements() &&
2197  "Too many elements!");
2198  if (cast<FixedVectorType>(Ty)->getNumElements() ==
2199  cast<FixedVectorType>(VecTy)->getNumElements()) {
2200  assert(V->getType() == VecTy && "Vector type mismatch");
2201  return V;
2202  }
2203  unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2204 
2205  // When inserting a smaller vector into the larger one to store, we first
2206  // use a shuffle vector to widen it with undef elements, and then
2207  // a second shuffle vector to select between the loaded vector and the
2208  // incoming vector.
2209  SmallVector<int, 8> Mask;
2210  Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2211  for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2212  if (i >= BeginIndex && i < EndIndex)
2213  Mask.push_back(i - BeginIndex);
2214  else
2215  Mask.push_back(-1);
2216  V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2217  LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2218 
2219  SmallVector<Constant *, 8> Mask2;
2220  Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2221  for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2222  Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2223 
2224  V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
2225 
2226  LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2227  return V;
2228 }
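// A standalone sketch of the two masks built above when inserting a
// SmallWidth-lane vector into a VecWidth-lane vector at lane Begin: the first
// widens the small vector with undef (-1) lanes, the second selects, per
// lane, between the widened value and the old vector.
static std::pair<std::vector<int>, std::vector<bool>>
sketchInsertMasks(unsigned VecWidth, unsigned SmallWidth, unsigned Begin) {
  std::vector<int> Expand;
  std::vector<bool> Select;
  for (unsigned i = 0; i != VecWidth; ++i) {
    bool InRange = i >= Begin && i < Begin + SmallWidth;
    Expand.push_back(InRange ? int(i - Begin) : -1);
    Select.push_back(InRange);
  }
  return {Expand, Select};
}
// E.g. VecWidth 4, SmallWidth 2, Begin 1 gives {-1, 0, 1, -1} and
// {false, true, true, false}.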
2229 
2230 /// Visitor to rewrite instructions using a particular slice of an alloca
2231 /// to use a new alloca.
2232 ///
2233 /// Also implements the rewriting to vector-based accesses when the partition
2234 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2235 /// lives here.
2236 class llvm::sroa::AllocaSliceRewriter
2237  : public InstVisitor<AllocaSliceRewriter, bool> {
2238  // Befriend the base class so it can delegate to private visit methods.
2239  friend class InstVisitor<AllocaSliceRewriter, bool>;
2240 
2241  using Base = InstVisitor<AllocaSliceRewriter, bool>;
2242 
2243  const DataLayout &DL;
2244  AllocaSlices &AS;
2245  SROA &Pass;
2246  AllocaInst &OldAI, &NewAI;
2247  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2248  Type *NewAllocaTy;
2249 
2250  // This is a convenience and flag variable that will be null unless the new
2251  // alloca's integer operations should be widened to this integer type due to
2252  // passing isIntegerWideningViable above. If it is non-null, the desired
2253  // integer type will be stored here for easy access during rewriting.
2254  IntegerType *IntTy;
2255 
2256  // If we are rewriting an alloca partition which can be written as pure
2257  // vector operations, we stash extra information here. When VecTy is
2258  // non-null, we have some strict guarantees about the rewritten alloca:
2259  // - The new alloca is exactly the size of the vector type here.
2260  // - The accesses all either map to the entire vector or to a single
2261  // element.
2262  // - The set of accessing instructions is only one of those handled above
2263  // in isVectorPromotionViable. Generally these are the same access kinds
2264  // which are promotable via mem2reg.
2265  VectorType *VecTy;
2266  Type *ElementTy;
2267  uint64_t ElementSize;
2268 
2269  // The original offset of the slice currently being rewritten relative to
2270  // the original alloca.
2271  uint64_t BeginOffset = 0;
2272  uint64_t EndOffset = 0;
2273 
2274  // The new offsets of the slice currently being rewritten relative to the
2275  // original alloca.
2276  uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2277 
2278  uint64_t SliceSize = 0;
2279  bool IsSplittable = false;
2280  bool IsSplit = false;
2281  Use *OldUse = nullptr;
2282  Instruction *OldPtr = nullptr;
2283 
2284  // Track post-rewrite users which are PHI nodes and Selects.
2285  SmallSetVector<PHINode *, 8> &PHIUsers;
2286  SmallSetVector<SelectInst *, 8> &SelectUsers;
2287 
2288  // Utility IR builder, whose name prefix is setup for each visited use, and
2289  // the insertion point is set to point to the user.
2290  IRBuilderTy IRB;
2291 
2292 public:
2293  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2294  AllocaInst &OldAI, AllocaInst &NewAI,
2295  uint64_t NewAllocaBeginOffset,
2296  uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2297  VectorType *PromotableVecTy,
2298  SmallSetVector<PHINode *, 8> &PHIUsers,
2299  SmallSetVector<SelectInst *, 8> &SelectUsers)
2300  : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2301  NewAllocaBeginOffset(NewAllocaBeginOffset),
2302  NewAllocaEndOffset(NewAllocaEndOffset),
2303  NewAllocaTy(NewAI.getAllocatedType()),
2304  IntTy(
2305  IsIntegerPromotable
2306  ? Type::getIntNTy(NewAI.getContext(),
2307  DL.getTypeSizeInBits(NewAI.getAllocatedType())
2308  .getFixedSize())
2309  : nullptr),
2310  VecTy(PromotableVecTy),
2311  ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2312  ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8
2313  : 0),
2314  PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2315  IRB(NewAI.getContext(), ConstantFolder()) {
2316  if (VecTy) {
2317  assert((DL.getTypeSizeInBits(ElementTy).getFixedSize() % 8) == 0 &&
2318  "Only multiple-of-8 sized vector elements are viable");
2319  ++NumVectorized;
2320  }
2321  assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2322  }
2323 
2324  bool visit(AllocaSlices::const_iterator I) {
2325  bool CanSROA = true;
2326  BeginOffset = I->beginOffset();
2327  EndOffset = I->endOffset();
2328  IsSplittable = I->isSplittable();
2329  IsSplit =
2330  BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2331  LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2332  LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2333  LLVM_DEBUG(dbgs() << "\n");
2334 
2335  // Compute the intersecting offset range.
2336  assert(BeginOffset < NewAllocaEndOffset);
2337  assert(EndOffset > NewAllocaBeginOffset);
2338  NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2339  NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2340 
2341  SliceSize = NewEndOffset - NewBeginOffset;
2342 
2343  OldUse = I->getUse();
2344  OldPtr = cast<Instruction>(OldUse->get());
2345 
2346  Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2347  IRB.SetInsertPoint(OldUserI);
2348  IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2349  IRB.getInserter().SetNamePrefix(
2350  Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
2351 
2352  CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2353  if (VecTy || IntTy)
2354  assert(CanSROA);
2355  return CanSROA;
2356  }
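  // A standalone sketch (illustrative, not used by the rewriter) of the
  // offset intersection computed above: clamp the slice to the new alloca's
  // range and note whether it is split. E.g. a slice [8, 24) against a new
  // alloca covering [16, 32) yields the range [16, 24), size 8, split = true.
  struct SketchRange { uint64_t Begin, End; bool IsSplit; };
  static SketchRange sketchIntersect(uint64_t Begin, uint64_t End,
                                     uint64_t AllocaBegin, uint64_t AllocaEnd) {
    SketchRange R;
    R.Begin = std::max(Begin, AllocaBegin);
    R.End = std::min(End, AllocaEnd);
    R.IsSplit = Begin < AllocaBegin || End > AllocaEnd;
    return R;
  }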
2357 
2358 private:
2359  // Make sure the other visit overloads are visible.
2360  using Base::visit;
2361 
2362  // Every instruction which can end up as a user must have a rewrite rule.
2363  bool visitInstruction(Instruction &I) {
2364  LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
2365  llvm_unreachable("No rewrite rule for this instruction!");
2366  }
2367 
2368  Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
2369  // Note that the offset computation can use BeginOffset or NewBeginOffset
2370  // interchangeably for unsplit slices.
2371  assert(IsSplit || BeginOffset == NewBeginOffset);
2372  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2373 
2374 #ifndef NDEBUG
2375  StringRef OldName = OldPtr->getName();
2376  // Skip through the last '.sroa.' component of the name.
2377  size_t LastSROAPrefix = OldName.rfind(".sroa.");
2378  if (LastSROAPrefix != StringRef::npos) {
2379  OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
2380  // Look for an SROA slice index.
2381  size_t IndexEnd = OldName.find_first_not_of("0123456789");
2382  if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
2383  // Strip the index and look for the offset.
2384  OldName = OldName.substr(IndexEnd + 1);
2385  size_t OffsetEnd = OldName.find_first_not_of("0123456789");
2386  if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
2387  // Strip the offset.
2388  OldName = OldName.substr(OffsetEnd + 1);
2389  }
2390  }
2391  // Strip any SROA suffixes as well.
2392  OldName = OldName.substr(0, OldName.find(".sroa_"));
2393 #endif
2394 
2395  return getAdjustedPtr(IRB, DL, &NewAI,
2396  APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
2397  PointerTy,
2398 #ifndef NDEBUG
2399  Twine(OldName) + "."
2400 #else
2401  Twine()
2402 #endif
2403  );
2404  }
2405 
2406  /// Compute suitable alignment to access this slice of the *new*
2407  /// alloca.
2408  ///
2409  /// The result is the common alignment of the new alloca's alignment and the
2410  /// slice's byte offset within it.
2411  Align getSliceAlign() {
2412  return commonAlignment(NewAI.getAlign(),
2413  NewBeginOffset - NewAllocaBeginOffset);
2414  }
2415 
2416  unsigned getIndex(uint64_t Offset) {
2417  assert(VecTy && "Can only call getIndex when rewriting a vector");
2418  uint64_t RelOffset = Offset - NewAllocaBeginOffset;
2419  assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
2420  uint32_t Index = RelOffset / ElementSize;
2421  assert(Index * ElementSize == RelOffset);
2422  return Index;
2423  }
2424 
2425  void deleteIfTriviallyDead(Value *V) {
2426  Instruction *I = cast<Instruction>(V);
2427  if (isInstructionTriviallyDead(I))
2428  Pass.DeadInsts.push_back(I);
2429  }
2430 
2431  Value *rewriteVectorizedLoadInst(LoadInst &LI) {
2432  unsigned BeginIndex = getIndex(NewBeginOffset);
2433  unsigned EndIndex = getIndex(NewEndOffset);
2434  assert(EndIndex > BeginIndex && "Empty vector!");
2435 
2436  LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2437  NewAI.getAlign(), "load");
2438 
2439  Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
2440  LLVMContext::MD_access_group});
2441  return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
2442  }
2443 
2444  Value *rewriteIntegerLoad(LoadInst &LI) {
2445  assert(IntTy && "We cannot insert an integer to the alloca");
2446  assert(!LI.isVolatile());
2447  Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2448  NewAI.getAlign(), "load");
2449  V = convertValue(DL, IRB, V, IntTy);
2450  assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2451  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2452  if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
2453  IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
2454  V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
2455  }
2456  // It is possible that the extracted type is not the load type. This
2457  // happens if there is a load past the end of the alloca, and as
2458  // a consequence the slice is narrower but still a candidate for integer
2459  // lowering. To handle this case, we just zero extend the extracted
2460  // integer.
2461  assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
2462  "Can only handle an extract for an overly wide load");
2463  if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
2464  V = IRB.CreateZExt(V, LI.getType());
2465  return V;
2466  }
2467 
2468  bool visitLoadInst(LoadInst &LI) {
2469  LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
2470  Value *OldOp = LI.getOperand(0);
2471  assert(OldOp == OldPtr);
2472 
2473  AAMDNodes AATags = LI.getAAMetadata();
2474 
2475  unsigned AS = LI.getPointerAddressSpace();
2476 
2477  Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
2478  : LI.getType();
2479  const bool IsLoadPastEnd =
2480  DL.getTypeStoreSize(TargetTy).getFixedSize() > SliceSize;
2481  bool IsPtrAdjusted = false;
2482  Value *V;
2483  if (VecTy) {
2484  V = rewriteVectorizedLoadInst(LI);
2485  } else if (IntTy && LI.getType()->isIntegerTy()) {
2486  V = rewriteIntegerLoad(LI);
2487  } else if (NewBeginOffset == NewAllocaBeginOffset &&
2488  NewEndOffset == NewAllocaEndOffset &&
2489  (canConvertValue(DL, NewAllocaTy, TargetTy) ||
2490  (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
2491  TargetTy->isIntegerTy()))) {
2492  LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2493  NewAI.getAlign(), LI.isVolatile(),
2494  LI.getName());
2495  if (AATags)
2496  NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2497  if (LI.isVolatile())
2498  NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2499  if (NewLI->isAtomic())
2500  NewLI->setAlignment(LI.getAlign());
2501 
2502  // Any !nonnull metadata or !range metadata on the old load is also valid
2503  // on the new load. This is true in some cases even when the loads
2504  // are different types, for example by mapping !nonnull metadata to
2505  // !range metadata by modeling the null pointer constant converted to the
2506  // integer type.
2507  // FIXME: Add support for range metadata here. Currently the utilities
2508  // for this don't propagate range metadata in trivial cases from one
2509  // integer load to another, don't handle non-addrspace-0 null pointers
2510  // correctly, and don't have any support for mapping ranges as the
2511  // integer type becomes wider or narrower.
2512  if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
2513  copyNonnullMetadata(LI, N, *NewLI);
2514 
2515  // Try to preserve nonnull metadata
2516  V = NewLI;
2517 
2518  // If this is an integer load past the end of the slice (which means the
2519  // bytes outside the slice are undef or this load is dead) just forcibly
2520  // fix the integer size with correct handling of endianness.
2521  if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
2522  if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
2523  if (AITy->getBitWidth() < TITy->getBitWidth()) {
2524  V = IRB.CreateZExt(V, TITy, "load.ext");
2525  if (DL.isBigEndian())
2526  V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
2527  "endian_shift");
2528  }
2529  } else {
2530  Type *LTy = TargetTy->getPointerTo(AS);
2531  LoadInst *NewLI =
2532  IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
2533  getSliceAlign(), LI.isVolatile(), LI.getName());
2534  if (AATags)
2535  NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2536  if (LI.isVolatile())
2537  NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2538  NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
2539  LLVMContext::MD_access_group});
2540 
2541  V = NewLI;
2542  IsPtrAdjusted = true;
2543  }
2544  V = convertValue(DL, IRB, V, TargetTy);
2545 
2546  if (IsSplit) {
2547  assert(!LI.isVolatile());
2548  assert(LI.getType()->isIntegerTy() &&
2549  "Only integer type loads and stores are split");
2550  assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedSize() &&
2551  "Split load isn't smaller than original load");
2552  assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
2553  "Non-byte-multiple bit width");
2554  // Move the insertion point just past the load so that we can refer to it.
2555  IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
2556  // Create a placeholder value with the same type as LI to use as the
2557  // basis for the new value. This allows us to replace the uses of LI with
2558  // the computed value, and then replace the placeholder with LI, leaving
2559  // LI only used for this computation.
2560  Value *Placeholder = new LoadInst(
2561  LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)), "",
2562  false, Align(1));
2563  V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
2564  "insert");
2565  LI.replaceAllUsesWith(V);
2566  Placeholder->replaceAllUsesWith(&LI);
2567  Placeholder->deleteValue();
2568  } else {
2569  LI.replaceAllUsesWith(V);
2570  }
2571 
2572  Pass.DeadInsts.push_back(&LI);
2573  deleteIfTriviallyDead(OldOp);
2574  LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
2575  return !LI.isVolatile() && !IsPtrAdjusted;
2576  }
2577 
2578  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
2579  AAMDNodes AATags) {
2580  if (V->getType() != VecTy) {
2581  unsigned BeginIndex = getIndex(NewBeginOffset);
2582  unsigned EndIndex = getIndex(NewEndOffset);
2583  assert(EndIndex > BeginIndex && "Empty vector!");
2584  unsigned NumElements = EndIndex - BeginIndex;
2585  assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
2586  "Too many elements!");
2587  Type *SliceTy = (NumElements == 1)
2588  ? ElementTy
2589  : FixedVectorType::get(ElementTy, NumElements);
2590  if (V->getType() != SliceTy)
2591  V = convertValue(DL, IRB, V, SliceTy);
2592 
2593  // Mix in the existing elements.
2594  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2595  NewAI.getAlign(), "load");
2596  V = insertVector(IRB, Old, V, BeginIndex, "vec");
2597  }
2598  StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
2599  Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2600  LLVMContext::MD_access_group});
2601  if (AATags)
2602  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2603  Pass.DeadInsts.push_back(&SI);
2604 
2605  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
2606  return true;
2607  }
2608 
2609  bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
2610  assert(IntTy && "We cannot extract an integer from the alloca");
2611  assert(!SI.isVolatile());
2612  if (DL.getTypeSizeInBits(V->getType()).getFixedSize() !=
2613  IntTy->getBitWidth()) {
2614  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2615  NewAI.getAlign(), "oldload");
2616  Old = convertValue(DL, IRB, Old, IntTy);
2617  assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2618  uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
2619  V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
2620  }
2621  V = convertValue(DL, IRB, V, NewAllocaTy);
2622  StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
2623  Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2624  LLVMContext::MD_access_group});
2625  if (AATags)
2626  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2627  Pass.DeadInsts.push_back(&SI);
2628  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
2629  return true;
2630  }
2631 
2632  bool visitStoreInst(StoreInst &SI) {
2633  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
2634  Value *OldOp = SI.getOperand(1);
2635  assert(OldOp == OldPtr);
2636 
2637  AAMDNodes AATags = SI.getAAMetadata();
2638  Value *V = SI.getValueOperand();
2639 
2640  // Strip all inbounds GEPs and pointer casts to try to dig out any root
2641  // alloca that should be re-examined after promoting this alloca.
2642  if (V->getType()->isPointerTy())
2643  if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
2644  Pass.PostPromotionWorklist.insert(AI);
2645 
2646  if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedSize()) {
2647  assert(!SI.isVolatile());
2648  assert(V->getType()->isIntegerTy() &&
2649  "Only integer type loads and stores are split");
2650  assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
2651  "Non-byte-multiple bit width");
2652  IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
2653  V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
2654  "extract");
2655  }
2656 
2657  if (VecTy)
2658  return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
2659  if (IntTy && V->getType()->isIntegerTy())
2660  return rewriteIntegerStore(V, SI, AATags);
2661 
2662  const bool IsStorePastEnd =
2663  DL.getTypeStoreSize(V->getType()).getFixedSize() > SliceSize;
2664  StoreInst *NewSI;
2665  if (NewBeginOffset == NewAllocaBeginOffset &&
2666  NewEndOffset == NewAllocaEndOffset &&
2667  (canConvertValue(DL, V->getType(), NewAllocaTy) ||
2668  (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
2669  V->getType()->isIntegerTy()))) {
2670  // If this is an integer store past the end of the slice (and thus the bytes
2671  // past that point are irrelevant or this is unreachable), truncate the
2672  // value prior to storing.
2673  if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
2674  if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
2675  if (VITy->getBitWidth() > AITy->getBitWidth()) {
2676  if (DL.isBigEndian())
2677  V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
2678  "endian_shift");
2679  V = IRB.CreateTrunc(V, AITy, "load.trunc");
2680  }
2681 
2682  V = convertValue(DL, IRB, V, NewAllocaTy);
2683  NewSI =
2684  IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
2685  } else {
2686  unsigned AS = SI.getPointerAddressSpace();
2687  Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
2688  NewSI =
2689  IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
2690  }
2691  NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2692  LLVMContext::MD_access_group});
2693  if (AATags)
2694  NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2695  if (SI.isVolatile())
2696  NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
2697  if (NewSI->isAtomic())
2698  NewSI->setAlignment(SI.getAlign());
2699  Pass.DeadInsts.push_back(&SI);
2700  deleteIfTriviallyDead(OldOp);
2701 
2702  LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
2703  return NewSI->getPointerOperand() == &NewAI &&
2704  NewSI->getValueOperand()->getType() == NewAllocaTy &&
2705  !SI.isVolatile();
2706  }
2707 
2708  /// Compute an integer value from splatting an i8 across the given
2709  /// number of bytes.
2710  ///
2711  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
2712  /// call this routine.
2713  /// FIXME: Heed the advice above.
2714  ///
2715  /// \param V The i8 value to splat.
2716  /// \param Size The number of bytes in the output (assuming i8 is one byte)
2717  Value *getIntegerSplat(Value *V, unsigned Size) {
2718  assert(Size > 0 && "Expected a positive number of bytes.");
2719  IntegerType *VTy = cast<IntegerType>(V->getType());
2720  assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
2721  if (Size == 1)
2722  return V;
2723 
2724  Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
2725  V = IRB.CreateMul(
2726  IRB.CreateZExt(V, SplatIntTy, "zext"),
2727  ConstantExpr::getUDiv(
2728  Constant::getAllOnesValue(SplatIntTy),
2729  ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
2730  SplatIntTy)),
2731  "isplat");
2732  return V;
2733  }
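  // The multiply trick above, modeled on plain integers (illustrative only):
  // dividing the all-ones value by 0xFF gives the 0x0101...01 pattern, and
  // multiplying the zero-extended byte by it replicates the byte into every
  // byte position. Shown here for Size == 4, i.e. splatting an i8 into an i32.
  static uint32_t sketchSplatByte(uint8_t V) {
    uint32_t Factor = 0xFFFFFFFFu / 0xFFu; // 0x01010101
    return uint32_t(V) * Factor;           // e.g. 0xAB -> 0xABABABAB
  }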
2734 
2735  /// Compute a vector splat for a given element value.
2736  Value *getVectorSplat(Value *V, unsigned NumElements) {
2737  V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
2738  LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
2739  return V;
2740  }
2741 
2742  bool visitMemSetInst(MemSetInst &II) {
2743  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
2744  assert(II.getRawDest() == OldPtr);
2745 
2746  AAMDNodes AATags = II.getAAMetadata();
2747 
2748  // If the memset has a variable size, it cannot be split, just adjust the
2749  // pointer to the new alloca.
2750  if (!isa<ConstantInt>(II.getLength())) {
2751  assert(!IsSplit);
2752  assert(NewBeginOffset == BeginOffset);
2753  II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
2754  II.setDestAlignment(getSliceAlign());
2755 
2756  deleteIfTriviallyDead(OldPtr);
2757  return false;
2758  }
2759 
2760  // Record this instruction for deletion.
2761  Pass.DeadInsts.push_back(&II);
2762 
2763  Type *AllocaTy = NewAI.getAllocatedType();
2764  Type *ScalarTy = AllocaTy->getScalarType();
2765 
2766  const bool CanContinue = [&]() {
2767  if (VecTy || IntTy)
2768  return true;
2769  if (BeginOffset > NewAllocaBeginOffset ||
2770  EndOffset < NewAllocaEndOffset)
2771  return false;
2772  // Length must be in range for FixedVectorType.
2773  auto *C = cast<ConstantInt>(II.getLength());
2774  const uint64_t Len = C->getLimitedValue();
2775  if (Len > std::numeric_limits<unsigned>::max())
2776  return false;
2777  auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
2778  auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
2779  return canConvertValue(DL, SrcTy, AllocaTy) &&
2780  DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
2781  }();
2782 
2783  // If this doesn't map cleanly onto the alloca type, and that type isn't
2784  // a single value type, just emit a memset.
2785  if (!CanContinue) {
2786  Type *SizeTy = II.getLength()->getType();
2787  Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
2788  CallInst *New = IRB.CreateMemSet(
2789  getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
2790  MaybeAlign(getSliceAlign()), II.isVolatile());
2791  if (AATags)
2792  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2793  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
2794  return false;
2795  }
2796 
2797  // If we can represent this as a simple value, we have to build the actual
2798  // value to store, which requires expanding the byte present in memset to
2799  // a sensible representation for the alloca type. This is essentially
2800  // splatting the byte to a sufficiently wide integer, splatting it across
2801  // any desired vector width, and bitcasting to the final type.
2802  Value *V;
2803 
2804  if (VecTy) {
2805  // If this is a memset of a vectorized alloca, insert it.
2806  assert(ElementTy == ScalarTy);
2807 
2808  unsigned BeginIndex = getIndex(NewBeginOffset);
2809  unsigned EndIndex = getIndex(NewEndOffset);
2810  assert(EndIndex > BeginIndex && "Empty vector!");
2811  unsigned NumElements = EndIndex - BeginIndex;
2812  assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
2813  "Too many elements!");
2814 
2815  Value *Splat = getIntegerSplat(
2816  II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
2817  Splat = convertValue(DL, IRB, Splat, ElementTy);
2818  if (NumElements > 1)
2819  Splat = getVectorSplat(Splat, NumElements);
2820 
2821  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2822  NewAI.getAlign(), "oldload");
2823  V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
2824  } else if (IntTy) {
2825  // If this is a memset on an alloca where we can widen stores, insert the
2826  // set integer.
2827  assert(!II.isVolatile());
2828 
2829  uint64_t Size = NewEndOffset - NewBeginOffset;
2830  V = getIntegerSplat(II.getValue(), Size);
2831 
2832  if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
2833  EndOffset != NewAllocaEndOffset)) {
2834  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2835  NewAI.getAlign(), "oldload");
2836  Old = convertValue(DL, IRB, Old, IntTy);
2837  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2838  V = insertInteger(DL, IRB, Old, V, Offset, "insert");
2839  } else {
2840  assert(V->getType() == IntTy &&
2841  "Wrong type for an alloca wide integer!");
2842  }
2843  V = convertValue(DL, IRB, V, AllocaTy);
2844  } else {
2845  // Established these invariants above.
2846  assert(NewBeginOffset == NewAllocaBeginOffset);
2847  assert(NewEndOffset == NewAllocaEndOffset);
2848 
2849  V = getIntegerSplat(II.getValue(),
2850  DL.getTypeSizeInBits(ScalarTy).getFixedSize() / 8);
2851  if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
2852  V = getVectorSplat(
2853  V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
2854 
2855  V = convertValue(DL, IRB, V, AllocaTy);
2856  }
2857 
2858  StoreInst *New =
2859  IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
2860  New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
2861  LLVMContext::MD_access_group});
2862  if (AATags)
2863  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2864  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
2865  return !II.isVolatile();
2866  }
2867 
2868  bool visitMemTransferInst(MemTransferInst &II) {
2869  // Rewriting of memory transfer instructions can be a bit tricky. We break
2870  // them into two categories: split intrinsics and unsplit intrinsics.
2871 
2872  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
2873 
2874  AAMDNodes AATags = II.getAAMetadata();
2875 
2876  bool IsDest = &II.getRawDestUse() == OldUse;
2877  assert((IsDest && II.getRawDest() == OldPtr) ||
2878  (!IsDest && II.getRawSource() == OldPtr));
2879 
2880  MaybeAlign SliceAlign = getSliceAlign();
2881 
2882  // For unsplit intrinsics, we simply modify the source and destination
2883  // pointers in place. This isn't just an optimization, it is a matter of
2884  // correctness. With unsplit intrinsics we may be dealing with transfers
2885  // within a single alloca before SROA ran, or with transfers that have
2886  // a variable length. We may also be dealing with memmove instead of
2887  // memcpy, and so simply updating the pointers is necessary for us to
2888  // update both source and dest of a single call.
2889  if (!IsSplittable) {
2890  Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
2891  if (IsDest) {
2892  II.setDest(AdjustedPtr);
2893  II.setDestAlignment(SliceAlign);
2894  }
2895  else {
2896  II.setSource(AdjustedPtr);
2897  II.setSourceAlignment(SliceAlign);
2898  }
2899 
2900  LLVM_DEBUG(dbgs() << " to: " << II << "\n");
2901  deleteIfTriviallyDead(OldPtr);
2902  return false;
2903  }
2904  // For split transfer intrinsics we have an incredibly useful assurance:
2905  // the source and destination do not reside within the same alloca, and at
2906  // least one of them does not escape. This means that we can replace
2907  // memmove with memcpy, and we don't need to worry about all manner of
2908  // downsides to splitting and transforming the operations.
2909 
2910  // If this doesn't map cleanly onto the alloca type, and that type isn't
2911  // a single value type, just emit a memcpy.
2912  bool EmitMemCpy =
2913  !VecTy && !IntTy &&
2914  (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
2915  SliceSize !=
2916  DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedSize() ||
2917  !NewAI.getAllocatedType()->isSingleValueType());
2918 
2919  // If we're just going to emit a memcpy, the alloca hasn't changed, and the
2920  // size hasn't been shrunk based on analysis of the viable range, this is
2921  // a no-op.
2922  if (EmitMemCpy && &OldAI == &NewAI) {
2923  // Ensure the start lines up.
2924  assert(NewBeginOffset == BeginOffset);
2925 
2926  // Rewrite the size as needed.
2927  if (NewEndOffset != EndOffset)
2928  II.setLength(ConstantInt::get(II.getLength()->getType(),
2929  NewEndOffset - NewBeginOffset));
2930  return false;
2931  }
2932  // Record this instruction for deletion.
2933  Pass.DeadInsts.push_back(&II);
2934 
2935  // Strip all inbounds GEPs and pointer casts to try to dig out any root
2936  // alloca that should be re-examined after rewriting this instruction.
2937  Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
2938  if (AllocaInst *AI =
2939  dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
2940  assert(AI != &OldAI && AI != &NewAI &&
2941  "Splittable transfers cannot reach the same alloca on both ends.");
2942  Pass.Worklist.insert(AI);
2943  }
2944 
2945  Type *OtherPtrTy = OtherPtr->getType();
2946  unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
2947 
2948  // Compute the relative offset for the other pointer within the transfer.
2949  unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
2950  APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
2951  Align OtherAlign =
2952  (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
2953  OtherAlign =
2954  commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
2955 
2956  if (EmitMemCpy) {
2957  // Compute the other pointer, folding as much as possible to produce
2958  // a single, simple GEP in most cases.
2959  OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
2960  OtherPtr->getName() + ".");
2961 
2962  Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
2963  Type *SizeTy = II.getLength()->getType();
2964  Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
2965 
2966  Value *DestPtr, *SrcPtr;
2967  MaybeAlign DestAlign, SrcAlign;
2968  // Note: IsDest is true iff we're copying into the new alloca slice
2969  if (IsDest) {
2970  DestPtr = OurPtr;
2971  DestAlign = SliceAlign;
2972  SrcPtr = OtherPtr;
2973  SrcAlign = OtherAlign;
2974  } else {
2975  DestPtr = OtherPtr;
2976  DestAlign = OtherAlign;
2977  SrcPtr = OurPtr;
2978  SrcAlign = SliceAlign;
2979  }
2980  CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
2981  Size, II.isVolatile());
2982  if (AATags)
2983  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2984  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
2985  return false;
2986  }
2987 
2988  bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
2989  NewEndOffset == NewAllocaEndOffset;
2990  uint64_t Size = NewEndOffset - NewBeginOffset;
2991  unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
2992  unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
2993  unsigned NumElements = EndIndex - BeginIndex;
2994  IntegerType *SubIntTy =
2995  IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
2996 
2997  // Reset the other pointer type to match the register type we're going to
2998  // use, but using the address space of the original other pointer.
2999  Type *OtherTy;
3000  if (VecTy && !IsWholeAlloca) {
3001  if (NumElements == 1)
3002  OtherTy = VecTy->getElementType();
3003  else
3004  OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3005  } else if (IntTy && !IsWholeAlloca) {
3006  OtherTy = SubIntTy;
3007  } else {
3008  OtherTy = NewAllocaTy;
3009  }
3010  OtherPtrTy = OtherTy->getPointerTo(OtherAS);
3011 
3012  Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3013  OtherPtr->getName() + ".");
3014  MaybeAlign SrcAlign = OtherAlign;
3015  Value *DstPtr = &NewAI;
3016  MaybeAlign DstAlign = SliceAlign;
3017  if (!IsDest) {
3018  std::swap(SrcPtr, DstPtr);
3019  std::swap(SrcAlign, DstAlign);
3020  }
3021 
3022  Value *Src;
3023  if (VecTy && !IsWholeAlloca && !IsDest) {
3024  Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3025  NewAI.getAlign(), "load");
3026  Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3027  } else if (IntTy && !IsWholeAlloca && !IsDest) {
3028  Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3029  NewAI.getAlign(), "load");
3030  Src = convertValue(DL, IRB, Src, IntTy);
3031  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3032  Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3033  } else {
3034  LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3035  II.isVolatile(), "copyload");
3036  Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3037  LLVMContext::MD_access_group});
3038  if (AATags)
3039  Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3040  Src = Load;
3041  }
3042 
3043  if (VecTy && !IsWholeAlloca && IsDest) {
3044  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3045  NewAI.getAlign(), "oldload");
3046  Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3047  } else if (IntTy && !IsWholeAlloca && IsDest) {
3048  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3049  NewAI.getAlign(), "oldload");
3050  Old = convertValue(DL, IRB, Old, IntTy);
3051  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3052  Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3053  Src = convertValue(DL, IRB, Src, NewAllocaTy);
3054  }
3055 
3056  StoreInst *Store = cast<StoreInst>(
3057  IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3058  Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3059  LLVMContext::MD_access_group});
3060  if (AATags)
3061  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3062  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3063  return !II.isVolatile();
3064  }
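 // Rough illustration of the register-transfer path above (a sketch; names
 // invented): when the slice maps onto a whole i32-sized register slot and
 // %other does not alias the alloca, a copy such as
 //   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %other, i32 4, i1 false)
 // is rewritten into a register-width load and store, roughly
 //   %copyload = load i32, i32* %other.cast
 //   store i32 %copyload, i32* %new.alloca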
3065 
3066  bool visitIntrinsicInst(IntrinsicInst &II) {
3067  assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3068  "Unexpected intrinsic!");
3069  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3070 
3071  // Record this instruction for deletion.
3072  Pass.DeadInsts.push_back(&II);
3073 
3074  if (II.isDroppable()) {
3075  assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3076  // TODO For now we forget assumed information, this can be improved.
3077  OldPtr->dropDroppableUsesIn(II);
3078  return true;
3079  }
3080 
3081  assert(II.getArgOperand(1) == OldPtr);
3082  // Lifetime intrinsics are only promotable if they cover the whole alloca.
3083  // Therefore, we drop lifetime intrinsics which don't cover the whole
3084  // alloca.
3085  // (In theory, intrinsics which partially cover an alloca could be
3086  // promoted, but PromoteMemToReg doesn't handle that case.)
3087  // FIXME: Check whether the alloca is promotable before dropping the
3088  // lifetime intrinsics?
3089  if (NewBeginOffset != NewAllocaBeginOffset ||
3090  NewEndOffset != NewAllocaEndOffset)
3091  return true;
3092 
3093  ConstantInt *Size =
3094  ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
3095  NewEndOffset - NewBeginOffset);
3096  // Lifetime intrinsics always expect an i8* so directly get such a pointer
3097  // for the new alloca slice.
3098  Type *PointerTy = IRB.getInt8PtrTy(OldPtr->getType()->getPointerAddressSpace());
3099  Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3100  Value *New;
3101  if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3102  New = IRB.CreateLifetimeStart(Ptr, Size);
3103  else
3104  New = IRB.CreateLifetimeEnd(Ptr, Size);
3105 
3106  (void)New;
3107  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3108 
3109  return true;
3110  }
3111 
3112  void fixLoadStoreAlign(Instruction &Root) {
3113  // This algorithm implements the same visitor loop as
3114  // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3115  // or store found.
3116  SmallPtrSet<Instruction *, 4> Visited;
3117  SmallVector<Instruction *, 4> Uses;
3118  Visited.insert(&Root);
3119  Uses.push_back(&Root);
3120  do {
3121  Instruction *I = Uses.pop_back_val();
3122 
3123  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3124  LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3125  continue;
3126  }
3127  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3128  SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3129  continue;
3130  }
3131 
3132  assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3133  isa<PHINode>(I) || isa<SelectInst>(I) ||
3134  isa<GetElementPtrInst>(I));
3135  for (User *U : I->users())
3136  if (Visited.insert(cast<Instruction>(U)).second)
3137  Uses.push_back(cast<Instruction>(U));
3138  } while (!Uses.empty());
3139  }
3140 
3141  bool visitPHINode(PHINode &PN) {
3142  LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3143  assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3144  assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3145 
3146  // We would like to compute a new pointer in only one place, but have it be
3147  // as local as possible to the PHI. To do that, we re-use the location of
3148  // the old pointer, which necessarily must be in the right position to
3149  // dominate the PHI.
3150  IRBuilderBase::InsertPointGuard Guard(IRB);
3151  if (isa<PHINode>(OldPtr))
3152  IRB.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
3153  else
3154  IRB.SetInsertPoint(OldPtr);
3155  IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3156 
3157  Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3158  // Replace the operands which were using the old pointer.
3159  std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3160 
3161  LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3162  deleteIfTriviallyDead(OldPtr);
3163 
3164  // Fix the alignment of any loads or stores using this PHI node.
3165  fixLoadStoreAlign(PN);
3166 
3167  // PHIs can't be promoted on their own, but often can be speculated. We
3168  // check the speculation outside of the rewriter so that we see the
3169  // fully-rewritten alloca.
3170  PHIUsers.insert(&PN);
3171  return true;
3172  }
3173 
3174  bool visitSelectInst(SelectInst &SI) {
3175  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3176  assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3177  "Pointer isn't an operand!");
3178  assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3179  assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3180 
3181  Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3182  // Replace the operands which were using the old pointer.
3183  if (SI.getOperand(1) == OldPtr)
3184  SI.setOperand(1, NewPtr);
3185  if (SI.getOperand(2) == OldPtr)
3186  SI.setOperand(2, NewPtr);
3187 
3188  LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3189  deleteIfTriviallyDead(OldPtr);
3190 
3191  // Fix the alignment of any loads or stores using this select.
3192  fixLoadStoreAlign(SI);
3193 
3194  // Selects can't be promoted on their own, but often can be speculated. We
3195  // check the speculation outside of the rewriter so that we see the
3196  // fully-rewritten alloca.
3197  SelectUsers.insert(&SI);
3198  return true;
3199  }
3200 };
3201 
3202 namespace {
3203 
3204 /// Visitor to rewrite aggregate loads and stores as scalar.
3205 ///
3206 /// This pass aggressively rewrites all aggregate loads and stores on
3207 /// a particular pointer (or any pointer derived from it which we can identify)
3208 /// with scalar loads and stores.
3209 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3210  // Befriend the base class so it can delegate to private visit methods.
3211  friend class InstVisitor<AggLoadStoreRewriter, bool>;
3212 
3213  /// Queue of pointer uses to analyze and potentially rewrite.
3214  SmallVector<Use *, 8> Queue;
3215 
3216  /// Set to prevent us from cycling with phi nodes and loops.
3217  SmallPtrSet<User *, 8> Visited;
3218 
3219  /// The current pointer use being rewritten. This is used to dig up the used
3220  /// value (as opposed to the user).
3221  Use *U = nullptr;
3222 
3223  /// Used to calculate offsets, and hence alignment, of subobjects.
3224  const DataLayout &DL;
3225 
3226 public:
3227  AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}
3228 
3229  /// Rewrite loads and stores through a pointer and all pointers derived from
3230  /// it.
3231  bool rewrite(Instruction &I) {
3232  LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3233  enqueueUsers(I);
3234  bool Changed = false;
3235  while (!Queue.empty()) {
3236  U = Queue.pop_back_val();
3237  Changed |= visit(cast<Instruction>(U->getUser()));
3238  }
3239  return Changed;
3240  }
3241 
3242 private:
3243  /// Enqueue all the users of the given instruction for further processing.
3244  /// This uses a set to de-duplicate users.
3245  void enqueueUsers(Instruction &I) {
3246  for (Use &U : I.uses())
3247  if (Visited.insert(U.getUser()).second)
3248  Queue.push_back(&U);
3249  }
3250 
3251  // Conservative default is to not rewrite anything.
3252  bool visitInstruction(Instruction &I) { return false; }
3253 
3254  /// Generic recursive split emission class.
3255  template <typename Derived> class OpSplitter {
3256  protected:
3257  /// The builder used to form new instructions.
3258  IRBuilderTy IRB;
3259 
3260  /// The indices to be used with insertvalue or extractvalue to select the
3261  /// appropriate value within the aggregate.
3262  SmallVector<unsigned, 4> Indices;
3263 
3264  /// The indices to a GEP instruction which will move Ptr to the correct slot
3265  /// within the aggregate.
3266  SmallVector<Value *, 4> GEPIndices;
3267 
3268  /// The base pointer of the original op, used as a base for GEPing the
3269  /// split operations.
3270  Value *Ptr;
3271 
3272  /// The base pointee type being GEPed into.
3273  Type *BaseTy;
3274 
3275  /// Known alignment of the base pointer.
3276  Align BaseAlign;
3277 
3278  /// To calculate offset of each component so we can correctly deduce
3279  /// alignments.
3280  const DataLayout &DL;
3281 
3282  /// Initialize the splitter with an insertion point, Ptr and start with a
3283  /// single zero GEP index.
3284  OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3285  Align BaseAlign, const DataLayout &DL)
3286  : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr),
3287  BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {}
3288 
3289  public:
3290  /// Generic recursive split emission routine.
3291  ///
3292  /// This method recursively splits an aggregate op (load or store) into
3293  /// scalar or vector ops. It splits recursively until it hits a single value
3294  /// and emits that single value operation via the template argument.
3295  ///
3296  /// The logic of this routine relies on GEPs and insertvalue and
3297  /// extractvalue all operating with the same fundamental index list, merely
3298  /// formatted differently (GEPs need actual values).
3299  ///
3300  /// \param Ty The type being split recursively into smaller ops.
3301  /// \param Agg The aggregate value being built up or stored, depending on
3302  /// whether this is splitting a load or a store respectively.
3303  void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
3304  if (Ty->isSingleValueType()) {
3305  unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
3306  return static_cast<Derived *>(this)->emitFunc(
3307  Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
3308  }
3309 
3310  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
3311  unsigned OldSize = Indices.size();
3312  (void)OldSize;
3313  for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
3314  ++Idx) {
3315  assert(Indices.size() == OldSize && "Did not return to the old size");
3316  Indices.push_back(Idx);
3317  GEPIndices.push_back(IRB.getInt32(Idx));
3318  emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
3319  GEPIndices.pop_back();
3320  Indices.pop_back();
3321  }
3322  return;
3323  }
3324 
3325  if (StructType *STy = dyn_cast<StructType>(Ty)) {
3326  unsigned OldSize = Indices.size();
3327  (void)OldSize;
3328  for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
3329  ++Idx) {
3330  assert(Indices.size() == OldSize && "Did not return to the old size");
3331  Indices.push_back(Idx);
3332  GEPIndices.push_back(IRB.getInt32(Idx));
3333  emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
3334  GEPIndices.pop_back();
3335  Indices.pop_back();
3336  }
3337  return;
3338  }
3339 
3340  llvm_unreachable("Only arrays and structs are aggregate loadable types");
3341  }
3342  };
3343 
3344  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
3345  AAMDNodes AATags;
3346 
3347  LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3348  AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
3349  : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3350  DL),
3351  AATags(AATags) {}
3352 
3353  /// Emit a leaf load of a single value. This is called at the leaves of the
3354  /// recursive emission to actually load values.
3355  void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3356  assert(Ty->isSingleValueType());
3357  // Load the single value and insert it using the indices.
3358  Value *GEP =
3359  IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3360  LoadInst *Load =
3361  IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
3362 
3363  APInt Offset(
3364  DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3365  if (AATags &&
3366  GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
3367  Load->setAAMetadata(AATags.shift(Offset.getZExtValue()));
3368 
3369  Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
3370  LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
3371  }
3372  };
3373 
3374  bool visitLoadInst(LoadInst &LI) {
3375  assert(LI.getPointerOperand() == *U);
3376  if (!LI.isSimple() || LI.getType()->isSingleValueType())
3377  return false;
3378 
3379  // We have an aggregate being loaded, split it apart.
3380  LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3381  LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
3382  getAdjustedAlignment(&LI, 0), DL);
3383  Value *V = UndefValue::get(LI.getType());
3384  Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
3385  Visited.erase(&LI);
3386  LI.replaceAllUsesWith(V);
3387  LI.eraseFromParent();
3388  return true;
3389  }
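 // Rough illustration of the rewrite above (a sketch; names invented): an
 // aggregate load
 //   %v = load { i32, float }, { i32, float }* %p
 // is replaced by per-field loads stitched back together with insertvalue:
 //   %v.fca.0.gep = getelementptr inbounds { i32, float }, { i32, float }* %p, i32 0, i32 0
 //   %v.fca.0.load = load i32, i32* %v.fca.0.gep
 //   %v.fca.0.insert = insertvalue { i32, float } undef, i32 %v.fca.0.load, 0
 //   %v.fca.1.gep = getelementptr inbounds { i32, float }, { i32, float }* %p, i32 0, i32 1
 //   %v.fca.1.load = load float, float* %v.fca.1.gep
 //   %v.fca.1.insert = insertvalue { i32, float } %v.fca.0.insert, float %v.fca.1.load, 1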
3390 
3391  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
3392  StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3393  AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
3394  : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3395  DL),
3396  AATags(AATags) {}
3397  AAMDNodes AATags;
3398  /// Emit a leaf store of a single value. This is called at the leaves of the
3399  /// recursive emission to actually produce stores.
3400  void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3401  assert(Ty->isSingleValueType());
3402  // Extract the single value and store it using the indices.
3403  //
3404  // The gep and extractvalue values are factored out of the CreateStore
3405  // call to make the output independent of the argument evaluation order.
3406  Value *ExtractValue =
3407  IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
3408  Value *InBoundsGEP =
3409  IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3410  StoreInst *Store =
3411  IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
3412 
3413  APInt Offset(
3414  DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3415  if (AATags &&
3416  GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
3417  Store->setAAMetadata(AATags.shift(Offset.getZExtValue()));
3418 
3419  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3420  }
3421  };
3422 
3423  bool visitStoreInst(StoreInst &SI) {
3424  if (!SI.isSimple() || SI.getPointerOperand() != *U)
3425  return false;
3426  Value *V = SI.getValueOperand();
3427  if (V->getType()->isSingleValueType())
3428  return false;
3429 
3430  // We have an aggregate being stored, split it apart.
3431  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3432  StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(),
3433  getAdjustedAlignment(&SI, 0), DL);
3434  Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
3435  Visited.erase(&SI);
3436  SI.eraseFromParent();
3437  return true;
3438  }
3439 
3440  bool visitBitCastInst(BitCastInst &BC) {
3441  enqueueUsers(BC);
3442  return false;
3443  }
3444 
3445  bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
3446  enqueueUsers(ASC);
3447  return false;
3448  }
3449 
3450  // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
3451  bool foldGEPSelect(GetElementPtrInst &GEPI) {
3452  if (!GEPI.hasAllConstantIndices())
3453  return false;
3454 
3455  SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());
3456 
3457  LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):"
3458  << "\n original: " << *Sel
3459  << "\n " << GEPI);
3460 
3461  IRBuilderTy Builder(&GEPI);
3462  SmallVector<Value *, 4> Index(GEPI.indices());
3463  bool IsInBounds = GEPI.isInBounds();
3464 
3465  Type *Ty = GEPI.getSourceElementType();
3466  Value *True = Sel->getTrueValue();
3467  Value *NTrue =
3468  IsInBounds
3469  ? Builder.CreateInBoundsGEP(Ty, True, Index,
3470  True->getName() + ".sroa.gep")
3471  : Builder.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep");
3472 
3473  Value *False = Sel->getFalseValue();
3474 
3475  Value *NFalse =
3476  IsInBounds
3477  ? Builder.CreateInBoundsGEP(Ty, False, Index,
3478  False->getName() + ".sroa.gep")
3479  : Builder.CreateGEP(Ty, False, Index,
3480  False->getName() + ".sroa.gep");
3481 
3482  Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse,
3483  Sel->getName() + ".sroa.sel");
3484  Visited.erase(&GEPI);
3485  GEPI.replaceAllUsesWith(NSel);
3486  GEPI.eraseFromParent();
3487  Instruction *NSelI = cast<Instruction>(NSel);
3488  Visited.insert(NSelI);
3489  enqueueUsers(*NSelI);
3490 
3491  LLVM_DEBUG(dbgs() << "\n to: " << *NTrue
3492  << "\n " << *NFalse
3493  << "\n " << *NSel << '\n');
3494 
3495  return true;
3496  }
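 // Rough illustration of the fold above (a sketch; names invented):
 //   %sel = select i1 %c, i32* %a, i32* %b
 //   %gep = getelementptr inbounds i32, i32* %sel, i64 1
 // becomes
 //   %a.sroa.gep = getelementptr inbounds i32, i32* %a, i64 1
 //   %b.sroa.gep = getelementptr inbounds i32, i32* %b, i64 1
 //   %sel.sroa.sel = select i1 %c, i32* %a.sroa.gep, i32* %b.sroa.gep
 // so each arm's GEP now points directly at its underlying pointer.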
3497 
3498  // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
3499  bool foldGEPPhi(GetElementPtrInst &GEPI) {
3500  if (!GEPI.hasAllConstantIndices())
3501  return false;
3502 
3503  PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
3504  if (GEPI.getParent() != PHI->getParent() ||
3505  llvm::any_of(PHI->incoming_values(), [](Value *In)
3506  { Instruction *I = dyn_cast<Instruction>(In);
3507  return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
3508  succ_empty(I->getParent()) ||
3509  !I->getParent()->isLegalToHoistInto();
3510  }))
3511  return false;
3512 
3513  LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):"
3514  << "\n original: " << *PHI
3515  << "\n " << GEPI
3516  << "\n to: ");
3517 
3518  SmallVector<Value *, 4> Index(GEPI.indices());
3519  bool IsInBounds = GEPI.isInBounds();
3520  IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
3521  PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
3522  PHI->getNumIncomingValues(),
3523  PHI->getName() + ".sroa.phi");
3524  for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
3525  BasicBlock *B = PHI->getIncomingBlock(I);
3526  Value *NewVal = nullptr;
3527  int Idx = NewPN->getBasicBlockIndex(B);
3528  if (Idx >= 0) {
3529  NewVal = NewPN->getIncomingValue(Idx);
3530  } else {
3531  Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
3532 
3533  IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
3534  Type *Ty = GEPI.getSourceElementType();
3535  NewVal = IsInBounds
3536  ? B.CreateInBoundsGEP(Ty, In, Index, In->getName() + ".sroa.gep")
3537  : B.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep");
3538  }
3539  NewPN->addIncoming(NewVal, B);
3540  }
3541 
3542  Visited.erase(&GEPI);
3543  GEPI.replaceAllUsesWith(NewPN);
3544  GEPI.eraseFromParent();
3545  Visited.insert(NewPN);
3546  enqueueUsers(*NewPN);
3547 
3548  LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
3549  dbgs() << "\n " << *In;
3550  dbgs() << "\n " << *NewPN << '\n');
3551 
3552  return true;
3553  }
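 // Rough illustration of the fold above (a sketch; names invented):
 //   %phi = phi i32* [ %a, %bb1 ], [ %b, %bb2 ]
 //   %gep = getelementptr inbounds i32, i32* %phi, i64 1
 // becomes a GEP emitted right after each incoming definition, feeding a new
 // phi that replaces the original GEP:
 //   %a.sroa.gep = getelementptr inbounds i32, i32* %a, i64 1   ; next to %a
 //   %b.sroa.gep = getelementptr inbounds i32, i32* %b, i64 1   ; next to %b
 //   %phi.sroa.phi = phi i32* [ %a.sroa.gep, %bb1 ], [ %b.sroa.gep, %bb2 ]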
3554 
3555  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
3556  if (isa<SelectInst>(GEPI.getPointerOperand()) &&
3557  foldGEPSelect(GEPI))
3558  return true;
3559 
3560  if (isa<PHINode>(GEPI.getPointerOperand()) &&
3561  foldGEPPhi(GEPI))
3562  return true;
3563 
3564  enqueueUsers(GEPI);
3565  return false;
3566  }
3567 
3568  bool visitPHINode(PHINode &PN) {
3569  enqueueUsers(PN);
3570  return false;
3571  }
3572 
3573  bool visitSelectInst(SelectInst &SI) {
3574  enqueueUsers(SI);
3575  return false;
3576  }
3577 };
3578 
3579 } // end anonymous namespace
3580 
3581 /// Strip aggregate type wrapping.
3582 ///
3583 /// This removes no-op aggregate types wrapping an underlying type. It will
3584 /// strip as many layers of types as it can without changing either the type
3585 /// size or the allocated size.
3586 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
3587  if (Ty->isSingleValueType())
3588  return Ty;
3589 
3590  uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedSize();
3591  uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedSize();
3592 
3593  Type *InnerTy;
3594  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
3595  InnerTy = ArrTy->getElementType();
3596  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
3597  const StructLayout *SL = DL.getStructLayout(STy);
3598  unsigned Index = SL->getElementContainingOffset(0);
3599  InnerTy = STy->getElementType(Index);
3600  } else {
3601  return Ty;
3602  }
3603 
3604  if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedSize() ||
3605  TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedSize())
3606  return Ty;
3607 
3608  return stripAggregateTypeWrapping(DL, InnerTy);
3609 }
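// For example, { { float } } and [1 x float] both strip down to plain float,
// since each wrapper layer has the same size and allocated size as the inner
// type, while { float, i8 } is returned unchanged because dropping the struct
// would change the allocated size.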
3610 
3611 /// Try to find a partition of the aggregate type passed in for a given
3612 /// offset and size.
3613 ///
3614 /// This recurses through the aggregate type and tries to compute a subtype
3615 /// based on the offset and size. When the offset and size span a sub-section
3616 /// of an array, it will even compute a new array type for that sub-section,
3617 /// and the same for structs.
3618 ///
3619 /// Note that this routine is very strict and tries to find a partition of the
3620 /// type which produces the *exact* right offset and size. It is not forgiving
3621  /// when the size or offset causes either end of the type-based partition to be off.
3622 /// Also, this is a best-effort routine. It is reasonable to give up and not
3623 /// return a type if necessary.
3624 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
3625  uint64_t Size) {
3626  if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedSize() == Size)
3627  return stripAggregateTypeWrapping(DL, Ty);
3628  if (Offset > DL.getTypeAllocSize(Ty).getFixedSize() ||
3629  (DL.getTypeAllocSize(Ty).getFixedSize() - Offset) < Size)
3630  return nullptr;
3631 
3632  if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
3633  Type *ElementTy;
3634  uint64_t TyNumElements;
3635  if (auto *AT = dyn_cast<ArrayType>(Ty)) {
3636  ElementTy = AT->getElementType();
3637  TyNumElements = AT->getNumElements();
3638  } else {
3639  // FIXME: This isn't right for vectors with non-byte-sized or
3640  // non-power-of-two sized elements.
3641  auto *VT = cast<FixedVectorType>(Ty);
3642  ElementTy = VT->getElementType();
3643  TyNumElements = VT->getNumElements();
3644  }
3645  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
3646  uint64_t NumSkippedElements = Offset / ElementSize;
3647  if (NumSkippedElements >= TyNumElements)
3648  return nullptr;
3649  Offset -= NumSkippedElements * ElementSize;
3650 
3651  // First check if we need to recurse.
3652  if (Offset > 0 || Size < ElementSize) {
3653  // Bail if the partition ends in a different array element.
3654  if ((Offset + Size) > ElementSize)
3655  return nullptr;
3656  // Recurse through the element type trying to peel off offset bytes.
3657  return getTypePartition(DL, ElementTy, Offset, Size);
3658  }
3659  assert(Offset == 0);
3660 
3661  if (Size == ElementSize)
3662  return stripAggregateTypeWrapping(DL, ElementTy);
3663  assert(Size > ElementSize);
3664  uint64_t NumElements = Size / ElementSize;
3665  if (NumElements * ElementSize != Size)
3666  return nullptr;
3667  return ArrayType::get(ElementTy, NumElements);
3668  }
3669 
3670  StructType *STy = dyn_cast<StructType>(Ty);
3671  if (!STy)
3672  return nullptr;
3673 
3674  const StructLayout *SL = DL.getStructLayout(STy);
3675  if (Offset >= SL->getSizeInBytes())
3676  return nullptr;
3677  uint64_t EndOffset = Offset + Size;
3678  if (EndOffset > SL->getSizeInBytes())
3679  return nullptr;
3680 
3681  unsigned Index = SL->getElementContainingOffset(Offset);
3682  Offset -= SL->getElementOffset(Index);
3683 
3684  Type *ElementTy = STy->getElementType(Index);
3685  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
3686  if (Offset >= ElementSize)
3687  return nullptr; // The offset points into alignment padding.
3688 
3689  // See if any partition must be contained by the element.
3690  if (Offset > 0 || Size < ElementSize) {
3691  if ((Offset + Size) > ElementSize)
3692  return nullptr;
3693  return getTypePartition(DL, ElementTy, Offset, Size);
3694  }
3695  assert(Offset == 0);
3696 
3697  if (Size == ElementSize)
3698  return stripAggregateTypeWrapping(DL, ElementTy);
3699 
3700  StructType::element_iterator EI = STy->element_begin() + Index,
3701  EE = STy->element_end();
3702  if (EndOffset < SL->getSizeInBytes()) {
3703  unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
3704  if (Index == EndIndex)
3705  return nullptr; // Within a single element and its padding.
3706 
3707  // Don't try to form "natural" types if the elements don't line up with the
3708  // expected size.
3709  // FIXME: We could potentially recurse down through the last element in the
3710  // sub-struct to find a natural end point.
3711  if (SL->getElementOffset(EndIndex) != EndOffset)
3712  return nullptr;
3713 
3714  assert(Index < EndIndex);
3715  EE = STy->element_begin() + EndIndex;
3716  }
3717 
3718  // Try to build up a sub-structure.
3719  StructType *SubTy =
3720  StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
3721  const StructLayout *SubSL = DL.getStructLayout(SubTy);
3722  if (Size != SubSL->getSizeInBytes())
3723  return nullptr; // The sub-struct doesn't have quite the size needed.
3724 
3725  return SubTy;
3726 }
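// Worked example (illustrative): for { i32, [4 x i16], i8 }, asking for offset
// 4 and size 8 lands exactly on the [4 x i16] element, so that array type is
// returned; offset 4 and size 6 recurses into the array and yields the
// sub-array type [3 x i16]; offset 2 and size 4 straddles the boundary between
// the i32 and the array, so nullptr is returned.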
3727 
3728 /// Pre-split loads and stores to simplify rewriting.
3729 ///
3730 /// We want to break up the splittable load+store pairs as much as
3731 /// possible. This is important to do as a preprocessing step, as once we
3732 /// start rewriting the accesses to partitions of the alloca we lose the
3733 /// necessary information to correctly split apart paired loads and stores
3734 /// which both point into this alloca. The case to consider is something like
3735 /// the following:
3736 ///
3737 /// %a = alloca [12 x i8]
3738 /// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
3739 /// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
3740 /// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
3741 /// %iptr1 = bitcast i8* %gep1 to i64*
3742 /// %iptr2 = bitcast i8* %gep2 to i64*
3743 /// %fptr1 = bitcast i8* %gep1 to float*
3744 /// %fptr2 = bitcast i8* %gep2 to float*
3745 /// %fptr3 = bitcast i8* %gep3 to float*
3746 /// store float 0.0, float* %fptr1
3747 /// store float 1.0, float* %fptr2
3748 /// %v = load i64* %iptr1
3749 /// store i64 %v, i64* %iptr2
3750 /// %f1 = load float* %fptr2
3751 /// %f2 = load float* %fptr3
3752 ///
3753 /// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
3754 /// promote everything so we recover the 2 SSA values that should have been
3755 /// there all along.
3756 ///
3757 /// \returns true if any changes are made.
3758 bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
3759  LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
3760 
3761  // Track the loads and stores which are candidates for pre-splitting here, in
3762  // the order they first appear during the partition scan. These give stable
3763  // iteration order and a basis for tracking which loads and stores we
3764  // actually split.
3765  SmallVector<LoadInst *, 4> Loads;
3766  SmallVector<StoreInst *, 4> Stores;
3767 
3768  // We need to accumulate the splits required of each load or store where we
3769  // can find them via a direct lookup. This is important to cross-check loads
3770  // and stores against each other. We also track the slice so that we can kill
3771  // all the slices that end up split.
3772  struct SplitOffsets {
3773  Slice *S;
3774  std::vector<uint64_t> Splits;
3775  };
3776  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
3777 
3778  // Track loads out of this alloca which cannot, for any reason, be pre-split.
3779  // This is important as we also cannot pre-split stores of those loads!
3780  // FIXME: This is all pretty gross. It means that we can be more aggressive
3781  // in pre-splitting when the load feeding the store happens to come from
3782  // a separate alloca. Put another way, the effectiveness of SROA would be
3783  // decreased by a frontend which just concatenated all of its local allocas
3784  // into one big flat alloca. But defeating such patterns is exactly the job
3785  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
3786  // change store pre-splitting to actually force pre-splitting of the load
3787  // that feeds it *and all stores*. That makes pre-splitting much harder, but
3788  // maybe it would make it more principled?
3789  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
3790 
3791  LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
3792  for (auto &P : AS.partitions()) {
3793  for (Slice &S : P) {
3794  Instruction *I = cast<Instruction>(S.getUse()->getUser());
3795  if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
3796  // If this is a load we have to track that it can't participate in any
3797  // pre-splitting. If this is a store of a load we have to track that
3798  // that load also can't participate in any pre-splitting.
3799  if (auto *LI = dyn_cast<LoadInst>(I))
3800  UnsplittableLoads.insert(LI);
3801  else if (auto *SI = dyn_cast<StoreInst>(I))
3802  if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
3803  UnsplittableLoads.insert(LI);
3804  continue;
3805  }
3806  assert(P.endOffset() > S.beginOffset() &&
3807  "Empty or backwards partition!");
3808 
3809  // Determine if this is a pre-splittable slice.
3810  if (auto *LI = dyn_cast<LoadInst>(I)) {
3811  assert(!LI->isVolatile() && "Cannot split volatile loads!");
3812 
3813  // The load must be used exclusively to store into other pointers for
3814  // us to be able to arbitrarily pre-split it. The stores must also be
3815  // simple to avoid changing semantics.
3816  auto IsLoadSimplyStored = [](LoadInst *LI) {
3817  for (User *LU : LI->users()) {
3818  auto *SI = dyn_cast<StoreInst>(LU);
3819  if (!SI || !SI->isSimple())
3820  return false;
3821  }
3822  return true;
3823  };
3824  if (!IsLoadSimplyStored(LI)) {
3825  UnsplittableLoads.insert(LI);
3826  continue;
3827  }
3828 
3829  Loads.push_back(LI);
3830  } else if (auto *SI = dyn_cast<StoreInst>(I)) {
3831  if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
3832  // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
3833  continue;
3834  auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
3835  if (!StoredLoad || !StoredLoad->isSimple())
3836  continue;
3837  assert(!SI->isVolatile() && "Cannot split volatile stores!");
3838 
3839  Stores.push_back(SI);
3840  } else {
3841  // Other uses cannot be pre-split.
3842  continue;
3843  }
3844 
3845  // Record the initial split.
3846  LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
3847  auto &Offsets = SplitOffsetsMap[I];
3848  assert(Offsets.Splits.empty() &&
3849  "Should not have splits the first time we see an instruction!");
3850  Offsets.S = &S;
3851  Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
3852  }
3853 
3854  // Now scan the already split slices, and add a split for any of them which
3855  // we're going to pre-split.
3856  for (Slice *S : P.splitSliceTails()) {
3857  auto SplitOffsetsMapI =
3858  SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
3859  if (SplitOffsetsMapI == SplitOffsetsMap.end())
3860  continue;
3861  auto &Offsets = SplitOffsetsMapI->second;
3862 
3863  assert(Offsets.S == S && "Found a mismatched slice!");
3864  assert(!Offsets.Splits.empty() &&
3865  "Cannot have an empty set of splits on the second partition!");
3866  assert(Offsets.Splits.back() ==
3867  P.beginOffset() - Offsets.S->beginOffset() &&
3868  "Previous split does not end where this one begins!");
3869 
3870  // Record each split. The last partition's end isn't needed as the size
3871  // of the slice dictates that.
3872  if (S->endOffset() > P.endOffset())
3873  Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
3874  }
3875  }
3876 
3877  // We may have split loads where some of their stores are split stores. For
3878  // such loads and stores, we can only pre-split them if their splits exactly
3879  // match relative to their starting offset. We have to verify this prior to
3880  // any rewriting.
3881  llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
3882  // Lookup the load we are storing in our map of split
3883  // offsets.
3884  auto *LI = cast<LoadInst>(SI->getValueOperand());
3885  // If it was completely unsplittable, then we're done,
3886  // and this store can't be pre-split.
3887  if (UnsplittableLoads.count(LI))
3888  return true;
3889 
3890  auto LoadOffsetsI = SplitOffsetsMap.find(LI);
3891  if (LoadOffsetsI == SplitOffsetsMap.end())
3892  return false; // Unrelated loads are definitely safe.
3893  auto &LoadOffsets = LoadOffsetsI->second;
3894 
3895  // Now lookup the store's offsets.
3896  auto &StoreOffsets = SplitOffsetsMap[SI];
3897 
3898  // If the relative offsets of each split in the load and
3899  // store match exactly, then we can split them and we
3900  // don't need to remove them here.
3901  if (LoadOffsets.Splits == StoreOffsets.Splits)
3902  return false;
3903 
3904  LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
3905  << " " << *LI << "\n"
3906  << " " << *SI << "\n");
3907 
3908  // We've found a store and load that we need to split
3909  // with mismatched relative splits. Just give up on them
3910  // and remove both instructions from our list of
3911  // candidates.
3912  UnsplittableLoads.insert(LI);
3913  return true;
3914  });
3915  // Now we have to go *back* through all the stores, because a later store may
3916  // have caused an earlier store's load to become unsplittable, and if it is
3917  // unsplittable for the later store, then we can't rely on it being split in
3918  // the earlier store either.
3919  llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
3920  auto *LI = cast<LoadInst>(SI->getValueOperand());
3921  return UnsplittableLoads.count(LI);
3922  });
3923  // Once we've established all the loads that can't be split for some reason,
3924  // filter any that made it into our list out.
3925  llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
3926  return UnsplittableLoads.count(LI);
3927  });
3928 
3929  // If no loads or stores are left, there is no pre-splitting to be done for
3930  // this alloca.
3931  if (Loads.empty() && Stores.empty())
3932  return false;
3933 
3934  // From here on, we can't fail and will be building new accesses, so rig up
3935  // an IR builder.
3936  IRBuilderTy IRB(&AI);
3937 
3938  // Collect the new slices which we will merge into the alloca slices.
3939  SmallVector<Slice, 4> NewSlices;
3940 
3941  // Track any allocas we end up splitting loads and stores for so we iterate
3942  // on them.
3943  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
3944 
3945  // At this point, we have collected all of the loads and stores we can
3946  // pre-split, and the specific splits needed for them. We actually do the
3947  // splitting in a specific order so that we can handle the case where one of
3948  // the loads is the value operand to one of the stores.
3949  //
3950  // First, we rewrite all of the split loads, and just accumulate each split
3951  // load in a parallel structure. We also build the slices for them and append
3952  // them to the alloca slices.
3953  SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
3954  std::vector<LoadInst *> SplitLoads;
3955  const DataLayout &DL = AI.getModule()->getDataLayout();
3956  for (LoadInst *LI : Loads) {
3957  SplitLoads.clear();
3958 
3959  IntegerType *Ty = cast<IntegerType>(LI->getType());
3960  assert(Ty->getBitWidth() % 8 == 0);
3961  uint64_t LoadSize = Ty->getBitWidth() / 8;
3962  assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
3963 
3964  auto &Offsets = SplitOffsetsMap[LI];
3965  assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
3966  "Slice size should always match load size exactly!");
3967  uint64_t BaseOffset = Offsets.S->beginOffset();
3968  assert(BaseOffset + LoadSize > BaseOffset &&
3969  "Cannot represent alloca access size using 64-bit integers!");
3970 
3971  Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
3972  IRB.SetInsertPoint(LI);
3973 
3974  LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
3975 
3976  uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
3977  int Idx = 0, Size = Offsets.Splits.size();
3978  for (;;) {
3979  auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
3980  auto AS = LI->getPointerAddressSpace();
3981  auto *PartPtrTy = PartTy->getPointerTo(AS);
3982  LoadInst *PLoad = IRB.CreateAlignedLoad(
3983  PartTy,
3984  getAdjustedPtr(IRB, DL, BasePtr,
3985  APInt(DL.getIndexSizeInBits(AS), PartOffset),
3986  PartPtrTy, BasePtr->getName() + "."),
3987  getAdjustedAlignment(LI, PartOffset),
3988  /*IsVolatile*/ false, LI->getName());
3989  PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
3990  LLVMContext::MD_access_group});
3991 
3992  // Append this load onto the list of split loads so we can find it later
3993  // to rewrite the stores.
3994  SplitLoads.push_back(PLoad);
3995 
3996  // Now build a new slice for the alloca.
3997  NewSlices.push_back(
3998  Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
3999  &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4000  /*IsSplittable*/ false));
4001  LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4002  << ", " << NewSlices.back().endOffset()
4003  << "): " << *PLoad << "\n");
4004 
4005  // See if we've handled all the splits.
4006  if (Idx >= Size)
4007  break;
4008 
4009  // Setup the next partition.
4010  PartOffset = Offsets.Splits[Idx];
4011  ++Idx;
4012  PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
4013  }
4014 
4015  // Now that we have the split loads, do the slow walk over all uses of the
4016  // load and rewrite them as split stores, or save the split loads to use
4017  // below if the store is going to be split there anyways.
4018  bool DeferredStores = false;
4019  for (User *LU : LI->users()) {
4020  StoreInst *SI = cast<StoreInst>(LU);
4021  if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4022  DeferredStores = true;
4023  LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4024  << "\n");
4025  continue;
4026  }
4027 
4028  Value *StoreBasePtr = SI->getPointerOperand();
4029  IRB.SetInsertPoint(SI);
4030 
4031  LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4032 
4033  for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4034  LoadInst *PLoad = SplitLoads[Idx];
4035  uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4036  auto *PartPtrTy =
4037  PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
4038 
4039  auto AS = SI->getPointerAddressSpace();
4040  StoreInst *PStore = IRB.CreateAlignedStore(
4041  PLoad,
4042  getAdjustedPtr(IRB, DL, StoreBasePtr,
4043  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4044  PartPtrTy, StoreBasePtr->getName() + "."),
4045  getAdjustedAlignment(SI, PartOffset),
4046  /*IsVolatile*/ false);
4047  PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4048  LLVMContext::MD_access_group});
4049  LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4050  }
4051 
4052  // We want to immediately iterate on any allocas impacted by splitting
4053  // this store, and we have to track any promotable alloca (indicated by
4054  // a direct store) as needing to be resplit because it is no longer
4055  // promotable.
4056  if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
4057  ResplitPromotableAllocas.insert(OtherAI);
4058  Worklist.insert(OtherAI);
4059  } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4060  StoreBasePtr->stripInBoundsOffsets())) {
4061  Worklist.insert(OtherAI);
4062  }
4063 
4064  // Mark the original store as dead.
4065  DeadInsts.push_back(SI);
4066  }
4067 
4068  // Save the split loads if there are deferred stores among the users.
4069  if (DeferredStores)
4070  SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
4071 
4072  // Mark the original load as dead and kill the original slice.
4073  DeadInsts.push_back(LI);
4074  Offsets.S->kill();
4075  }
4076 
4077  // Second, we rewrite all of the split stores. At this point, we know that
4078  // all loads from this alloca have been split already. For stores of such
4079  // loads, we can simply look up the pre-existing split loads. For stores of
4080  // other loads, we split those loads first and then write split stores of
4081  // them.
4082  for (StoreInst *SI : Stores) {
4083  auto *LI = cast<LoadInst>(SI->getValueOperand());
4084  IntegerType *Ty = cast<IntegerType>(LI->getType());
4085  assert(Ty->getBitWidth() % 8 == 0);
4086  uint64_t StoreSize = Ty->getBitWidth() / 8;
4087  assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4088 
4089  auto &Offsets = SplitOffsetsMap[SI];
4090  assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4091  "Slice size should always match load size exactly!");
4092  uint64_t BaseOffset = Offsets.S->beginOffset();
4093  assert(BaseOffset + StoreSize > BaseOffset &&
4094  "Cannot represent alloca access size using 64-bit integers!");
4095 
4096  Value *LoadBasePtr = LI->getPointerOperand();
4097  Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
4098 
4099  LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4100 
4101  // Check whether we have an already split load.
4102  auto SplitLoadsMapI = SplitLoadsMap.find(LI);
4103  std::vector<LoadInst *> *SplitLoads = nullptr;
4104  if (SplitLoadsMapI != SplitLoadsMap.end()) {
4105  SplitLoads = &SplitLoadsMapI->second;
4106  assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
4107  "Too few split loads for the number of splits in the store!");
4108  } else {
4109  LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
4110  }
4111 
4112  uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4113  int Idx = 0, Size = Offsets.Splits.size();
4114  for (;;) {
4115  auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
4116  auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
4117  auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
4118 
4119  // Either lookup a split load or create one.
4120  LoadInst *PLoad;
4121  if (SplitLoads) {
4122  PLoad = (*SplitLoads)[Idx];
4123  } else {
4124  IRB.SetInsertPoint(LI);
4125  auto AS = LI->getPointerAddressSpace();
4126  PLoad = IRB.CreateAlignedLoad(
4127  PartTy,
4128  getAdjustedPtr(IRB, DL, LoadBasePtr,
4129  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4130  LoadPartPtrTy, LoadBasePtr->getName() + "."),
4131  getAdjustedAlignment(LI, PartOffset),
4132  /*IsVolatile*/ false, LI->getName());
4133  PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4134  LLVMContext::MD_access_group});
4135  }
4136 
4137  // And store this partition.
4138  IRB.SetInsertPoint(SI);
4139  auto AS = SI->getPointerAddressSpace();
4140  StoreInst *PStore = IRB.CreateAlignedStore(
4141  PLoad,
4142  getAdjustedPtr(IRB, DL, StoreBasePtr,
4143  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4144  StorePartPtrTy, StoreBasePtr->getName() + "."),
4145  getAdjustedAlignment(SI, PartOffset),
4146  /*IsVolatile*/ false);
4147  PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4148  LLVMContext::MD_access_group});
4149 
4150  // Now build a new slice for the alloca.
4151  NewSlices.push_back(
4152  Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4153  &PStore->getOperandUse(PStore->getPointerOperandIndex()),
4154  /*IsSplittable*/ false));
4155  LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4156  << ", " << NewSlices.back().endOffset()
4157  << "): " << *PStore << "\n");
4158  if (!SplitLoads) {
4159  LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
4160  }
4161 
4162  // See if we've finished all the splits.
4163  if (Idx >= Size)
4164  break;
4165 
4166  // Setup the next partition.
4167  PartOffset = Offsets.Splits[Idx];
4168  ++Idx;
4169  PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
4170  }
4171 
4172  // We want to immediately iterate on any allocas impacted by splitting
4173  // this load, which is only relevant if it isn't a load of this alloca and
4174  // thus we didn't already split the loads above. We also have to keep track
4175  // of any promotable allocas we split loads on as they can no longer be
4176  // promoted.
4177  if (!SplitLoads) {
4178  if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
4179  assert(OtherAI != &AI && "We can't re-split our own alloca!");
4180  ResplitPromotableAllocas.insert(OtherAI);
4181  Worklist.insert(OtherAI);
4182  } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4183  LoadBasePtr->stripInBoundsOffsets())) {
4184  assert(OtherAI != &AI && "We can't re-split our own alloca!");
4185  Worklist.insert(OtherAI);
4186  }
4187  }
4188 
4189  // Mark the original store as dead now that we've split it up and kill its
4190  // slice. Note that we leave the original load in place unless this store
4191  // was its only use. It may in turn be split up if it is an alloca load
4192  // for some other alloca, but it may be a normal load. This may introduce
4193  // redundant loads, but where those can be merged the rest of the optimizer
4194  // should handle the merging, and this uncovers SSA splits which is more
4195  // important. In practice, the original loads will almost always be fully
4196  // split and removed eventually, and the splits will be merged by any
4197  // trivial CSE, including instcombine.
4198  if (LI->hasOneUse()) {
4199  assert(*LI->user_begin() == SI && "Single use isn't this store!");
4200  DeadInsts.push_back(LI);
4201  }
4202  DeadInsts.push_back(SI);
4203  Offsets.S->kill();
4204  }
4205 
4206  // Remove the killed slices that have been pre-split.
4207  llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
4208 
4209  // Insert our new slices. This will sort and merge them into the sorted
4210  // sequence.
4211  AS.insert(NewSlices);
4212 
4213  LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
4214 #ifndef NDEBUG
4215  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
4216  LLVM_DEBUG(AS.print(dbgs(), I, " "));
4217 #endif
4218 
4219  // Finally, don't try to promote any allocas that now require re-splitting.
4220  // They have already been added to the worklist above.
4221  llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) {
4222  return ResplitPromotableAllocas.count(AI);
4223  });
4224 
4225  return true;
4226 }
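// Tying this back to the IR example in the comment above presplitLoadsAndStores
// (illustrative): the i64 load through %iptr1 covers the partitions [0,4) and
// [4,8), so its SplitOffsets::Splits ends up as {4} and the load is re-emitted
// as two i32 loads; the paired i64 store through %iptr2 shares the same splits
// and becomes two i32 stores, leaving each 4-byte partition free to be promoted
// to an SSA value.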
4227 
4228 /// Rewrite an alloca partition's users.
4229 ///
4230 /// This routine drives both of the rewriting goals of the SROA pass. It tries
4231 /// to rewrite uses of an alloca partition to be conducive for SSA value
4232 /// promotion. If the partition needs a new, more refined alloca, this will
4233 /// build that new alloca, preserving as much type information as possible, and
4234 /// rewrite the uses of the old alloca to point at the new one and have the
4235 /// appropriate new offsets. It also evaluates how successful the rewrite was
4236 /// at enabling promotion and if it was successful queues the alloca to be
4237 /// promoted.
4238 AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
4239  Partition &P) {
4240  // Try to compute a friendly type for this partition of the alloca. This
4241  // won't always succeed, in which case we fall back to a legal integer type
4242  // or an i8 array of an appropriate size.
4243  Type *SliceTy = nullptr;
4244  const DataLayout &DL = AI.getModule()->getDataLayout();
4245  std::pair<Type *, IntegerType *> CommonUseTy =
4246  findCommonType(P.begin(), P.end(), P.endOffset());
4247  // Do all uses operate on the same type?
4248  if (CommonUseTy.first)
4249  if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size())
4250  SliceTy = CommonUseTy.first;
4251  // If not, can we find an appropriate subtype in the original allocated type?
4252  if (!SliceTy)
4253  if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
4254  P.beginOffset(), P.size()))
4255  SliceTy = TypePartitionTy;
4256  // If still not, can we use the largest bitwidth integer type used?
4257  if (!SliceTy && CommonUseTy.second)
4258  if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size())
4259  SliceTy = CommonUseTy.second;
4260  if ((!SliceTy || (SliceTy->isArrayTy() &&
4261  SliceTy->getArrayElementType()->isIntegerTy())) &&
4262  DL.isLegalInteger(P.size() * 8))
4263  SliceTy = Type::getIntNTy(*C, P.size() * 8);
4264  if (!SliceTy)
4265  SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
4266  assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
4267 
4268  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
4269 
4270  VectorType *VecTy =
4271  IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
4272  if (VecTy)
4273  SliceTy = VecTy;
4274 
4275  // Check for the case where we're going to rewrite to a new alloca of the
4276  // exact same type as the original, and with the same access offsets. In that
4277  // case, re-use the existing alloca, but still run through the rewriter to
4278  // perform phi and select speculation.
4279  // P.beginOffset() can be non-zero even with the same type in a case with
4280  // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
4281  AllocaInst *NewAI;
4282  if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
4283  NewAI = &AI;
4284  // FIXME: We should be able to bail at this point with "nothing changed".
4285  // FIXME: We might want to defer PHI speculation until after here.
4286  // FIXME: return nullptr;
4287  } else {
4288  // Make sure the alignment is compatible with P.beginOffset().
4289  const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
4290  // If we will get at least this much alignment from the type alone, leave
4291  // the alloca's alignment unconstrained.
4292  const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
4293  NewAI = new AllocaInst(
4294  SliceTy, AI.getType()->getAddressSpace(), nullptr,
4295  IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
4296  AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
4297  // Copy the old AI debug location over to the new one.
4298  NewAI->setDebugLoc(AI.getDebugLoc());
4299  ++NumNewAllocas;
4300  }
4301 
4302  LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
4303  << "[" << P.beginOffset() << "," << P.endOffset()
4304  << ") to: " << *NewAI << "\n");
4305 
4306  // Track the high watermark on the worklist as it is only relevant for
4307  // promoted allocas. We will reset it to this point if the alloca is not in
4308  // fact scheduled for promotion.
4309  unsigned PPWOldSize = PostPromotionWorklist.size();
4310  unsigned NumUses = 0;
4311  SmallSetVector<PHINode *, 8> PHIUsers;
4312  SmallSetVector<SelectInst *, 8> SelectUsers;
4313 
4314  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
4315  P.endOffset(), IsIntegerPromotable, VecTy,
4316  PHIUsers, SelectUsers);
4317  bool Promotable = true;
4318  for (Slice *S : P.splitSliceTails()) {
4319  Promotable &= Rewriter.visit(S);
4320  ++NumUses;
4321  }
4322  for (Slice &S : P) {
4323  Promotable &= Rewriter.visit(&S);
4324  ++NumUses;
4325  }
4326 
4327  NumAllocaPartitionUses += NumUses;
4328  MaxUsesPerAllocaPartition.updateMax(NumUses);
4329 
4330  // Now that we've processed all the slices in the new partition, check if any
4331  // PHIs or Selects would block promotion.
4332  for (PHINode *PHI : PHIUsers)
4333  if (!isSafePHIToSpeculate(*PHI)) {
4334  Promotable = false;
4335  PHIUsers.clear();
4336  SelectUsers.clear();
4337  break;
4338  }
4339 
4340  for (SelectInst *Sel : SelectUsers)
4341  if (!isSafeSelectToSpeculate(*Sel)) {
4342  Promotable = false;
4343  PHIUsers.clear();
4344  SelectUsers.clear();
4345  break;
4346  }
4347 
4348  if (Promotable) {
4349  for (Use *U : AS.getDeadUsesIfPromotable()) {
4350  auto *OldInst = dyn_cast<Instruction>(U->get());
4351  Value::dropDroppableUse(*U);
4352  if (OldInst)
4353  if (isInstructionTriviallyDead(OldInst))
4354  DeadInsts.push_back(OldInst);
4355  }
4356  if (PHIUsers.empty() && SelectUsers.empty()) {
4357  // Promote the alloca.
4358  PromotableAllocas.push_back(NewAI);
4359  } else {
4360  // If we have either PHIs or Selects to speculate, add them to those
4361  // worklists and re-queue the new alloca so that we promote it on the
4362  // next iteration.
4363  for (PHINode *PHIUser : PHIUsers)
4364  SpeculatablePHIs.insert(PHIUser);
4365  for (SelectInst *SelectUser : SelectUsers)
4366  SpeculatableSelects.insert(SelectUser);
4367  Worklist.insert(NewAI);
4368  }
4369  } else {
4370  // Drop any post-promotion work items if promotion didn't happen.
4371  while (PostPromotionWorklist.size() > PPWOldSize)
4372  PostPromotionWorklist.pop_back();
4373 
4374  // We couldn't promote and we didn't create a new partition, so nothing
4375  // happened.
4376  if (NewAI == &AI)
4377  return nullptr;
4378 
4379  // If we can't promote the alloca, iterate on it to check for new
4380  // refinements exposed by splitting the current alloca. Don't iterate on an
4381  // alloca which didn't actually change and didn't get promoted.
4382  Worklist.insert(NewAI);
4383  }
4384 
4385  return NewAI;
4386 }
4387 
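The block at the top of rewritePartition picks a "slice type" through a pure fallback chain: a common use type that covers the partition, then a matching subtype of the original allocation, then the widest integer type among the uses, then a legal iN of the partition's width, and finally an i8 array. As a hedged illustration only (this sketch is not part of SROA.cpp, and pickSliceTypeSketch is a hypothetical name), a simplified version of that ordering could look like the following; it deliberately omits the integer-array special case handled above.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

// Simplified sketch of the fallback order used when picking a partition type.
static llvm::Type *pickSliceTypeSketch(const llvm::DataLayout &DL,
                                       llvm::LLVMContext &Ctx,
                                       llvm::Type *CommonUseTy,
                                       llvm::Type *TypePartitionTy,
                                       llvm::IntegerType *WidestIntTy,
                                       uint64_t PartitionSize) {
  // 1. A type every use agrees on, provided it covers the whole partition.
  if (CommonUseTy &&
      DL.getTypeAllocSize(CommonUseTy).getFixedSize() >= PartitionSize)
    return CommonUseTy;
  // 2. A subtype of the original allocation at this offset and size.
  if (TypePartitionTy)
    return TypePartitionTy;
  // 3. The widest integer type seen among the uses, if it covers the partition.
  if (WidestIntTy &&
      DL.getTypeAllocSize(WidestIntTy).getFixedSize() >= PartitionSize)
    return WidestIntTy;
  // 4. A target-legal integer exactly as wide as the partition.
  if (DL.isLegalInteger(PartitionSize * 8))
    return llvm::Type::getIntNTy(Ctx, PartitionSize * 8);
  // 5. Last resort: an i8 array of the partition's size in bytes.
  return llvm::ArrayType::get(llvm::Type::getInt8Ty(Ctx), PartitionSize);
}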
4388 /// Walks the slices of an alloca and forms partitions based on them,
4389 /// rewriting each of their uses.
4390 bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
4391  if (AS.begin() == AS.end())
4392  return false;
4393 
4394  unsigned NumPartitions = 0;
4395  bool Changed = false;
4396  const DataLayout &DL = AI.getModule()->getDataLayout();
4397 
4398  // First try to pre-split loads and stores.
4399  Changed |= presplitLoadsAndStores(AI, AS);
4400 
4401  // Now that we have identified any pre-splitting opportunities,
4402  // mark loads and stores unsplittable except for the following case.
4403  // We leave a slice splittable if all other slices are disjoint or fully
4404  // included in the slice, such as whole-alloca loads and stores.
4405  // If we fail to split these during pre-splitting, we want to force them
4406  // to be rewritten into a partition.
4407  bool IsSorted = true;
4408 
4409  uint64_t AllocaSize =
4410  DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
4411  const uint64_t MaxBitVectorSize = 1024;
4412  if (AllocaSize <= MaxBitVectorSize) {
4413  // If a byte boundary is included in any load or store, a slice starting or
4414  // ending at the boundary is not splittable.
4415  SmallBitVector SplittableOffset(AllocaSize + 1, true);
4416  for (Slice &S : AS)
4417  for (unsigned O = S.beginOffset() + 1;
4418  O < S.endOffset() && O < AllocaSize; O++)
4419  SplittableOffset.reset(O);
4420 
4421  for (Slice &S : AS) {
4422  if (!S.isSplittable())
4423  continue;
4424 
4425  if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
4426  (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
4427  continue;
4428 
4429  if (isa<LoadInst>(S.getUse()->getUser()) ||
4430  isa<StoreInst>(S.getUse()->getUser())) {
4431  S.makeUnsplittable();
4432  IsSorted = false;
4433  }
4434  }
4435  }
4436  else {
4437  // We only allow whole-alloca splittable loads and stores
4438  // for a large alloca to avoid creating an overly large BitVector.
4439  for (Slice &S : AS) {
4440  if (!S.isSplittable())
4441  continue;
4442 
4443  if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
4444  continue;
4445 
4446  if (isa<LoadInst>(S.getUse()->getUser()) ||
4447  isa<StoreInst>(S.getUse()->getUser())) {
4448  S.makeUnsplittable();
4449  IsSorted = false;
4450  }
4451  }
4452  }
4453 
4454  if (!IsSorted)
4455  llvm::sort(AS);
4456 
4457  /// Describes the allocas introduced by rewritePartition in order to migrate
4458  /// the debug info.
4459  struct Fragment {
4460  AllocaInst *Alloca;
4461  uint64_t Offset;
4462  uint64_t Size;
4463  Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
4464  : Alloca(AI), Offset(O), Size(S) {}
4465  };
4466  SmallVector<Fragment, 4> Fragments;
4467 
4468  // Rewrite each partition.
4469  for (auto &P : AS.partitions()) {
4470  if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
4471  Changed = true;
4472  if (NewAI != &AI) {
4473  uint64_t SizeOfByte = 8;
4474  uint64_t AllocaSize =
4475  DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedSize();
4476  // Don't include any padding.
4477  uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
4478  Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
4479  }
4480  }
4481  ++NumPartitions;
4482  }
4483 
4484  NumAllocaPartitions += NumPartitions;
4485  MaxPartitionsPerAlloca.updateMax(NumPartitions);
4486 
4487  // Migrate debug information from the old alloca to the new alloca(s)
4488  // and the individual partitions.
4489  TinyPtrVector<DbgVariableIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
4490  for (DbgVariableIntrinsic *DbgDeclare : DbgDeclares) {
4491  auto *Expr = DbgDeclare->getExpression();
4492  DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
4493  uint64_t AllocaSize =
4494  DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedSize();
4495  for (auto Fragment : Fragments) {
4496  // Create a fragment expression describing the new partition or reuse AI's
4497  // expression if there is only one partition.
4498  auto *FragmentExpr = Expr;
4499  if (Fragment.Size < AllocaSize || Expr->isFragment()) {
4500  // If this alloca is already a scalar replacement of a larger aggregate,
4501  // Fragment.Offset describes the offset inside the scalar.
4502  auto ExprFragment = Expr->getFragmentInfo();
4503  uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
4504  uint64_t Start = Offset + Fragment.Offset;
4505  uint64_t Size = Fragment.Size;
4506  if (ExprFragment) {
4507  uint64_t AbsEnd =
4508  ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
4509  if (Start >= AbsEnd)
4510  // No need to describe a SROAed padding.
4511  continue;
4512  Size = std::min(Size, AbsEnd - Start);
4513  }
4514  // The new, smaller fragment is stenciled out from the old fragment.
4515  if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
4516  assert(Start >= OrigFragment->OffsetInBits &&
4517  "new fragment is outside of original fragment");
4518  Start -= OrigFragment->OffsetInBits;
4519  }
4520 
4521  // The alloca may be larger than the variable.
4522  auto VarSize = DbgDeclare->getVariable()->getSizeInBits();
4523  if (VarSize) {
4524  if (Size > *VarSize)
4525  Size = *VarSize;
4526  if (Size == 0 || Start + Size > *VarSize)
4527  continue;
4528  }
4529 
4530  // Avoid creating a fragment expression that covers the entire variable.
4531  if (!VarSize || *VarSize != Size) {
4532  if (auto E =
4533  DIExpression::createFragmentExpression(Expr, Start, Size))
4534  FragmentExpr = *E;
4535  else
4536  continue;
4537  }
4538  }
4539 
4540  // Remove any existing intrinsics on the new alloca describing
4541  // the variable fragment.
4542  for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca)) {
4543  auto SameVariableFragment = [](const DbgVariableIntrinsic *LHS,
4544  const DbgVariableIntrinsic *RHS) {
4545  return LHS->getVariable() == RHS->getVariable() &&
4546  LHS->getDebugLoc()->getInlinedAt() ==
4547  RHS->getDebugLoc()->getInlinedAt();
4548  };
4549  if (SameVariableFragment(OldDII, DbgDeclare))
4550  OldDII->eraseFromParent();
4551  }
4552 
4553  DIB.insertDeclare(Fragment.Alloca, DbgDeclare->getVariable(), FragmentExpr,
4554  DbgDeclare->getDebugLoc(), &AI);
4555  }
4556  }
4557  return Changed;
4558 }
4559 
4560 /// Clobber a use with undef, deleting the used value if it becomes dead.
4561 void SROA::clobberUse(Use &U) {
4562  Value *OldV = U;
4563  // Replace the use with an undef value.
4564  U = UndefValue::get(OldV->getType());
4565 
4566  // Check for this making an instruction dead. We have to garbage collect
4567  // all the dead instructions to ensure the uses of any alloca end up being
4568  // minimal.
4569  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
4570  if (isInstructionTriviallyDead(OldI)) {
4571  DeadInsts.push_back(OldI);
4572  }
4573 }
4574 
4575 /// Analyze an alloca for SROA.
4576 ///
4577 /// This analyzes the alloca to ensure we can reason about it, builds
4578 /// the slices of the alloca, and then hands it off to be split and
4579 /// rewritten as needed.
4580 bool SROA::runOnAlloca(AllocaInst &AI) {
4581  LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
4582  ++NumAllocasAnalyzed;
4583 
4584  // Special case dead allocas, as they're trivial.
4585  if (AI.use_empty()) {
4586  AI.eraseFromParent();
4587  return true;
4588  }
4589  const DataLayout &DL = AI.getModule()->getDataLayout();
4590 
4591  // Skip alloca forms that this analysis can't handle.
4592  auto *AT = AI.getAllocatedType();
4593  if (AI.isArrayAllocation() || !AT->isSized() || isa<ScalableVectorType>(AT) ||
4594  DL.getTypeAllocSize(AT).getFixedSize() == 0)
4595  return false;
4596 
4597  bool Changed = false;
4598 
4599  // First, split any FCA loads and stores touching this alloca to promote
4600  // better splitting and promotion opportunities.
4601  AggLoadStoreRewriter AggRewriter(DL);
4602  Changed |= AggRewriter.rewrite(AI);
4603 
4604  // Build the slices using a recursive instruction-visiting builder.
4605  AllocaSlices AS(DL, AI);
4606  LLVM_DEBUG(AS.print(dbgs()));
4607  if (AS.isEscaped())
4608  return Changed;
4609 
4610  // Delete all the dead users of this alloca before splitting and rewriting it.
4611  for (Instruction *DeadUser : AS.getDeadUsers()) {
4612  // Free up everything used by this instruction.
4613  for (Use &DeadOp : DeadUser->operands())
4614  clobberUse(DeadOp);
4615 
4616  // Now replace the uses of this instruction.
4617  DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
4618 
4619  // And mark it for deletion.
4620  DeadInsts.push_back(DeadUser);
4621  Changed = true;
4622  }
4623  for (Use *DeadOp : AS.getDeadOperands()) {
4624  clobberUse(*DeadOp);
4625  Changed = true;
4626  }
4627 
4628  // No slices to split. Leave the dead alloca for a later pass to clean up.
4629  if (AS.begin() == AS.end())
4630  return Changed;
4631 
4632  Changed |= splitAlloca(AI, AS);
4633 
4634  LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
4635  while (!SpeculatablePHIs.empty())
4636  speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
4637 
4638  LLVM_DEBUG(dbgs() << " Speculating Selects\n");
4639  while (!SpeculatableSelects.empty())
4640  speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
4641 
4642  return Changed;
4643 }
4644 
4645 /// Delete the dead instructions accumulated in this run.
4646 ///
4647 /// Recursively deletes the dead instructions we've accumulated. This is done
4648 /// at the very end to maximize locality of the recursive delete and to
4649 /// minimize the problems of invalidated instruction pointers as such pointers
4650 /// are used heavily in the intermediate stages of the algorithm.
4651 ///
4652 /// We also record the alloca instructions deleted here so that they aren't
4653 /// subsequently handed to mem2reg to promote.
4654 bool SROA::deleteDeadInstructions(
4655  SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
4656  bool Changed = false;
4657  while (!DeadInsts.empty()) {
4658  Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
4659  if (!I) continue;
4660  LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
4661 
4662  // If the instruction is an alloca, find the possible dbg.declare connected
4663  // to it, and remove it too. We must do this before calling RAUW or we will
4664  // not be able to find it.
4665  if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
4666  DeletedAllocas.insert(AI);
4667  for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(AI))
4668  OldDII->eraseFromParent();
4669  }
4670 
4671  I->replaceAllUsesWith(UndefValue::get(I->getType()));
4672 
4673  for (Use &Operand : I->operands())
4674  if (Instruction *U = dyn_cast<Instruction>(Operand)) {
4675  // Zero out the operand and see if it becomes trivially dead.
4676  Operand = nullptr;
4677  if (isInstructionTriviallyDead(U))
4678  DeadInsts.push_back(U);
4679  }
4680 
4681  ++NumDeleted;
4682  I->eraseFromParent();
4683  Changed = true;
4684  }
4685  return Changed;
4686 }
4687 
4688 /// Promote the allocas, using the best available technique.
4689 ///
4690 /// This attempts to promote whatever allocas have been identified as viable in
4691 /// the PromotableAllocas list. If that list is empty, there is nothing to do.
4692 /// This function returns whether any promotion occurred.
4693 bool SROA::promoteAllocas(Function &F) {
4694  if (PromotableAllocas.empty())
4695  return false;
4696 
4697  NumPromoted += PromotableAllocas.size();
4698 
4699  LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
4700  PromoteMemToReg(PromotableAllocas, *DT, AC);
4701  PromotableAllocas.clear();
4702  return true;
4703 }
4704 
4705 PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
4706  AssumptionCache &RunAC) {
4707  LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
4708  C = &F.getContext();
4709  DT = &RunDT;
4710  AC = &RunAC;
4711 
4712  BasicBlock &EntryBB = F.getEntryBlock();
4713  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
4714  I != E; ++I) {
4715  if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
4716  if (isa<ScalableVectorType>(AI->getAllocatedType())) {
4717  if (isAllocaPromotable(AI))
4718  PromotableAllocas.push_back(AI);
4719  } else {
4720  Worklist.insert(AI);
4721  }
4722  }
4723  }
4724 
4725  bool Changed = false;
4726  // A set of deleted alloca instruction pointers which should be removed from
4727  // the list of promotable allocas.
4728  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
4729 
4730  do {
4731  while (!Worklist.empty()) {
4732  Changed |= runOnAlloca(*Worklist.pop_back_val());
4733  Changed |= deleteDeadInstructions(DeletedAllocas);
4734 
4735  // Remove the deleted allocas from various lists so that we don't try to
4736  // continue processing them.
4737  if (!DeletedAllocas.empty()) {
4738  auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
4739  Worklist.remove_if(IsInSet);
4740  PostPromotionWorklist.remove_if(IsInSet);
4741  llvm::erase_if(PromotableAllocas, IsInSet);
4742  DeletedAllocas.clear();
4743  }
4744  }
4745 
4746  Changed |= promoteAllocas(F);
4747 
4748  Worklist = PostPromotionWorklist;
4749  PostPromotionWorklist.clear();
4750  } while (!Worklist.empty());
4751 
4752  if (!Changed)
4753  return PreservedAnalyses::all();
4754 
4755  PreservedAnalyses PA;
4756  PA.preserveSet<CFGAnalyses>();
4757  return PA;
4758 }
4759 
4760 PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
4761  return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
4762  AM.getResult<AssumptionAnalysis>(F));
4763 }
4764 
4765 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
4766 ///
4767 /// This is in the llvm namespace purely to allow it to be a friend of the \c
4768 /// SROA pass.
4769 class llvm::sroa::SROALegacyPass : public FunctionPass {
4770  /// The SROA implementation.
4771  SROA Impl;
4772 
4773 public:
4774  static char ID;
4775 
4776  SROALegacyPass() : FunctionPass(ID) {
4777  initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
4778  }
4779 
4780  bool runOnFunction(Function &F) override {
4781  if (skipFunction(F))
4782  return false;
4783 
4784  auto PA = Impl.runImpl(
4785  F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
4786  getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
4787  return !PA.areAllPreserved();
4788  }
4789 
4790  void getAnalysisUsage(AnalysisUsage &AU) const override {
4791  AU.addRequired<AssumptionCacheTracker>();
4792  AU.addRequired<DominatorTreeWrapperPass>();
4793  AU.addPreserved<GlobalsAAWrapperPass>();
4794  AU.setPreservesCFG();
4795  }
4796 
4797  StringRef getPassName() const override { return "SROA"; }
4798 };
4799 
4800 char SROALegacyPass::ID = 0;
4801 
4803 FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
4804 INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
4805  "Scalar Replacement Of Aggregates", false, false)
4806 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
4807 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
4808 INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
4809  false, false)
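For context on how the two entry points above are reached: SROA::run is what the new pass manager invokes, while createSROAPass and SROALegacyPass serve the legacy pass manager. The following is a minimal, hedged sketch of scheduling the pass from client code (not part of SROA.cpp; addSROAToPipeline is a hypothetical helper name), using the FunctionPassManager from llvm/IR/PassManager.h and the SROA pass class declared in llvm/Transforms/Scalar/SROA.h.

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/SROA.h"

// Hypothetical helper: add SROA to a new-pass-manager function pipeline.
// FPM.addPass(SROA()) ends up calling SROA::run(F, AM) shown above.
void addSROAToPipeline(llvm::FunctionPassManager &FPM) {
  FPM.addPass(llvm::SROA());
}

// With the legacy pass manager, clients instead call llvm::createSROAPass()
// (declared in llvm/Transforms/Scalar.h) and add the returned FunctionPass.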