SROA.cpp (LLVM 15.0.0git)
1 //===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This transformation implements the well known scalar replacement of
10 /// aggregates transformation. It tries to identify promotable elements of an
11 /// aggregate alloca, and promote them to registers. It will also try to
12 /// convert uses of an element (or set of elements) of an alloca into a vector
13 /// or bitfield-style integer scalar if appropriate.
14 ///
15 /// It works to do this with minimal slicing of the alloca so that regions
16 /// which are merely transferred in and out of external memory remain unchanged
17 /// and are not decomposed to scalar code.
18 ///
19 /// Because this also performs alloca promotion, it can be thought of as also
20 /// serving the purpose of SSA formation. The algorithm iterates on the
21 /// function until all opportunities for promotion have been realized.
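/// As an illustrative sketch (not part of the original file comment), given
///
///   %agg = alloca { i32, i32 }
///
/// where the two fields are only ever loaded and stored individually, the
/// pass conceptually splits %agg into two scalar allocas and then promotes
/// each of them to an SSA value, eliminating the memory traffic entirely.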
22 ///
23 //===----------------------------------------------------------------------===//
24 
25 #include "llvm/Transforms/Scalar/SROA.h"
26 #include "llvm/ADT/APInt.h"
27 #include "llvm/ADT/ArrayRef.h"
28 #include "llvm/ADT/DenseMap.h"
29 #include "llvm/ADT/PointerIntPair.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
33 #include "llvm/ADT/SmallPtrSet.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/Statistic.h"
36 #include "llvm/ADT/StringRef.h"
37 #include "llvm/ADT/Twine.h"
38 #include "llvm/ADT/iterator.h"
42 #include "llvm/Analysis/Loads.h"
43 #include "llvm/Analysis/PtrUseVisitor.h"
44 #include "llvm/Config/llvm-config.h"
45 #include "llvm/IR/BasicBlock.h"
46 #include "llvm/IR/Constant.h"
47 #include "llvm/IR/ConstantFolder.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DIBuilder.h"
50 #include "llvm/IR/DataLayout.h"
51 #include "llvm/IR/DebugInfo.h"
53 #include "llvm/IR/DerivedTypes.h"
54 #include "llvm/IR/Dominators.h"
55 #include "llvm/IR/Function.h"
56 #include "llvm/IR/GetElementPtrTypeIterator.h"
57 #include "llvm/IR/GlobalAlias.h"
58 #include "llvm/IR/IRBuilder.h"
59 #include "llvm/IR/InstVisitor.h"
60 #include "llvm/IR/Instruction.h"
61 #include "llvm/IR/Instructions.h"
62 #include "llvm/IR/IntrinsicInst.h"
63 #include "llvm/IR/LLVMContext.h"
64 #include "llvm/IR/Metadata.h"
65 #include "llvm/IR/Module.h"
66 #include "llvm/IR/Operator.h"
67 #include "llvm/IR/PassManager.h"
68 #include "llvm/IR/Type.h"
69 #include "llvm/IR/Use.h"
70 #include "llvm/IR/User.h"
71 #include "llvm/IR/Value.h"
72 #include "llvm/InitializePasses.h"
73 #include "llvm/Pass.h"
74 #include "llvm/Support/Casting.h"
75 #include "llvm/Support/CommandLine.h"
76 #include "llvm/Support/Compiler.h"
77 #include "llvm/Support/Debug.h"
80 #include "llvm/Transforms/Scalar.h"
81 #include "llvm/Transforms/Utils/Local.h"
82 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
83 #include <algorithm>
84 #include <cassert>
85 #include <cstddef>
86 #include <cstdint>
87 #include <cstring>
88 #include <iterator>
89 #include <string>
90 #include <tuple>
91 #include <utility>
92 #include <vector>
93 
94 using namespace llvm;
95 using namespace llvm::sroa;
96 
97 #define DEBUG_TYPE "sroa"
98 
99 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
100 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
101 STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
102 STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
103 STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
104 STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
105 STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
106 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
107 STATISTIC(NumDeleted, "Number of instructions deleted");
108 STATISTIC(NumVectorized, "Number of vectorized aggregates");
109 
110 /// Hidden option to experiment with completely strict handling of inbounds
111 /// GEPs.
112 static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
113  cl::Hidden);
114 
115 namespace {
116 
117 /// A custom IRBuilder inserter which prefixes all names, but only in
118 /// Assert builds.
119 class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
120  std::string Prefix;
121 
122  Twine getNameWithPrefix(const Twine &Name) const {
123  return Name.isTriviallyEmpty() ? Name : Prefix + Name;
124  }
125 
126 public:
127  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
128 
129  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
130  BasicBlock::iterator InsertPt) const override {
131  IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
132  InsertPt);
133  }
134 };
135 
136 /// Provide a type for IRBuilder that drops names in release builds.
137 using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
138 
139 /// A used slice of an alloca.
140 ///
141 /// This structure represents a slice of an alloca used by some instruction. It
142 /// stores both the begin and end offsets of this use, a pointer to the use
143 /// itself, and a flag indicating whether we can classify the use as splittable
144 /// or not when forming partitions of the alloca.
145 class Slice {
146  /// The beginning offset of the range.
147  uint64_t BeginOffset = 0;
148 
149  /// The ending offset, not included in the range.
150  uint64_t EndOffset = 0;
151 
152  /// Storage for both the use of this slice and whether it can be
153  /// split.
154  PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
155 
156 public:
157  Slice() = default;
158 
159  Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
160  : BeginOffset(BeginOffset), EndOffset(EndOffset),
161  UseAndIsSplittable(U, IsSplittable) {}
162 
163  uint64_t beginOffset() const { return BeginOffset; }
164  uint64_t endOffset() const { return EndOffset; }
165 
166  bool isSplittable() const { return UseAndIsSplittable.getInt(); }
167  void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
168 
169  Use *getUse() const { return UseAndIsSplittable.getPointer(); }
170 
171  bool isDead() const { return getUse() == nullptr; }
172  void kill() { UseAndIsSplittable.setPointer(nullptr); }
173 
174  /// Support for ordering ranges.
175  ///
176  /// This provides an ordering over ranges such that start offsets are
177  /// always increasing, and within equal start offsets, the end offsets are
178  /// decreasing. Thus the spanning range comes first in a cluster with the
179  /// same start position.
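 ///
 /// Illustrative example (not from the original comment): with this ordering,
 /// the slices [0,8) unsplittable, [0,4) splittable, and [4,8) splittable
 /// sort as [0,8), [0,4), [4,8); equal begin offsets cluster together and the
 /// unsplittable, spanning slice leads its cluster.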
180  bool operator<(const Slice &RHS) const {
181  if (beginOffset() < RHS.beginOffset())
182  return true;
183  if (beginOffset() > RHS.beginOffset())
184  return false;
185  if (isSplittable() != RHS.isSplittable())
186  return !isSplittable();
187  if (endOffset() > RHS.endOffset())
188  return true;
189  return false;
190  }
191 
192  /// Support comparison with a single offset to allow binary searches.
193  friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
194  uint64_t RHSOffset) {
195  return LHS.beginOffset() < RHSOffset;
196  }
197  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
198  const Slice &RHS) {
199  return LHSOffset < RHS.beginOffset();
200  }
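 // Illustrative note (not in the original source): these mixed-type
 // comparators are what allow, e.g., std::lower_bound over a sorted vector of
 // slices with a plain uint64_t offset as the search key.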
201 
202  bool operator==(const Slice &RHS) const {
203  return isSplittable() == RHS.isSplittable() &&
204  beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
205  }
206  bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
207 };
208 
209 } // end anonymous namespace
210 
211 /// Representation of the alloca slices.
212 ///
213 /// This class represents the slices of an alloca which are formed by its
214 /// various uses. If a pointer escapes, we can't fully build a representation
215 /// for the slices used and we reflect that in this structure. The uses are
216 /// stored, sorted by increasing beginning offset and with unsplittable slices
217 /// starting at a particular offset before splittable slices.
218 class llvm::sroa::AllocaSlices {
219 public:
220  /// Construct the slices of a particular alloca.
221  AllocaSlices(const DataLayout &DL, AllocaInst &AI);
222 
223  /// Test whether a pointer to the allocation escapes our analysis.
224  ///
225  /// If this is true, the slices are never fully built and should be
226  /// ignored.
227  bool isEscaped() const { return PointerEscapingInstr; }
228 
229  /// Support for iterating over the slices.
230  /// @{
231  using iterator = SmallVectorImpl<Slice>::iterator;
232  using range = iterator_range<iterator>;
233 
234  iterator begin() { return Slices.begin(); }
235  iterator end() { return Slices.end(); }
236 
237  using const_iterator = SmallVectorImpl<Slice>::const_iterator;
238  using const_range = iterator_range<const_iterator>;
239 
240  const_iterator begin() const { return Slices.begin(); }
241  const_iterator end() const { return Slices.end(); }
242  /// @}
243 
244  /// Erase a range of slices.
245  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
246 
247  /// Insert new slices for this alloca.
248  ///
249  /// This moves the slices into the alloca's slices collection, and re-sorts
250  /// everything so that the usual ordering properties of the alloca's slices
251  /// hold.
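 ///
 /// (Illustrative note, not in the original comment: only the newly appended
 /// tail is sorted and then merged with the already-sorted prefix, so
 /// inserting K slices into N existing ones costs roughly O(K log K + N)
 /// instead of a full re-sort.)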
252  void insert(ArrayRef<Slice> NewSlices) {
253  int OldSize = Slices.size();
254  Slices.append(NewSlices.begin(), NewSlices.end());
255  auto SliceI = Slices.begin() + OldSize;
256  llvm::sort(SliceI, Slices.end());
257  std::inplace_merge(Slices.begin(), SliceI, Slices.end());
258  }
259 
260  // Forward declare the iterator and range accessor for walking the
261  // partitions.
262  class partition_iterator;
263  iterator_range<partition_iterator> partitions();
264 
265  /// Access the dead users for this alloca.
266  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
267 
268  /// Access Uses that should be dropped if the alloca is promotable.
269  ArrayRef<Use *> getDeadUsesIfPromotable() const {
270  return DeadUseIfPromotable;
271  }
272 
273  /// Access the dead operands referring to this alloca.
274  ///
275  /// These are operands which cannot actually be used to refer to the
276  /// alloca as they are outside its range and the user doesn't correct for
277  /// that. These mostly consist of PHI node inputs and the like which we just
278  /// need to replace with undef.
279  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
280 
281 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
282  void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
283  void printSlice(raw_ostream &OS, const_iterator I,
284  StringRef Indent = "  ") const;
285  void printUse(raw_ostream &OS, const_iterator I,
286  StringRef Indent = "  ") const;
287  void print(raw_ostream &OS) const;
288  void dump(const_iterator I) const;
289  void dump() const;
290 #endif
291 
292 private:
293  template <typename DerivedT, typename RetT = void> class BuilderBase;
294  class SliceBuilder;
295 
296  friend class AllocaSlices::SliceBuilder;
297 
298 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
299  /// Handle to alloca instruction to simplify method interfaces.
300  AllocaInst &AI;
301 #endif
302 
303  /// The instruction responsible for this alloca not having a known set
304  /// of slices.
305  ///
306  /// When an instruction (potentially) escapes the pointer to the alloca, we
307  /// store a pointer to that here and abort trying to form slices of the
308  /// alloca. This will be null if the alloca slices are analyzed successfully.
309  Instruction *PointerEscapingInstr;
310 
311  /// The slices of the alloca.
312  ///
313  /// We store a vector of the slices formed by uses of the alloca here. This
314  /// vector is sorted by increasing begin offset, and then the unsplittable
315  /// slices before the splittable ones. See the Slice inner class for more
316  /// details.
317  SmallVector<Slice, 8> Slices;
318 
319  /// Instructions which will become dead if we rewrite the alloca.
320  ///
321  /// Note that these are not separated by slice. This is because we expect an
322  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
323  /// all these instructions can simply be removed and replaced with poison as
324  /// they come from outside of the allocated space.
325  SmallVector<Instruction *, 8> DeadUsers;
326 
327  /// Uses which will become dead if we can promote the alloca.
328  SmallVector<Use *, 8> DeadUseIfPromotable;
329 
330  /// Operands which will become dead if we rewrite the alloca.
331  ///
332  /// These are operands that in their particular use can be replaced with
333  /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
334  /// to PHI nodes and the like. They aren't entirely dead (there might be
335  /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
336  /// want to swap this particular input for poison to simplify the use lists of
337  /// the alloca.
338  SmallVector<Use *, 8> DeadOperands;
339 };
340 
341 /// A partition of the slices.
342 ///
343 /// An ephemeral representation for a range of slices which can be viewed as
344 /// a partition of the alloca. This range represents a span of the alloca's
345 /// memory which cannot be split, and provides access to all of the slices
346 /// overlapping some part of the partition.
347 ///
348 /// Objects of this type are produced by traversing the alloca's slices, but
349 /// are only ephemeral and not persistent.
350 class llvm::sroa::Partition {
351 private:
352  friend class AllocaSlices;
353  friend class AllocaSlices::partition_iterator;
354 
355  using iterator = AllocaSlices::iterator;
356 
357  /// The beginning and ending offsets of the alloca for this
358  /// partition.
359  uint64_t BeginOffset = 0, EndOffset = 0;
360 
361  /// The start and end iterators of this partition.
362  iterator SI, SJ;
363 
364  /// A collection of split slice tails overlapping the partition.
365  SmallVector<Slice *, 4> SplitTails;
366 
367  /// Raw constructor builds an empty partition starting and ending at
368  /// the given iterator.
369  Partition(iterator SI) : SI(SI), SJ(SI) {}
370 
371 public:
372  /// The start offset of this partition.
373  ///
374  /// All of the contained slices start at or after this offset.
375  uint64_t beginOffset() const { return BeginOffset; }
376 
377  /// The end offset of this partition.
378  ///
379  /// All of the contained slices end at or before this offset.
380  uint64_t endOffset() const { return EndOffset; }
381 
382  /// The size of the partition.
383  ///
384  /// Note that this can never be zero.
385  uint64_t size() const {
386  assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
387  return EndOffset - BeginOffset;
388  }
389 
390  /// Test whether this partition contains no slices, and merely spans
391  /// a region occupied by split slices.
392  bool empty() const { return SI == SJ; }
393 
394  /// \name Iterate slices that start within the partition.
395  /// These may be splittable or unsplittable. They have a begin offset >= the
396  /// partition begin offset.
397  /// @{
398  // FIXME: We should probably define a "concat_iterator" helper and use that
399  // to stitch together pointee_iterators over the split tails and the
400  // contiguous iterators of the partition. That would give a much nicer
401  // interface here. We could then additionally expose filtered iterators for
402  // split, unsplit, and unsplittable slices based on the usage patterns.
403  iterator begin() const { return SI; }
404  iterator end() const { return SJ; }
405  /// @}
406 
407  /// Get the sequence of split slice tails.
408  ///
409  /// These tails are of slices which start before this partition but are
410  /// split and overlap into the partition. We accumulate these while forming
411  /// partitions.
412  ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
413 };
414 
415 /// An iterator over partitions of the alloca's slices.
416 ///
417 /// This iterator implements the core algorithm for partitioning the alloca's
418 /// slices. It is a forward iterator as we don't support backtracking for
419 /// efficiency reasons, and re-use a single storage area to maintain the
420 /// current set of split slices.
421 ///
422 /// It is templated on the slice iterator type to use so that it can operate
423 /// with either const or non-const slice iterators.
424 class AllocaSlices::partition_iterator
425  : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
426  Partition> {
427  friend class AllocaSlices;
428 
429  /// Most of the state for walking the partitions is held in a class
430  /// with a nice interface for examining them.
431  Partition P;
432 
433  /// We need to keep the end of the slices to know when to stop.
434  AllocaSlices::iterator SE;
435 
436  /// We also need to keep track of the maximum split end offset seen.
437  /// FIXME: Do we really?
438  uint64_t MaxSplitSliceEndOffset = 0;
439 
440  /// Sets the partition to be empty at given iterator, and sets the
441  /// end iterator.
442  partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
443  : P(SI), SE(SE) {
444  // If not already at the end, advance our state to form the initial
445  // partition.
446  if (SI != SE)
447  advance();
448  }
449 
450  /// Advance the iterator to the next partition.
451  ///
452  /// Requires that the iterator not be at the end of the slices.
453  void advance() {
454  assert((P.SI != SE || !P.SplitTails.empty()) &&
455  "Cannot advance past the end of the slices!");
456 
457  // Clear out any split uses which have ended.
458  if (!P.SplitTails.empty()) {
459  if (P.EndOffset >= MaxSplitSliceEndOffset) {
460  // If we've finished all splits, this is easy.
461  P.SplitTails.clear();
462  MaxSplitSliceEndOffset = 0;
463  } else {
464  // Remove the uses which have ended in the prior partition. This
465  // cannot change the max split slice end because we just checked that
466  // the prior partition ended prior to that max.
467  llvm::erase_if(P.SplitTails,
468  [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
469  assert(llvm::any_of(P.SplitTails,
470  [&](Slice *S) {
471  return S->endOffset() == MaxSplitSliceEndOffset;
472  }) &&
473  "Could not find the current max split slice offset!");
474  assert(llvm::all_of(P.SplitTails,
475  [&](Slice *S) {
476  return S->endOffset() <= MaxSplitSliceEndOffset;
477  }) &&
478  "Max split slice end offset is not actually the max!");
479  }
480  }
481 
482  // If P.SI is already at the end, then we've cleared the split tail and
483  // now have an end iterator.
484  if (P.SI == SE) {
485  assert(P.SplitTails.empty() && "Failed to clear the split slices!");
486  return;
487  }
488 
489  // If we had a non-empty partition previously, set up the state for
490  // subsequent partitions.
491  if (P.SI != P.SJ) {
492  // Accumulate all the splittable slices which started in the old
493  // partition into the split list.
494  for (Slice &S : P)
495  if (S.isSplittable() && S.endOffset() > P.EndOffset) {
496  P.SplitTails.push_back(&S);
497  MaxSplitSliceEndOffset =
498  std::max(S.endOffset(), MaxSplitSliceEndOffset);
499  }
500 
501  // Start from the end of the previous partition.
502  P.SI = P.SJ;
503 
504  // If P.SI is now at the end, we at most have a tail of split slices.
505  if (P.SI == SE) {
506  P.BeginOffset = P.EndOffset;
507  P.EndOffset = MaxSplitSliceEndOffset;
508  return;
509  }
510 
511  // If we have split slices and the next slice is after a gap and is
512  // not splittable, immediately form an empty partition for the split
513  // slices up until the next slice begins.
514  if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
515  !P.SI->isSplittable()) {
516  P.BeginOffset = P.EndOffset;
517  P.EndOffset = P.SI->beginOffset();
518  return;
519  }
520  }
521 
522  // OK, we need to consume new slices. Set the end offset based on the
523  // current slice, and step SJ past it. The beginning offset of the
524  // partition is the beginning offset of the next slice unless we have
525  // pre-existing split slices that are continuing, in which case we begin
526  // at the prior end offset.
527  P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
528  P.EndOffset = P.SI->endOffset();
529  ++P.SJ;
530 
531  // There are two strategies to form a partition based on whether the
532  // partition starts with an unsplittable slice or a splittable slice.
533  if (!P.SI->isSplittable()) {
534  // When we're forming an unsplittable region, it must always start at
535  // the first slice and will extend through its end.
536  assert(P.BeginOffset == P.SI->beginOffset());
537 
538  // Form a partition including all of the overlapping slices with this
539  // unsplittable slice.
540  while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
541  if (!P.SJ->isSplittable())
542  P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
543  ++P.SJ;
544  }
545 
546  // We have a partition across a set of overlapping unsplittable
547  // slices.
548  return;
549  }
550 
551  // If we're starting with a splittable slice, then we need to form
552  // a synthetic partition spanning it and any other overlapping splittable
553  // slices.
554  assert(P.SI->isSplittable() && "Forming a splittable partition!");
555 
556  // Collect all of the overlapping splittable slices.
557  while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
558  P.SJ->isSplittable()) {
559  P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
560  ++P.SJ;
561  }
562 
563  // Back up P.EndOffset if we ended the span early when encountering an
564  // unsplittable slice. This synthesizes the early end offset of
565  // a partition spanning only splittable slices.
566  if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
567  assert(!P.SJ->isSplittable());
568  P.EndOffset = P.SJ->beginOffset();
569  }
570  }
571 
572 public:
573  bool operator==(const partition_iterator &RHS) const {
574  assert(SE == RHS.SE &&
575  "End iterators don't match between compared partition iterators!");
576 
577  // The observed positions of partitions are marked by the P.SI iterator and
578  // the emptiness of the split slices. The latter is only relevant when
579  // P.SI == SE, as the end iterator will additionally have an empty split
580  // slices list, but the prior may have the same P.SI and a tail of split
581  // slices.
582  if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
583  assert(P.SJ == RHS.P.SJ &&
584  "Same set of slices formed two different sized partitions!");
585  assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
586  "Same slice position with differently sized non-empty split "
587  "slice tails!");
588  return true;
589  }
590  return false;
591  }
592 
593  partition_iterator &operator++() {
594  advance();
595  return *this;
596  }
597 
598  Partition &operator*() { return P; }
599 };
600 
601 /// A forward range over the partitions of the alloca's slices.
602 ///
603 /// This accesses an iterator range over the partitions of the alloca's
604 /// slices. It computes these partitions on the fly based on the overlapping
605 /// offsets of the slices and the ability to split them. It will visit "empty"
606 /// partitions to cover regions of the alloca only accessed via split
607 /// slices.
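///
/// Illustrative example (not part of the original comment): for the slices
/// [0,8) splittable and [4,6) unsplittable, the visited partitions are
/// [0,4) (the splittable head), [4,6) (forced by the unsplittable slice), and
/// finally the "empty" partition [6,8), which is covered only by the split
/// tail of the [0,8) slice.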
608 iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
609  return make_range(partition_iterator(begin(), end()),
610  partition_iterator(end(), end()));
611 }
612 
613 static Value *foldSelectInst(SelectInst &SI) {
614  // If the condition being selected on is a constant or the same value is
615  // being selected between, fold the select. Yes this does (rarely) happen
616  // early on.
617  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
618  return SI.getOperand(1 + CI->isZero());
619  if (SI.getOperand(1) == SI.getOperand(2))
620  return SI.getOperand(1);
621 
622  return nullptr;
623 }
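// Illustrative example (not in the original source):
//   select i1 true, ptr %a, ptr %b   --> %a   (constant condition)
//   select i1 %c,   ptr %a, ptr %a   --> %a   (identical operands)
// Any other select yields nullptr here.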
624 
625 /// A helper that folds a PHI node or a select.
626 static Value *foldPHINodeOrSelectInst(Instruction &I) {
627  if (PHINode *PN = dyn_cast<PHINode>(&I)) {
628  // If PN merges together the same value, return that value.
629  return PN->hasConstantValue();
630  }
631  return foldSelectInst(cast<SelectInst>(I));
632 }
633 
634 /// Builder for the alloca slices.
635 ///
636 /// This class builds a set of alloca slices by recursively visiting the uses
637 /// of an alloca and making a slice for each load and store at each offset.
638 class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
639  friend class PtrUseVisitor<SliceBuilder>;
640  friend class InstVisitor<SliceBuilder>;
641 
642  using Base = PtrUseVisitor<SliceBuilder>;
643 
644  const uint64_t AllocSize;
645  AllocaSlices &AS;
646 
647  SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
648  SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
649 
650  /// Set to de-duplicate dead instructions found in the use walk.
651  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
652 
653 public:
654  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
655  : PtrUseVisitor<SliceBuilder>(DL),
656  AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()),
657  AS(AS) {}
658 
659 private:
660  void markAsDead(Instruction &I) {
661  if (VisitedDeadInsts.insert(&I).second)
662  AS.DeadUsers.push_back(&I);
663  }
664 
665  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
666  bool IsSplittable = false) {
667  // Completely skip uses which have a zero size or start either before or
668  // past the end of the allocation.
669  if (Size == 0 || Offset.uge(AllocSize)) {
670  LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
671  << Offset
672  << " which has zero size or starts outside of the "
673  << AllocSize << " byte alloca:\n"
674  << " alloca: " << AS.AI << "\n"
675  << " use: " << I << "\n");
676  return markAsDead(I);
677  }
678 
679  uint64_t BeginOffset = Offset.getZExtValue();
680  uint64_t EndOffset = BeginOffset + Size;
681 
682  // Clamp the end offset to the end of the allocation. Note that this is
683  // formulated to handle even the case where "BeginOffset + Size" overflows.
684  // This may appear superficially to be something we could ignore entirely,
685  // but that is not so! There may be widened loads or PHI-node uses where
686  // some instructions are dead but not others. We can't completely ignore
687  // them, and so have to record at least the information here.
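 // (Illustrative, not in the original comment: an 8-byte use starting at
 // offset 4 of an 8-byte alloca is recorded as the clamped slice [4,8)
 // rather than being dropped.)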
688  assert(AllocSize >= BeginOffset); // Established above.
689  if (Size > AllocSize - BeginOffset) {
690  LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
691  << Offset << " to remain within the " << AllocSize
692  << " byte alloca:\n"
693  << " alloca: " << AS.AI << "\n"
694  << " use: " << I << "\n");
695  EndOffset = AllocSize;
696  }
697 
698  AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
699  }
700 
701  void visitBitCastInst(BitCastInst &BC) {
702  if (BC.use_empty())
703  return markAsDead(BC);
704 
705  return Base::visitBitCastInst(BC);
706  }
707 
708  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
709  if (ASC.use_empty())
710  return markAsDead(ASC);
711 
712  return Base::visitAddrSpaceCastInst(ASC);
713  }
714 
715  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
716  if (GEPI.use_empty())
717  return markAsDead(GEPI);
718 
719  if (SROAStrictInbounds && GEPI.isInBounds()) {
720  // FIXME: This is a manually un-factored variant of the basic code inside
721  // of GEPs with checking of the inbounds invariant specified in the
722  // langref in a very strict sense. If we ever want to enable
723  // SROAStrictInbounds, this code should be factored cleanly into
724  // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
725  // by writing out the code here where we have the underlying allocation
726  // size readily available.
727  APInt GEPOffset = Offset;
728  const DataLayout &DL = GEPI.getModule()->getDataLayout();
729  for (gep_type_iterator GTI = gep_type_begin(GEPI),
730  GTE = gep_type_end(GEPI);
731  GTI != GTE; ++GTI) {
732  ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
733  if (!OpC)
734  break;
735 
736  // Handle a struct index, which adds its field offset to the pointer.
737  if (StructType *STy = GTI.getStructTypeOrNull()) {
738  unsigned ElementIdx = OpC->getZExtValue();
739  const StructLayout *SL = DL.getStructLayout(STy);
740  GEPOffset +=
741  APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
742  } else {
743  // For array or vector indices, scale the index by the size of the
744  // type.
745  APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
746  GEPOffset +=
747  Index *
748  APInt(Offset.getBitWidth(),
749  DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
750  }
751 
752  // If this index has computed an intermediate pointer which is not
753  // inbounds, then the result of the GEP is a poison value and we can
754  // delete it and all uses.
755  if (GEPOffset.ugt(AllocSize))
756  return markAsDead(GEPI);
757  }
758  }
759 
760  return Base::visitGetElementPtrInst(GEPI);
761  }
762 
763  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
764  uint64_t Size, bool IsVolatile) {
765  // We allow splitting of non-volatile loads and stores where the type is an
766  // integer type. These may be used to implement 'memcpy' or other "transfer
767  // of bits" patterns.
768  bool IsSplittable =
769  Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
770 
771  insertUse(I, Offset, Size, IsSplittable);
772  }
773 
774  void visitLoadInst(LoadInst &LI) {
775  assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
776  "All simple FCA loads should have been pre-split");
777 
778  if (!IsOffsetKnown)
779  return PI.setAborted(&LI);
780 
781  if (LI.isVolatile() &&
782  LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
783  return PI.setAborted(&LI);
784 
785  if (isa<ScalableVectorType>(LI.getType()))
786  return PI.setAborted(&LI);
787 
788  uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize();
789  return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
790  }
791 
792  void visitStoreInst(StoreInst &SI) {
793  Value *ValOp = SI.getValueOperand();
794  if (ValOp == *U)
795  return PI.setEscapedAndAborted(&SI);
796  if (!IsOffsetKnown)
797  return PI.setAborted(&SI);
798 
799  if (SI.isVolatile() &&
800  SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
801  return PI.setAborted(&SI);
802 
803  if (isa<ScalableVectorType>(ValOp->getType()))
804  return PI.setAborted(&SI);
805 
806  uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize();
807 
808  // If this memory access can be shown to *statically* extend outside the
809  // bounds of the allocation, its behavior is undefined, so simply
810  // ignore it. Note that this is more strict than the generic clamping
811  // behavior of insertUse. We also try to handle cases which might run the
812  // risk of overflow.
813  // FIXME: We should instead consider the pointer to have escaped if this
814  // function is being instrumented for addressing bugs or race conditions.
815  if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
816  LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
817  << Offset << " which extends past the end of the "
818  << AllocSize << " byte alloca:\n"
819  << " alloca: " << AS.AI << "\n"
820  << " use: " << SI << "\n");
821  return markAsDead(SI);
822  }
823 
824  assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
825  "All simple FCA stores should have been pre-split");
826  handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
827  }
828 
829  void visitMemSetInst(MemSetInst &II) {
830  assert(II.getRawDest() == *U && "Pointer use is not the destination?");
831  ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
832  if ((Length && Length->getValue() == 0) ||
833  (IsOffsetKnown && Offset.uge(AllocSize)))
834  // Zero-length mem transfer intrinsics can be ignored entirely.
835  return markAsDead(II);
836 
837  if (!IsOffsetKnown)
838  return PI.setAborted(&II);
839 
840  // Don't replace this with a store with a different address space. TODO:
841  // Use a store with the casted new alloca?
842  if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace())
843  return PI.setAborted(&II);
844 
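 // (Editorial note, not in the original source: a constant-length memset
 // forms a splittable slice of exactly that many bytes, while an unknown
 // length is treated as covering the rest of the alloca and is unsplittable.)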
845  insertUse(II, Offset, Length ? Length->getLimitedValue()
846  : AllocSize - Offset.getLimitedValue(),
847  (bool)Length);
848  }
849 
850  void visitMemTransferInst(MemTransferInst &II) {
851  ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
852  if (Length && Length->getValue() == 0)
853  // Zero-length mem transfer intrinsics can be ignored entirely.
854  return markAsDead(II);
855 
856  // Because we can visit these intrinsics twice, also check to see if the
857  // first time marked this instruction as dead. If so, skip it.
858  if (VisitedDeadInsts.count(&II))
859  return;
860 
861  if (!IsOffsetKnown)
862  return PI.setAborted(&II);
863 
864  // Don't replace this with a load/store with a different address space.
865  // TODO: Use a store with the casted new alloca?
866  if (II.isVolatile() &&
867  (II.getDestAddressSpace() != DL.getAllocaAddrSpace() ||
868  II.getSourceAddressSpace() != DL.getAllocaAddrSpace()))
869  return PI.setAborted(&II);
870 
871  // This side of the transfer is completely out-of-bounds, and so we can
872  // nuke the entire transfer. However, we also need to nuke the other side
873  // if already added to our partitions.
874  // FIXME: Yet another place we really should bypass this when
875  // instrumenting for ASan.
876  if (Offset.uge(AllocSize)) {
877  SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
878  MemTransferSliceMap.find(&II);
879  if (MTPI != MemTransferSliceMap.end())
880  AS.Slices[MTPI->second].kill();
881  return markAsDead(II);
882  }
883 
884  uint64_t RawOffset = Offset.getLimitedValue();
885  uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
886 
887  // Check for the special case where the same exact value is used for both
888  // source and dest.
889  if (*U == II.getRawDest() && *U == II.getRawSource()) {
890  // For non-volatile transfers this is a no-op.
891  if (!II.isVolatile())
892  return markAsDead(II);
893 
894  return insertUse(II, Offset, Size, /*IsSplittable=*/false);
895  }
896 
897  // If we have seen both source and destination for a mem transfer, then
898  // they both point to the same alloca.
899  bool Inserted;
900  SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
901  std::tie(MTPI, Inserted) =
902  MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
903  unsigned PrevIdx = MTPI->second;
904  if (!Inserted) {
905  Slice &PrevP = AS.Slices[PrevIdx];
906 
907  // Check if the begin offsets match and this is a non-volatile transfer.
908  // In that case, we can completely elide the transfer.
909  if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
910  PrevP.kill();
911  return markAsDead(II);
912  }
913 
914  // Otherwise we have an offset transfer within the same alloca. We can't
915  // split those.
916  PrevP.makeUnsplittable();
917  }
918 
919  // Insert the use now that we've fixed up the splittable nature.
920  insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
921 
922  // Check that we ended up with a valid index in the map.
923  assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
924  "Map index doesn't point back to a slice with this user.");
925  }
926 
927  // Disable SRoA for any intrinsics except for lifetime invariants and
928  // invariant group.
929  // FIXME: What about debug intrinsics? This matches old behavior, but
930  // doesn't make sense.
931  void visitIntrinsicInst(IntrinsicInst &II) {
932  if (II.isDroppable()) {
933  AS.DeadUseIfPromotable.push_back(U);
934  return;
935  }
936 
937  if (!IsOffsetKnown)
938  return PI.setAborted(&II);
939 
940  if (II.isLifetimeStartOrEnd()) {
941  ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
942  uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
943  Length->getLimitedValue());
944  insertUse(II, Offset, Size, true);
945  return;
946  }
947 
948  if (II.isLaunderOrStripInvariantGroup()) {
949  enqueueUsers(II);
950  return;
951  }
952 
953  Base::visitIntrinsicInst(II);
954  }
955 
956  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
957  // We consider any PHI or select that results in a direct load or store of
958  // the same offset to be a viable use for slicing purposes. These uses
959  // are considered unsplittable and the size is the maximum loaded or stored
960  // size.
961  SmallPtrSet<Instruction *, 4> Visited;
962  SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
963  Visited.insert(Root);
964  Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
965  const DataLayout &DL = Root->getModule()->getDataLayout();
966  // If there are no loads or stores, the access is dead. We mark that as
967  // a size zero access.
968  Size = 0;
969  do {
970  Instruction *I, *UsedI;
971  std::tie(UsedI, I) = Uses.pop_back_val();
972 
973  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
974  Size = std::max(Size,
975  DL.getTypeStoreSize(LI->getType()).getFixedSize());
976  continue;
977  }
978  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
979  Value *Op = SI->getOperand(0);
980  if (Op == UsedI)
981  return SI;
982  Size = std::max(Size,
983  DL.getTypeStoreSize(Op->getType()).getFixedSize());
984  continue;
985  }
986 
987  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
988  if (!GEP->hasAllZeroIndices())
989  return GEP;
990  } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
991  !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
992  return I;
993  }
994 
995  for (User *U : I->users())
996  if (Visited.insert(cast<Instruction>(U)).second)
997  Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
998  } while (!Uses.empty());
999 
1000  return nullptr;
1001  }
1002 
1003  void visitPHINodeOrSelectInst(Instruction &I) {
1004  assert(isa<PHINode>(I) || isa<SelectInst>(I));
1005  if (I.use_empty())
1006  return markAsDead(I);
1007 
1008  // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1009  // instructions in this BB, which may be required during rewriting. Bail out
1010  // on these cases.
1011  if (isa<PHINode>(I) &&
1012  I.getParent()->getFirstInsertionPt() == I.getParent()->end())
1013  return PI.setAborted(&I);
1014 
1015  // TODO: We could use SimplifyInstruction here to fold PHINodes and
1016  // SelectInsts. However, doing so requires to change the current
1017  // dead-operand-tracking mechanism. For instance, suppose neither loading
1018  // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1019  // trap either. However, if we simply replace %U with undef using the
1020  // current dead-operand-tracking mechanism, "load (select undef, undef,
1021  // %other)" may trap because the select may return the first operand
1022  // "undef".
1023  if (Value *Result = foldPHINodeOrSelectInst(I)) {
1024  if (Result == *U)
1025  // If the result of the constant fold will be the pointer, recurse
1026  // through the PHI/select as if we had RAUW'ed it.
1027  enqueueUsers(I);
1028  else
1029  // Otherwise the operand to the PHI/select is dead, and we can replace
1030  // it with poison.
1031  AS.DeadOperands.push_back(U);
1032 
1033  return;
1034  }
1035 
1036  if (!IsOffsetKnown)
1037  return PI.setAborted(&I);
1038 
1039  // See if we already have computed info on this node.
1040  uint64_t &Size = PHIOrSelectSizes[&I];
1041  if (!Size) {
1042  // This is a new PHI/Select, check for an unsafe use of it.
1043  if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1044  return PI.setAborted(UnsafeI);
1045  }
1046 
1047  // For PHI and select operands outside the alloca, we can't nuke the entire
1048  // phi or select -- the other side might still be relevant, so we special
1049  // case them here and use a separate structure to track the operands
1050  // themselves which should be replaced with poison.
1051  // FIXME: This should instead be escaped in the event we're instrumenting
1052  // for address sanitization.
1053  if (Offset.uge(AllocSize)) {
1054  AS.DeadOperands.push_back(U);
1055  return;
1056  }
1057 
1058  insertUse(I, Offset, Size);
1059  }
1060 
1061  void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1062 
1063  void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1064 
1065  /// Disable SROA entirely if there are unhandled users of the alloca.
1066  void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1067 };
1068 
1069 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1070  :
1071 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1072  AI(AI),
1073 #endif
1074  PointerEscapingInstr(nullptr) {
1075  SliceBuilder PB(DL, AI, *this);
1076  SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1077  if (PtrI.isEscaped() || PtrI.isAborted()) {
1078  // FIXME: We should sink the escape vs. abort info into the caller nicely,
1079  // possibly by just storing the PtrInfo in the AllocaSlices.
1080  PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1081  : PtrI.getAbortingInst();
1082  assert(PointerEscapingInstr && "Did not track a bad instruction");
1083  return;
1084  }
1085 
1086  llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1087 
1088  // Sort the uses. This arranges for the offsets to be in ascending order,
1089  // and the sizes to be in descending order.
1090  llvm::stable_sort(Slices);
1091 }
1092 
1093 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1094 
1095 void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1096  StringRef Indent) const {
1097  printSlice(OS, I, Indent);
1098  OS << "\n";
1099  printUse(OS, I, Indent);
1100 }
1101 
1102 void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1103  StringRef Indent) const {
1104  OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1105  << " slice #" << (I - begin())
1106  << (I->isSplittable() ? " (splittable)" : "");
1107 }
1108 
1109 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1110  StringRef Indent) const {
1111  OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1112 }
1113 
1114 void AllocaSlices::print(raw_ostream &OS) const {
1115  if (PointerEscapingInstr) {
1116  OS << "Can't analyze slices for alloca: " << AI << "\n"
1117  << " A pointer to this alloca escaped by:\n"
1118  << " " << *PointerEscapingInstr << "\n";
1119  return;
1120  }
1121 
1122  OS << "Slices of alloca: " << AI << "\n";
1123  for (const_iterator I = begin(), E = end(); I != E; ++I)
1124  print(OS, I);
1125 }
1126 
1127 LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1128  print(dbgs(), I);
1129 }
1130 LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1131 
1132 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1133 
1134 /// Walk the range of a partitioning looking for a common type to cover this
1135 /// sequence of slices.
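///
/// Illustrative example (not part of the original comment): if one slice
/// covering the whole partition is loaded as float and another as i32, no
/// common type is found (the first result is null), but i32 is still
/// reported as the largest integer type used (the second result).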
1136 static std::pair<Type *, IntegerType *>
1137 findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1138  uint64_t EndOffset) {
1139  Type *Ty = nullptr;
1140  bool TyIsCommon = true;
1141  IntegerType *ITy = nullptr;
1142 
1143  // Note that we need to look at *every* alloca slice's Use to ensure we
1144  // always get consistent results regardless of the order of slices.
1145  for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1146  Use *U = I->getUse();
1147  if (isa<IntrinsicInst>(*U->getUser()))
1148  continue;
1149  if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1150  continue;
1151 
1152  Type *UserTy = nullptr;
1153  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1154  UserTy = LI->getType();
1155  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1156  UserTy = SI->getValueOperand()->getType();
1157  }
1158 
1159  if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1160  // If the type is larger than the partition, skip it. We only encounter
1161  // this for split integer operations where we want to use the type of the
1162  // entity causing the split. Also skip if the type is not a byte width
1163  // multiple.
1164  if (UserITy->getBitWidth() % 8 != 0 ||
1165  UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1166  continue;
1167 
1168  // Track the largest bitwidth integer type used in this way in case there
1169  // is no common type.
1170  if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1171  ITy = UserITy;
1172  }
1173 
1174  // To avoid depending on the order of slices, Ty and TyIsCommon must not
1175  // depend on types skipped above.
1176  if (!UserTy || (Ty && Ty != UserTy))
1177  TyIsCommon = false; // Give up on anything but an iN type.
1178  else
1179  Ty = UserTy;
1180  }
1181 
1182  return {TyIsCommon ? Ty : nullptr, ITy};
1183 }
1184 
1185 /// PHI instructions that use an alloca and are subsequently loaded can be
1186 /// rewritten to load both input pointers in the pred blocks and then PHI the
1187 /// results, allowing the load of the alloca to be promoted.
1188 /// From this:
1189 /// %P2 = phi [i32* %Alloca, i32* %Other]
1190 /// %V = load i32* %P2
1191 /// to:
1192 /// %V1 = load i32* %Alloca -> will be mem2reg'd
1193 /// ...
1194 /// %V2 = load i32* %Other
1195 /// ...
1196 /// %V = phi [i32 %V1, i32 %V2]
1197 ///
1198 /// We can do this to a select if its only uses are loads and if the operands
1199 /// to the select can be loaded unconditionally.
1200 ///
1201 /// FIXME: This should be hoisted into a generic utility, likely in
1202 /// Transforms/Util/Local.h
1203 static bool isSafePHIToSpeculate(PHINode &PN) {
1204  const DataLayout &DL = PN.getModule()->getDataLayout();
1205 
1206  // For now, we can only do this promotion if the load is in the same block
1207  // as the PHI, and if there are no stores between the phi and load.
1208  // TODO: Allow recursive phi users.
1209  // TODO: Allow stores.
1210  BasicBlock *BB = PN.getParent();
1211  Align MaxAlign;
1212  uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1213  APInt MaxSize(APWidth, 0);
1214  bool HaveLoad = false;
1215  for (User *U : PN.users()) {
1216  LoadInst *LI = dyn_cast<LoadInst>(U);
1217  if (!LI || !LI->isSimple())
1218  return false;
1219 
1220  // For now we only allow loads in the same block as the PHI. This is
1221  // a common case that happens when instcombine merges two loads through
1222  // a PHI.
1223  if (LI->getParent() != BB)
1224  return false;
1225 
1226  // Ensure that there are no instructions between the PHI and the load that
1227  // could store.
1228  for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1229  if (BBI->mayWriteToMemory())
1230  return false;
1231 
1232  uint64_t Size = DL.getTypeStoreSize(LI->getType()).getFixedSize();
1233  MaxAlign = std::max(MaxAlign, LI->getAlign());
1234  MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize;
1235  HaveLoad = true;
1236  }
1237 
1238  if (!HaveLoad)
1239  return false;
1240 
1241  // We can only transform this if it is safe to push the loads into the
1242  // predecessor blocks. The only thing to watch out for is that we can't put
1243  // a possibly trapping load in the predecessor if it is a critical edge.
1244  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1245  Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1246  Value *InVal = PN.getIncomingValue(Idx);
1247 
1248  // If the value is produced by the terminator of the predecessor (an
1249  // invoke) or it has side-effects, there is no valid place to put a load
1250  // in the predecessor.
1251  if (TI == InVal || TI->mayHaveSideEffects())
1252  return false;
1253 
1254  // If the predecessor has a single successor, then the edge isn't
1255  // critical.
1256  if (TI->getNumSuccessors() == 1)
1257  continue;
1258 
1259  // If this pointer is always safe to load, or if we can prove that there
1260  // is already a load in the block, then we can move the load to the pred
1261  // block.
1262  if (isSafeToLoadUnconditionally(InVal, MaxAlign, MaxSize, DL, TI))
1263  continue;
1264 
1265  return false;
1266  }
1267 
1268  return true;
1269 }
1270 
1271 static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1272  LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1273 
1274  LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1275  Type *LoadTy = SomeLoad->getType();
1276  IRB.SetInsertPoint(&PN);
1277  PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1278  PN.getName() + ".sroa.speculated");
1279 
1280  // Get the AA tags and alignment to use from one of the loads. It does not
1281  // matter which one we get, or whether any of them differ.
1282  AAMDNodes AATags = SomeLoad->getAAMetadata();
1283  Align Alignment = SomeLoad->getAlign();
1284 
1285  // Rewrite all loads of the PN to use the new PHI.
1286  while (!PN.use_empty()) {
1287  LoadInst *LI = cast<LoadInst>(PN.user_back());
1288  LI->replaceAllUsesWith(NewPN);
1289  LI->eraseFromParent();
1290  }
1291 
1292  // Inject loads into all of the pred blocks.
1293  DenseMap<BasicBlock*, Value*> InjectedLoads;
1294  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1295  BasicBlock *Pred = PN.getIncomingBlock(Idx);
1296  Value *InVal = PN.getIncomingValue(Idx);
1297 
1298  // A PHI node is allowed to have multiple (duplicated) entries for the same
1299  // basic block, as long as the value is the same. So if we already injected
1300  // a load in the predecessor, then we should reuse the same load for all
1301  // duplicated entries.
1302  if (Value* V = InjectedLoads.lookup(Pred)) {
1303  NewPN->addIncoming(V, Pred);
1304  continue;
1305  }
1306 
1307  Instruction *TI = Pred->getTerminator();
1308  IRB.SetInsertPoint(TI);
1309 
1310  LoadInst *Load = IRB.CreateAlignedLoad(
1311  LoadTy, InVal, Alignment,
1312  (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1313  ++NumLoadsSpeculated;
1314  if (AATags)
1315  Load->setAAMetadata(AATags);
1316  NewPN->addIncoming(Load, Pred);
1317  InjectedLoads[Pred] = Load;
1318  }
1319 
1320  LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1321  PN.eraseFromParent();
1322 }
1323 
1324 /// Select instructions that use an alloca and are subsequently loaded can be
1325 /// rewritten to load both input pointers and then select between the result,
1326 /// allowing the load of the alloca to be promoted.
1327 /// From this:
1328 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
1329 /// %V = load i32* %P2
1330 /// to:
1331 /// %V1 = load i32* %Alloca -> will be mem2reg'd
1332 /// %V2 = load i32* %Other
1333 /// %V = select i1 %cond, i32 %V1, i32 %V2
1334 ///
1335 /// We can do this to a select if its only uses are loads and if the operand
1336 /// to the select can be loaded unconditionally. If an intervening bitcast
1337 /// with a single use (the load) is found, the promotion is still allowed.
1338 static bool isSafeSelectToSpeculate(SelectInst &SI) {
1339  Value *TValue = SI.getTrueValue();
1340  Value *FValue = SI.getFalseValue();
1341  const DataLayout &DL = SI.getModule()->getDataLayout();
1342 
1343  for (User *U : SI.users()) {
1344  LoadInst *LI;
1345  BitCastInst *BC = dyn_cast<BitCastInst>(U);
1346  if (BC && BC->hasOneUse())
1347  LI = dyn_cast<LoadInst>(*BC->user_begin());
1348  else
1349  LI = dyn_cast<LoadInst>(U);
1350 
1351  if (!LI || !LI->isSimple())
1352  return false;
1353 
1354  // Both operands to the select need to be dereferenceable, either
1355  // absolutely (e.g. allocas) or at this point because we can see other
1356  // accesses to it.
1357  if (!isSafeToLoadUnconditionally(TValue, LI->getType(),
1358  LI->getAlign(), DL, LI))
1359  return false;
1360  if (!isSafeToLoadUnconditionally(FValue, LI->getType(),
1361  LI->getAlign(), DL, LI))
1362  return false;
1363  }
1364 
1365  return true;
1366 }
1367 
1368 static void speculateSelectInstLoads(IRBuilderTy &IRB, SelectInst &SI) {
1369  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
1370 
1371  IRB.SetInsertPoint(&SI);
1372  Value *TV = SI.getTrueValue();
1373  Value *FV = SI.getFalseValue();
1374  // Replace the loads of the select with a select of two loads.
1375  while (!SI.use_empty()) {
1376  LoadInst *LI;
1377  BitCastInst *BC = dyn_cast<BitCastInst>(SI.user_back());
1378  if (BC) {
1379  assert(BC->hasOneUse() && "Bitcast should have a single use.");
1380  LI = cast<LoadInst>(BC->user_back());
1381  } else {
1382  LI = cast<LoadInst>(SI.user_back());
1383  }
1384 
1385  assert(LI->isSimple() && "We only speculate simple loads");
1386 
1387  IRB.SetInsertPoint(LI);
1388  Value *NewTV =
1389  BC ? IRB.CreateBitCast(TV, BC->getType(), TV->getName() + ".sroa.cast")
1390  : TV;
1391  Value *NewFV =
1392  BC ? IRB.CreateBitCast(FV, BC->getType(), FV->getName() + ".sroa.cast")
1393  : FV;
1394  LoadInst *TL = IRB.CreateLoad(LI->getType(), NewTV,
1395  LI->getName() + ".sroa.speculate.load.true");
1396  LoadInst *FL = IRB.CreateLoad(LI->getType(), NewFV,
1397  LI->getName() + ".sroa.speculate.load.false");
1398  NumLoadsSpeculated += 2;
1399 
1400  // Transfer alignment and AA info if present.
1401  TL->setAlignment(LI->getAlign());
1402  FL->setAlignment(LI->getAlign());
1403 
1404  AAMDNodes Tags = LI->getAAMetadata();
1405  if (Tags) {
1406  TL->setAAMetadata(Tags);
1407  FL->setAAMetadata(Tags);
1408  }
1409 
1410  Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1411  LI->getName() + ".sroa.speculated");
1412 
1413  LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1414  LI->replaceAllUsesWith(V);
1415  LI->eraseFromParent();
1416  if (BC)
1417  BC->eraseFromParent();
1418  }
1419  SI.eraseFromParent();
1420 }
1421 
1422 /// Build a GEP out of a base pointer and indices.
1423 ///
1424 /// This will return the BasePtr if that is valid, or build a new GEP
1425 /// instruction using the IRBuilder if GEP-ing is needed.
1426 static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
1427  SmallVectorImpl<Value *> &Indices,
1428  const Twine &NamePrefix) {
1429  if (Indices.empty())
1430  return BasePtr;
1431 
1432  // A single zero index is a no-op, so check for this and avoid building a GEP
1433  // in that case.
1434  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
1435  return BasePtr;
1436 
1437  // buildGEP() is only called for non-opaque pointers.
1438  return IRB.CreateInBoundsGEP(
1439  BasePtr->getType()->getNonOpaquePointerElementType(), BasePtr, Indices,
1440  NamePrefix + "sroa_idx");
1441 }
1442 
1443 /// Get a natural GEP off of the BasePtr walking through Ty toward
1444 /// TargetTy without changing the offset of the pointer.
1445 ///
1446 /// This routine assumes we've already established a properly offset GEP with
1447 /// Indices, and arrived at the Ty type. The goal is to continue to GEP with
1448 /// zero-indices down through type layers until we find one the same as
1449 /// TargetTy. If we can't find one with the same type, we at least try to use
1450 /// one with the same size. If none of that works, we just produce the GEP as
1451 /// indicated by Indices to have the correct offset.
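///
/// Illustrative example (not part of the original comment): walking from
/// Ty = { [4 x i32], i8 } toward TargetTy = i32 appends the zero indices
/// 0, 0 so that the resulting GEP points at the first i32 element.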
1452 static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
1453  Value *BasePtr, Type *Ty, Type *TargetTy,
1454  SmallVectorImpl<Value *> &Indices,
1455  const Twine &NamePrefix) {
1456  if (Ty == TargetTy)
1457  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
1458 
1459  // Offset size to use for the indices.
1460  unsigned OffsetSize = DL.getIndexTypeSizeInBits(BasePtr->getType());
1461 
1462  // See if we can descend into a struct and locate a field with the correct
1463  // type.
1464  unsigned NumLayers = 0;
1465  Type *ElementTy = Ty;
1466  do {
1467  if (ElementTy->isPointerTy())
1468  break;
1469 
1470  if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
1471  ElementTy = ArrayTy->getElementType();
1472  Indices.push_back(IRB.getIntN(OffsetSize, 0));
1473  } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
1474  ElementTy = VectorTy->getElementType();
1475  Indices.push_back(IRB.getInt32(0));
1476  } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
1477  if (STy->element_begin() == STy->element_end())
1478  break; // Nothing left to descend into.
1479  ElementTy = *STy->element_begin();
1480  Indices.push_back(IRB.getInt32(0));
1481  } else {
1482  break;
1483  }
1484  ++NumLayers;
1485  } while (ElementTy != TargetTy);
1486  if (ElementTy != TargetTy)
1487  Indices.erase(Indices.end() - NumLayers, Indices.end());
1488 
1489  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
1490 }
1491 
1492 /// Get a natural GEP from a base pointer to a particular offset and
1493 /// resulting in a particular type.
1494 ///
1495 /// The goal is to produce a "natural" looking GEP that works with the existing
1496 /// composite types to arrive at the appropriate offset and element type for
1497 /// a pointer. TargetTy is the element type the returned GEP should point-to if
1498 /// possible. We recurse by decreasing Offset, adding the appropriate index to
1499 /// Indices, and setting Ty to the result subtype.
1500 ///
1501 /// If no natural GEP can be constructed, this function returns null.
1502 static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
1503  Value *Ptr, APInt Offset, Type *TargetTy,
1504  SmallVectorImpl<Value *> &Indices,
1505  const Twine &NamePrefix) {
1506  PointerType *Ty = cast<PointerType>(Ptr->getType());
1507 
1508  // Don't consider any GEPs through an i8* as natural unless the TargetTy is
1509  // an i8.
1510  if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
1511  return nullptr;
1512 
1513  Type *ElementTy = Ty->getNonOpaquePointerElementType();
1514  if (!ElementTy->isSized())
1515  return nullptr; // We can't GEP through an unsized element.
1516 
1517  SmallVector<APInt> IntIndices = DL.getGEPIndicesForOffset(ElementTy, Offset);
1518  if (Offset != 0)
1519  return nullptr;
1520 
1521  for (const APInt &Index : IntIndices)
1522  Indices.push_back(IRB.getInt(Index));
1523  return getNaturalGEPWithType(IRB, DL, Ptr, ElementTy, TargetTy, Indices,
1524  NamePrefix);
1525 }
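// [Editorial note -- not part of the original SROA.cpp source.] A hedged
// sketch of the offset decomposition above: for an element type of
// { i32, i32, [4 x i16] } (16 bytes) and Offset = 10, getGEPIndicesForOffset
// returns the indices [0, 2, 1] and leaves Offset == 0, i.e. element 1 of the
// trailing i16 array. Had the byte offset landed in the middle of an i16
// (say Offset = 11), a non-zero remainder would be left behind and this
// routine would return nullptr, deferring to the raw i8 fallback in
// getAdjustedPtr below.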
1526 
1527 /// Compute an adjusted pointer from Ptr by Offset bytes where the
1528 /// resulting pointer has PointerTy.
1529 ///
1530 /// This tries very hard to compute a "natural" GEP which arrives at the offset
1531 /// and produces the pointer type desired. Where it cannot, it will try to use
1532 /// the natural GEP to arrive at the offset and bitcast to the type. Where that
1533 /// fails, it will try to use an existing i8* and GEP to the byte offset and
1534 /// bitcast to the type.
1535 ///
1536 /// The strategy for finding the more natural GEPs is to peel off layers of the
1537 /// pointer, walking back through bit casts and GEPs, searching for a base
1538 /// pointer from which we can compute a natural GEP with the desired
1539 /// properties. The algorithm tries to fold as many constant indices into
1540 /// a single GEP as possible, thus making each GEP more independent of the
1541 /// surrounding code.
1542 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1543  APInt Offset, Type *PointerTy,
1544  const Twine &NamePrefix) {
1545  // Create i8 GEP for opaque pointers.
1546  if (Ptr->getType()->isOpaquePointerTy()) {
1547  if (Offset != 0)
1548  Ptr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(Offset),
1549  NamePrefix + "sroa_idx");
1550  return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1551  NamePrefix + "sroa_cast");
1552  }
1553 
1554  // Even though we don't look through PHI nodes, we could be called on an
1555  // instruction in an unreachable block, which may be on a cycle.
1556  SmallPtrSet<Value *, 4> Visited;
1557  Visited.insert(Ptr);
1558  SmallVector<Value *, 4> Indices;
1559 
1560  // We may end up computing an offset pointer that has the wrong type. If we
1561  // never are able to compute one directly that has the correct type, we'll
1562  // fall back to it, so keep it and the base it was computed from around here.
1563  Value *OffsetPtr = nullptr;
1564  Value *OffsetBasePtr;
1565 
1566  // Remember any i8 pointer we come across to re-use if we need to do a raw
1567  // byte offset.
1568  Value *Int8Ptr = nullptr;
1569  APInt Int8PtrOffset(Offset.getBitWidth(), 0);
1570 
1571  PointerType *TargetPtrTy = cast<PointerType>(PointerTy);
1572  Type *TargetTy = TargetPtrTy->getNonOpaquePointerElementType();
1573 
1574  // As `addrspacecast` is not always a no-op cast, `Ptr` (the storage pointer)
1575  // may have a different address space from the expected `PointerTy` (the
1576  // pointer to be used). Adjust the pointer type based on the original storage pointer.
1577  auto AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
1578  PointerTy = TargetTy->getPointerTo(AS);
1579 
1580  do {
1581  // First fold any existing GEPs into the offset.
1582  while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
1583  APInt GEPOffset(Offset.getBitWidth(), 0);
1584  if (!GEP->accumulateConstantOffset(DL, GEPOffset))
1585  break;
1586  Offset += GEPOffset;
1587  Ptr = GEP->getPointerOperand();
1588  if (!Visited.insert(Ptr).second)
1589  break;
1590  }
1591 
1592  // See if we can perform a natural GEP here.
1593  Indices.clear();
1594  if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
1595  Indices, NamePrefix)) {
1596  // If we have a new natural pointer at the offset, clear out any old
1597  // offset pointer we computed. Unless it is the base pointer or
1598  // a non-instruction, we built a GEP we don't need. Zap it.
1599  if (OffsetPtr && OffsetPtr != OffsetBasePtr)
1600  if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
1601  assert(I->use_empty() && "Built a GEP with uses somehow!");
1602  I->eraseFromParent();
1603  }
1604  OffsetPtr = P;
1605  OffsetBasePtr = Ptr;
1606  // If we also found a pointer of the right type, we're done.
1607  if (P->getType() == PointerTy)
1608  break;
1609  }
1610 
1611  // Stash this pointer if we've found an i8*.
1612  if (Ptr->getType()->isIntegerTy(8)) {
1613  Int8Ptr = Ptr;
1614  Int8PtrOffset = Offset;
1615  }
1616 
1617  // Peel off a layer of the pointer and update the offset appropriately.
1618  if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
1619  Ptr = cast<Operator>(Ptr)->getOperand(0);
1620  } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
1621  if (GA->isInterposable())
1622  break;
1623  Ptr = GA->getAliasee();
1624  } else {
1625  break;
1626  }
1627  assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
1628  } while (Visited.insert(Ptr).second);
1629 
1630  if (!OffsetPtr) {
1631  if (!Int8Ptr) {
1632  Int8Ptr = IRB.CreateBitCast(
1633  Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
1634  NamePrefix + "sroa_raw_cast");
1635  Int8PtrOffset = Offset;
1636  }
1637 
1638  OffsetPtr = Int8PtrOffset == 0
1639  ? Int8Ptr
1640  : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
1641  IRB.getInt(Int8PtrOffset),
1642  NamePrefix + "sroa_raw_idx");
1643  }
1644  Ptr = OffsetPtr;
1645 
1646  // On the off chance we were targeting i8*, guard the bitcast here.
1647  if (cast<PointerType>(Ptr->getType()) != TargetPtrTy) {
1648  Ptr = IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr,
1649  TargetPtrTy,
1650  NamePrefix + "sroa_cast");
1651  }
1652 
1653  return Ptr;
1654 }
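// [Editorial note -- not part of the original SROA.cpp source.] Sketch of the
// fallback path above, under the assumption that no natural GEP of the right
// type can be built: starting from `{ i64 }* %src` with Offset = 4 and
// PointerTy = i32*, the rewriter emits roughly
//   %raw  = bitcast { i64 }* %src to i8*                ; "...sroa_raw_cast"
//   %idx  = getelementptr inbounds i8, i8* %raw, i64 4  ; "...sroa_raw_idx"
//   %cast = bitcast i8* %idx to i32*                    ; "...sroa_cast"
// whereas a base whose type naturally contains an i32 at that offset would
// instead get a single getelementptr from the natural-GEP path.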
1655 
1656 /// Compute the adjusted alignment for a load or store from an offset.
1657 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1658  return commonAlignment(getLoadStoreAlignment(I), Offset);
1659 }
1660 
1661 /// Test whether we can convert a value from the old to the new type.
1662 ///
1663 /// This predicate should be used to guard calls to convertValue in order to
1664 /// ensure that we only try to convert viable values. The strategy is that we
1665 /// will peel off single element struct and array wrappings to get to an
1666 /// underlying value, and convert that value.
1667 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
1668  if (OldTy == NewTy)
1669  return true;
1670 
1671  // For integer types, we can't handle any bit-width differences. This would
1672  // break both vector conversions with extension and introduce endianness
1673  // issues when in conjunction with loads and stores.
1674  if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1675  assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1676  cast<IntegerType>(NewTy)->getBitWidth() &&
1677  "We can't have the same bitwidth for different int types");
1678  return false;
1679  }
1680 
1681  if (DL.getTypeSizeInBits(NewTy).getFixedSize() !=
1682  DL.getTypeSizeInBits(OldTy).getFixedSize())
1683  return false;
1684  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1685  return false;
1686 
1687  // We can convert pointers to integers and vice-versa. Same for vectors
1688  // of pointers and integers.
1689  OldTy = OldTy->getScalarType();
1690  NewTy = NewTy->getScalarType();
1691  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1692  if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1693  unsigned OldAS = OldTy->getPointerAddressSpace();
1694  unsigned NewAS = NewTy->getPointerAddressSpace();
1695  // Convert pointers if they are pointers from the same address space or
1696  // different integral (not non-integral) address spaces with the same
1697  // pointer size.
1698  return OldAS == NewAS ||
1699  (!DL.isNonIntegralAddressSpace(OldAS) &&
1700  !DL.isNonIntegralAddressSpace(NewAS) &&
1701  DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1702  }
1703 
1704  // We can convert integers to integral pointers, but not to non-integral
1705  // pointers.
1706  if (OldTy->isIntegerTy())
1707  return !DL.isNonIntegralPointerType(NewTy);
1708 
1709  // We can convert integral pointers to integers, but non-integral pointers
1710  // need to remain pointers.
1711  if (!DL.isNonIntegralPointerType(OldTy))
1712  return NewTy->isIntegerTy();
1713 
1714  return false;
1715  }
1716 
1717  return true;
1718 }
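// [Editorial note -- not part of the original SROA.cpp source.] Hedged
// examples of what the predicate above accepts and rejects, assuming 64-bit
// pointers in integral address spaces:
//   i64       <-> double    : convertible (same fixed size, single values)
//   i64       <-> i8*       : convertible (integer <-> integral pointer)
//   <2 x i32> <-> i64       : convertible (both 64 bits wide)
//   i32       <-> i64       : rejected (integer bit-widths differ)
//   i64       <-> [2 x i32] : rejected (an array is not a single value type)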
1719 
1720 /// Generic routine to convert an SSA value to a value of a different
1721 /// type.
1722 ///
1723 /// This will try various different casting techniques, such as bitcasts,
1724 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
1725 /// two types for viability with this routine.
1726 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
1727  Type *NewTy) {
1728  Type *OldTy = V->getType();
1729  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
1730 
1731  if (OldTy == NewTy)
1732  return V;
1733 
1734  assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
1735  "Integer types must be the exact same to convert.");
1736 
1737  // See if we need inttoptr for this type pair. May require additional bitcast.
1738  if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
1739  // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
1740  // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
1741  // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
1742  // Directly handle i64 to i8*
1743  return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
1744  NewTy);
1745  }
1746 
1747  // See if we need ptrtoint for this type pair. May require additional bitcast.
1748  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
1749  // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
1750  // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
1751  // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
1752  // Expand i8* to i64 --> i8* to i64 to i64
1753  return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
1754  NewTy);
1755  }
1756 
1757  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
1758  unsigned OldAS = OldTy->getPointerAddressSpace();
1759  unsigned NewAS = NewTy->getPointerAddressSpace();
1760  // To convert pointers with different address spaces (they have already been
1761  // checked convertible, i.e. they have the same pointer size), we cannot use
1762  // `bitcast` (which requires the same address space) or `addrspacecast`
1763  // (which is not always a no-op cast). Instead, use a pair of no-op
1764  // `ptrtoint`/`inttoptr` casts through an integer with the same bit
1765  // size.
1766  if (OldAS != NewAS) {
1767  assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1768  return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
1769  NewTy);
1770  }
1771  }
1772 
1773  return IRB.CreateBitCast(V, NewTy);
1774 }
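// [Editorial note -- not part of the original SROA.cpp source.] A sketch of
// the cross-address-space branch above, assuming addrspace(1) and addrspace(0)
// pointers of equal width: converting `i8 addrspace(1)* %v` to `i32*` is
// emitted as the no-op pair
//   %int = ptrtoint i8 addrspace(1)* %v to i64
//   %ptr = inttoptr i64 %int to i32*
// rather than as an addrspacecast, which need not be a no-op on every target.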
1775 
1776 /// Test whether the given slice use can be promoted to a vector.
1777 ///
1778 /// This function is called to test each entry in a partition which is slated
1779 /// for a single slice.
1780 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
1781  VectorType *Ty,
1782  uint64_t ElementSize,
1783  const DataLayout &DL) {
1784  // First validate the slice offsets.
1785  uint64_t BeginOffset =
1786  std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
1787  uint64_t BeginIndex = BeginOffset / ElementSize;
1788  if (BeginIndex * ElementSize != BeginOffset ||
1789  BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
1790  return false;
1791  uint64_t EndOffset =
1792  std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
1793  uint64_t EndIndex = EndOffset / ElementSize;
1794  if (EndIndex * ElementSize != EndOffset ||
1795  EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
1796  return false;
1797 
1798  assert(EndIndex > BeginIndex && "Empty vector!");
1799  uint64_t NumElements = EndIndex - BeginIndex;
1800  Type *SliceTy = (NumElements == 1)
1801  ? Ty->getElementType()
1802  : FixedVectorType::get(Ty->getElementType(), NumElements);
1803 
1804  Type *SplitIntTy =
1805  Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
1806 
1807  Use *U = S.getUse();
1808 
1809  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
1810  if (MI->isVolatile())
1811  return false;
1812  if (!S.isSplittable())
1813  return false; // Skip any unsplittable intrinsics.
1814  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
1815  if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
1816  return false;
1817  } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1818  if (LI->isVolatile())
1819  return false;
1820  Type *LTy = LI->getType();
1821  // Disable vector promotion when there are loads or stores of an FCA.
1822  if (LTy->isStructTy())
1823  return false;
1824  if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
1825  assert(LTy->isIntegerTy());
1826  LTy = SplitIntTy;
1827  }
1828  if (!canConvertValue(DL, SliceTy, LTy))
1829  return false;
1830  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1831  if (SI->isVolatile())
1832  return false;
1833  Type *STy = SI->getValueOperand()->getType();
1834  // Disable vector promotion when there are loads or stores of an FCA.
1835  if (STy->isStructTy())
1836  return false;
1837  if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
1838  assert(STy->isIntegerTy());
1839  STy = SplitIntTy;
1840  }
1841  if (!canConvertValue(DL, STy, SliceTy))
1842  return false;
1843  } else {
1844  return false;
1845  }
1846 
1847  return true;
1848 }
1849 
1850 /// Test whether the given alloca partitioning and range of slices can be
1851 /// promoted to a vector.
1852 ///
1853 /// This is a quick test to check whether we can rewrite a particular alloca
1854 /// partition (and its newly formed alloca) into a vector alloca with only
1855 /// whole-vector loads and stores such that it could be promoted to a vector
1856 /// SSA value. We only can ensure this for a limited set of operations, and we
1857 /// don't want to do the rewrites unless we are confident that the result will
1858 /// be promotable, so we have an early test here.
1859 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
1860  // Collect the candidate types for vector-based promotion. Also track whether
1861  // we have different element types.
1862  SmallVector<VectorType *, 4> CandidateTys;
1863  Type *CommonEltTy = nullptr;
1864  bool HaveCommonEltTy = true;
1865  auto CheckCandidateType = [&](Type *Ty) {
1866  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1867  // Discard all collected candidates if this vector type differs from them in total size in bits.
1868  if (!CandidateTys.empty()) {
1869  VectorType *V = CandidateTys[0];
1870  if (DL.getTypeSizeInBits(VTy).getFixedSize() !=
1871  DL.getTypeSizeInBits(V).getFixedSize()) {
1872  CandidateTys.clear();
1873  return;
1874  }
1875  }
1876  CandidateTys.push_back(VTy);
1877  if (!CommonEltTy)
1878  CommonEltTy = VTy->getElementType();
1879  else if (CommonEltTy != VTy->getElementType())
1880  HaveCommonEltTy = false;
1881  }
1882  };
1883  // Consider any loads or stores that are the exact size of the slice.
1884  for (const Slice &S : P)
1885  if (S.beginOffset() == P.beginOffset() &&
1886  S.endOffset() == P.endOffset()) {
1887  if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
1888  CheckCandidateType(LI->getType());
1889  else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
1890  CheckCandidateType(SI->getValueOperand()->getType());
1891  }
1892 
1893  // If we didn't find a vector type, nothing to do here.
1894  if (CandidateTys.empty())
1895  return nullptr;
1896 
1897  // Remove non-integer vector types if we had multiple common element types.
1898  // FIXME: It'd be nice to replace them with integer vector types, but we can't
1899  // do that until all the backends are known to produce good code for all
1900  // integer vector types.
1901  if (!HaveCommonEltTy) {
1902  llvm::erase_if(CandidateTys, [](VectorType *VTy) {
1903  return !VTy->getElementType()->isIntegerTy();
1904  });
1905 
1906  // If there were no integer vector types, give up.
1907  if (CandidateTys.empty())
1908  return nullptr;
1909 
1910  // Rank the remaining candidate vector types. This is easy because we know
1911  // they're all integer vectors. We sort by ascending number of elements.
1912  auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
1913  (void)DL;
1914  assert(DL.getTypeSizeInBits(RHSTy).getFixedSize() ==
1915  DL.getTypeSizeInBits(LHSTy).getFixedSize() &&
1916  "Cannot have vector types of different sizes!");
1917  assert(RHSTy->getElementType()->isIntegerTy() &&
1918  "All non-integer types eliminated!");
1919  assert(LHSTy->getElementType()->isIntegerTy() &&
1920  "All non-integer types eliminated!");
1921  return cast<FixedVectorType>(RHSTy)->getNumElements() <
1922  cast<FixedVectorType>(LHSTy)->getNumElements();
1923  };
1924  llvm::sort(CandidateTys, RankVectorTypes);
1925  CandidateTys.erase(
1926  std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
1927  CandidateTys.end());
1928  } else {
1929 // The only way to have the same element type in every vector type is to
1930 // have the same vector type. Check that and remove all but one.
1931 #ifndef NDEBUG
1932  for (VectorType *VTy : CandidateTys) {
1933  assert(VTy->getElementType() == CommonEltTy &&
1934  "Unaccounted for element type!");
1935  assert(VTy == CandidateTys[0] &&
1936  "Different vector types with the same element type!");
1937  }
1938 #endif
1939  CandidateTys.resize(1);
1940  }
1941 
1942  // Try each vector type, and return the one which works.
1943  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
1944  uint64_t ElementSize =
1945  DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
1946 
1947  // While the definition of LLVM vectors is bitpacked, we don't support sizes
1948  // that aren't byte sized.
1949  if (ElementSize % 8)
1950  return false;
1951  assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
1952  "vector size not a multiple of element size?");
1953  ElementSize /= 8;
1954 
1955  for (const Slice &S : P)
1956  if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
1957  return false;
1958 
1959  for (const Slice *S : P.splitSliceTails())
1960  if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
1961  return false;
1962 
1963  return true;
1964  };
1965  for (VectorType *VTy : CandidateTys)
1966  if (CheckVectorTypeForPromotion(VTy))
1967  return VTy;
1968 
1969  return nullptr;
1970 }
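// [Editorial note -- not part of the original SROA.cpp source.] Hedged
// example of the candidate selection above: if a 16-byte partition is accessed
// only through full-width <4 x float> loads and full-width <4 x i32> stores,
// the element types differ, so the non-integer candidate (<4 x float>) is
// dropped and <4 x i32> is returned, provided every slice also passes
// isVectorPromotionViableForSlice (e.g. any partial access covers a multiple
// of the 4-byte element size and is convertible to the matching subvector).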
1971 
1972 /// Test whether a slice of an alloca is valid for integer widening.
1973 ///
1974 /// This implements the necessary checking for the \c isIntegerWideningViable
1975 /// test below on a single slice of the alloca.
1976 static bool isIntegerWideningViableForSlice(const Slice &S,
1977  uint64_t AllocBeginOffset,
1978  Type *AllocaTy,
1979  const DataLayout &DL,
1980  bool &WholeAllocaOp) {
1981  uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
1982 
1983  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
1984  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
1985 
1986  Use *U = S.getUse();
1987 
1988  // Lifetime intrinsics operate over the whole alloca, whose size is usually
1989  // larger than other load/store slices (RelEnd > Size). But lifetime intrinsics
1990  // are always promotable and should not impact the promotability of the
1991  // partition's other slices.
1992  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
1993  if (II->isLifetimeStartOrEnd() || II->isDroppable())
1994  return true;
1995  }
1996 
1997  // We can't reasonably handle cases where the load or store extends past
1998  // the end of the alloca's type and into its padding.
1999  if (RelEnd > Size)
2000  return false;
2001 
2002  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2003  if (LI->isVolatile())
2004  return false;
2005  // We can't handle loads that extend past the allocated memory.
2006  if (DL.getTypeStoreSize(LI->getType()).getFixedSize() > Size)
2007  return false;
2008  // So far, AllocaSliceRewriter does not support widening split slice tails
2009  // in rewriteIntegerLoad.
2010  if (S.beginOffset() < AllocBeginOffset)
2011  return false;
2012  // Note that we don't count vector loads or stores as whole-alloca
2013  // operations which enable integer widening because we would prefer to use
2014  // vector widening instead.
2015  if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2016  WholeAllocaOp = true;
2017  if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2018  if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
2019  return false;
2020  } else if (RelBegin != 0 || RelEnd != Size ||
2021  !canConvertValue(DL, AllocaTy, LI->getType())) {
2022  // Non-integer loads need to be convertible from the alloca type so that
2023  // they are promotable.
2024  return false;
2025  }
2026  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2027  Type *ValueTy = SI->getValueOperand()->getType();
2028  if (SI->isVolatile())
2029  return false;
2030  // We can't handle stores that extend past the allocated memory.
2031  if (DL.getTypeStoreSize(ValueTy).getFixedSize() > Size)
2032  return false;
2033  // So far, AllocaSliceRewriter does not support widening split slice tails
2034  // in rewriteIntegerStore.
2035  if (S.beginOffset() < AllocBeginOffset)
2036  return false;
2037  // Note that we don't count vector loads or stores as whole-alloca
2038  // operations which enable integer widening because we would prefer to use
2039  // vector widening instead.
2040  if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2041  WholeAllocaOp = true;
2042  if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2043  if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
2044  return false;
2045  } else if (RelBegin != 0 || RelEnd != Size ||
2046  !canConvertValue(DL, ValueTy, AllocaTy)) {
2047  // Non-integer stores need to be convertible to the alloca type so that
2048  // they are promotable.
2049  return false;
2050  }
2051  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2052  if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2053  return false;
2054  if (!S.isSplittable())
2055  return false; // Skip any unsplittable intrinsics.
2056  } else {
2057  return false;
2058  }
2059 
2060  return true;
2061 }
2062 
2063 /// Test whether the given alloca partition's integer operations can be
2064 /// widened to promotable ones.
2065 ///
2066 /// This is a quick test to check whether we can rewrite the integer loads and
2067 /// stores to a particular alloca into wider loads and stores and be able to
2068 /// promote the resulting alloca.
2069 static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2070  const DataLayout &DL) {
2071  uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
2072  // Don't create integer types larger than the maximum bitwidth.
2073  if (SizeInBits > IntegerType::MAX_INT_BITS)
2074  return false;
2075 
2076  // Don't try to handle allocas with bit-padding.
2077  if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedSize())
2078  return false;
2079 
2080  // We need to ensure that an integer type with the appropriate bitwidth can
2081  // be converted to the alloca type, whatever that is. We don't want to force
2082  // the alloca itself to have an integer type if there is a more suitable one.
2083  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2084  if (!canConvertValue(DL, AllocaTy, IntTy) ||
2085  !canConvertValue(DL, IntTy, AllocaTy))
2086  return false;
2087 
2088  // While examining uses, we ensure that the alloca has a covering load or
2089  // store. We don't want to widen the integer operations only to fail to
2090  // promote due to some other unsplittable entry (which we may make splittable
2091  // later). However, if there are only splittable uses, go ahead and assume
2092  // that we cover the alloca.
2093  // FIXME: We shouldn't consider split slices that happen to start in the
2094  // partition here...
2095  bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2096 
2097  for (const Slice &S : P)
2098  if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2099  WholeAllocaOp))
2100  return false;
2101 
2102  for (const Slice *S : P.splitSliceTails())
2103  if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2104  WholeAllocaOp))
2105  return false;
2106 
2107  return WholeAllocaOp;
2108 }
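// [Editorial note -- not part of the original SROA.cpp source.] Sketch of the
// widening decision above: for an i64-sized alloca written by one full-width
// i64 store and read back through two i32 loads at byte offsets 0 and 4, the
// i64 store is a covering whole-alloca operation and every access stays within
// the 8 allocated bytes, so the partition is reported viable; each i32 load is
// later rewritten as a load of the whole i64 followed by a shift and truncate.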
2109 
2110 static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2111  IntegerType *Ty, uint64_t Offset,
2112  const Twine &Name) {
2113  LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2114  IntegerType *IntTy = cast<IntegerType>(V->getType());
2115  assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
2116  DL.getTypeStoreSize(IntTy).getFixedSize() &&
2117  "Element extends past full value");
2118  uint64_t ShAmt = 8 * Offset;
2119  if (DL.isBigEndian())
2120  ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
2121  DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
2122  if (ShAmt) {
2123  V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2124  LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2125  }
2126  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2127  "Cannot extract to a larger integer!");
2128  if (Ty != IntTy) {
2129  V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2130  LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2131  }
2132  return V;
2133 }
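// [Editorial note -- not part of the original SROA.cpp source.] Worked example
// of the shift math above: extracting an i16 at byte Offset 2 from an i64
// value uses ShAmt = 8 * 2 = 16 on a little-endian target but
// ShAmt = 8 * (8 - 2 - 2) = 32 on a big-endian one, because the element's
// bytes sit at the opposite end of the wide integer; the result is then
// truncated from i64 down to i16.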
2134 
2135 static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2136  Value *V, uint64_t Offset, const Twine &Name) {
2137  IntegerType *IntTy = cast<IntegerType>(Old->getType());
2138  IntegerType *Ty = cast<IntegerType>(V->getType());
2139  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2140  "Cannot insert a larger integer!");
2141  LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2142  if (Ty != IntTy) {
2143  V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2144  LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2145  }
2146  assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
2147  DL.getTypeStoreSize(IntTy).getFixedSize() &&
2148  "Element store outside of alloca store");
2149  uint64_t ShAmt = 8 * Offset;
2150  if (DL.isBigEndian())
2151  ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
2152  DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
2153  if (ShAmt) {
2154  V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2155  LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2156  }
2157 
2158  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2159  APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2160  Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2161  LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2162  V = IRB.CreateOr(Old, V, Name + ".insert");
2163  LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2164  }
2165  return V;
2166 }
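// [Editorial note -- not part of the original SROA.cpp source.] Worked example
// of the masking above: inserting an i16 at byte Offset 2 of an i64 on a
// little-endian target zero-extends the value to i64, shifts it left by 16,
// clears bits 16..31 of the old value with the mask
// ~(0x000000000000FFFF << 16) = 0xFFFFFFFF0000FFFF, and ORs the two halves
// together.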
2167 
2168 static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2169  unsigned EndIndex, const Twine &Name) {
2170  auto *VecTy = cast<FixedVectorType>(V->getType());
2171  unsigned NumElements = EndIndex - BeginIndex;
2172  assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2173 
2174  if (NumElements == VecTy->getNumElements())
2175  return V;
2176 
2177  if (NumElements == 1) {
2178  V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2179  Name + ".extract");
2180  LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2181  return V;
2182  }
2183 
2184  auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2185  V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2186  LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2187  return V;
2188 }
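// [Editorial note -- not part of the original SROA.cpp source.] Sketch of the
// shuffle above (value names are illustrative): pulling elements [1, 3) out of
// a <4 x i32> becomes roughly
//   %sub = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
// while a single-element extract uses extractelement instead.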
2189 
2190 static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2191  unsigned BeginIndex, const Twine &Name) {
2192  VectorType *VecTy = cast<VectorType>(Old->getType());
2193  assert(VecTy && "Can only insert a vector into a vector");
2194 
2195  VectorType *Ty = dyn_cast<VectorType>(V->getType());
2196  if (!Ty) {
2197  // Single element to insert.
2198  V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2199  Name + ".insert");
2200  LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2201  return V;
2202  }
2203 
2204  assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2205  cast<FixedVectorType>(VecTy)->getNumElements() &&
2206  "Too many elements!");
2207  if (cast<FixedVectorType>(Ty)->getNumElements() ==
2208  cast<FixedVectorType>(VecTy)->getNumElements()) {
2209  assert(V->getType() == VecTy && "Vector type mismatch");
2210  return V;
2211  }
2212  unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2213 
2214  // When inserting a smaller vector into the larger one to store, we first
2215  // use a shuffle vector to widen it with undef elements, and then
2216  // a vector select to choose between the loaded vector and the
2217  // incoming vector.
2218  SmallVector<int, 8> Mask;
2219  Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2220  for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2221  if (i >= BeginIndex && i < EndIndex)
2222  Mask.push_back(i - BeginIndex);
2223  else
2224  Mask.push_back(-1);
2225  V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2226  LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2227 
2228  SmallVector<Constant *, 8> Mask2;
2229  Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2230  for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2231  Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2232 
2233  V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
2234 
2235  LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2236  return V;
2237 }
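// [Editorial note -- not part of the original SROA.cpp source.] Sketch of the
// widen-then-blend above (value names are illustrative): inserting a <2 x i32>
// at index 1 of a <4 x i32> first widens the small vector,
//   %wide  = shufflevector <2 x i32> %v, <2 x i32> poison,
//                          <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
// and then keeps lanes 1 and 2 from %wide and the remaining lanes from the
// previously loaded value:
//   %blend = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>,
//                   <4 x i32> %wide, <4 x i32> %old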
2238 
2239 /// Visitor to rewrite instructions using a particular slice of an alloca
2240 /// to use a new alloca.
2241 ///
2242 /// Also implements the rewriting to vector-based accesses when the partition
2243 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2244 /// lives here.
2245 class llvm::sroa::AllocaSliceRewriter
2246  : public InstVisitor<AllocaSliceRewriter, bool> {
2247  // Befriend the base class so it can delegate to private visit methods.
2248  friend class InstVisitor<AllocaSliceRewriter, bool>;
2249 
2249 
2250  using Base = InstVisitor<AllocaSliceRewriter, bool>;
2251 
2252  const DataLayout &DL;
2253  AllocaSlices &AS;
2254  SROAPass &Pass;
2255  AllocaInst &OldAI, &NewAI;
2256  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2257  Type *NewAllocaTy;
2258 
2259  // This is a convenience and flag variable that will be null unless the new
2260  // alloca's integer operations should be widened to this integer type due to
2261  // passing isIntegerWideningViable above. If it is non-null, the desired
2262  // integer type will be stored here for easy access during rewriting.
2263  IntegerType *IntTy;
2264 
2265  // If we are rewriting an alloca partition which can be written as pure
2266  // vector operations, we stash extra information here. When VecTy is
2267  // non-null, we have some strict guarantees about the rewritten alloca:
2268  // - The new alloca is exactly the size of the vector type here.
2269  // - The accesses all either map to the entire vector or to a single
2270  // element.
2271  // - The set of accessing instructions is only one of those handled above
2272  // in isVectorPromotionViable. Generally these are the same access kinds
2273  // which are promotable via mem2reg.
2274  VectorType *VecTy;
2275  Type *ElementTy;
2276  uint64_t ElementSize;
2277 
2278  // The original offset of the slice currently being rewritten relative to
2279  // the original alloca.
2280  uint64_t BeginOffset = 0;
2281  uint64_t EndOffset = 0;
2282 
2283  // The new offsets of the slice currently being rewritten relative to the
2284  // original alloca.
2285  uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2286 
2287  uint64_t SliceSize = 0;
2288  bool IsSplittable = false;
2289  bool IsSplit = false;
2290  Use *OldUse = nullptr;
2291  Instruction *OldPtr = nullptr;
2292 
2293  // Track post-rewrite users which are PHI nodes and Selects.
2294  SmallSetVector<PHINode *, 8> &PHIUsers;
2295  SmallSetVector<SelectInst *, 8> &SelectUsers;
2296 
2297  // Utility IR builder, whose name prefix is setup for each visited use, and
2298  // the insertion point is set to point to the user.
2299  IRBuilderTy IRB;
2300 
2301 public:
2302  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROAPass &Pass,
2303  AllocaInst &OldAI, AllocaInst &NewAI,
2304  uint64_t NewAllocaBeginOffset,
2305  uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2306  VectorType *PromotableVecTy,
2307  SmallSetVector<PHINode *, 8> &PHIUsers,
2308  SmallSetVector<SelectInst *, 8> &SelectUsers)
2309  : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2310  NewAllocaBeginOffset(NewAllocaBeginOffset),
2311  NewAllocaEndOffset(NewAllocaEndOffset),
2312  NewAllocaTy(NewAI.getAllocatedType()),
2313  IntTy(
2314  IsIntegerPromotable
2315  ? Type::getIntNTy(NewAI.getContext(),
2316  DL.getTypeSizeInBits(NewAI.getAllocatedType())
2317  .getFixedSize())
2318  : nullptr),
2319  VecTy(PromotableVecTy),
2320  ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2321  ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8
2322  : 0),
2323  PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2324  IRB(NewAI.getContext(), ConstantFolder()) {
2325  if (VecTy) {
2326  assert((DL.getTypeSizeInBits(ElementTy).getFixedSize() % 8) == 0 &&
2327  "Only multiple-of-8 sized vector elements are viable");
2328  ++NumVectorized;
2329  }
2330  assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2331  }
2332 
2333  bool visit(AllocaSlices::const_iterator I) {
2334  bool CanSROA = true;
2335  BeginOffset = I->beginOffset();
2336  EndOffset = I->endOffset();
2337  IsSplittable = I->isSplittable();
2338  IsSplit =
2339  BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2340  LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2341  LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2342  LLVM_DEBUG(dbgs() << "\n");
2343 
2344  // Compute the intersecting offset range.
2345  assert(BeginOffset < NewAllocaEndOffset);
2346  assert(EndOffset > NewAllocaBeginOffset);
2347  NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2348  NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2349 
2350  SliceSize = NewEndOffset - NewBeginOffset;
2351 
2352  OldUse = I->getUse();
2353  OldPtr = cast<Instruction>(OldUse->get());
2354 
2355  Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2356  IRB.SetInsertPoint(OldUserI);
2357  IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2358  IRB.getInserter().SetNamePrefix(
2359  Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
2360 
2361  CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2362  if (VecTy || IntTy)
2363  assert(CanSROA);
2364  return CanSROA;
2365  }
2366 
2367 private:
2368  // Make sure the other visit overloads are visible.
2369  using Base::visit;
2370 
2371  // Every instruction which can end up as a user must have a rewrite rule.
2372  bool visitInstruction(Instruction &I) {
2373  LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
2374  llvm_unreachable("No rewrite rule for this instruction!");
2375  }
2376 
2377  Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
2378  // Note that the offset computation can use BeginOffset or NewBeginOffset
2379  // interchangeably for unsplit slices.
2380  assert(IsSplit || BeginOffset == NewBeginOffset);
2381  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2382 
2383 #ifndef NDEBUG
2384  StringRef OldName = OldPtr->getName();
2385  // Skip through the last '.sroa.' component of the name.
2386  size_t LastSROAPrefix = OldName.rfind(".sroa.");
2387  if (LastSROAPrefix != StringRef::npos) {
2388  OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
2389  // Look for an SROA slice index.
2390  size_t IndexEnd = OldName.find_first_not_of("0123456789");
2391  if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
2392  // Strip the index and look for the offset.
2393  OldName = OldName.substr(IndexEnd + 1);
2394  size_t OffsetEnd = OldName.find_first_not_of("0123456789");
2395  if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
2396  // Strip the offset.
2397  OldName = OldName.substr(OffsetEnd + 1);
2398  }
2399  }
2400  // Strip any SROA suffixes as well.
2401  OldName = OldName.substr(0, OldName.find(".sroa_"));
2402 #endif
2403 
2404  return getAdjustedPtr(IRB, DL, &NewAI,
2405  APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
2406  PointerTy,
2407 #ifndef NDEBUG
2408  Twine(OldName) + "."
2409 #else
2410  Twine()
2411 #endif
2412  );
2413  }
2414 
2415  /// Compute suitable alignment to access this slice of the *new*
2416  /// alloca.
2417  ///
2418  /// The result is the common alignment of the new alloca's own alignment and
2419  /// the slice's byte offset within it.
2420  Align getSliceAlign() {
2421  return commonAlignment(NewAI.getAlign(),
2422  NewBeginOffset - NewAllocaBeginOffset);
2423  }
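// [Editorial note -- not part of the original SROA.cpp source.] For example, a
// slice that starts 4 bytes into a 16-byte-aligned new alloca gets
// commonAlignment(Align(16), 4) == Align(4), i.e. the largest power of two
// dividing both the alloca's alignment and the slice's offset.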
2424 
2425  unsigned getIndex(uint64_t Offset) {
2426  assert(VecTy && "Can only call getIndex when rewriting a vector");
2427  uint64_t RelOffset = Offset - NewAllocaBeginOffset;
2428  assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
2429  uint32_t Index = RelOffset / ElementSize;
2430  assert(Index * ElementSize == RelOffset);
2431  return Index;
2432  }
2433 
2434  void deleteIfTriviallyDead(Value *V) {
2435  Instruction *I = cast<Instruction>(V);
2436  if (isInstructionTriviallyDead(I))
2437  Pass.DeadInsts.push_back(I);
2438  }
2439 
2440  Value *rewriteVectorizedLoadInst(LoadInst &LI) {
2441  unsigned BeginIndex = getIndex(NewBeginOffset);
2442  unsigned EndIndex = getIndex(NewEndOffset);
2443  assert(EndIndex > BeginIndex && "Empty vector!");
2444 
2445  LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2446  NewAI.getAlign(), "load");
2447 
2448  Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
2449  LLVMContext::MD_access_group});
2450  return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
2451  }
2452 
2453  Value *rewriteIntegerLoad(LoadInst &LI) {
2454  assert(IntTy && "We cannot insert an integer to the alloca");
2455  assert(!LI.isVolatile());
2456  Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2457  NewAI.getAlign(), "load");
2458  V = convertValue(DL, IRB, V, IntTy);
2459  assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2460  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2461  if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
2462  IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
2463  V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
2464  }
2465  // It is possible that the extracted type is not the load type. This
2466  // happens if there is a load past the end of the alloca, and as
2467  // a consequence the slice is narrower but still a candidate for integer
2468  // lowering. To handle this case, we just zero extend the extracted
2469  // integer.
2470  assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
2471  "Can only handle an extract for an overly wide load");
2472  if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
2473  V = IRB.CreateZExt(V, LI.getType());
2474  return V;
2475  }
2476 
2477  bool visitLoadInst(LoadInst &LI) {
2478  LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
2479  Value *OldOp = LI.getOperand(0);
2480  assert(OldOp == OldPtr);
2481 
2482  AAMDNodes AATags = LI.getAAMetadata();
2483 
2484  unsigned AS = LI.getPointerAddressSpace();
2485 
2486  Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
2487  : LI.getType();
2488  const bool IsLoadPastEnd =
2489  DL.getTypeStoreSize(TargetTy).getFixedSize() > SliceSize;
2490  bool IsPtrAdjusted = false;
2491  Value *V;
2492  if (VecTy) {
2493  V = rewriteVectorizedLoadInst(LI);
2494  } else if (IntTy && LI.getType()->isIntegerTy()) {
2495  V = rewriteIntegerLoad(LI);
2496  } else if (NewBeginOffset == NewAllocaBeginOffset &&
2497  NewEndOffset == NewAllocaEndOffset &&
2498  (canConvertValue(DL, NewAllocaTy, TargetTy) ||
2499  (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
2500  TargetTy->isIntegerTy()))) {
2501  LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2502  NewAI.getAlign(), LI.isVolatile(),
2503  LI.getName());
2504  if (AATags)
2505  NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2506  if (LI.isVolatile())
2507  NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2508  if (NewLI->isAtomic())
2509  NewLI->setAlignment(LI.getAlign());
2510 
2511  // Any !nonnull metadata or !range metadata on the old load is also valid
2512  // on the new load. This is true in some cases even when the loads
2513  // are different types, for example by mapping !nonnull metadata to
2514  // !range metadata by modeling the null pointer constant converted to the
2515  // integer type.
2516  // FIXME: Add support for range metadata here. Currently the utilities
2517  // for this don't propagate range metadata in trivial cases from one
2518  // integer load to another, don't handle non-addrspace-0 null pointers
2519  // correctly, and don't have any support for mapping ranges as the
2520  // integer type becomes wider or narrower.
2521  if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
2522  copyNonnullMetadata(LI, N, *NewLI);
2523 
2524  // Try to preserve nonnull metadata
2525  V = NewLI;
2526 
2527  // If this is an integer load past the end of the slice (which means the
2528  // bytes outside the slice are undef or this load is dead) just forcibly
2529  // fix the integer size with correct handling of endianness.
2530  if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
2531  if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
2532  if (AITy->getBitWidth() < TITy->getBitWidth()) {
2533  V = IRB.CreateZExt(V, TITy, "load.ext");
2534  if (DL.isBigEndian())
2535  V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
2536  "endian_shift");
2537  }
2538  } else {
2539  Type *LTy = TargetTy->getPointerTo(AS);
2540  LoadInst *NewLI =
2541  IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
2542  getSliceAlign(), LI.isVolatile(), LI.getName());
2543  if (AATags)
2544  NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2545  if (LI.isVolatile())
2546  NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2547  NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
2548  LLVMContext::MD_access_group});
2549 
2550  V = NewLI;
2551  IsPtrAdjusted = true;
2552  }
2553  V = convertValue(DL, IRB, V, TargetTy);
2554 
2555  if (IsSplit) {
2556  assert(!LI.isVolatile());
2557  assert(LI.getType()->isIntegerTy() &&
2558  "Only integer type loads and stores are split");
2559  assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedSize() &&
2560  "Split load isn't smaller than original load");
2561  assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
2562  "Non-byte-multiple bit width");
2563  // Move the insertion point just past the load so that we can refer to it.
2564  IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
2565  // Create a placeholder value with the same type as LI to use as the
2566  // basis for the new value. This allows us to replace the uses of LI with
2567  // the computed value, and then replace the placeholder with LI, leaving
2568  // LI only used for this computation.
2569  Value *Placeholder = new LoadInst(
2570  LI.getType(), PoisonValue::get(LI.getType()->getPointerTo(AS)), "",
2571  false, Align(1));
2572  V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
2573  "insert");
2574  LI.replaceAllUsesWith(V);
2575  Placeholder->replaceAllUsesWith(&LI);
2576  Placeholder->deleteValue();
2577  } else {
2578  LI.replaceAllUsesWith(V);
2579  }
2580 
2581  Pass.DeadInsts.push_back(&LI);
2582  deleteIfTriviallyDead(OldOp);
2583  LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
2584  return !LI.isVolatile() && !IsPtrAdjusted;
2585  }
2586 
2587  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
2588  AAMDNodes AATags) {
2589  if (V->getType() != VecTy) {
2590  unsigned BeginIndex = getIndex(NewBeginOffset);
2591  unsigned EndIndex = getIndex(NewEndOffset);
2592  assert(EndIndex > BeginIndex && "Empty vector!");
2593  unsigned NumElements = EndIndex - BeginIndex;
2594  assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
2595  "Too many elements!");
2596  Type *SliceTy = (NumElements == 1)
2597  ? ElementTy
2598  : FixedVectorType::get(ElementTy, NumElements);
2599  if (V->getType() != SliceTy)
2600  V = convertValue(DL, IRB, V, SliceTy);
2601 
2602  // Mix in the existing elements.
2603  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2604  NewAI.getAlign(), "load");
2605  V = insertVector(IRB, Old, V, BeginIndex, "vec");
2606  }
2607  StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
2608  Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2609  LLVMContext::MD_access_group});
2610  if (AATags)
2611  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2612  Pass.DeadInsts.push_back(&SI);
2613 
2614  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
2615  return true;
2616  }
2617 
2618  bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
2619  assert(IntTy && "We cannot extract an integer from the alloca");
2620  assert(!SI.isVolatile());
2621  if (DL.getTypeSizeInBits(V->getType()).getFixedSize() !=
2622  IntTy->getBitWidth()) {
2623  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2624  NewAI.getAlign(), "oldload");
2625  Old = convertValue(DL, IRB, Old, IntTy);
2626  assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2627  uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
2628  V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
2629  }
2630  V = convertValue(DL, IRB, V, NewAllocaTy);
2631  StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
2632  Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2633  LLVMContext::MD_access_group});
2634  if (AATags)
2635  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2636  Pass.DeadInsts.push_back(&SI);
2637  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
2638  return true;
2639  }
2640 
2641  bool visitStoreInst(StoreInst &SI) {
2642  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
2643  Value *OldOp = SI.getOperand(1);
2644  assert(OldOp == OldPtr);
2645 
2646  AAMDNodes AATags = SI.getAAMetadata();
2647  Value *V = SI.getValueOperand();
2648 
2649  // Strip all inbounds GEPs and pointer casts to try to dig out any root
2650  // alloca that should be re-examined after promoting this alloca.
2651  if (V->getType()->isPointerTy())
2652  if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
2653  Pass.PostPromotionWorklist.insert(AI);
2654 
2655  if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedSize()) {
2656  assert(!SI.isVolatile());
2657  assert(V->getType()->isIntegerTy() &&
2658  "Only integer type loads and stores are split");
2659  assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
2660  "Non-byte-multiple bit width");
2661  IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
2662  V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
2663  "extract");
2664  }
2665 
2666  if (VecTy)
2667  return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
2668  if (IntTy && V->getType()->isIntegerTy())
2669  return rewriteIntegerStore(V, SI, AATags);
2670 
2671  const bool IsStorePastEnd =
2672  DL.getTypeStoreSize(V->getType()).getFixedSize() > SliceSize;
2673  StoreInst *NewSI;
2674  if (NewBeginOffset == NewAllocaBeginOffset &&
2675  NewEndOffset == NewAllocaEndOffset &&
2676  (canConvertValue(DL, V->getType(), NewAllocaTy) ||
2677  (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
2678  V->getType()->isIntegerTy()))) {
2679  // If this is an integer store past the end of the slice (and thus the bytes
2680  // past that point are irrelevant or this is unreachable), truncate the
2681  // value prior to storing.
2682  if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
2683  if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
2684  if (VITy->getBitWidth() > AITy->getBitWidth()) {
2685  if (DL.isBigEndian())
2686  V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
2687  "endian_shift");
2688  V = IRB.CreateTrunc(V, AITy, "load.trunc");
2689  }
2690 
2691  V = convertValue(DL, IRB, V, NewAllocaTy);
2692  NewSI =
2693  IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
2694  } else {
2695  unsigned AS = SI.getPointerAddressSpace();
2696  Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
2697  NewSI =
2698  IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
2699  }
2700  NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2701  LLVMContext::MD_access_group});
2702  if (AATags)
2703  NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2704  if (SI.isVolatile())
2705  NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
2706  if (NewSI->isAtomic())
2707  NewSI->setAlignment(SI.getAlign());
2708  Pass.DeadInsts.push_back(&SI);
2709  deleteIfTriviallyDead(OldOp);
2710 
2711  LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
2712  return NewSI->getPointerOperand() == &NewAI &&
2713  NewSI->getValueOperand()->getType() == NewAllocaTy &&
2714  !SI.isVolatile();
2715  }
2716 
2717  /// Compute an integer value from splatting an i8 across the given
2718  /// number of bytes.
2719  ///
2720  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
2721  /// call this routine.
2722  /// FIXME: Heed the advice above.
2723  ///
2724  /// \param V The i8 value to splat.
2725  /// \param Size The number of bytes in the output (assuming i8 is one byte)
2726  Value *getIntegerSplat(Value *V, unsigned Size) {
2727  assert(Size > 0 && "Expected a positive number of bytes.");
2728  IntegerType *VTy = cast<IntegerType>(V->getType());
2729  assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
2730  if (Size == 1)
2731  return V;
2732 
2733  Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
2734  V = IRB.CreateMul(
2735  IRB.CreateZExt(V, SplatIntTy, "zext"),
2736  ConstantExpr::getUDiv(
2737  Constant::getAllOnesValue(SplatIntTy),
2738  ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
2739  SplatIntTy)),
2740  "isplat");
2741  return V;
2742  }
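// [Editorial note -- not part of the original SROA.cpp source.] Worked example
// of the splat trick above: for Size = 4 the byte is zero-extended to i32 and
// multiplied by 0xFFFFFFFF / 0xFF = 0x01010101, so a memset value of 0xAB
// becomes 0xABABABAB; the same multiply generalizes to any byte count.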
2743 
2744  /// Compute a vector splat for a given element value.
2745  Value *getVectorSplat(Value *V, unsigned NumElements) {
2746  V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
2747  LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
2748  return V;
2749  }
2750 
2751  bool visitMemSetInst(MemSetInst &II) {
2752  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
2753  assert(II.getRawDest() == OldPtr);
2754 
2755  AAMDNodes AATags = II.getAAMetadata();
2756 
2757  // If the memset has a variable size, it cannot be split, just adjust the
2758  // pointer to the new alloca.
2759  if (!isa<ConstantInt>(II.getLength())) {
2760  assert(!IsSplit);
2761  assert(NewBeginOffset == BeginOffset);
2762  II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
2763  II.setDestAlignment(getSliceAlign());
2764 
2765  deleteIfTriviallyDead(OldPtr);
2766  return false;
2767  }
2768 
2769  // Record this instruction for deletion.
2770  Pass.DeadInsts.push_back(&II);
2771 
2772  Type *AllocaTy = NewAI.getAllocatedType();
2773  Type *ScalarTy = AllocaTy->getScalarType();
2774 
2775  const bool CanContinue = [&]() {
2776  if (VecTy || IntTy)
2777  return true;
2778  if (BeginOffset > NewAllocaBeginOffset ||
2779  EndOffset < NewAllocaEndOffset)
2780  return false;
2781  // Length must be in range for FixedVectorType.
2782  auto *C = cast<ConstantInt>(II.getLength());
2783  const uint64_t Len = C->getLimitedValue();
2784  if (Len > std::numeric_limits<unsigned>::max())
2785  return false;
2786  auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
2787  auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
2788  return canConvertValue(DL, SrcTy, AllocaTy) &&
2789  DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
2790  }();
2791 
2792  // If this doesn't map cleanly onto the alloca type, and that type isn't
2793  // a single value type, just emit a memset.
2794  if (!CanContinue) {
2795  Type *SizeTy = II.getLength()->getType();
2796  Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
2797  CallInst *New = IRB.CreateMemSet(
2798  getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
2799  MaybeAlign(getSliceAlign()), II.isVolatile());
2800  if (AATags)
2801  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2802  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
2803  return false;
2804  }
2805 
2806  // If we can represent this as a simple value, we have to build the actual
2807  // value to store, which requires expanding the byte present in memset to
2808  // a sensible representation for the alloca type. This is essentially
2809  // splatting the byte to a sufficiently wide integer, splatting it across
2810  // any desired vector width, and bitcasting to the final type.
2811  Value *V;
2812 
2813  if (VecTy) {
2814  // If this is a memset of a vectorized alloca, insert it.
2815  assert(ElementTy == ScalarTy);
2816 
2817  unsigned BeginIndex = getIndex(NewBeginOffset);
2818  unsigned EndIndex = getIndex(NewEndOffset);
2819  assert(EndIndex > BeginIndex && "Empty vector!");
2820  unsigned NumElements = EndIndex - BeginIndex;
2821  assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
2822  "Too many elements!");
2823 
2824  Value *Splat = getIntegerSplat(
2825  II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
2826  Splat = convertValue(DL, IRB, Splat, ElementTy);
2827  if (NumElements > 1)
2828  Splat = getVectorSplat(Splat, NumElements);
2829 
2830  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2831  NewAI.getAlign(), "oldload");
2832  V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
2833  } else if (IntTy) {
2834  // If this is a memset on an alloca where we can widen stores, insert the
2835  // set integer.
2836  assert(!II.isVolatile());
2837 
2838  uint64_t Size = NewEndOffset - NewBeginOffset;
2839  V = getIntegerSplat(II.getValue(), Size);
2840 
2841  if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
2842  EndOffset != NewAllocaEndOffset)) {
2843  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2844  NewAI.getAlign(), "oldload");
2845  Old = convertValue(DL, IRB, Old, IntTy);
2846  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2847  V = insertInteger(DL, IRB, Old, V, Offset, "insert");
2848  } else {
2849  assert(V->getType() == IntTy &&
2850  "Wrong type for an alloca wide integer!");
2851  }
2852  V = convertValue(DL, IRB, V, AllocaTy);
2853  } else {
2854  // Established these invariants above.
2855  assert(NewBeginOffset == NewAllocaBeginOffset);
2856  assert(NewEndOffset == NewAllocaEndOffset);
2857 
2858  V = getIntegerSplat(II.getValue(),
2859  DL.getTypeSizeInBits(ScalarTy).getFixedSize() / 8);
2860  if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
2861  V = getVectorSplat(
2862  V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
2863 
2864  V = convertValue(DL, IRB, V, AllocaTy);
2865  }
2866 
2867  StoreInst *New =
2868  IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
2869  New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
2870  LLVMContext::MD_access_group});
2871  if (AATags)
2872  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2873  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
2874  return !II.isVolatile();
2875  }
2876 
2877  bool visitMemTransferInst(MemTransferInst &II) {
2878  // Rewriting of memory transfer instructions can be a bit tricky. We break
2879  // them into two categories: split intrinsics and unsplit intrinsics.
2880 
2881  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
2882 
2883  AAMDNodes AATags = II.getAAMetadata();
2884 
2885  bool IsDest = &II.getRawDestUse() == OldUse;
2886  assert((IsDest && II.getRawDest() == OldPtr) ||
2887  (!IsDest && II.getRawSource() == OldPtr));
2888 
2889  MaybeAlign SliceAlign = getSliceAlign();
2890 
2891  // For unsplit intrinsics, we simply modify the source and destination
2892  // pointers in place. This isn't just an optimization, it is a matter of
2893  // correctness. With unsplit intrinsics we may be dealing with transfers
2894  // within a single alloca before SROA ran, or with transfers that have
2895  // a variable length. We may also be dealing with memmove instead of
2896  // memcpy, and so simply updating the pointers in place is all that is
2897  // necessary for us to update both the source and dest of a single call.
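  // As a rough illustration (hypothetical IR; the %names are invented for the
  // example), a variable-length transfer whose destination is the old slice
  // pointer only has that one operand swapped for the new slice pointer, with
  // its alignment updated to match:
  //   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %old.ptr, i8* %src, i64 %n, i1 false)
  //     -->
  //   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %new.sroa.ptr, i8* %src, i64 %n, i1 false)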
2898  if (!IsSplittable) {
2899  Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
2900  if (IsDest) {
2901  II.setDest(AdjustedPtr);
2902  II.setDestAlignment(SliceAlign);
2903  }
2904  else {
2905  II.setSource(AdjustedPtr);
2906  II.setSourceAlignment(SliceAlign);
2907  }
2908 
2909  LLVM_DEBUG(dbgs() << " to: " << II << "\n");
2910  deleteIfTriviallyDead(OldPtr);
2911  return false;
2912  }
2913  // For split transfer intrinsics we have an incredibly useful assurance:
2914  // the source and destination do not reside within the same alloca, and at
2915  // least one of them does not escape. This means that we can replace
2916  // memmove with memcpy, and we don't need to worry about all manner of
2917  // downsides to splitting and transforming the operations.
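  // As a sketch (hypothetical IR; %src.cast and %dst.cast are invented names),
  // an 8-byte transfer into a slice that has been rewritten to an i64 alloca
  // can be lowered below to a plain load/store pair instead of a call:
  //   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 8, i1 false)
  //     -->
  //   %copyload = load i64, i64* %src.cast, align 8
  //   store i64 %copyload, i64* %dst.cast, align 8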
2918 
2919  // If this doesn't map cleanly onto the alloca type, and that type isn't
2920  // a single value type, just emit a memcpy.
2921  bool EmitMemCpy =
2922  !VecTy && !IntTy &&
2923  (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
2924  SliceSize !=
2925  DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedSize() ||
2926  !NewAI.getAllocatedType()->isSingleValueType());
2927 
2928  // If we're just going to emit a memcpy, the alloca hasn't changed, and the
2929  // size hasn't been shrunk based on analysis of the viable range, then this
2930  // is a no-op.
2931  if (EmitMemCpy && &OldAI == &NewAI) {
2932  // Ensure the start lines up.
2933  assert(NewBeginOffset == BeginOffset);
2934 
2935  // Rewrite the size as needed.
2936  if (NewEndOffset != EndOffset)
2937  II.setLength(ConstantInt::get(II.getLength()->getType(),
2938  NewEndOffset - NewBeginOffset));
2939  return false;
2940  }
2941  // Record this instruction for deletion.
2942  Pass.DeadInsts.push_back(&II);
2943 
2944  // Strip all inbounds GEPs and pointer casts to try to dig out any root
2945  // alloca that should be re-examined after rewriting this instruction.
2946  Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
2947  if (AllocaInst *AI =
2948  dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
2949  assert(AI != &OldAI && AI != &NewAI &&
2950  "Splittable transfers cannot reach the same alloca on both ends.");
2951  Pass.Worklist.insert(AI);
2952  }
2953 
2954  Type *OtherPtrTy = OtherPtr->getType();
2955  unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
2956 
2957  // Compute the relative offset for the other pointer within the transfer.
2958  unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
2959  APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
2960  Align OtherAlign =
2961  (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
2962  OtherAlign =
2963  commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
2964 
2965  if (EmitMemCpy) {
2966  // Compute the other pointer, folding as much as possible to produce
2967  // a single, simple GEP in most cases.
2968  OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
2969  OtherPtr->getName() + ".");
2970 
2971  Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
2972  Type *SizeTy = II.getLength()->getType();
2973  Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
2974 
2975  Value *DestPtr, *SrcPtr;
2976  MaybeAlign DestAlign, SrcAlign;
2977  // Note: IsDest is true iff we're copying into the new alloca slice
2978  if (IsDest) {
2979  DestPtr = OurPtr;
2980  DestAlign = SliceAlign;
2981  SrcPtr = OtherPtr;
2982  SrcAlign = OtherAlign;
2983  } else {
2984  DestPtr = OtherPtr;
2985  DestAlign = OtherAlign;
2986  SrcPtr = OurPtr;
2987  SrcAlign = SliceAlign;
2988  }
2989  CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
2990  Size, II.isVolatile());
2991  if (AATags)
2992  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2993  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
2994  return false;
2995  }
2996 
2997  bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
2998  NewEndOffset == NewAllocaEndOffset;
2999  uint64_t Size = NewEndOffset - NewBeginOffset;
3000  unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3001  unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3002  unsigned NumElements = EndIndex - BeginIndex;
3003  IntegerType *SubIntTy =
3004  IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3005 
3006  // Reset the other pointer type to match the register type we're going to
3007  // use, but using the address space of the original other pointer.
3008  Type *OtherTy;
3009  if (VecTy && !IsWholeAlloca) {
3010  if (NumElements == 1)
3011  OtherTy = VecTy->getElementType();
3012  else
3013  OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3014  } else if (IntTy && !IsWholeAlloca) {
3015  OtherTy = SubIntTy;
3016  } else {
3017  OtherTy = NewAllocaTy;
3018  }
3019  OtherPtrTy = OtherTy->getPointerTo(OtherAS);
3020 
3021  Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3022  OtherPtr->getName() + ".");
3023  MaybeAlign SrcAlign = OtherAlign;
3024  Value *DstPtr = &NewAI;
3025  MaybeAlign DstAlign = SliceAlign;
3026  if (!IsDest) {
3027  std::swap(SrcPtr, DstPtr);
3028  std::swap(SrcAlign, DstAlign);
3029  }
3030 
3031  Value *Src;
3032  if (VecTy && !IsWholeAlloca && !IsDest) {
3033  Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3034  NewAI.getAlign(), "load");
3035  Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3036  } else if (IntTy && !IsWholeAlloca && !IsDest) {
3037  Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3038  NewAI.getAlign(), "load");
3039  Src = convertValue(DL, IRB, Src, IntTy);
3040  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3041  Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3042  } else {
3043  LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3044  II.isVolatile(), "copyload");
3045  Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3046  LLVMContext::MD_access_group});
3047  if (AATags)
3048  Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3049  Src = Load;
3050  }
3051 
3052  if (VecTy && !IsWholeAlloca && IsDest) {
3053  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3054  NewAI.getAlign(), "oldload");
3055  Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3056  } else if (IntTy && !IsWholeAlloca && IsDest) {
3057  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3058  NewAI.getAlign(), "oldload");
3059  Old = convertValue(DL, IRB, Old, IntTy);
3060  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3061  Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3062  Src = convertValue(DL, IRB, Src, NewAllocaTy);
3063  }
3064 
3065  StoreInst *Store = cast<StoreInst>(
3066  IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3067  Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3068  LLVMContext::MD_access_group});
3069  if (AATags)
3070  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3071  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3072  return !II.isVolatile();
3073  }
3074 
3075  bool visitIntrinsicInst(IntrinsicInst &II) {
3076  assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3077  "Unexpected intrinsic!");
3078  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3079 
3080  // Record this instruction for deletion.
3081  Pass.DeadInsts.push_back(&II);
3082 
3083  if (II.isDroppable()) {
3084  assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3085  // TODO For now we forget assumed information, this can be improved.
3086  OldPtr->dropDroppableUsesIn(II);
3087  return true;
3088  }
3089 
3090  assert(II.getArgOperand(1) == OldPtr);
3091  // Lifetime intrinsics are only promotable if they cover the whole alloca.
3092  // Therefore, we drop lifetime intrinsics which don't cover the whole
3093  // alloca.
3094  // (In theory, intrinsics which partially cover an alloca could be
3095  // promoted, but PromoteMemToReg doesn't handle that case.)
3096  // FIXME: Check whether the alloca is promotable before dropping the
3097  // lifetime intrinsics?
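  // For example (hypothetical IR for an 8-byte slice): a marker covering the
  // whole new alloca is recreated on the new slice pointer below, whereas a
  // marker covering only part of it is simply deleted.
  //   call void @llvm.lifetime.start.p0i8(i64 8, i8* %whole)   ; rewritten
  //   call void @llvm.lifetime.start.p0i8(i64 4, i8* %partial) ; dropped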
3098  if (NewBeginOffset != NewAllocaBeginOffset ||
3099  NewEndOffset != NewAllocaEndOffset)
3100  return true;
3101 
3102  ConstantInt *Size =
3103  ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
3104  NewEndOffset - NewBeginOffset);
3105  // Lifetime intrinsics always expect an i8* so directly get such a pointer
3106  // for the new alloca slice.
3107  Type *PointerTy = IRB.getInt8PtrTy(OldPtr->getType()->getPointerAddressSpace());
3108  Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3109  Value *New;
3110  if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3111  New = IRB.CreateLifetimeStart(Ptr, Size);
3112  else
3113  New = IRB.CreateLifetimeEnd(Ptr, Size);
3114 
3115  (void)New;
3116  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3117 
3118  return true;
3119  }
3120 
3121  void fixLoadStoreAlign(Instruction &Root) {
3122  // This algorithm implements the same visitor loop as
3123  // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3124  // or store found.
3125  SmallPtrSet<Instruction *, 4> Visited;
3126  SmallVector<Instruction *, 4> Uses;
3127  Visited.insert(&Root);
3128  Uses.push_back(&Root);
3129  do {
3130  Instruction *I = Uses.pop_back_val();
3131 
3132  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3133  LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3134  continue;
3135  }
3136  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3137  SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3138  continue;
3139  }
3140 
3141  assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3142  isa<PHINode>(I) || isa<SelectInst>(I) ||
3143  isa<GetElementPtrInst>(I));
3144  for (User *U : I->users())
3145  if (Visited.insert(cast<Instruction>(U)).second)
3146  Uses.push_back(cast<Instruction>(U));
3147  } while (!Uses.empty());
3148  }
3149 
3150  bool visitPHINode(PHINode &PN) {
3151  LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3152  assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3153  assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3154 
3155  // We would like to compute a new pointer in only one place, but have it be
3156  // as local as possible to the PHI. To do that, we re-use the location of
3157  // the old pointer, which necessarily must be in the right position to
3158  // dominate the PHI.
3159  IRBuilderBase::InsertPointGuard Guard(IRB);
3160  if (isa<PHINode>(OldPtr))
3161  IRB.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
3162  else
3163  IRB.SetInsertPoint(OldPtr);
3164  IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3165 
3166  Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3167  // Replace the operands which were using the old pointer.
3168  std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3169 
3170  LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3171  deleteIfTriviallyDead(OldPtr);
3172 
3173  // Fix the alignment of any loads or stores using this PHI node.
3174  fixLoadStoreAlign(PN);
3175 
3176  // PHIs can't be promoted on their own, but often can be speculated. We
3177  // check the speculation outside of the rewriter so that we see the
3178  // fully-rewritten alloca.
3179  PHIUsers.insert(&PN);
3180  return true;
3181  }
3182 
3183  bool visitSelectInst(SelectInst &SI) {
3184  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3185  assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3186  "Pointer isn't an operand!");
3187  assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3188  assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3189 
3190  Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3191  // Replace the operands which were using the old pointer.
3192  if (SI.getOperand(1) == OldPtr)
3193  SI.setOperand(1, NewPtr);
3194  if (SI.getOperand(2) == OldPtr)
3195  SI.setOperand(2, NewPtr);
3196 
3197  LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3198  deleteIfTriviallyDead(OldPtr);
3199 
3200  // Fix the alignment of any loads or stores using this select.
3201  fixLoadStoreAlign(SI);
3202 
3203  // Selects can't be promoted on their own, but often can be speculated. We
3204  // check the speculation outside of the rewriter so that we see the
3205  // fully-rewritten alloca.
3206  SelectUsers.insert(&SI);
3207  return true;
3208  }
3209 };
3210 
3211 namespace {
3212 
3213 /// Visitor to rewrite aggregate loads and stores as scalar.
3214 ///
3215 /// This pass aggressively rewrites all aggregate loads and stores on
3216 /// a particular pointer (or any pointer derived from it which we can identify)
3217 /// with scalar loads and stores.
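/// For example (a sketch; the %names are invented), a first-class aggregate
/// load such as
///   %pair = load { i32, float }, { i32, float }* %p
/// is rewritten into per-element operations along the lines of
///   %pair.fca.0.gep = getelementptr inbounds { i32, float }, { i32, float }* %p, i32 0, i32 0
///   %pair.fca.0.load = load i32, i32* %pair.fca.0.gep
///   %pair.fca.0.insert = insertvalue { i32, float } poison, i32 %pair.fca.0.load, 0
/// and similarly for element 1, with the final insertvalue replacing %pair.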
3218 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3219  // Befriend the base class so it can delegate to private visit methods.
3220  friend class InstVisitor<AggLoadStoreRewriter, bool>;
3221 
3222  /// Queue of pointer uses to analyze and potentially rewrite.
3223  SmallVector<Use *, 8> Queue;
3224 
3225  /// Set to prevent us from cycling with phi nodes and loops.
3226  SmallPtrSet<User *, 8> Visited;
3227 
3228  /// The current pointer use being rewritten. This is used to dig up the used
3229  /// value (as opposed to the user).
3230  Use *U = nullptr;
3231 
3232  /// Used to calculate offsets, and hence alignment, of subobjects.
3233  const DataLayout &DL;
3234 
3235  IRBuilderTy &IRB;
3236 
3237 public:
3238  AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
3239  : DL(DL), IRB(IRB) {}
3240 
3241  /// Rewrite loads and stores through a pointer and all pointers derived from
3242  /// it.
3243  bool rewrite(Instruction &I) {
3244  LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3245  enqueueUsers(I);
3246  bool Changed = false;
3247  while (!Queue.empty()) {
3248  U = Queue.pop_back_val();
3249  Changed |= visit(cast<Instruction>(U->getUser()));
3250  }
3251  return Changed;
3252  }
3253 
3254 private:
3255  /// Enqueue all the users of the given instruction for further processing.
3256  /// This uses a set to de-duplicate users.
3257  void enqueueUsers(Instruction &I) {
3258  for (Use &U : I.uses())
3259  if (Visited.insert(U.getUser()).second)
3260  Queue.push_back(&U);
3261  }
3262 
3263  // Conservative default is to not rewrite anything.
3264  bool visitInstruction(Instruction &I) { return false; }
3265 
3266  /// Generic recursive split emission class.
3267  template <typename Derived> class OpSplitter {
3268  protected:
3269  /// The builder used to form new instructions.
3270  IRBuilderTy &IRB;
3271 
3272  /// The indices to be used with insertvalue or extractvalue to select the
3273  /// appropriate value within the aggregate.
3274  SmallVector<unsigned, 4> Indices;
3275 
3276  /// The indices to a GEP instruction which will move Ptr to the correct slot
3277  /// within the aggregate.
3278  SmallVector<Value *, 4> GEPIndices;
3279 
3280  /// The base pointer of the original op, used as a base for GEPing the
3281  /// split operations.
3282  Value *Ptr;
3283 
3284  /// The base pointee type being GEPed into.
3285  Type *BaseTy;
3286 
3287  /// Known alignment of the base pointer.
3288  Align BaseAlign;
3289 
3290  /// Used to calculate the offset of each component so we can correctly
3291  /// deduce alignments.
3292  const DataLayout &DL;
3293 
3294  /// Initialize the splitter with an insertion point, Ptr and start with a
3295  /// single zero GEP index.
3296  OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3297  Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
3298  : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
3299  BaseAlign(BaseAlign), DL(DL) {
3300  IRB.SetInsertPoint(InsertionPoint);
3301  }
3302 
3303  public:
3304  /// Generic recursive split emission routine.
3305  ///
3306  /// This method recursively splits an aggregate op (load or store) into
3307  /// scalar or vector ops. It splits recursively until it hits a single value
3308  /// and emits that single value operation via the template argument.
3309  ///
3310  /// The logic of this routine relies on GEPs and insertvalue and
3311  /// extractvalue all operating with the same fundamental index list, merely
3312  /// formatted differently (GEPs need actual values).
3313  ///
3314  /// \param Ty The type being split recursively into smaller ops.
3315  /// \param Agg The aggregate value being built up or stored, depending on
3316  /// whether this is splitting a load or a store respectively.
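    /// For example, splitting an op over the type { i32, [2 x float] } visits
    /// the leaves with index lists {0}, {1, 0} and {1, 1}, emitting one scalar
    /// load or store per leaf.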
3317  void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
3318  if (Ty->isSingleValueType()) {
3319  unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
3320  return static_cast<Derived *>(this)->emitFunc(
3321  Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
3322  }
3323 
3324  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
3325  unsigned OldSize = Indices.size();
3326  (void)OldSize;
3327  for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
3328  ++Idx) {
3329  assert(Indices.size() == OldSize && "Did not return to the old size");
3330  Indices.push_back(Idx);
3331  GEPIndices.push_back(IRB.getInt32(Idx));
3332  emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
3333  GEPIndices.pop_back();
3334  Indices.pop_back();
3335  }
3336  return;
3337  }
3338 
3339  if (StructType *STy = dyn_cast<StructType>(Ty)) {
3340  unsigned OldSize = Indices.size();
3341  (void)OldSize;
3342  for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
3343  ++Idx) {
3344  assert(Indices.size() == OldSize && "Did not return to the old size");
3345  Indices.push_back(Idx);
3346  GEPIndices.push_back(IRB.getInt32(Idx));
3347  emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
3348  GEPIndices.pop_back();
3349  Indices.pop_back();
3350  }
3351  return;
3352  }
3353 
3354  llvm_unreachable("Only arrays and structs are aggregate loadable types");
3355  }
3356  };
3357 
3358  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
3359  AAMDNodes AATags;
3360 
3361  LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3362  AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
3363  IRBuilderTy &IRB)
3364  : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
3365  IRB),
3366  AATags(AATags) {}
3367 
3368  /// Emit a leaf load of a single value. This is called at the leaves of the
3369  /// recursive emission to actually load values.
3370  void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3371  assert(Ty->isSingleValueType());
3372  // Load the single value and insert it using the indices.
3373  Value *GEP =
3374  IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3375  LoadInst *Load =
3376  IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
3377 
3378  APInt Offset(
3379  DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3380  if (AATags &&
3381  GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
3382  Load->setAAMetadata(AATags.shift(Offset.getZExtValue()));
3383 
3384  Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
3385  LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
3386  }
3387  };
3388 
3389  bool visitLoadInst(LoadInst &LI) {
3390  assert(LI.getPointerOperand() == *U);
3391  if (!LI.isSimple() || LI.getType()->isSingleValueType())
3392  return false;
3393 
3394  // We have an aggregate being loaded, split it apart.
3395  LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3396  LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
3397  getAdjustedAlignment(&LI, 0), DL, IRB);
3398  Value *V = PoisonValue::get(LI.getType());
3399  Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
3400  Visited.erase(&LI);
3401  LI.replaceAllUsesWith(V);
3402  LI.eraseFromParent();
3403  return true;
3404  }
3405 
3406  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
3407  StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3408  AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
3409  IRBuilderTy &IRB)
3410  : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3411  DL, IRB),
3412  AATags(AATags) {}
3413  AAMDNodes AATags;
3414  /// Emit a leaf store of a single value. This is called at the leaves of the
3415  /// recursive emission to actually produce stores.
3416  void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3417  assert(Ty->isSingleValueType());
3418  // Extract the single value and store it using the indices.
3419  //
3420  // The gep and extractvalue values are factored out of the CreateStore
3421  // call to make the output independent of the argument evaluation order.
3422  Value *ExtractValue =
3423  IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
3424  Value *InBoundsGEP =
3425  IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3426  StoreInst *Store =
3427  IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
3428 
3429  APInt Offset(
3430  DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3431  if (AATags &&
3432  GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
3433  Store->setAAMetadata(AATags.shift(Offset.getZExtValue()));
3434 
3435  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3436  }
3437  };
3438 
3439  bool visitStoreInst(StoreInst &SI) {
3440  if (!SI.isSimple() || SI.getPointerOperand() != *U)
3441  return false;
3442  Value *V = SI.getValueOperand();
3443  if (V->getType()->isSingleValueType())
3444  return false;
3445 
3446  // We have an aggregate being stored, split it apart.
3447  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3448  StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(),
3449  getAdjustedAlignment(&SI, 0), DL, IRB);
3450  Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
3451  Visited.erase(&SI);
3452  SI.eraseFromParent();
3453  return true;
3454  }
3455 
3456  bool visitBitCastInst(BitCastInst &BC) {
3457  enqueueUsers(BC);
3458  return false;
3459  }
3460 
3461  bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
3462  enqueueUsers(ASC);
3463  return false;
3464  }
3465 
3466  // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
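  // For example (hypothetical IR with all-constant indices):
  //   %sel = select i1 %c, i32* %a, i32* %b
  //   %gep = getelementptr inbounds i32, i32* %sel, i64 1
  // becomes
  //   %a.sroa.gep = getelementptr inbounds i32, i32* %a, i64 1
  //   %b.sroa.gep = getelementptr inbounds i32, i32* %b, i64 1
  //   %sel.sroa.sel = select i1 %c, i32* %a.sroa.gep, i32* %b.sroa.gep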
3467  bool foldGEPSelect(GetElementPtrInst &GEPI) {
3468  if (!GEPI.hasAllConstantIndices())
3469  return false;
3470 
3471  SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());
3472 
3473  LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):"
3474  << "\n original: " << *Sel
3475  << "\n " << GEPI);
3476 
3477  IRB.SetInsertPoint(&GEPI);
3478  SmallVector<Value *, 4> Index(GEPI.indices());
3479  bool IsInBounds = GEPI.isInBounds();
3480 
3481  Type *Ty = GEPI.getSourceElementType();
3482  Value *True = Sel->getTrueValue();
3483  Value *NTrue = IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep",
3484  IsInBounds);
3485 
3486  Value *False = Sel->getFalseValue();
3487 
3488  Value *NFalse = IRB.CreateGEP(Ty, False, Index,
3489  False->getName() + ".sroa.gep", IsInBounds);
3490 
3491  Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse,
3492  Sel->getName() + ".sroa.sel");
3493  Visited.erase(&GEPI);
3494  GEPI.replaceAllUsesWith(NSel);
3495  GEPI.eraseFromParent();
3496  Instruction *NSelI = cast<Instruction>(NSel);
3497  Visited.insert(NSelI);
3498  enqueueUsers(*NSelI);
3499 
3500  LLVM_DEBUG(dbgs() << "\n to: " << *NTrue
3501  << "\n " << *NFalse
3502  << "\n " << *NSel << '\n');
3503 
3504  return true;
3505  }
3506 
3507  // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
3508  bool foldGEPPhi(GetElementPtrInst &GEPI) {
3509  if (!GEPI.hasAllConstantIndices())
3510  return false;
3511 
3512  PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
3513  if (GEPI.getParent() != PHI->getParent() ||
3514  llvm::any_of(PHI->incoming_values(), [](Value *In)
3515  { Instruction *I = dyn_cast<Instruction>(In);
3516  return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
3517  succ_empty(I->getParent()) ||
3518  !I->getParent()->isLegalToHoistInto();
3519  }))
3520  return false;
3521 
3522  LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):"
3523  << "\n original: " << *PHI
3524  << "\n " << GEPI
3525  << "\n to: ");
3526 
3527  SmallVector<Value *, 4> Index(GEPI.indices());
3528  bool IsInBounds = GEPI.isInBounds();
3529  IRB.SetInsertPoint(GEPI.getParent()->getFirstNonPHI());
3530  PHINode *NewPN = IRB.CreatePHI(GEPI.getType(), PHI->getNumIncomingValues(),
3531  PHI->getName() + ".sroa.phi");
3532  for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
3533  BasicBlock *B = PHI->getIncomingBlock(I);
3534  Value *NewVal = nullptr;
3535  int Idx = NewPN->getBasicBlockIndex(B);
3536  if (Idx >= 0) {
3537  NewVal = NewPN->getIncomingValue(Idx);
3538  } else {
3539  Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
3540 
3541  IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator()));
3542  Type *Ty = GEPI.getSourceElementType();
3543  NewVal = IRB.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep",
3544  IsInBounds);
3545  }
3546  NewPN->addIncoming(NewVal, B);
3547  }
3548 
3549  Visited.erase(&GEPI);
3550  GEPI.replaceAllUsesWith(NewPN);
3551  GEPI.eraseFromParent();
3552  Visited.insert(NewPN);
3553  enqueueUsers(*NewPN);
3554 
3555  LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
3556  dbgs() << "\n " << *In;
3557  dbgs() << "\n " << *NewPN << '\n');
3558 
3559  return true;
3560  }
3561 
3562  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
3563  if (isa<SelectInst>(GEPI.getPointerOperand()) &&
3564  foldGEPSelect(GEPI))
3565  return true;
3566 
3567  if (isa<PHINode>(GEPI.getPointerOperand()) &&
3568  foldGEPPhi(GEPI))
3569  return true;
3570 
3571  enqueueUsers(GEPI);
3572  return false;
3573  }
3574 
3575  bool visitPHINode(PHINode &PN) {
3576  enqueueUsers(PN);
3577  return false;
3578  }
3579 
3580  bool visitSelectInst(SelectInst &SI) {
3581  enqueueUsers(SI);
3582  return false;
3583  }
3584 };
3585 
3586 } // end anonymous namespace
3587 
3588 /// Strip aggregate type wrapping.
3589 ///
3590 /// This removes no-op aggregate types wrapping an underlying type. It will
3591 /// strip as many layers of types as it can without changing either the type
3592 /// size or the allocated size.
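/// For example, { { i64 } } and [1 x i64] both strip down to a plain i64,
/// while { i32, i32 } is left untouched because dropping the struct would
/// change the allocated size.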
3593 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
3594  if (Ty->isSingleValueType())
3595  return Ty;
3596 
3597  uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedSize();
3598  uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedSize();
3599 
3600  Type *InnerTy;
3601  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
3602  InnerTy = ArrTy->getElementType();
3603  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
3604  const StructLayout *SL = DL.getStructLayout(STy);
3605  unsigned Index = SL->getElementContainingOffset(0);
3606  InnerTy = STy->getElementType(Index);
3607  } else {
3608  return Ty;
3609  }
3610 
3611  if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedSize() ||
3612  TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedSize())
3613  return Ty;
3614 
3615  return stripAggregateTypeWrapping(DL, InnerTy);
3616 }
3617 
3618 /// Try to find a partition of the aggregate type passed in for a given
3619 /// offset and size.
3620 ///
3621 /// This recurses through the aggregate type and tries to compute a subtype
3622 /// based on the offset and size. When the offset and size span a sub-section
3623 /// of an array, it will even compute a new array type for that sub-section,
3624 /// and the same for structs.
3625 ///
3626 /// Note that this routine is very strict and tries to find a partition of the
3627 /// type which produces the *exact* right offset and size. It is not forgiving
3628  /// when the size or offset causes either end of the type-based partition to be off.
3629 /// Also, this is a best-effort routine. It is reasonable to give up and not
3630 /// return a type if necessary.
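/// For example, given { i32, i32, [8 x i8] } under a typical data layout, an
/// (Offset, Size) of (4, 4) yields the second i32 and (8, 8) yields the
/// [8 x i8] tail, while (2, 4) yields nullptr because it straddles elements.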
3631 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
3632  uint64_t Size) {
3633  if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedSize() == Size)
3634  return stripAggregateTypeWrapping(DL, Ty);
3635  if (Offset > DL.getTypeAllocSize(Ty).getFixedSize() ||
3636  (DL.getTypeAllocSize(Ty).getFixedSize() - Offset) < Size)
3637  return nullptr;
3638 
3639  if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
3640  Type *ElementTy;
3641  uint64_t TyNumElements;
3642  if (auto *AT = dyn_cast<ArrayType>(Ty)) {
3643  ElementTy = AT->getElementType();
3644  TyNumElements = AT->getNumElements();
3645  } else {
3646  // FIXME: This isn't right for vectors with non-byte-sized or
3647  // non-power-of-two sized elements.
3648  auto *VT = cast<FixedVectorType>(Ty);
3649  ElementTy = VT->getElementType();
3650  TyNumElements = VT->getNumElements();
3651  }
3652  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
3653  uint64_t NumSkippedElements = Offset / ElementSize;
3654  if (NumSkippedElements >= TyNumElements)
3655  return nullptr;
3656  Offset -= NumSkippedElements * ElementSize;
3657 
3658  // First check if we need to recurse.
3659  if (Offset > 0 || Size < ElementSize) {
3660  // Bail if the partition ends in a different array element.
3661  if ((Offset + Size) > ElementSize)
3662  return nullptr;
3663  // Recurse through the element type trying to peel off offset bytes.
3664  return getTypePartition(DL, ElementTy, Offset, Size);
3665  }
3666  assert(Offset == 0);
3667 
3668  if (Size == ElementSize)
3669  return stripAggregateTypeWrapping(DL, ElementTy);
3670  assert(Size > ElementSize);
3671  uint64_t NumElements = Size / ElementSize;
3672  if (NumElements * ElementSize != Size)
3673  return nullptr;
3674  return ArrayType::get(ElementTy, NumElements);
3675  }
3676 
3677  StructType *STy = dyn_cast<StructType>(Ty);
3678  if (!STy)
3679  return nullptr;
3680 
3681  const StructLayout *SL = DL.getStructLayout(STy);
3682  if (Offset >= SL->getSizeInBytes())
3683  return nullptr;
3684  uint64_t EndOffset = Offset + Size;
3685  if (EndOffset > SL->getSizeInBytes())
3686  return nullptr;
3687 
3688  unsigned Index = SL->getElementContainingOffset(Offset);
3689  Offset -= SL->getElementOffset(Index);
3690 
3691  Type *ElementTy = STy->getElementType(Index);
3692  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
3693  if (Offset >= ElementSize)
3694  return nullptr; // The offset points into alignment padding.
3695 
3696  // See if any partition must be contained by the element.
3697  if (Offset > 0 || Size < ElementSize) {
3698  if ((Offset + Size) > ElementSize)
3699  return nullptr;
3700  return getTypePartition(DL, ElementTy, Offset, Size);
3701  }
3702  assert(Offset == 0);
3703 
3704  if (Size == ElementSize)
3705  return stripAggregateTypeWrapping(DL, ElementTy);
3706 
3707  StructType::element_iterator EI = STy->element_begin() + Index,
3708  EE = STy->element_end();
3709  if (EndOffset < SL->getSizeInBytes()) {
3710  unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
3711  if (Index == EndIndex)
3712  return nullptr; // Within a single element and its padding.
3713 
3714  // Don't try to form "natural" types if the elements don't line up with the
3715  // expected size.
3716  // FIXME: We could potentially recurse down through the last element in the
3717  // sub-struct to find a natural end point.
3718  if (SL->getElementOffset(EndIndex) != EndOffset)
3719  return nullptr;
3720 
3721  assert(Index < EndIndex);
3722  EE = STy->element_begin() + EndIndex;
3723  }
3724 
3725  // Try to build up a sub-structure.
3726  StructType *SubTy =
3727  StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
3728  const StructLayout *SubSL = DL.getStructLayout(SubTy);
3729  if (Size != SubSL->getSizeInBytes())
3730  return nullptr; // The sub-struct doesn't have quite the size needed.
3731 
3732  return SubTy;
3733 }
3734 
3735 /// Pre-split loads and stores to simplify rewriting.
3736 ///
3737 /// We want to break up the splittable load+store pairs as much as
3738 /// possible. This is important to do as a preprocessing step, as once we
3739 /// start rewriting the accesses to partitions of the alloca we lose the
3740 /// necessary information to correctly split apart paired loads and stores
3741 /// which both point into this alloca. The case to consider is something like
3742 /// the following:
3743 ///
3744 /// %a = alloca [12 x i8]
3745 /// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
3746 /// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
3747 /// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
3748 /// %iptr1 = bitcast i8* %gep1 to i64*
3749 /// %iptr2 = bitcast i8* %gep2 to i64*
3750 /// %fptr1 = bitcast i8* %gep1 to float*
3751 /// %fptr2 = bitcast i8* %gep2 to float*
3752 /// %fptr3 = bitcast i8* %gep3 to float*
3753 /// store float 0.0, float* %fptr1
3754 /// store float 1.0, float* %fptr2
3755 /// %v = load i64* %iptr1
3756 /// store i64 %v, i64* %iptr2
3757 /// %f1 = load float* %fptr2
3758 /// %f2 = load float* %fptr3
3759 ///
3760 /// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
3761 /// promote everything so we recover the 2 SSA values that should have been
3762 /// there all along.
3763 ///
3764 /// \returns true if any changes are made.
3765 bool SROAPass::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
3766  LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
3767 
3768  // Track the loads and stores which are candidates for pre-splitting here, in
3769  // the order they first appear during the partition scan. These give stable
3770  // iteration order and a basis for tracking which loads and stores we
3771  // actually split.
3772  SmallVector<LoadInst *, 4> Loads;
3773  SmallVector<StoreInst *, 4> Stores;
3774 
3775  // We need to accumulate the splits required of each load or store where we
3776  // can find them via a direct lookup. This is important to cross-check loads
3777  // and stores against each other. We also track the slice so that we can kill
3778  // all the slices that end up split.
3779  struct SplitOffsets {
3780  Slice *S;
3781  std::vector<uint64_t> Splits;
3782  };
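  // For instance, a slice covering [0, 12) that is crossed by partition
  // boundaries at offsets 4 and 8 ends up with Splits == {4, 8} (offsets
  // relative to the slice's begin); the final boundary is implied by the
  // slice's own size.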
3783  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
3784 
3785  // Track loads out of this alloca which cannot, for any reason, be pre-split.
3786  // This is important as we also cannot pre-split stores of those loads!
3787  // FIXME: This is all pretty gross. It means that we can be more aggressive
3788  // in pre-splitting when the load feeding the store happens to come from
3789  // a separate alloca. Put another way, the effectiveness of SROA would be
3790  // decreased by a frontend which just concatenated all of its local allocas
3791  // into one big flat alloca. But defeating such patterns is exactly the job
3792  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
3793  // change store pre-splitting to actually force pre-splitting of the load
3794  // that feeds it *and all stores*. That makes pre-splitting much harder, but
3795  // maybe it would make it more principled?
3796  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
3797 
3798  LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
3799  for (auto &P : AS.partitions()) {
3800  for (Slice &S : P) {
3801  Instruction *I = cast<Instruction>(S.getUse()->getUser());
3802  if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
3803  // If this is a load we have to track that it can't participate in any
3804  // pre-splitting. If this is a store of a load we have to track that
3805  // that load also can't participate in any pre-splitting.
3806  if (auto *LI = dyn_cast<LoadInst>(I))
3807  UnsplittableLoads.insert(LI);
3808  else if (auto *SI = dyn_cast<StoreInst>(I))
3809  if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
3810  UnsplittableLoads.insert(LI);
3811  continue;
3812  }
3813  assert(P.endOffset() > S.beginOffset() &&
3814  "Empty or backwards partition!");
3815 
3816  // Determine if this is a pre-splittable slice.
3817  if (auto *LI = dyn_cast<LoadInst>(I)) {
3818  assert(!LI->isVolatile() && "Cannot split volatile loads!");
3819 
3820  // The load must be used exclusively to store into other pointers for
3821  // us to be able to arbitrarily pre-split it. The stores must also be
3822  // simple to avoid changing semantics.
3823  auto IsLoadSimplyStored = [](LoadInst *LI) {
3824  for (User *LU : LI->users()) {
3825  auto *SI = dyn_cast<StoreInst>(LU);
3826  if (!SI || !SI->isSimple())
3827  return false;
3828  }
3829  return true;
3830  };
3831  if (!IsLoadSimplyStored(LI)) {
3832  UnsplittableLoads.insert(LI);
3833  continue;
3834  }
3835 
3836  Loads.push_back(LI);
3837  } else if (auto *SI = dyn_cast<StoreInst>(I)) {
3838  if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
3839  // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
3840  continue;
3841  auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
3842  if (!StoredLoad || !StoredLoad->isSimple())
3843  continue;
3844  assert(!SI->isVolatile() && "Cannot split volatile stores!");
3845 
3846  Stores.push_back(SI);
3847  } else {
3848  // Other uses cannot be pre-split.
3849  continue;
3850  }
3851 
3852  // Record the initial split.
3853  LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
3854  auto &Offsets = SplitOffsetsMap[I];
3855  assert(Offsets.Splits.empty() &&
3856  "Should not have splits the first time we see an instruction!");
3857  Offsets.S = &S;
3858  Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
3859  }
3860 
3861  // Now scan the already split slices, and add a split for any of them which
3862  // we're going to pre-split.
3863  for (Slice *S : P.splitSliceTails()) {
3864  auto SplitOffsetsMapI =
3865  SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
3866  if (SplitOffsetsMapI == SplitOffsetsMap.end())
3867  continue;
3868  auto &Offsets = SplitOffsetsMapI->second;
3869 
3870  assert(Offsets.S == S && "Found a mismatched slice!");
3871  assert(!Offsets.Splits.empty() &&
3872  "Cannot have an empty set of splits on the second partition!");
3873  assert(Offsets.Splits.back() ==
3874  P.beginOffset() - Offsets.S->beginOffset() &&
3875  "Previous split does not end where this one begins!");
3876 
3877  // Record each split. The last partition's end isn't needed as the size
3878  // of the slice dictates that.
3879  if (S->endOffset() > P.endOffset())
3880  Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
3881  }
3882  }
3883 
3884  // We may have split loads where some of their stores are split stores. For
3885  // such loads and stores, we can only pre-split them if their splits exactly
3886  // match relative to their starting offset. We have to verify this prior to
3887  // any rewriting.
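  // For instance, a load pre-split at relative offsets {4, 8} can only feed a
  // store that is likewise split at {4, 8}; if the store is instead split at,
  // say, {6}, both instructions are dropped from the candidate lists below.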
3888  llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
3889  // Lookup the load we are storing in our map of split
3890  // offsets.
3891  auto *LI = cast<LoadInst>(SI->getValueOperand());
3892  // If it was completely unsplittable, then we're done,
3893  // and this store can't be pre-split.
3894  if (UnsplittableLoads.count(LI))
3895  return true;
3896 
3897  auto LoadOffsetsI = SplitOffsetsMap.find(LI);
3898  if (LoadOffsetsI == SplitOffsetsMap.end())
3899  return false; // Unrelated loads are definitely safe.
3900  auto &LoadOffsets = LoadOffsetsI->second;
3901 
3902  // Now lookup the store's offsets.
3903  auto &StoreOffsets = SplitOffsetsMap[SI];
3904 
3905  // If the relative offsets of each split in the load and
3906  // store match exactly, then we can split them and we
3907  // don't need to remove them here.
3908  if (LoadOffsets.Splits == StoreOffsets.Splits)
3909  return false;
3910 
3911  LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
3912  << " " << *LI << "\n"
3913  << " " << *SI << "\n");
3914 
3915  // We've found a store and load that we need to split
3916  // with mismatched relative splits. Just give up on them
3917  // and remove both instructions from our list of
3918  // candidates.
3919  UnsplittableLoads.insert(LI);
3920  return true;
3921  });
3922  // Now we have to go *back* through all the stores, because a later store may
3923  // have caused an earlier store's load to become unsplittable and if it is
3924  // unsplittable for the later store, then we can't rely on it being split in
3925  // the earlier store either.
3926  llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
3927  auto *LI = cast<LoadInst>(SI->getValueOperand());
3928  return UnsplittableLoads.count(LI);
3929  });
3930  // Once we've established all the loads that can't be split for some reason,
3931  // filter any that made it into our list out.
3932  llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
3933  return UnsplittableLoads.count(LI);
3934  });
3935 
3936  // If no loads or stores are left, there is no pre-splitting to be done for
3937  // this alloca.
3938  if (Loads.empty() && Stores.empty())
3939  return false;
3940 
3941  // From here on, we can't fail and will be building new accesses, so rig up
3942  // an IR builder.
3943  IRBuilderTy IRB(&AI);
3944 
3945  // Collect the new slices which we will merge into the alloca slices.
3946  SmallVector<Slice, 4> NewSlices;
3947 
3948  // Track any allocas we end up splitting loads and stores for so we iterate
3949  // on them.
3950  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
3951 
3952  // At this point, we have collected all of the loads and stores we can
3953  // pre-split, and the specific splits needed for them. We actually do the
3954  // splitting in a specific order so that we can handle the case where one of
3955  // the loads is the value operand of one of the stores.
3956  //
3957  // First, we rewrite all of the split loads, and just accumulate each split
3958  // load in a parallel structure. We also build the slices for them and append
3959  // them to the alloca slices.
3960  SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
3961  std::vector<LoadInst *> SplitLoads;
3962  const DataLayout &DL = AI.getModule()->getDataLayout();
3963  for (LoadInst *LI : Loads) {
3964  SplitLoads.clear();
3965 
3966  auto &Offsets = SplitOffsetsMap[LI];
3967  unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
3968  assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
3969  "Load must have type size equal to store size");
3970  assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
3971  "Load must be >= slice size");
3972 
3973  uint64_t BaseOffset = Offsets.S->beginOffset();
3974  assert(BaseOffset + SliceSize > BaseOffset &&
3975  "Cannot represent alloca access size using 64-bit integers!");
3976 
3977  Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
3978  IRB.SetInsertPoint(LI);
3979 
3980  LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
3981 
3982  uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
3983  int Idx = 0, Size = Offsets.Splits.size();
3984  for (;;) {
3985  auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
3986  auto AS = LI->getPointerAddressSpace();
3987  auto *PartPtrTy = PartTy->getPointerTo(AS);
3988  LoadInst *PLoad = IRB.CreateAlignedLoad(
3989  PartTy,
3990  getAdjustedPtr(IRB, DL, BasePtr,
3991  APInt(DL.getIndexSizeInBits(AS), PartOffset),
3992  PartPtrTy, BasePtr->getName() + "."),
3993  getAdjustedAlignment(LI, PartOffset),
3994  /*IsVolatile*/ false, LI->getName());
3995  PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
3996  LLVMContext::MD_access_group});
3997 
3998  // Append this load onto the list of split loads so we can find it later
3999  // to rewrite the stores.
4000  SplitLoads.push_back(PLoad);
4001 
4002  // Now build a new slice for the alloca.
4003  NewSlices.push_back(
4004  Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4005  &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4006  /*IsSplittable*/ false));
4007  LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4008  << ", " << NewSlices.back().endOffset()
4009  << "): " << *PLoad << "\n");
4010 
4011  // See if we've handled all the splits.
4012  if (Idx >= Size)
4013  break;
4014 
4015  // Setup the next partition.
4016  PartOffset = Offsets.Splits[Idx];
4017  ++Idx;
4018  PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4019  }
4020 
4021  // Now that we have the split loads, do the slow walk over all uses of the
4022  // load and rewrite them as split stores, or save the split loads to use
4023  // below if the store is going to be split there anyways.
4024  bool DeferredStores = false;
4025  for (User *LU : LI->users()) {
4026  StoreInst *SI = cast<StoreInst>(LU);
4027  if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4028  DeferredStores = true;
4029  LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4030  << "\n");
4031  continue;
4032  }
4033 
4034  Value *StoreBasePtr = SI->getPointerOperand();
4035  IRB.SetInsertPoint(SI);
4036 
4037  LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4038 
4039  for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4040  LoadInst *PLoad = SplitLoads[Idx];
4041  uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4042  auto *PartPtrTy =
4043  PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
4044 
4045  auto AS = SI->getPointerAddressSpace();
4046  StoreInst *PStore = IRB.CreateAlignedStore(
4047  PLoad,
4048  getAdjustedPtr(IRB, DL, StoreBasePtr,
4049  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4050  PartPtrTy, StoreBasePtr->getName() + "."),
4051  getAdjustedAlignment(SI, PartOffset),
4052  /*IsVolatile*/ false);
4053  PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4054  LLVMContext::MD_access_group});
4055  LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4056  }
4057 
4058  // We want to immediately iterate on any allocas impacted by splitting
4059  // this store, and we have to track any promotable alloca (indicated by
4060  // a direct store) as needing to be resplit because it is no longer
4061  // promotable.
4062  if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
4063  ResplitPromotableAllocas.insert(OtherAI);
4064  Worklist.insert(OtherAI);
4065  } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4066  StoreBasePtr->stripInBoundsOffsets())) {
4067  Worklist.insert(OtherAI);
4068  }
4069 
4070  // Mark the original store as dead.
4071  DeadInsts.push_back(SI);
4072  }
4073 
4074  // Save the split loads if there are deferred stores among the users.
4075  if (DeferredStores)
4076  SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
4077 
4078  // Mark the original load as dead and kill the original slice.
4079  DeadInsts.push_back(LI);
4080  Offsets.S->kill();
4081  }
4082 
4083  // Second, we rewrite all of the split stores. At this point, we know that
4084  // all loads from this alloca have been split already. For stores of such
4085  // loads, we can simply look up the pre-existing split loads. For stores of
4086  // other loads, we split those loads first and then write split stores of
4087  // them.
4088  for (StoreInst *SI : Stores) {
4089  auto *LI = cast<LoadInst>(SI->getValueOperand());
4090  IntegerType *Ty = cast<IntegerType>(LI->getType());
4091  assert(Ty->getBitWidth() % 8 == 0);
4092  uint64_t StoreSize = Ty->getBitWidth() / 8;
4093  assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4094 
4095  auto &Offsets = SplitOffsetsMap[SI];
4096  assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4097  "Slice size should always match load size exactly!");
4098  uint64_t BaseOffset = Offsets.S->beginOffset();
4099  assert(BaseOffset + StoreSize > BaseOffset &&
4100  "Cannot represent alloca access size using 64-bit integers!");
4101 
4102  Value *LoadBasePtr = LI->getPointerOperand();
4103  Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
4104 
4105  LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4106 
4107  // Check whether we have an already split load.
4108  auto SplitLoadsMapI = SplitLoadsMap.find(LI);
4109  std::vector<LoadInst *> *SplitLoads = nullptr;
4110  if (SplitLoadsMapI != SplitLoadsMap.end()) {
4111  SplitLoads = &SplitLoadsMapI->second;
4112  assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
4113  "Too few split loads for the number of splits in the store!");
4114  } else {
4115  LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
4116  }
4117 
4118  uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4119  int Idx = 0, Size = Offsets.Splits.size();
4120  for (;;) {
4121  auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
4122  auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
4123  auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
4124 
4125  // Either lookup a split load or create one.
4126  LoadInst *PLoad;
4127  if (SplitLoads) {
4128  PLoad = (*SplitLoads)[Idx];
4129  } else {
4130  IRB.SetInsertPoint(LI);
4131  auto AS = LI->getPointerAddressSpace();
4132  PLoad = IRB.CreateAlignedLoad(
4133  PartTy,
4134  getAdjustedPtr(IRB, DL, LoadBasePtr,
4135  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4136  LoadPartPtrTy, LoadBasePtr->getName() + "."),
4137  getAdjustedAlignment(LI, PartOffset),
4138  /*IsVolatile*/ false, LI->getName());
4139  PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4140  LLVMContext::MD_access_group});
4141  }
4142 
4143  // And store this partition.
4144  IRB.SetInsertPoint(SI);
4145  auto AS = SI->getPointerAddressSpace();
4146  StoreInst *PStore = IRB.CreateAlignedStore(
4147  PLoad,
4148  getAdjustedPtr(IRB, DL, StoreBasePtr,
4149  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4150  StorePartPtrTy, StoreBasePtr->getName() + "."),
4151  getAdjustedAlignment(SI, PartOffset),
4152  /*IsVolatile*/ false);
4153  PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4154  LLVMContext::MD_access_group});
4155 
4156  // Now build a new slice for the alloca.
4157  NewSlices.push_back(
4158  Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4159  &PStore->getOperandUse(PStore->getPointerOperandIndex()),
4160  /*IsSplittable*/ false));
4161  LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4162  << ", " << NewSlices.back().endOffset()
4163  << "): " << *PStore << "\n");
4164  if (!SplitLoads) {
4165  LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
4166  }
4167 
4168  // See if we've finished all the splits.
4169  if (Idx >= Size)
4170  break;
4171 
4172  // Setup the next partition.
4173  PartOffset = Offsets.Splits[Idx];
4174  ++Idx;
4175  PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
4176  }
4177 
4178  // We want to immediately iterate on any allocas impacted by splitting
4179  // this load, which is only relevant if it isn't a load of this alloca and
4180  // thus we didn't already split the loads above. We also have to keep track
4181  // of any promotable allocas we split loads on as they can no longer be
4182  // promoted.
4183  if (!SplitLoads) {
4184  if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
4185  assert(OtherAI != &AI && "We can't re-split our own alloca!");
4186  ResplitPromotableAllocas.insert(OtherAI);
4187  Worklist.insert(OtherAI);
4188  } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4189  LoadBasePtr->stripInBoundsOffsets())) {
4190  assert(OtherAI != &AI && "We can't re-split our own alloca!");
4191  Worklist.insert(OtherAI);
4192  }
4193  }
4194 
4195  // Mark the original store as dead now that we've split it up and kill its
4196  // slice. Note that we leave the original load in place unless this store
4197  // was its only use. It may in turn be split up if it is an alloca load
4198  // for some other alloca, but it may be a normal load. This may introduce
4199  // redundant loads, but where those can be merged the rest of the optimizer
4200  // should handle the merging, and this uncovers SSA splits which is more
4201  // important. In practice, the original loads will almost always be fully
4202  // split and removed eventually, and the splits will be merged by any
4203  // trivial CSE, including instcombine.
4204  if (LI->hasOneUse()) {
4205  assert(*LI->user_begin() == SI && "Single use isn't this store!");
4206  DeadInsts.push_back(LI);
4207  }
4208  DeadInsts.push_back(SI);
4209  Offsets.S->kill();
4210  }
4211 
4212  // Remove the killed slices that have been pre-split.
4213  llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
4214 
4215  // Insert our new slices. This will sort and merge them into the sorted
4216  // sequence.
4217  AS.insert(NewSlices);
4218 
4219  LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
4220 #ifndef NDEBUG
4221  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
4222  LLVM_DEBUG(AS.print(dbgs(), I, " "));
4223 #endif
4224 
4225  // Finally, don't try to promote any allocas that now require re-splitting.
4226  // They have already been added to the worklist above.
4227  llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) {
4228  return ResplitPromotableAllocas.count(AI);
4229  });
4230 
4231  return true;
4232 }
4233 
4234 /// Rewrite an alloca partition's users.
4235 ///
4236 /// This routine drives both of the rewriting goals of the SROA pass. It tries
4237 /// to rewrite uses of an alloca partition to be conducive for SSA value
4238 /// promotion. If the partition needs a new, more refined alloca, this will
4239 /// build that new alloca, preserving as much type information as possible, and
4240 /// rewrite the uses of the old alloca to point at the new one and have the
4241 /// appropriate new offsets. It also evaluates how successful the rewrite was
4242 /// at enabling promotion and if it was successful queues the alloca to be
4243 /// promoted.
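/// As a simplified example (names invented), a [0, 8) partition of
///   %x = alloca { i64, i32 }
/// whose uses all operate on i64 is rebuilt as
///   %x.sroa.0 = alloca i64
/// and the partition's uses are rewritten to address the new alloca directly.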
4244 AllocaInst *SROAPass::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
4245  Partition &P) {
4246  // Try to compute a friendly type for this partition of the alloca. This
4247  // won't always succeed, in which case we fall back to a legal integer type
4248  // or an i8 array of an appropriate size.
4249  Type *SliceTy = nullptr;
4250  const DataLayout &DL = AI.getModule()->getDataLayout();
4251  std::pair<Type *, IntegerType *> CommonUseTy =
4252  findCommonType(P.begin(), P.end(), P.endOffset());
4253  // Do all uses operate on the same type?
4254  if (CommonUseTy.first)
4255  if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size())
4256  SliceTy = CommonUseTy.first;
4257  // If not, can we find an appropriate subtype in the original allocated type?
4258  if (!SliceTy)
4259  if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
4260  P.beginOffset(), P.size()))
4261  SliceTy = TypePartitionTy;
4262  // If still not, can we use the largest bitwidth integer type used?
4263  if (!SliceTy && CommonUseTy.second)
4264  if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size())
4265  SliceTy = CommonUseTy.second;
4266  if ((!SliceTy || (SliceTy->isArrayTy() &&
4267  SliceTy->getArrayElementType()->isIntegerTy())) &&
4268  DL.isLegalInteger(P.size() * 8))
4269  SliceTy = Type::getIntNTy(*C, P.size() * 8);
4270  if (!SliceTy)
4271  SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
4272  assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
4273 
4274  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
4275 
4276  VectorType *VecTy =
4277  IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
4278  if (VecTy)
4279  SliceTy = VecTy;
4280 
4281  // Check for the case where we're going to rewrite to a new alloca of the
4282  // exact same type as the original, and with the same access offsets. In that
4283  // case, re-use the existing alloca, but still run through the rewriter to
4284  // perform phi and select speculation.
4285  // P.beginOffset() can be non-zero even with the same type in a case with
4286  // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
4287  AllocaInst *NewAI;
4288  if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
4289  NewAI = &AI;
4290  // FIXME: We should be able to bail at this point with "nothing changed".
4291  // FIXME: We might want to defer PHI speculation until after here.
4292  // FIXME: return nullptr;
4293  } else {
4294  // Make sure the alignment is compatible with P.beginOffset().
4295  const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
4296  // If we will get at least this much alignment from the type alone, leave
4297  // the alloca's alignment unconstrained.
4298  const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
4299  NewAI = new AllocaInst(
4300  SliceTy, AI.getType()->getAddressSpace(), nullptr,
4301  IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
4302  AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
4303  // Copy the old AI debug location over to the new one.
4304  NewAI->setDebugLoc(AI.getDebugLoc());
4305  ++NumNewAllocas;
4306  }
4307 
4308  LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
4309  << "[" << P.beginOffset() << "," << P.endOffset()
4310  << ") to: " << *NewAI << "\n");
4311 
4312  // Track the high watermark on the worklist as it is only relevant for
4313  // promoted allocas. We will reset it to this point if the alloca is not in
4314  // fact scheduled for promotion.
4315  unsigned PPWOldSize = PostPromotionWorklist.size();
4316  unsigned NumUses = 0;
4317  SmallSetVector<PHINode *, 8> PHIUsers;
4318  SmallSetVector<SelectInst *, 8> SelectUsers;
4319 
4320  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
4321  P.endOffset(), IsIntegerPromotable, VecTy,
4322  PHIUsers, SelectUsers);
4323  bool Promotable = true;
4324  for (Slice *S : P.splitSliceTails()) {
4325  Promotable &= Rewriter.visit(S);
4326  ++NumUses;
4327  }
4328  for (Slice &S : P) {
4329  Promotable &= Rewriter.visit(&S);
4330  ++NumUses;
4331  }
4332 
4333  NumAllocaPartitionUses += NumUses;
4334  MaxUsesPerAllocaPartition.updateMax(NumUses);
4335 
4336  // Now that we've processed all the slices in the new partition, check if any
4337  // PHIs or Selects would block promotion.
4338  for (PHINode *PHI : PHIUsers)
4339  if (!isSafePHIToSpeculate(*PHI)) {
4340  Promotable = false;
4341  PHIUsers.clear();
4342  SelectUsers.clear();
4343  break;
4344  }
4345 
4346  for (SelectInst *Sel : SelectUsers)
4347  if (!isSafeSelectToSpeculate(*Sel)) {
4348  Promotable = false;
4349  PHIUsers.clear();
4350  SelectUsers.clear();
4351  break;
4352  }
4353 
4354  if (Promotable) {
4355  for (Use *U : AS.getDeadUsesIfPromotable()) {
4356  auto *OldInst = dyn_cast<Instruction>(U->get());
4357  Value::dropDroppableUse(*U);
4358  if (OldInst)
4359  if (isInstructionTriviallyDead(OldInst))
4360  DeadInsts.push_back(OldInst);
4361  }
4362  if (PHIUsers.empty() && SelectUsers.empty()) {
4363  // Promote the alloca.
4364  PromotableAllocas.push_back(NewAI);
4365  } else {
4366  // If we have either PHIs or Selects to speculate, add them to those
4367  // worklists and re-queue the new alloca so that we promote it on the
4368  // next iteration.
4369  for (PHINode *PHIUser : PHIUsers)
4370  SpeculatablePHIs.insert(PHIUser);
4371  for (SelectInst *SelectUser : SelectUsers)
4372  SpeculatableSelects.insert(SelectUser);
4373  Worklist.insert(NewAI);
4374  }
4375  } else {
4376  // Drop any post-promotion work items if promotion didn't happen.
4377  while (PostPromotionWorklist.size() > PPWOldSize)
4378  PostPromotionWorklist.pop_back();
4379 
4380  // We couldn't promote and we didn't create a new partition; nothing
4381  // happened.
4382  if (NewAI == &AI)
4383  return nullptr;
4384 
4385  // If we can't promote the alloca, iterate on it to check for new
4386  // refinements exposed by splitting the current alloca. Don't iterate on an
4387  // alloca which didn't actually change and didn't get promoted.
4388  Worklist.insert(NewAI);
4389  }
4390 
4391  return NewAI;
4392 }
4393 
4394 /// Walks the slices of an alloca and forms partitions based on them,
4395 /// rewriting each of their uses.
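//
// A minimal sketch of the partitioning, assuming a hypothetical alloca whose
// two halves are accessed independently:
//
//   %buf = alloca i64
//   ; one i32 store writes bytes [0,4), another writes bytes [4,8)
//
// The slices form two disjoint partitions, [0,4) and [4,8); rewritePartition
// runs once per partition and would typically produce two i32-typed allocas,
// each of which can then be promoted to an SSA value by promoteAllocas.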
4396 bool SROAPass::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
4397  if (AS.begin() == AS.end())
4398  return false;
4399 
4400  unsigned NumPartitions = 0;
4401  bool Changed = false;
4402  const DataLayout &DL = AI.getModule()->getDataLayout();
4403 
4404  // First try to pre-split loads and stores.
4405  Changed |= presplitLoadsAndStores(AI, AS);
4406 
4407  // Now that we have identified any pre-splitting opportunities,
4408  // mark loads and stores unsplittable except for the following case.
4409  // We leave a slice splittable if all other slices are disjoint or fully
4410  // included in the slice, such as whole-alloca loads and stores.
4411  // If we fail to split these during pre-splitting, we want to force them
4412  // to be rewritten into a partition.
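  // For example (hypothetical, and assuming both slices were initially marked
  // splittable): with `%buf = alloca i64`, a whole-alloca `load i64` covers
  // [0,8) while a `load i32` at offset 4 covers [4,8). Offset 4 lies strictly
  // inside the whole-alloca slice, so the i32 load is made unsplittable below,
  // whereas the whole-alloca load keeps its splittable flag because its
  // endpoints (0 and 8) are not interior to any other slice.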
4413  bool IsSorted = true;
4414 
4415  uint64_t AllocaSize =
4416  DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
4417  const uint64_t MaxBitVectorSize = 1024;
4418  if (AllocaSize <= MaxBitVectorSize) {
4419  // If a byte boundary is included in any load or store, a slice starting or
4420  // ending at the boundary is not splittable.
4421  SmallBitVector SplittableOffset(AllocaSize + 1, true);
4422  for (Slice &S : AS)
4423  for (unsigned O = S.beginOffset() + 1;
4424  O < S.endOffset() && O < AllocaSize; O++)
4425  SplittableOffset.reset(O);
4426 
4427  for (Slice &S : AS) {
4428  if (!S.isSplittable())
4429  continue;
4430 
4431  if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
4432  (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
4433  continue;
4434 
4435  if (isa<LoadInst>(S.getUse()->getUser()) ||
4436  isa<StoreInst>(S.getUse()->getUser())) {
4437  S.makeUnsplittable();
4438  IsSorted = false;
4439  }
4440  }
4441  }
4442  else {
4443  // We only allow whole-alloca splittable loads and stores
4444  // for a large alloca to avoid creating too large BitVector.
4445  for (Slice &S : AS) {
4446  if (!S.isSplittable())
4447  continue;
4448 
4449  if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
4450  continue;
4451 
4452  if (isa<LoadInst>(S.getUse()->getUser()) ||
4453  isa<StoreInst>(S.getUse()->getUser())) {
4454  S.makeUnsplittable();
4455  IsSorted = false;
4456  }
4457  }
4458  }
4459 
4460  if (!IsSorted)
4461  llvm::sort(AS);
4462 
4463  /// Describes the allocas introduced by rewritePartition in order to migrate
4464  /// the debug info.
4465  struct Fragment {
4466  AllocaInst *Alloca;
4467  uint64_t Offset;
4468  uint64_t Size;
4469  Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
4470  : Alloca(AI), Offset(O), Size(S) {}
4471  };
4472  SmallVector<Fragment, 4> Fragments;
4473 
4474  // Rewrite each partition.
4475  for (auto &P : AS.partitions()) {
4476  if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
4477  Changed = true;
4478  if (NewAI != &AI) {
4479  uint64_t SizeOfByte = 8;
4480  uint64_t AllocaSize =
4481  DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedSize();
4482  // Don't include any padding.
4483  uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
4484  Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
4485  }
4486  }
4487  ++NumPartitions;
4488  }
4489 
4490  NumAllocaPartitions += NumPartitions;
4491  MaxPartitionsPerAlloca.updateMax(NumPartitions);
4492 
4493  // Migrate debug information from the old alloca to the new alloca(s)
4494  // and the individual partitions.
4495  TinyPtrVector<DbgVariableIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
4496  for (DbgVariableIntrinsic *DbgDeclare : DbgDeclares) {
4497  auto *Expr = DbgDeclare->getExpression();
4498  DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
4499  uint64_t AllocaSize =
4500  DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedSize();
4501  for (auto Fragment : Fragments) {
4502  // Create a fragment expression describing the new partition or reuse AI's
4503  // expression if there is only one partition.
4504  auto *FragmentExpr = Expr;
4505  if (Fragment.Size < AllocaSize || Expr->isFragment()) {
4506  // If this alloca is already a scalar replacement of a larger aggregate,
4507  // Fragment.Offset describes the offset inside the scalar.
4508  auto ExprFragment = Expr->getFragmentInfo();
4509  uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
4510  uint64_t Start = Offset + Fragment.Offset;
4511  uint64_t Size = Fragment.Size;
4512  if (ExprFragment) {
4513  uint64_t AbsEnd =
4514  ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
4515  if (Start >= AbsEnd)
4516  // No need to describe a SROAed padding.
4517  continue;
4518  Size = std::min(Size, AbsEnd - Start);
4519  }
4520  // The new, smaller fragment is stenciled out from the old fragment.
4521  if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
4522  assert(Start >= OrigFragment->OffsetInBits &&
4523  "new fragment is outside of original fragment");
4524  Start -= OrigFragment->OffsetInBits;
4525  }
4526 
4527  // The alloca may be larger than the variable.
4528  auto VarSize = DbgDeclare->getVariable()->getSizeInBits();
4529  if (VarSize) {
4530  if (Size > *VarSize)
4531  Size = *VarSize;
4532  if (Size == 0 || Start + Size > *VarSize)
4533  continue;
4534  }
4535 
4536  // Avoid creating a fragment expression that covers the entire variable.
4537  if (!VarSize || *VarSize != Size) {
4538  if (auto E =
4539  DIExpression::createFragmentExpression(Expr, Start, Size))
4540  FragmentExpr = *E;
4541  else
4542  continue;
4543  }
4544  }
4545 
4546  // Remove any existing intrinsics on the new alloca describing
4547  // the variable fragment.
4548  for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca)) {
4549  auto SameVariableFragment = [](const DbgVariableIntrinsic *LHS,
4550  const DbgVariableIntrinsic *RHS) {
4551  return LHS->getVariable() == RHS->getVariable() &&
4552  LHS->getDebugLoc()->getInlinedAt() ==
4553  RHS->getDebugLoc()->getInlinedAt();
4554  };
4555  if (SameVariableFragment(OldDII, DbgDeclare))
4556  OldDII->eraseFromParent();
4557  }
4558 
4559  DIB.insertDeclare(Fragment.Alloca, DbgDeclare->getVariable(), FragmentExpr,
4560  DbgDeclare->getDebugLoc(), &AI);
4561  }
4562  }
4563  return Changed;
4564 }
4565 
4566 /// Clobber a use with poison, deleting the used value if it becomes dead.
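///
/// For example (hypothetical): when runOnAlloca clobbers the operands of a
/// dead user whose pointer operand is a GEP, that operand is overwritten with
/// `poison`; if the GEP is left with no remaining uses and no side effects, it
/// is queued on DeadInsts for deleteDeadInstructions to erase later.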
4567 void SROAPass::clobberUse(Use &U) {
4568  Value *OldV = U;
4569  // Replace the use with a poison value.
4570  U = PoisonValue::get(OldV->getType());
4571 
4572  // Check for this making an instruction dead. We have to garbage collect
4573  // all the dead instructions to ensure the uses of any alloca end up being
4574  // minimal.
4575  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
4576  if (isInstructionTriviallyDead(OldI)) {
4577  DeadInsts.push_back(OldI);
4578  }
4579 }
4580 
4581 /// Analyze an alloca for SROA.
4582 ///
4583 /// This analyzes the alloca to ensure we can reason about it, builds
4584 /// the slices of the alloca, and then hands it off to be split and
4585 /// rewritten as needed.
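///
/// Illustrative alloca forms that are skipped by the checks below
/// (hypothetical IR):
///
///   %p = alloca i32, i32 %n          ; array allocation with a runtime count
///   %v = alloca <vscale x 4 x i32>   ; scalable vector type
///   %z = alloca {}                   ; zero-sized allocation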
4586 bool SROAPass::runOnAlloca(AllocaInst &AI) {
4587  LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
4588  ++NumAllocasAnalyzed;
4589 
4590  // Special case dead allocas, as they're trivial.
4591  if (AI.use_empty()) {
4592  AI.eraseFromParent();
4593  return true;
4594  }
4595  const DataLayout &DL = AI.getModule()->getDataLayout();
4596 
4597  // Skip alloca forms that this analysis can't handle.
4598  auto *AT = AI.getAllocatedType();
4599  if (AI.isArrayAllocation() || !AT->isSized() || isa<ScalableVectorType>(AT) ||
4600  DL.getTypeAllocSize(AT).getFixedSize() == 0)
4601  return false;
4602 
4603  bool Changed = false;
4604 
4605  // First, split any FCA loads and stores touching this alloca to promote
4606  // better splitting and promotion opportunities.
4607  IRBuilderTy IRB(&AI);
4608  AggLoadStoreRewriter AggRewriter(DL, IRB);
4609  Changed |= AggRewriter.rewrite(AI);
4610 
4611  // Build the slices using a recursive instruction-visiting builder.
4612  AllocaSlices AS(DL, AI);
4613  LLVM_DEBUG(AS.print(dbgs()));
4614  if (AS.isEscaped())
4615  return Changed;
4616 
4617  // Delete all the dead users of this alloca before splitting and rewriting it.
4618  for (Instruction *DeadUser : AS.getDeadUsers()) {
4619  // Free up everything used by this instruction.
4620  for (Use &DeadOp : DeadUser->operands())
4621  clobberUse(DeadOp);
4622 
4623  // Now replace the uses of this instruction.
4624  DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
4625 
4626  // And mark it for deletion.
4627  DeadInsts.push_back(DeadUser);
4628  Changed = true;
4629  }
4630  for (Use *DeadOp : AS.getDeadOperands()) {
4631  clobberUse(*DeadOp);
4632  Changed = true;
4633  }
4634 
4635  // No slices to split. Leave the dead alloca for a later pass to clean up.
4636  if (AS.begin() == AS.end())
4637  return Changed;
4638 
4639  Changed |= splitAlloca(AI, AS);
4640 
4641  LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
4642  while (!SpeculatablePHIs.empty())
4643  speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
4644 
4645  LLVM_DEBUG(dbgs() << " Speculating Selects\n");
4646  while (!SpeculatableSelects.empty())
4647  speculateSelectInstLoads(IRB, *SpeculatableSelects.pop_back_val());
4648 
4649  return Changed;
4650 }
4651 
4652 /// Delete the dead instructions accumulated in this run.
4653 ///
4654 /// Recursively deletes the dead instructions we've accumulated. This is done
4655 /// at the very end to maximize locality of the recursive delete and to
4656 /// minimize the problems of invalidated instruction pointers as such pointers
4657 /// are used heavily in the intermediate stages of the algorithm.
4658 ///
4659 /// We also record the alloca instructions deleted here so that they aren't
4660 /// subsequently handed to mem2reg to promote.
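///
/// As a small worked example (hypothetical): if DeadInsts holds a dead store
/// whose pointer operand is a GEP with no other users, the store's operands
/// are nulled out before it is erased, which leaves the GEP with no uses; the
/// GEP is then pushed onto DeadInsts and erased on a later iteration of the
/// loop below.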
4661 bool SROAPass::deleteDeadInstructions(
4662  SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
4663  bool Changed = false;
4664  while (!DeadInsts.empty()) {
4665  Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
4666  if (!I) continue;
4667  LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
4668 
4669  // If the instruction is an alloca, find the possible dbg.declare connected
4670  // to it, and remove it too. We must do this before calling RAUW or we will
4671  // not be able to find it.
4672  if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
4673  DeletedAllocas.insert(AI);
4674  for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(AI))
4675  OldDII->eraseFromParent();
4676  }
4677 
4678  I->replaceAllUsesWith(UndefValue::get(I->getType()));
4679 
4680  for (Use &Operand : I->operands())
4681  if (Instruction *U = dyn_cast<Instruction>(Operand)) {
4682  // Zero out the operand and see if it becomes trivially dead.
4683  Operand = nullptr;
4684  if (isInstructionTriviallyDead(U))
4685  DeadInsts.push_back(U);
4686  }
4687 
4688  ++NumDeleted;
4689  I->eraseFromParent();
4690  Changed = true;
4691  }
4692  return Changed;
4693 }
4694 
4695 /// Promote the allocas, using the best available technique.
4696 ///
4697 /// This attempts to promote whatever allocas have been identified as viable in
4698 /// the PromotableAllocas list. If that list is empty, there is nothing to do.
4699 /// This function returns whether any promotion occurred.
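///
/// A minimal sketch of what promotion means here, assuming a hypothetical,
/// already-promotable alloca:
///
///   %x = alloca i32
///   store i32 %a, ptr %x
///   %v = load i32, ptr %x
///
/// PromoteMemToReg rewrites such allocas into SSA values (inserting PHI nodes
/// at control-flow merges where needed), after which %v is simply %a and %x
/// disappears.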
4700 bool SROAPass::promoteAllocas(Function &F) {
4701  if (PromotableAllocas.empty())
4702  return false;
4703 
4704  NumPromoted += PromotableAllocas.size();
4705 
4706  LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
4707  PromoteMemToReg(PromotableAllocas, *DT, AC);
4708  PromotableAllocas.clear();
4709  return true;
4710 }
4711 
4712 PreservedAnalyses SROAPass::runImpl(Function &F, DominatorTree &RunDT,
4713  AssumptionCache &RunAC) {
4714  LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
4715  C = &F.getContext();
4716  DT = &RunDT;
4717  AC = &RunAC;
4718 
4719  BasicBlock &EntryBB = F.getEntryBlock();
4720  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
4721  I != E; ++I) {
4722  if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
4723  if (isa<ScalableVectorType>(AI->getAllocatedType())) {
4724  if (isAllocaPromotable(AI))
4725  PromotableAllocas.push_back(AI);
4726  } else {
4727  Worklist.insert(AI);
4728  }
4729  }
4730  }
4731 
4732  bool Changed = false;
4733  // A set of deleted alloca instruction pointers which should be removed from
4734  // the list of promotable allocas.
4735  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
4736 
4737  do {
4738  while (!Worklist.empty()) {
4739  Changed |= runOnAlloca(*Worklist.pop_back_val());
4740  Changed |= deleteDeadInstructions(DeletedAllocas);
4741 
4742  // Remove the deleted allocas from various lists so that we don't try to
4743  // continue processing them.
4744  if (!DeletedAllocas.empty()) {
4745  auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
4746  Worklist.remove_if(IsInSet);
4747  PostPromotionWorklist.remove_if(IsInSet);
4748  llvm::erase_if(PromotableAllocas, IsInSet);
4749  DeletedAllocas.clear();
4750  }
4751  }
4752 
4753  Changed |= promoteAllocas(F);
4754 
4755  Worklist = PostPromotionWorklist;
4756  PostPromotionWorklist.clear();
4757  } while (!Worklist.empty());
4758 
4759  if (!Changed)
4760  return PreservedAnalyses::all();
4761 
4762  PreservedAnalyses PA;
4763  PA.preserveSet<CFGAnalyses>();
4764  return PA;
4765 }
4766 
4767 PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
4768  return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
4769  AM.getResult<AssumptionAnalysis>(F));
4770 }
4771 
4772 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
4773 ///
4774 /// This is in the llvm namespace purely to allow it to be a friend of the \c
4775 /// SROA pass.
4776 class llvm::sroa::SROALegacyPass : public FunctionPass {
4777  /// The SROA implementation.
4778  SROAPass Impl;
4779 
4780 public:
4781  static char ID;
4782 
4783  SROALegacyPass() : FunctionPass(ID) {
4784  initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
4785  }
4786 
4787  bool runOnFunction(Function &F) override {
4788  if (skipFunction(F))
4789  return false;
4790 
4791  auto PA = Impl.runImpl(
4792  F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
4793  getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
4794  return !PA.areAllPreserved();
4795  }
4796 
4797  void getAnalysisUsage(AnalysisUsage &AU) const override {
4798  AU.addRequired<AssumptionCacheTracker>();
4799  AU.addRequired<DominatorTreeWrapperPass>();
4800  AU.addPreserved<GlobalsAAWrapperPass>();
4801  AU.setPreservesCFG();
4802  }
4803 
4804  StringRef getPassName() const override { return "SROA"; }
4805 };
4806 
4807 char SROALegacyPass::ID = 0;
4808 
4809 FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
4810 
4811 INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
4812  "Scalar Replacement Of Aggregates", false, false)
4813 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
4814 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
4815 INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
4816  false, false)