1 //===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This transformation implements the well known scalar replacement of
10 /// aggregates transformation. It tries to identify promotable elements of an
11 /// aggregate alloca, and promote them to registers. It will also try to
12 /// convert uses of an element (or set of elements) of an alloca into a vector
13 /// or bitfield-style integer scalar if appropriate.
14 ///
15 /// It works to do this with minimal slicing of the alloca so that regions
16 /// which are merely transferred in and out of external memory remain unchanged
17 /// and are not decomposed to scalar code.
18 ///
19 /// Because this also performs alloca promotion, it can be thought of as also
20 /// serving the purpose of SSA formation. The algorithm iterates on the
21 /// function until all opportunities for promotion have been realized.
22 ///
23 //===----------------------------------------------------------------------===//
24 
26 #include "llvm/ADT/APInt.h"
27 #include "llvm/ADT/ArrayRef.h"
28 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
33 #include "llvm/ADT/SmallPtrSet.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/Statistic.h"
36 #include "llvm/ADT/StringRef.h"
37 #include "llvm/ADT/Twine.h"
38 #include "llvm/ADT/iterator.h"
42 #include "llvm/Analysis/Loads.h"
44 #include "llvm/Config/llvm-config.h"
45 #include "llvm/IR/BasicBlock.h"
46 #include "llvm/IR/Constant.h"
47 #include "llvm/IR/ConstantFolder.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DIBuilder.h"
50 #include "llvm/IR/DataLayout.h"
51 #include "llvm/IR/DebugInfo.h"
53 #include "llvm/IR/DerivedTypes.h"
54 #include "llvm/IR/Dominators.h"
55 #include "llvm/IR/Function.h"
57 #include "llvm/IR/GlobalAlias.h"
58 #include "llvm/IR/IRBuilder.h"
59 #include "llvm/IR/InstVisitor.h"
60 #include "llvm/IR/Instruction.h"
61 #include "llvm/IR/Instructions.h"
62 #include "llvm/IR/IntrinsicInst.h"
63 #include "llvm/IR/LLVMContext.h"
64 #include "llvm/IR/Metadata.h"
65 #include "llvm/IR/Module.h"
66 #include "llvm/IR/Operator.h"
67 #include "llvm/IR/PassManager.h"
68 #include "llvm/IR/Type.h"
69 #include "llvm/IR/Use.h"
70 #include "llvm/IR/User.h"
71 #include "llvm/IR/Value.h"
72 #include "llvm/InitializePasses.h"
73 #include "llvm/Pass.h"
74 #include "llvm/Support/Casting.h"
76 #include "llvm/Support/Compiler.h"
77 #include "llvm/Support/Debug.h"
80 #include "llvm/Transforms/Scalar.h"
83 #include <algorithm>
84 #include <cassert>
85 #include <cstddef>
86 #include <cstdint>
87 #include <cstring>
88 #include <iterator>
89 #include <string>
90 #include <tuple>
91 #include <utility>
92 #include <vector>
93 
94 using namespace llvm;
95 using namespace llvm::sroa;
96 
97 #define DEBUG_TYPE "sroa"
98 
99 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
100 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
101 STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
102 STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
103 STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
104 STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
105 STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
106 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
107 STATISTIC(NumDeleted, "Number of instructions deleted");
108 STATISTIC(NumVectorized, "Number of vectorized aggregates");
109 
110 /// Hidden option to experiment with completely strict handling of inbounds
111 /// GEPs.
112 static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
113  cl::Hidden);
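// Note (not from the original source): as a hidden cl::opt this flag is for
// experimentation only and does not appear in -help, but it is still accepted
// on the command line. A plausible invocation, assuming a standard `opt`
// build, would be: opt -passes=sroa -sroa-strict-inbounds -S input.ll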
114 
115 namespace {
116 
117 /// A custom IRBuilder inserter which prefixes all names, but only in
118 /// Assert builds.
119 class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
120  std::string Prefix;
121 
122  Twine getNameWithPrefix(const Twine &Name) const {
123  return Name.isTriviallyEmpty() ? Name : Prefix + Name;
124  }
125 
126 public:
127  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
128 
129  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
130  BasicBlock::iterator InsertPt) const override {
131  IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
132  InsertPt);
133  }
134 };
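// Illustrative sketch (hypothetical names, not taken from the code above):
// after SetNamePrefix("alloca.0."), an instruction that would otherwise be
// inserted as "sroa_idx" is inserted as "alloca.0.sroa_idx", while unnamed
// instructions stay unnamed because getNameWithPrefix() leaves trivially
// empty names untouched.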
135 
136 /// Provide a type for IRBuilder that drops names in release builds.
137 using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
138 
139 /// A used slice of an alloca.
140 ///
141 /// This structure represents a slice of an alloca used by some instruction. It
142 /// stores both the begin and end offsets of this use, a pointer to the use
143 /// itself, and a flag indicating whether we can classify the use as splittable
144 /// or not when forming partitions of the alloca.
145 class Slice {
146  /// The beginning offset of the range.
147  uint64_t BeginOffset = 0;
148 
149  /// The ending offset, not included in the range.
150  uint64_t EndOffset = 0;
151 
152  /// Storage for both the use of this slice and whether it can be
153  /// split.
154  PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
155 
156 public:
157  Slice() = default;
158 
159  Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
160  : BeginOffset(BeginOffset), EndOffset(EndOffset),
161  UseAndIsSplittable(U, IsSplittable) {}
162 
163  uint64_t beginOffset() const { return BeginOffset; }
164  uint64_t endOffset() const { return EndOffset; }
165 
166  bool isSplittable() const { return UseAndIsSplittable.getInt(); }
167  void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
168 
169  Use *getUse() const { return UseAndIsSplittable.getPointer(); }
170 
171  bool isDead() const { return getUse() == nullptr; }
172  void kill() { UseAndIsSplittable.setPointer(nullptr); }
173 
174  /// Support for ordering ranges.
175  ///
176  /// This provides an ordering over ranges such that start offsets are
177  /// always increasing, and within equal start offsets, the end offsets are
178  /// decreasing. Thus the spanning range comes first in a cluster with the
179  /// same start position.
180  bool operator<(const Slice &RHS) const {
181  if (beginOffset() < RHS.beginOffset())
182  return true;
183  if (beginOffset() > RHS.beginOffset())
184  return false;
185  if (isSplittable() != RHS.isSplittable())
186  return !isSplittable();
187  if (endOffset() > RHS.endOffset())
188  return true;
189  return false;
190  }
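  // For illustration (hypothetical slices, not from any particular input):
  // at equal begin offsets the unsplittable slice sorts first, so
  // "[0,16) splittable < [0,8) unsplittable" is false; among slices with the
  // same begin offset and splittability the larger (spanning) end offset
  // comes first, e.g. [4,12) sorts before [4,8).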
191 
192  /// Support comparison with a single offset to allow binary searches.
193  friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
194  uint64_t RHSOffset) {
195  return LHS.beginOffset() < RHSOffset;
196  }
197  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
198  const Slice &RHS) {
199  return LHSOffset < RHS.beginOffset();
200  }
201 
202  bool operator==(const Slice &RHS) const {
203  return isSplittable() == RHS.isSplittable() &&
204  beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
205  }
206  bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
207 };
208 
209 } // end anonymous namespace
210 
211 /// Representation of the alloca slices.
212 ///
213 /// This class represents the slices of an alloca which are formed by its
214 /// various uses. If a pointer escapes, we can't fully build a representation
215 /// for the slices used and we reflect that in this structure. The uses are
216 /// stored, sorted by increasing beginning offset and with unsplittable slices
217 /// starting at a particular offset before splittable slices.
218 class llvm::sroa::AllocaSlices {
219 public:
220  /// Construct the slices of a particular alloca.
221  AllocaSlices(const DataLayout &DL, AllocaInst &AI);
222 
223  /// Test whether a pointer to the allocation escapes our analysis.
224  ///
225  /// If this is true, the slices are never fully built and should be
226  /// ignored.
227  bool isEscaped() const { return PointerEscapingInstr; }
228 
229  /// Support for iterating over the slices.
230  /// @{
231  using iterator = SmallVectorImpl<Slice>::iterator;
232  using range = iterator_range<iterator>;
233 
234  iterator begin() { return Slices.begin(); }
235  iterator end() { return Slices.end(); }
236 
237  using const_iterator = SmallVectorImpl<Slice>::const_iterator;
238  using const_range = iterator_range<const_iterator>;
239 
240  const_iterator begin() const { return Slices.begin(); }
241  const_iterator end() const { return Slices.end(); }
242  /// @}
243 
244  /// Erase a range of slices.
245  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
246 
247  /// Insert new slices for this alloca.
248  ///
249  /// This moves the slices into the alloca's slices collection, and re-sorts
250  /// everything so that the usual ordering properties of the alloca's slices
251  /// hold.
252  void insert(ArrayRef<Slice> NewSlices) {
253  int OldSize = Slices.size();
254  Slices.append(NewSlices.begin(), NewSlices.end());
255  auto SliceI = Slices.begin() + OldSize;
256  llvm::sort(SliceI, Slices.end());
257  std::inplace_merge(Slices.begin(), SliceI, Slices.end());
258  }
259 
260  // Forward declare the iterator and range accessor for walking the
261  // partitions.
262  class partition_iterator;
263  iterator_range<partition_iterator> partitions();
264 
265  /// Access the dead users for this alloca.
266  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
267 
268  /// Access Uses that should be dropped if the alloca is promotable.
269  ArrayRef<Use *> getDeadUsesIfPromotable() const {
270  return DeadUseIfPromotable;
271  }
272 
273  /// Access the dead operands referring to this alloca.
274  ///
275  /// These are operands which cannot actually be used to refer to the
276  /// alloca as they are outside its range and the user doesn't correct for
277  /// that. These mostly consist of PHI node inputs and the like which we just
278  /// need to replace with undef.
279  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
280 
281 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
282  void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
283  void printSlice(raw_ostream &OS, const_iterator I,
284  StringRef Indent = " ") const;
285  void printUse(raw_ostream &OS, const_iterator I,
286  StringRef Indent = " ") const;
287  void print(raw_ostream &OS) const;
288  void dump(const_iterator I) const;
289  void dump() const;
290 #endif
291 
292 private:
293  template <typename DerivedT, typename RetT = void> class BuilderBase;
294  class SliceBuilder;
295 
296  friend class AllocaSlices::SliceBuilder;
297 
298 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
299  /// Handle to alloca instruction to simplify method interfaces.
300  AllocaInst &AI;
301 #endif
302 
303  /// The instruction responsible for this alloca not having a known set
304  /// of slices.
305  ///
306  /// When an instruction (potentially) escapes the pointer to the alloca, we
307  /// store a pointer to that here and abort trying to form slices of the
308  /// alloca. This will be null if the alloca slices are analyzed successfully.
309  Instruction *PointerEscapingInstr;
310 
311  /// The slices of the alloca.
312  ///
313  /// We store a vector of the slices formed by uses of the alloca here. This
314  /// vector is sorted by increasing begin offset, and then the unsplittable
315  /// slices before the splittable ones. See the Slice inner class for more
316  /// details.
317  SmallVector<Slice, 8> Slices;
318 
319  /// Instructions which will become dead if we rewrite the alloca.
320  ///
321  /// Note that these are not separated by slice. This is because we expect an
322  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
323  /// all these instructions can simply be removed and replaced with poison as
324  /// they come from outside of the allocated space.
325  SmallVector<Instruction *, 8> DeadUsers;
326 
327  /// Uses which will become dead if we can promote the alloca.
328  SmallVector<Use *, 8> DeadUseIfPromotable;
329 
330  /// Operands which will become dead if we rewrite the alloca.
331  ///
332  /// These are operands that in their particular use can be replaced with
333  /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
334  /// to PHI nodes and the like. They aren't entirely dead (there might be
335  /// a GEP back into the bounds using it elsewhere), nor is the PHI, but we
336  /// want to swap this particular input for poison to simplify the use lists of
337  /// the alloca.
338  SmallVector<Use *, 8> DeadOperands;
339 };
340 
341 /// A partition of the slices.
342 ///
343 /// An ephemeral representation for a range of slices which can be viewed as
344 /// a partition of the alloca. This range represents a span of the alloca's
345 /// memory which cannot be split, and provides access to all of the slices
346 /// overlapping some part of the partition.
347 ///
348 /// Objects of this type are produced by traversing the alloca's slices, but
349 /// are only ephemeral and not persistent.
350 class llvm::sroa::Partition {
351 private:
352  friend class AllocaSlices;
353  friend class AllocaSlices::partition_iterator;
354 
355  using iterator = AllocaSlices::iterator;
356 
357  /// The beginning and ending offsets of the alloca for this
358  /// partition.
359  uint64_t BeginOffset = 0, EndOffset = 0;
360 
361  /// The start and end iterators of this partition.
362  iterator SI, SJ;
363 
364  /// A collection of split slice tails overlapping the partition.
365  SmallVector<Slice *, 4> SplitTails;
366 
367  /// Raw constructor builds an empty partition starting and ending at
368  /// the given iterator.
369  Partition(iterator SI) : SI(SI), SJ(SI) {}
370 
371 public:
372  /// The start offset of this partition.
373  ///
374  /// All of the contained slices start at or after this offset.
375  uint64_t beginOffset() const { return BeginOffset; }
376 
377  /// The end offset of this partition.
378  ///
379  /// All of the contained slices end at or before this offset.
380  uint64_t endOffset() const { return EndOffset; }
381 
382  /// The size of the partition.
383  ///
384  /// Note that this can never be zero.
385  uint64_t size() const {
386  assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
387  return EndOffset - BeginOffset;
388  }
389 
390  /// Test whether this partition contains no slices, and merely spans
391  /// a region occupied by split slices.
392  bool empty() const { return SI == SJ; }
393 
394  /// \name Iterate slices that start within the partition.
395  /// These may be splittable or unsplittable. They have a begin offset >= the
396  /// partition begin offset.
397  /// @{
398  // FIXME: We should probably define a "concat_iterator" helper and use that
399  // to stitch together pointee_iterators over the split tails and the
400  // contiguous iterators of the partition. That would give a much nicer
401  // interface here. We could then additionally expose filtered iterators for
402  // split, unsplit, and unsplittable slices based on the usage patterns.
403  iterator begin() const { return SI; }
404  iterator end() const { return SJ; }
405  /// @}
406 
407  /// Get the sequence of split slice tails.
408  ///
409  /// These tails are of slices which start before this partition but are
410  /// split and overlap into the partition. We accumulate these while forming
411  /// partitions.
412  ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
413 };
414 
415 /// An iterator over partitions of the alloca's slices.
416 ///
417 /// This iterator implements the core algorithm for partitioning the alloca's
418 /// slices. It is a forward iterator as we don't support backtracking for
419 /// efficiency reasons, and re-use a single storage area to maintain the
420 /// current set of split slices.
421 ///
422 /// It is templated on the slice iterator type to use so that it can operate
423 /// with either const or non-const slice iterators.
424 class AllocaSlices::partition_iterator
425  : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
426  Partition> {
427  friend class AllocaSlices;
428 
429  /// Most of the state for walking the partitions is held in a class
430  /// with a nice interface for examining them.
431  Partition P;
432 
433  /// We need to keep the end of the slices to know when to stop.
434  AllocaSlices::iterator SE;
435 
436  /// We also need to keep track of the maximum split end offset seen.
437  /// FIXME: Do we really?
438  uint64_t MaxSplitSliceEndOffset = 0;
439 
440  /// Sets the partition to be empty at given iterator, and sets the
441  /// end iterator.
442  partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
443  : P(SI), SE(SE) {
444  // If not already at the end, advance our state to form the initial
445  // partition.
446  if (SI != SE)
447  advance();
448  }
449 
450  /// Advance the iterator to the next partition.
451  ///
452  /// Requires that the iterator not be at the end of the slices.
453  void advance() {
454  assert((P.SI != SE || !P.SplitTails.empty()) &&
455  "Cannot advance past the end of the slices!");
456 
457  // Clear out any split uses which have ended.
458  if (!P.SplitTails.empty()) {
459  if (P.EndOffset >= MaxSplitSliceEndOffset) {
460  // If we've finished all splits, this is easy.
461  P.SplitTails.clear();
462  MaxSplitSliceEndOffset = 0;
463  } else {
464  // Remove the uses which have ended in the prior partition. This
465  // cannot change the max split slice end because we just checked that
466  // the prior partition ended prior to that max.
467  llvm::erase_if(P.SplitTails,
468  [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
469  assert(llvm::any_of(P.SplitTails,
470  [&](Slice *S) {
471  return S->endOffset() == MaxSplitSliceEndOffset;
472  }) &&
473  "Could not find the current max split slice offset!");
474  assert(llvm::all_of(P.SplitTails,
475  [&](Slice *S) {
476  return S->endOffset() <= MaxSplitSliceEndOffset;
477  }) &&
478  "Max split slice end offset is not actually the max!");
479  }
480  }
481 
482  // If P.SI is already at the end, then we've cleared the split tail and
483  // now have an end iterator.
484  if (P.SI == SE) {
485  assert(P.SplitTails.empty() && "Failed to clear the split slices!");
486  return;
487  }
488 
489  // If we had a non-empty partition previously, set up the state for
490  // subsequent partitions.
491  if (P.SI != P.SJ) {
492  // Accumulate all the splittable slices which started in the old
493  // partition into the split list.
494  for (Slice &S : P)
495  if (S.isSplittable() && S.endOffset() > P.EndOffset) {
496  P.SplitTails.push_back(&S);
497  MaxSplitSliceEndOffset =
498  std::max(S.endOffset(), MaxSplitSliceEndOffset);
499  }
500 
501  // Start from the end of the previous partition.
502  P.SI = P.SJ;
503 
504  // If P.SI is now at the end, we at most have a tail of split slices.
505  if (P.SI == SE) {
506  P.BeginOffset = P.EndOffset;
507  P.EndOffset = MaxSplitSliceEndOffset;
508  return;
509  }
510 
511  // If we have split slices and the next slice is after a gap and is
512  // not splittable immediately form an empty partition for the split
513  // slices up until the next slice begins.
514  if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
515  !P.SI->isSplittable()) {
516  P.BeginOffset = P.EndOffset;
517  P.EndOffset = P.SI->beginOffset();
518  return;
519  }
520  }
521 
522  // OK, we need to consume new slices. Set the end offset based on the
523  // current slice, and step SJ past it. The beginning offset of the
524  // partition is the beginning offset of the next slice unless we have
525  // pre-existing split slices that are continuing, in which case we begin
526  // at the prior end offset.
527  P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
528  P.EndOffset = P.SI->endOffset();
529  ++P.SJ;
530 
531  // There are two strategies to form a partition based on whether the
532  // partition starts with an unsplittable slice or a splittable slice.
533  if (!P.SI->isSplittable()) {
534  // When we're forming an unsplittable region, it must always start at
535  // the first slice and will extend through its end.
536  assert(P.BeginOffset == P.SI->beginOffset());
537 
538  // Form a partition including all of the overlapping slices with this
539  // unsplittable slice.
540  while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
541  if (!P.SJ->isSplittable())
542  P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
543  ++P.SJ;
544  }
545 
546  // We have a partition across a set of overlapping unsplittable
547  // slices.
548  return;
549  }
550 
551  // If we're starting with a splittable slice, then we need to form
552  // a synthetic partition spanning it and any other overlapping splittable
553  // slices.
554  assert(P.SI->isSplittable() && "Forming a splittable partition!");
555 
556  // Collect all of the overlapping splittable slices.
557  while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
558  P.SJ->isSplittable()) {
559  P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
560  ++P.SJ;
561  }
562 
563  // Back up P.EndOffset if we ended the span early when encountering an
564  // unsplittable slice. This synthesizes the early end offset of
565  // a partition spanning only splittable slices.
566  if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
567  assert(!P.SJ->isSplittable());
568  P.EndOffset = P.SJ->beginOffset();
569  }
570  }
571 
572 public:
573  bool operator==(const partition_iterator &RHS) const {
574  assert(SE == RHS.SE &&
575  "End iterators don't match between compared partition iterators!");
576 
577  // The observed positions of partitions are marked by the P.SI iterator and
578  // the emptiness of the split slices. The latter is only relevant when
579  // P.SI == SE, as the end iterator will additionally have an empty split
580  // slices list, but the prior may have the same P.SI and a tail of split
581  // slices.
582  if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
583  assert(P.SJ == RHS.P.SJ &&
584  "Same set of slices formed two different sized partitions!");
585  assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
586  "Same slice position with differently sized non-empty split "
587  "slice tails!");
588  return true;
589  }
590  return false;
591  }
592 
593  partition_iterator &operator++() {
594  advance();
595  return *this;
596  }
597 
598  Partition &operator*() { return P; }
599 };
600 
601 /// A forward range over the partitions of the alloca's slices.
602 ///
603 /// This accesses an iterator range over the partitions of the alloca's
604 /// slices. It computes these partitions on the fly based on the overlapping
605 /// offsets of the slices and the ability to split them. It will visit "empty"
606 /// partitions to cover regions of the alloca only accessed via split
607 /// slices.
608 iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
609  return make_range(partition_iterator(begin(), end()),
610  partition_iterator(end(), end()));
611 }
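// A sketch of how the partitioning behaves (hypothetical slices, assuming an
// AllocaSlices instance named AS): given sorted slices [0,8) unsplittable,
// [0,16) splittable and [16,20) unsplittable, iterating
//   for (Partition &P : AS.partitions()) { ... }
// visits [0,8), then the "empty" partition [8,16) covered only by the split
// tail of [0,16), and finally [16,20).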
612 
613 static Value *foldSelectInst(SelectInst &SI) {
614  // If the condition being selected on is a constant or the same value is
615  // being selected between, fold the select. Yes this does (rarely) happen
616  // early on.
617  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
618  return SI.getOperand(1 + CI->isZero());
619  if (SI.getOperand(1) == SI.getOperand(2))
620  return SI.getOperand(1);
621 
622  return nullptr;
623 }
624 
625 /// A helper that folds a PHI node or a select.
626 static Value *foldPHINodeOrSelectInst(Instruction &I) {
627  if (PHINode *PN = dyn_cast<PHINode>(&I)) {
628  // If PN merges together the same value, return that value.
629  return PN->hasConstantValue();
630  }
631  return foldSelectInst(cast<SelectInst>(I));
632 }
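// For example (hypothetical IR): "phi i32* [ %p, %bb1 ], [ %p, %bb2 ]" folds
// to %p via hasConstantValue(), and "select i1 true, i32* %a, i32* %b" folds
// to %a; a select with a non-constant condition and distinct operands yields
// nullptr.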
633 
634 /// Builder for the alloca slices.
635 ///
636 /// This class builds a set of alloca slices by recursively visiting the uses
637 /// of an alloca and making a slice for each load and store at each offset.
638 class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
639  friend class PtrUseVisitor<SliceBuilder>;
640  friend class InstVisitor<SliceBuilder>;
641 
642  using Base = PtrUseVisitor<SliceBuilder>;
643 
644  const uint64_t AllocSize;
645  AllocaSlices &AS;
646 
647  SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
648  SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
649 
650  /// Set to de-duplicate dead instructions found in the use walk.
651  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
652 
653 public:
654  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
655  : PtrUseVisitor<SliceBuilder>(DL),
656  AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()),
657  AS(AS) {}
658 
659 private:
660  void markAsDead(Instruction &I) {
661  if (VisitedDeadInsts.insert(&I).second)
662  AS.DeadUsers.push_back(&I);
663  }
664 
665  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
666  bool IsSplittable = false) {
667  // Completely skip uses which have a zero size or start either before or
668  // past the end of the allocation.
669  if (Size == 0 || Offset.uge(AllocSize)) {
670  LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
671  << Offset
672  << " which has zero size or starts outside of the "
673  << AllocSize << " byte alloca:\n"
674  << " alloca: " << AS.AI << "\n"
675  << " use: " << I << "\n");
676  return markAsDead(I);
677  }
678 
679  uint64_t BeginOffset = Offset.getZExtValue();
680  uint64_t EndOffset = BeginOffset + Size;
681 
682  // Clamp the end offset to the end of the allocation. Note that this is
683  // formulated to handle even the case where "BeginOffset + Size" overflows.
684  // This may appear superficially to be something we could ignore entirely,
685  // but that is not so! There may be widened loads or PHI-node uses where
686  // some instructions are dead but not others. We can't completely ignore
687  // them, and so have to record at least the information here.
688  assert(AllocSize >= BeginOffset); // Established above.
689  if (Size > AllocSize - BeginOffset) {
690  LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
691  << Offset << " to remain within the " << AllocSize
692  << " byte alloca:\n"
693  << " alloca: " << AS.AI << "\n"
694  << " use: " << I << "\n");
695  EndOffset = AllocSize;
696  }
697 
698  AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
699  }
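  // As a worked example (hypothetical numbers): for a 64-byte alloca, a
  // 16-byte use at offset 56 is clamped to the slice [56,64), while a use at
  // offset 64 or beyond (or of zero size) is dropped entirely via
  // markAsDead().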
700 
701  void visitBitCastInst(BitCastInst &BC) {
702  if (BC.use_empty())
703  return markAsDead(BC);
704 
705  return Base::visitBitCastInst(BC);
706  }
707 
708  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
709  if (ASC.use_empty())
710  return markAsDead(ASC);
711 
712  return Base::visitAddrSpaceCastInst(ASC);
713  }
714 
715  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
716  if (GEPI.use_empty())
717  return markAsDead(GEPI);
718 
719  if (SROAStrictInbounds && GEPI.isInBounds()) {
720  // FIXME: This is a manually un-factored variant of the basic code inside
721  // of GEPs with checking of the inbounds invariant specified in the
722  // langref in a very strict sense. If we ever want to enable
723  // SROAStrictInbounds, this code should be factored cleanly into
724  // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
725  // by writing out the code here where we have the underlying allocation
726  // size readily available.
727  APInt GEPOffset = Offset;
728  const DataLayout &DL = GEPI.getModule()->getDataLayout();
729  for (gep_type_iterator GTI = gep_type_begin(GEPI),
730  GTE = gep_type_end(GEPI);
731  GTI != GTE; ++GTI) {
732  ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
733  if (!OpC)
734  break;
735 
736  // Handle a struct index, which adds its field offset to the pointer.
737  if (StructType *STy = GTI.getStructTypeOrNull()) {
738  unsigned ElementIdx = OpC->getZExtValue();
739  const StructLayout *SL = DL.getStructLayout(STy);
740  GEPOffset +=
741  APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
742  } else {
743  // For array or vector indices, scale the index by the size of the
744  // type.
745  APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
746  GEPOffset +=
747  Index *
748  APInt(Offset.getBitWidth(),
749  DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
750  }
751 
752  // If this index has computed an intermediate pointer which is not
753  // inbounds, then the result of the GEP is a poison value and we can
754  // delete it and all uses.
755  if (GEPOffset.ugt(AllocSize))
756  return markAsDead(GEPI);
757  }
758  }
759 
760  return Base::visitGetElementPtrInst(GEPI);
761  }
762 
763  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
764  uint64_t Size, bool IsVolatile) {
765  // We allow splitting of non-volatile loads and stores where the type is an
766  // integer type. These may be used to implement 'memcpy' or other "transfer
767  // of bits" patterns.
768  bool IsSplittable =
769  Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
770 
771  insertUse(I, Offset, Size, IsSplittable);
772  }
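  // For instance (hypothetical accesses): a plain "load i64" from the alloca
  // is recorded as splittable, whereas a volatile load, a "load float", or a
  // load of an integer type whose size differs from its store size (e.g. i20)
  // is recorded as unsplittable.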
773 
774  void visitLoadInst(LoadInst &LI) {
775  assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
776  "All simple FCA loads should have been pre-split");
777 
778  if (!IsOffsetKnown)
779  return PI.setAborted(&LI);
780 
781  if (LI.isVolatile() &&
782  LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
783  return PI.setAborted(&LI);
784 
785  if (isa<ScalableVectorType>(LI.getType()))
786  return PI.setAborted(&LI);
787 
788  uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize();
789  return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
790  }
791 
792  void visitStoreInst(StoreInst &SI) {
793  Value *ValOp = SI.getValueOperand();
794  if (ValOp == *U)
795  return PI.setEscapedAndAborted(&SI);
796  if (!IsOffsetKnown)
797  return PI.setAborted(&SI);
798 
799  if (SI.isVolatile() &&
800  SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
801  return PI.setAborted(&SI);
802 
803  if (isa<ScalableVectorType>(ValOp->getType()))
804  return PI.setAborted(&SI);
805 
806  uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize();
807 
808  // If this memory access can be shown to *statically* extend outside the
809  // bounds of the allocation, its behavior is undefined, so simply
810  // ignore it. Note that this is more strict than the generic clamping
811  // behavior of insertUse. We also try to handle cases which might run the
812  // risk of overflow.
813  // FIXME: We should instead consider the pointer to have escaped if this
814  // function is being instrumented for addressing bugs or race conditions.
815  if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
816  LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
817  << Offset << " which extends past the end of the "
818  << AllocSize << " byte alloca:\n"
819  << " alloca: " << AS.AI << "\n"
820  << " use: " << SI << "\n");
821  return markAsDead(SI);
822  }
823 
824  assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
825  "All simple FCA stores should have been pre-split");
826  handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
827  }
828 
829  void visitMemSetInst(MemSetInst &II) {
830  assert(II.getRawDest() == *U && "Pointer use is not the destination?");
831  ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
832  if ((Length && Length->getValue() == 0) ||
833  (IsOffsetKnown && Offset.uge(AllocSize)))
834  // Zero-length mem transfer intrinsics can be ignored entirely.
835  return markAsDead(II);
836 
837  if (!IsOffsetKnown)
838  return PI.setAborted(&II);
839 
840  // Don't replace this with a store with a different address space. TODO:
841  // Use a store with the casted new alloca?
842  if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace())
843  return PI.setAborted(&II);
844 
845  insertUse(II, Offset, Length ? Length->getLimitedValue()
846  : AllocSize - Offset.getLimitedValue(),
847  (bool)Length);
848  }
849 
850  void visitMemTransferInst(MemTransferInst &II) {
851  ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
852  if (Length && Length->getValue() == 0)
853  // Zero-length mem transfer intrinsics can be ignored entirely.
854  return markAsDead(II);
855 
856  // Because we can visit these intrinsics twice, also check whether the
857  // first visit marked this instruction as dead. If so, skip it.
858  if (VisitedDeadInsts.count(&II))
859  return;
860 
861  if (!IsOffsetKnown)
862  return PI.setAborted(&II);
863 
864  // Don't replace this with a load/store with a different address space.
865  // TODO: Use a store with the casted new alloca?
866  if (II.isVolatile() &&
867  (II.getDestAddressSpace() != DL.getAllocaAddrSpace() ||
868  II.getSourceAddressSpace() != DL.getAllocaAddrSpace()))
869  return PI.setAborted(&II);
870 
871  // This side of the transfer is completely out-of-bounds, and so we can
872  // nuke the entire transfer. However, we also need to nuke the other side
873  // if already added to our partitions.
874  // FIXME: Yet another place we really should bypass this when
875  // instrumenting for ASan.
876  if (Offset.uge(AllocSize)) {
877  SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
878  MemTransferSliceMap.find(&II);
879  if (MTPI != MemTransferSliceMap.end())
880  AS.Slices[MTPI->second].kill();
881  return markAsDead(II);
882  }
883 
884  uint64_t RawOffset = Offset.getLimitedValue();
885  uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
886 
887  // Check for the special case where the same exact value is used for both
888  // source and dest.
889  if (*U == II.getRawDest() && *U == II.getRawSource()) {
890  // For non-volatile transfers this is a no-op.
891  if (!II.isVolatile())
892  return markAsDead(II);
893 
894  return insertUse(II, Offset, Size, /*IsSplittable=*/false);
895  }
896 
897  // If we have seen both source and destination for a mem transfer, then
898  // they both point to the same alloca.
899  bool Inserted;
900  SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
901  std::tie(MTPI, Inserted) =
902  MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
903  unsigned PrevIdx = MTPI->second;
904  if (!Inserted) {
905  Slice &PrevP = AS.Slices[PrevIdx];
906 
907  // Check if the begin offsets match and this is a non-volatile transfer.
908  // In that case, we can completely elide the transfer.
909  if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
910  PrevP.kill();
911  return markAsDead(II);
912  }
913 
914  // Otherwise we have an offset transfer within the same alloca. We can't
915  // split those.
916  PrevP.makeUnsplittable();
917  }
918 
919  // Insert the use now that we've fixed up the splittable nature.
920  insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
921 
922  // Check that we ended up with a valid index in the map.
923  assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
924  "Map index doesn't point back to a slice with this user.");
925  }
926 
927  // Disable SRoA for any intrinsics except for lifetime invariants and
928  // invariant group.
929  // FIXME: What about debug intrinsics? This matches old behavior, but
930  // doesn't make sense.
931  void visitIntrinsicInst(IntrinsicInst &II) {
932  if (II.isDroppable()) {
933  AS.DeadUseIfPromotable.push_back(U);
934  return;
935  }
936 
937  if (!IsOffsetKnown)
938  return PI.setAborted(&II);
939 
940  if (II.isLifetimeStartOrEnd()) {
941  ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
942  uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
943  Length->getLimitedValue());
944  insertUse(II, Offset, Size, true);
945  return;
946  }
947 
948  if (II.isLaunderOrStripInvariantGroup()) {
949  enqueueUsers(II);
950  return;
951  }
952 
953  Base::visitIntrinsicInst(II);
954  }
955 
956  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
957  // We consider any PHI or select that results in a direct load or store of
958  // the same offset to be a viable use for slicing purposes. These uses
959  // are considered unsplittable and the size is the maximum loaded or stored
960  // size.
961  SmallPtrSet<Instruction *, 4> Visited;
962  SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
963  Visited.insert(Root);
964  Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
965  const DataLayout &DL = Root->getModule()->getDataLayout();
966  // If there are no loads or stores, the access is dead. We mark that as
967  // a size zero access.
968  Size = 0;
969  do {
970  Instruction *I, *UsedI;
971  std::tie(UsedI, I) = Uses.pop_back_val();
972 
973  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
974  Size = std::max(Size,
975  DL.getTypeStoreSize(LI->getType()).getFixedSize());
976  continue;
977  }
978  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
979  Value *Op = SI->getOperand(0);
980  if (Op == UsedI)
981  return SI;
982  Size = std::max(Size,
983  DL.getTypeStoreSize(Op->getType()).getFixedSize());
984  continue;
985  }
986 
987  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
988  if (!GEP->hasAllZeroIndices())
989  return GEP;
990  } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
991  !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
992  return I;
993  }
994 
995  for (User *U : I->users())
996  if (Visited.insert(cast<Instruction>(U)).second)
997  Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
998  } while (!Uses.empty());
999 
1000  return nullptr;
1001  }
1002 
1003  void visitPHINodeOrSelectInst(Instruction &I) {
1004  assert(isa<PHINode>(I) || isa<SelectInst>(I));
1005  if (I.use_empty())
1006  return markAsDead(I);
1007 
1008  // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1009  // instructions in this BB, which may be required during rewriting. Bail out
1010  // on these cases.
1011  if (isa<PHINode>(I) &&
1012  I.getParent()->getFirstInsertionPt() == I.getParent()->end())
1013  return PI.setAborted(&I);
1014 
1015  // TODO: We could use simplifyInstruction here to fold PHINodes and
1016  // SelectInsts. However, doing so requires to change the current
1017  // dead-operand-tracking mechanism. For instance, suppose neither loading
1018  // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1019  // trap either. However, if we simply replace %U with undef using the
1020  // current dead-operand-tracking mechanism, "load (select undef, undef,
1021  // %other)" may trap because the select may return the first operand
1022  // "undef".
1023  if (Value *Result = foldPHINodeOrSelectInst(I)) {
1024  if (Result == *U)
1025  // If the result of the constant fold will be the pointer, recurse
1026  // through the PHI/select as if we had RAUW'ed it.
1027  enqueueUsers(I);
1028  else
1029  // Otherwise the operand to the PHI/select is dead, and we can replace
1030  // it with poison.
1031  AS.DeadOperands.push_back(U);
1032 
1033  return;
1034  }
1035 
1036  if (!IsOffsetKnown)
1037  return PI.setAborted(&I);
1038 
1039  // See if we already have computed info on this node.
1040  uint64_t &Size = PHIOrSelectSizes[&I];
1041  if (!Size) {
1042  // This is a new PHI/Select, check for an unsafe use of it.
1043  if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1044  return PI.setAborted(UnsafeI);
1045  }
1046 
1047  // For PHI and select operands outside the alloca, we can't nuke the entire
1048  // phi or select -- the other side might still be relevant, so we special
1049  // case them here and use a separate structure to track the operands
1050  // themselves which should be replaced with poison.
1051  // FIXME: This should instead be escaped in the event we're instrumenting
1052  // for address sanitization.
1053  if (Offset.uge(AllocSize)) {
1054  AS.DeadOperands.push_back(U);
1055  return;
1056  }
1057 
1058  insertUse(I, Offset, Size);
1059  }
1060 
1061  void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1062 
1063  void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1064 
1065  /// Disable SROA entirely if there are unhandled users of the alloca.
1066  void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1067 };
1068 
1069 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1070  :
1071 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1072  AI(AI),
1073 #endif
1074  PointerEscapingInstr(nullptr) {
1075  SliceBuilder PB(DL, AI, *this);
1076  SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1077  if (PtrI.isEscaped() || PtrI.isAborted()) {
1078  // FIXME: We should sink the escape vs. abort info into the caller nicely,
1079  // possibly by just storing the PtrInfo in the AllocaSlices.
1080  PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1081  : PtrI.getAbortingInst();
1082  assert(PointerEscapingInstr && "Did not track a bad instruction");
1083  return;
1084  }
1085 
1086  llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1087 
1088  // Sort the uses. This arranges for the offsets to be in ascending order,
1089  // and the sizes to be in descending order.
1090  llvm::stable_sort(Slices);
1091 }
1092 
1093 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1094 
1095 void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1096  StringRef Indent) const {
1097  printSlice(OS, I, Indent);
1098  OS << "\n";
1099  printUse(OS, I, Indent);
1100 }
1101 
1102 void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1103  StringRef Indent) const {
1104  OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1105  << " slice #" << (I - begin())
1106  << (I->isSplittable() ? " (splittable)" : "");
1107 }
1108 
1109 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1110  StringRef Indent) const {
1111  OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1112 }
1113 
1114 void AllocaSlices::print(raw_ostream &OS) const {
1115  if (PointerEscapingInstr) {
1116  OS << "Can't analyze slices for alloca: " << AI << "\n"
1117  << " A pointer to this alloca escaped by:\n"
1118  << " " << *PointerEscapingInstr << "\n";
1119  return;
1120  }
1121 
1122  OS << "Slices of alloca: " << AI << "\n";
1123  for (const_iterator I = begin(), E = end(); I != E; ++I)
1124  print(OS, I);
1125 }
1126 
1127 LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1128  print(dbgs(), I);
1129 }
1130 LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1131 
1132 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1133 
1134 /// Walk the range of a partitioning looking for a common type to cover this
1135 /// sequence of slices.
1136 static std::pair<Type *, IntegerType *>
1137 findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1138  uint64_t EndOffset) {
1139  Type *Ty = nullptr;
1140  bool TyIsCommon = true;
1141  IntegerType *ITy = nullptr;
1142 
1143  // Note that we need to look at *every* alloca slice's Use to ensure we
1144  // always get consistent results regardless of the order of slices.
1145  for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1146  Use *U = I->getUse();
1147  if (isa<IntrinsicInst>(*U->getUser()))
1148  continue;
1149  if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1150  continue;
1151 
1152  Type *UserTy = nullptr;
1153  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1154  UserTy = LI->getType();
1155  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1156  UserTy = SI->getValueOperand()->getType();
1157  }
1158 
1159  if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1160  // If the type is larger than the partition, skip it. We only encounter
1161  // this for split integer operations where we want to use the type of the
1162  // entity causing the split. Also skip if the type is not a byte width
1163  // multiple.
1164  if (UserITy->getBitWidth() % 8 != 0 ||
1165  UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1166  continue;
1167 
1168  // Track the largest bitwidth integer type used in this way in case there
1169  // is no common type.
1170  if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1171  ITy = UserITy;
1172  }
1173 
1174  // To avoid depending on the order of slices, Ty and TyIsCommon must not
1175  // depend on types skipped above.
1176  if (!UserTy || (Ty && Ty != UserTy))
1177  TyIsCommon = false; // Give up on anything but an iN type.
1178  else
1179  Ty = UserTy;
1180  }
1181 
1182  return {TyIsCommon ? Ty : nullptr, ITy};
1183 }
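// As an illustration (hypothetical slices spanning the same partition): two
// "load i32" users yield {i32, i32} -- a common type and a widest integer
// type. An i32 store mixed with a "load float" yields {nullptr, i32}: there is
// no common type, but i32 is still returned as the best integer candidate.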
1184 
1185 /// PHI instructions that use an alloca and are subsequently loaded can be
1186 /// rewritten to load both input pointers in the pred blocks and then PHI the
1187 /// results, allowing the load of the alloca to be promoted.
1188 /// From this:
1189 /// %P2 = phi [i32* %Alloca, i32* %Other]
1190 /// %V = load i32* %P2
1191 /// to:
1192 /// %V1 = load i32* %Alloca -> will be mem2reg'd
1193 /// ...
1194 /// %V2 = load i32* %Other
1195 /// ...
1196 /// %V = phi [i32 %V1, i32 %V2]
1197 ///
1198 /// We can do this to a select if its only uses are loads and if the operands
1199 /// to the select can be loaded unconditionally.
1200 ///
1201 /// FIXME: This should be hoisted into a generic utility, likely in
1202 /// Transforms/Util/Local.h
1203 static bool isSafePHIToSpeculate(PHINode &PN) {
1204  const DataLayout &DL = PN.getModule()->getDataLayout();
1205 
1206  // For now, we can only do this promotion if the load is in the same block
1207  // as the PHI, and if there are no stores between the phi and load.
1208  // TODO: Allow recursive phi users.
1209  // TODO: Allow stores.
1210  BasicBlock *BB = PN.getParent();
1211  Align MaxAlign;
1212  uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1213  Type *LoadType = nullptr;
1214  for (User *U : PN.users()) {
1215  LoadInst *LI = dyn_cast<LoadInst>(U);
1216  if (!LI || !LI->isSimple())
1217  return false;
1218 
1219  // For now we only allow loads in the same block as the PHI. This is
1220  // a common case that happens when instcombine merges two loads through
1221  // a PHI.
1222  if (LI->getParent() != BB)
1223  return false;
1224 
1225  if (LoadType) {
1226  if (LoadType != LI->getType())
1227  return false;
1228  } else {
1229  LoadType = LI->getType();
1230  }
1231 
1232  // Ensure that there are no instructions between the PHI and the load that
1233  // could store.
1234  for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1235  if (BBI->mayWriteToMemory())
1236  return false;
1237 
1238  MaxAlign = std::max(MaxAlign, LI->getAlign());
1239  }
1240 
1241  if (!LoadType)
1242  return false;
1243 
1244  APInt LoadSize = APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedSize());
1245 
1246  // We can only transform this if it is safe to push the loads into the
1247  // predecessor blocks. The only thing to watch out for is that we can't put
1248  // a possibly trapping load in the predecessor if it is a critical edge.
1249  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1250  Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1251  Value *InVal = PN.getIncomingValue(Idx);
1252 
1253  // If the value is produced by the terminator of the predecessor (an
1254  // invoke) or it has side-effects, there is no valid place to put a load
1255  // in the predecessor.
1256  if (TI == InVal || TI->mayHaveSideEffects())
1257  return false;
1258 
1259  // If the predecessor has a single successor, then the edge isn't
1260  // critical.
1261  if (TI->getNumSuccessors() == 1)
1262  continue;
1263 
1264  // If this pointer is always safe to load, or if we can prove that there
1265  // is already a load in the block, then we can move the load to the pred
1266  // block.
1267  if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1268  continue;
1269 
1270  return false;
1271  }
1272 
1273  return true;
1274 }
1275 
1276 static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1277  LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1278 
1279  LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1280  Type *LoadTy = SomeLoad->getType();
1281  IRB.SetInsertPoint(&PN);
1282  PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1283  PN.getName() + ".sroa.speculated");
1284 
1285  // Get the AA tags and alignment to use from one of the loads. It does not
1286  // matter which one we get and if any differ.
1287  AAMDNodes AATags = SomeLoad->getAAMetadata();
1288  Align Alignment = SomeLoad->getAlign();
1289 
1290  // Rewrite all loads of the PN to use the new PHI.
1291  while (!PN.use_empty()) {
1292  LoadInst *LI = cast<LoadInst>(PN.user_back());
1293  LI->replaceAllUsesWith(NewPN);
1294  LI->eraseFromParent();
1295  }
1296 
1297  // Inject loads into all of the pred blocks.
1298  DenseMap<BasicBlock*, Value*> InjectedLoads;
1299  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1300  BasicBlock *Pred = PN.getIncomingBlock(Idx);
1301  Value *InVal = PN.getIncomingValue(Idx);
1302 
1303  // A PHI node is allowed to have multiple (duplicated) entries for the same
1304  // basic block, as long as the value is the same. So if we already injected
1305  // a load in the predecessor, then we should reuse the same load for all
1306  // duplicated entries.
1307  if (Value* V = InjectedLoads.lookup(Pred)) {
1308  NewPN->addIncoming(V, Pred);
1309  continue;
1310  }
1311 
1312  Instruction *TI = Pred->getTerminator();
1313  IRB.SetInsertPoint(TI);
1314 
1315  LoadInst *Load = IRB.CreateAlignedLoad(
1316  LoadTy, InVal, Alignment,
1317  (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1318  ++NumLoadsSpeculated;
1319  if (AATags)
1320  Load->setAAMetadata(AATags);
1321  NewPN->addIncoming(Load, Pred);
1322  InjectedLoads[Pred] = Load;
1323  }
1324 
1325  LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1326  PN.eraseFromParent();
1327 }
1328 
1329 /// Select instructions that use an alloca and are subsequently loaded can be
1330 /// rewritten to load both input pointers and then select between the result,
1331 /// allowing the load of the alloca to be promoted.
1332 /// From this:
1333 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
1334 /// %V = load i32* %P2
1335 /// to:
1336 /// %V1 = load i32* %Alloca -> will be mem2reg'd
1337 /// %V2 = load i32* %Other
1338 /// %V = select i1 %cond, i32 %V1, i32 %V2
1339 ///
1340 /// We can do this to a select if its only uses are loads and if the operand
1341 /// to the select can be loaded unconditionally. If an intervening bitcast is
1342 /// found whose single use is the load, the promotion is still allowed.
1343 static bool isSafeSelectToSpeculate(SelectInst &SI) {
1344  Value *TValue = SI.getTrueValue();
1345  Value *FValue = SI.getFalseValue();
1346  const DataLayout &DL = SI.getModule()->getDataLayout();
1347 
1348  for (User *U : SI.users()) {
1349  LoadInst *LI;
1350  BitCastInst *BC = dyn_cast<BitCastInst>(U);
1351  if (BC && BC->hasOneUse())
1352  LI = dyn_cast<LoadInst>(*BC->user_begin());
1353  else
1354  LI = dyn_cast<LoadInst>(U);
1355 
1356  if (!LI || !LI->isSimple())
1357  return false;
1358 
1359  // Both operands to the select need to be dereferenceable, either
1360  // absolutely (e.g. allocas) or at this point because we can see other
1361  // accesses to it.
1362  if (!isSafeToLoadUnconditionally(TValue, LI->getType(),
1363  LI->getAlign(), DL, LI))
1364  return false;
1365  if (!isSafeToLoadUnconditionally(FValue, LI->getType(),
1366  LI->getAlign(), DL, LI))
1367  return false;
1368  }
1369 
1370  return true;
1371 }
1372 
1373 static void speculateSelectInstLoads(IRBuilderTy &IRB, SelectInst &SI) {
1374  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
1375 
1376  IRB.SetInsertPoint(&SI);
1377  Value *TV = SI.getTrueValue();
1378  Value *FV = SI.getFalseValue();
1379  // Replace the loads of the select with a select of two loads.
1380  while (!SI.use_empty()) {
1381  LoadInst *LI;
1382  BitCastInst *BC = dyn_cast<BitCastInst>(SI.user_back());
1383  if (BC) {
1384  assert(BC->hasOneUse() && "Bitcast should have a single use.");
1385  LI = cast<LoadInst>(BC->user_back());
1386  } else {
1387  LI = cast<LoadInst>(SI.user_back());
1388  }
1389 
1390  assert(LI->isSimple() && "We only speculate simple loads");
1391 
1392  IRB.SetInsertPoint(LI);
1393  Value *NewTV =
1394  BC ? IRB.CreateBitCast(TV, BC->getType(), TV->getName() + ".sroa.cast")
1395  : TV;
1396  Value *NewFV =
1397  BC ? IRB.CreateBitCast(FV, BC->getType(), FV->getName() + ".sroa.cast")
1398  : FV;
1399  LoadInst *TL = IRB.CreateLoad(LI->getType(), NewTV,
1400  LI->getName() + ".sroa.speculate.load.true");
1401  LoadInst *FL = IRB.CreateLoad(LI->getType(), NewFV,
1402  LI->getName() + ".sroa.speculate.load.false");
1403  NumLoadsSpeculated += 2;
1404 
1405  // Transfer alignment and AA info if present.
1406  TL->setAlignment(LI->getAlign());
1407  FL->setAlignment(LI->getAlign());
1408 
1409  AAMDNodes Tags = LI->getAAMetadata();
1410  if (Tags) {
1411  TL->setAAMetadata(Tags);
1412  FL->setAAMetadata(Tags);
1413  }
1414 
1415  Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1416  LI->getName() + ".sroa.speculated");
1417 
1418  LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1419  LI->replaceAllUsesWith(V);
1420  LI->eraseFromParent();
1421  if (BC)
1422  BC->eraseFromParent();
1423  }
1424  SI.eraseFromParent();
1425 }
1426 
1427 /// Build a GEP out of a base pointer and indices.
1428 ///
1429 /// This will return the BasePtr if that is valid, or build a new GEP
1430 /// instruction using the IRBuilder if GEP-ing is needed.
1431 static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
1432  SmallVectorImpl<Value *> &Indices,
1433  const Twine &NamePrefix) {
1434  if (Indices.empty())
1435  return BasePtr;
1436 
1437  // A single zero index is a no-op, so check for this and avoid building a GEP
1438  // in that case.
1439  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
1440  return BasePtr;
1441 
1442  // buildGEP() is only called for non-opaque pointers.
1443  return IRB.CreateInBoundsGEP(
1444  BasePtr->getType()->getNonOpaquePointerElementType(), BasePtr, Indices,
1445  NamePrefix + "sroa_idx");
1446 }
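// For example (hypothetical indices): with Indices == {i32 0} the BasePtr is
// returned unchanged, while Indices == {i64 0, i32 2} produces an inbounds GEP
// named "<NamePrefix>sroa_idx" over the pointee type of BasePtr.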
1447 
1448 /// Get a natural GEP off of the BasePtr walking through Ty toward
1449 /// TargetTy without changing the offset of the pointer.
1450 ///
1451 /// This routine assumes we've already established a properly offset GEP with
1452 /// Indices, and arrived at the Ty type. The goal is to continue to GEP with
1453 /// zero-indices down through type layers until we find one the same as
1454 /// TargetTy. If we can't find one with the same type, we at least try to use
1455 /// one with the same size. If none of that works, we just produce the GEP as
1456 /// indicated by Indices to have the correct offset.
1457 static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
1458  Value *BasePtr, Type *Ty, Type *TargetTy,
1459  SmallVectorImpl<Value *> &Indices,
1460  const Twine &NamePrefix) {
1461  if (Ty == TargetTy)
1462  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
1463 
1464  // Offset size to use for the indices.
1465  unsigned OffsetSize = DL.getIndexTypeSizeInBits(BasePtr->getType());
1466 
1467  // See if we can descend into a struct and locate a field with the correct
1468  // type.
1469  unsigned NumLayers = 0;
1470  Type *ElementTy = Ty;
1471  do {
1472  if (ElementTy->isPointerTy())
1473  break;
1474 
1475  if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
1476  ElementTy = ArrayTy->getElementType();
1477  Indices.push_back(IRB.getIntN(OffsetSize, 0));
1478  } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
1479  ElementTy = VectorTy->getElementType();
1480  Indices.push_back(IRB.getInt32(0));
1481  } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
1482  if (STy->element_begin() == STy->element_end())
1483  break; // Nothing left to descend into.
1484  ElementTy = *STy->element_begin();
1485  Indices.push_back(IRB.getInt32(0));
1486  } else {
1487  break;
1488  }
1489  ++NumLayers;
1490  } while (ElementTy != TargetTy);
1491  if (ElementTy != TargetTy)
1492  Indices.erase(Indices.end() - NumLayers, Indices.end());
1493 
1494  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
1495 }
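// A sketch under a hypothetical layout: starting from Ty == {[4 x i32], float}
// with TargetTy == i32, the loop descends struct -> array -> i32 and appends a
// zero index per layer to the caller-provided Indices; if TargetTy is never
// reached, those extra zero indices are erased again before building the GEP.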
1496 
1497 /// Get a natural GEP from a base pointer to a particular offset and
1498 /// resulting in a particular type.
1499 ///
1500 /// The goal is to produce a "natural" looking GEP that works with the existing
1501 /// composite types to arrive at the appropriate offset and element type for
1502 /// a pointer. TargetTy is the element type the returned GEP should point-to if
1503 /// possible. We recurse by decreasing Offset, adding the appropriate index to
1504 /// Indices, and setting Ty to the result subtype.
1505 ///
1506 /// If no natural GEP can be constructed, this function returns null.
1507 static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
1508  Value *Ptr, APInt Offset, Type *TargetTy,
1509  SmallVectorImpl<Value *> &Indices,
1510  const Twine &NamePrefix) {
1511  PointerType *Ty = cast<PointerType>(Ptr->getType());
1512 
1513  // Don't consider any GEPs through an i8* as natural unless the TargetTy is
1514  // an i8.
1515  if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
1516  return nullptr;
1517 
1518  Type *ElementTy = Ty->getNonOpaquePointerElementType();
1519  if (!ElementTy->isSized())
1520  return nullptr; // We can't GEP through an unsized element.
1521 
1522  SmallVector<APInt> IntIndices = DL.getGEPIndicesForOffset(ElementTy, Offset);
1523  if (Offset != 0)
1524  return nullptr;
1525 
1526  for (const APInt &Index : IntIndices)
1527  Indices.push_back(IRB.getInt(Index));
1528  return getNaturalGEPWithType(IRB, DL, Ptr, ElementTy, TargetTy, Indices,
1529  NamePrefix);
1530 }
1531 
1532 /// Compute an adjusted pointer from Ptr by Offset bytes where the
1533 /// resulting pointer has PointerTy.
1534 ///
1535 /// This tries very hard to compute a "natural" GEP which arrives at the offset
1536 /// and produces the pointer type desired. Where it cannot, it will try to use
1537 /// the natural GEP to arrive at the offset and bitcast to the type. Where that
1538 /// fails, it will try to use an existing i8* and GEP to the byte offset and
1539 /// bitcast to the type.
1540 ///
1541 /// The strategy for finding the more natural GEPs is to peel off layers of the
1542 /// pointer, walking back through bit casts and GEPs, searching for a base
1543 /// pointer from which we can compute a natural GEP with the desired
1544 /// properties. The algorithm tries to fold as many constant indices into
1545 /// a single GEP as possible, thus making each GEP more independent of the
1546 /// surrounding code.
1547 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1548  APInt Offset, Type *PointerTy,
1549  const Twine &NamePrefix) {
1550  // Create i8 GEP for opaque pointers.
1551  if (Ptr->getType()->isOpaquePointerTy()) {
1552  if (Offset != 0)
1553  Ptr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(Offset),
1554  NamePrefix + "sroa_idx");
1555  return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1556  NamePrefix + "sroa_cast");
1557  }
1558 
1559  // Even though we don't look through PHI nodes, we could be called on an
1560  // instruction in an unreachable block, which may be on a cycle.
1561  SmallPtrSet<Value *, 4> Visited;
1562  Visited.insert(Ptr);
1563  SmallVector<Value *, 4> Indices;
1564 
1565  // We may end up computing an offset pointer that has the wrong type. If we
1566  // never are able to compute one directly that has the correct type, we'll
1567  // fall back to it, so keep it and the base it was computed from around here.
1568  Value *OffsetPtr = nullptr;
1569  Value *OffsetBasePtr;
1570 
1571  // Remember any i8 pointer we come across to re-use if we need to do a raw
1572  // byte offset.
1573  Value *Int8Ptr = nullptr;
1574  APInt Int8PtrOffset(Offset.getBitWidth(), 0);
1575 
1576  PointerType *TargetPtrTy = cast<PointerType>(PointerTy);
1577  Type *TargetTy = TargetPtrTy->getNonOpaquePointerElementType();
1578 
1579  // As `addrspacecast` is not always a no-op cast, `Ptr` (the storage
1580  // pointer) may have a different address space from the expected `PointerTy`
1581  // (the pointer to be used). Adjust the pointer type based on the original storage pointer.
1582  auto AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
1583  PointerTy = TargetTy->getPointerTo(AS);
1584 
1585  do {
1586  // First fold any existing GEPs into the offset.
1587  while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
1588  APInt GEPOffset(Offset.getBitWidth(), 0);
1589  if (!GEP->accumulateConstantOffset(DL, GEPOffset))
1590  break;
1591  Offset += GEPOffset;
1592  Ptr = GEP->getPointerOperand();
1593  if (!Visited.insert(Ptr).second)
1594  break;
1595  }
1596 
1597  // See if we can perform a natural GEP here.
1598  Indices.clear();
1599  if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
1600  Indices, NamePrefix)) {
1601  // If we have a new natural pointer at the offset, clear out any old
1602  // offset pointer we computed. Unless it is the base pointer or
1603  // a non-instruction, we built a GEP we don't need. Zap it.
1604  if (OffsetPtr && OffsetPtr != OffsetBasePtr)
1605  if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
1606  assert(I->use_empty() && "Built a GEP with uses somehow!");
1607  I->eraseFromParent();
1608  }
1609  OffsetPtr = P;
1610  OffsetBasePtr = Ptr;
1611  // If we also found a pointer of the right type, we're done.
1612  if (P->getType() == PointerTy)
1613  break;
1614  }
1615 
1616  // Stash this pointer if we've found an i8*.
1617  if (Ptr->getType()->isIntegerTy(8)) {
1618  Int8Ptr = Ptr;
1619  Int8PtrOffset = Offset;
1620  }
1621 
1622  // Peel off a layer of the pointer and update the offset appropriately.
1623  if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
1624  Ptr = cast<Operator>(Ptr)->getOperand(0);
1625  } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
1626  if (GA->isInterposable())
1627  break;
1628  Ptr = GA->getAliasee();
1629  } else {
1630  break;
1631  }
1632  assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
1633  } while (Visited.insert(Ptr).second);
1634 
1635  if (!OffsetPtr) {
1636  if (!Int8Ptr) {
1637  Int8Ptr = IRB.CreateBitCast(
1638  Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
1639  NamePrefix + "sroa_raw_cast");
1640  Int8PtrOffset = Offset;
1641  }
1642 
1643  OffsetPtr = Int8PtrOffset == 0
1644  ? Int8Ptr
1645  : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
1646  IRB.getInt(Int8PtrOffset),
1647  NamePrefix + "sroa_raw_idx");
1648  }
1649  Ptr = OffsetPtr;
1650 
1651  // On the off chance we were targeting i8*, guard the bitcast here.
1652  if (cast<PointerType>(Ptr->getType()) != TargetPtrTy) {
1653  Ptr = IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr,
1654  TargetPtrTy,
1655  NamePrefix + "sroa_cast");
1656  }
1657 
1658  return Ptr;
1659 }
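// Illustrative walk-through (hypothetical IR, assuming matching address
// spaces): given
//   %a   = alloca { i32, i32 }
//   %b   = bitcast { i32, i32 }* %a to i8*
//   %gep = getelementptr i8, i8* %b, i64 4
// a request for offset 0 from %gep with PointerTy == i32* first folds the
// constant GEP (Offset becomes 4), peels the bitcast back to %a, and then the
// natural GEP
//   getelementptr inbounds { i32, i32 }, { i32, i32 }* %a, i64 0, i32 1
// reaches offset 4 with the desired i32 element type, so no raw i8-based
// pointer arithmetic is needed.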
1660 
1661 /// Compute the adjusted alignment for a load or store from an offset.
1662 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1663  return commonAlignment(getLoadStoreAlignment(I), Offset);
1664 }
1665 
1666 /// Test whether we can convert a value from the old to the new type.
1667 ///
1668 /// This predicate should be used to guard calls to convertValue in order to
1669 /// ensure that we only try to convert viable values. The strategy is that we
1670 /// will peel off single element struct and array wrappings to get to an
1671 /// underlying value, and convert that value.
1672 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
1673  if (OldTy == NewTy)
1674  return true;
1675 
1676  // For integer types, we can't handle any bit-width differences. This would
1677  // break both vector conversions with extension and introduce endianness
1678  // issues when used in conjunction with loads and stores.
1679  if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1680  assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1681  cast<IntegerType>(NewTy)->getBitWidth() &&
1682  "We can't have the same bitwidth for different int types");
1683  return false;
1684  }
1685 
1686  if (DL.getTypeSizeInBits(NewTy).getFixedSize() !=
1687  DL.getTypeSizeInBits(OldTy).getFixedSize())
1688  return false;
1689  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1690  return false;
1691 
1692  // We can convert pointers to integers and vice-versa. Same for vectors
1693  // of pointers and integers.
1694  OldTy = OldTy->getScalarType();
1695  NewTy = NewTy->getScalarType();
1696  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1697  if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1698  unsigned OldAS = OldTy->getPointerAddressSpace();
1699  unsigned NewAS = NewTy->getPointerAddressSpace();
1700  // Convert pointers if they are pointers from the same address space or
1701  // different integral (not non-integral) address spaces with the same
1702  // pointer size.
1703  return OldAS == NewAS ||
1704  (!DL.isNonIntegralAddressSpace(OldAS) &&
1705  !DL.isNonIntegralAddressSpace(NewAS) &&
1706  DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1707  }
1708 
1709  // We can convert integers to integral pointers, but not to non-integral
1710  // pointers.
1711  if (OldTy->isIntegerTy())
1712  return !DL.isNonIntegralPointerType(NewTy);
1713 
1714  // We can convert integral pointers to integers, but non-integral pointers
1715  // need to remain pointers.
1716  if (!DL.isNonIntegralPointerType(OldTy))
1717  return NewTy->isIntegerTy();
1718 
1719  return false;
1720  }
1721 
1722  return true;
1723 }
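// A few illustrative outcomes of this predicate (assuming a typical 64-bit
// DataLayout with integral, 64-bit pointers):
//   canConvertValue(DL, i32,       float) -> true   (same size, plain bitcast)
//   canConvertValue(DL, i64,       i8*)   -> true   (inttoptr is allowed)
//   canConvertValue(DL, <2 x i32>, i64)   -> true   (same total size)
//   canConvertValue(DL, i32,       i64)   -> false  (different integer widths)
//   canConvertValue(DL, i64, ptr in a non-integral address space) -> false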
1724 
1725 /// Generic routine to convert an SSA value to a value of a different
1726 /// type.
1727 ///
1728 /// This will try various different casting techniques, such as bitcasts,
1729 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
1730 /// two types for viability with this routine.
1731 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
1732  Type *NewTy) {
1733  Type *OldTy = V->getType();
1734  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
1735 
1736  if (OldTy == NewTy)
1737  return V;
1738 
1739  assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
1740  "Integer types must be the exact same to convert.");
1741 
1742  // See if we need inttoptr for this type pair. May require additional bitcast.
1743  if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
1744  // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
1745  // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
1746  // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
1747  // Directly handle i64 to i8*
1748  return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
1749  NewTy);
1750  }
1751 
1752  // See if we need ptrtoint for this type pair. May require additional bitcast.
1753  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
1754  // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
1755  // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
1756  // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
1757  // Expand i8* to i64 --> i8* to i64 to i64
1758  return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
1759  NewTy);
1760  }
1761 
1762  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
1763  unsigned OldAS = OldTy->getPointerAddressSpace();
1764  unsigned NewAS = NewTy->getPointerAddressSpace();
1765  // To convert pointers with different address spaces (they are already
1766  // checked convertible, i.e. they have the same pointer size), so far we
1767  // cannot use `bitcast` (which has restrict on the same address space) or
1768  // `addrspacecast` (which is not always no-op casting). Instead, use a pair
1769  // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
1770  // size.
1771  if (OldAS != NewAS) {
1772  assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1773  return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
1774  NewTy);
1775  }
1776  }
1777 
1778  return IRB.CreateBitCast(V, NewTy);
1779 }
1780 
1781 /// Test whether the given slice use can be promoted to a vector.
1782 ///
1783 /// This function is called to test each entry in a partition which is slated
1784 /// for a single slice.
1785 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
1786  VectorType *Ty,
1787  uint64_t ElementSize,
1788  const DataLayout &DL) {
1789  // First validate the slice offsets.
1790  uint64_t BeginOffset =
1791  std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
1792  uint64_t BeginIndex = BeginOffset / ElementSize;
1793  if (BeginIndex * ElementSize != BeginOffset ||
1794  BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
1795  return false;
1796  uint64_t EndOffset =
1797  std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
1798  uint64_t EndIndex = EndOffset / ElementSize;
1799  if (EndIndex * ElementSize != EndOffset ||
1800  EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
1801  return false;
1802 
1803  assert(EndIndex > BeginIndex && "Empty vector!");
1804  uint64_t NumElements = EndIndex - BeginIndex;
1805  Type *SliceTy = (NumElements == 1)
1806  ? Ty->getElementType()
1807  : FixedVectorType::get(Ty->getElementType(), NumElements);
1808 
1809  Type *SplitIntTy =
1810  Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
1811 
1812  Use *U = S.getUse();
1813 
1814  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
1815  if (MI->isVolatile())
1816  return false;
1817  if (!S.isSplittable())
1818  return false; // Skip any unsplittable intrinsics.
1819  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
1820  if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
1821  return false;
1822  } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1823  if (LI->isVolatile())
1824  return false;
1825  Type *LTy = LI->getType();
1826  // Disable vector promotion when there are loads or stores of an FCA.
1827  if (LTy->isStructTy())
1828  return false;
1829  if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
1830  assert(LTy->isIntegerTy());
1831  LTy = SplitIntTy;
1832  }
1833  if (!canConvertValue(DL, SliceTy, LTy))
1834  return false;
1835  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1836  if (SI->isVolatile())
1837  return false;
1838  Type *STy = SI->getValueOperand()->getType();
1839  // Disable vector promotion when there are loads or stores of an FCA.
1840  if (STy->isStructTy())
1841  return false;
1842  if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
1843  assert(STy->isIntegerTy());
1844  STy = SplitIntTy;
1845  }
1846  if (!canConvertValue(DL, STy, SliceTy))
1847  return false;
1848  } else {
1849  return false;
1850  }
1851 
1852  return true;
1853 }
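// Worked example of the index math above (hypothetical values): for
// Ty == <4 x i32> (ElementSize == 4) and a partition starting at alloca
// offset 16, a slice covering bytes [20, 28) has BeginOffset == 4 and
// EndOffset == 12, giving BeginIndex == 1 and EndIndex == 3, so the slice is
// checked against SliceTy == <2 x i32>. A slice covering bytes [18, 26) fails
// because 2 and 10 are not multiples of the element size.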
1854 
1855 /// Test whether a vector type is viable for promotion.
1856 ///
1857 /// This implements the necessary checking for \c isVectorPromotionViable over
1858 /// all slices of the alloca for the given VectorType.
1859 static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
1860  const DataLayout &DL) {
1861  uint64_t ElementSize =
1862  DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
1863 
1864  // While the definition of LLVM vectors is bitpacked, we don't support sizes
1865  // that aren't byte sized.
1866  if (ElementSize % 8)
1867  return false;
1868  assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
1869  "vector size not a multiple of element size?");
1870  ElementSize /= 8;
1871 
1872  for (const Slice &S : P)
1873  if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
1874  return false;
1875 
1876  for (const Slice *S : P.splitSliceTails())
1877  if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
1878  return false;
1879 
1880  return true;
1881 }
1882 
1883 /// Test whether the given alloca partitioning and range of slices can be
1884 /// promoted to a vector.
1885 ///
1886 /// This is a quick test to check whether we can rewrite a particular alloca
1887 /// partition (and its newly formed alloca) into a vector alloca with only
1888 /// whole-vector loads and stores such that it could be promoted to a vector
1889 /// SSA value. We only can ensure this for a limited set of operations, and we
1890 /// don't want to do the rewrites unless we are confident that the result will
1891 /// be promotable, so we have an early test here.
1892 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
1893  // Collect the candidate types for vector-based promotion. Also track whether
1894  // we have different element types.
1895  SmallVector<VectorType *, 4> CandidateTys;
1896  Type *CommonEltTy = nullptr;
1897  VectorType *CommonVecPtrTy = nullptr;
1898  bool HaveVecPtrTy = false;
1899  bool HaveCommonEltTy = true;
1900  bool HaveCommonVecPtrTy = true;
1901  auto CheckCandidateType = [&](Type *Ty) {
1902  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1903  // If this vector type's total size in bits differs from the first candidate's, bitcasting between them is impossible; drop all candidates.
1904  if (!CandidateTys.empty()) {
1905  VectorType *V = CandidateTys[0];
1906  if (DL.getTypeSizeInBits(VTy).getFixedSize() !=
1907  DL.getTypeSizeInBits(V).getFixedSize()) {
1908  CandidateTys.clear();
1909  return;
1910  }
1911  }
1912  CandidateTys.push_back(VTy);
1913  Type *EltTy = VTy->getElementType();
1914 
1915  if (!CommonEltTy)
1916  CommonEltTy = EltTy;
1917  else if (CommonEltTy != EltTy)
1918  HaveCommonEltTy = false;
1919 
1920  if (EltTy->isPointerTy()) {
1921  HaveVecPtrTy = true;
1922  if (!CommonVecPtrTy)
1923  CommonVecPtrTy = VTy;
1924  else if (CommonVecPtrTy != VTy)
1925  HaveCommonVecPtrTy = false;
1926  }
1927  }
1928  };
1929  // Consider any loads or stores whose slice exactly covers the partition.
1930  for (const Slice &S : P)
1931  if (S.beginOffset() == P.beginOffset() &&
1932  S.endOffset() == P.endOffset()) {
1933  if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
1934  CheckCandidateType(LI->getType());
1935  else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
1936  CheckCandidateType(SI->getValueOperand()->getType());
1937  }
1938 
1939  // If we didn't find a vector type, nothing to do here.
1940  if (CandidateTys.empty())
1941  return nullptr;
1942 
1943  // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
1944  // then we should choose it, not some other alternative.
1945  // But, we can't perform a no-op pointer address space change via bitcast,
1946  // so if we didn't have a common pointer element type, bail.
1947  if (HaveVecPtrTy && !HaveCommonVecPtrTy)
1948  return nullptr;
1949 
1950  // Try to pick the "best" element type out of the choices.
1951  if (!HaveCommonEltTy && HaveVecPtrTy) {
1952  // If there was a pointer element type, there's really only one choice.
1953  CandidateTys.clear();
1954  CandidateTys.push_back(CommonVecPtrTy);
1955  } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
1956  // Integer-ify vector types.
1957  for (VectorType *&VTy : CandidateTys) {
1958  if (!VTy->getElementType()->isIntegerTy())
1959  VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
1960  VTy->getContext(), VTy->getScalarSizeInBits())));
1961  }
1962 
1963  // Rank the remaining candidate vector types. This is easy because we know
1964  // they're all integer vectors. We sort by ascending number of elements.
1965  auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
1966  (void)DL;
1967  assert(DL.getTypeSizeInBits(RHSTy).getFixedSize() ==
1968  DL.getTypeSizeInBits(LHSTy).getFixedSize() &&
1969  "Cannot have vector types of different sizes!");
1970  assert(RHSTy->getElementType()->isIntegerTy() &&
1971  "All non-integer types eliminated!");
1972  assert(LHSTy->getElementType()->isIntegerTy() &&
1973  "All non-integer types eliminated!");
1974  return cast<FixedVectorType>(RHSTy)->getNumElements() <
1975  cast<FixedVectorType>(LHSTy)->getNumElements();
1976  };
1977  llvm::sort(CandidateTys, RankVectorTypes);
1978  CandidateTys.erase(
1979  std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
1980  CandidateTys.end());
1981  } else {
1982 // The only way to have the same element type in every vector type is to
1983 // have the same vector type. Check that and remove all but one.
1984 #ifndef NDEBUG
1985  for (VectorType *VTy : CandidateTys) {
1986  assert(VTy->getElementType() == CommonEltTy &&
1987  "Unaccounted for element type!");
1988  assert(VTy == CandidateTys[0] &&
1989  "Different vector types with the same element type!");
1990  }
1991 #endif
1992  CandidateTys.resize(1);
1993  }
1994 
1995  // FIXME: hack. Do we have a named constant for this?
1996  // SDAG SDNode can't have more than 65535 operands.
1997  llvm::erase_if(CandidateTys, [](VectorType *VTy) {
1998  return cast<FixedVectorType>(VTy)->getNumElements() >
1999  std::numeric_limits<unsigned short>::max();
2000  });
2001 
2002  for (VectorType *VTy : CandidateTys)
2003  if (checkVectorTypeForPromotion(P, VTy, DL))
2004  return VTy;
2005 
2006  return nullptr;
2007 }
2008 
2009 /// Test whether a slice of an alloca is valid for integer widening.
2010 ///
2011 /// This implements the necessary checking for the \c isIntegerWideningViable
2012 /// test below on a single slice of the alloca.
2013 static bool isIntegerWideningViableForSlice(const Slice &S,
2014  uint64_t AllocBeginOffset,
2015  Type *AllocaTy,
2016  const DataLayout &DL,
2017  bool &WholeAllocaOp) {
2018  uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
2019 
2020  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2021  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2022 
2023  Use *U = S.getUse();
2024 
2025  // Lifetime intrinsics operate over the whole alloca, whose size is usually
2026  // larger than other load/store slices (RelEnd > Size). But lifetime
2027  // intrinsics are always promotable and should not impact the promotability
2028  // of the other slices in the partition.
2029  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2030  if (II->isLifetimeStartOrEnd() || II->isDroppable())
2031  return true;
2032  }
2033 
2034  // We can't reasonably handle cases where the load or store extends past
2035  // the end of the alloca's type and into its padding.
2036  if (RelEnd > Size)
2037  return false;
2038 
2039  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2040  if (LI->isVolatile())
2041  return false;
2042  // We can't handle loads that extend past the allocated memory.
2043  if (DL.getTypeStoreSize(LI->getType()).getFixedSize() > Size)
2044  return false;
2045  // So far, AllocaSliceRewriter does not support widening split slice tails
2046  // in rewriteIntegerLoad.
2047  if (S.beginOffset() < AllocBeginOffset)
2048  return false;
2049  // Note that we don't count vector loads or stores as whole-alloca
2050  // operations which enable integer widening because we would prefer to use
2051  // vector widening instead.
2052  if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2053  WholeAllocaOp = true;
2054  if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2055  if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
2056  return false;
2057  } else if (RelBegin != 0 || RelEnd != Size ||
2058  !canConvertValue(DL, AllocaTy, LI->getType())) {
2059  // Non-integer loads need to be convertible from the alloca type so that
2060  // they are promotable.
2061  return false;
2062  }
2063  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2064  Type *ValueTy = SI->getValueOperand()->getType();
2065  if (SI->isVolatile())
2066  return false;
2067  // We can't handle stores that extend past the allocated memory.
2068  if (DL.getTypeStoreSize(ValueTy).getFixedSize() > Size)
2069  return false;
2070  // So far, AllocaSliceRewriter does not support widening split slice tails
2071  // in rewriteIntegerStore.
2072  if (S.beginOffset() < AllocBeginOffset)
2073  return false;
2074  // Note that we don't count vector loads or stores as whole-alloca
2075  // operations which enable integer widening because we would prefer to use
2076  // vector widening instead.
2077  if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2078  WholeAllocaOp = true;
2079  if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2080  if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
2081  return false;
2082  } else if (RelBegin != 0 || RelEnd != Size ||
2083  !canConvertValue(DL, ValueTy, AllocaTy)) {
2084  // Non-integer stores need to be convertible to the alloca type so that
2085  // they are promotable.
2086  return false;
2087  }
2088  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2089  if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2090  return false;
2091  if (!S.isSplittable())
2092  return false; // Skip any unsplittable intrinsics.
2093  } else {
2094  return false;
2095  }
2096 
2097  return true;
2098 }
2099 
2100 /// Test whether the given alloca partition's integer operations can be
2101 /// widened to promotable ones.
2102 ///
2103 /// This is a quick test to check whether we can rewrite the integer loads and
2104 /// stores to a particular alloca into wider loads and stores and be able to
2105 /// promote the resulting alloca.
2106 static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2107  const DataLayout &DL) {
2108  uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
2109  // Don't create integer types larger than the maximum bitwidth.
2110  if (SizeInBits > IntegerType::MAX_INT_BITS)
2111  return false;
2112 
2113  // Don't try to handle allocas with bit-padding.
2114  if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedSize())
2115  return false;
2116 
2117  // We need to ensure that an integer type with the appropriate bitwidth can
2118  // be converted to the alloca type, whatever that is. We don't want to force
2119  // the alloca itself to have an integer type if there is a more suitable one.
2120  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2121  if (!canConvertValue(DL, AllocaTy, IntTy) ||
2122  !canConvertValue(DL, IntTy, AllocaTy))
2123  return false;
2124 
2125  // While examining uses, we ensure that the alloca has a covering load or
2126  // store. We don't want to widen the integer operations only to fail to
2127  // promote due to some other unsplittable entry (which we may make splittable
2128  // later). However, if there are only splittable uses, go ahead and assume
2129  // that we cover the alloca.
2130  // FIXME: We shouldn't consider split slices that happen to start in the
2131  // partition here...
2132  bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2133 
2134  for (const Slice &S : P)
2135  if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2136  WholeAllocaOp))
2137  return false;
2138 
2139  for (const Slice *S : P.splitSliceTails())
2140  if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2141  WholeAllocaOp))
2142  return false;
2143 
2144  return WholeAllocaOp;
2145 }
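// Illustrative example (hypothetical partition): for an alloca of type i64
// (Size == 8), a covering i64 load sets WholeAllocaOp, and i32 stores at
// relative offsets 0 and 4 each pass the per-slice test (their bit width
// matches their store size and they stay within the 8 allocated bytes), so
// the partition is rewritten with widened i64 loads and stores. A partition
// with no slices of its own (only split slice tails) is accepted only when
// i64 is a legal integer type for the target.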
2146 
2147 static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2148  IntegerType *Ty, uint64_t Offset,
2149  const Twine &Name) {
2150  LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2151  IntegerType *IntTy = cast<IntegerType>(V->getType());
2152  assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
2153  DL.getTypeStoreSize(IntTy).getFixedSize() &&
2154  "Element extends past full value");
2155  uint64_t ShAmt = 8 * Offset;
2156  if (DL.isBigEndian())
2157  ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
2158  DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
2159  if (ShAmt) {
2160  V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2161  LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2162  }
2163  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2164  "Cannot extract to a larger integer!");
2165  if (Ty != IntTy) {
2166  V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2167  LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2168  }
2169  return V;
2170 }
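// Worked example (little-endian, hypothetical values): extracting an i16 at
// byte Offset 2 from an i64 uses ShAmt == 16, i.e.
//   %s = lshr i64 %v, 16
//   %t = trunc i64 %s to i16
// On a big-endian target the shift amount becomes 8 * (8 - 2 - 2) == 32, so
// the same bytes are selected from the other end of the value.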
2171 
2172 static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2173  Value *V, uint64_t Offset, const Twine &Name) {
2174  IntegerType *IntTy = cast<IntegerType>(Old->getType());
2175  IntegerType *Ty = cast<IntegerType>(V->getType());
2176  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2177  "Cannot insert a larger integer!");
2178  LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2179  if (Ty != IntTy) {
2180  V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2181  LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2182  }
2183  assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
2184  DL.getTypeStoreSize(IntTy).getFixedSize() &&
2185  "Element store outside of alloca store");
2186  uint64_t ShAmt = 8 * Offset;
2187  if (DL.isBigEndian())
2188  ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
2189  DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
2190  if (ShAmt) {
2191  V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2192  LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2193  }
2194 
2195  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2196  APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2197  Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2198  LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2199  V = IRB.CreateOr(Old, V, Name + ".insert");
2200  LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2201  }
2202  return V;
2203 }
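// Worked example (little-endian, hypothetical values): inserting an i8 %v at
// byte Offset 1 into an i32 %old zero-extends %v, shifts it into place, masks
// the old bits, and ors the pieces together:
//   %e = zext i8 %v to i32
//   %s = shl i32 %e, 8
//   %m = and i32 %old, -65281        ; 0xFFFF00FF
//   %r = or  i32 %m, %s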
2204 
2205 static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2206  unsigned EndIndex, const Twine &Name) {
2207  auto *VecTy = cast<FixedVectorType>(V->getType());
2208  unsigned NumElements = EndIndex - BeginIndex;
2209  assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2210 
2211  if (NumElements == VecTy->getNumElements())
2212  return V;
2213 
2214  if (NumElements == 1) {
2215  V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2216  Name + ".extract");
2217  LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2218  return V;
2219  }
2220 
2221  auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2222  V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2223  LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2224  return V;
2225 }
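// Illustrative example: extracting elements [1, 3) from a <4 x i32> value
// becomes a shufflevector with mask <1, 2>, producing a <2 x i32>. Extracting
// a single element uses extractelement instead, and extracting all elements
// returns the original vector unchanged.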
2226 
2227 static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2228  unsigned BeginIndex, const Twine &Name) {
2229  VectorType *VecTy = cast<VectorType>(Old->getType());
2230  assert(VecTy && "Can only insert a vector into a vector");
2231 
2232  VectorType *Ty = dyn_cast<VectorType>(V->getType());
2233  if (!Ty) {
2234  // Single element to insert.
2235  V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2236  Name + ".insert");
2237  LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2238  return V;
2239  }
2240 
2241  assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2242  cast<FixedVectorType>(VecTy)->getNumElements() &&
2243  "Too many elements!");
2244  if (cast<FixedVectorType>(Ty)->getNumElements() ==
2245  cast<FixedVectorType>(VecTy)->getNumElements()) {
2246  assert(V->getType() == VecTy && "Vector type mismatch");
2247  return V;
2248  }
2249  unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2250 
2251  // When inserting a smaller vector into the larger to store, we first
2252  // use a shuffle vector to widen it with undef elements, and then
2253  // a second shuffle vector to select between the loaded vector and the
2254  // incoming vector.
2255  SmallVector<int, 8> Mask;
2256  Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2257  for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2258  if (i >= BeginIndex && i < EndIndex)
2259  Mask.push_back(i - BeginIndex);
2260  else
2261  Mask.push_back(-1);
2262  V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2263  LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2264 
2265  SmallVector<Constant *, 8> Mask2;
2266  Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2267  for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2268  Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2269 
2270  V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
2271 
2272  LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2273  return V;
2274 }
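// Illustrative example: inserting a <2 x i32> value at BeginIndex 1 of a
// <4 x i32> alloca value first widens it with the mask <-1, 0, 1, -1> (lanes
// outside [1, 3) are left undefined), then selects lanes with the constant
// mask <false, true, true, false>, taking the widened vector for lanes 1-2
// and the old loaded value elsewhere.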
2275 
2276 /// Visitor to rewrite instructions using a particular slice of an alloca
2277 /// to use a new alloca.
2278 ///
2279 /// Also implements the rewriting to vector-based accesses when the partition
2280 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2281 /// lives here.
2282 class llvm::sroa::AllocaSliceRewriter
2283  : public InstVisitor<AllocaSliceRewriter, bool> {
2284  // Befriend the base class so it can delegate to private visit methods.
2285  friend class InstVisitor<AllocaSliceRewriter, bool>;
2286 
2287  using Base = InstVisitor<AllocaSliceRewriter, bool>;
2288 
2289  const DataLayout &DL;
2290  AllocaSlices &AS;
2291  SROAPass &Pass;
2292  AllocaInst &OldAI, &NewAI;
2293  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2294  Type *NewAllocaTy;
2295 
2296  // This is a convenience and flag variable that will be null unless the new
2297  // alloca's integer operations should be widened to this integer type due to
2298  // passing isIntegerWideningViable above. If it is non-null, the desired
2299  // integer type will be stored here for easy access during rewriting.
2300  IntegerType *IntTy;
2301 
2302  // If we are rewriting an alloca partition which can be written as pure
2303  // vector operations, we stash extra information here. When VecTy is
2304  // non-null, we have some strict guarantees about the rewritten alloca:
2305  // - The new alloca is exactly the size of the vector type here.
2306  // - The accesses all either map to the entire vector or to a single
2307  // element.
2308  // - The set of accessing instructions is only one of those handled above
2309  // in isVectorPromotionViable. Generally these are the same access kinds
2310  // which are promotable via mem2reg.
2311  VectorType *VecTy;
2312  Type *ElementTy;
2313  uint64_t ElementSize;
2314 
2315  // The original offset of the slice currently being rewritten relative to
2316  // the original alloca.
2317  uint64_t BeginOffset = 0;
2318  uint64_t EndOffset = 0;
2319 
2320  // The new offsets of the slice currently being rewritten relative to the
2321  // original alloca.
2322  uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2323 
2324  uint64_t SliceSize = 0;
2325  bool IsSplittable = false;
2326  bool IsSplit = false;
2327  Use *OldUse = nullptr;
2328  Instruction *OldPtr = nullptr;
2329 
2330  // Track post-rewrite users which are PHI nodes and Selects.
2331  SmallSetVector<PHINode *, 8> &PHIUsers;
2332  SmallSetVector<SelectInst *, 8> &SelectUsers;
2333 
2334  // Utility IR builder, whose name prefix is set up for each visited use, and
2335  // whose insertion point is set to point to the user.
2336  IRBuilderTy IRB;
2337 
2338 public:
2339  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROAPass &Pass,
2340  AllocaInst &OldAI, AllocaInst &NewAI,
2341  uint64_t NewAllocaBeginOffset,
2342  uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2343  VectorType *PromotableVecTy,
2344  SmallSetVector<PHINode *, 8> &PHIUsers,
2345  SmallSetVector<SelectInst *, 8> &SelectUsers)
2346  : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2347  NewAllocaBeginOffset(NewAllocaBeginOffset),
2348  NewAllocaEndOffset(NewAllocaEndOffset),
2349  NewAllocaTy(NewAI.getAllocatedType()),
2350  IntTy(
2351  IsIntegerPromotable
2352  ? Type::getIntNTy(NewAI.getContext(),
2353  DL.getTypeSizeInBits(NewAI.getAllocatedType())
2354  .getFixedSize())
2355  : nullptr),
2356  VecTy(PromotableVecTy),
2357  ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2358  ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8
2359  : 0),
2360  PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2361  IRB(NewAI.getContext(), ConstantFolder()) {
2362  if (VecTy) {
2363  assert((DL.getTypeSizeInBits(ElementTy).getFixedSize() % 8) == 0 &&
2364  "Only multiple-of-8 sized vector elements are viable");
2365  ++NumVectorized;
2366  }
2367  assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2368  }
2369 
2370  bool visit(AllocaSlices::const_iterator I) {
2371  bool CanSROA = true;
2372  BeginOffset = I->beginOffset();
2373  EndOffset = I->endOffset();
2374  IsSplittable = I->isSplittable();
2375  IsSplit =
2376  BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2377  LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2378  LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2379  LLVM_DEBUG(dbgs() << "\n");
2380 
2381  // Compute the intersecting offset range.
2382  assert(BeginOffset < NewAllocaEndOffset);
2383  assert(EndOffset > NewAllocaBeginOffset);
2384  NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2385  NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2386 
2387  SliceSize = NewEndOffset - NewBeginOffset;
2388 
2389  OldUse = I->getUse();
2390  OldPtr = cast<Instruction>(OldUse->get());
2391 
2392  Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2393  IRB.SetInsertPoint(OldUserI);
2394  IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2395  IRB.getInserter().SetNamePrefix(
2396  Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
2397 
2398  CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2399  if (VecTy || IntTy)
2400  assert(CanSROA);
2401  return CanSROA;
2402  }
2403 
2404 private:
2405  // Make sure the other visit overloads are visible.
2406  using Base::visit;
2407 
2408  // Every instruction which can end up as a user must have a rewrite rule.
2409  bool visitInstruction(Instruction &I) {
2410  LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
2411  llvm_unreachable("No rewrite rule for this instruction!");
2412  }
2413 
2414  Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
2415  // Note that the offset computation can use BeginOffset or NewBeginOffset
2416  // interchangeably for unsplit slices.
2417  assert(IsSplit || BeginOffset == NewBeginOffset);
2418  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2419 
2420 #ifndef NDEBUG
2421  StringRef OldName = OldPtr->getName();
2422  // Skip through the last '.sroa.' component of the name.
2423  size_t LastSROAPrefix = OldName.rfind(".sroa.");
2424  if (LastSROAPrefix != StringRef::npos) {
2425  OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
2426  // Look for an SROA slice index.
2427  size_t IndexEnd = OldName.find_first_not_of("0123456789");
2428  if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
2429  // Strip the index and look for the offset.
2430  OldName = OldName.substr(IndexEnd + 1);
2431  size_t OffsetEnd = OldName.find_first_not_of("0123456789");
2432  if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
2433  // Strip the offset.
2434  OldName = OldName.substr(OffsetEnd + 1);
2435  }
2436  }
2437  // Strip any SROA suffixes as well.
2438  OldName = OldName.substr(0, OldName.find(".sroa_"));
2439 #endif
2440 
2441  return getAdjustedPtr(IRB, DL, &NewAI,
2442  APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
2443  PointerTy,
2444 #ifndef NDEBUG
2445  Twine(OldName) + "."
2446 #else
2447  Twine()
2448 #endif
2449  );
2450  }
2451 
2452  /// Compute suitable alignment to access this slice of the *new*
2453  /// alloca.
2454  ///
2455  /// The returned alignment is the new alloca's alignment adjusted for the
2456  /// slice's offset from the start of the new alloca.
2457  Align getSliceAlign() {
2458  return commonAlignment(NewAI.getAlign(),
2459  NewBeginOffset - NewAllocaBeginOffset);
2460  }
2461 
2462  unsigned getIndex(uint64_t Offset) {
2463  assert(VecTy && "Can only call getIndex when rewriting a vector");
2464  uint64_t RelOffset = Offset - NewAllocaBeginOffset;
2465  assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
2466  uint32_t Index = RelOffset / ElementSize;
2467  assert(Index * ElementSize == RelOffset);
2468  return Index;
2469  }
2470 
2471  void deleteIfTriviallyDead(Value *V) {
2472  Instruction *I = cast<Instruction>(V);
2473  if (isInstructionTriviallyDead(I))
2474  Pass.DeadInsts.push_back(I);
2475  }
2476 
2477  Value *rewriteVectorizedLoadInst(LoadInst &LI) {
2478  unsigned BeginIndex = getIndex(NewBeginOffset);
2479  unsigned EndIndex = getIndex(NewEndOffset);
2480  assert(EndIndex > BeginIndex && "Empty vector!");
2481 
2482  LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2483  NewAI.getAlign(), "load");
2484 
2485  Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
2486  LLVMContext::MD_access_group});
2487  return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
2488  }
2489 
2490  Value *rewriteIntegerLoad(LoadInst &LI) {
2491  assert(IntTy && "We cannot insert an integer to the alloca");
2492  assert(!LI.isVolatile());
2493  Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2494  NewAI.getAlign(), "load");
2495  V = convertValue(DL, IRB, V, IntTy);
2496  assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2497  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2498  if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
2499  IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
2500  V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
2501  }
2502  // It is possible that the extracted type is not the load type. This
2503  // happens if there is a load past the end of the alloca, and as
2504  // a consequence the slice is narrower but still a candidate for integer
2505  // lowering. To handle this case, we just zero extend the extracted
2506  // integer.
2507  assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
2508  "Can only handle an extract for an overly wide load");
2509  if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
2510  V = IRB.CreateZExt(V, LI.getType());
2511  return V;
2512  }
2513 
2514  bool visitLoadInst(LoadInst &LI) {
2515  LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
2516  Value *OldOp = LI.getOperand(0);
2517  assert(OldOp == OldPtr);
2518 
2519  AAMDNodes AATags = LI.getAAMetadata();
2520 
2521  unsigned AS = LI.getPointerAddressSpace();
2522 
2523  Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
2524  : LI.getType();
2525  const bool IsLoadPastEnd =
2526  DL.getTypeStoreSize(TargetTy).getFixedSize() > SliceSize;
2527  bool IsPtrAdjusted = false;
2528  Value *V;
2529  if (VecTy) {
2530  V = rewriteVectorizedLoadInst(LI);
2531  } else if (IntTy && LI.getType()->isIntegerTy()) {
2532  V = rewriteIntegerLoad(LI);
2533  } else if (NewBeginOffset == NewAllocaBeginOffset &&
2534  NewEndOffset == NewAllocaEndOffset &&
2535  (canConvertValue(DL, NewAllocaTy, TargetTy) ||
2536  (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
2537  TargetTy->isIntegerTy()))) {
2538  LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2539  NewAI.getAlign(), LI.isVolatile(),
2540  LI.getName());
2541  if (AATags)
2542  NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2543  if (LI.isVolatile())
2544  NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2545  if (NewLI->isAtomic())
2546  NewLI->setAlignment(LI.getAlign());
2547 
2548  // Any !nonnull metadata or !range metadata on the old load is also valid
2549  // on the new load. This is even true in some cases even when the loads
2550  // are different types, for example by mapping !nonnull metadata to
2551  // !range metadata by modeling the null pointer constant converted to the
2552  // integer type.
2553  // FIXME: Add support for range metadata here. Currently the utilities
2554  // for this don't propagate range metadata in trivial cases from one
2555  // integer load to another, don't handle non-addrspace-0 null pointers
2556  // correctly, and don't have any support for mapping ranges as the
2557  // integer type becomes wider or narrower.
2558  if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
2559  copyNonnullMetadata(LI, N, *NewLI);
2560 
2561  // Try to preserve nonnull metadata
2562  V = NewLI;
2563 
2564  // If this is an integer load past the end of the slice (which means the
2565  // bytes outside the slice are undef or this load is dead) just forcibly
2566  // fix the integer size with correct handling of endianness.
2567  if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
2568  if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
2569  if (AITy->getBitWidth() < TITy->getBitWidth()) {
2570  V = IRB.CreateZExt(V, TITy, "load.ext");
2571  if (DL.isBigEndian())
2572  V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
2573  "endian_shift");
2574  }
2575  } else {
2576  Type *LTy = TargetTy->getPointerTo(AS);
2577  LoadInst *NewLI =
2578  IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
2579  getSliceAlign(), LI.isVolatile(), LI.getName());
2580  if (AATags)
2581  NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2582  if (LI.isVolatile())
2583  NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2584  NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
2585  LLVMContext::MD_access_group});
2586 
2587  V = NewLI;
2588  IsPtrAdjusted = true;
2589  }
2590  V = convertValue(DL, IRB, V, TargetTy);
2591 
2592  if (IsSplit) {
2593  assert(!LI.isVolatile());
2594  assert(LI.getType()->isIntegerTy() &&
2595  "Only integer type loads and stores are split");
2596  assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedSize() &&
2597  "Split load isn't smaller than original load");
2598  assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
2599  "Non-byte-multiple bit width");
2600  // Move the insertion point just past the load so that we can refer to it.
2601  IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
2602  // Create a placeholder value with the same type as LI to use as the
2603  // basis for the new value. This allows us to replace the uses of LI with
2604  // the computed value, and then replace the placeholder with LI, leaving
2605  // LI only used for this computation.
2606  Value *Placeholder = new LoadInst(
2607  LI.getType(), PoisonValue::get(LI.getType()->getPointerTo(AS)), "",
2608  false, Align(1));
2609  V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
2610  "insert");
2611  LI.replaceAllUsesWith(V);
2612  Placeholder->replaceAllUsesWith(&LI);
2613  Placeholder->deleteValue();
2614  } else {
2615  LI.replaceAllUsesWith(V);
2616  }
2617 
2618  Pass.DeadInsts.push_back(&LI);
2619  deleteIfTriviallyDead(OldOp);
2620  LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
2621  return !LI.isVolatile() && !IsPtrAdjusted;
2622  }
2623 
2624  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
2625  AAMDNodes AATags) {
2626  if (V->getType() != VecTy) {
2627  unsigned BeginIndex = getIndex(NewBeginOffset);
2628  unsigned EndIndex = getIndex(NewEndOffset);
2629  assert(EndIndex > BeginIndex && "Empty vector!");
2630  unsigned NumElements = EndIndex - BeginIndex;
2631  assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
2632  "Too many elements!");
2633  Type *SliceTy = (NumElements == 1)
2634  ? ElementTy
2635  : FixedVectorType::get(ElementTy, NumElements);
2636  if (V->getType() != SliceTy)
2637  V = convertValue(DL, IRB, V, SliceTy);
2638 
2639  // Mix in the existing elements.
2640  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2641  NewAI.getAlign(), "load");
2642  V = insertVector(IRB, Old, V, BeginIndex, "vec");
2643  }
2644  StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
2645  Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2646  LLVMContext::MD_access_group});
2647  if (AATags)
2648  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2649  Pass.DeadInsts.push_back(&SI);
2650 
2651  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
2652  return true;
2653  }
2654 
2655  bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
2656  assert(IntTy && "We cannot extract an integer from the alloca");
2657  assert(!SI.isVolatile());
2658  if (DL.getTypeSizeInBits(V->getType()).getFixedSize() !=
2659  IntTy->getBitWidth()) {
2660  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2661  NewAI.getAlign(), "oldload");
2662  Old = convertValue(DL, IRB, Old, IntTy);
2663  assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2664  uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
2665  V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
2666  }
2667  V = convertValue(DL, IRB, V, NewAllocaTy);
2668  StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
2669  Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2670  LLVMContext::MD_access_group});
2671  if (AATags)
2672  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2673  Pass.DeadInsts.push_back(&SI);
2674  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
2675  return true;
2676  }
2677 
2678  bool visitStoreInst(StoreInst &SI) {
2679  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
2680  Value *OldOp = SI.getOperand(1);
2681  assert(OldOp == OldPtr);
2682 
2683  AAMDNodes AATags = SI.getAAMetadata();
2684  Value *V = SI.getValueOperand();
2685 
2686  // Strip all inbounds GEPs and pointer casts to try to dig out any root
2687  // alloca that should be re-examined after promoting this alloca.
2688  if (V->getType()->isPointerTy())
2689  if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
2690  Pass.PostPromotionWorklist.insert(AI);
2691 
2692  if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedSize()) {
2693  assert(!SI.isVolatile());
2694  assert(V->getType()->isIntegerTy() &&
2695  "Only integer type loads and stores are split");
2696  assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
2697  "Non-byte-multiple bit width");
2698  IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
2699  V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
2700  "extract");
2701  }
2702 
2703  if (VecTy)
2704  return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
2705  if (IntTy && V->getType()->isIntegerTy())
2706  return rewriteIntegerStore(V, SI, AATags);
2707 
2708  const bool IsStorePastEnd =
2709  DL.getTypeStoreSize(V->getType()).getFixedSize() > SliceSize;
2710  StoreInst *NewSI;
2711  if (NewBeginOffset == NewAllocaBeginOffset &&
2712  NewEndOffset == NewAllocaEndOffset &&
2713  (canConvertValue(DL, V->getType(), NewAllocaTy) ||
2714  (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
2715  V->getType()->isIntegerTy()))) {
2716  // If this is an integer store past the end of the slice (and thus the bytes
2717  // past that point are irrelevant or this is unreachable), truncate the
2718  // value prior to storing.
2719  if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
2720  if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
2721  if (VITy->getBitWidth() > AITy->getBitWidth()) {
2722  if (DL.isBigEndian())
2723  V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
2724  "endian_shift");
2725  V = IRB.CreateTrunc(V, AITy, "load.trunc");
2726  }
2727 
2728  V = convertValue(DL, IRB, V, NewAllocaTy);
2729  NewSI =
2730  IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
2731  } else {
2732  unsigned AS = SI.getPointerAddressSpace();
2733  Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
2734  NewSI =
2735  IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
2736  }
2737  NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
2738  LLVMContext::MD_access_group});
2739  if (AATags)
2740  NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2741  if (SI.isVolatile())
2742  NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
2743  if (NewSI->isAtomic())
2744  NewSI->setAlignment(SI.getAlign());
2745  Pass.DeadInsts.push_back(&SI);
2746  deleteIfTriviallyDead(OldOp);
2747 
2748  LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
2749  return NewSI->getPointerOperand() == &NewAI &&
2750  NewSI->getValueOperand()->getType() == NewAllocaTy &&
2751  !SI.isVolatile();
2752  }
2753 
2754  /// Compute an integer value from splatting an i8 across the given
2755  /// number of bytes.
2756  ///
2757  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
2758  /// call this routine.
2759  /// FIXME: Heed the advice above.
2760  ///
2761  /// \param V The i8 value to splat.
2762  /// \param Size The number of bytes in the output (assuming i8 is one byte)
2763  Value *getIntegerSplat(Value *V, unsigned Size) {
2764  assert(Size > 0 && "Expected a positive number of bytes.");
2765  IntegerType *VTy = cast<IntegerType>(V->getType());
2766  assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
2767  if (Size == 1)
2768  return V;
2769 
2770  Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
2771  V = IRB.CreateMul(
2772  IRB.CreateZExt(V, SplatIntTy, "zext"),
2773  IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
2774  IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
2775  SplatIntTy)),
2776  "isplat");
2777  return V;
2778  }
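  // Worked example (illustrative values): splatting the i8 value 0xAB across
  // Size == 4 bytes computes
  //   zext(0xAB) * (0xFFFFFFFF udiv 0xFF) == 0xAB * 0x01010101 == 0xABABABAB,
  // i.e. the classic multiply-by-0x0101...01 byte splat; the udiv folds to a
  // constant because both of its operands are constants.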
2779 
2780  /// Compute a vector splat for a given element value.
2781  Value *getVectorSplat(Value *V, unsigned NumElements) {
2782  V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
2783  LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
2784  return V;
2785  }
2786 
2787  bool visitMemSetInst(MemSetInst &II) {
2788  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
2789  assert(II.getRawDest() == OldPtr);
2790 
2791  AAMDNodes AATags = II.getAAMetadata();
2792 
2793  // If the memset has a variable size, it cannot be split, just adjust the
2794  // pointer to the new alloca.
2795  if (!isa<ConstantInt>(II.getLength())) {
2796  assert(!IsSplit);
2797  assert(NewBeginOffset == BeginOffset);
2798  II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
2799  II.setDestAlignment(getSliceAlign());
2800 
2801  deleteIfTriviallyDead(OldPtr);
2802  return false;
2803  }
2804 
2805  // Record this instruction for deletion.
2806  Pass.DeadInsts.push_back(&II);
2807 
2808  Type *AllocaTy = NewAI.getAllocatedType();
2809  Type *ScalarTy = AllocaTy->getScalarType();
2810 
2811  const bool CanContinue = [&]() {
2812  if (VecTy || IntTy)
2813  return true;
2814  if (BeginOffset > NewAllocaBeginOffset ||
2815  EndOffset < NewAllocaEndOffset)
2816  return false;
2817  // Length must be in range for FixedVectorType.
2818  auto *C = cast<ConstantInt>(II.getLength());
2819  const uint64_t Len = C->getLimitedValue();
2820  if (Len > std::numeric_limits<unsigned>::max())
2821  return false;
2822  auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
2823  auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
2824  return canConvertValue(DL, SrcTy, AllocaTy) &&
2825  DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
2826  }();
2827 
2828  // If this doesn't map cleanly onto the alloca type, and that type isn't
2829  // a single value type, just emit a memset.
2830  if (!CanContinue) {
2831  Type *SizeTy = II.getLength()->getType();
2832  Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
2833  CallInst *New = IRB.CreateMemSet(
2834  getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
2835  MaybeAlign(getSliceAlign()), II.isVolatile());
2836  if (AATags)
2837  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2838  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
2839  return false;
2840  }
2841 
2842  // If we can represent this as a simple value, we have to build the actual
2843  // value to store, which requires expanding the byte present in memset to
2844  // a sensible representation for the alloca type. This is essentially
2845  // splatting the byte to a sufficiently wide integer, splatting it across
2846  // any desired vector width, and bitcasting to the final type.
2847  Value *V;
2848 
2849  if (VecTy) {
2850  // If this is a memset of a vectorized alloca, insert it.
2851  assert(ElementTy == ScalarTy);
2852 
2853  unsigned BeginIndex = getIndex(NewBeginOffset);
2854  unsigned EndIndex = getIndex(NewEndOffset);
2855  assert(EndIndex > BeginIndex && "Empty vector!");
2856  unsigned NumElements = EndIndex - BeginIndex;
2857  assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
2858  "Too many elements!");
2859 
2860  Value *Splat = getIntegerSplat(
2861  II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
2862  Splat = convertValue(DL, IRB, Splat, ElementTy);
2863  if (NumElements > 1)
2864  Splat = getVectorSplat(Splat, NumElements);
2865 
2866  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2867  NewAI.getAlign(), "oldload");
2868  V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
2869  } else if (IntTy) {
2870  // If this is a memset on an alloca where we can widen stores, insert the
2871  // set integer.
2872  assert(!II.isVolatile());
2873 
2874  uint64_t Size = NewEndOffset - NewBeginOffset;
2875  V = getIntegerSplat(II.getValue(), Size);
2876 
2877  if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
2878  EndOffset != NewAllocaEndOffset)) {
2879  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2880  NewAI.getAlign(), "oldload");
2881  Old = convertValue(DL, IRB, Old, IntTy);
2882  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2883  V = insertInteger(DL, IRB, Old, V, Offset, "insert");
2884  } else {
2885  assert(V->getType() == IntTy &&
2886  "Wrong type for an alloca wide integer!");
2887  }
2888  V = convertValue(DL, IRB, V, AllocaTy);
2889  } else {
2890  // Established these invariants above.
2891  assert(NewBeginOffset == NewAllocaBeginOffset);
2892  assert(NewEndOffset == NewAllocaEndOffset);
2893 
2894  V = getIntegerSplat(II.getValue(),
2895  DL.getTypeSizeInBits(ScalarTy).getFixedSize() / 8);
2896  if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
2897  V = getVectorSplat(
2898  V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
2899 
2900  V = convertValue(DL, IRB, V, AllocaTy);
2901  }
2902 
2903  StoreInst *New =
2904  IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
2905  New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
2906  LLVMContext::MD_access_group});
2907  if (AATags)
2908  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
2909  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
2910  return !II.isVolatile();
2911  }
2912 
2913  bool visitMemTransferInst(MemTransferInst &II) {
2914  // Rewriting of memory transfer instructions can be a bit tricky. We break
2915  // them into two categories: split intrinsics and unsplit intrinsics.
2916 
2917  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
2918 
2919  AAMDNodes AATags = II.getAAMetadata();
2920 
2921  bool IsDest = &II.getRawDestUse() == OldUse;
2922  assert((IsDest && II.getRawDest() == OldPtr) ||
2923  (!IsDest && II.getRawSource() == OldPtr));
2924 
2925  Align SliceAlign = getSliceAlign();
2926 
2927  // For unsplit intrinsics, we simply modify the source and destination
2928  // pointers in place. This isn't just an optimization, it is a matter of
2929  // correctness. With unsplit intrinsics we may be dealing with transfers
2930  // within a single alloca before SROA ran, or with transfers that have
2931  // a variable length. We may also be dealing with memmove instead of
2932  // memcpy, and so simply updating the pointers is necessary for us to
2933  // update both source and dest of a single call.
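 // For instance, a variable-length memmove whose source and destination both
 // point into this alloca keeps its length and memmove semantics; only the
 // operand that matched the old slice pointer is retargeted below.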
2934  if (!IsSplittable) {
2935  Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
2936  if (IsDest) {
2937  II.setDest(AdjustedPtr);
2938  II.setDestAlignment(SliceAlign);
2939  }
2940  else {
2941  II.setSource(AdjustedPtr);
2942  II.setSourceAlignment(SliceAlign);
2943  }
2944 
2945  LLVM_DEBUG(dbgs() << " to: " << II << "\n");
2946  deleteIfTriviallyDead(OldPtr);
2947  return false;
2948  }
2949  // For split transfer intrinsics we have an incredibly useful assurance:
2950  // the source and destination do not reside within the same alloca, and at
2951  // least one of them does not escape. This means that we can replace
2952  // memmove with memcpy, and we don't need to worry about all manner of
2953  // downsides to splitting and transforming the operations.
2954 
2955  // If this doesn't map cleanly onto the alloca type, and that type isn't
2956  // a single value type, just emit a memcpy.
2957  bool EmitMemCpy =
2958  !VecTy && !IntTy &&
2959  (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
2960  SliceSize !=
2961  DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedSize() ||
2962  !NewAI.getAllocatedType()->isSingleValueType());
2963 
2964  // If we're just going to emit a memcpy, the alloca hasn't changed, and the
2965  // size hasn't been shrunk based on analysis of the viable range, this is
2966  // a no-op.
2967  if (EmitMemCpy && &OldAI == &NewAI) {
2968  // Ensure the start lines up.
2969  assert(NewBeginOffset == BeginOffset);
2970 
2971  // Rewrite the size as needed.
2972  if (NewEndOffset != EndOffset)
2973  II.setLength(ConstantInt::get(II.getLength()->getType(),
2974  NewEndOffset - NewBeginOffset));
2975  return false;
2976  }
2977  // Record this instruction for deletion.
2978  Pass.DeadInsts.push_back(&II);
2979 
2980  // Strip all inbounds GEPs and pointer casts to try to dig out any root
2981  // alloca that should be re-examined after rewriting this instruction.
2982  Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
2983  if (AllocaInst *AI =
2984  dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
2985  assert(AI != &OldAI && AI != &NewAI &&
2986  "Splittable transfers cannot reach the same alloca on both ends.");
2987  Pass.Worklist.insert(AI);
2988  }
2989 
2990  Type *OtherPtrTy = OtherPtr->getType();
2991  unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
2992 
2993  // Compute the relative offset for the other pointer within the transfer.
2994  unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
2995  APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
2996  Align OtherAlign =
2997  (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
2998  OtherAlign =
2999  commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
3000 
3001  if (EmitMemCpy) {
3002  // Compute the other pointer, folding as much as possible to produce
3003  // a single, simple GEP in most cases.
3004  OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3005  OtherPtr->getName() + ".");
3006 
3007  Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3008  Type *SizeTy = II.getLength()->getType();
3009  Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3010 
3011  Value *DestPtr, *SrcPtr;
3012  MaybeAlign DestAlign, SrcAlign;
3013  // Note: IsDest is true iff we're copying into the new alloca slice
3014  if (IsDest) {
3015  DestPtr = OurPtr;
3016  DestAlign = SliceAlign;
3017  SrcPtr = OtherPtr;
3018  SrcAlign = OtherAlign;
3019  } else {
3020  DestPtr = OtherPtr;
3021  DestAlign = OtherAlign;
3022  SrcPtr = OurPtr;
3023  SrcAlign = SliceAlign;
3024  }
3025  CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3026  Size, II.isVolatile());
3027  if (AATags)
3028  New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3029  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3030  return false;
3031  }
3032 
3033  bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3034  NewEndOffset == NewAllocaEndOffset;
3035  uint64_t Size = NewEndOffset - NewBeginOffset;
3036  unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3037  unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3038  unsigned NumElements = EndIndex - BeginIndex;
3039  IntegerType *SubIntTy =
3040  IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3041 
3042  // Reset the other pointer type to match the register type we're going to
3043  // use, but using the address space of the original other pointer.
3044  Type *OtherTy;
3045  if (VecTy && !IsWholeAlloca) {
3046  if (NumElements == 1)
3047  OtherTy = VecTy->getElementType();
3048  else
3049  OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3050  } else if (IntTy && !IsWholeAlloca) {
3051  OtherTy = SubIntTy;
3052  } else {
3053  OtherTy = NewAllocaTy;
3054  }
3055  OtherPtrTy = OtherTy->getPointerTo(OtherAS);
3056 
3057  Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3058  OtherPtr->getName() + ".");
3059  MaybeAlign SrcAlign = OtherAlign;
3060  Value *DstPtr = &NewAI;
3061  MaybeAlign DstAlign = SliceAlign;
3062  if (!IsDest) {
3063  std::swap(SrcPtr, DstPtr);
3064  std::swap(SrcAlign, DstAlign);
3065  }
3066 
3067  Value *Src;
3068  if (VecTy && !IsWholeAlloca && !IsDest) {
3069  Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3070  NewAI.getAlign(), "load");
3071  Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3072  } else if (IntTy && !IsWholeAlloca && !IsDest) {
3073  Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3074  NewAI.getAlign(), "load");
3075  Src = convertValue(DL, IRB, Src, IntTy);
3076  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3077  Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3078  } else {
3079  LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3080  II.isVolatile(), "copyload");
3081  Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3082  LLVMContext::MD_access_group});
3083  if (AATags)
3084  Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3085  Src = Load;
3086  }
3087 
3088  if (VecTy && !IsWholeAlloca && IsDest) {
3089  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3090  NewAI.getAlign(), "oldload");
3091  Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3092  } else if (IntTy && !IsWholeAlloca && IsDest) {
3093  Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3094  NewAI.getAlign(), "oldload");
3095  Old = convertValue(DL, IRB, Old, IntTy);
3096  uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3097  Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3098  Src = convertValue(DL, IRB, Src, NewAllocaTy);
3099  }
3100 
3101  StoreInst *Store = cast<StoreInst>(
3102  IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3103  Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3104  LLVMContext::MD_access_group});
3105  if (AATags)
3106  Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3107  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3108  return !II.isVolatile();
3109  }
3110 
3111  bool visitIntrinsicInst(IntrinsicInst &II) {
3112  assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3113  "Unexpected intrinsic!");
3114  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3115 
3116  // Record this instruction for deletion.
3117  Pass.DeadInsts.push_back(&II);
3118 
3119  if (II.isDroppable()) {
3120  assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3121  // TODO For now we forget assumed information, this can be improved.
3122  OldPtr->dropDroppableUsesIn(II);
3123  return true;
3124  }
3125 
3126  assert(II.getArgOperand(1) == OldPtr);
3127  // Lifetime intrinsics are only promotable if they cover the whole alloca.
3128  // Therefore, we drop lifetime intrinsics which don't cover the whole
3129  // alloca.
3130  // (In theory, intrinsics which partially cover an alloca could be
3131  // promoted, but PromoteMemToReg doesn't handle that case.)
3132  // FIXME: Check whether the alloca is promotable before dropping the
3133  // lifetime intrinsics?
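 // For example, a lifetime.start that covers only the first half of the new
 // alloca is simply dropped here (we still return true), whereas one covering
 // the whole slice is recreated below against the new slice pointer.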
3134  if (NewBeginOffset != NewAllocaBeginOffset ||
3135  NewEndOffset != NewAllocaEndOffset)
3136  return true;
3137 
3138  ConstantInt *Size =
3139  ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
3140  NewEndOffset - NewBeginOffset);
3141  // Lifetime intrinsics always expect an i8* so directly get such a pointer
3142  // for the new alloca slice.
3143  Type *PointerTy = IRB.getInt8PtrTy(OldPtr->getType()->getPointerAddressSpace());
3144  Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3145  Value *New;
3146  if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3147  New = IRB.CreateLifetimeStart(Ptr, Size);
3148  else
3149  New = IRB.CreateLifetimeEnd(Ptr, Size);
3150 
3151  (void)New;
3152  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3153 
3154  return true;
3155  }
3156 
3157  void fixLoadStoreAlign(Instruction &Root) {
3158  // This algorithm implements the same visitor loop as
3159  // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3160  // or store found.
3161  SmallPtrSet<Instruction *, 4> Visited;
3162  SmallVector<Instruction *, 4> Uses;
3163  Visited.insert(&Root);
3164  Uses.push_back(&Root);
3165  do {
3166  Instruction *I = Uses.pop_back_val();
3167 
3168  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3169  LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3170  continue;
3171  }
3172  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3173  SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3174  continue;
3175  }
3176 
3177  assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3178  isa<PHINode>(I) || isa<SelectInst>(I) ||
3179  isa<GetElementPtrInst>(I));
3180  for (User *U : I->users())
3181  if (Visited.insert(cast<Instruction>(U)).second)
3182  Uses.push_back(cast<Instruction>(U));
3183  } while (!Uses.empty());
3184  }
3185 
3186  bool visitPHINode(PHINode &PN) {
3187  LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3188  assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3189  assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3190 
3191  // We would like to compute a new pointer in only one place, but have it be
3192  // as local as possible to the PHI. To do that, we re-use the location of
3193  // the old pointer, which necessarily must be in the right position to
3194  // dominate the PHI.
3195  IRBuilderBase::InsertPointGuard Guard(IRB);
3196  if (isa<PHINode>(OldPtr))
3197  IRB.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
3198  else
3199  IRB.SetInsertPoint(OldPtr);
3200  IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3201 
3202  Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3203  // Replace the operands which were using the old pointer.
3204  std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3205 
3206  LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3207  deleteIfTriviallyDead(OldPtr);
3208 
3209  // Fix the alignment of any loads or stores using this PHI node.
3210  fixLoadStoreAlign(PN);
3211 
3212  // PHIs can't be promoted on their own, but often can be speculated. We
3213  // check the speculation outside of the rewriter so that we see the
3214  // fully-rewritten alloca.
3215  PHIUsers.insert(&PN);
3216  return true;
3217  }
3218 
3219  bool visitSelectInst(SelectInst &SI) {
3220  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3221  assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3222  "Pointer isn't an operand!");
3223  assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3224  assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3225 
3226  Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3227  // Replace the operands which were using the old pointer.
3228  if (SI.getOperand(1) == OldPtr)
3229  SI.setOperand(1, NewPtr);
3230  if (SI.getOperand(2) == OldPtr)
3231  SI.setOperand(2, NewPtr);
3232 
3233  LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3234  deleteIfTriviallyDead(OldPtr);
3235 
3236  // Fix the alignment of any loads or stores using this select.
3237  fixLoadStoreAlign(SI);
3238 
3239  // Selects can't be promoted on their own, but often can be speculated. We
3240  // check the speculation outside of the rewriter so that we see the
3241  // fully-rewritten alloca.
3242  SelectUsers.insert(&SI);
3243  return true;
3244  }
3245 };
3246 
3247 namespace {
3248 
3249 /// Visitor to rewrite aggregate loads and stores as scalar.
3250 ///
3251 /// This pass aggressively rewrites all aggregate loads and stores on
3252 /// a particular pointer (or any pointer derived from it which we can identify)
3253 /// with scalar loads and stores.
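///
/// For example, a load of a { i32, float } first-class aggregate is rewritten
/// as two GEPs, two scalar loads, and insertvalue instructions that rebuild
/// the aggregate value; aggregate stores are handled symmetrically using
/// extractvalue.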
3254 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3255  // Befriend the base class so it can delegate to private visit methods.
3256  friend class InstVisitor<AggLoadStoreRewriter, bool>;
3257 
3258  /// Queue of pointer uses to analyze and potentially rewrite.
3259  SmallVector<Use *, 8> Queue;
3260 
3261  /// Set to prevent us from cycling with phi nodes and loops.
3262  SmallPtrSet<User *, 8> Visited;
3263 
3264  /// The current pointer use being rewritten. This is used to dig up the used
3265  /// value (as opposed to the user).
3266  Use *U = nullptr;
3267 
3268  /// Used to calculate offsets, and hence alignment, of subobjects.
3269  const DataLayout &DL;
3270 
3271  IRBuilderTy &IRB;
3272 
3273 public:
3274  AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
3275  : DL(DL), IRB(IRB) {}
3276 
3277  /// Rewrite loads and stores through a pointer and all pointers derived from
3278  /// it.
3279  bool rewrite(Instruction &I) {
3280  LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3281  enqueueUsers(I);
3282  bool Changed = false;
3283  while (!Queue.empty()) {
3284  U = Queue.pop_back_val();
3285  Changed |= visit(cast<Instruction>(U->getUser()));
3286  }
3287  return Changed;
3288  }
3289 
3290 private:
3291  /// Enqueue all the users of the given instruction for further processing.
3292  /// This uses a set to de-duplicate users.
3293  void enqueueUsers(Instruction &I) {
3294  for (Use &U : I.uses())
3295  if (Visited.insert(U.getUser()).second)
3296  Queue.push_back(&U);
3297  }
3298 
3299  // Conservative default is to not rewrite anything.
3300  bool visitInstruction(Instruction &I) { return false; }
3301 
3302  /// Generic recursive split emission class.
3303  template <typename Derived> class OpSplitter {
3304  protected:
3305  /// The builder used to form new instructions.
3306  IRBuilderTy &IRB;
3307 
3308  /// The indices to be used with insert- or extractvalue to select the
3309  /// appropriate value within the aggregate.
3310  SmallVector<unsigned, 4> Indices;
3311 
3312  /// The indices to a GEP instruction which will move Ptr to the correct slot
3313  /// within the aggregate.
3314  SmallVector<Value *, 4> GEPIndices;
3315 
3316  /// The base pointer of the original op, used as a base for GEPing the
3317  /// split operations.
3318  Value *Ptr;
3319 
3320  /// The base pointee type being GEPed into.
3321  Type *BaseTy;
3322 
3323  /// Known alignment of the base pointer.
3324  Align BaseAlign;
3325 
3326  /// Used to calculate the offset of each component so we can correctly
3327  /// deduce alignments.
3328  const DataLayout &DL;
3329 
3330  /// Initialize the splitter with an insertion point, Ptr and start with a
3331  /// single zero GEP index.
3332  OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3333  Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
3334  : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
3335  BaseAlign(BaseAlign), DL(DL) {
3336  IRB.SetInsertPoint(InsertionPoint);
3337  }
3338 
3339  public:
3340  /// Generic recursive split emission routine.
3341  ///
3342  /// This method recursively splits an aggregate op (load or store) into
3343  /// scalar or vector ops. It splits recursively until it hits a single value
3344  /// and emits that single value operation via the template argument.
3345  ///
3346  /// The logic of this routine relies on GEPs and insertvalue and
3347  /// extractvalue all operating with the same fundamental index list, merely
3348  /// formatted differently (GEPs need actual values).
3349  ///
3350  /// \param Ty The type being split recursively into smaller ops.
3351  /// \param Agg The aggregate value being built up or stored, depending on
3352  /// whether this is splitting a load or a store respectively.
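  ///
  /// For example, splitting over [2 x { i32, i8 }] visits the leaves with
  /// Indices {0,0}, {0,1}, {1,0}, {1,1}, and the matching GEPIndices are the
  /// same numbers prefixed with the initial zero index.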
3353  void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
3354  if (Ty->isSingleValueType()) {
3355  unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
3356  return static_cast<Derived *>(this)->emitFunc(
3357  Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
3358  }
3359 
3360  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
3361  unsigned OldSize = Indices.size();
3362  (void)OldSize;
3363  for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
3364  ++Idx) {
3365  assert(Indices.size() == OldSize && "Did not return to the old size");
3366  Indices.push_back(Idx);
3367  GEPIndices.push_back(IRB.getInt32(Idx));
3368  emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
3369  GEPIndices.pop_back();
3370  Indices.pop_back();
3371  }
3372  return;
3373  }
3374 
3375  if (StructType *STy = dyn_cast<StructType>(Ty)) {
3376  unsigned OldSize = Indices.size();
3377  (void)OldSize;
3378  for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
3379  ++Idx) {
3380  assert(Indices.size() == OldSize && "Did not return to the old size");
3381  Indices.push_back(Idx);
3382  GEPIndices.push_back(IRB.getInt32(Idx));
3383  emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
3384  GEPIndices.pop_back();
3385  Indices.pop_back();
3386  }
3387  return;
3388  }
3389 
3390  llvm_unreachable("Only arrays and structs are aggregate loadable types");
3391  }
3392  };
3393 
3394  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
3395  AAMDNodes AATags;
3396 
3397  LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3398  AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
3399  IRBuilderTy &IRB)
3400  : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
3401  IRB),
3402  AATags(AATags) {}
3403 
3404  /// Emit a leaf load of a single value. This is called at the leaves of the
3405  /// recursive emission to actually load values.
3406  void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3407  assert(Ty->isSingleValueType());
3408  // Load the single value and insert it using the indices.
3409  Value *GEP =
3410  IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3411  LoadInst *Load =
3412  IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
3413 
3414  APInt Offset(
3415  DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3416  if (AATags &&
3417  GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
3418  Load->setAAMetadata(AATags.shift(Offset.getZExtValue()));
3419 
3420  Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
3421  LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
3422  }
3423  };
3424 
3425  bool visitLoadInst(LoadInst &LI) {
3426  assert(LI.getPointerOperand() == *U);
3427  if (!LI.isSimple() || LI.getType()->isSingleValueType())
3428  return false;
3429 
3430  // We have an aggregate being loaded, split it apart.
3431  LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3432  LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
3433  getAdjustedAlignment(&LI, 0), DL, IRB);
3434  Value *V = PoisonValue::get(LI.getType());
3435  Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
3436  Visited.erase(&LI);
3437  LI.replaceAllUsesWith(V);
3438  LI.eraseFromParent();
3439  return true;
3440  }
3441 
3442  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
3443  StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3444  AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
3445  IRBuilderTy &IRB)
3446  : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3447  DL, IRB),
3448  AATags(AATags) {}
3449  AAMDNodes AATags;
3450  /// Emit a leaf store of a single value. This is called at the leaves of the
3451  /// recursive emission to actually produce stores.
3452  void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3453  assert(Ty->isSingleValueType());
3454  // Extract the single value and store it using the indices.
3455  //
3456  // The gep and extractvalue values are factored out of the CreateStore
3457  // call to make the output independent of the argument evaluation order.
3458  Value *ExtractValue =
3459  IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
3460  Value *InBoundsGEP =
3461  IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3462  StoreInst *Store =
3463  IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
3464 
3465  APInt Offset(
3466  DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3467  if (AATags &&
3468  GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
3469  Store->setAAMetadata(AATags.shift(Offset.getZExtValue()));
3470 
3471  LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3472  }
3473  };
3474 
3475  bool visitStoreInst(StoreInst &SI) {
3476  if (!SI.isSimple() || SI.getPointerOperand() != *U)
3477  return false;
3478  Value *V = SI.getValueOperand();
3479  if (V->getType()->isSingleValueType())
3480  return false;
3481 
3482  // We have an aggregate being stored, split it apart.
3483  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3484  StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(),
3485  getAdjustedAlignment(&SI, 0), DL, IRB);
3486  Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
3487  Visited.erase(&SI);
3488  SI.eraseFromParent();
3489  return true;
3490  }
3491 
3492  bool visitBitCastInst(BitCastInst &BC) {
3493  enqueueUsers(BC);
3494  return false;
3495  }
3496 
3497  bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
3498  enqueueUsers(ASC);
3499  return false;
3500  }
3501 
3502  // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
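 // For example (illustrative IR, assuming constant indices):
 //   %sel = select i1 %c, ptr %a, ptr %b
 //   %gep = getelementptr i32, ptr %sel, i64 1
 // is rewritten into two GEPs off %a and %b plus a select of their results.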
3503  bool foldGEPSelect(GetElementPtrInst &GEPI) {
3504  if (!GEPI.hasAllConstantIndices())
3505  return false;
3506 
3507  SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());
3508 
3509  LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):"
3510  << "\n original: " << *Sel
3511  << "\n " << GEPI);
3512 
3513  IRB.SetInsertPoint(&GEPI);
3514  SmallVector<Value *, 4> Index(GEPI.indices());
3515  bool IsInBounds = GEPI.isInBounds();
3516 
3517  Type *Ty = GEPI.getSourceElementType();
3518  Value *True = Sel->getTrueValue();
3519  Value *NTrue = IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep",
3520  IsInBounds);
3521 
3522  Value *False = Sel->getFalseValue();
3523 
3524  Value *NFalse = IRB.CreateGEP(Ty, False, Index,
3525  False->getName() + ".sroa.gep", IsInBounds);
3526 
3527  Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse,
3528  Sel->getName() + ".sroa.sel");
3529  Visited.erase(&GEPI);
3530  GEPI.replaceAllUsesWith(NSel);
3531  GEPI.eraseFromParent();
3532  Instruction *NSelI = cast<Instruction>(NSel);
3533  Visited.insert(NSelI);
3534  enqueueUsers(*NSelI);
3535 
3536  LLVM_DEBUG(dbgs() << "\n to: " << *NTrue
3537  << "\n " << *NFalse
3538  << "\n " << *NSel << '\n');
3539 
3540  return true;
3541  }
3542 
3543  // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
3544  bool foldGEPPhi(GetElementPtrInst &GEPI) {
3545  if (!GEPI.hasAllConstantIndices())
3546  return false;
3547 
3548  PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
3549  if (GEPI.getParent() != PHI->getParent() ||
3550  llvm::any_of(PHI->incoming_values(), [](Value *In)
3551  { Instruction *I = dyn_cast<Instruction>(In);
3552  return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
3553  succ_empty(I->getParent()) ||
3554  !I->getParent()->isLegalToHoistInto();
3555  }))
3556  return false;
3557 
3558  LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):"
3559  << "\n original: " << *PHI
3560  << "\n " << GEPI
3561  << "\n to: ");
3562 
3563  SmallVector<Value *, 4> Index(GEPI.indices());
3564  bool IsInBounds = GEPI.isInBounds();
3565  IRB.SetInsertPoint(GEPI.getParent()->getFirstNonPHI());
3566  PHINode *NewPN = IRB.CreatePHI(GEPI.getType(), PHI->getNumIncomingValues(),
3567  PHI->getName() + ".sroa.phi");
3568  for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
3569  BasicBlock *B = PHI->getIncomingBlock(I);
3570  Value *NewVal = nullptr;
3571  int Idx = NewPN->getBasicBlockIndex(B);
3572  if (Idx >= 0) {
3573  NewVal = NewPN->getIncomingValue(Idx);
3574  } else {
3575  Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
3576 
3577  IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator()));
3578  Type *Ty = GEPI.getSourceElementType();
3579  NewVal = IRB.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep",
3580  IsInBounds);
3581  }
3582  NewPN->addIncoming(NewVal, B);
3583  }
3584 
3585  Visited.erase(&GEPI);
3586  GEPI.replaceAllUsesWith(NewPN);
3587  GEPI.eraseFromParent();
3588  Visited.insert(NewPN);
3589  enqueueUsers(*NewPN);
3590 
3591  LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
3592  dbgs() << "\n " << *In;
3593  dbgs() << "\n " << *NewPN << '\n');
3594 
3595  return true;
3596  }
3597 
3598  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
3599  if (isa<SelectInst>(GEPI.getPointerOperand()) &&
3600  foldGEPSelect(GEPI))
3601  return true;
3602 
3603  if (isa<PHINode>(GEPI.getPointerOperand()) &&
3604  foldGEPPhi(GEPI))
3605  return true;
3606 
3607  enqueueUsers(GEPI);
3608  return false;
3609  }
3610 
3611  bool visitPHINode(PHINode &PN) {
3612  enqueueUsers(PN);
3613  return false;
3614  }
3615 
3616  bool visitSelectInst(SelectInst &SI) {
3617  enqueueUsers(SI);
3618  return false;
3619  }
3620 };
3621 
3622 } // end anonymous namespace
3623 
3624 /// Strip aggregate type wrapping.
3625 ///
3626 /// This removes no-op aggregate types wrapping an underlying type. It will
3627 /// strip as many layers of types as it can without changing either the type
3628 /// size or the allocated size.
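///
/// For example, { { i32 } } is stripped down to i32, while { i32, i8 } is
/// returned unchanged because stripping to i32 would shrink the allocated
/// size.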
3629 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
3630  if (Ty->isSingleValueType())
3631  return Ty;
3632 
3633  uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedSize();
3634  uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedSize();
3635 
3636  Type *InnerTy;
3637  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
3638  InnerTy = ArrTy->getElementType();
3639  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
3640  const StructLayout *SL = DL.getStructLayout(STy);
3641  unsigned Index = SL->getElementContainingOffset(0);
3642  InnerTy = STy->getElementType(Index);
3643  } else {
3644  return Ty;
3645  }
3646 
3647  if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedSize() ||
3648  TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedSize())
3649  return Ty;
3650 
3651  return stripAggregateTypeWrapping(DL, InnerTy);
3652 }
3653 
3654 /// Try to find a partition of the aggregate type passed in for a given
3655 /// offset and size.
3656 ///
3657 /// This recurses through the aggregate type and tries to compute a subtype
3658 /// based on the offset and size. When the offset and size span a sub-section
3659 /// of an array, it will even compute a new array type for that sub-section,
3660 /// and the same for structs.
3661 ///
3662 /// Note that this routine is very strict and tries to find a partition of the
3663 /// type which produces the *exact* right offset and size. It is not forgiving
3664 /// when the size or offset cause either end of type-based partition to be off.
3665 /// Also, this is a best-effort routine. It is reasonable to give up and not
3666 /// return a type if necessary.
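///
/// For example, with Ty = { i64, [4 x float] }, Offset = 8 and Size = 8 yield
/// [2 x float], while Offset = 4 yields nullptr because the range starts in
/// the middle of the i64 field.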
3667 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
3668  uint64_t Size) {
3669  if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedSize() == Size)
3670  return stripAggregateTypeWrapping(DL, Ty);
3671  if (Offset > DL.getTypeAllocSize(Ty).getFixedSize() ||
3672  (DL.getTypeAllocSize(Ty).getFixedSize() - Offset) < Size)
3673  return nullptr;
3674 
3675  if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
3676  Type *ElementTy;
3677  uint64_t TyNumElements;
3678  if (auto *AT = dyn_cast<ArrayType>(Ty)) {
3679  ElementTy = AT->getElementType();
3680  TyNumElements = AT->getNumElements();
3681  } else {
3682  // FIXME: This isn't right for vectors with non-byte-sized or
3683  // non-power-of-two sized elements.
3684  auto *VT = cast<FixedVectorType>(Ty);
3685  ElementTy = VT->getElementType();
3686  TyNumElements = VT->getNumElements();
3687  }
3688  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
3689  uint64_t NumSkippedElements = Offset / ElementSize;
3690  if (NumSkippedElements >= TyNumElements)
3691  return nullptr;
3692  Offset -= NumSkippedElements * ElementSize;
3693 
3694  // First check if we need to recurse.
3695  if (Offset > 0 || Size < ElementSize) {
3696  // Bail if the partition ends in a different array element.
3697  if ((Offset + Size) > ElementSize)
3698  return nullptr;
3699  // Recurse through the element type trying to peel off offset bytes.
3700  return getTypePartition(DL, ElementTy, Offset, Size);
3701  }
3702  assert(Offset == 0);
3703 
3704  if (Size == ElementSize)
3705  return stripAggregateTypeWrapping(DL, ElementTy);
3706  assert(Size > ElementSize);
3707  uint64_t NumElements = Size / ElementSize;
3708  if (NumElements * ElementSize != Size)
3709  return nullptr;
3710  return ArrayType::get(ElementTy, NumElements);
3711  }
3712 
3713  StructType *STy = dyn_cast<StructType>(Ty);
3714  if (!STy)
3715  return nullptr;
3716 
3717  const StructLayout *SL = DL.getStructLayout(STy);
3718  if (Offset >= SL->getSizeInBytes())
3719  return nullptr;
3720  uint64_t EndOffset = Offset + Size;
3721  if (EndOffset > SL->getSizeInBytes())
3722  return nullptr;
3723 
3724  unsigned Index = SL->getElementContainingOffset(Offset);
3725  Offset -= SL->getElementOffset(Index);
3726 
3727  Type *ElementTy = STy->getElementType(Index);
3728  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
3729  if (Offset >= ElementSize)
3730  return nullptr; // The offset points into alignment padding.
3731 
3732  // See if any partition must be contained by the element.
3733  if (Offset > 0 || Size < ElementSize) {
3734  if ((Offset + Size) > ElementSize)
3735  return nullptr;
3736  return getTypePartition(DL, ElementTy, Offset, Size);
3737  }
3738  assert(Offset == 0);
3739 
3740  if (Size == ElementSize)
3741  return stripAggregateTypeWrapping(DL, ElementTy);
3742 
3743  StructType::element_iterator EI = STy->element_begin() + Index,
3744  EE = STy->element_end();
3745  if (EndOffset < SL->getSizeInBytes()) {
3746  unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
3747  if (Index == EndIndex)
3748  return nullptr; // Within a single element and its padding.
3749 
3750  // Don't try to form "natural" types if the elements don't line up with the
3751  // expected size.
3752  // FIXME: We could potentially recurse down through the last element in the
3753  // sub-struct to find a natural end point.
3754  if (SL->getElementOffset(EndIndex) != EndOffset)
3755  return nullptr;
3756 
3757  assert(Index < EndIndex);
3758  EE = STy->element_begin() + EndIndex;
3759  }
3760 
3761  // Try to build up a sub-structure.
3762  StructType *SubTy =
3763  StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
3764  const StructLayout *SubSL = DL.getStructLayout(SubTy);
3765  if (Size != SubSL->getSizeInBytes())
3766  return nullptr; // The sub-struct doesn't have quite the size needed.
3767 
3768  return SubTy;
3769 }
3770 
3771 /// Pre-split loads and stores to simplify rewriting.
3772 ///
3773 /// We want to break up the splittable load+store pairs as much as
3774 /// possible. This is important to do as a preprocessing step, as once we
3775 /// start rewriting the accesses to partitions of the alloca we lose the
3776 /// necessary information to correctly split apart paired loads and stores
3777 /// which both point into this alloca. The case to consider is something like
3778 /// the following:
3779 ///
3780 /// %a = alloca [12 x i8]
3781 /// %gep1 = getelementptr i8, ptr %a, i32 0
3782 /// %gep2 = getelementptr i8, ptr %a, i32 4
3783 /// %gep3 = getelementptr i8, ptr %a, i32 8
3784 /// store float 0.0, ptr %gep1
3785 /// store float 1.0, ptr %gep2
3786 /// %v = load i64, ptr %gep1
3787 /// store i64 %v, ptr %gep2
3788 /// %f1 = load float, ptr %gep2
3789 /// %f2 = load float, ptr %gep3
3790 ///
3791 /// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
3792 /// promote everything so we recover the 2 SSA values that should have been
3793 /// there all along.
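///
/// Concretely, pre-splitting rewrites the i64 load/store pair above into two
/// i32 load/store pairs, one per 4-byte partition, so that each partition can
/// be promoted independently.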
3794 ///
3795 /// \returns true if any changes are made.
3796 bool SROAPass::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
3797  LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
3798 
3799  // Track the loads and stores which are candidates for pre-splitting here, in
3800  // the order they first appear during the partition scan. These give stable
3801  // iteration order and a basis for tracking which loads and stores we
3802  // actually split.
3803  SmallVector<LoadInst *, 4> Loads;
3804  SmallVector<StoreInst *, 4> Stores;
3805 
3806  // We need to accumulate the splits required of each load or store where we
3807  // can find them via a direct lookup. This is important to cross-check loads
3808  // and stores against each other. We also track the slice so that we can kill
3809  // all the slices that end up split.
3810  struct SplitOffsets {
3811  Slice *S;
3812  std::vector<uint64_t> Splits;
3813  };
3814  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
3815 
3816  // Track loads out of this alloca which cannot, for any reason, be pre-split.
3817  // This is important as we also cannot pre-split stores of those loads!
3818  // FIXME: This is all pretty gross. It means that we can be more aggressive
3819  // in pre-splitting when the load feeding the store happens to come from
3820  // a separate alloca. Put another way, the effectiveness of SROA would be
3821  // decreased by a frontend which just concatenated all of its local allocas
3822  // into one big flat alloca. But defeating such patterns is exactly the job
3823  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
3824  // change store pre-splitting to actually force pre-splitting of the load
3825  // that feeds it *and all stores*. That makes pre-splitting much harder, but
3826  // maybe it would make it more principled?
3827  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
3828 
3829  LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
3830  for (auto &P : AS.partitions()) {
3831  for (Slice &S : P) {
3832  Instruction *I = cast<Instruction>(S.getUse()->getUser());
3833  if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
3834  // If this is a load we have to track that it can't participate in any
3835  // pre-splitting. If this is a store of a load we have to track that
3836  // that load also can't participate in any pre-splitting.
3837  if (auto *LI = dyn_cast<LoadInst>(I))
3838  UnsplittableLoads.insert(LI);
3839  else if (auto *SI = dyn_cast<StoreInst>(I))
3840  if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
3841  UnsplittableLoads.insert(LI);
3842  continue;
3843  }
3844  assert(P.endOffset() > S.beginOffset() &&
3845  "Empty or backwards partition!");
3846 
3847  // Determine if this is a pre-splittable slice.
3848  if (auto *LI = dyn_cast<LoadInst>(I)) {
3849  assert(!LI->isVolatile() && "Cannot split volatile loads!");
3850 
3851  // The load must be used exclusively to store into other pointers for
3852  // us to be able to arbitrarily pre-split it. The stores must also be
3853  // simple to avoid changing semantics.
3854  auto IsLoadSimplyStored = [](LoadInst *LI) {
3855  for (User *LU : LI->users()) {
3856  auto *SI = dyn_cast<StoreInst>(LU);
3857  if (!SI || !SI->isSimple())
3858  return false;
3859  }
3860  return true;
3861  };
3862  if (!IsLoadSimplyStored(LI)) {
3863  UnsplittableLoads.insert(LI);
3864  continue;
3865  }
3866 
3867  Loads.push_back(LI);
3868  } else if (auto *SI = dyn_cast<StoreInst>(I)) {
3869  if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
3870  // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
3871  continue;
3872  auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
3873  if (!StoredLoad || !StoredLoad->isSimple())
3874  continue;
3875  assert(!SI->isVolatile() && "Cannot split volatile stores!");
3876 
3877  Stores.push_back(SI);
3878  } else {
3879  // Other uses cannot be pre-split.
3880  continue;
3881  }
3882 
3883  // Record the initial split.
3884  LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
3885  auto &Offsets = SplitOffsetsMap[I];
3886  assert(Offsets.Splits.empty() &&
3887  "Should not have splits the first time we see an instruction!");
3888  Offsets.S = &S;
3889  Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
3890  }
3891 
3892  // Now scan the already split slices, and add a split for any of them which
3893  // we're going to pre-split.
3894  for (Slice *S : P.splitSliceTails()) {
3895  auto SplitOffsetsMapI =
3896  SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
3897  if (SplitOffsetsMapI == SplitOffsetsMap.end())
3898  continue;
3899  auto &Offsets = SplitOffsetsMapI->second;
3900 
3901  assert(Offsets.S == S && "Found a mismatched slice!");
3902  assert(!Offsets.Splits.empty() &&
3903  "Cannot have an empty set of splits on the second partition!");
3904  assert(Offsets.Splits.back() ==
3905  P.beginOffset() - Offsets.S->beginOffset() &&
3906  "Previous split does not end where this one begins!");
3907 
3908  // Record each split. The last partition's end isn't needed as the size
3909  // of the slice dictates that.
3910  if (S->endOffset() > P.endOffset())
3911  Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
3912  }
3913  }
3914 
3915  // We may have split loads where some of their stores are split stores. For
3916  // such loads and stores, we can only pre-split them if their splits exactly
3917  // match relative to their starting offset. We have to verify this prior to
3918  // any rewriting.
3919  llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
3920  // Lookup the load we are storing in our map of split
3921  // offsets.
3922  auto *LI = cast<LoadInst>(SI->getValueOperand());
3923  // If it was completely unsplittable, then we're done,
3924  // and this store can't be pre-split.
3925  if (UnsplittableLoads.count(LI))
3926  return true;
3927 
3928  auto LoadOffsetsI = SplitOffsetsMap.find(LI);
3929  if (LoadOffsetsI == SplitOffsetsMap.end())
3930  return false; // Unrelated loads are definitely safe.
3931  auto &LoadOffsets = LoadOffsetsI->second;
3932 
3933  // Now lookup the store's offsets.
3934  auto &StoreOffsets = SplitOffsetsMap[SI];
3935 
3936  // If the relative offsets of each split in the load and
3937  // store match exactly, then we can split them and we
3938  // don't need to remove them here.
3939  if (LoadOffsets.Splits == StoreOffsets.Splits)
3940  return false;
3941 
3942  LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
3943  << " " << *LI << "\n"
3944  << " " << *SI << "\n");
3945 
3946  // We've found a store and load that we need to split
3947  // with mismatched relative splits. Just give up on them
3948  // and remove both instructions from our list of
3949  // candidates.
3950  UnsplittableLoads.insert(LI);
3951  return true;
3952  });
3953  // Now we have to go *back* through all the stores, because a later store may
3954  // have caused an earlier store's load to become unsplittable and if it is
3955  // unsplittable for the later store, then we can't rely on it being split in
3956  // the earlier store either.
3957  llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
3958  auto *LI = cast<LoadInst>(SI->getValueOperand());
3959  return UnsplittableLoads.count(LI);
3960  });
3961  // Once we've established all the loads that can't be split for some reason,
3962  // filter any that made it into our list out.
3963  llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
3964  return UnsplittableLoads.count(LI);
3965  });
3966 
3967  // If no loads or stores are left, there is no pre-splitting to be done for
3968  // this alloca.
3969  if (Loads.empty() && Stores.empty())
3970  return false;
3971 
3972  // From here on, we can't fail and will be building new accesses, so rig up
3973  // an IR builder.
3974  IRBuilderTy IRB(&AI);
3975 
3976  // Collect the new slices which we will merge into the alloca slices.
3977  SmallVector<Slice, 4> NewSlices;
3978 
3979  // Track any allocas we end up splitting loads and stores for so we iterate
3980  // on them.
3981  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
3982 
3983  // At this point, we have collected all of the loads and stores we can
3984  // pre-split, and the specific splits needed for them. We actually do the
3985  // splitting in a specific order to handle the case when one of the loads is
3986  // the value operand to one of the stores.
3987  //
3988  // First, we rewrite all of the split loads, and just accumulate each split
3989  // load in a parallel structure. We also build the slices for them and append
3990  // them to the alloca slices.
3991  SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
3992  std::vector<LoadInst *> SplitLoads;
3993  const DataLayout &DL = AI.getModule()->getDataLayout();
3994  for (LoadInst *LI : Loads) {
3995  SplitLoads.clear();
3996 
3997  auto &Offsets = SplitOffsetsMap[LI];
3998  unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
3999  assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4000  "Load must have type size equal to store size");
4001  assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4002  "Load must be >= slice size");
4003 
4004  uint64_t BaseOffset = Offsets.S->beginOffset();
4005  assert(BaseOffset + SliceSize > BaseOffset &&
4006  "Cannot represent alloca access size using 64-bit integers!");
4007 
4008  Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
4009  IRB.SetInsertPoint(LI);
4010 
4011  LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4012 
4013  uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4014  int Idx = 0, Size = Offsets.Splits.size();
4015  for (;;) {
4016  auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
4017  auto AS = LI->getPointerAddressSpace();
4018  auto *PartPtrTy = PartTy->getPointerTo(AS);
4019  LoadInst *PLoad = IRB.CreateAlignedLoad(
4020  PartTy,
4021  getAdjustedPtr(IRB, DL, BasePtr,
4022  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4023  PartPtrTy, BasePtr->getName() + "."),
4024  getAdjustedAlignment(LI, PartOffset),
4025  /*IsVolatile*/ false, LI->getName());
4026  PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4027  LLVMContext::MD_access_group});
4028 
4029  // Append this load onto the list of split loads so we can find it later
4030  // to rewrite the stores.
4031  SplitLoads.push_back(PLoad);
4032 
4033  // Now build a new slice for the alloca.
4034  NewSlices.push_back(
4035  Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4036  &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4037  /*IsSplittable*/ false));
4038  LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4039  << ", " << NewSlices.back().endOffset()
4040  << "): " << *PLoad << "\n");
4041 
4042  // See if we've handled all the splits.
4043  if (Idx >= Size)
4044  break;
4045 
4046  // Setup the next partition.
4047  PartOffset = Offsets.Splits[Idx];
4048  ++Idx;
4049  PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4050  }
4051 
4052  // Now that we have the split loads, do the slow walk over all uses of the
4053  // load and rewrite them as split stores, or save the split loads to use
4054  // below if the store is going to be split there anyways.
4055  bool DeferredStores = false;
4056  for (User *LU : LI->users()) {
4057  StoreInst *SI = cast<StoreInst>(LU);
4058  if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4059  DeferredStores = true;
4060  LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4061  << "\n");
4062  continue;
4063  }
4064 
4065  Value *StoreBasePtr = SI->getPointerOperand();
4066  IRB.SetInsertPoint(SI);
4067 
4068  LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4069 
4070  for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4071  LoadInst *PLoad = SplitLoads[Idx];
4072  uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4073  auto *PartPtrTy =
4074  PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
4075 
4076  auto AS = SI->getPointerAddressSpace();
4077  StoreInst *PStore = IRB.CreateAlignedStore(
4078  PLoad,
4079  getAdjustedPtr(IRB, DL, StoreBasePtr,
4080  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4081  PartPtrTy, StoreBasePtr->getName() + "."),
4082  getAdjustedAlignment(SI, PartOffset),
4083  /*IsVolatile*/ false);
4084  PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4085  LLVMContext::MD_access_group});
4086  LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4087  }
4088 
4089  // We want to immediately iterate on any allocas impacted by splitting
4090  // this store, and we have to track any promotable alloca (indicated by
4091  // a direct store) as needing to be resplit because it is no longer
4092  // promotable.
4093  if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
4094  ResplitPromotableAllocas.insert(OtherAI);
4095  Worklist.insert(OtherAI);
4096  } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4097  StoreBasePtr->stripInBoundsOffsets())) {
4098  Worklist.insert(OtherAI);
4099  }
4100 
4101  // Mark the original store as dead.
4102  DeadInsts.push_back(SI);
4103  }
4104 
4105  // Save the split loads if there are deferred stores among the users.
4106  if (DeferredStores)
4107  SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
4108 
4109  // Mark the original load as dead and kill the original slice.
4110  DeadInsts.push_back(LI);
4111  Offsets.S->kill();
4112  }
4113 
4114  // Second, we rewrite all of the split stores. At this point, we know that
4115  // all loads from this alloca have been split already. For stores of such
4116  // loads, we can simply look up the pre-existing split loads. For stores of
4117  // other loads, we split those loads first and then write split stores of
4118  // them.
4119  for (StoreInst *SI : Stores) {
4120  auto *LI = cast<LoadInst>(SI->getValueOperand());
4121  IntegerType *Ty = cast<IntegerType>(LI->getType());
4122  assert(Ty->getBitWidth() % 8 == 0);
4123  uint64_t StoreSize = Ty->getBitWidth() / 8;
4124  assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4125 
4126  auto &Offsets = SplitOffsetsMap[SI];
4127  assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4128  "Slice size should always match load size exactly!");
4129  uint64_t BaseOffset = Offsets.S->beginOffset();
4130  assert(BaseOffset + StoreSize > BaseOffset &&
4131  "Cannot represent alloca access size using 64-bit integers!");
4132 
4133  Value *LoadBasePtr = LI->getPointerOperand();
4134  Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
4135 
4136  LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4137 
4138  // Check whether we have an already split load.
4139  auto SplitLoadsMapI = SplitLoadsMap.find(LI);
4140  std::vector<LoadInst *> *SplitLoads = nullptr;
4141  if (SplitLoadsMapI != SplitLoadsMap.end()) {
4142  SplitLoads = &SplitLoadsMapI->second;
4143  assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
4144  "Too few split loads for the number of splits in the store!");
4145  } else {
4146  LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
4147  }
4148 
4149  uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4150  int Idx = 0, Size = Offsets.Splits.size();
4151  for (;;) {
4152  auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
4153  auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
4154  auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
4155 
4156  // Either lookup a split load or create one.
4157  LoadInst *PLoad;
4158  if (SplitLoads) {
4159  PLoad = (*SplitLoads)[Idx];
4160  } else {
4161  IRB.SetInsertPoint(LI);
4162  auto AS = LI->getPointerAddressSpace();
4163  PLoad = IRB.CreateAlignedLoad(
4164  PartTy,
4165  getAdjustedPtr(IRB, DL, LoadBasePtr,
4166  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4167  LoadPartPtrTy, LoadBasePtr->getName() + "."),
4168  getAdjustedAlignment(LI, PartOffset),
4169  /*IsVolatile*/ false, LI->getName());
4170  PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4171  LLVMContext::MD_access_group});
4172  }
4173 
4174  // And store this partition.
4175  IRB.SetInsertPoint(SI);
4176  auto AS = SI->getPointerAddressSpace();
4177  StoreInst *PStore = IRB.CreateAlignedStore(
4178  PLoad,
4179  getAdjustedPtr(IRB, DL, StoreBasePtr,
4180  APInt(DL.getIndexSizeInBits(AS), PartOffset),
4181  StorePartPtrTy, StoreBasePtr->getName() + "."),
4182  getAdjustedAlignment(SI, PartOffset),
4183  /*IsVolatile*/ false);
4184  PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4185  LLVMContext::MD_access_group});
4186 
4187  // Now build a new slice for the alloca.
4188  NewSlices.push_back(
4189  Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4190  &PStore->getOperandUse(PStore->getPointerOperandIndex()),
4191  /*IsSplittable*/ false));
4192  LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4193  << ", " << NewSlices.back().endOffset()
4194  << "): " << *PStore << "\n");
4195  if (!SplitLoads) {
4196  LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
4197  }
4198 
4199  // See if we've finished all the splits.
4200  if (Idx >= Size)
4201  break;
4202 
4203  // Setup the next partition.
4204  PartOffset = Offsets.Splits[Idx];
4205  ++Idx;
4206  PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
4207  }
4208 
4209  // We want to immediately iterate on any allocas impacted by splitting
4210  // this load, which is only relevant if it isn't a load of this alloca and
4211  // thus we didn't already split the loads above. We also have to keep track
4212  // of any promotable allocas we split loads on as they can no longer be
4213  // promoted.
4214  if (!SplitLoads) {
4215  if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
4216  assert(OtherAI != &AI && "We can't re-split our own alloca!");
4217  ResplitPromotableAllocas.insert(OtherAI);
4218  Worklist.insert(OtherAI);
4219  } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4220  LoadBasePtr->stripInBoundsOffsets())) {
4221  assert(OtherAI != &AI && "We can't re-split our own alloca!");
4222  Worklist.insert(OtherAI);
4223  }
4224  }
4225 
4226  // Mark the original store as dead now that we've split it up and kill its
4227  // slice. Note that we leave the original load in place unless this store
4228  // was its only use. It may in turn be split up if it is an alloca load
4229  // for some other alloca, but it may be a normal load. This may introduce
4230  // redundant loads, but where those can be merged the rest of the optimizer
4231  // should handle the merging, and this uncovers SSA splits which is more
4232  // important. In practice, the original loads will almost always be fully
4233  // split and removed eventually, and the splits will be merged by any
4234  // trivial CSE, including instcombine.
4235  if (LI->hasOneUse()) {
4236  assert(*LI->user_begin() == SI && "Single use isn't this store!");
4237  DeadInsts.push_back(LI);
4238  }
4239  DeadInsts.push_back(SI);
4240  Offsets.S->kill();
4241  }
4242 
4243  // Remove the killed slices that have been pre-split.
4244  llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
4245 
4246  // Insert our new slices. This will sort and merge them into the sorted
4247  // sequence.
4248  AS.insert(NewSlices);
4249 
4250  LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
4251 #ifndef NDEBUG
4252  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
4253  LLVM_DEBUG(AS.print(dbgs(), I, " "));
4254 #endif
4255 
4256  // Finally, don't try to promote any allocas that now require re-splitting.
4257  // They have already been added to the worklist above.
4258  llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) {
4259  return ResplitPromotableAllocas.count(AI);
4260  });
4261 
4262  return true;
4263 }
4264 
4265 /// Rewrite an alloca partition's users.
4266 ///
4267 /// This routine drives both of the rewriting goals of the SROA pass. It tries
4268 /// to rewrite uses of an alloca partition to be conducive for SSA value
4269 /// promotion. If the partition needs a new, more refined alloca, this will
4270 /// build that new alloca, preserving as much type information as possible, and
4271 /// rewrite the uses of the old alloca to point at the new one and have the
4272 /// appropriate new offsets. It also evaluates how successful the rewrite was
4273 /// at enabling promotion and if it was successful queues the alloca to be
4274 /// promoted.
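///
/// For example, an alloca of { i32, i32 } accessed only through its two
/// fields is typically split into two partitions, each rewritten to its own
/// i32 alloca that mem2reg can then promote.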
4275 AllocaInst *SROAPass::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
4276  Partition &P) {
4277  // Try to compute a friendly type for this partition of the alloca. This
4278  // won't always succeed, in which case we fall back to a legal integer type
4279  // or an i8 array of an appropriate size.
4280  Type *SliceTy = nullptr;
4281  VectorType *SliceVecTy = nullptr;
4282  const DataLayout &DL = AI.getModule()->getDataLayout();
4283  std::pair<Type *, IntegerType *> CommonUseTy =
4284  findCommonType(P.begin(), P.end(), P.endOffset());
4285  // Do all uses operate on the same type?
4286  if (CommonUseTy.first)
4287  if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size()) {
4288  SliceTy = CommonUseTy.first;
4289  SliceVecTy = dyn_cast<VectorType>(SliceTy);
4290  }
4291  // If not, can we find an appropriate subtype in the original allocated type?
4292  if (!SliceTy)
4293  if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
4294  P.beginOffset(), P.size()))
4295  SliceTy = TypePartitionTy;
4296 
4297  // If still not, can we use the largest bitwidth integer type used?
4298  if (!SliceTy && CommonUseTy.second)
4299  if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size()) {
4300  SliceTy = CommonUseTy.second;
4301  SliceVecTy = dyn_cast<VectorType>(SliceTy);
4302  }
4303  if ((!SliceTy || (SliceTy->isArrayTy() &&
4304  SliceTy->getArrayElementType()->isIntegerTy())) &&
4305  DL.isLegalInteger(P.size() * 8)) {
4306  SliceTy = Type::getIntNTy(*C, P.size() * 8);
4307  }
4308 
4309  // If the common use types are not viable for promotion then attempt to find
4310  // another type that is viable.
4311  if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL))
4312  if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
4313  P.beginOffset(), P.size())) {
4314  VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
4315  if (TypePartitionVecTy &&
4316  checkVectorTypeForPromotion(P, TypePartitionVecTy, DL))
4317  SliceTy = TypePartitionTy;
4318  }
4319 
4320  if (!SliceTy)
4321  SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
4322  assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
4323 
4324  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
4325 
4326  VectorType *VecTy =
4327  IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
4328  if (VecTy)
4329  SliceTy = VecTy;
4330 
4331  // Check for the case where we're going to rewrite to a new alloca of the
4332  // exact same type as the original, and with the same access offsets. In that
4333  // case, re-use the existing alloca, but still run through the rewriter to
4334  // perform phi and select speculation.
4335  // P.beginOffset() can be non-zero even with the same type in a case with
4336  // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
4337  AllocaInst *NewAI;
4338  if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
4339  NewAI = &AI;
4340  // FIXME: We should be able to bail at this point with "nothing changed".
4341  // FIXME: We might want to defer PHI speculation until after here.
4342  // FIXME: return nullptr;
4343  } else {
4344  // Make sure the alignment is compatible with P.beginOffset().
4345  const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
4346  // If we will get at least this much alignment from the type alone, leave
4347  // the alloca's alignment unconstrained.
4348  const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
4349  NewAI = new AllocaInst(
4350  SliceTy, AI.getAddressSpace(), nullptr,
4351  IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
4352  AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
4353  // Copy the old AI debug location over to the new one.
4354  NewAI->setDebugLoc(AI.getDebugLoc());
4355  ++NumNewAllocas;
4356  }
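 // For illustration (hypothetical names): a partition of %x whose first slice
 // is the third entry of AS would produce an alloca named "%x.sroa.2" here,
 // per the Twine(P.begin() - AS.begin()) suffix above.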
4357 
4358  LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
4359  << "[" << P.beginOffset() << "," << P.endOffset()
4360  << ") to: " << *NewAI << "\n");
4361 
4362  // Track the high watermark on the worklist as it is only relevant for
4363  // promoted allocas. We will reset it to this point if the alloca is not in
4364  // fact scheduled for promotion.
4365  unsigned PPWOldSize = PostPromotionWorklist.size();
4366  unsigned NumUses = 0;
4367  SmallSetVector<PHINode *, 8> PHIUsers;
4368  SmallSetVector<SelectInst *, 8> SelectUsers;
4369 
4370  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
4371  P.endOffset(), IsIntegerPromotable, VecTy,
4372  PHIUsers, SelectUsers);
4373  bool Promotable = true;
4374  for (Slice *S : P.splitSliceTails()) {
4375  Promotable &= Rewriter.visit(S);
4376  ++NumUses;
4377  }
4378  for (Slice &S : P) {
4379  Promotable &= Rewriter.visit(&S);
4380  ++NumUses;
4381  }
4382 
4383  NumAllocaPartitionUses += NumUses;
4384  MaxUsesPerAllocaPartition.updateMax(NumUses);
4385 
4386  // Now that we've processed all the slices in the new partition, check if any
4387  // PHIs or Selects would block promotion.
4388  for (PHINode *PHI : PHIUsers)
4389  if (!isSafePHIToSpeculate(*PHI)) {
4390  Promotable = false;
4391  PHIUsers.clear();
4392  SelectUsers.clear();
4393  break;
4394  }
4395 
4396  for (SelectInst *Sel : SelectUsers)
4397  if (!isSafeSelectToSpeculate(*Sel)) {
4398  Promotable = false;
4399  PHIUsers.clear();
4400  SelectUsers.clear();
4401  break;
4402  }
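 // For illustration (assumed IR, not from the source): a user such as
 //   %p = phi ptr [ %new.alloca, %bb0 ], [ %other, %bb1 ]
 // that is only loaded from can be speculated by hoisting the load into the
 // predecessors; the checks above reject PHIs and selects where that rewrite
 // would be unsafe, which in turn blocks promotion of this partition for now.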
4403 
4404  if (Promotable) {
4405  for (Use *U : AS.getDeadUsesIfPromotable()) {
4406  auto *OldInst = dyn_cast<Instruction>(U->get());
4407  Value::dropDroppableUse(*U);
4408  if (OldInst)
4409  if (isInstructionTriviallyDead(OldInst))
4410  DeadInsts.push_back(OldInst);
4411  }
4412  if (PHIUsers.empty() && SelectUsers.empty()) {
4413  // Promote the alloca.
4414  PromotableAllocas.push_back(NewAI);
4415  } else {
4416  // If we have either PHIs or Selects to speculate, add them to those
4417  // worklists and re-queue the new alloca so that we promote it on the
4418  // next iteration.
4419  for (PHINode *PHIUser : PHIUsers)
4420  SpeculatablePHIs.insert(PHIUser);
4421  for (SelectInst *SelectUser : SelectUsers)
4422  SpeculatableSelects.insert(SelectUser);
4423  Worklist.insert(NewAI);
4424  }
4425  } else {
4426  // Drop any post-promotion work items if promotion didn't happen.
4427  while (PostPromotionWorklist.size() > PPWOldSize)
4428  PostPromotionWorklist.pop_back();
4429 
4430  // We couldn't promote and we didn't create a new partition, nothing
4431  // happened.
4432  if (NewAI == &AI)
4433  return nullptr;
4434 
4435  // If we can't promote the alloca, iterate on it to check for new
4436  // refinements exposed by splitting the current alloca. Don't iterate on an
4437  // alloca which didn't actually change and didn't get promoted.
4438  Worklist.insert(NewAI);
4439  }
4440 
4441  return NewAI;
4442 }
4443 
4444 /// Walks the slices of an alloca and forms partitions based on them,
4445 /// rewriting each of their uses.
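/// In outline (summarized from the body below): pre-split loads and stores,
/// conservatively mark the remaining splittable loads and stores, re-sort the
/// slices if needed, rewrite each partition with rewritePartition, and finally
/// migrate any dbg.declare users onto the new allocas as fragment expressions.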
4446 bool SROAPass::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
4447  if (AS.begin() == AS.end())
4448  return false;
4449 
4450  unsigned NumPartitions = 0;
4451  bool Changed = false;
4452  const DataLayout &DL = AI.getModule()->getDataLayout();
4453 
4454  // First try to pre-split loads and stores.
4455  Changed |= presplitLoadsAndStores(AI, AS);
4456 
4457  // Now that we have identified any pre-splitting opportunities,
4458  // mark loads and stores unsplittable except for the following case.
4459  // We leave a slice splittable if all other slices are disjoint or fully
4460  // included in the slice, such as whole-alloca loads and stores.
4461  // If we fail to split these during pre-splitting, we want to force them
4462  // to be rewritten into a partition.
4463  bool IsSorted = true;
4464 
4465  uint64_t AllocaSize =
4466  DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
4467  const uint64_t MaxBitVectorSize = 1024;
4468  if (AllocaSize <= MaxBitVectorSize) {
4469  // If a byte boundary falls strictly inside any load or store, a slice
4470  // starting or ending at that boundary is not splittable.
4471  SmallBitVector SplittableOffset(AllocaSize + 1, true);
4472  for (Slice &S : AS)
4473  for (unsigned O = S.beginOffset() + 1;
4474  O < S.endOffset() && O < AllocaSize; O++)
4475  SplittableOffset.reset(O);
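 // For illustration (assumed access, not from the source): an i32 load
 // covering bytes [4,8) clears offsets 5, 6 and 7 here, so a load or store
 // slice that begins or ends at one of those interior offsets is made
 // unsplittable below, while boundaries at 4 and 8 remain splittable.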
4476 
4477  for (Slice &S : AS) {
4478  if (!S.isSplittable())
4479  continue;
4480 
4481  if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
4482  (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
4483  continue;
4484 
4485  if (isa<LoadInst>(S.getUse()->getUser()) ||
4486  isa<StoreInst>(S.getUse()->getUser())) {
4487  S.makeUnsplittable();
4488  IsSorted = false;
4489  }
4490  }
4491  }
4492  else {
4493  // We only allow whole-alloca splittable loads and stores
4494  // for a large alloca to avoid creating too large BitVector.
4495  for (Slice &S : AS) {
4496  if (!S.isSplittable())
4497  continue;
4498 
4499  if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
4500  continue;
4501 
4502  if (isa<LoadInst>(S.getUse()->getUser()) ||
4503  isa<StoreInst>(S.getUse()->getUser())) {
4504  S.makeUnsplittable();
4505  IsSorted = false;
4506  }
4507  }
4508  }
4509 
4510  if (!IsSorted)
4511  llvm::sort(AS);
4512 
4513  /// Describes the allocas introduced by rewritePartition in order to migrate
4514  /// the debug info.
4515  struct Fragment {
4516  AllocaInst *Alloca;
4517  uint64_t Offset;
4518  uint64_t Size;
4519  Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
4520  : Alloca(AI), Offset(O), Size(S) {}
4521  };
4522  SmallVector<Fragment, 4> Fragments;
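 // For illustration (assumed sizes): each Fragment records a replacement
 // alloca plus its bit offset and bit size within the original variable, so
 // splitting a 128-bit aggregate into two 64-bit partitions would record
 // entries like (NewAI0, 0, 64) and (NewAI1, 64, 64) for the debug-info
 // rewrite below.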
4523 
4524  // Rewrite each partition.
4525  for (auto &P : AS.partitions()) {
4526  if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
4527  Changed = true;
4528  if (NewAI != &AI) {
4529  uint64_t SizeOfByte = 8;
4530  uint64_t AllocaSize =
4531  DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedSize();
4532  // Don't include any padding.
4533  uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
4534  Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
4535  }
4536  }
4537  ++NumPartitions;
4538  }
4539 
4540  NumAllocaPartitions += NumPartitions;
4541  MaxPartitionsPerAlloca.updateMax(NumPartitions);
4542 
4543  // Migrate debug information from the old alloca to the new alloca(s)
4544  // and the individual partitions.
4545  TinyPtrVector<DbgVariableIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
4546  for (DbgVariableIntrinsic *DbgDeclare : DbgDeclares) {
4547  auto *Expr = DbgDeclare->getExpression();
4548  DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
4549  uint64_t AllocaSize =
4550  DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedSize();
4551  for (auto Fragment : Fragments) {
4552  // Create a fragment expression describing the new partition or reuse AI's
4553  // expression if there is only one partition.
4554  auto *FragmentExpr = Expr;
4555  if (Fragment.Size < AllocaSize || Expr->isFragment()) {
4556  // If this alloca is already a scalar replacement of a larger aggregate,
4557  // Fragment.Offset describes the offset inside the scalar.
4558  auto ExprFragment = Expr->getFragmentInfo();
4559  uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
4560  uint64_t Start = Offset + Fragment.Offset;
4561  uint64_t Size = Fragment.Size;
4562  if (ExprFragment) {
4563  uint64_t AbsEnd =
4564  ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
4565  if (Start >= AbsEnd)
4566  // No need to describe a SROAed padding.
4567  continue;
4568  Size = std::min(Size, AbsEnd - Start);
4569  }
4570  // The new, smaller fragment is stenciled out from the old fragment.
4571  if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
4572  assert(Start >= OrigFragment->OffsetInBits &&
4573  "new fragment is outside of original fragment");
4574  Start -= OrigFragment->OffsetInBits;
4575  }
4576 
4577  // The alloca may be larger than the variable.
4578  auto VarSize = DbgDeclare->getVariable()->getSizeInBits();
4579  if (VarSize) {
4580  if (Size > *VarSize)
4581  Size = *VarSize;
4582  if (Size == 0 || Start + Size > *VarSize)
4583  continue;
4584  }
4585 
4586  // Avoid creating a fragment expression that covers the entire variable.
4587  if (!VarSize || *VarSize != Size) {
4588  if (auto E =
4589  DIExpression::createFragmentExpression(Expr, Start, Size))
4590  FragmentExpr = *E;
4591  else
4592  continue;
4593  }
4594  }
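 // For illustration (assumed variable, not from the source): for a 128-bit
 // variable split in half, the second partition would typically end up with a
 // DW_OP_LLVM_fragment(64, 64) appended to its expression by
 // createFragmentExpression above.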
4595 
4596  // Remove any existing intrinsics on the new alloca describing
4597  // the variable fragment.
4598  for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca)) {
4599  auto SameVariableFragment = [](const DbgVariableIntrinsic *LHS,
4600  const DbgVariableIntrinsic *RHS) {
4601  return LHS->getVariable() == RHS->getVariable() &&
4602  LHS->getDebugLoc()->getInlinedAt() ==
4603  RHS->getDebugLoc()->getInlinedAt();
4604  };
4605  if (SameVariableFragment(OldDII, DbgDeclare))
4606  OldDII->eraseFromParent();
4607  }
4608 
4609  DIB.insertDeclare(Fragment.Alloca, DbgDeclare->getVariable(), FragmentExpr,
4610  DbgDeclare->getDebugLoc(), &AI);
4611  }
4612  }
4613  return Changed;
4614 }
4615 
4616 /// Clobber a use with poison, deleting the used value if it becomes dead.
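/// Illustrative example (assumed IR): clobbering the pointer operand of a dead
///   store i32 0, ptr %gep
/// rewrites it to store through poison, and if %gep then has no other uses it
/// is queued in DeadInsts for deletion.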
4617 void SROAPass::clobberUse(Use &U) {
4618  Value *OldV = U;
4619  // Replace the use with a poison value.
4620  U = PoisonValue::get(OldV->getType());
4621 
4622  // Check for this making an instruction dead. We have to garbage collect
4623  // all the dead instructions to ensure the uses of any alloca end up being
4624  // minimal.
4625  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
4626  if (isInstructionTriviallyDead(OldI)) {
4627  DeadInsts.push_back(OldI);
4628  }
4629 }
4630 
4631 /// Analyze an alloca for SROA.
4632 ///
4633 /// This analyzes the alloca to ensure we can reason about it, builds
4634 /// the slices of the alloca, and then hands it off to be split and
4635 /// rewritten as needed.
4636 bool SROAPass::runOnAlloca(AllocaInst &AI) {
4637  LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
4638  ++NumAllocasAnalyzed;
4639 
4640  // Special case dead allocas, as they're trivial.
4641  if (AI.use_empty()) {
4642  AI.eraseFromParent();
4643  return true;
4644  }
4645  const DataLayout &DL = AI.getModule()->getDataLayout();
4646 
4647  // Skip alloca forms that this analysis can't handle.
4648  auto *AT = AI.getAllocatedType();
4649  if (AI.isArrayAllocation() || !AT->isSized() || isa<ScalableVectorType>(AT) ||
4650  DL.getTypeAllocSize(AT).getFixedSize() == 0)
4651  return false;
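 // For illustration (assumed IR): dynamic array allocations such as
 //   %a = alloca i32, i32 %n
 // scalable-vector allocas, and zero-sized allocations are all skipped here.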
4652 
4653  bool Changed = false;
4654 
4655  // First, split any FCA loads and stores touching this alloca to promote
4656  // better splitting and promotion opportunities.
4657  IRBuilderTy IRB(&AI);
4658  AggLoadStoreRewriter AggRewriter(DL, IRB);
4659  Changed |= AggRewriter.rewrite(AI);
4660 
4661  // Build the slices using a recursive instruction-visiting builder.
4662  AllocaSlices AS(DL, AI);
4663  LLVM_DEBUG(AS.print(dbgs()));
4664  if (AS.isEscaped())
4665  return Changed;
4666 
4667  // Delete all the dead users of this alloca before splitting and rewriting it.
4668  for (Instruction *DeadUser : AS.getDeadUsers()) {
4669  // Free up everything used by this instruction.
4670  for (Use &DeadOp : DeadUser->operands())
4671  clobberUse(DeadOp);
4672 
4673  // Now replace the uses of this instruction.
4674  DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
4675 
4676  // And mark it for deletion.
4677  DeadInsts.push_back(DeadUser);
4678  Changed = true;
4679  }
4680  for (Use *DeadOp : AS.getDeadOperands()) {
4681  clobberUse(*DeadOp);
4682  Changed = true;
4683  }
4684 
4685  // No slices to split. Leave the dead alloca for a later pass to clean up.
4686  if (AS.begin() == AS.end())
4687  return Changed;
4688 
4689  Changed |= splitAlloca(AI, AS);
4690 
4691  LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
4692  while (!SpeculatablePHIs.empty())
4693  speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
4694 
4695  LLVM_DEBUG(dbgs() << " Speculating Selects\n");
4696  while (!SpeculatableSelects.empty())
4697  speculateSelectInstLoads(IRB, *SpeculatableSelects.pop_back_val());
4698 
4699  return Changed;
4700 }
4701 
4702 /// Delete the dead instructions accumulated in this run.
4703 ///
4704 /// Recursively deletes the dead instructions we've accumulated. This is done
4705 /// at the very end to maximize locality of the recursive delete and to
4706 /// minimize the problems of invalidated instruction pointers as such pointers
4707 /// are used heavily in the intermediate stages of the algorithm.
4708 ///
4709 /// We also record the alloca instructions deleted here so that they aren't
4710 /// subsequently handed to mem2reg to promote.
4711 bool SROAPass::deleteDeadInstructions(
4712  SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
4713  bool Changed = false;
4714  while (!DeadInsts.empty()) {
4715  Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
4716  if (!I) continue;
4717  LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
4718 
4719  // If the instruction is an alloca, find the possible dbg.declare connected
4720  // to it, and remove it too. We must do this before calling RAUW or we will
4721  // not be able to find it.
4722  if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
4723  DeletedAllocas.insert(AI);
4724  for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(AI))
4725  OldDII->eraseFromParent();
4726  }
4727 
4728  I->replaceAllUsesWith(UndefValue::get(I->getType()));
4729 
4730  for (Use &Operand : I->operands())
4731  if (Instruction *U = dyn_cast<Instruction>(Operand)) {
4732  // Zero out the operand and see if it becomes trivially dead.
4733  Operand = nullptr;
4734  if (isInstructionTriviallyDead(U))
4735  DeadInsts.push_back(U);
4736  }
4737 
4738  ++NumDeleted;
4739  I->eraseFromParent();
4740  Changed = true;
4741  }
4742  return Changed;
4743 }
4744 
4745 /// Promote the allocas, using the best available technique.
4746 ///
4747 /// This attempts to promote whatever allocas have been identified as viable in
4748 /// the PromotableAllocas list. If that list is empty, there is nothing to do.
4749 /// This function returns whether any promotion occurred.
4750 bool SROAPass::promoteAllocas(Function &F) {
4751  if (PromotableAllocas.empty())
4752  return false;
4753 
4754  NumPromoted += PromotableAllocas.size();
4755 
4756  LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
4757  PromoteMemToReg(PromotableAllocas, *DT, AC);
4758  PromotableAllocas.clear();
4759  return true;
4760 }
4761 
4762 PreservedAnalyses SROAPass::runImpl(Function &F, DominatorTree &RunDT,
4763  AssumptionCache &RunAC) {
4764  LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
4765  C = &F.getContext();
4766  DT = &RunDT;
4767  AC = &RunAC;
4768 
4769  BasicBlock &EntryBB = F.getEntryBlock();
4770  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
4771  I != E; ++I) {
4772  if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
4773  if (isa<ScalableVectorType>(AI->getAllocatedType())) {
4774  if (isAllocaPromotable(AI))
4775  PromotableAllocas.push_back(AI);
4776  } else {
4777  Worklist.insert(AI);
4778  }
4779  }
4780  }
4781 
4782  bool Changed = false;
4783  // A set of deleted alloca instruction pointers which should be removed from
4784  // the list of promotable allocas.
4785  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
4786 
4787  do {
4788  while (!Worklist.empty()) {
4789  Changed |= runOnAlloca(*Worklist.pop_back_val());
4790  Changed |= deleteDeadInstructions(DeletedAllocas);
4791 
4792  // Remove the deleted allocas from various lists so that we don't try to
4793  // continue processing them.
4794  if (!DeletedAllocas.empty()) {
4795  auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
4796  Worklist.remove_if(IsInSet);
4797  PostPromotionWorklist.remove_if(IsInSet);
4798  llvm::erase_if(PromotableAllocas, IsInSet);
4799  DeletedAllocas.clear();
4800  }
4801  }
4802 
4803  Changed |= promoteAllocas(F);
4804 
4805  Worklist = PostPromotionWorklist;
4806  PostPromotionWorklist.clear();
4807  } while (!Worklist.empty());
4808 
4809  if (!Changed)
4810  return PreservedAnalyses::all();
4811 
4812  PreservedAnalyses PA;
4813  PA.preserveSet<CFGAnalyses>();
4814  return PA;
4815 }
4816 
4817 PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
4818  return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
4819  AM.getResult<AssumptionAnalysis>(F));
4820 }
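// Usage sketch (illustrative; standard new-pass-manager boilerplate that is
// not part of this file, names assumed):
//   llvm::FunctionPassManager FPM;
//   FPM.addPass(llvm::SROAPass());
//   FPM.run(F, FAM); // FAM must have the required analyses registered.
// The pass is also reachable from the command line, e.g. `opt -passes=sroa`.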
4821 
4822 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
4823 ///
4824 /// This is in the llvm namespace purely to allow it to be a friend of the \c
4825 /// SROA pass.
4826 class llvm::sroa::SROALegacyPass : public FunctionPass {
4827  /// The SROA implementation.
4828  SROAPass Impl;
4829 
4830 public:
4831  static char ID;
4832 
4833  SROALegacyPass() : FunctionPass(ID) {
4834  initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
4835  }
4836 
4837  bool runOnFunction(Function &F) override {
4838  if (skipFunction(F))
4839  return false;
4840 
4841  auto PA = Impl.runImpl(
4842  F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
4843  getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
4844  return !PA.areAllPreserved();
4845  }
4846 
4847  void getAnalysisUsage(AnalysisUsage &AU) const override {
4848  AU.addRequired<AssumptionCacheTracker>();
4849  AU.addRequired<DominatorTreeWrapperPass>();
4850  AU.addPreserved<GlobalsAAWrapperPass>();
4851  AU.setPreservesCFG();
4852  }
4853 
4854  StringRef getPassName() const override { return "SROA"; }
4855 };
4856 
4857 char SROALegacyPass::ID = 0;
4858 
4859 FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
4860 
4861 INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
4862  "Scalar Replacement Of Aggregates", false, false)
4863 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
4864 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
4865 INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
4866  false, false)