SROA.cpp
1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
19/// Because this also performs alloca promotion, it can be thought of as also
20/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
22///
23//===----------------------------------------------------------------------===//
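// Illustrative sketch (not part of the upstream file): the kind of rewrite
// this pass performs. The IR names below are invented for the example.
//
//   %pair = alloca { i32, i32 }
//   %a = getelementptr inbounds { i32, i32 }, ptr %pair, i32 0, i32 0
//   store i32 1, ptr %a
//   %v = load i32, ptr %a
//
// After slicing the alloca and promoting the resulting scalar slice, the load
// simply becomes the SSA value `i32 1` and the alloca disappears.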
24
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
44#include "llvm/Analysis/Loads.h"
47#include "llvm/Config/llvm-config.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/LLVMContext.h"
66#include "llvm/IR/Metadata.h"
67#include "llvm/IR/Module.h"
68#include "llvm/IR/Operator.h"
69#include "llvm/IR/PassManager.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
76#include "llvm/Pass.h"
80#include "llvm/Support/Debug.h"
88#include <algorithm>
89#include <cassert>
90#include <cstddef>
91#include <cstdint>
92#include <cstring>
93#include <iterator>
94#include <queue>
95#include <string>
96#include <tuple>
97#include <utility>
98#include <variant>
99#include <vector>
100
101using namespace llvm;
102
103#define DEBUG_TYPE "sroa"
104
105STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
106STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
107STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
108STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
109STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
110STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
111STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
112STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
113STATISTIC(NumLoadsPredicated,
114 "Number of loads rewritten into predicated loads to allow promotion");
115STATISTIC(
116 NumStoresPredicated,
117 "Number of stores rewritten into predicated stores to allow promotion");
118STATISTIC(NumDeleted, "Number of instructions deleted");
119STATISTIC(NumVectorized, "Number of vectorized aggregates");
120
121/// Disable running mem2reg during SROA in order to test or debug SROA.
122static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
123 cl::Hidden);
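// Usage note (not part of the upstream file): the hidden flag above can be
// passed to the opt tool, e.g. `opt -passes=sroa -sroa-skip-mem2reg`, to
// inspect the rewritten-but-unpromoted allocas while debugging this pass.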
124namespace {
125
126class AllocaSliceRewriter;
127class AllocaSlices;
128class Partition;
129
130class SelectHandSpeculativity {
131 unsigned char Storage = 0; // None are speculatable by default.
132 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
133 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
134public:
135 SelectHandSpeculativity() = default;
136 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
137 bool isSpeculatable(bool isTrueVal) const;
138 bool areAllSpeculatable() const;
139 bool areAnySpeculatable() const;
140 bool areNoneSpeculatable() const;
141 // For interop as int half of PointerIntPair.
142 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
143 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
144};
145static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
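// Usage sketch (illustrative only, based on the member functions declared
// above):
//
//   SelectHandSpeculativity Spec;
//   Spec.setAsSpeculatable(/*isTrueVal=*/true);
//   // Now Spec.isSpeculatable(true) holds, Spec.areAnySpeculatable() holds,
//   // and Spec.areAllSpeculatable() does not.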
146
147using PossiblySpeculatableLoad =
148 PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
149using UnspeculatableStore = StoreInst *;
150using RewriteableMemOp =
151 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
152using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
153
154/// An optimization pass providing Scalar Replacement of Aggregates.
155///
156/// This pass takes allocations which can be completely analyzed (that is, they
157/// don't escape) and tries to turn them into scalar SSA values. There are
158/// a few steps to this process.
159///
160/// 1) It takes allocations of aggregates and analyzes the ways in which they
161/// are used to try to split them into smaller allocations, ideally of
162/// a single scalar data type. It will split up memcpy and memset accesses
163/// as necessary and try to isolate individual scalar accesses.
164/// 2) It will transform accesses into forms which are suitable for SSA value
165/// promotion. This can be replacing a memset with a scalar store of an
166/// integer value, or it can involve speculating operations on a PHI or
167/// select to be a PHI or select of the results.
168/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
169/// onto insert and extract operations on a vector value, and convert them to
170/// this form. By doing so, it will enable promotion of vector aggregates to
171/// SSA vector values.
172class SROA {
173 LLVMContext *const C;
174 DomTreeUpdater *const DTU;
175 AssumptionCache *const AC;
176 const bool PreserveCFG;
177
178 /// Worklist of alloca instructions to simplify.
179 ///
180 /// Each alloca in the function is added to this. Each new alloca formed gets
181 /// added to it as well to recursively simplify unless that alloca can be
182 /// directly promoted. Finally, each time we rewrite a use of an alloca other
183 /// than the one being actively rewritten, we add it back onto the list if not
184 /// already present to ensure it is re-visited.
185 SmallSetVector<AllocaInst *, 16> Worklist;
186
187 /// A collection of instructions to delete.
188 /// We try to batch deletions to simplify code and make things a bit more
189 /// efficient. We also make sure there are no dangling pointers.
190 SmallVector<WeakVH, 8> DeadInsts;
191
192 /// Post-promotion worklist.
193 ///
194 /// Sometimes we discover an alloca which has a high probability of becoming
195 /// viable for SROA after a round of promotion takes place. In those cases,
196 /// the alloca is enqueued here for re-processing.
197 ///
198 /// Note that we have to be very careful to clear allocas out of this list in
199 /// the event they are deleted.
200 SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;
201
202 /// A collection of alloca instructions we can directly promote.
203 SetVector<AllocaInst *, SmallVector<AllocaInst *>,
204 SmallPtrSet<AllocaInst *, 16>, 16>
205 PromotableAllocas;
206
207 /// A worklist of PHIs to speculate prior to promoting allocas.
208 ///
209 /// All of these PHIs have been checked for the safety of speculation and by
210 /// being speculated will allow promoting allocas currently in the promotable
211 /// queue.
212 SmallSetVector<PHINode *, 8> SpeculatablePHIs;
213
214 /// A worklist of select instructions to rewrite prior to promoting
215 /// allocas.
216 SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
217
218 /// Select instructions that use an alloca and are subsequently loaded can be
219 /// rewritten to load both input pointers and then select between the result,
220 /// allowing the load of the alloca to be promoted.
221 /// From this:
222 /// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
223 /// %V = load <type>, ptr %P2
224 /// to:
225 /// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
226 /// %V2 = load <type>, ptr %Other
227 /// %V = select i1 %cond, <type> %V1, <type> %V2
228 ///
229 /// We can do this to a select if its only uses are loads
230 /// and if either the operands of the select can be loaded unconditionally
231 /// or we are allowed to perform CFG modifications.
232 /// If we find an intervening bitcast with a single use of the load,
233 /// we allow the promotion.
234 static std::optional<RewriteableMemOps>
235 isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
236
237public:
238 SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
239 SROAOptions PreserveCFG_)
240 : C(C), DTU(DTU), AC(AC),
241 PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
242
243 /// Main run method used by both the SROAPass and by the legacy pass.
244 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);
245
246private:
247 friend class AllocaSliceRewriter;
248
249 bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
250 AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
251 bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
252 bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
253 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
254 void clobberUse(Use &U);
255 bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
256 bool promoteAllocas();
257};
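// Driver sketch (illustrative, simplified from how the pass wrappers invoke
// this class; DT and AC stand in for the caller's DominatorTree and
// AssumptionCache):
//
//   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
//   auto [Changed, CFGChanged] =
//       SROA(&F.getContext(), &DTU, &AC, SROAOptions::PreserveCFG).runSROA(F);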
258
259} // end anonymous namespace
260
261/// Calculate the fragment of a variable to use when slicing a store
262/// based on the slice dimensions, existing fragment, and base storage
263/// fragment.
264/// Results:
265/// UseFrag - Use Target as the new fragment.
266/// UseNoFrag - The new slice already covers the whole variable.
267/// Skip - The new alloca slice doesn't include this variable.
268/// FIXME: Can we use calculateFragmentIntersect instead?
269namespace {
270enum FragCalcResult { UseFrag, UseNoFrag, Skip };
271}
272static FragCalcResult
273calculateFragment(DILocalVariable *Variable,
274 uint64_t NewStorageSliceOffsetInBits,
275 uint64_t NewStorageSliceSizeInBits,
276 std::optional<DIExpression::FragmentInfo> StorageFragment,
277 std::optional<DIExpression::FragmentInfo> CurrentFragment,
278 DIExpression::FragmentInfo &Target) {
279 // If the base storage describes part of the variable, apply the offset and
280 // the size constraint.
281 if (StorageFragment) {
282 Target.SizeInBits =
283 std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
284 Target.OffsetInBits =
285 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
286 } else {
287 Target.SizeInBits = NewStorageSliceSizeInBits;
288 Target.OffsetInBits = NewStorageSliceOffsetInBits;
289 }
290
291 // If this slice extracts the entirety of an independent variable from a
292 // larger alloca, do not produce a fragment expression, as the variable is
293 // not fragmented.
294 if (!CurrentFragment) {
295 if (auto Size = Variable->getSizeInBits()) {
296 // Treat the current fragment as covering the whole variable.
297 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
298 if (Target == CurrentFragment)
299 return UseNoFrag;
300 }
301 }
302
303 // No additional work to do if there isn't a fragment already, or there is
304 // but it already exactly describes the new assignment.
305 if (!CurrentFragment || *CurrentFragment == Target)
306 return UseFrag;
307
308 // Reject the target fragment if it doesn't fit wholly within the current
309 // fragment. TODO: We could instead chop up the target to fit in the case of
310 // a partial overlap.
311 if (Target.startInBits() < CurrentFragment->startInBits() ||
312 Target.endInBits() > CurrentFragment->endInBits())
313 return Skip;
314
315 // Target fits within the current fragment, return it.
316 return UseFrag;
317}
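// Worked example (illustrative, not from the upstream file): for a 64-bit
// variable with no base storage fragment and no current fragment, a slice
// covering bits [0, 32) returns UseFrag with Target = {OffsetInBits = 0,
// SizeInBits = 32}, whereas a slice covering all 64 bits returns UseNoFrag
// because the slice already describes the entire variable.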
318
319static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
320 return DebugVariable(DVR->getVariable(), std::nullopt,
321 DVR->getDebugLoc().getInlinedAt());
322}
323
324/// Find linked dbg.assign and generate a new one with the correct
325/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
326/// value component is copied from the old dbg.assign to the new.
327/// \param OldAlloca Alloca for the variable before splitting.
328/// \param IsSplit True if the store (not necessarily alloca)
329/// is being split.
330/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
331/// \param SliceSizeInBits New number of bits being written to.
332/// \param OldInst Instruction that is being split.
333/// \param Inst New instruction performing this part of the
334/// split store.
335/// \param Dest Store destination.
336/// \param Value Stored value.
337/// \param DL Datalayout.
338static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
339 uint64_t OldAllocaOffsetInBits,
340 uint64_t SliceSizeInBits, Instruction *OldInst,
341 Instruction *Inst, Value *Dest, Value *Value,
342 const DataLayout &DL) {
343 auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
344 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
345 if (DVRAssignMarkerRange.empty())
346 return;
347
348 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
349 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
350 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
351 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
352 << "\n");
353 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
354 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
355 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
356 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
357 if (Value)
358 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
359
360 /// Map of aggregate variables to their fragment associated with OldAlloca.
361 SmallDenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
362 BaseFragments;
363 for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
364 BaseFragments[getAggregateVariable(DVR)] =
365 DVR->getExpression()->getFragmentInfo();
366
367 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
368 // one). It shouldn't already have one: assert this assumption.
369 assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
370 DIAssignID *NewID = nullptr;
371 auto &Ctx = Inst->getContext();
372 DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
373 assert(OldAlloca->isStaticAlloca());
374
375 auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
376 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
377 << "\n");
378 auto *Expr = DbgAssign->getExpression();
379 bool SetKillLocation = false;
380
381 if (IsSplit) {
382 std::optional<DIExpression::FragmentInfo> BaseFragment;
383 {
384 auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
385 if (R == BaseFragments.end())
386 return;
387 BaseFragment = R->second;
388 }
389 std::optional<DIExpression::FragmentInfo> CurrentFragment =
390 Expr->getFragmentInfo();
391 DIExpression::FragmentInfo NewFragment;
392 FragCalcResult Result = calculateFragment(
393 DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
394 BaseFragment, CurrentFragment, NewFragment);
395
396 if (Result == Skip)
397 return;
398 if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
399 if (CurrentFragment) {
400 // Rewrite NewFragment to be relative to the existing one (this is
401 // what createFragmentExpression wants). CalculateFragment has
402 // already resolved the size for us. FIXME: Should it return the
403 // relative fragment too?
404 NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
405 }
406 // Add the new fragment info to the existing expression if possible.
407 if (auto E = DIExpression::createFragmentExpression(
408 Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
409 Expr = *E;
410 } else {
411 // Otherwise, add the new fragment info to an empty expression and
412 // discard the value component of this dbg.assign as the value cannot
413 // be computed with the new fragment.
414 Expr = *DIExpression::createFragmentExpression(
415 DIExpression::get(Expr->getContext(), {}),
416 NewFragment.OffsetInBits, NewFragment.SizeInBits);
417 SetKillLocation = true;
418 }
419 }
420 }
421
422 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
423 if (!NewID) {
424 NewID = DIAssignID::getDistinct(Ctx);
425 Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
426 }
427
428 ::Value *NewValue = Value ? Value : DbgAssign->getValue();
429 auto *NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
430 DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
431 Dest, DIExpression::get(Expr->getContext(), {}),
432 DbgAssign->getDebugLoc())));
433
434 // If we've updated the value but the original dbg.assign has an arglist
435 // then kill it now - we can't use the requested new value.
436 // We can't replace the DIArgList with the new value as it'd leave
437 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
438 // an arglist). And we can't keep the DIArgList in case the linked store
439 // is being split - in which case the DIArgList + expression may no longer
440 // be computing the correct value.
441 // This should be a very rare situation as it requires the value being
442 // stored to differ from the dbg.assign (i.e., the value has been
443 // represented differently in the debug intrinsic for some reason).
444 SetKillLocation |=
445 Value && (DbgAssign->hasArgList() ||
446 !DbgAssign->getExpression()->isSingleLocationExpression());
447 if (SetKillLocation)
448 NewAssign->setKillLocation();
449
450 // We could use more precision here at the cost of some additional (code)
451 // complexity - if the original dbg.assign was adjacent to its store, we
452 // could position this new dbg.assign adjacent to its store rather than the
453 // old dbg.assign. That would result in interleaved dbg.assigns rather than
454 // what we get now:
455 // split store !1
456 // split store !2
457 // dbg.assign !1
458 // dbg.assign !2
459 // This (current behaviour) results in debug assignments being
460 // noted as slightly offset (in code) from the store. In practice this
461 // should have little effect on the debugging experience due to the fact
462 // that all the split stores should get the same line number.
463 NewAssign->moveBefore(DbgAssign->getIterator());
464
465 NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
466 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
467 };
468
469 for_each(DVRAssignMarkerRange, MigrateDbgAssign);
470}
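// Illustrative effect (not from the upstream file): when a 64-bit store with a
// linked dbg.assign for a 64-bit variable is split into two 32-bit stores,
// each call to migrateDebugInfo attaches a fresh, distinct !DIAssignID to the
// new store and emits a dbg.assign whose expression gains a
// DW_OP_LLVM_fragment covering bits [0, 32) or [32, 64) respectively.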
471
472namespace {
473
474/// A custom IRBuilder inserter which prefixes all names, but only in
475/// Assert builds.
476class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
477 std::string Prefix;
478
479 Twine getNameWithPrefix(const Twine &Name) const {
480 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
481 }
482
483public:
484 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
485
486 void InsertHelper(Instruction *I, const Twine &Name,
487 BasicBlock::iterator InsertPt) const override {
488 IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name),
489 InsertPt);
490 }
491};
492
493/// Provide a type for IRBuilder that drops names in release builds.
494using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
495
496/// A used slice of an alloca.
497///
498/// This structure represents a slice of an alloca used by some instruction. It
499/// stores both the begin and end offsets of this use, a pointer to the use
500/// itself, and a flag indicating whether we can classify the use as splittable
501/// or not when forming partitions of the alloca.
502class Slice {
503 /// The beginning offset of the range.
504 uint64_t BeginOffset = 0;
505
506 /// The ending offset, not included in the range.
507 uint64_t EndOffset = 0;
508
509 /// Storage for both the use of this slice and whether it can be
510 /// split.
511 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
512
513public:
514 Slice() = default;
515
516 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
517 : BeginOffset(BeginOffset), EndOffset(EndOffset),
518 UseAndIsSplittable(U, IsSplittable) {}
519
520 uint64_t beginOffset() const { return BeginOffset; }
521 uint64_t endOffset() const { return EndOffset; }
522
523 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
524 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
525
526 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
527
528 bool isDead() const { return getUse() == nullptr; }
529 void kill() { UseAndIsSplittable.setPointer(nullptr); }
530
531 /// Support for ordering ranges.
532 ///
533 /// This provides an ordering over ranges such that start offsets are
534 /// always increasing, and within equal start offsets, the end offsets are
535 /// decreasing. Thus the spanning range comes first in a cluster with the
536 /// same start position.
537 bool operator<(const Slice &RHS) const {
538 if (beginOffset() < RHS.beginOffset())
539 return true;
540 if (beginOffset() > RHS.beginOffset())
541 return false;
542 if (isSplittable() != RHS.isSplittable())
543 return !isSplittable();
544 if (endOffset() > RHS.endOffset())
545 return true;
546 return false;
547 }
548
549 /// Support comparison with a single offset to allow binary searches.
550 friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
551 uint64_t RHSOffset) {
552 return LHS.beginOffset() < RHSOffset;
553 }
554 friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
555 const Slice &RHS) {
556 return LHSOffset < RHS.beginOffset();
557 }
558
559 bool operator==(const Slice &RHS) const {
560 return isSplittable() == RHS.isSplittable() &&
561 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
562 }
563 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
564};
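// Ordering sketch (illustrative): writing slices as [begin, end) with an "S"
// suffix for splittable ones, operator< above orders
//   [0, 8), [0, 16)S, [0, 8)S, [4, 8)
// i.e. among equal begin offsets unsplittable slices sort before splittable
// ones, and among slices of equal splittability the larger (spanning) end
// offset comes first.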
565
566/// Representation of the alloca slices.
567///
568/// This class represents the slices of an alloca which are formed by its
569/// various uses. If a pointer escapes, we can't fully build a representation
570/// for the slices used and we reflect that in this structure. The uses are
571/// stored, sorted by increasing beginning offset and with unsplittable slices
572/// starting at a particular offset before splittable slices.
573class AllocaSlices {
574public:
575 /// Construct the slices of a particular alloca.
576 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
577
578 /// Test whether a pointer to the allocation escapes our analysis.
579 ///
580 /// If this is true, the slices are never fully built and should be
581 /// ignored.
582 bool isEscaped() const { return PointerEscapingInstr; }
583 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
584
585 /// Support for iterating over the slices.
586 /// @{
587 using iterator = SmallVectorImpl<Slice>::iterator;
588 using range = iterator_range<iterator>;
589
590 iterator begin() { return Slices.begin(); }
591 iterator end() { return Slices.end(); }
592
593 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
594 using const_range = iterator_range<const_iterator>;
595
596 const_iterator begin() const { return Slices.begin(); }
597 const_iterator end() const { return Slices.end(); }
598 /// @}
599
600 /// Erase a range of slices.
601 void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
602
603 /// Insert new slices for this alloca.
604 ///
605 /// This moves the slices into the alloca's slices collection, and re-sorts
606 /// everything so that the usual ordering properties of the alloca's slices
607 /// hold.
608 void insert(ArrayRef<Slice> NewSlices) {
609 int OldSize = Slices.size();
610 Slices.append(NewSlices.begin(), NewSlices.end());
611 auto SliceI = Slices.begin() + OldSize;
612 std::stable_sort(SliceI, Slices.end());
613 std::inplace_merge(Slices.begin(), SliceI, Slices.end());
614 }
615
616 // Forward declare the iterator and range accessor for walking the
617 // partitions.
618 class partition_iterator;
619 iterator_range<partition_iterator> partitions();
620
621 /// Access the dead users for this alloca.
622 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
623
624 /// Access Uses that should be dropped if the alloca is promotable.
625 ArrayRef<Use *> getDeadUsesIfPromotable() const {
626 return DeadUseIfPromotable;
627 }
628
629 /// Access the dead operands referring to this alloca.
630 ///
631 /// These are operands which cannot actually be used to refer to the
632 /// alloca as they are outside its range and the user doesn't correct for
633 /// that. These mostly consist of PHI node inputs and the like which we just
634 /// need to replace with undef.
635 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
636
637#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
638 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
639 void printSlice(raw_ostream &OS, const_iterator I,
640 StringRef Indent = " ") const;
641 void printUse(raw_ostream &OS, const_iterator I,
642 StringRef Indent = " ") const;
643 void print(raw_ostream &OS) const;
644 void dump(const_iterator I) const;
645 void dump() const;
646#endif
647
648private:
649 template <typename DerivedT, typename RetT = void> class BuilderBase;
650 class SliceBuilder;
651
652 friend class AllocaSlices::SliceBuilder;
653
654#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
655 /// Handle to alloca instruction to simplify method interfaces.
656 AllocaInst &AI;
657#endif
658
659 /// The instruction responsible for this alloca not having a known set
660 /// of slices.
661 ///
662 /// When an instruction (potentially) escapes the pointer to the alloca, we
663 /// store a pointer to that here and abort trying to form slices of the
664 /// alloca. This will be null if the alloca slices are analyzed successfully.
665 Instruction *PointerEscapingInstr;
666 Instruction *PointerEscapingInstrReadOnly;
667
668 /// The slices of the alloca.
669 ///
670 /// We store a vector of the slices formed by uses of the alloca here. This
671 /// vector is sorted by increasing begin offset, and then the unsplittable
672 /// slices before the splittable ones. See the Slice inner class for more
673 /// details.
674 SmallVector<Slice, 8> Slices;
675
676 /// Instructions which will become dead if we rewrite the alloca.
677 ///
678 /// Note that these are not separated by slice. This is because we expect an
679 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
680 /// all these instructions can simply be removed and replaced with poison as
681 /// they come from outside of the allocated space.
682 SmallVector<Instruction *, 8> DeadUsers;
683
684 /// Uses which will become dead if we can promote the alloca.
685 SmallVector<Use *, 8> DeadUseIfPromotable;
686
687 /// Operands which will become dead if we rewrite the alloca.
688 ///
689 /// These are operands that in their particular use can be replaced with
690 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
691 /// to PHI nodes and the like. They aren't entirely dead (there might be
692 /// a GEP back into the bounds using it elsewhere), nor is the PHI, but we
693 /// want to swap this particular input for poison to simplify the use lists of
694 /// the alloca.
695 SmallVector<Use *, 8> DeadOperands;
696};
697
698/// A partition of the slices.
699///
700/// An ephemeral representation for a range of slices which can be viewed as
701/// a partition of the alloca. This range represents a span of the alloca's
702/// memory which cannot be split, and provides access to all of the slices
703/// overlapping some part of the partition.
704///
705/// Objects of this type are produced by traversing the alloca's slices, but
706/// are only ephemeral and not persistent.
707class Partition {
708private:
709 friend class AllocaSlices;
710 friend class AllocaSlices::partition_iterator;
711
712 using iterator = AllocaSlices::iterator;
713
714 /// The beginning and ending offsets of the alloca for this
715 /// partition.
716 uint64_t BeginOffset = 0, EndOffset = 0;
717
718 /// The start and end iterators of this partition.
719 iterator SI, SJ;
720
721 /// A collection of split slice tails overlapping the partition.
722 SmallVector<Slice *, 4> SplitTails;
723
724 /// Raw constructor builds an empty partition starting and ending at
725 /// the given iterator.
726 Partition(iterator SI) : SI(SI), SJ(SI) {}
727
728public:
729 /// The start offset of this partition.
730 ///
731 /// All of the contained slices start at or after this offset.
732 uint64_t beginOffset() const { return BeginOffset; }
733
734 /// The end offset of this partition.
735 ///
736 /// All of the contained slices end at or before this offset.
737 uint64_t endOffset() const { return EndOffset; }
738
739 /// The size of the partition.
740 ///
741 /// Note that this can never be zero.
742 uint64_t size() const {
743 assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
744 return EndOffset - BeginOffset;
745 }
746
747 /// Test whether this partition contains no slices, and merely spans
748 /// a region occupied by split slices.
749 bool empty() const { return SI == SJ; }
750
751 /// \name Iterate slices that start within the partition.
752 /// These may be splittable or unsplittable. They have a begin offset >= the
753 /// partition begin offset.
754 /// @{
755 // FIXME: We should probably define a "concat_iterator" helper and use that
756 // to stitch together pointee_iterators over the split tails and the
757 // contiguous iterators of the partition. That would give a much nicer
758 // interface here. We could then additionally expose filtered iterators for
759 // split, unsplit, and unsplittable slices based on the usage patterns.
760 iterator begin() const { return SI; }
761 iterator end() const { return SJ; }
762 /// @}
763
764 /// Get the sequence of split slice tails.
765 ///
766 /// These tails are of slices which start before this partition but are
767 /// split and overlap into the partition. We accumulate these while forming
768 /// partitions.
769 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
770};
771
772} // end anonymous namespace
773
774/// An iterator over partitions of the alloca's slices.
775///
776/// This iterator implements the core algorithm for partitioning the alloca's
777/// slices. It is a forward iterator as we don't support backtracking for
778/// efficiency reasons, and re-use a single storage area to maintain the
779/// current set of split slices.
780///
781/// It is templated on the slice iterator type to use so that it can operate
782/// with either const or non-const slice iterators.
783class AllocaSlices::partition_iterator
784 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
785 Partition> {
786 friend class AllocaSlices;
787
788 /// Most of the state for walking the partitions is held in a class
789 /// with a nice interface for examining them.
790 Partition P;
791
792 /// We need to keep the end of the slices to know when to stop.
793 AllocaSlices::iterator SE;
794
795 /// We also need to keep track of the maximum split end offset seen.
796 /// FIXME: Do we really?
797 uint64_t MaxSplitSliceEndOffset = 0;
798
799 /// Sets the partition to be empty at given iterator, and sets the
800 /// end iterator.
801 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
802 : P(SI), SE(SE) {
803 // If not already at the end, advance our state to form the initial
804 // partition.
805 if (SI != SE)
806 advance();
807 }
808
809 /// Advance the iterator to the next partition.
810 ///
811 /// Requires that the iterator not be at the end of the slices.
812 void advance() {
813 assert((P.SI != SE || !P.SplitTails.empty()) &&
814 "Cannot advance past the end of the slices!");
815
816 // Clear out any split uses which have ended.
817 if (!P.SplitTails.empty()) {
818 if (P.EndOffset >= MaxSplitSliceEndOffset) {
819 // If we've finished all splits, this is easy.
820 P.SplitTails.clear();
821 MaxSplitSliceEndOffset = 0;
822 } else {
823 // Remove the uses which have ended in the prior partition. This
824 // cannot change the max split slice end because we just checked that
825 // the prior partition ended prior to that max.
826 llvm::erase_if(P.SplitTails,
827 [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
828 assert(llvm::any_of(P.SplitTails,
829 [&](Slice *S) {
830 return S->endOffset() == MaxSplitSliceEndOffset;
831 }) &&
832 "Could not find the current max split slice offset!");
833 assert(llvm::all_of(P.SplitTails,
834 [&](Slice *S) {
835 return S->endOffset() <= MaxSplitSliceEndOffset;
836 }) &&
837 "Max split slice end offset is not actually the max!");
838 }
839 }
840
841 // If P.SI is already at the end, then we've cleared the split tail and
842 // now have an end iterator.
843 if (P.SI == SE) {
844 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
845 return;
846 }
847
848 // If we had a non-empty partition previously, set up the state for
849 // subsequent partitions.
850 if (P.SI != P.SJ) {
851 // Accumulate all the splittable slices which started in the old
852 // partition into the split list.
853 for (Slice &S : P)
854 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
855 P.SplitTails.push_back(&S);
856 MaxSplitSliceEndOffset =
857 std::max(S.endOffset(), MaxSplitSliceEndOffset);
858 }
859
860 // Start from the end of the previous partition.
861 P.SI = P.SJ;
862
863 // If P.SI is now at the end, we at most have a tail of split slices.
864 if (P.SI == SE) {
865 P.BeginOffset = P.EndOffset;
866 P.EndOffset = MaxSplitSliceEndOffset;
867 return;
868 }
869
870 // If we have split slices and the next slice is after a gap and is
871 // not splittable immediately form an empty partition for the split
872 // slices up until the next slice begins.
873 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
874 !P.SI->isSplittable()) {
875 P.BeginOffset = P.EndOffset;
876 P.EndOffset = P.SI->beginOffset();
877 return;
878 }
879 }
880
881 // OK, we need to consume new slices. Set the end offset based on the
882 // current slice, and step SJ past it. The beginning offset of the
883 // partition is the beginning offset of the next slice unless we have
884 // pre-existing split slices that are continuing, in which case we begin
885 // at the prior end offset.
886 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
887 P.EndOffset = P.SI->endOffset();
888 ++P.SJ;
889
890 // There are two strategies to form a partition based on whether the
891 // partition starts with an unsplittable slice or a splittable slice.
892 if (!P.SI->isSplittable()) {
893 // When we're forming an unsplittable region, it must always start at
894 // the first slice and will extend through its end.
895 assert(P.BeginOffset == P.SI->beginOffset());
896
897 // Form a partition including all of the overlapping slices with this
898 // unsplittable slice.
899 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
900 if (!P.SJ->isSplittable())
901 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
902 ++P.SJ;
903 }
904
905 // We have a partition across a set of overlapping unsplittable
906 // partitions.
907 return;
908 }
909
910 // If we're starting with a splittable slice, then we need to form
911 // a synthetic partition spanning it and any other overlapping splittable
912 // slices.
913 assert(P.SI->isSplittable() && "Forming a splittable partition!");
914
915 // Collect all of the overlapping splittable slices.
916 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
917 P.SJ->isSplittable()) {
918 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
919 ++P.SJ;
920 }
921
922 // Back up P.EndOffset if we ended the span early when encountering an
923 // unsplittable slice. This synthesizes the early end offset of
924 // a partition spanning only splittable slices.
925 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
926 assert(!P.SJ->isSplittable());
927 P.EndOffset = P.SJ->beginOffset();
928 }
929 }
930
931public:
932 bool operator==(const partition_iterator &RHS) const {
933 assert(SE == RHS.SE &&
934 "End iterators don't match between compared partition iterators!");
935
936 // The observed positions of partitions are marked by the P.SI iterator and
937 // the emptiness of the split slices. The latter is only relevant when
938 // P.SI == SE, as the end iterator will additionally have an empty split
939 // slices list, but the prior may have the same P.SI and a tail of split
940 // slices.
941 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
942 assert(P.SJ == RHS.P.SJ &&
943 "Same set of slices formed two different sized partitions!");
944 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
945 "Same slice position with differently sized non-empty split "
946 "slice tails!");
947 return true;
948 }
949 return false;
950 }
951
952 partition_iterator &operator++() {
953 advance();
954 return *this;
955 }
956
957 Partition &operator*() { return P; }
958};
959
960/// A forward range over the partitions of the alloca's slices.
961///
962/// This accesses an iterator range over the partitions of the alloca's
963/// slices. It computes these partitions on the fly based on the overlapping
964/// offsets of the slices and the ability to split them. It will visit "empty"
965/// partitions to cover regions of the alloca only accessed via split
966/// slices.
967iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
968 return make_range(partition_iterator(begin(), end()),
969 partition_iterator(end(), end()));
970}
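// Usage sketch (illustrative): the typical traversal over an analyzed alloca,
// assuming an AllocaSlices instance named AS:
//
//   for (Partition &P : AS.partitions()) {
//     // [P.beginOffset(), P.endOffset()) bounds the region; P iterates the
//     // slices starting inside it and exposes overlapping split-slice tails
//     // via P.splitSliceTails().
//   }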
971
972static Value *foldSelectInst(SelectInst &SI) {
973 // If the condition being selected on is a constant or the same value is
974 // being selected between, fold the select. Yes this does (rarely) happen
975 // early on.
976 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
977 return SI.getOperand(1 + CI->isZero());
978 if (SI.getOperand(1) == SI.getOperand(2))
979 return SI.getOperand(1);
980
981 return nullptr;
982}
983
984/// A helper that folds a PHI node or a select.
985static Value *foldPHINodeOrSelectInst(Instruction &I) {
986 if (PHINode *PN = dyn_cast<PHINode>(&I)) {
987 // If PN merges together the same value, return that value.
988 return PN->hasConstantValue();
989 }
990 return foldSelectInst(cast<SelectInst>(I));
991}
992
993/// Builder for the alloca slices.
994///
995/// This class builds a set of alloca slices by recursively visiting the uses
996/// of an alloca and making a slice for each load and store at each offset.
997class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
998 friend class PtrUseVisitor<SliceBuilder>;
999 friend class InstVisitor<SliceBuilder>;
1000
1001 using Base = PtrUseVisitor<SliceBuilder>;
1002
1003 const uint64_t AllocSize;
1004 AllocaSlices &AS;
1005
1006 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1007 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1008
1009 /// Set to de-duplicate dead instructions found in the use walk.
1010 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1011
1012public:
1013 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1014 : PtrUseVisitor<SliceBuilder>(DL),
1015 AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()),
1016 AS(AS) {}
1017
1018private:
1019 void markAsDead(Instruction &I) {
1020 if (VisitedDeadInsts.insert(&I).second)
1021 AS.DeadUsers.push_back(&I);
1022 }
1023
1024 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1025 bool IsSplittable = false) {
1026 // Completely skip uses which have a zero size or start either before or
1027 // past the end of the allocation.
1028 if (Size == 0 || Offset.uge(AllocSize)) {
1029 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1030 << Offset
1031 << " which has zero size or starts outside of the "
1032 << AllocSize << " byte alloca:\n"
1033 << " alloca: " << AS.AI << "\n"
1034 << " use: " << I << "\n");
1035 return markAsDead(I);
1036 }
1037
1038 uint64_t BeginOffset = Offset.getZExtValue();
1039 uint64_t EndOffset = BeginOffset + Size;
1040
1041 // Clamp the end offset to the end of the allocation. Note that this is
1042 // formulated to handle even the case where "BeginOffset + Size" overflows.
1043 // This may appear superficially to be something we could ignore entirely,
1044 // but that is not so! There may be widened loads or PHI-node uses where
1045 // some instructions are dead but not others. We can't completely ignore
1046 // them, and so have to record at least the information here.
1047 assert(AllocSize >= BeginOffset); // Established above.
1048 if (Size > AllocSize - BeginOffset) {
1049 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1050 << Offset << " to remain within the " << AllocSize
1051 << " byte alloca:\n"
1052 << " alloca: " << AS.AI << "\n"
1053 << " use: " << I << "\n");
1054 EndOffset = AllocSize;
1055 }
1056
1057 AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
1058 }
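// Illustrative behaviour (not from the upstream file): for an 8-byte alloca,
// a 16-byte use at offset 0 is recorded as the clamped slice [0, 8), while a
// 4-byte use at offset 12 starts past the end of the allocation and is simply
// marked as dead.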
1059
1060 void visitBitCastInst(BitCastInst &BC) {
1061 if (BC.use_empty())
1062 return markAsDead(BC);
1063
1064 return Base::visitBitCastInst(BC);
1065 }
1066
1067 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1068 if (ASC.use_empty())
1069 return markAsDead(ASC);
1070
1071 return Base::visitAddrSpaceCastInst(ASC);
1072 }
1073
1074 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1075 if (GEPI.use_empty())
1076 return markAsDead(GEPI);
1077
1078 return Base::visitGetElementPtrInst(GEPI);
1079 }
1080
1081 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1082 uint64_t Size, bool IsVolatile) {
1083 // We allow splitting of non-volatile loads and stores where the type is an
1084 // integer type. These may be used to implement 'memcpy' or other "transfer
1085 // of bits" patterns.
1086 bool IsSplittable =
1087 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1088
1089 insertUse(I, Offset, Size, IsSplittable);
1090 }
1091
1092 void visitLoadInst(LoadInst &LI) {
1093 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1094 "All simple FCA loads should have been pre-split");
1095
1096 // If there is a load with an unknown offset, we can still perform store
1097 // to load forwarding for other known-offset loads.
1098 if (!IsOffsetKnown)
1099 return PI.setEscapedReadOnly(&LI);
1100
1101 TypeSize Size = DL.getTypeStoreSize(LI.getType());
1102 if (Size.isScalable()) {
1103 unsigned VScale = LI.getFunction()->getVScaleValue();
1104 if (!VScale)
1105 return PI.setAborted(&LI);
1106
1107 Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
1108 }
1109
1110 return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
1111 LI.isVolatile());
1112 }
1113
1114 void visitStoreInst(StoreInst &SI) {
1115 Value *ValOp = SI.getValueOperand();
1116 if (ValOp == *U)
1117 return PI.setEscapedAndAborted(&SI);
1118 if (!IsOffsetKnown)
1119 return PI.setAborted(&SI);
1120
1121 TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
1122 if (StoreSize.isScalable()) {
1123 unsigned VScale = SI.getFunction()->getVScaleValue();
1124 if (!VScale)
1125 return PI.setAborted(&SI);
1126
1127 StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
1128 }
1129
1130 uint64_t Size = StoreSize.getFixedValue();
1131
1132 // If this memory access can be shown to *statically* extend outside the
1133 // bounds of the allocation, its behavior is undefined, so simply
1134 // ignore it. Note that this is more strict than the generic clamping
1135 // behavior of insertUse. We also try to handle cases which might run the
1136 // risk of overflow.
1137 // FIXME: We should instead consider the pointer to have escaped if this
1138 // function is being instrumented for addressing bugs or race conditions.
1139 if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
1140 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1141 << Offset << " which extends past the end of the "
1142 << AllocSize << " byte alloca:\n"
1143 << " alloca: " << AS.AI << "\n"
1144 << " use: " << SI << "\n");
1145 return markAsDead(SI);
1146 }
1147
1148 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1149 "All simple FCA stores should have been pre-split");
1150 handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
1151 }
1152
1153 void visitMemSetInst(MemSetInst &II) {
1154 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1155 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1156 if ((Length && Length->getValue() == 0) ||
1157 (IsOffsetKnown && Offset.uge(AllocSize)))
1158 // Zero-length mem transfer intrinsics can be ignored entirely.
1159 return markAsDead(II);
1160
1161 if (!IsOffsetKnown)
1162 return PI.setAborted(&II);
1163
1164 insertUse(II, Offset,
1165 Length ? Length->getLimitedValue()
1166 : AllocSize - Offset.getLimitedValue(),
1167 (bool)Length);
1168 }
1169
1170 void visitMemTransferInst(MemTransferInst &II) {
1171 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1172 if (Length && Length->getValue() == 0)
1173 // Zero-length mem transfer intrinsics can be ignored entirely.
1174 return markAsDead(II);
1175
1176 // Because we can visit these intrinsics twice, also check to see if the
1177 // first time marked this instruction as dead. If so, skip it.
1178 if (VisitedDeadInsts.count(&II))
1179 return;
1180
1181 if (!IsOffsetKnown)
1182 return PI.setAborted(&II);
1183
1184 // This side of the transfer is completely out-of-bounds, and so we can
1185 // nuke the entire transfer. However, we also need to nuke the other side
1186 // if already added to our partitions.
1187 // FIXME: Yet another place we really should bypass this when
1188 // instrumenting for ASan.
1189 if (Offset.uge(AllocSize)) {
1190 SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
1191 MemTransferSliceMap.find(&II);
1192 if (MTPI != MemTransferSliceMap.end())
1193 AS.Slices[MTPI->second].kill();
1194 return markAsDead(II);
1195 }
1196
1197 uint64_t RawOffset = Offset.getLimitedValue();
1198 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1199
1200 // Check for the special case where the same exact value is used for both
1201 // source and dest.
1202 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1203 // For non-volatile transfers this is a no-op.
1204 if (!II.isVolatile())
1205 return markAsDead(II);
1206
1207 return insertUse(II, Offset, Size, /*IsSplittable=*/false);
1208 }
1209
1210 // If we have seen both source and destination for a mem transfer, then
1211 // they both point to the same alloca.
1212 bool Inserted;
1213 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1214 std::tie(MTPI, Inserted) =
1215 MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
1216 unsigned PrevIdx = MTPI->second;
1217 if (!Inserted) {
1218 Slice &PrevP = AS.Slices[PrevIdx];
1219
1220 // Check if the begin offsets match and this is a non-volatile transfer.
1221 // In that case, we can completely elide the transfer.
1222 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1223 PrevP.kill();
1224 return markAsDead(II);
1225 }
1226
1227 // Otherwise we have an offset transfer within the same alloca. We can't
1228 // split those.
1229 PrevP.makeUnsplittable();
1230 }
1231
1232 // Insert the use now that we've fixed up the splittable nature.
1233 insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1234
1235 // Check that we ended up with a valid index in the map.
1236 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1237 "Map index doesn't point back to a slice with this user.");
1238 }
1239
1240 // Disable SROA for any intrinsics except for lifetime markers.
1241 // FIXME: What about debug intrinsics? This matches old behavior, but
1242 // doesn't make sense.
1243 void visitIntrinsicInst(IntrinsicInst &II) {
1244 if (II.isDroppable()) {
1245 AS.DeadUseIfPromotable.push_back(U);
1246 return;
1247 }
1248
1249 if (!IsOffsetKnown)
1250 return PI.setAborted(&II);
1251
1252 if (II.isLifetimeStartOrEnd()) {
1253 insertUse(II, Offset, AllocSize, true);
1254 return;
1255 }
1256
1257 Base::visitIntrinsicInst(II);
1258 }
1259
1260 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1261 // We consider any PHI or select that results in a direct load or store of
1262 // the same offset to be a viable use for slicing purposes. These uses
1263 // are considered unsplittable and the size is the maximum loaded or stored
1264 // size.
1265 SmallPtrSet<Instruction *, 4> Visited;
1266 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1267 Visited.insert(Root);
1268 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
1269 const DataLayout &DL = Root->getDataLayout();
1270 // If there are no loads or stores, the access is dead. We mark that as
1271 // a size zero access.
1272 Size = 0;
1273 do {
1274 Instruction *I, *UsedI;
1275 std::tie(UsedI, I) = Uses.pop_back_val();
1276
1277 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
1278 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
1279 if (LoadSize.isScalable()) {
1280 PI.setAborted(LI);
1281 return nullptr;
1282 }
1283 Size = std::max(Size, LoadSize.getFixedValue());
1284 continue;
1285 }
1286 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
1287 Value *Op = SI->getOperand(0);
1288 if (Op == UsedI)
1289 return SI;
1290 TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
1291 if (StoreSize.isScalable()) {
1292 PI.setAborted(SI);
1293 return nullptr;
1294 }
1295 Size = std::max(Size, StoreSize.getFixedValue());
1296 continue;
1297 }
1298
1299 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
1300 if (!GEP->hasAllZeroIndices())
1301 return GEP;
1302 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
1303 !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
1304 return I;
1305 }
1306
1307 for (User *U : I->users())
1308 if (Visited.insert(cast<Instruction>(U)).second)
1309 Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1310 } while (!Uses.empty());
1311
1312 return nullptr;
1313 }
1314
1315 void visitPHINodeOrSelectInst(Instruction &I) {
1316 assert(isa<PHINode>(I) || isa<SelectInst>(I));
1317 if (I.use_empty())
1318 return markAsDead(I);
1319
1320 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1321 // instructions in this BB, which may be required during rewriting. Bail out
1322 // on these cases.
1323 if (isa<PHINode>(I) &&
1324 I.getParent()->getFirstInsertionPt() == I.getParent()->end())
1325 return PI.setAborted(&I);
1326
1327 // TODO: We could use simplifyInstruction here to fold PHINodes and
1328 // SelectInsts. However, doing so requires changing the current
1329 // dead-operand-tracking mechanism. For instance, suppose neither loading
1330 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1331 // trap either. However, if we simply replace %U with undef using the
1332 // current dead-operand-tracking mechanism, "load (select undef, undef,
1333 // %other)" may trap because the select may return the first operand
1334 // "undef".
1335 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1336 if (Result == *U)
1337 // If the result of the constant fold will be the pointer, recurse
1338 // through the PHI/select as if we had RAUW'ed it.
1339 enqueueUsers(I);
1340 else
1341 // Otherwise the operand to the PHI/select is dead, and we can replace
1342 // it with poison.
1343 AS.DeadOperands.push_back(U);
1344
1345 return;
1346 }
1347
1348 if (!IsOffsetKnown)
1349 return PI.setAborted(&I);
1350
1351 // See if we already have computed info on this node.
1352 uint64_t &Size = PHIOrSelectSizes[&I];
1353 if (!Size) {
1354 // This is a new PHI/Select, check for an unsafe use of it.
1355 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1356 return PI.setAborted(UnsafeI);
1357 }
1358
1359 // For PHI and select operands outside the alloca, we can't nuke the entire
1360 // phi or select -- the other side might still be relevant, so we special
1361 // case them here and use a separate structure to track the operands
1362 // themselves which should be replaced with poison.
1363 // FIXME: This should instead be escaped in the event we're instrumenting
1364 // for address sanitization.
1365 if (Offset.uge(AllocSize)) {
1366 AS.DeadOperands.push_back(U);
1367 return;
1368 }
1369
1370 insertUse(I, Offset, Size);
1371 }
1372
1373 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1374
1375 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1376
1377 /// Disable SROA entirely if there are unhandled users of the alloca.
1378 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1379
1380 void visitCallBase(CallBase &CB) {
1381 // If the call operand is read-only and only does a read-only or address
1382 // capture, then we mark it as EscapedReadOnly.
1383 if (CB.isDataOperand(U) &&
1384 !capturesFullProvenance(CB.getCaptureInfo(U->getOperandNo())) &&
1385 CB.onlyReadsMemory(U->getOperandNo())) {
1386 PI.setEscapedReadOnly(&CB);
1387 return;
1388 }
1389
1390 Base::visitCallBase(CB);
1391 }
1392};
1393
1394AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1395 :
1396#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1397 AI(AI),
1398#endif
1399 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1400 SliceBuilder PB(DL, AI, *this);
1401 SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1402 if (PtrI.isEscaped() || PtrI.isAborted()) {
1403 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1404 // possibly by just storing the PtrInfo in the AllocaSlices.
1405 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1406 : PtrI.getAbortingInst();
1407 assert(PointerEscapingInstr && "Did not track a bad instruction");
1408 return;
1409 }
1410 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1411
1412 llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1413
1414 // Sort the uses. This arranges for the offsets to be in ascending order,
1415 // and the sizes to be in descending order.
1416 llvm::stable_sort(Slices);
1417}
1418
1419#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1420
1421void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1422 StringRef Indent) const {
1423 printSlice(OS, I, Indent);
1424 OS << "\n";
1425 printUse(OS, I, Indent);
1426}
1427
1428void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1429 StringRef Indent) const {
1430 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1431 << " slice #" << (I - begin())
1432 << (I->isSplittable() ? " (splittable)" : "");
1433}
1434
1435void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1436 StringRef Indent) const {
1437 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1438}
1439
1440void AllocaSlices::print(raw_ostream &OS) const {
1441 if (PointerEscapingInstr) {
1442 OS << "Can't analyze slices for alloca: " << AI << "\n"
1443 << " A pointer to this alloca escaped by:\n"
1444 << " " << *PointerEscapingInstr << "\n";
1445 return;
1446 }
1447
1448 if (PointerEscapingInstrReadOnly)
1449 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1450
1451 OS << "Slices of alloca: " << AI << "\n";
1452 for (const_iterator I = begin(), E = end(); I != E; ++I)
1453 print(OS, I);
1454}
1455
1456LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1457 print(dbgs(), I);
1458}
1459LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1460
1461#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1462
1463/// Walk the range of a partitioning looking for a common type to cover this
1464/// sequence of slices.
1465static std::pair<Type *, IntegerType *>
1466findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1467 uint64_t EndOffset) {
1468 Type *Ty = nullptr;
1469 bool TyIsCommon = true;
1470 IntegerType *ITy = nullptr;
1471
1472 // Note that we need to look at *every* alloca slice's Use to ensure we
1473 // always get consistent results regardless of the order of slices.
1474 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1475 Use *U = I->getUse();
1476 if (isa<IntrinsicInst>(*U->getUser()))
1477 continue;
1478 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1479 continue;
1480
1481 Type *UserTy = nullptr;
1482 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1483 UserTy = LI->getType();
1484 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1485 UserTy = SI->getValueOperand()->getType();
1486 }
1487
1488 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1489 // If the type is larger than the partition, skip it. We only encounter
1490 // this for split integer operations where we want to use the type of the
1491 // entity causing the split. Also skip if the type is not a byte width
1492 // multiple.
1493 if (UserITy->getBitWidth() % 8 != 0 ||
1494 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1495 continue;
1496
1497 // Track the largest bitwidth integer type used in this way in case there
1498 // is no common type.
1499 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1500 ITy = UserITy;
1501 }
1502
1503 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1504 // depend on types skipped above.
1505 if (!UserTy || (Ty && Ty != UserTy))
1506 TyIsCommon = false; // Give up on anything but an iN type.
1507 else
1508 Ty = UserTy;
1509 }
1510
1511 return {TyIsCommon ? Ty : nullptr, ITy};
1512}
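// For example: if every slice that exactly spans the partition is a
// `load float` or `store float`, the common type is `float`. If one of those
// accesses instead used `i32`, there is no common type, but ITy would be
// `i32` (the widest byte-width integer access), which callers can still use
// as a fallback.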
1513
1514/// PHI instructions that use an alloca and are subsequently loaded can be
1515/// rewritten to load both input pointers in the pred blocks and then PHI the
1516/// results, allowing the load of the alloca to be promoted.
1517/// From this:
1518/// %P2 = phi [i32* %Alloca, i32* %Other]
1519/// %V = load i32* %P2
1520/// to:
1521/// %V1 = load i32* %Alloca -> will be mem2reg'd
1522/// ...
1523/// %V2 = load i32* %Other
1524/// ...
1525/// %V = phi [i32 %V1, i32 %V2]
1526///
1527/// We can do this to a select if its only uses are loads and if the operands
1528/// to the select can be loaded unconditionally.
1529///
1530/// FIXME: This should be hoisted into a generic utility, likely in
1531/// Transforms/Util/Local.h
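///
/// A sketch of the analogous select rewrite (illustrative only):
///   %P2 = select i1 %cond, ptr %Alloca, ptr %Other
///   %V = load i32, ptr %P2
/// to:
///   %V1 = load i32, ptr %Alloca -> will be mem2reg'd
///   %V2 = load i32, ptr %Other
///   %V = select i1 %cond, i32 %V1, i32 %V2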
1532static bool isSafePHIToSpeculate(PHINode &PN) {
1533 const DataLayout &DL = PN.getDataLayout();
1534
1535 // For now, we can only do this promotion if the load is in the same block
1536 // as the PHI, and if there are no stores between the phi and load.
1537 // TODO: Allow recursive phi users.
1538 // TODO: Allow stores.
1539 BasicBlock *BB = PN.getParent();
1540 Align MaxAlign;
1541 uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1542 Type *LoadType = nullptr;
1543 for (User *U : PN.users()) {
1544 LoadInst *LI = dyn_cast<LoadInst>(U);
1545 if (!LI || !LI->isSimple())
1546 return false;
1547
1548 // For now we only allow loads in the same block as the PHI. This is
1549 // a common case that happens when instcombine merges two loads through
1550 // a PHI.
1551 if (LI->getParent() != BB)
1552 return false;
1553
1554 if (LoadType) {
1555 if (LoadType != LI->getType())
1556 return false;
1557 } else {
1558 LoadType = LI->getType();
1559 }
1560
1561 // Ensure that there are no instructions between the PHI and the load that
1562 // could store.
1563 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1564 if (BBI->mayWriteToMemory())
1565 return false;
1566
1567 MaxAlign = std::max(MaxAlign, LI->getAlign());
1568 }
1569
1570 if (!LoadType)
1571 return false;
1572
1573 APInt LoadSize =
1574 APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());
1575
1576 // We can only transform this if it is safe to push the loads into the
1577 // predecessor blocks. The only thing to watch out for is that we can't put
1578 // a possibly trapping load in the predecessor if it is a critical edge.
1579 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1580 Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1581 Value *InVal = PN.getIncomingValue(Idx);
1582
1583 // If the value is produced by the terminator of the predecessor (an
1584 // invoke) or it has side-effects, there is no valid place to put a load
1585 // in the predecessor.
1586 if (TI == InVal || TI->mayHaveSideEffects())
1587 return false;
1588
1589 // If the predecessor has a single successor, then the edge isn't
1590 // critical.
1591 if (TI->getNumSuccessors() == 1)
1592 continue;
1593
1594 // If this pointer is always safe to load, or if we can prove that there
1595 // is already a load in the block, then we can move the load to the pred
1596 // block.
1597 if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1598 continue;
1599
1600 return false;
1601 }
1602
1603 return true;
1604}
1605
1606static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1607 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1608
1609 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1610 Type *LoadTy = SomeLoad->getType();
1611 IRB.SetInsertPoint(&PN);
1612 PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1613 PN.getName() + ".sroa.speculated");
1614
1615 // Get the AA tags and alignment to use from one of the loads. It does not
1616 // matter which one we get and if any differ.
1617 AAMDNodes AATags = SomeLoad->getAAMetadata();
1618 Align Alignment = SomeLoad->getAlign();
1619
1620 // Rewrite all loads of the PN to use the new PHI.
1621 while (!PN.use_empty()) {
1622 LoadInst *LI = cast<LoadInst>(PN.user_back());
1623 LI->replaceAllUsesWith(NewPN);
1624 LI->eraseFromParent();
1625 }
1626
1627 // Inject loads into all of the pred blocks.
1628 DenseMap<BasicBlock *, Value *> InjectedLoads;
1629 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1630 BasicBlock *Pred = PN.getIncomingBlock(Idx);
1631 Value *InVal = PN.getIncomingValue(Idx);
1632
1633 // A PHI node is allowed to have multiple (duplicated) entries for the same
1634 // basic block, as long as the value is the same. So if we already injected
1635 // a load in the predecessor, then we should reuse the same load for all
1636 // duplicated entries.
1637 if (Value *V = InjectedLoads.lookup(Pred)) {
1638 NewPN->addIncoming(V, Pred);
1639 continue;
1640 }
1641
1642 Instruction *TI = Pred->getTerminator();
1643 IRB.SetInsertPoint(TI);
1644
1645 LoadInst *Load = IRB.CreateAlignedLoad(
1646 LoadTy, InVal, Alignment,
1647 (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1648 ++NumLoadsSpeculated;
1649 if (AATags)
1650 Load->setAAMetadata(AATags);
1651 NewPN->addIncoming(Load, Pred);
1652 InjectedLoads[Pred] = Load;
1653 }
1654
1655 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1656 PN.eraseFromParent();
1657}
1658
1659SelectHandSpeculativity &
1660SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1661 if (isTrueVal)
1662 Bitfield::set<SelectHandSpeculativity::TrueVal>(Storage, true);
1663 else
1664 Bitfield::set<SelectHandSpeculativity::FalseVal>(Storage, true);
1665 return *this;
1666}
1667
1668bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1669 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
1670 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
1671}
1672
1673bool SelectHandSpeculativity::areAllSpeculatable() const {
1674 return isSpeculatable(/*isTrueVal=*/true) &&
1675 isSpeculatable(/*isTrueVal=*/false);
1676}
1677
1678bool SelectHandSpeculativity::areAnySpeculatable() const {
1679 return isSpeculatable(/*isTrueVal=*/true) ||
1680 isSpeculatable(/*isTrueVal=*/false);
1681}
1682bool SelectHandSpeculativity::areNoneSpeculatable() const {
1683 return !areAnySpeculatable();
1684}
1685
1686static SelectHandSpeculativity
1687isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1688 assert(LI.isSimple() && "Only for simple loads");
1689 SelectHandSpeculativity Spec;
1690
1691 const DataLayout &DL = SI.getDataLayout();
1692 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1693 if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL,
1694 &LI))
1695 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1696 else if (PreserveCFG)
1697 return Spec;
1698
1699 return Spec;
1700}
1701
1702std::optional<RewriteableMemOps>
1703SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1704 RewriteableMemOps Ops;
1705
1706 for (User *U : SI.users()) {
1707 if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
1708 U = *BC->user_begin();
1709
1710 if (auto *Store = dyn_cast<StoreInst>(U)) {
1711 // Note that atomic stores can be transformed; atomic semantics do not
1712 // have any meaning for a local alloca. Stores are not speculatable,
1713 // however, so if we can't turn it into a predicated store, we are done.
1714 if (Store->isVolatile() || PreserveCFG)
1715 return {}; // Give up on this `select`.
1716 Ops.emplace_back(Store);
1717 continue;
1718 }
1719
1720 auto *LI = dyn_cast<LoadInst>(U);
1721
1722 // Note that atomic loads can be transformed;
1723 // atomic semantics do not have any meaning for a local alloca.
1724 if (!LI || LI->isVolatile())
1725 return {}; // Give up on this `select`.
1726
1727 PossiblySpeculatableLoad Load(LI);
1728 if (!LI->isSimple()) {
1729 // If the `load` is not simple, we can't speculatively execute it,
1730 // but we could handle this via a CFG modification. But can we?
1731 if (PreserveCFG)
1732 return {}; // Give up on this `select`.
1733 Ops.emplace_back(Load);
1734 continue;
1735 }
1736
1737 SelectHandSpeculativity Spec =
1738 isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG);
1739 if (PreserveCFG && !Spec.areAllSpeculatable())
1740 return {}; // Give up on this `select`.
1741
1742 Load.setInt(Spec);
1743 Ops.emplace_back(Load);
1744 }
1745
1746 return Ops;
1747}
1748
1749static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1750 IRBuilderTy &IRB) {
1751 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1752
1753 Value *TV = SI.getTrueValue();
1754 Value *FV = SI.getFalseValue();
1755 // Replace the given load of the select with a select of two loads.
1756
1757 assert(LI.isSimple() && "We only speculate simple loads");
1758
1759 IRB.SetInsertPoint(&LI);
1760
1761 LoadInst *TL =
1762 IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
1763 LI.getName() + ".sroa.speculate.load.true");
1764 LoadInst *FL =
1765 IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
1766 LI.getName() + ".sroa.speculate.load.false");
1767 NumLoadsSpeculated += 2;
1768
1769 // Transfer alignment and AA info if present.
1770 TL->setAlignment(LI.getAlign());
1771 FL->setAlignment(LI.getAlign());
1772
1773 AAMDNodes Tags = LI.getAAMetadata();
1774 if (Tags) {
1775 TL->setAAMetadata(Tags);
1776 FL->setAAMetadata(Tags);
1777 }
1778
1779 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1780 LI.getName() + ".sroa.speculated");
1781
1782 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1783 LI.replaceAllUsesWith(V);
1784}
1785
1786template <typename T>
1787static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1788 SelectHandSpeculativity Spec,
1789 DomTreeUpdater &DTU) {
1790 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1791 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1792 BasicBlock *Head = I.getParent();
1793 Instruction *ThenTerm = nullptr;
1794 Instruction *ElseTerm = nullptr;
1795 if (Spec.areNoneSpeculatable())
1796 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1797 SI.getMetadata(LLVMContext::MD_prof), &DTU);
1798 else {
1799 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1800 SI.getMetadata(LLVMContext::MD_prof), &DTU,
1801 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1802 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1803 cast<BranchInst>(Head->getTerminator())->swapSuccessors();
1804 }
1805 auto *HeadBI = cast<BranchInst>(Head->getTerminator());
1806 Spec = {}; // Do not use `Spec` beyond this point.
1807 BasicBlock *Tail = I.getParent();
1808 Tail->setName(Head->getName() + ".cont");
1809 PHINode *PN;
1810 if (isa<LoadInst>(I))
1811 PN = PHINode::Create(I.getType(), 2, "", I.getIterator());
1812 for (BasicBlock *SuccBB : successors(Head)) {
1813 bool IsThen = SuccBB == HeadBI->getSuccessor(0);
1814 int SuccIdx = IsThen ? 0 : 1;
1815 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1816 auto &CondMemOp = cast<T>(*I.clone());
1817 if (NewMemOpBB != Head) {
1818 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1819 if (isa<LoadInst>(I))
1820 ++NumLoadsPredicated;
1821 else
1822 ++NumStoresPredicated;
1823 } else {
1824 CondMemOp.dropUBImplyingAttrsAndMetadata();
1825 ++NumLoadsSpeculated;
1826 }
1827 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1828 Value *Ptr = SI.getOperand(1 + SuccIdx);
1829 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1830 if (isa<LoadInst>(I)) {
1831 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1832 PN->addIncoming(&CondMemOp, NewMemOpBB);
1833 } else
1834 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1835 }
1836 if (isa<LoadInst>(I)) {
1837 PN->takeName(&I);
1838 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1839 I.replaceAllUsesWith(PN);
1840 }
1841}
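// A rough sketch of the CFG produced for a load of a select when neither
// hand is speculatable (block and value names are illustrative):
//   head:
//     br i1 %cond, label %head.then, label %head.else
//   head.then:
//     %v.then.val = load i32, ptr %true.ptr
//     br label %head.cont
//   head.else:
//     %v.else.val = load i32, ptr %false.ptr
//     br label %head.cont
//   head.cont:
//     %v = phi i32 [ %v.then.val, %head.then ], [ %v.else.val, %head.else ]
// When one hand is speculatable, only a single conditional block is split
// off and the speculatable access stays in the head block.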
1842
1843static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1844 SelectHandSpeculativity Spec,
1845 DomTreeUpdater &DTU) {
1846 if (auto *LI = dyn_cast<LoadInst>(&I))
1847 rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
1848 else if (auto *SI = dyn_cast<StoreInst>(&I))
1849 rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
1850 else
1851 llvm_unreachable_internal("Only for load and store.");
1852}
1853
1854static bool rewriteSelectInstMemOps(SelectInst &SI,
1855 const RewriteableMemOps &Ops,
1856 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1857 bool CFGChanged = false;
1858 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1859
1860 for (const RewriteableMemOp &Op : Ops) {
1861 SelectHandSpeculativity Spec;
1862 Instruction *I;
1863 if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
1864 I = *US;
1865 } else {
1866 auto PSL = std::get<PossiblySpeculatableLoad>(Op);
1867 I = PSL.getPointer();
1868 Spec = PSL.getInt();
1869 }
1870 if (Spec.areAllSpeculatable()) {
1871 speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB);
1872 } else {
1873 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1874 rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
1875 CFGChanged = true;
1876 }
1877 I->eraseFromParent();
1878 }
1879
1880 for (User *U : make_early_inc_range(SI.users()))
1881 cast<BitCastInst>(U)->eraseFromParent();
1882 SI.eraseFromParent();
1883 return CFGChanged;
1884}
1885
1886/// Compute an adjusted pointer from Ptr by Offset bytes where the
1887/// resulting pointer has PointerTy.
1888static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1889 APInt Offset, Type *PointerTy,
1890 const Twine &NamePrefix) {
1891 if (Offset != 0)
1892 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
1893 NamePrefix + "sroa_idx");
1894 return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1895 NamePrefix + "sroa_cast");
1896}
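// For instance, with Offset == 8 this emits roughly:
//   %p.sroa_idx = getelementptr inbounds i8, ptr %ptr, i64 8
// followed by an addrspacecast only if PointerTy lives in a different
// address space (with opaque pointers the bitcast half is a no-op).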
1897
1898/// Compute the adjusted alignment for a load or store from an offset.
1899static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1900 return commonAlignment(getLoadStoreAlignment(I), Offset);
1901}
1902
1903/// Test whether we can convert a value from the old to the new type.
1904///
1905/// This predicate should be used to guard calls to convertValue in order to
1906/// ensure that we only try to convert viable values. The strategy is that we
1907/// will peel off single element struct and array wrappings to get to an
1908/// underlying value, and convert that value.
1909static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1910 unsigned VScale = 0) {
1911 if (OldTy == NewTy)
1912 return true;
1913
1914 // For integer types, we can't handle any bit-width differences. This would
1915 // break vector conversions that require extension and would introduce
1916 // endianness issues when used in conjunction with loads and stores.
1917 if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1918 assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1919 cast<IntegerType>(NewTy)->getBitWidth() &&
1920 "We can't have the same bitwidth for different int types");
1921 return false;
1922 }
1923
1924 TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
1925 TypeSize OldSize = DL.getTypeSizeInBits(OldTy);
1926
1927 if ((isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) ||
1928 (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy))) {
1929 // Conversion is only possible when the size of scalable vectors is known.
1930 if (!VScale)
1931 return false;
1932
1933 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1934 // a single domain (either fixed or scalable). Any additional conversion
1935 // between fixed and scalable types is handled through integer types.
1936 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1937 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1938
1939 if (isa<ScalableVectorType>(NewTy)) {
1941 return false;
1942
1943 NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
1944 } else {
1946 return false;
1947
1948 OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
1949 }
1950 }
1951
1952 if (NewSize != OldSize)
1953 return false;
1954 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1955 return false;
1956
1957 // We can convert pointers to integers and vice-versa. Same for vectors
1958 // of pointers and integers.
1959 OldTy = OldTy->getScalarType();
1960 NewTy = NewTy->getScalarType();
1961 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1962 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1963 unsigned OldAS = OldTy->getPointerAddressSpace();
1964 unsigned NewAS = NewTy->getPointerAddressSpace();
1965 // Convert pointers if they are pointers from the same address space or
1966 // different integral (not non-integral) address spaces with the same
1967 // pointer size.
1968 return OldAS == NewAS ||
1969 (!DL.isNonIntegralAddressSpace(OldAS) &&
1970 !DL.isNonIntegralAddressSpace(NewAS) &&
1971 DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1972 }
1973
1974 // We can convert integers to integral pointers, but not to non-integral
1975 // pointers.
1976 if (OldTy->isIntegerTy())
1977 return !DL.isNonIntegralPointerType(NewTy);
1978
1979 // We can convert integral pointers to integers, but non-integral pointers
1980 // need to remain pointers.
1981 if (!DL.isNonIntegralPointerType(OldTy))
1982 return NewTy->isIntegerTy();
1983
1984 return false;
1985 }
1986
1987 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
1988 return false;
1989
1990 return true;
1991}
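// Some illustrative outcomes, assuming a 64-bit DataLayout with only
// integral pointers: i64 <-> double, i64 <-> ptr, and <2 x i32> <-> i64 are
// all convertible; i32 <-> i64 is rejected (integer widths must match
// exactly), as is any conversion whose destination is a non-integral
// pointer.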
1992
1993/// Generic routine to convert an SSA value to a value of a different
1994/// type.
1995///
1996/// This will try various different casting techniques, such as bitcasts,
1997/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
1998/// two types for viability with this routine.
1999static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2000 Type *NewTy) {
2001 Type *OldTy = V->getType();
2002
2003#ifndef NDEBUG
2004 BasicBlock *BB = IRB.GetInsertBlock();
2005 assert(BB && BB->getParent() && "VScale unknown!");
2006 unsigned VScale = BB->getParent()->getVScaleValue();
2007 assert(canConvertValue(DL, OldTy, NewTy, VScale) &&
2008 "Value not convertable to type");
2009#endif
2010
2011 if (OldTy == NewTy)
2012 return V;
2013
2014 assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
2015 "Integer types must be the exact same to convert.");
2016
2017 // A variant of bitcast that supports a mixture of fixed and scalable types
2018 // that are known to have the same size.
2019 auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * {
2020 Type *InTy = In->getType();
2021 if (InTy == Ty)
2022 return In;
2023
2025 // For vscale_range(2) expand <4 x i32> to <vscale x 4 x i16> -->
2026 // <4 x i32> to <vscale x 2 x i32> to <vscale x 4 x i16>
2028 return IRB.CreateBitCast(IRB.CreateInsertVector(VTy,
2029 PoisonValue::get(VTy), In,
2030 IRB.getInt64(0)),
2031 Ty);
2032 }
2033
2035 // For vscale_range(2) expand <vscale x 4 x i16> to <4 x i32> -->
2036 // <vscale x 4 x i16> to <vscale x 2 x i32> to <4 x i32>
2038 return IRB.CreateExtractVector(Ty, IRB.CreateBitCast(In, VTy),
2039 IRB.getInt64(0));
2040 }
2041
2042 return IRB.CreateBitCast(In, Ty);
2043 };
2044
2045 // See if we need inttoptr for this type pair. May require additional bitcast.
2046 if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2047 // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
2048 // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
2049 // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
2050 // Directly handle i64 to i8*
2051 return IRB.CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)),
2052 NewTy);
2053 }
2054
2055 // See if we need ptrtoint for this type pair. May require additional bitcast.
2056 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
2057 // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
2058 // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
2059 // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
2060 // Expand i8* to i64 --> i8* to i64 to i64
2061 return CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2062 NewTy);
2063 }
2064
2065 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2066 unsigned OldAS = OldTy->getPointerAddressSpace();
2067 unsigned NewAS = NewTy->getPointerAddressSpace();
2068 // To convert pointers with different address spaces (which have already been
2069 // checked to be convertible, i.e. they have the same pointer size), we
2070 // cannot use `bitcast` (which is restricted to the same address space) or
2071 // `addrspacecast` (which is not always a no-op cast). Instead, use a pair
2072 // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
2073 // width.
2074 if (OldAS != NewAS) {
2075 assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
2076 return IRB.CreateIntToPtr(
2077 CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2078 DL.getIntPtrType(NewTy)),
2079 NewTy);
2080 }
2081 }
2082
2083 return CreateBitCastLike(V, NewTy);
2084}
2085
2086/// Test whether the given slice use can be promoted to a vector.
2087///
2088/// This function is called to test each entry in a partition which is slated
2089/// for a single slice.
2090static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2091 VectorType *Ty,
2092 uint64_t ElementSize,
2093 const DataLayout &DL,
2094 unsigned VScale) {
2095 // First validate the slice offsets.
2096 uint64_t BeginOffset =
2097 std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
2098 uint64_t BeginIndex = BeginOffset / ElementSize;
2099 if (BeginIndex * ElementSize != BeginOffset ||
2100 BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
2101 return false;
2102 uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
2103 uint64_t EndIndex = EndOffset / ElementSize;
2104 if (EndIndex * ElementSize != EndOffset ||
2105 EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
2106 return false;
2107
2108 assert(EndIndex > BeginIndex && "Empty vector!");
2109 uint64_t NumElements = EndIndex - BeginIndex;
2110 Type *SliceTy = (NumElements == 1)
2111 ? Ty->getElementType()
2112 : FixedVectorType::get(Ty->getElementType(), NumElements);
2113
2114 Type *SplitIntTy =
2115 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
2116
2117 Use *U = S.getUse();
2118
2119 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2120 if (MI->isVolatile())
2121 return false;
2122 if (!S.isSplittable())
2123 return false; // Skip any unsplittable intrinsics.
2124 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2125 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2126 return false;
2127 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2128 if (LI->isVolatile())
2129 return false;
2130 Type *LTy = LI->getType();
2131 // Disable vector promotion when there are loads or stores of an FCA.
2132 if (LTy->isStructTy())
2133 return false;
2134 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2135 assert(LTy->isIntegerTy());
2136 LTy = SplitIntTy;
2137 }
2138 if (!canConvertValue(DL, SliceTy, LTy, VScale))
2139 return false;
2140 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2141 if (SI->isVolatile())
2142 return false;
2143 Type *STy = SI->getValueOperand()->getType();
2144 // Disable vector promotion when there are loads or stores of an FCA.
2145 if (STy->isStructTy())
2146 return false;
2147 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2148 assert(STy->isIntegerTy());
2149 STy = SplitIntTy;
2150 }
2151 if (!canConvertValue(DL, STy, SliceTy, VScale))
2152 return false;
2153 } else {
2154 return false;
2155 }
2156
2157 return true;
2158}
2159
2160/// Test whether a vector type is viable for promotion.
2161///
2162/// This implements the necessary checking for \c checkVectorTypesForPromotion
2163/// (and thus isVectorPromotionViable) over all slices of the alloca for the
2164/// given VectorType.
2165static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
2166 const DataLayout &DL, unsigned VScale) {
2167 uint64_t ElementSize =
2168 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2169
2170 // While LLVM vectors are defined to be bit-packed, we don't support element
2171 // sizes that aren't byte sized.
2172 if (ElementSize % 8)
2173 return false;
2174 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2175 "vector size not a multiple of element size?");
2176 ElementSize /= 8;
2177
2178 for (const Slice &S : P)
2179 if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
2180 return false;
2181
2182 for (const Slice *S : P.splitSliceTails())
2183 if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
2184 return false;
2185
2186 return true;
2187}
2188
2189/// Test whether any vector type in \p CandidateTys is viable for promotion.
2190///
2191/// This implements the necessary checking for \c isVectorPromotionViable over
2192/// all slices of the alloca for the given VectorType.
2193static VectorType *
2194checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2195 SmallVectorImpl<VectorType *> &CandidateTys,
2196 bool HaveCommonEltTy, Type *CommonEltTy,
2197 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2198 VectorType *CommonVecPtrTy, unsigned VScale) {
2199 // If we didn't find a vector type, nothing to do here.
2200 if (CandidateTys.empty())
2201 return nullptr;
2202
2203 // Pointer-ness is sticky: if we had a vector-of-pointers candidate type,
2204 // then we should choose it, not some other alternative.
2205 // But, we can't perform a no-op pointer address space change via bitcast,
2206 // so if we didn't have a common pointer element type, bail.
2207 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2208 return nullptr;
2209
2210 // Try to pick the "best" element type out of the choices.
2211 if (!HaveCommonEltTy && HaveVecPtrTy) {
2212 // If there was a pointer element type, there's really only one choice.
2213 CandidateTys.clear();
2214 CandidateTys.push_back(CommonVecPtrTy);
2215 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2216 // Integer-ify vector types.
2217 for (VectorType *&VTy : CandidateTys) {
2218 if (!VTy->getElementType()->isIntegerTy())
2219 VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
2220 VTy->getContext(), VTy->getScalarSizeInBits())));
2221 }
2222
2223 // Rank the remaining candidate vector types. This is easy because we know
2224 // they're all integer vectors. We sort by ascending number of elements.
2225 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2226 (void)DL;
2227 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2228 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2229 "Cannot have vector types of different sizes!");
2230 assert(RHSTy->getElementType()->isIntegerTy() &&
2231 "All non-integer types eliminated!");
2232 assert(LHSTy->getElementType()->isIntegerTy() &&
2233 "All non-integer types eliminated!");
2234 return cast<FixedVectorType>(RHSTy)->getNumElements() <
2235 cast<FixedVectorType>(LHSTy)->getNumElements();
2236 };
2237 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2238 (void)DL;
2239 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2240 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2241 "Cannot have vector types of different sizes!");
2242 assert(RHSTy->getElementType()->isIntegerTy() &&
2243 "All non-integer types eliminated!");
2244 assert(LHSTy->getElementType()->isIntegerTy() &&
2245 "All non-integer types eliminated!");
2246 return cast<FixedVectorType>(RHSTy)->getNumElements() ==
2247 cast<FixedVectorType>(LHSTy)->getNumElements();
2248 };
2249 llvm::sort(CandidateTys, RankVectorTypesComp);
2250 CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq),
2251 CandidateTys.end());
2252 } else {
2253// The only way to have the same element type in every vector type is to
2254// have the same vector type. Check that and remove all but one.
2255#ifndef NDEBUG
2256 for (VectorType *VTy : CandidateTys) {
2257 assert(VTy->getElementType() == CommonEltTy &&
2258 "Unaccounted for element type!");
2259 assert(VTy == CandidateTys[0] &&
2260 "Different vector types with the same element type!");
2261 }
2262#endif
2263 CandidateTys.resize(1);
2264 }
2265
2266 // FIXME: hack. Do we have a named constant for this?
2267 // SDAG SDNode can't have more than 65535 operands.
2268 llvm::erase_if(CandidateTys, [](VectorType *VTy) {
2269 return cast<FixedVectorType>(VTy)->getNumElements() >
2270 std::numeric_limits<unsigned short>::max();
2271 });
2272
2273 for (VectorType *VTy : CandidateTys)
2274 if (checkVectorTypeForPromotion(P, VTy, DL, VScale))
2275 return VTy;
2276
2277 return nullptr;
2278}
2279
2280static VectorType *createAndCheckVectorTypesForPromotion(
2281 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2282 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2283 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2284 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2285 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2286 [[maybe_unused]] VectorType *OriginalElt =
2287 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2288 // Consider additional vector types where the element type size is a
2289 // multiple of load/store element size.
2290 for (Type *Ty : OtherTys) {
2291 if (!VectorType::isValidElementType(Ty))
2292 continue;
2293 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2294 // Make a copy of CandidateTys and iterate through it, because we
2295 // might append to CandidateTys in the loop.
2296 for (VectorType *const VTy : CandidateTysCopy) {
2297 // The elements in the copy should remain invariant throughout the loop
2298 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2299 unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
2300 unsigned ElementSize =
2301 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2302 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2303 VectorSize % TypeSize == 0) {
2304 VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
2305 CheckCandidateType(NewVTy);
2306 }
2307 }
2308 }
2309
2310 return checkVectorTypesForPromotion(
2311 P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2312 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2313}
2314
2315/// Test whether the given alloca partitioning and range of slices can be
2316/// promoted to a vector.
2317///
2318/// This is a quick test to check whether we can rewrite a particular alloca
2319/// partition (and its newly formed alloca) into a vector alloca with only
2320/// whole-vector loads and stores such that it could be promoted to a vector
2321/// SSA value. We only can ensure this for a limited set of operations, and we
2322/// don't want to do the rewrites unless we are confident that the result will
2323/// be promotable, so we have an early test here.
2324static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2325 unsigned VScale) {
2326 // Collect the candidate types for vector-based promotion. Also track whether
2327 // we have different element types.
2328 SmallVector<VectorType *, 4> CandidateTys;
2329 SetVector<Type *> LoadStoreTys;
2330 SetVector<Type *> DeferredTys;
2331 Type *CommonEltTy = nullptr;
2332 VectorType *CommonVecPtrTy = nullptr;
2333 bool HaveVecPtrTy = false;
2334 bool HaveCommonEltTy = true;
2335 bool HaveCommonVecPtrTy = true;
2336 auto CheckCandidateType = [&](Type *Ty) {
2337 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2338 // If this type's total size in bits differs from the first candidate's, drop all candidates.
2339 if (!CandidateTys.empty()) {
2340 VectorType *V = CandidateTys[0];
2341 if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
2342 DL.getTypeSizeInBits(V).getFixedValue()) {
2343 CandidateTys.clear();
2344 return;
2345 }
2346 }
2347 CandidateTys.push_back(VTy);
2348 Type *EltTy = VTy->getElementType();
2349
2350 if (!CommonEltTy)
2351 CommonEltTy = EltTy;
2352 else if (CommonEltTy != EltTy)
2353 HaveCommonEltTy = false;
2354
2355 if (EltTy->isPointerTy()) {
2356 HaveVecPtrTy = true;
2357 if (!CommonVecPtrTy)
2358 CommonVecPtrTy = VTy;
2359 else if (CommonVecPtrTy != VTy)
2360 HaveCommonVecPtrTy = false;
2361 }
2362 }
2363 };
2364
2365 // Put load and store types into a set for de-duplication.
2366 for (const Slice &S : P) {
2367 Type *Ty;
2368 if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
2369 Ty = LI->getType();
2370 else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
2371 Ty = SI->getValueOperand()->getType();
2372 else
2373 continue;
2374
2375 auto CandTy = Ty->getScalarType();
2376 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2377 S.endOffset() != P.endOffset())) {
2378 DeferredTys.insert(Ty);
2379 continue;
2380 }
2381
2382 LoadStoreTys.insert(Ty);
2383 // Consider any loads or stores that are the exact size of the slice.
2384 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2385 CheckCandidateType(Ty);
2386 }
2387
2388 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2389 if (VectorType *VTy = createAndCheckVectorTypesForPromotion(
2390 LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2391 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2392 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2393 return VTy;
2394
2395 CandidateTys.clear();
2396 return createAndCheckVectorTypesForPromotion(
2397 DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2398 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2399 CommonVecPtrTy, VScale);
2400}
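// Illustrative example: a 16-byte partition whose only users are
// whole-partition `load <4 x float>`/`store <4 x float>` plus element-sized
// `load float`/`store float` slices is viable, and <4 x float> is returned
// as the promotion type; a volatile access or a load/store of a struct
// anywhere in the partition makes it non-viable.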
2401
2402/// Test whether a slice of an alloca is valid for integer widening.
2403///
2404/// This implements the necessary checking for the \c isIntegerWideningViable
2405/// test below on a single slice of the alloca.
2406static bool isIntegerWideningViableForSlice(const Slice &S,
2407 uint64_t AllocBeginOffset,
2408 Type *AllocaTy,
2409 const DataLayout &DL,
2410 bool &WholeAllocaOp) {
2411 uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();
2412
2413 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2414 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2415
2416 Use *U = S.getUse();
2417
2418 // Lifetime intrinsics operate over the whole alloca, whose size is usually
2419 // larger than other load/store slices (RelEnd > Size). But lifetime intrinsics
2420 // are always promotable and should not impact the promotability of the
2421 // partition's other slices.
2422 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2423 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2424 return true;
2425 }
2426
2427 // We can't reasonably handle cases where the load or store extends past
2428 // the end of the alloca's type and into its padding.
2429 if (RelEnd > Size)
2430 return false;
2431
2432 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2433 if (LI->isVolatile())
2434 return false;
2435 // We can't handle loads that extend past the allocated memory.
2436 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
2437 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2438 return false;
2439 // So far, AllocaSliceRewriter does not support widening split slice tails
2440 // in rewriteIntegerLoad.
2441 if (S.beginOffset() < AllocBeginOffset)
2442 return false;
2443 // Note that we don't count vector loads or stores as whole-alloca
2444 // operations which enable integer widening because we would prefer to use
2445 // vector widening instead.
2446 if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2447 WholeAllocaOp = true;
2448 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2449 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2450 return false;
2451 } else if (RelBegin != 0 || RelEnd != Size ||
2452 !canConvertValue(DL, AllocaTy, LI->getType())) {
2453 // Non-integer loads need to be convertible from the alloca type so that
2454 // they are promotable.
2455 return false;
2456 }
2457 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2458 Type *ValueTy = SI->getValueOperand()->getType();
2459 if (SI->isVolatile())
2460 return false;
2461 // We can't handle stores that extend past the allocated memory.
2462 TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
2463 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2464 return false;
2465 // So far, AllocaSliceRewriter does not support widening split slice tails
2466 // in rewriteIntegerStore.
2467 if (S.beginOffset() < AllocBeginOffset)
2468 return false;
2469 // Note that we don't count vector loads or stores as whole-alloca
2470 // operations which enable integer widening because we would prefer to use
2471 // vector widening instead.
2472 if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2473 WholeAllocaOp = true;
2474 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2475 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2476 return false;
2477 } else if (RelBegin != 0 || RelEnd != Size ||
2478 !canConvertValue(DL, ValueTy, AllocaTy)) {
2479 // Non-integer stores need to be convertible to the alloca type so that
2480 // they are promotable.
2481 return false;
2482 }
2483 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2484 if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2485 return false;
2486 if (!S.isSplittable())
2487 return false; // Skip any unsplittable intrinsics.
2488 } else {
2489 return false;
2490 }
2491
2492 return true;
2493}
2494
2495/// Test whether the given alloca partition's integer operations can be
2496/// widened to promotable ones.
2497///
2498/// This is a quick test to check whether we can rewrite the integer loads and
2499/// stores to a particular alloca into wider loads and stores and be able to
2500/// promote the resulting alloca.
2501static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2502 const DataLayout &DL) {
2503 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
2504 // Don't create integer types larger than the maximum bitwidth.
2505 if (SizeInBits > IntegerType::MAX_INT_BITS)
2506 return false;
2507
2508 // Don't try to handle allocas with bit-padding.
2509 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
2510 return false;
2511
2512 // We need to ensure that an integer type with the appropriate bitwidth can
2513 // be converted to the alloca type, whatever that is. We don't want to force
2514 // the alloca itself to have an integer type if there is a more suitable one.
2515 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2516 if (!canConvertValue(DL, AllocaTy, IntTy) ||
2517 !canConvertValue(DL, IntTy, AllocaTy))
2518 return false;
2519
2520 // While examining uses, we ensure that the alloca has a covering load or
2521 // store. We don't want to widen the integer operations only to fail to
2522 // promote due to some other unsplittable entry (which we may make splittable
2523 // later). However, if there are only splittable uses, go ahead and assume
2524 // that we cover the alloca.
2525 // FIXME: We shouldn't consider split slices that happen to start in the
2526 // partition here...
2527 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2528
2529 for (const Slice &S : P)
2530 if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2531 WholeAllocaOp))
2532 return false;
2533
2534 for (const Slice *S : P.splitSliceTails())
2535 if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2536 WholeAllocaOp))
2537 return false;
2538
2539 return WholeAllocaOp;
2540}
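// Illustrative example: an i64 alloca written by a covering `store i64` and
// read back through `load i32` slices at offsets 0 and 4 is viable; the
// partial loads are later rewritten as shift/trunc of the widened i64 value.
// If the partition only ever sees the two i32 halves and no covering access,
// there is no whole-alloca operation and widening is rejected.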
2541
2542static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2543 IntegerType *Ty, uint64_t Offset,
2544 const Twine &Name) {
2545 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2546 IntegerType *IntTy = cast<IntegerType>(V->getType());
2547 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2548 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2549 "Element extends past full value");
2550 uint64_t ShAmt = 8 * Offset;
2551 if (DL.isBigEndian())
2552 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2553 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2554 if (ShAmt) {
2555 V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2556 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2557 }
2558 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2559 "Cannot extract to a larger integer!");
2560 if (Ty != IntTy) {
2561 V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2562 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2563 }
2564 return V;
2565}
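// For example, extracting an i16 at byte offset 2 from an i64 on a
// little-endian target emits roughly:
//   %x.shift = lshr i64 %val, 16
//   %x.trunc = trunc i64 %x.shift to i16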
2566
2567static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2568 Value *V, uint64_t Offset, const Twine &Name) {
2569 IntegerType *IntTy = cast<IntegerType>(Old->getType());
2570 IntegerType *Ty = cast<IntegerType>(V->getType());
2571 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2572 "Cannot insert a larger integer!");
2573 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2574 if (Ty != IntTy) {
2575 V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2576 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2577 }
2578 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2579 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2580 "Element store outside of alloca store");
2581 uint64_t ShAmt = 8 * Offset;
2582 if (DL.isBigEndian())
2583 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2584 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2585 if (ShAmt) {
2586 V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2587 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2588 }
2589
2590 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2591 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2592 Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2593 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2594 V = IRB.CreateOr(Old, V, Name + ".insert");
2595 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2596 }
2597 return V;
2598}
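// For example, inserting an i16 at byte offset 2 into an i64 on a
// little-endian target emits roughly:
//   %x.ext    = zext i16 %v to i64
//   %x.shift  = shl i64 %x.ext, 16
//   %x.mask   = and i64 %old, -4294901761 ; clears bits 16..31
//   %x.insert = or i64 %x.mask, %x.shift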
2599
2600static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2601 unsigned EndIndex, const Twine &Name) {
2602 auto *VecTy = cast<FixedVectorType>(V->getType());
2603 unsigned NumElements = EndIndex - BeginIndex;
2604 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2605
2606 if (NumElements == VecTy->getNumElements())
2607 return V;
2608
2609 if (NumElements == 1) {
2610 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2611 Name + ".extract");
2612 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2613 return V;
2614 }
2615
2616 auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2617 V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2618 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2619 return V;
2620}
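// For example, extracting elements [1, 3) of a <4 x i32> emits roughly:
//   %x.extract = shufflevector <4 x i32> %v, <4 x i32> poison,
//                              <2 x i32> <i32 1, i32 2>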
2621
2622static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2623 unsigned BeginIndex, const Twine &Name) {
2624 VectorType *VecTy = cast<VectorType>(Old->getType());
2625 assert(VecTy && "Can only insert a vector into a vector");
2626
2627 VectorType *Ty = dyn_cast<VectorType>(V->getType());
2628 if (!Ty) {
2629 // Single element to insert.
2630 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2631 Name + ".insert");
2632 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2633 return V;
2634 }
2635
2638 "Too many elements!");
2641 assert(V->getType() == VecTy && "Vector type mismatch");
2642 return V;
2643 }
2644 unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2645
2646 // When inserting a smaller vector into the larger to store, we first
2647 // use a shuffle vector to widen it with undef elements, and then
2648 // a second shuffle vector to select between the loaded vector and the
2649 // incoming vector.
2650 SmallVector<int, 8> Mask;
2651 Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2652 for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2653 if (i >= BeginIndex && i < EndIndex)
2654 Mask.push_back(i - BeginIndex);
2655 else
2656 Mask.push_back(-1);
2657 V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2658 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2659
2660 SmallVector<Constant *, 8> Mask2;
2661 Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2662 for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2663 Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2664
2665 V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
2666
2667 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2668 return V;
2669}
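// For example, inserting a <2 x i32> at element 1 of a <4 x i32> first
// widens the small vector with a shufflevector (mask <-1, 0, 1, -1>) and
// then emits a vector select whose i1 mask picks elements 1 and 2 from the
// widened value and the remaining elements from the old value.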
2670
2671/// This function takes two vector values and combines them into a single vector
2672/// by concatenating their elements. The function handles:
2673///
2674/// 1. Element type mismatch: If either vector's element type differs from
2675/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while
2676/// preserving the total bit width (adjusting the number of elements
2677/// accordingly).
2678///
2679/// 2. Size mismatch: After transforming the vectors to have the desired element
2680/// type, if the two vectors have different numbers of elements, the smaller
2681/// vector is extended with poison values to match the size of the larger
2682/// vector before concatenation.
2683///
2684/// 3. Concatenation: The vectors are merged using a shuffle operation that
2685/// places all elements of V0 first, followed by all elements of V1.
2686///
2687/// \param V0 The first vector to merge (must be a vector type)
2688/// \param V1 The second vector to merge (must be a vector type)
2689/// \param DL The data layout for size calculations
2690/// \param NewAIEltTy The desired element type for the result vector
2691/// \param Builder IRBuilder for creating new instructions
2692/// \return A new vector containing all elements from V0 followed by all
2693/// elements from V1
2695 Type *NewAIEltTy, IRBuilder<> &Builder) {
2696 // V0 and V1 are vectors
2697 // Create a new vector type with combined elements
2698 // Use ShuffleVector to concatenate the vectors
2699 auto *VecType0 = cast<FixedVectorType>(V0->getType());
2700 auto *VecType1 = cast<FixedVectorType>(V1->getType());
2701
2702 // If V0/V1 element types are different from NewAllocaElementType,
2703 // we need to introduce bitcasts before merging them
2704 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2705 const char *DebugName) {
2706 Type *EltType = VecType->getElementType();
2707 if (EltType != NewAIEltTy) {
2708 // Calculate new number of elements to maintain same bit width
2709 unsigned TotalBits =
2710 VecType->getNumElements() * DL.getTypeSizeInBits(EltType);
2711 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
2712
2713 auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts);
2714 V = Builder.CreateBitCast(V, NewVecType);
2715 VecType = NewVecType;
2716 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2717 }
2718 };
2719
2720 BitcastIfNeeded(V0, VecType0, "V0");
2721 BitcastIfNeeded(V1, VecType1, "V1");
2722
2723 unsigned NumElts0 = VecType0->getNumElements();
2724 unsigned NumElts1 = VecType1->getNumElements();
2725
2726 SmallVector<int, 16> ShuffleMask;
2727
2728 if (NumElts0 == NumElts1) {
2729 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2730 ShuffleMask.push_back(i);
2731 } else {
2732 // If two vectors have different sizes, we need to extend
2733 // the smaller vector to the size of the larger vector.
2734 unsigned SmallSize = std::min(NumElts0, NumElts1);
2735 unsigned LargeSize = std::max(NumElts0, NumElts1);
2736 bool IsV0Smaller = NumElts0 < NumElts1;
2737 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2738 SmallVector<int, 16> ExtendMask;
2739 for (unsigned i = 0; i < SmallSize; ++i)
2740 ExtendMask.push_back(i);
2741 for (unsigned i = SmallSize; i < LargeSize; ++i)
2742 ExtendMask.push_back(PoisonMaskElem);
2743 ExtendedVec = Builder.CreateShuffleVector(
2744 ExtendedVec, PoisonValue::get(ExtendedVec->getType()), ExtendMask);
2745 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2746 for (unsigned i = 0; i < NumElts0; ++i)
2747 ShuffleMask.push_back(i);
2748 for (unsigned i = 0; i < NumElts1; ++i)
2749 ShuffleMask.push_back(LargeSize + i);
2750 }
2751
2752 return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2753}
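// Illustrative example: merging a <2 x float> V0 with a <4 x i32> V1 when
// NewAIEltTy is float first bitcasts V1 to <4 x float>, pads V0 with poison
// up to <4 x float>, and then emits a shufflevector with mask
// <0, 1, 4, 5, 6, 7>, yielding a <6 x float> whose elements are V0 followed
// by V1.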
2754
2755namespace {
2756
2757/// Visitor to rewrite instructions using a particular slice of an alloca
2758/// to use a new alloca.
2759///
2760/// Also implements the rewriting to vector-based accesses when the partition
2761/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2762/// lives here.
2763class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2764 // Befriend the base class so it can delegate to private visit methods.
2765 friend class InstVisitor<AllocaSliceRewriter, bool>;
2766
2767 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2768
2769 const DataLayout &DL;
2770 AllocaSlices &AS;
2771 SROA &Pass;
2772 AllocaInst &OldAI, &NewAI;
2773 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2774 Type *NewAllocaTy;
2775
2776 // This is a convenience and flag variable that will be null unless the new
2777 // alloca's integer operations should be widened to this integer type due to
2778 // passing isIntegerWideningViable above. If it is non-null, the desired
2779 // integer type will be stored here for easy access during rewriting.
2780 IntegerType *IntTy;
2781
2782 // If we are rewriting an alloca partition which can be written as pure
2783 // vector operations, we stash extra information here. When VecTy is
2784 // non-null, we have some strict guarantees about the rewritten alloca:
2785 // - The new alloca is exactly the size of the vector type here.
2786 // - The accesses all either map to the entire vector or to a single
2787 // element.
2788 // - The set of accessing instructions is only one of those handled above
2789 // in isVectorPromotionViable. Generally these are the same access kinds
2790 // which are promotable via mem2reg.
2791 VectorType *VecTy;
2792 Type *ElementTy;
2793 uint64_t ElementSize;
2794
2795 // The original offset of the slice currently being rewritten relative to
2796 // the original alloca.
2797 uint64_t BeginOffset = 0;
2798 uint64_t EndOffset = 0;
2799
2800 // The new offsets of the slice currently being rewritten relative to the
2801 // original alloca.
2802 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2803
2804 uint64_t SliceSize = 0;
2805 bool IsSplittable = false;
2806 bool IsSplit = false;
2807 Use *OldUse = nullptr;
2808 Instruction *OldPtr = nullptr;
2809
2810 // Track post-rewrite users which are PHI nodes and Selects.
2811 SmallSetVector<PHINode *, 8> &PHIUsers;
2812 SmallSetVector<SelectInst *, 8> &SelectUsers;
2813
2814 // Utility IR builder, whose name prefix is set up for each visited use, and
2815 // the insertion point is set to point to the user.
2816 IRBuilderTy IRB;
2817
2818 // Return the new alloca, addrspacecasted if required to avoid changing the
2819 // addrspace of a volatile access.
2820 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2821 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2822 return &NewAI;
2823
2824 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2825 return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
2826 }
2827
2828public:
2829 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2830 AllocaInst &OldAI, AllocaInst &NewAI,
2831 uint64_t NewAllocaBeginOffset,
2832 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2833 VectorType *PromotableVecTy,
2834 SmallSetVector<PHINode *, 8> &PHIUsers,
2835 SmallSetVector<SelectInst *, 8> &SelectUsers)
2836 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2837 NewAllocaBeginOffset(NewAllocaBeginOffset),
2838 NewAllocaEndOffset(NewAllocaEndOffset),
2839 NewAllocaTy(NewAI.getAllocatedType()),
2840 IntTy(
2841 IsIntegerPromotable
2842 ? Type::getIntNTy(NewAI.getContext(),
2843 DL.getTypeSizeInBits(NewAI.getAllocatedType())
2844 .getFixedValue())
2845 : nullptr),
2846 VecTy(PromotableVecTy),
2847 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2848 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
2849 : 0),
2850 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2851 IRB(NewAI.getContext(), ConstantFolder()) {
2852 if (VecTy) {
2853 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2854 "Only multiple-of-8 sized vector elements are viable");
2855 ++NumVectorized;
2856 }
2857 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2858 }
2859
2860 bool visit(AllocaSlices::const_iterator I) {
2861 bool CanSROA = true;
2862 BeginOffset = I->beginOffset();
2863 EndOffset = I->endOffset();
2864 IsSplittable = I->isSplittable();
2865 IsSplit =
2866 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2867 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2868 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2869 LLVM_DEBUG(dbgs() << "\n");
2870
2871 // Compute the intersecting offset range.
2872 assert(BeginOffset < NewAllocaEndOffset);
2873 assert(EndOffset > NewAllocaBeginOffset);
2874 NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2875 NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2876
2877 SliceSize = NewEndOffset - NewBeginOffset;
2878 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2879 << ") NewBegin:(" << NewBeginOffset << ", "
2880 << NewEndOffset << ") NewAllocaBegin:("
2881 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2882 << ")\n");
2883 assert(IsSplit || NewBeginOffset == BeginOffset);
2884 OldUse = I->getUse();
2885 OldPtr = cast<Instruction>(OldUse->get());
2886
2887 Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2888 IRB.SetInsertPoint(OldUserI);
2889 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2890 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2891 Twine(BeginOffset) + ".");
2892
2893 CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2894 if (VecTy || IntTy)
2895 assert(CanSROA);
2896 return CanSROA;
2897 }
2898
2899 /// Attempts to rewrite a partition using tree-structured merge optimization.
2900 ///
2901 /// This function analyzes a partition to determine if it can be optimized
2902 /// using a tree-structured merge pattern, where multiple non-overlapping
2903 /// stores completely fill an alloca and there is no load from the alloca in
2904 /// the middle of the stores. Such patterns can be optimized by eliminating
2905 /// the intermediate stores and directly constructing the final vector by
2906 /// using shufflevectors.
2907 ///
2908 /// Example transformation:
2909 /// Before: (stores do not have to be in order)
2910 /// %alloca = alloca <8 x float>
2911 /// store <2 x float> %val0, ptr %alloca ; offset 0-1
2912 /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2913 /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2914 /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2915 ///
2916 /// After:
2917 /// %alloca = alloca <8 x float>
2918 /// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
2919 /// i32 3>
2920 /// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
2921 /// i32 3>
2922 /// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
2923 /// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2924 /// store %shuffle2, ptr %alloca
2925 ///
2926 /// The optimization looks for partitions that:
2927 /// 1. Have no overlapping split slice tails
2928 /// 2. Contain non-overlapping stores that cover the entire alloca
2929 /// 3. Have exactly one load, which reads the complete alloca structure and
2930 /// does not occur in the middle of the stores (TODO: maybe we can relax
2931 /// the constraint about reading the entire alloca structure)
2932 ///
2933 /// \param P The partition to analyze and potentially rewrite
2934 /// \return An optional vector of values that were deleted during the rewrite
2935 /// process, or std::nullopt if the partition cannot be optimized
2936 /// using tree-structured merge
2937 std::optional<SmallVector<Value *, 4>>
2938 rewriteTreeStructuredMerge(Partition &P) {
2939 // No tail slices that overlap with the partition
2940 if (P.splitSliceTails().size() > 0)
2941 return std::nullopt;
2942
2943 SmallVector<Value *, 4> DeletedValues;
2944 LoadInst *TheLoad = nullptr;
2945
2946 // Structure to hold store information
2947 struct StoreInfo {
2948 StoreInst *Store;
2949 uint64_t BeginOffset;
2950 uint64_t EndOffset;
2951 Value *StoredValue;
2952 StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
2953 : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
2954 };
2955
2956 SmallVector<StoreInfo, 4> StoreInfos;
2957
2958 // If the new alloca is a fixed vector type, we use its element type as the
2959 // allocated element type; otherwise we use i8 as the allocated element type.
2960 Type *AllocatedEltTy =
2961 isa<FixedVectorType>(NewAI.getAllocatedType())
2962 ? cast<FixedVectorType>(NewAI.getAllocatedType())->getElementType()
2963 : Type::getInt8Ty(NewAI.getContext());
2964
2965 // Helper to check if a type is
2966 // 1. A fixed vector type
2967 // 2. The element type is not a pointer
2968 // 3. The element type size is byte-aligned
2969 // We only handle loads/stores that meet these conditions
2970 auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
2971 auto *FixedVecTy = dyn_cast<FixedVectorType>(Ty);
2972 return FixedVecTy &&
2973 DL.getTypeSizeInBits(FixedVecTy->getElementType()) % 8 == 0 &&
2974 !FixedVecTy->getElementType()->isPointerTy();
2975 };
2976
2977 for (Slice &S : P) {
2978 auto *User = cast<Instruction>(S.getUse()->getUser());
2979 if (auto *LI = dyn_cast<LoadInst>(User)) {
2980 // Do not handle the case if
2981 // 1. There is more than one load
2982 // 2. The load is volatile
2983 // 3. The load does not read the entire alloca structure
2984 // 4. The load does not meet the conditions in the helper function
2985 if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) ||
2986 S.beginOffset() != NewAllocaBeginOffset ||
2987 S.endOffset() != NewAllocaEndOffset || LI->isVolatile())
2988 return std::nullopt;
2989 TheLoad = LI;
2990 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
2991 // Do not handle the case if
2992 // 1. The store does not meet the conditions in the helper function
2993 // 2. The store is volatile
2994 if (!IsTypeValidForTreeStructuredMerge(
2995 SI->getValueOperand()->getType()) ||
2996 SI->isVolatile())
2997 return std::nullopt;
2998 StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
2999 SI->getValueOperand());
3000 } else {
3001 // If we have instructions other than load and store, we cannot do the
3002 // tree structured merge
3003 return std::nullopt;
3004 }
3005 }
3006 // If we do not have any load, we cannot do the tree structured merge
3007 if (!TheLoad)
3008 return std::nullopt;
3009
3010 // If we do not have multiple stores, we cannot do the tree structured merge
3011 if (StoreInfos.size() < 2)
3012 return std::nullopt;
3013
3014 // Stores should not overlap and should cover the whole alloca
3015 // Sort by begin offset
3016 llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
3017 return A.BeginOffset < B.BeginOffset;
3018 });
3019
3020 // Check for overlaps and coverage
3021 uint64_t ExpectedStart = NewAllocaBeginOffset;
3022 for (auto &StoreInfo : StoreInfos) {
3023 uint64_t BeginOff = StoreInfo.BeginOffset;
3024 uint64_t EndOff = StoreInfo.EndOffset;
3025
3026 // Check for gap or overlap
3027 if (BeginOff != ExpectedStart)
3028 return std::nullopt;
3029
3030 ExpectedStart = EndOff;
3031 }
3032 // Check that stores cover the entire alloca
3033 if (ExpectedStart != NewAllocaEndOffset)
3034 return std::nullopt;
3035
3036 // Stores should be in the same basic block
3037 // The load should not be in the middle of the stores
3038 // Note:
3039 // If the load is in a different basic block with the stores, we can still
3040 // do the tree structured merge. This is because we do not have the
3041 // store->load forwarding here. The merged vector will be stored back to
3042 // NewAI and the new load will load from NewAI. The forwarding will be
3043 // handled later when we try to promote NewAI.
3044 BasicBlock *LoadBB = TheLoad->getParent();
3045 BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
3046
3047 for (auto &StoreInfo : StoreInfos) {
3048 if (StoreInfo.Store->getParent() != StoreBB)
3049 return std::nullopt;
3050 if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(TheLoad))
3051 return std::nullopt;
3052 }
3053
3054 // If we reach here, the partition can be merged with a tree structured
3055 // merge
3056 LLVM_DEBUG({
3057 dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
3058 << "\n Ordered stores:\n";
3059 for (auto [i, Info] : enumerate(StoreInfos))
3060 dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
3061 << Info.EndOffset << ") \tStore: " << *Info.Store
3062 << "\tValue: " << *Info.StoredValue << "\n";
3063 });
3064
3065 // Instead of having these stores, we merge all the stored values into a
3066 // vector and store the merged value into the alloca
3067 std::queue<Value *> VecElements;
3068 IRBuilder<> Builder(StoreInfos.back().Store);
3069 for (const auto &Info : StoreInfos) {
3070 DeletedValues.push_back(Info.Store);
3071 VecElements.push(Info.StoredValue);
3072 }
3073
3074 LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
3075 while (VecElements.size() > 1) {
3076 const auto NumElts = VecElements.size();
3077 for ([[maybe_unused]] const auto _ : llvm::seq(NumElts / 2)) {
3078 Value *V0 = VecElements.front();
3079 VecElements.pop();
3080 Value *V1 = VecElements.front();
3081 VecElements.pop();
3082 Value *Merged = mergeTwoVectors(V0, V1, DL, AllocatedEltTy, Builder);
3083 LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
3084 VecElements.push(Merged);
3085 }
3086 if (NumElts % 2 == 1) {
3087 Value *V = VecElements.front();
3088 VecElements.pop();
3089 VecElements.push(V);
3090 }
3091 }
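// Illustrative trace of the pairwise reduction above (values and widths are
// assumed, not taken from a particular test case): with a queue of
// {%val0, %val1, %val2, %val3}, each <2 x float>, the first pass emits
//   %m0 = shufflevector %val0, %val1   -> <4 x float>
//   %m1 = shufflevector %val2, %val3   -> <4 x float>
// and the second pass emits
//   %m2 = shufflevector %m0, %m1       -> <8 x float>
// With an odd element count, the unpaired value is simply re-queued and
// merged in a later pass.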
3092
3093 // Store the merged value into the alloca
3094 Value *MergedValue = VecElements.front();
3095 Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());
3096
3097 IRBuilder<> LoadBuilder(TheLoad);
3098 TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
3099 TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
3100 TheLoad->getName() + ".sroa.new.load"));
3101 DeletedValues.push_back(TheLoad);
3102
3103 return DeletedValues;
3104 }
3105
3106private:
3107 // Make sure the other visit overloads are visible.
3108 using Base::visit;
3109
3110 // Every instruction which can end up as a user must have a rewrite rule.
3111 bool visitInstruction(Instruction &I) {
3112 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
3113 llvm_unreachable("No rewrite rule for this instruction!");
3114 }
3115
3116 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3117 // Note that the offset computation can use BeginOffset or NewBeginOffset
3118 // interchangeably for unsplit slices.
3119 assert(IsSplit || BeginOffset == NewBeginOffset);
3120 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3121
3122#ifndef NDEBUG
3123 StringRef OldName = OldPtr->getName();
3124 // Skip through the last '.sroa.' component of the name.
3125 size_t LastSROAPrefix = OldName.rfind(".sroa.");
3126 if (LastSROAPrefix != StringRef::npos) {
3127 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
3128 // Look for an SROA slice index.
3129 size_t IndexEnd = OldName.find_first_not_of("0123456789");
3130 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3131 // Strip the index and look for the offset.
3132 OldName = OldName.substr(IndexEnd + 1);
3133 size_t OffsetEnd = OldName.find_first_not_of("0123456789");
3134 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3135 // Strip the offset.
3136 OldName = OldName.substr(OffsetEnd + 1);
3137 }
3138 }
3139 // Strip any SROA suffixes as well.
3140 OldName = OldName.substr(0, OldName.find(".sroa_"));
3141#endif
3142
3143 return getAdjustedPtr(IRB, DL, &NewAI,
3144 APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
3145 PointerTy,
3146#ifndef NDEBUG
3147 Twine(OldName) + "."
3148#else
3149 Twine()
3150#endif
3151 );
3152 }
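// Example of the NDEBUG-only name handling above (names are hypothetical): an
// old pointer named "x.sroa.3.16.foo" has its trailing ".sroa.<index>.<offset>."
// components stripped, so the new slice pointer name is derived from "foo"
// rather than accumulating ever-longer ".sroa." prefixes.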
3153
3154 /// Compute suitable alignment to access this slice of the *new*
3155 /// alloca.
3156 ///
3157 /// The returned alignment is the common alignment of the new alloca's
3158 /// alignment and the slice's begin offset within it.
3159 Align getSliceAlign() {
3160 return commonAlignment(NewAI.getAlign(),
3161 NewBeginOffset - NewAllocaBeginOffset);
3162 }
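// Worked example (assumed values): if the new alloca is 16-byte aligned and
// this slice starts 8 bytes into it, commonAlignment(Align(16), 8) yields
// Align(8); a slice starting at offset 4 would yield Align(4).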
3163
3164 unsigned getIndex(uint64_t Offset) {
3165 assert(VecTy && "Can only call getIndex when rewriting a vector");
3166 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3167 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3168 uint32_t Index = RelOffset / ElementSize;
3169 assert(Index * ElementSize == RelOffset);
3170 return Index;
3171 }
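// Worked example (assumed values): for a <4 x i32> slice with ElementSize == 4
// and NewAllocaBeginOffset == 0, an access at Offset == 8 maps to vector index
// 2; the assert guarantees the offset is element-aligned.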
3172
3173 void deleteIfTriviallyDead(Value *V) {
3174 Instruction *I = cast<Instruction>(V);
3175 if (isInstructionTriviallyDead(I))
3176 Pass.DeadInsts.push_back(I);
3177 }
3178
3179 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3180 unsigned BeginIndex = getIndex(NewBeginOffset);
3181 unsigned EndIndex = getIndex(NewEndOffset);
3182 assert(EndIndex > BeginIndex && "Empty vector!");
3183
3184 LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3185 NewAI.getAlign(), "load");
3186
3187 Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3188 LLVMContext::MD_access_group});
3189 return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
3190 }
3191
3192 Value *rewriteIntegerLoad(LoadInst &LI) {
3193 assert(IntTy && "We cannot insert an integer to the alloca");
3194 assert(!LI.isVolatile());
3195 Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3196 NewAI.getAlign(), "load");
3197 V = convertValue(DL, IRB, V, IntTy);
3198 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3199 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3200 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
3201 IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
3202 V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
3203 }
3204 // It is possible that the extracted type is not the load type. This
3205 // happens if there is a load past the end of the alloca, and as
3206 // a consequence the slice is narrower but still a candidate for integer
3207 // lowering. To handle this case, we just zero extend the extracted
3208 // integer.
3209 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
3210 "Can only handle an extract for an overly wide load");
3211 if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
3212 V = IRB.CreateZExt(V, LI.getType());
3213 return V;
3214 }
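// Worked example of the zero-extension above (assumed sizes): a 2-byte slice
// (SliceSize == 2) read by an i32 load yields an i16 extract, which is then
// zext'd to i32 so the rewritten value matches the original load's type.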
3215
3216 bool visitLoadInst(LoadInst &LI) {
3217 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3218 Value *OldOp = LI.getOperand(0);
3219 assert(OldOp == OldPtr);
3220
3221 AAMDNodes AATags = LI.getAAMetadata();
3222
3223 unsigned AS = LI.getPointerAddressSpace();
3224
3225 Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
3226 : LI.getType();
3227 bool IsPtrAdjusted = false;
3228 Value *V;
3229 if (VecTy) {
3230 V = rewriteVectorizedLoadInst(LI);
3231 } else if (IntTy && LI.getType()->isIntegerTy()) {
3232 V = rewriteIntegerLoad(LI);
3233 } else if (NewBeginOffset == NewAllocaBeginOffset &&
3234 NewEndOffset == NewAllocaEndOffset &&
3235 (canConvertValue(DL, NewAllocaTy, TargetTy) ||
3236 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
3237 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
3238 !LI.isVolatile()))) {
3239 Value *NewPtr =
3240 getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
3241 LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr,
3242 NewAI.getAlign(), LI.isVolatile(),
3243 LI.getName());
3244 if (LI.isVolatile())
3245 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3246 if (NewLI->isAtomic())
3247 NewLI->setAlignment(LI.getAlign());
3248
3249 // Copy any metadata that is valid for the new load. This may require
3250 // conversion to a different kind of metadata, e.g. !nonnull might change
3251 // to !range or vice versa.
3252 copyMetadataForLoad(*NewLI, LI);
3253
3254 // Do this after copyMetadataForLoad() to preserve the TBAA shift.
3255 if (AATags)
3256 NewLI->setAAMetadata(AATags.adjustForAccess(
3257 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3258
3259 // Try to preserve nonnull metadata
3260 V = NewLI;
3261
3262 // If this is an integer load past the end of the slice (which means the
3263 // bytes outside the slice are undef or this load is dead) just forcibly
3264 // fix the integer size with correct handling of endianness.
3265 if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
3266 if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
3267 if (AITy->getBitWidth() < TITy->getBitWidth()) {
3268 V = IRB.CreateZExt(V, TITy, "load.ext");
3269 if (DL.isBigEndian())
3270 V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
3271 "endian_shift");
3272 }
3273 } else {
3274 Type *LTy = IRB.getPtrTy(AS);
3275 LoadInst *NewLI =
3276 IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
3277 getSliceAlign(), LI.isVolatile(), LI.getName());
3278
3279 if (AATags)
3280 NewLI->setAAMetadata(AATags.adjustForAccess(
3281 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3282
3283 if (LI.isVolatile())
3284 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3285 NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3286 LLVMContext::MD_access_group});
3287
3288 V = NewLI;
3289 IsPtrAdjusted = true;
3290 }
3291 V = convertValue(DL, IRB, V, TargetTy);
3292
3293 if (IsSplit) {
3294 assert(!LI.isVolatile());
3295 assert(LI.getType()->isIntegerTy() &&
3296 "Only integer type loads and stores are split");
3297 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3298 "Split load isn't smaller than original load");
3299 assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3300 "Non-byte-multiple bit width");
3301 // Move the insertion point just past the load so that we can refer to it.
3302 BasicBlock::iterator LIIt = std::next(LI.getIterator());
3303 // Ensure the insertion point comes before any debug-info immediately
3304 // after the load, so that variable values referring to the load are
3305 // dominated by it.
3306 LIIt.setHeadBit(true);
3307 IRB.SetInsertPoint(LI.getParent(), LIIt);
3308 // Create a placeholder value with the same type as LI to use as the
3309 // basis for the new value. This allows us to replace the uses of LI with
3310 // the computed value, and then replace the placeholder with LI, leaving
3311 // LI only used for this computation.
3312 Value *Placeholder =
3313 new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
3314 false, Align(1));
3315 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
3316 "insert");
3317 LI.replaceAllUsesWith(V);
3318 Placeholder->replaceAllUsesWith(&LI);
3319 Placeholder->deleteValue();
3320 } else {
3321 LI.replaceAllUsesWith(V);
3322 }
3323
3324 Pass.DeadInsts.push_back(&LI);
3325 deleteIfTriviallyDead(OldOp);
3326 LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3327 return !LI.isVolatile() && !IsPtrAdjusted;
3328 }
3329
3330 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3331 AAMDNodes AATags) {
3332 // Capture V for the purpose of debug-info accounting once it's converted
3333 // to a vector store.
3334 Value *OrigV = V;
3335 if (V->getType() != VecTy) {
3336 unsigned BeginIndex = getIndex(NewBeginOffset);
3337 unsigned EndIndex = getIndex(NewEndOffset);
3338 assert(EndIndex > BeginIndex && "Empty vector!");
3339 unsigned NumElements = EndIndex - BeginIndex;
3340 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3341 "Too many elements!");
3342 Type *SliceTy = (NumElements == 1)
3343 ? ElementTy
3344 : FixedVectorType::get(ElementTy, NumElements);
3345 if (V->getType() != SliceTy)
3346 V = convertValue(DL, IRB, V, SliceTy);
3347
3348 // Mix in the existing elements.
3349 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3350 NewAI.getAlign(), "load");
3351 V = insertVector(IRB, Old, V, BeginIndex, "vec");
3352 }
3353 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3354 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3355 LLVMContext::MD_access_group});
3356 if (AATags)
3357 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3358 V->getType(), DL));
3359 Pass.DeadInsts.push_back(&SI);
3360
3361 // NOTE: Careful to use OrigV rather than V.
3362 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3363 Store, Store->getPointerOperand(), OrigV, DL);
3364 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3365 return true;
3366 }
3367
3368 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3369 assert(IntTy && "We cannot extract an integer from the alloca");
3370 assert(!SI.isVolatile());
3371 if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
3372 IntTy->getBitWidth()) {
3373 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3374 NewAI.getAlign(), "oldload");
3375 Old = convertValue(DL, IRB, Old, IntTy);
3376 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3377 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3378 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
3379 }
3380 V = convertValue(DL, IRB, V, NewAllocaTy);
3381 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3382 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3383 LLVMContext::MD_access_group});
3384 if (AATags)
3385 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3386 V->getType(), DL));
3387
3388 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3389 Store, Store->getPointerOperand(),
3390 Store->getValueOperand(), DL);
3391
3392 Pass.DeadInsts.push_back(&SI);
3393 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3394 return true;
3395 }
3396
3397 bool visitStoreInst(StoreInst &SI) {
3398 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3399 Value *OldOp = SI.getOperand(1);
3400 assert(OldOp == OldPtr);
3401
3402 AAMDNodes AATags = SI.getAAMetadata();
3403 Value *V = SI.getValueOperand();
3404
3405 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3406 // alloca that should be re-examined after promoting this alloca.
3407 if (V->getType()->isPointerTy())
3408 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
3409 Pass.PostPromotionWorklist.insert(AI);
3410
3411 TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
3412 if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3413 assert(!SI.isVolatile());
3414 assert(V->getType()->isIntegerTy() &&
3415 "Only integer type loads and stores are split");
3416 assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3417 "Non-byte-multiple bit width");
3418 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
3419 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
3420 "extract");
3421 }
3422
3423 if (VecTy)
3424 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3425 if (IntTy && V->getType()->isIntegerTy())
3426 return rewriteIntegerStore(V, SI, AATags);
3427
3428 StoreInst *NewSI;
3429 if (NewBeginOffset == NewAllocaBeginOffset &&
3430 NewEndOffset == NewAllocaEndOffset &&
3431 canConvertValue(DL, V->getType(), NewAllocaTy)) {
3432 V = convertValue(DL, IRB, V, NewAllocaTy);
3433 Value *NewPtr =
3434 getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
3435
3436 NewSI =
3437 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
3438 } else {
3439 unsigned AS = SI.getPointerAddressSpace();
3440 Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
3441 NewSI =
3442 IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
3443 }
3444 NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3445 LLVMContext::MD_access_group});
3446 if (AATags)
3447 NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3448 V->getType(), DL));
3449 if (SI.isVolatile())
3450 NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
3451 if (NewSI->isAtomic())
3452 NewSI->setAlignment(SI.getAlign());
3453
3454 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3455 NewSI, NewSI->getPointerOperand(),
3456 NewSI->getValueOperand(), DL);
3457
3458 Pass.DeadInsts.push_back(&SI);
3459 deleteIfTriviallyDead(OldOp);
3460
3461 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3462 return NewSI->getPointerOperand() == &NewAI &&
3463 NewSI->getValueOperand()->getType() == NewAllocaTy &&
3464 !SI.isVolatile();
3465 }
3466
3467 /// Compute an integer value from splatting an i8 across the given
3468 /// number of bytes.
3469 ///
3470 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3471 /// call this routine.
3472 /// FIXME: Heed the advice above.
3473 ///
3474 /// \param V The i8 value to splat.
3475 /// \param Size The number of bytes in the output (assuming i8 is one byte)
3476 Value *getIntegerSplat(Value *V, unsigned Size) {
3477 assert(Size > 0 && "Expected a positive number of bytes.");
3478 IntegerType *VTy = cast<IntegerType>(V->getType());
3479 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3480 if (Size == 1)
3481 return V;
3482
3483 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
3484 V = IRB.CreateMul(
3485 IRB.CreateZExt(V, SplatIntTy, "zext"),
3486 IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
3487 IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
3488 SplatIntTy)),
3489 "isplat");
3490 return V;
3491 }
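// Worked example (assumed values): splatting the byte 0xAB across 4 bytes
// computes zext(0xAB) * (0xFFFFFFFF / 0xFF) == 0xAB * 0x01010101 ==
// 0xABABABAB, i.e. the i8 value repeated in every byte of the i32.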
3492
3493 /// Compute a vector splat for a given element value.
3494 Value *getVectorSplat(Value *V, unsigned NumElements) {
3495 V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
3496 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3497 return V;
3498 }
3499
3500 bool visitMemSetInst(MemSetInst &II) {
3501 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3502 assert(II.getRawDest() == OldPtr);
3503
3504 AAMDNodes AATags = II.getAAMetadata();
3505
3506 // If the memset has a variable size, it cannot be split, just adjust the
3507 // pointer to the new alloca.
3508 if (!isa<ConstantInt>(II.getLength())) {
3509 assert(!IsSplit);
3510 assert(NewBeginOffset == BeginOffset);
3511 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
3512 II.setDestAlignment(getSliceAlign());
3513 // In theory we should call migrateDebugInfo here. However, we do not
3514 // emit dbg.assign intrinsics for mem intrinsics storing through non-
3515 // constant geps, or storing a variable number of bytes.
3516 assert(at::getDVRAssignmentMarkers(&II).empty() &&
3517 "AT: Unexpected link to non-const GEP");
3518 deleteIfTriviallyDead(OldPtr);
3519 return false;
3520 }
3521
3522 // Record this instruction for deletion.
3523 Pass.DeadInsts.push_back(&II);
3524
3525 Type *AllocaTy = NewAI.getAllocatedType();
3526 Type *ScalarTy = AllocaTy->getScalarType();
3527
3528 const bool CanContinue = [&]() {
3529 if (VecTy || IntTy)
3530 return true;
3531 if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3532 return false;
3533 // Length must be in range for FixedVectorType.
3534 auto *C = cast<ConstantInt>(II.getLength());
3535 const uint64_t Len = C->getLimitedValue();
3536 if (Len > std::numeric_limits<unsigned>::max())
3537 return false;
3538 auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
3539 auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
3540 return canConvertValue(DL, SrcTy, AllocaTy) &&
3541 DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
3542 }();
3543
3544 // If this doesn't map cleanly onto the alloca type, and that type isn't
3545 // a single value type, just emit a memset.
3546 if (!CanContinue) {
3547 Type *SizeTy = II.getLength()->getType();
3548 unsigned Sz = NewEndOffset - NewBeginOffset;
3549 Constant *Size = ConstantInt::get(SizeTy, Sz);
3550 MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
3551 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
3552 MaybeAlign(getSliceAlign()), II.isVolatile()));
3553 if (AATags)
3554 New->setAAMetadata(
3555 AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz));
3556
3557 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3558 New, New->getRawDest(), nullptr, DL);
3559
3560 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3561 return false;
3562 }
3563
3564 // If we can represent this as a simple value, we have to build the actual
3565 // value to store, which requires expanding the byte present in memset to
3566 // a sensible representation for the alloca type. This is essentially
3567 // splatting the byte to a sufficiently wide integer, splatting it across
3568 // any desired vector width, and bitcasting to the final type.
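// For instance (illustrative IR, not from a specific test): a
//   call void @llvm.memset.p0.i64(ptr %a, i8 -86, i64 16, i1 false)
// over an alloca promoted to <4 x i32> becomes a store of
//   <4 x i32> splat (i32 -1431655766)   ; 0xAAAAAAAA in every lane
// built from the 0xAA byte by the splat helpers below.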
3569 Value *V;
3570
3571 if (VecTy) {
3572 // If this is a memset of a vectorized alloca, insert it.
3573 assert(ElementTy == ScalarTy);
3574
3575 unsigned BeginIndex = getIndex(NewBeginOffset);
3576 unsigned EndIndex = getIndex(NewEndOffset);
3577 assert(EndIndex > BeginIndex && "Empty vector!");
3578 unsigned NumElements = EndIndex - BeginIndex;
3579 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3580 "Too many elements!");
3581
3582 Value *Splat = getIntegerSplat(
3583 II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
3584 Splat = convertValue(DL, IRB, Splat, ElementTy);
3585 if (NumElements > 1)
3586 Splat = getVectorSplat(Splat, NumElements);
3587
3588 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3589 NewAI.getAlign(), "oldload");
3590 V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
3591 } else if (IntTy) {
3592 // If this is a memset on an alloca where we can widen stores, insert the
3593 // set integer.
3594 assert(!II.isVolatile());
3595
3596 uint64_t Size = NewEndOffset - NewBeginOffset;
3597 V = getIntegerSplat(II.getValue(), Size);
3598
3599 if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
3600 EndOffset != NewAllocaEndOffset)) {
3601 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3602 NewAI.getAlign(), "oldload");
3603 Old = convertValue(DL, IRB, Old, IntTy);
3604 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3605 V = insertInteger(DL, IRB, Old, V, Offset, "insert");
3606 } else {
3607 assert(V->getType() == IntTy &&
3608 "Wrong type for an alloca wide integer!");
3609 }
3610 V = convertValue(DL, IRB, V, AllocaTy);
3611 } else {
3612 // Established these invariants above.
3613 assert(NewBeginOffset == NewAllocaBeginOffset);
3614 assert(NewEndOffset == NewAllocaEndOffset);
3615
3616 V = getIntegerSplat(II.getValue(),
3617 DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
3618 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
3619 V = getVectorSplat(
3620 V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
3621
3622 V = convertValue(DL, IRB, V, AllocaTy);
3623 }
3624
3625 Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3626 StoreInst *New =
3627 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
3628 New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3629 LLVMContext::MD_access_group});
3630 if (AATags)
3631 New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3632 V->getType(), DL));
3633
3634 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3635 New, New->getPointerOperand(), V, DL);
3636
3637 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3638 return !II.isVolatile();
3639 }
3640
3641 bool visitMemTransferInst(MemTransferInst &II) {
3642 // Rewriting of memory transfer instructions can be a bit tricky. We break
3643 // them into two categories: split intrinsics and unsplit intrinsics.
3644
3645 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3646
3647 AAMDNodes AATags = II.getAAMetadata();
3648
3649 bool IsDest = &II.getRawDestUse() == OldUse;
3650 assert((IsDest && II.getRawDest() == OldPtr) ||
3651 (!IsDest && II.getRawSource() == OldPtr));
3652
3653 Align SliceAlign = getSliceAlign();
3654 // For unsplit intrinsics, we simply modify the source and destination
3655 // pointers in place. This isn't just an optimization, it is a matter of
3656 // correctness. With unsplit intrinsics we may be dealing with transfers
3657 // within a single alloca before SROA ran, or with transfers that have
3658 // a variable length. We may also be dealing with memmove instead of
3659 // memcpy, and so simply updating the pointers is necessary for us to
3660 // update both source and dest of a single call.
3661 if (!IsSplittable) {
3662 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3663 if (IsDest) {
3664 // Update the address component of linked dbg.assigns.
3665 for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) {
3666 if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3667 DbgAssign->getAddress() == II.getDest())
3668 DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3669 }
3670 II.setDest(AdjustedPtr);
3671 II.setDestAlignment(SliceAlign);
3672 } else {
3673 II.setSource(AdjustedPtr);
3674 II.setSourceAlignment(SliceAlign);
3675 }
3676
3677 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3678 deleteIfTriviallyDead(OldPtr);
3679 return false;
3680 }
3681 // For split transfer intrinsics we have an incredibly useful assurance:
3682 // the source and destination do not reside within the same alloca, and at
3683 // least one of them does not escape. This means that we can replace
3684 // memmove with memcpy, and we don't need to worry about all manner of
3685 // downsides to splitting and transforming the operations.
3686
3687 // If this doesn't map cleanly onto the alloca type, and that type isn't
3688 // a single value type, just emit a memcpy.
3689 bool EmitMemCpy =
3690 !VecTy && !IntTy &&
3691 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3692 SliceSize !=
3693 DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() ||
3694 !DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) ||
3695 !NewAI.getAllocatedType()->isSingleValueType());
3696
3697 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3698 // size hasn't been shrunk based on analysis of the viable range, this is
3699 // a no-op.
3700 if (EmitMemCpy && &OldAI == &NewAI) {
3701 // Ensure the start lines up.
3702 assert(NewBeginOffset == BeginOffset);
3703
3704 // Rewrite the size as needed.
3705 if (NewEndOffset != EndOffset)
3706 II.setLength(NewEndOffset - NewBeginOffset);
3707 return false;
3708 }
3709 // Record this instruction for deletion.
3710 Pass.DeadInsts.push_back(&II);
3711
3712 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3713 // alloca that should be re-examined after rewriting this instruction.
3714 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3715 if (AllocaInst *AI =
3716 dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
3717 assert(AI != &OldAI && AI != &NewAI &&
3718 "Splittable transfers cannot reach the same alloca on both ends.");
3719 Pass.Worklist.insert(AI);
3720 }
3721
3722 Type *OtherPtrTy = OtherPtr->getType();
3723 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3724
3725 // Compute the relative offset for the other pointer within the transfer.
3726 unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
3727 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3728 Align OtherAlign =
3729 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3730 OtherAlign =
3731 commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
3732
3733 if (EmitMemCpy) {
3734 // Compute the other pointer, folding as much as possible to produce
3735 // a single, simple GEP in most cases.
3736 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3737 OtherPtr->getName() + ".");
3738
3739 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3740 Type *SizeTy = II.getLength()->getType();
3741 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3742
3743 Value *DestPtr, *SrcPtr;
3744 MaybeAlign DestAlign, SrcAlign;
3745 // Note: IsDest is true iff we're copying into the new alloca slice
3746 if (IsDest) {
3747 DestPtr = OurPtr;
3748 DestAlign = SliceAlign;
3749 SrcPtr = OtherPtr;
3750 SrcAlign = OtherAlign;
3751 } else {
3752 DestPtr = OtherPtr;
3753 DestAlign = OtherAlign;
3754 SrcPtr = OurPtr;
3755 SrcAlign = SliceAlign;
3756 }
3757 CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3758 Size, II.isVolatile());
3759 if (AATags)
3760 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3761
3762 APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
3763 if (IsDest) {
3764 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
3765 &II, New, DestPtr, nullptr, DL);
3766 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3767 DestPtr->stripAndAccumulateConstantOffsets(
3768 DL, Offset, /*AllowNonInbounds*/ true))) {
3769 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
3770 SliceSize * 8, &II, New, DestPtr, nullptr, DL);
3771 }
3772 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3773 return false;
3774 }
3775
3776 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3777 NewEndOffset == NewAllocaEndOffset;
3778 uint64_t Size = NewEndOffset - NewBeginOffset;
3779 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3780 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3781 unsigned NumElements = EndIndex - BeginIndex;
3782 IntegerType *SubIntTy =
3783 IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3784
3785 // Reset the other pointer type to match the register type we're going to
3786 // use, but using the address space of the original other pointer.
3787 Type *OtherTy;
3788 if (VecTy && !IsWholeAlloca) {
3789 if (NumElements == 1)
3790 OtherTy = VecTy->getElementType();
3791 else
3792 OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3793 } else if (IntTy && !IsWholeAlloca) {
3794 OtherTy = SubIntTy;
3795 } else {
3796 OtherTy = NewAllocaTy;
3797 }
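// For example (assumed types): copying two elements into a slice of an alloca
// promoted to <8 x float> gives OtherTy == <2 x float>; an integer-widened
// alloca with Size == 4 gives OtherTy == i32; a whole-alloca transfer simply
// reuses NewAllocaTy.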
3798
3799 Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3800 OtherPtr->getName() + ".");
3801 MaybeAlign SrcAlign = OtherAlign;
3802 MaybeAlign DstAlign = SliceAlign;
3803 if (!IsDest)
3804 std::swap(SrcAlign, DstAlign);
3805
3806 Value *SrcPtr;
3807 Value *DstPtr;
3808
3809 if (IsDest) {
3810 DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3811 SrcPtr = AdjPtr;
3812 } else {
3813 DstPtr = AdjPtr;
3814 SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
3815 }
3816
3817 Value *Src;
3818 if (VecTy && !IsWholeAlloca && !IsDest) {
3819 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3820 NewAI.getAlign(), "load");
3821 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3822 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3823 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3824 NewAI.getAlign(), "load");
3825 Src = convertValue(DL, IRB, Src, IntTy);
3826 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3827 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3828 } else {
3829 LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3830 II.isVolatile(), "copyload");
3831 Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3832 LLVMContext::MD_access_group});
3833 if (AATags)
3834 Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3835 Load->getType(), DL));
3836 Src = Load;
3837 }
3838
3839 if (VecTy && !IsWholeAlloca && IsDest) {
3840 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3841 NewAI.getAlign(), "oldload");
3842 Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3843 } else if (IntTy && !IsWholeAlloca && IsDest) {
3844 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3845 NewAI.getAlign(), "oldload");
3846 Old = convertValue(DL, IRB, Old, IntTy);
3847 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3848 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3849 Src = convertValue(DL, IRB, Src, NewAllocaTy);
3850 }
3851
3852 StoreInst *Store = cast<StoreInst>(
3853 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3854 Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3855 LLVMContext::MD_access_group});
3856 if (AATags)
3857 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3858 Src->getType(), DL));
3859
3860 APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
3861 if (IsDest) {
3862
3863 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3864 Store, DstPtr, Src, DL);
3865 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3866 DstPtr->stripAndAccumulateConstantOffsets(
3867 DL, Offset, /*AllowNonInbounds*/ true))) {
3868 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
3869 &II, Store, DstPtr, Src, DL);
3870 }
3871
3872 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3873 return !II.isVolatile();
3874 }
3875
3876 bool visitIntrinsicInst(IntrinsicInst &II) {
3877 assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3878 "Unexpected intrinsic!");
3879 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3880
3881 // Record this instruction for deletion.
3882 Pass.DeadInsts.push_back(&II);
3883
3884 if (II.isDroppable()) {
3885 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3886 // TODO For now we forget assumed information, this can be improved.
3887 OldPtr->dropDroppableUsesIn(II);
3888 return true;
3889 }
3890
3891 assert(II.getArgOperand(0) == OldPtr);
3892 Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
3893 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3894 Value *New;
3895 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3896 New = IRB.CreateLifetimeStart(Ptr);
3897 else
3898 New = IRB.CreateLifetimeEnd(Ptr);
3899
3900 (void)New;
3901 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3902
3903 return true;
3904 }
3905
3906 void fixLoadStoreAlign(Instruction &Root) {
3907 // This algorithm implements the same visitor loop as
3908 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3909 // or store found.
3910 SmallPtrSet<Instruction *, 4> Visited;
3911 SmallVector<Instruction *, 4> Uses;
3912 Visited.insert(&Root);
3913 Uses.push_back(&Root);
3914 do {
3915 Instruction *I = Uses.pop_back_val();
3916
3917 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3918 LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3919 continue;
3920 }
3921 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3922 SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3923 continue;
3924 }
3925
3926 assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3927 isa<PHINode>(I) || isa<SelectInst>(I) ||
3928 isa<GetElementPtrInst>(I));
3929 for (User *U : I->users())
3930 if (Visited.insert(cast<Instruction>(U)).second)
3931 Uses.push_back(cast<Instruction>(U));
3932 } while (!Uses.empty());
3933 }
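// For example (assumed alignments): if this slice is only 4-byte aligned
// within the new alloca, a load reached through a rewritten PHI or select
// that previously claimed align 16 is clamped down to align 4 here.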
3934
3935 bool visitPHINode(PHINode &PN) {
3936 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3937 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3938 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3939
3940 // We would like to compute a new pointer in only one place, but have it be
3941 // as local as possible to the PHI. To do that, we re-use the location of
3942 // the old pointer, which necessarily must be in the right position to
3943 // dominate the PHI.
3944 IRBuilderBase::InsertPointGuard Guard(IRB);
3945 if (isa<PHINode>(OldPtr))
3946 IRB.SetInsertPoint(OldPtr->getParent(),
3947 OldPtr->getParent()->getFirstInsertionPt());
3948 else
3949 IRB.SetInsertPoint(OldPtr);
3950 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3951
3952 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3953 // Replace the operands which were using the old pointer.
3954 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3955
3956 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3957 deleteIfTriviallyDead(OldPtr);
3958
3959 // Fix the alignment of any loads or stores using this PHI node.
3960 fixLoadStoreAlign(PN);
3961
3962 // PHIs can't be promoted on their own, but often can be speculated. We
3963 // check the speculation outside of the rewriter so that we see the
3964 // fully-rewritten alloca.
3965 PHIUsers.insert(&PN);
3966 return true;
3967 }
3968
3969 bool visitSelectInst(SelectInst &SI) {
3970 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3971 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3972 "Pointer isn't an operand!");
3973 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3974 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3975
3976 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3977 // Replace the operands which were using the old pointer.
3978 if (SI.getOperand(1) == OldPtr)
3979 SI.setOperand(1, NewPtr);
3980 if (SI.getOperand(2) == OldPtr)
3981 SI.setOperand(2, NewPtr);
3982
3983 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3984 deleteIfTriviallyDead(OldPtr);
3985
3986 // Fix the alignment of any loads or stores using this select.
3987 fixLoadStoreAlign(SI);
3988
3989 // Selects can't be promoted on their own, but often can be speculated. We
3990 // check the speculation outside of the rewriter so that we see the
3991 // fully-rewritten alloca.
3992 SelectUsers.insert(&SI);
3993 return true;
3994 }
3995};
3996
3997/// Visitor to rewrite aggregate loads and stores as scalar.
3998///
3999/// This pass aggressively rewrites all aggregate loads and stores on
4000/// a particular pointer (or any pointer derived from it which we can identify)
4001/// with scalar loads and stores.
4002class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
4003 // Befriend the base class so it can delegate to private visit methods.
4004 friend class InstVisitor<AggLoadStoreRewriter, bool>;
4005
4006 /// Queue of pointer uses to analyze and potentially rewrite.
4007 SmallVector<Use *, 8> Queue;
4008
4009 /// Set to prevent us from cycling with phi nodes and loops.
4010 SmallPtrSet<User *, 8> Visited;
4011
4012 /// The current pointer use being rewritten. This is used to dig up the used
4013 /// value (as opposed to the user).
4014 Use *U = nullptr;
4015
4016 /// Used to calculate offsets, and hence alignment, of subobjects.
4017 const DataLayout &DL;
4018
4019 IRBuilderTy &IRB;
4020
4021public:
4022 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
4023 : DL(DL), IRB(IRB) {}
4024
4025 /// Rewrite loads and stores through a pointer and all pointers derived from
4026 /// it.
4027 bool rewrite(Instruction &I) {
4028 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
4029 enqueueUsers(I);
4030 bool Changed = false;
4031 while (!Queue.empty()) {
4032 U = Queue.pop_back_val();
4033 Changed |= visit(cast<Instruction>(U->getUser()));
4034 }
4035 return Changed;
4036 }
4037
4038private:
4039 /// Enqueue all the users of the given instruction for further processing.
4040 /// This uses a set to de-duplicate users.
4041 void enqueueUsers(Instruction &I) {
4042 for (Use &U : I.uses())
4043 if (Visited.insert(U.getUser()).second)
4044 Queue.push_back(&U);
4045 }
4046
4047 // Conservative default is to not rewrite anything.
4048 bool visitInstruction(Instruction &I) { return false; }
4049
4050 /// Generic recursive split emission class.
4051 template <typename Derived> class OpSplitter {
4052 protected:
4053 /// The builder used to form new instructions.
4054 IRBuilderTy &IRB;
4055
4056 /// The indices which to be used with insert- or extractvalue to select the
4057 /// appropriate value within the aggregate.
4058 SmallVector<unsigned, 4> Indices;
4059
4060 /// The indices to a GEP instruction which will move Ptr to the correct slot
4061 /// within the aggregate.
4062 SmallVector<Value *, 4> GEPIndices;
4063
4064 /// The base pointer of the original op, used as a base for GEPing the
4065 /// split operations.
4066 Value *Ptr;
4067
4068 /// The base pointee type being GEPed into.
4069 Type *BaseTy;
4070
4071 /// Known alignment of the base pointer.
4072 Align BaseAlign;
4073
4074 /// To calculate offset of each component so we can correctly deduce
4075 /// alignments.
4076 const DataLayout &DL;
4077
4078 /// Initialize the splitter with an insertion point, Ptr and start with a
4079 /// single zero GEP index.
4080 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4081 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
4082 : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
4083 BaseAlign(BaseAlign), DL(DL) {
4084 IRB.SetInsertPoint(InsertionPoint);
4085 }
4086
4087 public:
4088 /// Generic recursive split emission routine.
4089 ///
4090 /// This method recursively splits an aggregate op (load or store) into
4091 /// scalar or vector ops. It splits recursively until it hits a single value
4092 /// and emits that single value operation via the template argument.
4093 ///
4094 /// The logic of this routine relies on GEPs and insertvalue and
4095 /// extractvalue all operating with the same fundamental index list, merely
4096 /// formatted differently (GEPs need actual values).
4097 ///
4098 /// \param Ty The type being split recursively into smaller ops.
4099 /// \param Agg The aggregate value being built up or stored, depending on
4100 /// whether this is splitting a load or a store respectively.
4101 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
4102 if (Ty->isSingleValueType()) {
4103 unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
4104 return static_cast<Derived *>(this)->emitFunc(
4105 Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
4106 }
4107
4108 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
4109 unsigned OldSize = Indices.size();
4110 (void)OldSize;
4111 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
4112 ++Idx) {
4113 assert(Indices.size() == OldSize && "Did not return to the old size");
4114 Indices.push_back(Idx);
4115 GEPIndices.push_back(IRB.getInt32(Idx));
4116 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
4117 GEPIndices.pop_back();
4118 Indices.pop_back();
4119 }
4120 return;
4121 }
4122
4123 if (StructType *STy = dyn_cast<StructType>(Ty)) {
4124 unsigned OldSize = Indices.size();
4125 (void)OldSize;
4126 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
4127 ++Idx) {
4128 assert(Indices.size() == OldSize && "Did not return to the old size");
4129 Indices.push_back(Idx);
4130 GEPIndices.push_back(IRB.getInt32(Idx));
4131 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
4132 GEPIndices.pop_back();
4133 Indices.pop_back();
4134 }
4135 return;
4136 }
4137
4138 llvm_unreachable("Only arrays and structs are aggregate loadable types");
4139 }
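// For example (illustrative type): splitting a load or store of
//   { i32, [2 x float] }
// visits the leaves with Indices/GEPIndices paths [0], [1, 0] and [1, 1], so
// each leaf's GEP and its insertvalue/extractvalue use the same index list.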
4140 };
4141
4142 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
4143 AAMDNodes AATags;
4144 // A vector to hold the split components that we want to emit
4145 // separate fake uses for.
4146 SmallVector<Value *, 4> Components;
4147 // A vector to hold all the fake uses of the struct that we are splitting.
4148 // Usually there should only be one, but we are handling the general case.
4149 SmallVector<Instruction *, 4> FakeUses;
4150
4151 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4152 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
4153 IRBuilderTy &IRB)
4154 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
4155 IRB),
4156 AATags(AATags) {}
4157
4158 /// Emit a leaf load of a single value. This is called at the leaves of the
4159 /// recursive emission to actually load values.
4160 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4161 assert(Ty->isSingleValueType());
4162 // Load the single value and insert it using the indices.
4163 Value *GEP =
4164 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4165 LoadInst *Load =
4166 IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
4167
4168 APInt Offset(
4169 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4170 if (AATags &&
4171 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
4172 Load->setAAMetadata(
4173 AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL));
4174 // Record the load so we can generate a fake use for this aggregate
4175 // component.
4176 Components.push_back(Load);
4177
4178 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
4179 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
4180 }
4181
4182 // Stash the fake uses that use the value generated by this instruction.
4183 void recordFakeUses(LoadInst &LI) {
4184 for (Use &U : LI.uses())
4185 if (auto *II = dyn_cast<IntrinsicInst>(U.getUser()))
4186 if (II->getIntrinsicID() == Intrinsic::fake_use)
4187 FakeUses.push_back(II);
4188 }
4189
4190 // Replace all fake uses of the aggregate with a series of fake uses, one
4191 // for each split component.
4192 void emitFakeUses() {
4193 for (Instruction *I : FakeUses) {
4194 IRB.SetInsertPoint(I);
4195 for (auto *V : Components)
4196 IRB.CreateIntrinsic(Intrinsic::fake_use, {V});
4197 I->eraseFromParent();
4198 }
4199 }
4200 };
4201
4202 bool visitLoadInst(LoadInst &LI) {
4203 assert(LI.getPointerOperand() == *U);
4204 if (!LI.isSimple() || LI.getType()->isSingleValueType())
4205 return false;
4206
4207 // We have an aggregate being loaded, split it apart.
4208 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4209 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4210 getAdjustedAlignment(&LI, 0), DL, IRB);
4211 Splitter.recordFakeUses(LI);
4212 Value *V = PoisonValue::get(LI.getType());
4213 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
4214 Splitter.emitFakeUses();
4215 Visited.erase(&LI);
4216 LI.replaceAllUsesWith(V);
4217 LI.eraseFromParent();
4218 return true;
4219 }
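// Illustrative before/after for the FCA load rewrite above (IR sketched,
// names hypothetical):
//   %agg = load { i32, i64 }, ptr %p
// becomes
//   %agg.fca.0.gep    = getelementptr inbounds { i32, i64 }, ptr %p, i32 0, i32 0
//   %agg.fca.0.load   = load i32, ptr %agg.fca.0.gep
//   %agg.fca.0.insert = insertvalue { i32, i64 } poison, i32 %agg.fca.0.load, 0
//   %agg.fca.1.gep    = getelementptr inbounds { i32, i64 }, ptr %p, i32 0, i32 1
//   %agg.fca.1.load   = load i64, ptr %agg.fca.1.gep
//   %agg.fca.1.insert = insertvalue { i32, i64 } %agg.fca.0.insert, i64 %agg.fca.1.load, 1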
4220
4221 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4222 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4223 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4224 const DataLayout &DL, IRBuilderTy &IRB)
4225 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4226 DL, IRB),
4227 AATags(AATags), AggStore(AggStore) {}
4228 AAMDNodes AATags;
4229 StoreInst *AggStore;
4230 /// Emit a leaf store of a single value. This is called at the leaves of the
4231 /// recursive emission to actually produce stores.
4232 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4233 assert(Ty->isSingleValueType());
4234 // Extract the single value and store it using the indices.
4235 //
4236 // The gep and extractvalue values are factored out of the CreateStore
4237 // call to make the output independent of the argument evaluation order.
4238 Value *ExtractValue =
4239 IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
4240 Value *InBoundsGEP =
4241 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4242 StoreInst *Store =
4243 IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
4244
4245 APInt Offset(
4246 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4247 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
4248 if (AATags) {
4249 Store->setAAMetadata(AATags.adjustForAccess(
4250 Offset.getZExtValue(), ExtractValue->getType(), DL));
4251 }
4252
4253 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4254 // If we cannot (because there's an intervening non-const or unbounded
4255 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4256 // this instruction.
4257 Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4258 if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
4259 uint64_t SizeInBits =
4260 DL.getTypeSizeInBits(Store->getValueOperand()->getType());
4261 migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
4262 SizeInBits, AggStore, Store,
4263 Store->getPointerOperand(), Store->getValueOperand(),
4264 DL);
4265 } else {
4266 assert(at::getDVRAssignmentMarkers(Store).empty() &&
4267 "AT: unexpected debug.assign linked to store through "
4268 "unbounded GEP");
4269 }
4270 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4271 }
4272 };
4273
4274 bool visitStoreInst(StoreInst &SI) {
4275 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4276 return false;
4277 Value *V = SI.getValueOperand();
4278 if (V->getType()->isSingleValueType())
4279 return false;
4280
4281 // We have an aggregate being stored, split it apart.
4282 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4283 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4284 getAdjustedAlignment(&SI, 0), DL, IRB);
4285 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
4286 Visited.erase(&SI);
4287 // The stores replacing SI each have markers describing fragments of the
4288 // assignment so delete the assignment markers linked to SI.
4289 at::deleteAssignmentMarkers(&SI);
4290 SI.eraseFromParent();
4291 return true;
4292 }
4293
4294 bool visitBitCastInst(BitCastInst &BC) {
4295 enqueueUsers(BC);
4296 return false;
4297 }
4298
4299 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4300 enqueueUsers(ASC);
4301 return false;
4302 }
4303
4304 // Unfold gep (select cond, ptr1, ptr2), idx
4305 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4306 // and gep ptr, (select cond, idx1, idx2)
4307 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
4308 // We also allow for i1 zext indices, which are equivalent to selects.
4309 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4310 // Check whether the GEP has exactly one select operand and all indices
4311 // will become constant after the transform.
4312 Instruction *Sel = nullptr;
4313 for (Value *Op : GEPI.indices()) {
4314 if (auto *SI = dyn_cast<SelectInst>(Op)) {
4315 if (Sel)
4316 return false;
4317
4318 Sel = SI;
4319 if (!isa<ConstantInt>(SI->getTrueValue()) ||
4320 !isa<ConstantInt>(SI->getFalseValue()))
4321 return false;
4322 continue;
4323 }
4324 if (auto *ZI = dyn_cast<ZExtInst>(Op)) {
4325 if (Sel)
4326 return false;
4327 Sel = ZI;
4328 if (!ZI->getSrcTy()->isIntegerTy(1))
4329 return false;
4330 continue;
4331 }
4332
4333 if (!isa<ConstantInt>(Op))
4334 return false;
4335 }
4336
4337 if (!Sel)
4338 return false;
4339
4340 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4341 dbgs() << " original: " << *Sel << "\n";
4342 dbgs() << " " << GEPI << "\n";);
4343
4344 auto GetNewOps = [&](Value *SelOp) {
4345 SmallVector<Value *> NewOps;
4346 for (Value *Op : GEPI.operands())
4347 if (Op == Sel)
4348 NewOps.push_back(SelOp);
4349 else
4350 NewOps.push_back(Op);
4351 return NewOps;
4352 };
4353
4354 Value *Cond, *True, *False;
4355 if (auto *SI = dyn_cast<SelectInst>(Sel)) {
4356 Cond = SI->getCondition();
4357 True = SI->getTrueValue();
4358 False = SI->getFalseValue();
4359 } else {
4360 Cond = Sel->getOperand(0);
4361 True = ConstantInt::get(Sel->getType(), 1);
4362 False = ConstantInt::get(Sel->getType(), 0);
4363 }
4364 SmallVector<Value *> TrueOps = GetNewOps(True);
4365 SmallVector<Value *> FalseOps = GetNewOps(False);
4366
4367 IRB.SetInsertPoint(&GEPI);
4368 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4369
4370 Type *Ty = GEPI.getSourceElementType();
4371 Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
4372 True->getName() + ".sroa.gep", NW);
4373
4374 Value *NFalse =
4375 IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
4376 False->getName() + ".sroa.gep", NW);
4377
4378 Value *NSel =
4379 IRB.CreateSelect(Cond, NTrue, NFalse, Sel->getName() + ".sroa.sel");
4380 Visited.erase(&GEPI);
4381 GEPI.replaceAllUsesWith(NSel);
4382 GEPI.eraseFromParent();
4383 Instruction *NSelI = cast<Instruction>(NSel);
4384 Visited.insert(NSelI);
4385 enqueueUsers(*NSelI);
4386
4387 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4388 dbgs() << " " << *NFalse << "\n";
4389 dbgs() << " " << *NSel << "\n";);
4390
4391 return true;
4392 }
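// Illustrative rewrite performed above (IR sketched, names hypothetical):
//   %idx = select i1 %c, i64 1, i64 2
//   %gep = getelementptr [4 x i32], ptr %a, i64 0, i64 %idx
// becomes
//   %t            = getelementptr [4 x i32], ptr %a, i64 0, i64 1
//   %f            = getelementptr [4 x i32], ptr %a, i64 0, i64 2
//   %idx.sroa.sel = select i1 %c, ptr %t, ptr %f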
4393
4394 // Unfold gep (phi ptr1, ptr2), idx
4395 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4396 // and gep ptr, (phi idx1, idx2)
4397 // => phi ((gep ptr, idx1), (gep ptr, idx2))
4398 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4399 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4400 // operand (looking through the phi if it is the phi we want to unfold) is
4401 // an instruction besides a static alloca.
4402 PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
4403 auto IsInvalidPointerOperand = [](Value *V) {
4404 if (!isa<Instruction>(V))
4405 return false;
4406 if (auto *AI = dyn_cast<AllocaInst>(V))
4407 return !AI->isStaticAlloca();
4408 return true;
4409 };
4410 if (Phi) {
4411 if (any_of(Phi->operands(), IsInvalidPointerOperand))
4412 return false;
4413 } else {
4414 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4415 return false;
4416 }
4417 // Check whether the GEP has exactly one phi operand (including the pointer
4418 // operand) and all indices will become constant after the transform.
4419 for (Value *Op : GEPI.indices()) {
4420 if (auto *SI = dyn_cast<PHINode>(Op)) {
4421 if (Phi)
4422 return false;
4423
4424 Phi = SI;
4425 if (!all_of(Phi->incoming_values(),
4426 [](Value *V) { return isa<ConstantInt>(V); }))
4427 return false;
4428 continue;
4429 }
4430
4431 if (!isa<ConstantInt>(Op))
4432 return false;
4433 }
4434
4435 if (!Phi)
4436 return false;
4437
4438 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4439 dbgs() << " original: " << *Phi << "\n";
4440 dbgs() << " " << GEPI << "\n";);
4441
4442 auto GetNewOps = [&](Value *PhiOp) {
4443 SmallVector<Value *> NewOps;
4444 for (Value *Op : GEPI.operands())
4445 if (Op == Phi)
4446 NewOps.push_back(PhiOp);
4447 else
4448 NewOps.push_back(Op);
4449 return NewOps;
4450 };
4451
4452 IRB.SetInsertPoint(Phi);
4453 PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
4454 Phi->getName() + ".sroa.phi");
4455
4456 Type *SourceTy = GEPI.getSourceElementType();
4457 // We only handle arguments, constants, and static allocas here, so we can
4458 // insert GEPs at the end of the entry block.
4459 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4460 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4461 Value *Op = Phi->getIncomingValue(I);
4462 BasicBlock *BB = Phi->getIncomingBlock(I);
4463 Value *NewGEP;
4464 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4465 NewGEP = NewPhi->getIncomingValue(NI);
4466 } else {
4467 SmallVector<Value *> NewOps = GetNewOps(Op);
4468 NewGEP =
4469 IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
4470 Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags());
4471 }
4472 NewPhi->addIncoming(NewGEP, BB);
4473 }
4474
4475 Visited.erase(&GEPI);
4476 GEPI.replaceAllUsesWith(NewPhi);
4477 GEPI.eraseFromParent();
4478 Visited.insert(NewPhi);
4479 enqueueUsers(*NewPhi);
4480
4481 LLVM_DEBUG(dbgs() << " to: ";
4482 for (Value *In
4483 : NewPhi->incoming_values()) dbgs()
4484 << "\n " << *In;
4485 dbgs() << "\n " << *NewPhi << '\n');
4486
4487 return true;
4488 }
4489
4490 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4491 if (unfoldGEPSelect(GEPI))
4492 return true;
4493
4494 if (unfoldGEPPhi(GEPI))
4495 return true;
4496
4497 enqueueUsers(GEPI);
4498 return false;
4499 }
4500
4501 bool visitPHINode(PHINode &PN) {
4502 enqueueUsers(PN);
4503 return false;
4504 }
4505
4506 bool visitSelectInst(SelectInst &SI) {
4507 enqueueUsers(SI);
4508 return false;
4509 }
4510};
4511
4512} // end anonymous namespace
4513
4514/// Strip aggregate type wrapping.
4515///
4516/// This removes no-op aggregate types wrapping an underlying type. It will
4517/// strip as many layers of types as it can without changing either the type
4518/// size or the allocated size.
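/// For example (illustrative): a nested struct type {{ float }} is stripped
/// down to float, because neither the type size nor the allocated size
/// changes, whereas [4 x i8] is not stripped to i8, since that would shrink
/// the allocated size.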
4519 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4520 if (Ty->isSingleValueType())
4521 return Ty;
4522
4523 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4524 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4525
4526 Type *InnerTy;
4527 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
4528 InnerTy = ArrTy->getElementType();
4529 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
4530 const StructLayout *SL = DL.getStructLayout(STy);
4531 unsigned Index = SL->getElementContainingOffset(0);
4532 InnerTy = STy->getElementType(Index);
4533 } else {
4534 return Ty;
4535 }
4536
4537 if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
4538 TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
4539 return Ty;
4540
4541 return stripAggregateTypeWrapping(DL, InnerTy);
4542}
4543
4544/// Try to find a partition of the aggregate type passed in for a given
4545/// offset and size.
4546///
4547/// This recurses through the aggregate type and tries to compute a subtype
4548/// based on the offset and size. When the offset and size span a sub-section
4549/// of an array, it will even compute a new array type for that sub-section,
4550/// and the same for structs.
4551///
4552/// Note that this routine is very strict and tries to find a partition of the
4553/// type which produces the *exact* right offset and size. It is not forgiving
4554 /// when the size or offset causes either end of the type-based partition to be off.
4555/// Also, this is a best-effort routine. It is reasonable to give up and not
4556/// return a type if necessary.
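/// For example (illustrative, assuming a typical layout where i32 and float
/// are 4 bytes each): partitioning { i32, { float, float } } at offset 4 with
/// size 8 yields { float, float }, while offset 2 with size 4 straddles the
/// first element's boundary and yields nullptr.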
4557 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4558 uint64_t Size) {
4559 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4560 return stripAggregateTypeWrapping(DL, Ty);
4561 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4562 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4563 return nullptr;
4564
4565 if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
4566 Type *ElementTy;
4567 uint64_t TyNumElements;
4568 if (auto *AT = dyn_cast<ArrayType>(Ty)) {
4569 ElementTy = AT->getElementType();
4570 TyNumElements = AT->getNumElements();
4571 } else {
4572 // FIXME: This isn't right for vectors with non-byte-sized or
4573 // non-power-of-two sized elements.
4574 auto *VT = cast<FixedVectorType>(Ty);
4575 ElementTy = VT->getElementType();
4576 TyNumElements = VT->getNumElements();
4577 }
4578 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4579 uint64_t NumSkippedElements = Offset / ElementSize;
4580 if (NumSkippedElements >= TyNumElements)
4581 return nullptr;
4582 Offset -= NumSkippedElements * ElementSize;
4583
4584 // First check if we need to recurse.
4585 if (Offset > 0 || Size < ElementSize) {
4586 // Bail if the partition ends in a different array element.
4587 if ((Offset + Size) > ElementSize)
4588 return nullptr;
4589 // Recurse through the element type trying to peel off offset bytes.
4590 return getTypePartition(DL, ElementTy, Offset, Size);
4591 }
4592 assert(Offset == 0);
4593
4594 if (Size == ElementSize)
4595 return stripAggregateTypeWrapping(DL, ElementTy);
4596 assert(Size > ElementSize);
4597 uint64_t NumElements = Size / ElementSize;
4598 if (NumElements * ElementSize != Size)
4599 return nullptr;
4600 return ArrayType::get(ElementTy, NumElements);
4601 }
4602
4603 StructType *STy = dyn_cast<StructType>(Ty);
4604 if (!STy)
4605 return nullptr;
4606
4607 const StructLayout *SL = DL.getStructLayout(STy);
4608
4609 if (SL->getSizeInBits().isScalable())
4610 return nullptr;
4611
4612 if (Offset >= SL->getSizeInBytes())
4613 return nullptr;
4614 uint64_t EndOffset = Offset + Size;
4615 if (EndOffset > SL->getSizeInBytes())
4616 return nullptr;
4617
4618 unsigned Index = SL->getElementContainingOffset(Offset);
4619 Offset -= SL->getElementOffset(Index);
4620
4621 Type *ElementTy = STy->getElementType(Index);
4622 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4623 if (Offset >= ElementSize)
4624 return nullptr; // The offset points into alignment padding.
4625
4626 // See if any partition must be contained by the element.
4627 if (Offset > 0 || Size < ElementSize) {
4628 if ((Offset + Size) > ElementSize)
4629 return nullptr;
4630 return getTypePartition(DL, ElementTy, Offset, Size);
4631 }
4632 assert(Offset == 0);
4633
4634 if (Size == ElementSize)
4635 return stripAggregateTypeWrapping(DL, ElementTy);
4636
4637 StructType::element_iterator EI = STy->element_begin() + Index,
4638 EE = STy->element_end();
4639 if (EndOffset < SL->getSizeInBytes()) {
4640 unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
4641 if (Index == EndIndex)
4642 return nullptr; // Within a single element and its padding.
4643
4644 // Don't try to form "natural" types if the elements don't line up with the
4645 // expected size.
4646 // FIXME: We could potentially recurse down through the last element in the
4647 // sub-struct to find a natural end point.
4648 if (SL->getElementOffset(EndIndex) != EndOffset)
4649 return nullptr;
4650
4651 assert(Index < EndIndex);
4652 EE = STy->element_begin() + EndIndex;
4653 }
4654
4655 // Try to build up a sub-structure.
4656 StructType *SubTy =
4657 StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
4658 const StructLayout *SubSL = DL.getStructLayout(SubTy);
4659 if (Size != SubSL->getSizeInBytes())
4660 return nullptr; // The sub-struct doesn't have quite the size needed.
4661
4662 return SubTy;
4663}
4664
4665/// Pre-split loads and stores to simplify rewriting.
4666///
4667/// We want to break up the splittable load+store pairs as much as
4668/// possible. This is important to do as a preprocessing step, as once we
4669/// start rewriting the accesses to partitions of the alloca we lose the
4670/// necessary information to correctly split apart paired loads and stores
4671/// which both point into this alloca. The case to consider is something like
4672/// the following:
4673///
4674/// %a = alloca [12 x i8]
4675/// %gep1 = getelementptr i8, ptr %a, i32 0
4676/// %gep2 = getelementptr i8, ptr %a, i32 4
4677/// %gep3 = getelementptr i8, ptr %a, i32 8
4678/// store float 0.0, ptr %gep1
4679/// store float 1.0, ptr %gep2
4680/// %v = load i64, ptr %gep1
4681/// store i64 %v, ptr %gep2
4682/// %f1 = load float, ptr %gep2
4683/// %f2 = load float, ptr %gep3
4684///
4685/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4686/// promote everything so we recover the 2 SSA values that should have been
4687/// there all along.
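/// In that example, pre-splitting would (roughly) rewrite the i64 load into
/// two i32 loads covering [0, 4) and [4, 8), and the i64 store into two i32
/// stores of those pieces covering [4, 8) and [8, 12), after which each
/// 4-byte partition can be promoted independently.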
4688///
4689/// \returns true if any changes are made.
4690bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4691 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4692
4693 // Track the loads and stores which are candidates for pre-splitting here, in
4694 // the order they first appear during the partition scan. These give stable
4695 // iteration order and a basis for tracking which loads and stores we
4696 // actually split.
4697 SmallVector<LoadInst *, 4> Loads;
4698 SmallVector<StoreInst *, 4> Stores;
4699
4700 // We need to accumulate the splits required of each load or store where we
4701 // can find them via a direct lookup. This is important to cross-check loads
4702 // and stores against each other. We also track the slice so that we can kill
4703 // all the slices that end up split.
4704 struct SplitOffsets {
4705 Slice *S;
4706 std::vector<uint64_t> Splits;
4707 };
4708 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4709
4710 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4711 // This is important as we also cannot pre-split stores of those loads!
4712 // FIXME: This is all pretty gross. It means that we can be more aggressive
4713 // in pre-splitting when the load feeding the store happens to come from
4714 // a separate alloca. Put another way, the effectiveness of SROA would be
4715 // decreased by a frontend which just concatenated all of its local allocas
4716 // into one big flat alloca. But defeating such patterns is exactly the job
4717 // SROA is tasked with! Sadly, to not have this discrepancy we would have to
4718 // change store pre-splitting to actually force pre-splitting of the load
4719 // that feeds it *and all stores*. That makes pre-splitting much harder, but
4720 // maybe it would make it more principled?
4721 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4722
4723 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4724 for (auto &P : AS.partitions()) {
4725 for (Slice &S : P) {
4726 Instruction *I = cast<Instruction>(S.getUse()->getUser());
4727 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4728 // If this is a load we have to track that it can't participate in any
4729 // pre-splitting. If this is a store of a load we have to track that
4730 // that load also can't participate in any pre-splitting.
4731 if (auto *LI = dyn_cast<LoadInst>(I))
4732 UnsplittableLoads.insert(LI);
4733 else if (auto *SI = dyn_cast<StoreInst>(I))
4734 if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
4735 UnsplittableLoads.insert(LI);
4736 continue;
4737 }
4738 assert(P.endOffset() > S.beginOffset() &&
4739 "Empty or backwards partition!");
4740
4741 // Determine if this is a pre-splittable slice.
4742 if (auto *LI = dyn_cast<LoadInst>(I)) {
4743 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4744
4745 // The load must be used exclusively to store into other pointers for
4746 // us to be able to arbitrarily pre-split it. The stores must also be
4747 // simple to avoid changing semantics.
4748 auto IsLoadSimplyStored = [](LoadInst *LI) {
4749 for (User *LU : LI->users()) {
4750 auto *SI = dyn_cast<StoreInst>(LU);
4751 if (!SI || !SI->isSimple())
4752 return false;
4753 }
4754 return true;
4755 };
4756 if (!IsLoadSimplyStored(LI)) {
4757 UnsplittableLoads.insert(LI);
4758 continue;
4759 }
4760
4761 Loads.push_back(LI);
4762 } else if (auto *SI = dyn_cast<StoreInst>(I)) {
4763 if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
4764 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4765 continue;
4766 auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
4767 if (!StoredLoad || !StoredLoad->isSimple())
4768 continue;
4769 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4770
4771 Stores.push_back(SI);
4772 } else {
4773 // Other uses cannot be pre-split.
4774 continue;
4775 }
4776
4777 // Record the initial split.
4778 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4779 auto &Offsets = SplitOffsetsMap[I];
4780 assert(Offsets.Splits.empty() &&
4781 "Should not have splits the first time we see an instruction!");
4782 Offsets.S = &S;
4783 Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
4784 }
4785
4786 // Now scan the already split slices, and add a split for any of them which
4787 // we're going to pre-split.
4788 for (Slice *S : P.splitSliceTails()) {
4789 auto SplitOffsetsMapI =
4790 SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
4791 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4792 continue;
4793 auto &Offsets = SplitOffsetsMapI->second;
4794
4795 assert(Offsets.S == S && "Found a mismatched slice!");
4796 assert(!Offsets.Splits.empty() &&
4797 "Cannot have an empty set of splits on the second partition!");
4798 assert(Offsets.Splits.back() ==
4799 P.beginOffset() - Offsets.S->beginOffset() &&
4800 "Previous split does not end where this one begins!");
4801
4802 // Record each split. The last partition's end isn't needed as the size
4803 // of the slice dictates that.
4804 if (S->endOffset() > P.endOffset())
4805 Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
4806 }
4807 }
4808
4809 // We may have split loads where some of their stores are split stores. For
4810 // such loads and stores, we can only pre-split them if their splits exactly
4811 // match relative to their starting offset. We have to verify this prior to
4812 // any rewriting.
4813 llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4814 // Lookup the load we are storing in our map of split
4815 // offsets.
4816 auto *LI = cast<LoadInst>(SI->getValueOperand());
4817 // If it was completely unsplittable, then we're done,
4818 // and this store can't be pre-split.
4819 if (UnsplittableLoads.count(LI))
4820 return true;
4821
4822 auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4823 if (LoadOffsetsI == SplitOffsetsMap.end())
4824 return false; // Unrelated loads are definitely safe.
4825 auto &LoadOffsets = LoadOffsetsI->second;
4826
4827 // Now lookup the store's offsets.
4828 auto &StoreOffsets = SplitOffsetsMap[SI];
4829
4830 // If the relative offsets of each split in the load and
4831 // store match exactly, then we can split them and we
4832 // don't need to remove them here.
4833 if (LoadOffsets.Splits == StoreOffsets.Splits)
4834 return false;
4835
4836 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4837 << " " << *LI << "\n"
4838 << " " << *SI << "\n");
4839
4840 // We've found a store and load that we need to split
4841 // with mismatched relative splits. Just give up on them
4842 // and remove both instructions from our list of
4843 // candidates.
4844 UnsplittableLoads.insert(LI);
4845 return true;
4846 });
4847 // Now we have to go *back* through all the stores, because a later store may
4848 // have caused an earlier store's load to become unsplittable and if it is
4849 // unsplittable for the later store, then we can't rely on it being split in
4850 // the earlier store either.
4851 llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4852 auto *LI = cast<LoadInst>(SI->getValueOperand());
4853 return UnsplittableLoads.count(LI);
4854 });
4855 // Once we've established all the loads that can't be split for some reason,
4856 // filter any that made it into our list out.
4857 llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
4858 return UnsplittableLoads.count(LI);
4859 });
4860
4861 // If no loads or stores are left, there is no pre-splitting to be done for
4862 // this alloca.
4863 if (Loads.empty() && Stores.empty())
4864 return false;
4865
4866 // From here on, we can't fail and will be building new accesses, so rig up
4867 // an IR builder.
4868 IRBuilderTy IRB(&AI);
4869
4870 // Collect the new slices which we will merge into the alloca slices.
4871 SmallVector<Slice, 4> NewSlices;
4872
4873 // Track any allocas we end up splitting loads and stores for so we iterate
4874 // on them.
4875 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4876
4877 // At this point, we have collected all of the loads and stores we can
4878 // pre-split, and the specific splits needed for them. We actually do the
4879 // splitting in a specific order to handle the case where one of the loads is
4880 // the value operand to one of the stores.
4881 //
4882 // First, we rewrite all of the split loads, and just accumulate each split
4883 // load in a parallel structure. We also build the slices for them and append
4884 // them to the alloca slices.
4885 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4886 std::vector<LoadInst *> SplitLoads;
4887 const DataLayout &DL = AI.getDataLayout();
4888 for (LoadInst *LI : Loads) {
4889 SplitLoads.clear();
4890
4891 auto &Offsets = SplitOffsetsMap[LI];
4892 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4893 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4894 "Load must have type size equal to store size");
4895 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4896 "Load must be >= slice size");
4897
4898 uint64_t BaseOffset = Offsets.S->beginOffset();
4899 assert(BaseOffset + SliceSize > BaseOffset &&
4900 "Cannot represent alloca access size using 64-bit integers!");
4901
4901 Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
4902
4903 IRB.SetInsertPoint(LI);
4904
4905 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4906
4907 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4908 int Idx = 0, Size = Offsets.Splits.size();
4909 for (;;) {
4910 auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
4911 auto AS = LI->getPointerAddressSpace();
4912 auto *PartPtrTy = LI->getPointerOperandType();
4913 LoadInst *PLoad = IRB.CreateAlignedLoad(
4914 PartTy,
4915 getAdjustedPtr(IRB, DL, BasePtr,
4916 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4917 PartPtrTy, BasePtr->getName() + "."),
4918 getAdjustedAlignment(LI, PartOffset),
4919 /*IsVolatile*/ false, LI->getName());
4920 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4921 LLVMContext::MD_access_group});
4922
4923 // Append this load onto the list of split loads so we can find it later
4924 // to rewrite the stores.
4925 SplitLoads.push_back(PLoad);
4926
4927 // Now build a new slice for the alloca.
4928 NewSlices.push_back(
4929 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4930 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4931 /*IsSplittable*/ false));
4932 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4933 << ", " << NewSlices.back().endOffset()
4934 << "): " << *PLoad << "\n");
4935
4936 // See if we've handled all the splits.
4937 if (Idx >= Size)
4938 break;
4939
4940 // Setup the next partition.
4941 PartOffset = Offsets.Splits[Idx];
4942 ++Idx;
4943 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4944 }
4945
4946 // Now that we have the split loads, do the slow walk over all uses of the
4947 // load and rewrite them as split stores, or save the split loads to use
4948 // below if the store is going to be split there anyways.
4949 bool DeferredStores = false;
4950 for (User *LU : LI->users()) {
4951 StoreInst *SI = cast<StoreInst>(LU);
4952 if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4953 DeferredStores = true;
4954 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4955 << "\n");
4956 continue;
4957 }
4958
4959 Value *StoreBasePtr = SI->getPointerOperand();
4960 IRB.SetInsertPoint(SI);
4961 AAMDNodes AATags = SI->getAAMetadata();
4962
4963 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4964
4965 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4966 LoadInst *PLoad = SplitLoads[Idx];
4967 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4968 auto *PartPtrTy = SI->getPointerOperandType();
4969
4970 auto AS = SI->getPointerAddressSpace();
4971 StoreInst *PStore = IRB.CreateAlignedStore(
4972 PLoad,
4973 getAdjustedPtr(IRB, DL, StoreBasePtr,
4974 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4975 PartPtrTy, StoreBasePtr->getName() + "."),
4976 getAdjustedAlignment(SI, PartOffset),
4977 /*IsVolatile*/ false);
4978 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4979 LLVMContext::MD_access_group,
4980 LLVMContext::MD_DIAssignID});
4981
4982 if (AATags)
4983 PStore->setAAMetadata(
4984 AATags.adjustForAccess(PartOffset, PLoad->getType(), DL));
4985 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4986 }
4987
4988 // We want to immediately iterate on any allocas impacted by splitting
4989 // this store, and we have to track any promotable alloca (indicated by
4990 // a direct store) as needing to be resplit because it is no longer
4991 // promotable.
4992 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
4993 ResplitPromotableAllocas.insert(OtherAI);
4994 Worklist.insert(OtherAI);
4995 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4996 StoreBasePtr->stripInBoundsOffsets())) {
4997 Worklist.insert(OtherAI);
4998 }
4999
5000 // Mark the original store as dead.
5001 DeadInsts.push_back(SI);
5002 }
5003
5004 // Save the split loads if there are deferred stores among the users.
5005 if (DeferredStores)
5006 SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
5007
5008 // Mark the original load as dead and kill the original slice.
5009 DeadInsts.push_back(LI);
5010 Offsets.S->kill();
5011 }
5012
5013 // Second, we rewrite all of the split stores. At this point, we know that
5014 // all loads from this alloca have been split already. For stores of such
5015 // loads, we can simply look up the pre-existing split loads. For stores of
5016 // other loads, we split those loads first and then write split stores of
5017 // them.
5018 for (StoreInst *SI : Stores) {
5019 auto *LI = cast<LoadInst>(SI->getValueOperand());
5020 IntegerType *Ty = cast<IntegerType>(LI->getType());
5021 assert(Ty->getBitWidth() % 8 == 0);
5022 uint64_t StoreSize = Ty->getBitWidth() / 8;
5023 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
5024
5025 auto &Offsets = SplitOffsetsMap[SI];
5026 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
5027 "Slice size should always match load size exactly!");
5028 uint64_t BaseOffset = Offsets.S->beginOffset();
5029 assert(BaseOffset + StoreSize > BaseOffset &&
5030 "Cannot represent alloca access size using 64-bit integers!");
5031
5032 Value *LoadBasePtr = LI->getPointerOperand();
5033 Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
5034
5035 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
5036
5037 // Check whether we have an already split load.
5038 auto SplitLoadsMapI = SplitLoadsMap.find(LI);
5039 std::vector<LoadInst *> *SplitLoads = nullptr;
5040 if (SplitLoadsMapI != SplitLoadsMap.end()) {
5041 SplitLoads = &SplitLoadsMapI->second;
5042 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
5043 "Too few split loads for the number of splits in the store!");
5044 } else {
5045 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
5046 }
5047
5048 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
5049 int Idx = 0, Size = Offsets.Splits.size();
5050 for (;;) {
5051 auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
5052 auto *LoadPartPtrTy = LI->getPointerOperandType();
5053 auto *StorePartPtrTy = SI->getPointerOperandType();
5054
5055 // Either lookup a split load or create one.
5056 LoadInst *PLoad;
5057 if (SplitLoads) {
5058 PLoad = (*SplitLoads)[Idx];
5059 } else {
5060 IRB.SetInsertPoint(LI);
5061 auto AS = LI->getPointerAddressSpace();
5062 PLoad = IRB.CreateAlignedLoad(
5063 PartTy,
5064 getAdjustedPtr(IRB, DL, LoadBasePtr,
5065 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5066 LoadPartPtrTy, LoadBasePtr->getName() + "."),
5067 getAdjustedAlignment(LI, PartOffset),
5068 /*IsVolatile*/ false, LI->getName());
5069 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
5070 LLVMContext::MD_access_group});
5071 }
5072
5073 // And store this partition.
5074 IRB.SetInsertPoint(SI);
5075 auto AS = SI->getPointerAddressSpace();
5076 StoreInst *PStore = IRB.CreateAlignedStore(
5077 PLoad,
5078 getAdjustedPtr(IRB, DL, StoreBasePtr,
5079 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5080 StorePartPtrTy, StoreBasePtr->getName() + "."),
5081 getAdjustedAlignment(SI, PartOffset),
5082 /*IsVolatile*/ false);
5083 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5084 LLVMContext::MD_access_group});
5085
5086 // Now build a new slice for the alloca.
5087 NewSlices.push_back(
5088 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5089 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
5090 /*IsSplittable*/ false));
5091 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5092 << ", " << NewSlices.back().endOffset()
5093 << "): " << *PStore << "\n");
5094 if (!SplitLoads) {
5095 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5096 }
5097
5098 // See if we've finished all the splits.
5099 if (Idx >= Size)
5100 break;
5101
5102 // Setup the next partition.
5103 PartOffset = Offsets.Splits[Idx];
5104 ++Idx;
5105 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
5106 }
5107
5108 // We want to immediately iterate on any allocas impacted by splitting
5109 // this load, which is only relevant if it isn't a load of this alloca and
5110 // thus we didn't already split the loads above. We also have to keep track
5111 // of any promotable allocas we split loads on as they can no longer be
5112 // promoted.
5113 if (!SplitLoads) {
5114 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
5115 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5116 ResplitPromotableAllocas.insert(OtherAI);
5117 Worklist.insert(OtherAI);
5118 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5119 LoadBasePtr->stripInBoundsOffsets())) {
5120 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5121 Worklist.insert(OtherAI);
5122 }
5123 }
5124
5125 // Mark the original store as dead now that we've split it up and kill its
5126 // slice. Note that we leave the original load in place unless this store
5127 // was its only use. It may in turn be split up if it is an alloca load
5128 // for some other alloca, but it may be a normal load. This may introduce
5129 // redundant loads, but where those can be merged the rest of the optimizer
5130 // should handle the merging, and this uncovers SSA splits which is more
5131 // important. In practice, the original loads will almost always be fully
5132 // split and removed eventually, and the splits will be merged by any
5133 // trivial CSE, including instcombine.
5134 if (LI->hasOneUse()) {
5135 assert(*LI->user_begin() == SI && "Single use isn't this store!");
5136 DeadInsts.push_back(LI);
5137 }
5138 DeadInsts.push_back(SI);
5139 Offsets.S->kill();
5140 }
5141
5142 // Remove the killed slices that have been pre-split.
5143 llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
5144
5145 // Insert our new slices. This will sort and merge them into the sorted
5146 // sequence.
5147 AS.insert(NewSlices);
5148
5149 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5150#ifndef NDEBUG
5151 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5152 LLVM_DEBUG(AS.print(dbgs(), I, " "));
5153#endif
5154
5155 // Finally, don't try to promote any allocas that now require re-splitting.
5156 // They have already been added to the worklist above.
5157 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5158
5159 return true;
5160}
5161
5162/// Rewrite an alloca partition's users.
5163///
5164/// This routine drives both of the rewriting goals of the SROA pass. It tries
5165/// to rewrite uses of an alloca partition to be conducive for SSA value
5166/// promotion. If the partition needs a new, more refined alloca, this will
5167/// build that new alloca, preserving as much type information as possible, and
5168/// rewrite the uses of the old alloca to point at the new one and have the
5169/// appropriate new offsets. It also evaluates how successful the rewrite was
5170/// at enabling promotion and if it was successful queues the alloca to be
5171/// promoted.
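/// For example (illustrative): a 4-byte partition whose uses are all float
/// loads and stores is rewritten onto a new float alloca and queued for
/// promotion, while a partition with mixed i16 and i32 uses may fall back to
/// an i32 or [4 x i8] slice type instead.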
5172AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
5173 Partition &P) {
5174 // Try to compute a friendly type for this partition of the alloca. This
5175 // won't always succeed, in which case we fall back to a legal integer type
5176 // or an i8 array of an appropriate size.
5177 Type *SliceTy = nullptr;
5178 VectorType *SliceVecTy = nullptr;
5179 const DataLayout &DL = AI.getDataLayout();
5180 unsigned VScale = AI.getFunction()->getVScaleValue();
5181
5182 std::pair<Type *, IntegerType *> CommonUseTy =
5183 findCommonType(P.begin(), P.end(), P.endOffset());
5184 // Do all uses operate on the same type?
5185 if (CommonUseTy.first) {
5186 TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
5187 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
5188 SliceTy = CommonUseTy.first;
5189 SliceVecTy = dyn_cast<VectorType>(SliceTy);
5190 }
5191 }
5192 // If not, can we find an appropriate subtype in the original allocated type?
5193 if (!SliceTy)
5194 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5195 P.beginOffset(), P.size()))
5196 SliceTy = TypePartitionTy;
5197
5198 // If still not, can we use the largest bitwidth integer type used?
5199 if (!SliceTy && CommonUseTy.second)
5200 if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
5201 SliceTy = CommonUseTy.second;
5202 SliceVecTy = dyn_cast<VectorType>(SliceTy);
5203 }
5204 if ((!SliceTy || (SliceTy->isArrayTy() &&
5205 SliceTy->getArrayElementType()->isIntegerTy())) &&
5206 DL.isLegalInteger(P.size() * 8)) {
5207 SliceTy = Type::getIntNTy(*C, P.size() * 8);
5208 }
5209
5210 // If the common use types are not viable for promotion then attempt to find
5211 // another type that is viable.
5212 if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale))
5213 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5214 P.beginOffset(), P.size())) {
5215 VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
5216 if (TypePartitionVecTy &&
5217 checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale))
5218 SliceTy = TypePartitionTy;
5219 }
5220
5221 if (!SliceTy)
5222 SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
5223 assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
5224
5225 bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
5226
5227 VectorType *VecTy =
5228 IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
5229 if (VecTy)
5230 SliceTy = VecTy;
5231
5232 // Check for the case where we're going to rewrite to a new alloca of the
5233 // exact same type as the original, and with the same access offsets. In that
5234 // case, re-use the existing alloca, but still run through the rewriter to
5235 // perform phi and select speculation.
5236 // P.beginOffset() can be non-zero even with the same type in a case with
5237 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5238 AllocaInst *NewAI;
5239 if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
5240 NewAI = &AI;
5241 // FIXME: We should be able to bail at this point with "nothing changed".
5242 // FIXME: We might want to defer PHI speculation until after here.
5243 // FIXME: return nullptr;
5244 } else {
5245 // Make sure the alignment is compatible with P.beginOffset().
5246 const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
5247 // If we will get at least this much alignment from the type alone, leave
5248 // the alloca's alignment unconstrained.
5249 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
5250 NewAI = new AllocaInst(
5251 SliceTy, AI.getAddressSpace(), nullptr,
5252 IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
5253 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5254 AI.getIterator());
5255 // Copy the old AI debug location over to the new one.
5256 NewAI->setDebugLoc(AI.getDebugLoc());
5257 ++NumNewAllocas;
5258 }
5259
5260 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5261 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5262
5263 // Track the high watermark on the worklist as it is only relevant for
5264 // promoted allocas. We will reset it to this point if the alloca is not in
5265 // fact scheduled for promotion.
5266 unsigned PPWOldSize = PostPromotionWorklist.size();
5267 unsigned NumUses = 0;
5268 SmallSetVector<PHINode *, 8> PHIUsers;
5269 SmallSetVector<SelectInst *, 8> SelectUsers;
5270
5271 AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
5272 P.endOffset(), IsIntegerPromotable, VecTy,
5273 PHIUsers, SelectUsers);
5274 bool Promotable = true;
5275 // Check whether we can have tree-structured merge.
5276 if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
5277 NumUses += DeletedValues->size() + 1;
5278 for (Value *V : *DeletedValues)
5279 DeadInsts.push_back(V);
5280 } else {
5281 for (Slice *S : P.splitSliceTails()) {
5282 Promotable &= Rewriter.visit(S);
5283 ++NumUses;
5284 }
5285 for (Slice &S : P) {
5286 Promotable &= Rewriter.visit(&S);
5287 ++NumUses;
5288 }
5289 }
5290
5291 NumAllocaPartitionUses += NumUses;
5292 MaxUsesPerAllocaPartition.updateMax(NumUses);
5293
5294 // Now that we've processed all the slices in the new partition, check if any
5295 // PHIs or Selects would block promotion.
5296 for (PHINode *PHI : PHIUsers)
5297 if (!isSafePHIToSpeculate(*PHI)) {
5298 Promotable = false;
5299 PHIUsers.clear();
5300 SelectUsers.clear();
5301 break;
5302 }
5303
5304 SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
5305 NewSelectsToRewrite;
5306 NewSelectsToRewrite.reserve(SelectUsers.size());
5307 for (SelectInst *Sel : SelectUsers) {
5308 std::optional<RewriteableMemOps> Ops =
5309 isSafeSelectToSpeculate(*Sel, PreserveCFG);
5310 if (!Ops) {
5311 Promotable = false;
5312 PHIUsers.clear();
5313 SelectUsers.clear();
5314 NewSelectsToRewrite.clear();
5315 break;
5316 }
5317 NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
5318 }
5319
5320 if (Promotable) {
5321 for (Use *U : AS.getDeadUsesIfPromotable()) {
5322 auto *OldInst = dyn_cast<Instruction>(U->get());
5323 Value::dropDroppableUse(*U);
5324 if (OldInst)
5325 if (isInstructionTriviallyDead(OldInst))
5326 DeadInsts.push_back(OldInst);
5327 }
5328 if (PHIUsers.empty() && SelectUsers.empty()) {
5329 // Promote the alloca.
5330 PromotableAllocas.insert(NewAI);
5331 } else {
5332 // If we have either PHIs or Selects to speculate, add them to those
5333 // worklists and re-queue the new alloca so that we promote it on the
5334 // next iteration.
5335 SpeculatablePHIs.insert_range(PHIUsers);
5336 SelectsToRewrite.reserve(SelectsToRewrite.size() +
5337 NewSelectsToRewrite.size());
5338 for (auto &&KV : llvm::make_range(
5339 std::make_move_iterator(NewSelectsToRewrite.begin()),
5340 std::make_move_iterator(NewSelectsToRewrite.end())))
5341 SelectsToRewrite.insert(std::move(KV));
5342 Worklist.insert(NewAI);
5343 }
5344 } else {
5345 // Drop any post-promotion work items if promotion didn't happen.
5346 while (PostPromotionWorklist.size() > PPWOldSize)
5347 PostPromotionWorklist.pop_back();
5348
5349 // We couldn't promote and we didn't create a new partition, nothing
5350 // happened.
5351 if (NewAI == &AI)
5352 return nullptr;
5353
5354 // If we can't promote the alloca, iterate on it to check for new
5355 // refinements exposed by splitting the current alloca. Don't iterate on an
5356 // alloca which didn't actually change and didn't get promoted.
5357 Worklist.insert(NewAI);
5358 }
5359
5360 return NewAI;
5361}
5362
5363// There isn't a shared interface to get the "address" parts out of a
5364// dbg.declare and dbg.assign, so provide some wrappers.
5365 static bool isKillAddress(const DbgVariableRecord *DVR) {
5366 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5367 return DVR->isKillAddress();
5368 return DVR->isKillLocation();
5369}
5370
5371 static const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5372 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5373 return DVR->getAddressExpression();
5374 return DVR->getExpression();
5375}
5376
5377/// Create or replace an existing fragment in a DIExpression with \p Frag.
5378/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5379/// operation, add \p BitExtractOffset to the offset part.
5380///
5381/// Returns the new expression, or nullptr if this fails (see details below).
5382///
5383/// This function is similar to DIExpression::createFragmentExpression except
5384/// for 3 important distinctions:
5385/// 1. The new fragment isn't relative to an existing fragment.
5386/// 2. It assumes the computed location is a memory location. This means we
5387/// don't need to perform checks that creating the fragment preserves the
5388/// expression semantics.
5389/// 3. Existing extract_bits are modified independently of fragment changes
5390/// using \p BitExtractOffset. A change to the fragment offset or size
5391/// may affect a bit extract. But a bit extract offset can change
5392/// independently of the fragment dimensions.
5393///
5394/// Returns the new expression, or nullptr if one couldn't be created.
5395/// Ideally this is only used to signal that a bit-extract has become
5396/// zero-sized (and thus the new debug record has no size and can be
5397/// dropped), however, it fails for other reasons too - see the FIXME below.
5398///
5399/// FIXME: To keep the change that introduces this function NFC it bails
5400 /// in some situations unnecessarily, e.g. when fragment and bit extract
5401/// sizes differ.
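/// For example (illustrative): given an expression consisting only of
/// DW_OP_LLVM_fragment 0 32 and Frag = {OffsetInBits = 32, SizeInBits = 16},
/// the result is an expression containing only DW_OP_LLVM_fragment 32 16,
/// i.e. the existing fragment is replaced rather than composed with.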
5402 static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5403 DIExpression::FragmentInfo Frag,
5404 int64_t BitExtractOffset) {
5405 SmallVector<uint64_t, 8> Ops;
5406 bool HasFragment = false;
5407 bool HasBitExtract = false;
5408
5409 for (auto &Op : Expr->expr_ops()) {
5410 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5411 HasFragment = true;
5412 continue;
5413 }
5414 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5415 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5416 HasBitExtract = true;
5417 int64_t ExtractOffsetInBits = Op.getArg(0);
5418 int64_t ExtractSizeInBits = Op.getArg(1);
5419
5420 // DIExpression::createFragmentExpression doesn't know how to handle
5421 // a fragment that is smaller than the extract. Copy the behaviour
5422 // (bail) to avoid non-NFC changes.
5423 // FIXME: Don't do this.
5424 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5425 return nullptr;
5426
5427 assert(BitExtractOffset <= 0);
5428 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5429
5430 // DIExpression::createFragmentExpression doesn't know what to do
5431 // if the new extract starts "outside" the existing one. Copy the
5432 // behaviour (bail) to avoid non-NFC changes.
5433 // FIXME: Don't do this.
5434 if (AdjustedOffset < 0)
5435 return nullptr;
5436
5437 Ops.push_back(Op.getOp());
5438 Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
5439 Ops.push_back(ExtractSizeInBits);
5440 continue;
5441 }
5442 Op.appendToVector(Ops);
5443 }
5444
5445 // Unsupported by createFragmentExpression, so don't support it here yet to
5446 // preserve NFC-ness.
5447 if (HasFragment && HasBitExtract)
5448 return nullptr;
5449
5450 if (!HasBitExtract) {
5451 Ops.push_back(dwarf::DW_OP_LLVM_fragment);
5452 Ops.push_back(Frag.OffsetInBits);
5453 Ops.push_back(Frag.SizeInBits);
5454 }
5455 return DIExpression::get(Expr->getContext(), Ops);
5456}
5457
5458/// Insert a new DbgRecord.
5459/// \p Orig Original to copy record type, debug loc and variable from, and
5460/// additionally value and value expression for dbg_assign records.
5461/// \p NewAddr Location's new base address.
5462/// \p NewAddrExpr New expression to apply to address.
5463/// \p BeforeInst Insert position.
5464/// \p NewFragment New fragment (absolute, non-relative).
5465/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5466static void
5467 insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5468 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5469 std::optional<DIExpression::FragmentInfo> NewFragment,
5470 int64_t BitExtractAdjustment) {
5471 (void)DIB;
5472
5473 // A dbg_assign puts fragment info in the value expression only. The address
5474 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5475 // new fragment info into NewAddrExpr (as it only has one expression).
5476 DIExpression *NewFragmentExpr =
5477 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5478 if (NewFragment)
5479 NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5480 BitExtractAdjustment);
5481 if (!NewFragmentExpr)
5482 return;
5483
5484 if (Orig->isDbgDeclare()) {
5485 DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5486 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5487 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5488 BeforeInst->getIterator());
5489 return;
5490 }
5491
5492 if (Orig->isDbgValue()) {
5493 DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5494 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5495 // Drop debug information if the expression doesn't start with a
5496 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5497 // describes the address of alloca rather than the value inside the alloca.
5498 if (!NewFragmentExpr->startsWithDeref())
5499 DVR->setKillAddress();
5500 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5501 BeforeInst->getIterator());
5502 return;
5503 }
5504
5505 // Apply a DIAssignID to the store if it doesn't already have it.
5506 if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5507 NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5508 DIAssignID::getDistinct(NewAddr->getContext()));
5509 }
5510
5511 DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5512 NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5513 NewAddrExpr, Orig->getDebugLoc());
5514 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5515 (void)NewAssign;
5516}
5517
5518/// Walks the slices of an alloca and form partitions based on them,
5519/// rewriting each of their uses.
5520bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5521 if (AS.begin() == AS.end())
5522 return false;
5523
5524 unsigned NumPartitions = 0;
5525 bool Changed = false;
5526 const DataLayout &DL = AI.getModule()->getDataLayout();
5527
5528 // First try to pre-split loads and stores.
5529 Changed |= presplitLoadsAndStores(AI, AS);
5530
5531 // Now that we have identified any pre-splitting opportunities,
5532 // mark loads and stores unsplittable except for the following case.
5533 // We leave a slice splittable if all other slices are disjoint or fully
5534 // included in the slice, such as whole-alloca loads and stores.
5535 // If we fail to split these during pre-splitting, we want to force them
5536 // to be rewritten into a partition.
5537 bool IsSorted = true;
5538
5539 uint64_t AllocaSize =
5540 DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue();
5541 const uint64_t MaxBitVectorSize = 1024;
5542 if (AllocaSize <= MaxBitVectorSize) {
5543 // If a byte boundary is included in any load or store, a slice starting or
5544 // ending at the boundary is not splittable.
5545 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5546 for (Slice &S : AS)
5547 for (unsigned O = S.beginOffset() + 1;
5548 O < S.endOffset() && O < AllocaSize; O++)
5549 SplittableOffset.reset(O);
5550
5551 for (Slice &S : AS) {
5552 if (!S.isSplittable())
5553 continue;
5554
5555 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5556 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5557 continue;
5558
5559 if (isa<LoadInst>(S.getUse()->getUser()) ||
5560 isa<StoreInst>(S.getUse()->getUser())) {
5561 S.makeUnsplittable();
5562 IsSorted = false;
5563 }
5564 }
5565 } else {
5566 // We only allow whole-alloca splittable loads and stores
5567 // for a large alloca to avoid creating too large BitVector.
5568 for (Slice &S : AS) {
5569 if (!S.isSplittable())
5570 continue;
5571
5572 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5573 continue;
5574
5575 if (isa<LoadInst>(S.getUse()->getUser()) ||
5576 isa<StoreInst>(S.getUse()->getUser())) {
5577 S.makeUnsplittable();
5578 IsSorted = false;
5579 }
5580 }
5581 }
5582
5583 if (!IsSorted)
5584 llvm::stable_sort(AS);
5585
5586 /// Describes the allocas introduced by rewritePartition in order to migrate
5587 /// the debug info.
5588 struct Fragment {
5589 AllocaInst *Alloca;
5590 uint64_t Offset;
5591 uint64_t Size;
5592 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5593 : Alloca(AI), Offset(O), Size(S) {}
5594 };
5595 SmallVector<Fragment, 4> Fragments;
5596
5597 // Rewrite each partition.
5598 for (auto &P : AS.partitions()) {
5599 if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
5600 Changed = true;
5601 if (NewAI != &AI) {
5602 uint64_t SizeOfByte = 8;
5603 uint64_t AllocaSize =
5604 DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue();
5605 // Don't include any padding.
5606 uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
5607 Fragments.push_back(
5608 Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5609 }
5610 }
5611 ++NumPartitions;
5612 }
5613
5614 NumAllocaPartitions += NumPartitions;
5615 MaxPartitionsPerAlloca.updateMax(NumPartitions);
5616
5617 // Migrate debug information from the old alloca to the new alloca(s)
5618 // and the individual partitions.
5619 auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
5620 // Can't overlap with undef memory.
5621 if (isKillAddress(DbgVariable))
5622 return;
5623
5624 const Value *DbgPtr = DbgVariable->getAddress();
5625 DIExpression::FragmentInfo VarFrag =
5626 DbgVariable->getFragmentOrEntireVariable();
5627 // Get the address expression constant offset if one exists and the ops
5628 // that come after it.
5629 int64_t CurrentExprOffsetInBytes = 0;
5630 SmallVector<uint64_t> PostOffsetOps;
5631 if (!getAddressExpression(DbgVariable)
5632 ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5633 return; // Couldn't interpret this DIExpression - drop the var.
5634
5635 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5636 int64_t ExtractOffsetInBits = 0;
5637 for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5638 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5639 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5640 ExtractOffsetInBits = Op.getArg(0);
5641 break;
5642 }
5643 }
5644
5645 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5646 for (auto Fragment : Fragments) {
5647 int64_t OffsetFromLocationInBits;
5648 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5649 // Find the variable fragment that the new alloca slice covers.
5650 // Drop debug info for this variable fragment if we can't compute an
5651 // intersect between it and the alloca slice.
5652 if (!calculateFragmentIntersect(
5653 DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
5654 CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
5655 NewDbgFragment, OffsetFromLocationInBits))
5656 continue; // Do not migrate this fragment to this slice.
5657
5658 // Zero sized fragment indicates there's no intersect between the variable
5659 // fragment and the alloca slice. Skip this slice for this variable
5660 // fragment.
5661 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5662 continue; // Do not migrate this fragment to this slice.
5663
5664 // No fragment indicates DbgVariable's variable or fragment exactly
5665 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5666 if (!NewDbgFragment)
5667 NewDbgFragment = DbgVariable->getFragment();
5668
5669 // Reduce the new expression offset by the bit-extract offset since
5670 // we'll be keeping that.
5671 int64_t OffestFromNewAllocaInBits =
5672 OffsetFromLocationInBits - ExtractOffsetInBits;
5673 // We need to adjust an existing bit extract if the offset expression
5674 // can't eat the slack (i.e., if the new offset would be negative).
5675 int64_t BitExtractOffset =
5676 std::min<int64_t>(0, OffestFromNewAllocaInBits);
5677 // The magnitude of a negative value indicates the number of bits into
5678 // the existing variable fragment that the memory region begins. The new
5679 // variable fragment already excludes those bits - the new DbgPtr offset
5680 // only needs to be applied if it's positive.
5681 OffestFromNewAllocaInBits =
5682 std::max(int64_t(0), OffestFromNewAllocaInBits);
5683
5684 // Rebuild the expression:
5685 // {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5686 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5687 // address expression but the value expression instead.
5688 DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
5689 if (OffestFromNewAllocaInBits > 0) {
5690 int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8;
5691 NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
5692 }
5693
5694 // Remove any existing intrinsics on the new alloca describing
5695 // the variable fragment.
5696 auto RemoveOne = [DbgVariable](auto *OldDII) {
5697 auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5698 return LHS->getVariable() == RHS->getVariable() &&
5699 LHS->getDebugLoc()->getInlinedAt() ==
5700 RHS->getDebugLoc()->getInlinedAt();
5701 };
5702 if (SameVariableFragment(OldDII, DbgVariable))
5703 OldDII->eraseFromParent();
5704 };
5705 for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
5706 for_each(findDVRValues(Fragment.Alloca), RemoveOne);
5707 insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
5708 NewDbgFragment, BitExtractOffset);
5709 }
5710 };
5711
5712 // Migrate debug information from the old alloca to the new alloca(s)
5713 // and the individual partitions.
5714 for_each(findDVRDeclares(&AI), MigrateOne);
5715 for_each(findDVRValues(&AI), MigrateOne);
5716 for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
5717
5718 return Changed;
5719}
5720
5721/// Clobber a use with poison, deleting the used value if it becomes dead.
5722void SROA::clobberUse(Use &U) {
5723 Value *OldV = U;
5724 // Replace the use with a poison value.
5725 U = PoisonValue::get(OldV->getType());
5726
5727 // Check for this making an instruction dead. We have to garbage collect
5728 // all the dead instructions to ensure the uses of any alloca end up being
5729 // minimal.
5730 if (Instruction *OldI = dyn_cast<Instruction>(OldV))
5731 if (isInstructionTriviallyDead(OldI)) {
5732 DeadInsts.push_back(OldI);
5733 }
5734}
5735
5736/// A basic LoadAndStorePromoter that does not remove store nodes.
5737 class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
5738 public:
5739 BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
5740 Type *ZeroType)
5741 : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
5742 bool shouldDelete(Instruction *I) const override {
5743 return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
5744 }
5745
5746 Value *getValueToUseForAlloca(Instruction *I) const override {
5747 return UndefValue::get(ZeroType);
5748 }
5749
5750private:
5751 Type *ZeroType;
5752};
5753
5754bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
5755 // Look through each "partition", looking for slices with the same start/end
5756 // that do not overlap with any before them. The slices are sorted by
5757 // increasing beginOffset. We don't use AS.partitions(), as it will use a more
5758 // sophisticated algorithm that takes splittable slices into account.
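// For example (illustrative): if every slice of a read-only-escaped alloca
// is a simple i32 load or store covering exactly [0, 4), the stored i32
// values are forwarded directly to the loads via SSAUpdater, even though the
// stores and the alloca itself must be kept.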
5759 LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
5760 bool AllSameAndValid = true;
5761 Type *PartitionType = nullptr;
5762 SmallVector<Instruction *, 4> Insts;
5763 uint64_t BeginOffset = 0;
5764 uint64_t EndOffset = 0;
5765
5766 auto Flush = [&]() {
5767 if (AllSameAndValid && !Insts.empty()) {
5768 LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
5769 << EndOffset << ")\n");
5770 SmallVector<PHINode *, 4> NewPHIs;
5771 SSAUpdater SSA(&NewPHIs);
5772 Insts.push_back(&AI);
5773 BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
5774 Promoter.run(Insts);
5775 }
5776 AllSameAndValid = true;
5777 PartitionType = nullptr;
5778 Insts.clear();
5779 };
5780
5781 for (Slice &S : AS) {
5782 auto *User = cast<Instruction>(S.getUse()->getUser());
5783 if (isAssumeLikeIntrinsic(User)) {
5784 LLVM_DEBUG({
5785 dbgs() << "Ignoring slice: ";
5786 AS.print(dbgs(), &S);
5787 });
5788 continue;
5789 }
5790 if (S.beginOffset() >= EndOffset) {
5791 Flush();
5792 BeginOffset = S.beginOffset();
5793 EndOffset = S.endOffset();
5794 } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
5795 if (AllSameAndValid) {
5796 LLVM_DEBUG({
5797 dbgs() << "Slice does not match range [" << BeginOffset << ", "
5798 << EndOffset << ")";
5799 AS.print(dbgs(), &S);
5800 });
5801 AllSameAndValid = false;
5802 }
5803 EndOffset = std::max(EndOffset, S.endOffset());
5804 continue;
5805 }
5806
5807 if (auto *LI = dyn_cast<LoadInst>(User)) {
5808 Type *UserTy = LI->getType();
5809 // LoadAndStorePromoter requires all the types to be the same.
5810 if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
5811 AllSameAndValid = false;
5812 PartitionType = UserTy;
5813 Insts.push_back(User);
5814 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
5815 Type *UserTy = SI->getValueOperand()->getType();
5816 if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
5817 AllSameAndValid = false;
5818 PartitionType = UserTy;
5819 Insts.push_back(User);
5820 } else {
5821 AllSameAndValid = false;
5822 }
5823 }
5824
5825 Flush();
5826 return true;
5827}
5828
5829/// Analyze an alloca for SROA.
5830///
5831/// This analyzes the alloca to ensure we can reason about it, builds
5832/// the slices of the alloca, and then hands it off to be split and
5833/// rewritten as needed.
5834std::pair<bool /*Changed*/, bool /*CFGChanged*/>
5835SROA::runOnAlloca(AllocaInst &AI) {
5836 bool Changed = false;
5837 bool CFGChanged = false;
5838
5839 LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5840 ++NumAllocasAnalyzed;
5841
5842 // Special case dead allocas, as they're trivial.
5843 if (AI.use_empty()) {
5844 AI.eraseFromParent();
5845 Changed = true;
5846 return {Changed, CFGChanged};
5847 }
5848 const DataLayout &DL = AI.getDataLayout();
5849
5850 // Skip alloca forms that this analysis can't handle.
5851 auto *AT = AI.getAllocatedType();
5852 TypeSize Size = DL.getTypeAllocSize(AT);
5853 if (AI.isArrayAllocation() || !AT->isSized() || Size.isScalable() ||
5854 Size.getFixedValue() == 0)
5855 return {Changed, CFGChanged};
5856
5857 // First, split any FCA loads and stores touching this alloca to promote
5858 // better splitting and promotion opportunities.
5859 IRBuilderTy IRB(&AI);
5860 AggLoadStoreRewriter AggRewriter(DL, IRB);
5861 Changed |= AggRewriter.rewrite(AI);
5862
5863 // Build the slices using a recursive instruction-visiting builder.
5864 AllocaSlices AS(DL, AI);
5865 LLVM_DEBUG(AS.print(dbgs()));
5866 if (AS.isEscaped())
5867 return {Changed, CFGChanged};
5868
5869 if (AS.isEscapedReadOnly()) {
5870 Changed |= propagateStoredValuesToLoads(AI, AS);
5871 return {Changed, CFGChanged};
5872 }
5873
5874 // Delete all the dead users of this alloca before splitting and rewriting it.
5875 for (Instruction *DeadUser : AS.getDeadUsers()) {
5876 // Free up everything used by this instruction.
5877 for (Use &DeadOp : DeadUser->operands())
5878 clobberUse(DeadOp);
5879
5880 // Now replace the uses of this instruction.
5881 DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
5882
5883 // And mark it for deletion.
5884 DeadInsts.push_back(DeadUser);
5885 Changed = true;
5886 }
5887 for (Use *DeadOp : AS.getDeadOperands()) {
5888 clobberUse(*DeadOp);
5889 Changed = true;
5890 }
5891
5892 // No slices to split. Leave the dead alloca for a later pass to clean up.
5893 if (AS.begin() == AS.end())
5894 return {Changed, CFGChanged};
5895
5896 Changed |= splitAlloca(AI, AS);
5897
5898 LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
5899 while (!SpeculatablePHIs.empty())
5900 speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
5901
5902 LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
5903 auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5904 while (!RemainingSelectsToRewrite.empty()) {
5905 const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5906 CFGChanged |=
5907 rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
5908 }
5909
5910 return {Changed, CFGChanged};
5911}
5912
5913/// Delete the dead instructions accumulated in this run.
5914///
5915/// Recursively deletes the dead instructions we've accumulated. This is done
5916/// at the very end to maximize locality of the recursive delete and to
5917/// minimize the problems of invalidated instruction pointers as such pointers
5918/// are used heavily in the intermediate stages of the algorithm.
5919///
5920/// We also record the alloca instructions deleted here so that they aren't
5921/// subsequently handed to mem2reg to promote.
5922bool SROA::deleteDeadInstructions(
5923 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
5924 bool Changed = false;
5925 while (!DeadInsts.empty()) {
5926 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
5927 if (!I)
5928 continue;
5929 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
5930
5931 // If the instruction is an alloca, find the possible dbg.declare connected
5932 // to it, and remove it too. We must do this before calling RAUW or we will
5933 // not be able to find it.
5934 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5935 DeletedAllocas.insert(AI);
5936 for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
5937 OldDII->eraseFromParent();
5938 }
5939
5940 at::deleteAssignmentMarkers(I);
5941 I->replaceAllUsesWith(UndefValue::get(I->getType()));
5942
5943 for (Use &Operand : I->operands())
5944 if (Instruction *U = dyn_cast<Instruction>(Operand)) {
5945 // Zero out the operand and see if it becomes trivially dead.
5946 Operand = nullptr;
5947 if (isInstructionTriviallyDead(U))
5948 DeadInsts.push_back(U);
5949 }
5950
5951 ++NumDeleted;
5952 I->eraseFromParent();
5953 Changed = true;
5954 }
5955 return Changed;
5956}
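
The loop above is a hand-rolled version of the usual trivially-dead worklist pattern, specialized to also unlink debug records and track deleted allocas. For comparison, a generic sketch (not part of SROA.cpp; the function name is illustrative) built only on the public isInstructionTriviallyDead helper could look like this:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"

// Sketch only: erase Root and, transitively, any operand instructions that
// become trivially dead once their user disappears.
static void eraseWithNewlyDeadOperands(llvm::Instruction *Root) {
  llvm::SmallVector<llvm::Instruction *, 8> Worklist{Root};
  llvm::SmallPtrSet<llvm::Instruction *, 8> Enqueued;
  Enqueued.insert(Root);
  while (!Worklist.empty()) {
    llvm::Instruction *I = Worklist.pop_back_val();
    // Remember the distinct operand instructions before erasing I.
    llvm::SmallPtrSet<llvm::Instruction *, 4> Ops;
    for (llvm::Use &U : I->operands())
      if (auto *OpI = llvm::dyn_cast<llvm::Instruction>(U.get()))
        Ops.insert(OpI);
    I->replaceAllUsesWith(llvm::UndefValue::get(I->getType()));
    I->eraseFromParent();
    // Whatever just lost its last use can now be deleted as well.
    for (llvm::Instruction *OpI : Ops)
      if (llvm::isInstructionTriviallyDead(OpI) && Enqueued.insert(OpI).second)
        Worklist.push_back(OpI);
  }
}
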
5957/// Promote the allocas, using the best available technique.
5958///
5959/// This attempts to promote whatever allocas have been identified as viable in
5960/// the PromotableAllocas list. If that list is empty, there is nothing to do.
5961/// This function returns whether any promotion occurred.
5962bool SROA::promoteAllocas() {
5963 if (PromotableAllocas.empty())
5964 return false;
5965
5966 if (SROASkipMem2Reg) {
5967 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
5968 } else {
5969 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
5970 NumPromoted += PromotableAllocas.size();
5971 PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
5972 }
5973
5974 PromotableAllocas.clear();
5975 return true;
5976}
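
promoteAllocas itself only forwards the collected allocas to the mem2reg utility. Below is a minimal standalone sketch of that utility (not part of SROA.cpp; the helper name is illustrative), assuming F, DT and AC are a valid Function with its DominatorTree and AssumptionCache.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"

// Sketch only: promote every promotable entry-block alloca straight to SSA.
static bool promoteEntryBlockAllocasSketch(llvm::Function &F,
                                           llvm::DominatorTree &DT,
                                           llvm::AssumptionCache &AC) {
  llvm::SmallVector<llvm::AllocaInst *, 8> Allocas;
  for (llvm::Instruction &I : F.getEntryBlock())
    if (auto *AI = llvm::dyn_cast<llvm::AllocaInst>(&I))
      if (llvm::isAllocaPromotable(AI)) // same legality test mem2reg applies
        Allocas.push_back(AI);
  if (Allocas.empty())
    return false;
  llvm::PromoteMemToReg(Allocas, DT, &AC); // loads/stores become SSA values
  return true;
}
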
5977
5978std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
5979 LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
5980
5981 const DataLayout &DL = F.getDataLayout();
5982 BasicBlock &EntryBB = F.getEntryBlock();
5983 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
5984 I != E; ++I) {
5985 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5986 if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() &&
5987 isAllocaPromotable(AI))
5988 PromotableAllocas.insert(AI);
5989 else
5990 Worklist.insert(AI);
5991 }
5992 }
5993
5994 bool Changed = false;
5995 bool CFGChanged = false;
5996 // A set of deleted alloca instruction pointers which should be removed from
5997 // the list of promotable allocas.
5998 SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
5999
6000 do {
6001 while (!Worklist.empty()) {
6002 auto [IterationChanged, IterationCFGChanged] =
6003 runOnAlloca(*Worklist.pop_back_val());
6004 Changed |= IterationChanged;
6005 CFGChanged |= IterationCFGChanged;
6006
6007 Changed |= deleteDeadInstructions(DeletedAllocas);
6008
6009 // Remove the deleted allocas from various lists so that we don't try to
6010 // continue processing them.
6011 if (!DeletedAllocas.empty()) {
6012 Worklist.set_subtract(DeletedAllocas);
6013 PostPromotionWorklist.set_subtract(DeletedAllocas);
6014 PromotableAllocas.set_subtract(DeletedAllocas);
6015 DeletedAllocas.clear();
6016 }
6017 }
6018
6019 Changed |= promoteAllocas();
6020
6021 Worklist = PostPromotionWorklist;
6022 PostPromotionWorklist.clear();
6023 } while (!Worklist.empty());
6024
6025 assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
6026 assert((!CFGChanged || !PreserveCFG) &&
6027 "Should not have modified the CFG when told to preserve it.");
6028
6029 if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
6030 for (auto &BB : F) {
6031 RemoveRedundantDbgInstrs(&BB);
6032 }
6033 }
6034
6035 return {Changed, CFGChanged};
6036}
6037
6038PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
6039 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
6040 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
6041 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6042 auto [Changed, CFGChanged] =
6043 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6044 if (!Changed)
6045 return PreservedAnalyses::all();
6046 PreservedAnalyses PA;
6047 if (!CFGChanged)
6048 PA.preserveSet<CFGAnalyses>();
6049 PA.preserve<DominatorTreeAnalysis>();
6050 return PA;
6051}
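
For context, this is how the pass is typically reached from client code through the new pass manager. The sketch below is not part of SROA.cpp; it builds all four analysis managers from scratch and assumes F is a Function in an already-materialized Module.

#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/SROA.h"

// Sketch only: run SROA on a single function via the new pass manager.
static void runSROAOnFunctionSketch(llvm::Function &F) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::SROAPass(llvm::SROAOptions::ModifyCFG)); // or PreserveCFG
  FPM.run(F, FAM);
}
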
6052
6053void SROAPass::printPipeline(
6054 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6055 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
6056 OS, MapClassName2PassName);
6057 OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
6058 : "<modify-cfg>");
6059}
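
These parameter strings are the same ones accepted in a textual pipeline (for example 'opt -passes=sroa<preserve-cfg>'), so the option can equally be selected by parsing a pipeline string. A minimal sketch, assuming a PassBuilder PB and FunctionAnalysisManager FAM set up as in the previous sketch (not part of SROA.cpp):

// Sketch only: build a function pipeline from its textual form.
// report_fatal_error(Error) comes from llvm/Support/Error.h.
llvm::FunctionPassManager FPM;
if (llvm::Error Err = PB.parsePassPipeline(FPM, "sroa<preserve-cfg>"))
  llvm::report_fatal_error(std::move(Err));
FPM.run(F, FAM);
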
6060
6061SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
6062
6063namespace {
6064
6065/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
6066class SROALegacyPass : public FunctionPass {
6067 SROAOptions PreserveCFG;
6068
6069public:
6070 static char ID;
6071
6072 SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
6073 : FunctionPass(ID), PreserveCFG(PreserveCFG) {
6074 initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
6075 }
6076
6077 bool runOnFunction(Function &F) override {
6078 if (skipFunction(F))
6079 return false;
6080
6081 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
6082 AssumptionCache &AC =
6083 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
6084 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6085 auto [Changed, _] =
6086 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6087 return Changed;
6088 }
6089
6090 void getAnalysisUsage(AnalysisUsage &AU) const override {
6091 AU.addRequired<AssumptionCacheTracker>();
6092 AU.addRequired<DominatorTreeWrapperPass>();
6093 AU.addPreserved<GlobalsAAWrapperPass>();
6094 AU.addPreserved<DominatorTreeWrapperPass>();
6095 }
6096
6097 StringRef getPassName() const override { return "SROA"; }
6098};
6099
6100} // end anonymous namespace
6101
6102char SROALegacyPass::ID = 0;
6103
6104FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
6105 return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
6106 : SROAOptions::ModifyCFG);
6107}
6108
6109INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
6110 "Scalar Replacement Of Aggregates", false, false)
6111INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6112INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6113INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
6114 false, false)
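
Finally, a minimal usage sketch for the legacy pass manager path registered above (not part of SROA.cpp; M is assumed to be an already-parsed Module and the function name is illustrative):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"

// Sketch only: drive the wrapped legacy pass through createSROAPass().
static void runLegacySROASketch(llvm::Module &M) {
  llvm::legacy::FunctionPassManager FPM(&M);
  FPM.add(llvm::createSROAPass(/*PreserveCFG=*/true));
  FPM.doInitialization();
  for (llvm::Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F);
  FPM.doFinalization();
}
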