SROA.cpp
1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
19/// Because this also performs alloca promotion, it can be thought of as also
20/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
22///
23//===----------------------------------------------------------------------===//
24
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
44#include "llvm/Analysis/Loads.h"
47#include "llvm/Config/llvm-config.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/LLVMContext.h"
66#include "llvm/IR/Metadata.h"
67#include "llvm/IR/Module.h"
68#include "llvm/IR/Operator.h"
69#include "llvm/IR/PassManager.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
76#include "llvm/Pass.h"
80#include "llvm/Support/Debug.h"
88#include <algorithm>
89#include <cassert>
90#include <cstddef>
91#include <cstdint>
92#include <cstring>
93#include <iterator>
94#include <queue>
95#include <string>
96#include <tuple>
97#include <utility>
98#include <variant>
99#include <vector>
100
101using namespace llvm;
102
103#define DEBUG_TYPE "sroa"
104
105STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
106STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
107STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
108STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
109STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
110STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
111STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
112STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
113STATISTIC(NumLoadsPredicated,
114 "Number of loads rewritten into predicated loads to allow promotion");
115STATISTIC(
116 NumStoresPredicated,
117 "Number of stores rewritten into predicated stores to allow promotion");
118STATISTIC(NumDeleted, "Number of instructions deleted");
119STATISTIC(NumVectorized, "Number of vectorized aggregates");
120
121namespace llvm {
122/// Disable running mem2reg during SROA in order to test or debug SROA.
123static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
124 cl::Hidden);
126} // namespace llvm
127
128namespace {
129
130class AllocaSliceRewriter;
131class AllocaSlices;
132class Partition;
133
134class SelectHandSpeculativity {
135 unsigned char Storage = 0; // None are speculatable by default.
136 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
137 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
138public:
139 SelectHandSpeculativity() = default;
140 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
141 bool isSpeculatable(bool isTrueVal) const;
142 bool areAllSpeculatable() const;
143 bool areAnySpeculatable() const;
144 bool areNoneSpeculatable() const;
145 // For interop as int half of PointerIntPair.
146 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
147 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
148};
149static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
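// For illustration (semantics assumed from the accessor names): marking only
// the true-hand side via setAsSpeculatable(/*isTrueVal=*/true) sets bit 0, so
// areAnySpeculatable() returns true while areAllSpeculatable() and
// areNoneSpeculatable() both return false.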
150
151using PossiblySpeculatableLoad =
152 PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
153using UnspeculatableStore = StoreInst *;
154using RewriteableMemOp =
155 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
156using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
157
158/// An optimization pass providing Scalar Replacement of Aggregates.
159///
160/// This pass takes allocations which can be completely analyzed (that is, they
161/// don't escape) and tries to turn them into scalar SSA values. There are
162/// a few steps to this process.
163///
164/// 1) It takes allocations of aggregates and analyzes the ways in which they
165/// are used to try to split them into smaller allocations, ideally of
166/// a single scalar data type. It will split up memcpy and memset accesses
167/// as necessary and try to isolate individual scalar accesses.
168/// 2) It will transform accesses into forms which are suitable for SSA value
169/// promotion. This can be replacing a memset with a scalar store of an
170/// integer value, or it can involve speculating operations on a PHI or
171/// select to be a PHI or select of the results.
172/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
173/// onto insert and extract operations on a vector value, and convert them to
174/// this form. By doing so, it will enable promotion of vector aggregates to
175/// SSA vector values.
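///
/// As a rough illustration of steps (1)-(2), an alloca of { i32, i32 } whose
/// two fields are only ever accessed independently:
///
///   %a = alloca { i32, i32 }
///   store i32 %x, ptr %a
///   %f1 = getelementptr { i32, i32 }, ptr %a, i32 0, i32 1
///   store i32 %y, ptr %f1
///   %v = load i32, ptr %a
///
/// is split into two i32 allocas, which promotion then turns into plain SSA
/// values, so %v simply becomes %x.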
176class SROA {
177 LLVMContext *const C;
178 DomTreeUpdater *const DTU;
179 AssumptionCache *const AC;
180 const bool PreserveCFG;
181
182 /// Worklist of alloca instructions to simplify.
183 ///
184 /// Each alloca in the function is added to this. Each new alloca formed gets
185 /// added to it as well to recursively simplify unless that alloca can be
186 /// directly promoted. Finally, each time we rewrite a use of an alloca other
187 /// than the one being actively rewritten, we add it back onto the list if not
188 /// already present to ensure it is re-visited.
189 SmallSetVector<AllocaInst *, 16> Worklist;
190
191 /// A collection of instructions to delete.
192 /// We try to batch deletions to simplify code and make things a bit more
193 /// efficient. We also make sure there are no dangling pointers.
194 SmallVector<WeakVH, 8> DeadInsts;
195
196 /// Post-promotion worklist.
197 ///
198 /// Sometimes we discover an alloca which has a high probability of becoming
199 /// viable for SROA after a round of promotion takes place. In those cases,
200 /// the alloca is enqueued here for re-processing.
201 ///
202 /// Note that we have to be very careful to clear allocas out of this list in
203 /// the event they are deleted.
204 SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;
205
206 /// A collection of alloca instructions we can directly promote.
207 SetVector<AllocaInst *, SmallVector<AllocaInst *>,
208 SmallPtrSet<AllocaInst *, 16>, 16>
209 PromotableAllocas;
210
211 /// A worklist of PHIs to speculate prior to promoting allocas.
212 ///
213 /// All of these PHIs have been checked for the safety of speculation and by
214 /// being speculated will allow promoting allocas currently in the promotable
215 /// queue.
216 SmallSetVector<PHINode *, 8> SpeculatablePHIs;
217
218 /// A worklist of select instructions to rewrite prior to promoting
219 /// allocas.
220 SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
221
222 /// Select instructions that use an alloca and are subsequently loaded can be
223 /// rewritten to load both input pointers and then select between the result,
224 /// allowing the load of the alloca to be promoted.
225 /// From this:
226 /// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
227 /// %V = load <type>, ptr %P2
228 /// to:
229 /// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
230 /// %V2 = load <type>, ptr %Other
231 /// %V = select i1 %cond, <type> %V1, <type> %V2
232 ///
233 /// We can do this to a select if its only uses are loads
234 /// and if either the operand to the select can be loaded unconditionally,
235 /// or if we are allowed to perform CFG modifications.
236 /// If an intervening bitcast with a single use of the load is found,
237 /// allow the promotion.
238 static std::optional<RewriteableMemOps>
239 isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
240
241public:
242 SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
243 SROAOptions PreserveCFG_)
244 : C(C), DTU(DTU), AC(AC),
245 PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
246
247 /// Main run method used by both the SROAPass and by the legacy pass.
248 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);
249
250private:
251 friend class AllocaSliceRewriter;
252
253 bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
254 AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
255 bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
256 bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
257 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
258 void clobberUse(Use &U);
259 bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
260 bool promoteAllocas();
261};
262
263} // end anonymous namespace
264
265/// Calculate the fragment of a variable to use when slicing a store
266/// based on the slice dimensions, existing fragment, and base storage
267/// fragment.
268/// Results:
269/// UseFrag - Use Target as the new fragment.
270/// UseNoFrag - The new slice already covers the whole variable.
271/// Skip - The new alloca slice doesn't include this variable.
272/// FIXME: Can we use calculateFragmentIntersect instead?
273namespace {
274enum FragCalcResult { UseFrag, UseNoFrag, Skip };
275}
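// Worked example (illustrative): for a 64-bit variable with no base storage
// fragment and no pre-existing fragment on the dbg.assign, a split store that
// writes bits [32, 64) of the alloca yields Target = {OffsetInBits: 32,
// SizeInBits: 32} and returns UseFrag, while a store covering bits [0, 64)
// matches the whole variable and returns UseNoFrag.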
276static FragCalcResult
277calculateFragment(DILocalVariable *Variable,
278 uint64_t NewStorageSliceOffsetInBits,
279 uint64_t NewStorageSliceSizeInBits,
280 std::optional<DIExpression::FragmentInfo> StorageFragment,
281 std::optional<DIExpression::FragmentInfo> CurrentFragment,
282 DIExpression::FragmentInfo &Target) {
283 // If the base storage describes part of the variable apply the offset and
284 // the size constraint.
285 if (StorageFragment) {
286 Target.SizeInBits =
287 std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
288 Target.OffsetInBits =
289 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
290 } else {
291 Target.SizeInBits = NewStorageSliceSizeInBits;
292 Target.OffsetInBits = NewStorageSliceOffsetInBits;
293 }
294
295 // If this slice extracts the entirety of an independent variable from a
296 // larger alloca, do not produce a fragment expression, as the variable is
297 // not fragmented.
298 if (!CurrentFragment) {
299 if (auto Size = Variable->getSizeInBits()) {
300 // Treat the current fragment as covering the whole variable.
301 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
302 if (Target == CurrentFragment)
303 return UseNoFrag;
304 }
305 }
306
307 // No additional work to do if there isn't a fragment already, or there is
308 // but it already exactly describes the new assignment.
309 if (!CurrentFragment || *CurrentFragment == Target)
310 return UseFrag;
311
312 // Reject the target fragment if it doesn't fit wholly within the current
313 // fragment. TODO: We could instead chop up the target to fit in the case of
314 // a partial overlap.
315 if (Target.startInBits() < CurrentFragment->startInBits() ||
316 Target.endInBits() > CurrentFragment->endInBits())
317 return Skip;
318
319 // Target fits within the current fragment, return it.
320 return UseFrag;
321}
322
323static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
324 return DebugVariable(DVR->getVariable(), std::nullopt,
325 DVR->getDebugLoc().getInlinedAt());
326}
327
328/// Find linked dbg.assign and generate a new one with the correct
329/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
330/// value component is copied from the old dbg.assign to the new.
331/// \param OldAlloca Alloca for the variable before splitting.
332/// \param IsSplit True if the store (not necessarily alloca)
333/// is being split.
334/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
335/// \param SliceSizeInBits New number of bits being written to.
336/// \param OldInst Instruction that is being split.
337/// \param Inst New instruction performing this part of the
338/// split store.
339/// \param Dest Store destination.
340/// \param Value Stored value.
341/// \param DL Datalayout.
342static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
343 uint64_t OldAllocaOffsetInBits,
344 uint64_t SliceSizeInBits, Instruction *OldInst,
345 Instruction *Inst, Value *Dest, Value *Value,
346 const DataLayout &DL) {
347 // If we want allocas to be migrated using this helper then we need to ensure
348 // that the BaseFragments map code still works. A simple solution would be
349 // to choose to always clone alloca dbg_assigns (rather than sometimes
350 // "stealing" them).
351 assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");
352
353 auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
354 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
355 if (DVRAssignMarkerRange.empty())
356 return;
357
358 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
359 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
360 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
361 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
362 << "\n");
363 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
364 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
365 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
366 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
367 if (Value)
368 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
369
370 /// Map of aggregate variables to their fragment associated with OldAlloca.
371 SmallDenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
372 BaseFragments;
373 for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
374 BaseFragments[getAggregateVariable(DVR)] =
375 DVR->getExpression()->getFragmentInfo();
376
377 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
378 // one). It shouldn't already have one: assert this assumption.
379 assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
380 DIAssignID *NewID = nullptr;
381 auto &Ctx = Inst->getContext();
382 DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
383 assert(OldAlloca->isStaticAlloca());
384
385 auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
386 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
387 << "\n");
388 auto *Expr = DbgAssign->getExpression();
389 bool SetKillLocation = false;
390
391 if (IsSplit) {
392 std::optional<DIExpression::FragmentInfo> BaseFragment;
393 {
394 auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
395 if (R == BaseFragments.end())
396 return;
397 BaseFragment = R->second;
398 }
399 std::optional<DIExpression::FragmentInfo> CurrentFragment =
400 Expr->getFragmentInfo();
401 DIExpression::FragmentInfo NewFragment;
402 FragCalcResult Result = calculateFragment(
403 DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
404 BaseFragment, CurrentFragment, NewFragment);
405
406 if (Result == Skip)
407 return;
408 if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
409 if (CurrentFragment) {
410 // Rewrite NewFragment to be relative to the existing one (this is
411 // what createFragmentExpression wants). CalculateFragment has
412 // already resolved the size for us. FIXME: Should it return the
413 // relative fragment too?
414 NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
415 }
416 // Add the new fragment info to the existing expression if possible.
417 if (auto E = DIExpression::createFragmentExpression(
418 Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
419 Expr = *E;
420 } else {
421 // Otherwise, add the new fragment info to an empty expression and
422 // discard the value component of this dbg.assign as the value cannot
423 // be computed with the new fragment.
424 Expr = *DIExpression::createFragmentExpression(
425 DIExpression::get(Expr->getContext(), {}),
426 NewFragment.OffsetInBits, NewFragment.SizeInBits);
427 SetKillLocation = true;
428 }
429 }
430 }
431
432 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
433 if (!NewID) {
434 NewID = DIAssignID::getDistinct(Ctx);
435 Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
436 }
437
438 DbgVariableRecord *NewAssign;
439 if (IsSplit) {
440 ::Value *NewValue = Value ? Value : DbgAssign->getValue();
441 NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
442 DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
443 Dest, DIExpression::get(Expr->getContext(), {}),
444 DbgAssign->getDebugLoc())));
445 } else {
446 // The store is not split, simply steal the existing dbg_assign.
447 NewAssign = DbgAssign;
448 NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
449 NewAssign->setAddress(Dest);
450 if (Value)
451 NewAssign->replaceVariableLocationOp(0u, Value);
452 assert(Expr == NewAssign->getExpression());
453 }
454
455 // If we've updated the value but the original dbg.assign has an arglist
456 // then kill it now - we can't use the requested new value.
457 // We can't replace the DIArgList with the new value as it'd leave
458 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
459 // an arglist). And we can't keep the DIArgList in case the linked store
460 // is being split - in which case the DIArgList + expression may no longer
461 // be computing the correct value.
462 // This should be a very rare situation as it requires the value being
463 // stored to differ from the dbg.assign (i.e., the value has been
464 // represented differently in the debug intrinsic for some reason).
465 SetKillLocation |=
466 Value && (DbgAssign->hasArgList() ||
467 !DbgAssign->getExpression()->isSingleLocationExpression());
468 if (SetKillLocation)
469 NewAssign->setKillLocation();
470
471 // We could use more precision here at the cost of some additional (code)
472 // complexity - if the original dbg.assign was adjacent to its store, we
473 // could position this new dbg.assign adjacent to its store rather than the
474 // old dbg.assign. That would result in interleaved dbg.assigns rather than
475 // what we get now:
476 // split store !1
477 // split store !2
478 // dbg.assign !1
479 // dbg.assign !2
480 // This (current behaviour) results in debug assignments being
481 // noted as slightly offset (in code) from the store. In practice this
482 // should have little effect on the debugging experience due to the fact
483 // that all the split stores should get the same line number.
484 if (NewAssign != DbgAssign) {
485 NewAssign->moveBefore(DbgAssign->getIterator());
486 NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
487 }
488 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
489 };
490
491 for_each(DVRAssignMarkerRange, MigrateDbgAssign);
492}
493
494namespace {
495
496/// A custom IRBuilder inserter which prefixes all names, but only in
497/// Assert builds.
498class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
499 std::string Prefix;
500
501 Twine getNameWithPrefix(const Twine &Name) const {
502 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
503 }
504
505public:
506 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
507
508 void InsertHelper(Instruction *I, const Twine &Name,
509 BasicBlock::iterator InsertPt) const override {
510 IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name),
511 InsertPt);
512 }
513};
514
515/// Provide a type for IRBuilder that drops names in release builds.
516using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
517
518/// A used slice of an alloca.
519///
520/// This structure represents a slice of an alloca used by some instruction. It
521/// stores both the begin and end offsets of this use, a pointer to the use
522/// itself, and a flag indicating whether we can classify the use as splittable
523/// or not when forming partitions of the alloca.
524class Slice {
525 /// The beginning offset of the range.
526 uint64_t BeginOffset = 0;
527
528 /// The ending offset, not included in the range.
529 uint64_t EndOffset = 0;
530
531 /// Storage for both the use of this slice and whether it can be
532 /// split.
533 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
534
535public:
536 Slice() = default;
537
538 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
539 : BeginOffset(BeginOffset), EndOffset(EndOffset),
540 UseAndIsSplittable(U, IsSplittable) {}
541
542 uint64_t beginOffset() const { return BeginOffset; }
543 uint64_t endOffset() const { return EndOffset; }
544
545 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
546 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
547
548 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
549
550 bool isDead() const { return getUse() == nullptr; }
551 void kill() { UseAndIsSplittable.setPointer(nullptr); }
552
553 /// Support for ordering ranges.
554 ///
555 /// This provides an ordering over ranges such that start offsets are
556 /// always increasing, and within equal start offsets, the end offsets are
557 /// decreasing. Thus the spanning range comes first in a cluster with the
558 /// same start position.
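  /// For example, the slices [0,16) unsplittable, [0,16) splittable, [0,8)
  /// splittable and [4,8) splittable sort in exactly that order: begin
  /// offsets ascend, unsplittable slices precede splittable ones at the same
  /// begin offset, and longer slices precede shorter ones.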
559 bool operator<(const Slice &RHS) const {
560 if (beginOffset() < RHS.beginOffset())
561 return true;
562 if (beginOffset() > RHS.beginOffset())
563 return false;
564 if (isSplittable() != RHS.isSplittable())
565 return !isSplittable();
566 if (endOffset() > RHS.endOffset())
567 return true;
568 return false;
569 }
570
571 /// Support comparison with a single offset to allow binary searches.
572 [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
573 return LHS.beginOffset() < RHSOffset;
574 }
575 [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
576 return LHSOffset < RHS.beginOffset();
577 }
578
579 bool operator==(const Slice &RHS) const {
580 return isSplittable() == RHS.isSplittable() &&
581 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
582 }
583 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
584};
585
586/// Representation of the alloca slices.
587///
588/// This class represents the slices of an alloca which are formed by its
589/// various uses. If a pointer escapes, we can't fully build a representation
590/// for the slices used and we reflect that in this structure. The uses are
591/// stored, sorted by increasing beginning offset and with unsplittable slices
592/// starting at a particular offset before splittable slices.
593class AllocaSlices {
594public:
595 /// Construct the slices of a particular alloca.
596 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
597
598 /// Test whether a pointer to the allocation escapes our analysis.
599 ///
600 /// If this is true, the slices are never fully built and should be
601 /// ignored.
602 bool isEscaped() const { return PointerEscapingInstr; }
603 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
604
605 /// Support for iterating over the slices.
606 /// @{
607 using iterator = SmallVectorImpl<Slice>::iterator;
608 using range = iterator_range<iterator>;
609
610 iterator begin() { return Slices.begin(); }
611 iterator end() { return Slices.end(); }
612
613 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
614 using const_range = iterator_range<const_iterator>;
615
616 const_iterator begin() const { return Slices.begin(); }
617 const_iterator end() const { return Slices.end(); }
618 /// @}
619
620 /// Erase a range of slices.
621 void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
622
623 /// Insert new slices for this alloca.
624 ///
625 /// This moves the slices into the alloca's slices collection, and re-sorts
626 /// everything so that the usual ordering properties of the alloca's slices
627 /// hold.
628 void insert(ArrayRef<Slice> NewSlices) {
629 int OldSize = Slices.size();
630 Slices.append(NewSlices.begin(), NewSlices.end());
631 auto SliceI = Slices.begin() + OldSize;
632 std::stable_sort(SliceI, Slices.end());
633 std::inplace_merge(Slices.begin(), SliceI, Slices.end());
634 }
635
636 // Forward declare the iterator and range accessor for walking the
637 // partitions.
638 class partition_iterator;
639 iterator_range<partition_iterator> partitions();
640
641 /// Access the dead users for this alloca.
642 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
643
644 /// Access Uses that should be dropped if the alloca is promotable.
645 ArrayRef<Use *> getDeadUsesIfPromotable() const {
646 return DeadUseIfPromotable;
647 }
648
649 /// Access the dead operands referring to this alloca.
650 ///
651 /// These are operands which cannot actually be used to refer to the
652 /// alloca as they are outside its range and the user doesn't correct for
653 /// that. These mostly consist of PHI node inputs and the like which we just
654 /// need to replace with undef.
655 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
656
657#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
658 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
659 void printSlice(raw_ostream &OS, const_iterator I,
660 StringRef Indent = " ") const;
661 void printUse(raw_ostream &OS, const_iterator I,
662 StringRef Indent = " ") const;
663 void print(raw_ostream &OS) const;
664 void dump(const_iterator I) const;
665 void dump() const;
666#endif
667
668private:
669 template <typename DerivedT, typename RetT = void> class BuilderBase;
670 class SliceBuilder;
671
672 friend class AllocaSlices::SliceBuilder;
673
674#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
675 /// Handle to alloca instruction to simplify method interfaces.
676 AllocaInst &AI;
677#endif
678
679 /// The instruction responsible for this alloca not having a known set
680 /// of slices.
681 ///
682 /// When an instruction (potentially) escapes the pointer to the alloca, we
683 /// store a pointer to that here and abort trying to form slices of the
684 /// alloca. This will be null if the alloca slices are analyzed successfully.
685 Instruction *PointerEscapingInstr;
686 Instruction *PointerEscapingInstrReadOnly;
687
688 /// The slices of the alloca.
689 ///
690 /// We store a vector of the slices formed by uses of the alloca here. This
691 /// vector is sorted by increasing begin offset, and then the unsplittable
692 /// slices before the splittable ones. See the Slice inner class for more
693 /// details.
694 SmallVector<Slice, 8> Slices;
695
696 /// Instructions which will become dead if we rewrite the alloca.
697 ///
698 /// Note that these are not separated by slice. This is because we expect an
699 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
700 /// all these instructions can simply be removed and replaced with poison as
701 /// they come from outside of the allocated space.
702 SmallVector<Instruction *, 8> DeadUsers;
703
704 /// Uses which will become dead if we can promote the alloca.
705 SmallVector<Use *, 8> DeadUseIfPromotable;
706
707 /// Operands which will become dead if we rewrite the alloca.
708 ///
709 /// These are operands that in their particular use can be replaced with
710 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
711 /// to PHI nodes and the like. They aren't entirely dead (there might be
712 /// a GEP back into the bounds using it elsewhere), and neither is the PHI, but
713 /// we
713 /// want to swap this particular input for poison to simplify the use lists of
714 /// the alloca.
715 SmallVector<Use *, 8> DeadOperands;
716};
717
718/// A partition of the slices.
719///
720/// An ephemeral representation for a range of slices which can be viewed as
721/// a partition of the alloca. This range represents a span of the alloca's
722/// memory which cannot be split, and provides access to all of the slices
723/// overlapping some part of the partition.
724///
725/// Objects of this type are produced by traversing the alloca's slices, but
726/// are only ephemeral and not persistent.
727class Partition {
728private:
729 friend class AllocaSlices;
730 friend class AllocaSlices::partition_iterator;
731
732 using iterator = AllocaSlices::iterator;
733
734 /// The beginning and ending offsets of the alloca for this
735 /// partition.
736 uint64_t BeginOffset = 0, EndOffset = 0;
737
738 /// The start and end iterators of this partition.
739 iterator SI, SJ;
740
741 /// A collection of split slice tails overlapping the partition.
742 SmallVector<Slice *, 4> SplitTails;
743
744 /// Raw constructor builds an empty partition starting and ending at
745 /// the given iterator.
746 Partition(iterator SI) : SI(SI), SJ(SI) {}
747
748public:
749 /// The start offset of this partition.
750 ///
751 /// All of the contained slices start at or after this offset.
752 uint64_t beginOffset() const { return BeginOffset; }
753
754 /// The end offset of this partition.
755 ///
756 /// All of the contained slices end at or before this offset.
757 uint64_t endOffset() const { return EndOffset; }
758
759 /// The size of the partition.
760 ///
761 /// Note that this can never be zero.
762 uint64_t size() const {
763 assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
764 return EndOffset - BeginOffset;
765 }
766
767 /// Test whether this partition contains no slices, and merely spans
768 /// a region occupied by split slices.
769 bool empty() const { return SI == SJ; }
770
771 /// \name Iterate slices that start within the partition.
772 /// These may be splittable or unsplittable. They have a begin offset >= the
773 /// partition begin offset.
774 /// @{
775 // FIXME: We should probably define a "concat_iterator" helper and use that
776 // to stitch together pointee_iterators over the split tails and the
777 // contiguous iterators of the partition. That would give a much nicer
778 // interface here. We could then additionally expose filtered iterators for
779 // split, unsplit, and unsplittable slices based on the usage patterns.
780 iterator begin() const { return SI; }
781 iterator end() const { return SJ; }
782 /// @}
783
784 /// Get the sequence of split slice tails.
785 ///
786 /// These tails are of slices which start before this partition but are
787 /// split and overlap into the partition. We accumulate these while forming
788 /// partitions.
789 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
790};
791
792} // end anonymous namespace
793
794/// An iterator over partitions of the alloca's slices.
795///
796/// This iterator implements the core algorithm for partitioning the alloca's
797/// slices. It is a forward iterator as we don't support backtracking for
798/// efficiency reasons, and re-use a single storage area to maintain the
799/// current set of split slices.
800///
801/// It is templated on the slice iterator type to use so that it can operate
802/// with either const or non-const slice iterators.
803class AllocaSlices::partition_iterator
804 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
805 Partition> {
806 friend class AllocaSlices;
807
808 /// Most of the state for walking the partitions is held in a class
809 /// with a nice interface for examining them.
810 Partition P;
811
812 /// We need to keep the end of the slices to know when to stop.
813 AllocaSlices::iterator SE;
814
815 /// We also need to keep track of the maximum split end offset seen.
816 /// FIXME: Do we really?
817 uint64_t MaxSplitSliceEndOffset = 0;
818
819 /// Sets the partition to be empty at given iterator, and sets the
820 /// end iterator.
821 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
822 : P(SI), SE(SE) {
823 // If not already at the end, advance our state to form the initial
824 // partition.
825 if (SI != SE)
826 advance();
827 }
828
829 /// Advance the iterator to the next partition.
830 ///
831 /// Requires that the iterator not be at the end of the slices.
832 void advance() {
833 assert((P.SI != SE || !P.SplitTails.empty()) &&
834 "Cannot advance past the end of the slices!");
835
836 // Clear out any split uses which have ended.
837 if (!P.SplitTails.empty()) {
838 if (P.EndOffset >= MaxSplitSliceEndOffset) {
839 // If we've finished all splits, this is easy.
840 P.SplitTails.clear();
841 MaxSplitSliceEndOffset = 0;
842 } else {
843 // Remove the uses which have ended in the prior partition. This
844 // cannot change the max split slice end because we just checked that
845 // the prior partition ended prior to that max.
846 llvm::erase_if(P.SplitTails,
847 [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
848 assert(llvm::any_of(P.SplitTails,
849 [&](Slice *S) {
850 return S->endOffset() == MaxSplitSliceEndOffset;
851 }) &&
852 "Could not find the current max split slice offset!");
853 assert(llvm::all_of(P.SplitTails,
854 [&](Slice *S) {
855 return S->endOffset() <= MaxSplitSliceEndOffset;
856 }) &&
857 "Max split slice end offset is not actually the max!");
858 }
859 }
860
861 // If P.SI is already at the end, then we've cleared the split tail and
862 // now have an end iterator.
863 if (P.SI == SE) {
864 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
865 return;
866 }
867
868 // If we had a non-empty partition previously, set up the state for
869 // subsequent partitions.
870 if (P.SI != P.SJ) {
871 // Accumulate all the splittable slices which started in the old
872 // partition into the split list.
873 for (Slice &S : P)
874 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
875 P.SplitTails.push_back(&S);
876 MaxSplitSliceEndOffset =
877 std::max(S.endOffset(), MaxSplitSliceEndOffset);
878 }
879
880 // Start from the end of the previous partition.
881 P.SI = P.SJ;
882
883 // If P.SI is now at the end, we at most have a tail of split slices.
884 if (P.SI == SE) {
885 P.BeginOffset = P.EndOffset;
886 P.EndOffset = MaxSplitSliceEndOffset;
887 return;
888 }
889
890 // If we have split slices and the next slice is after a gap and is
891 // not splittable, immediately form an empty partition for the split
892 // slices up until the next slice begins.
893 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
894 !P.SI->isSplittable()) {
895 P.BeginOffset = P.EndOffset;
896 P.EndOffset = P.SI->beginOffset();
897 return;
898 }
899 }
900
901 // OK, we need to consume new slices. Set the end offset based on the
902 // current slice, and step SJ past it. The beginning offset of the
903 // partition is the beginning offset of the next slice unless we have
904 // pre-existing split slices that are continuing, in which case we begin
905 // at the prior end offset.
906 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
907 P.EndOffset = P.SI->endOffset();
908 ++P.SJ;
909
910 // There are two strategies to form a partition based on whether the
911 // partition starts with an unsplittable slice or a splittable slice.
912 if (!P.SI->isSplittable()) {
913 // When we're forming an unsplittable region, it must always start at
914 // the first slice and will extend through its end.
915 assert(P.BeginOffset == P.SI->beginOffset());
916
917 // Form a partition including all of the overlapping slices with this
918 // unsplittable slice.
919 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
920 if (!P.SJ->isSplittable())
921 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
922 ++P.SJ;
923 }
924
925 // We have a partition across a set of overlapping unsplittable
926 // partitions.
927 return;
928 }
929
930 // If we're starting with a splittable slice, then we need to form
931 // a synthetic partition spanning it and any other overlapping splittable
932 // slices.
933 assert(P.SI->isSplittable() && "Forming a splittable partition!");
934
935 // Collect all of the overlapping splittable slices.
936 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
937 P.SJ->isSplittable()) {
938 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
939 ++P.SJ;
940 }
941
942 // Back up P.EndOffset if we ended the span early when encountering an
943 // unsplittable slice. This synthesizes the early end offset of
944 // a partition spanning only splittable slices.
945 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
946 assert(!P.SJ->isSplittable());
947 P.EndOffset = P.SJ->beginOffset();
948 }
949 }
950
951public:
952 bool operator==(const partition_iterator &RHS) const {
953 assert(SE == RHS.SE &&
954 "End iterators don't match between compared partition iterators!");
955
956 // The observed positions of partitions are marked by the P.SI iterator and
957 // the emptiness of the split slices. The latter is only relevant when
958 // P.SI == SE, as the end iterator will additionally have an empty split
959 // slices list, but the prior may have the same P.SI and a tail of split
960 // slices.
961 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
962 assert(P.SJ == RHS.P.SJ &&
963 "Same set of slices formed two different sized partitions!");
964 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
965 "Same slice position with differently sized non-empty split "
966 "slice tails!");
967 return true;
968 }
969 return false;
970 }
971
972 partition_iterator &operator++() {
973 advance();
974 return *this;
975 }
976
977 Partition &operator*() { return P; }
978};
979
980/// A forward range over the partitions of the alloca's slices.
981///
982/// This accesses an iterator range over the partitions of the alloca's
983/// slices. It computes these partitions on the fly based on the overlapping
984/// offsets of the slices and the ability to split them. It will visit "empty"
985/// partitions to cover regions of the alloca only accessed via split
986/// slices.
987iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
988 return make_range(partition_iterator(begin(), end()),
989 partition_iterator(end(), end()));
990}
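// For example (illustrative): given the sorted slices [0,8) unsplittable,
// [0,16) splittable and [12,16) unsplittable, this range visits three
// partitions: [0,8), then the "empty" partition [8,12) covered only by the
// split tail of [0,16), and finally [12,16).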
991
992static Value *foldSelectInst(SelectInst &SI) {
993 // If the condition being selected on is a constant or the same value is
994 // being selected between, fold the select. Yes this does (rarely) happen
995 // early on.
996 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
997 return SI.getOperand(1 + CI->isZero());
998 if (SI.getOperand(1) == SI.getOperand(2))
999 return SI.getOperand(1);
1000
1001 return nullptr;
1002}
1003
1004/// A helper that folds a PHI node or a select.
1005static Value *foldPHINodeOrSelectInst(Instruction &I) {
1006 if (PHINode *PN = dyn_cast<PHINode>(&I)) {
1007 // If PN merges together the same value, return that value.
1008 return PN->hasConstantValue();
1009 }
1010 return foldSelectInst(cast<SelectInst>(I));
1011}
1012
1013/// Builder for the alloca slices.
1014///
1015/// This class builds a set of alloca slices by recursively visiting the uses
1016/// of an alloca and making a slice for each load and store at each offset.
1017class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1018 friend class PtrUseVisitor<SliceBuilder>;
1019 friend class InstVisitor<SliceBuilder>;
1020
1021 using Base = PtrUseVisitor<SliceBuilder>;
1022
1023 const uint64_t AllocSize;
1024 AllocaSlices &AS;
1025
1026 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1027 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1028
1029 /// Set to de-duplicate dead instructions found in the use walk.
1030 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1031
1032public:
1033 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1034 : PtrUseVisitor<SliceBuilder>(DL),
1035 AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()),
1036 AS(AS) {}
1037
1038private:
1039 void markAsDead(Instruction &I) {
1040 if (VisitedDeadInsts.insert(&I).second)
1041 AS.DeadUsers.push_back(&I);
1042 }
1043
1044 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1045 bool IsSplittable = false) {
1046 // Completely skip uses which have a zero size or start either before or
1047 // past the end of the allocation.
1048 if (Size == 0 || Offset.uge(AllocSize)) {
1049 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1050 << Offset
1051 << " which has zero size or starts outside of the "
1052 << AllocSize << " byte alloca:\n"
1053 << " alloca: " << AS.AI << "\n"
1054 << " use: " << I << "\n");
1055 return markAsDead(I);
1056 }
1057
1058 uint64_t BeginOffset = Offset.getZExtValue();
1059 uint64_t EndOffset = BeginOffset + Size;
1060
1061 // Clamp the end offset to the end of the allocation. Note that this is
1062 // formulated to handle even the case where "BeginOffset + Size" overflows.
1063 // This may appear superficially to be something we could ignore entirely,
1064 // but that is not so! There may be widened loads or PHI-node uses where
1065 // some instructions are dead but not others. We can't completely ignore
1066 // them, and so have to record at least the information here.
1067 assert(AllocSize >= BeginOffset); // Established above.
1068 if (Size > AllocSize - BeginOffset) {
1069 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1070 << Offset << " to remain within the " << AllocSize
1071 << " byte alloca:\n"
1072 << " alloca: " << AS.AI << "\n"
1073 << " use: " << I << "\n");
1074 EndOffset = AllocSize;
1075 }
1076
1077 AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
1078 }
1079
1080 void visitBitCastInst(BitCastInst &BC) {
1081 if (BC.use_empty())
1082 return markAsDead(BC);
1083
1084 return Base::visitBitCastInst(BC);
1085 }
1086
1087 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1088 if (ASC.use_empty())
1089 return markAsDead(ASC);
1090
1091 return Base::visitAddrSpaceCastInst(ASC);
1092 }
1093
1094 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1095 if (GEPI.use_empty())
1096 return markAsDead(GEPI);
1097
1098 return Base::visitGetElementPtrInst(GEPI);
1099 }
1100
1101 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1102 uint64_t Size, bool IsVolatile) {
1103 // We allow splitting of non-volatile loads and stores where the type is an
1104 // integer type. These may be used to implement 'memcpy' or other "transfer
1105 // of bits" patterns.
1106 bool IsSplittable =
1107 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1108
1109 insertUse(I, Offset, Size, IsSplittable);
1110 }
1111
1112 void visitLoadInst(LoadInst &LI) {
1113 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1114 "All simple FCA loads should have been pre-split");
1115
1116 // If there is a load with an unknown offset, we can still perform store
1117 // to load forwarding for other known-offset loads.
1118 if (!IsOffsetKnown)
1119 return PI.setEscapedReadOnly(&LI);
1120
1121 TypeSize Size = DL.getTypeStoreSize(LI.getType());
1122 if (Size.isScalable()) {
1123 unsigned VScale = LI.getFunction()->getVScaleValue();
1124 if (!VScale)
1125 return PI.setAborted(&LI);
1126
1127 Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
1128 }
1129
1130 return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
1131 LI.isVolatile());
1132 }
1133
1134 void visitStoreInst(StoreInst &SI) {
1135 Value *ValOp = SI.getValueOperand();
1136 if (ValOp == *U)
1137 return PI.setEscapedAndAborted(&SI);
1138 if (!IsOffsetKnown)
1139 return PI.setAborted(&SI);
1140
1141 TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
1142 if (StoreSize.isScalable()) {
1143 unsigned VScale = SI.getFunction()->getVScaleValue();
1144 if (!VScale)
1145 return PI.setAborted(&SI);
1146
1147 StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
1148 }
1149
1150 uint64_t Size = StoreSize.getFixedValue();
1151
1152 // If this memory access can be shown to *statically* extend outside the
1153 // bounds of the allocation, its behavior is undefined, so simply
1154 // ignore it. Note that this is more strict than the generic clamping
1155 // behavior of insertUse. We also try to handle cases which might run the
1156 // risk of overflow.
1157 // FIXME: We should instead consider the pointer to have escaped if this
1158 // function is being instrumented for addressing bugs or race conditions.
1159 if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
1160 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1161 << Offset << " which extends past the end of the "
1162 << AllocSize << " byte alloca:\n"
1163 << " alloca: " << AS.AI << "\n"
1164 << " use: " << SI << "\n");
1165 return markAsDead(SI);
1166 }
1167
1168 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1169 "All simple FCA stores should have been pre-split");
1170 handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
1171 }
1172
1173 void visitMemSetInst(MemSetInst &II) {
1174 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1175 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1176 if ((Length && Length->getValue() == 0) ||
1177 (IsOffsetKnown && Offset.uge(AllocSize)))
1178 // Zero-length mem transfer intrinsics can be ignored entirely.
1179 return markAsDead(II);
1180
1181 if (!IsOffsetKnown)
1182 return PI.setAborted(&II);
1183
1184 insertUse(II, Offset,
1185 Length ? Length->getLimitedValue()
1186 : AllocSize - Offset.getLimitedValue(),
1187 (bool)Length);
1188 }
1189
1190 void visitMemTransferInst(MemTransferInst &II) {
1191 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1192 if (Length && Length->getValue() == 0)
1193 // Zero-length mem transfer intrinsics can be ignored entirely.
1194 return markAsDead(II);
1195
1196 // Because we can visit these intrinsics twice, also check to see if the
1197 // first time marked this instruction as dead. If so, skip it.
1198 if (VisitedDeadInsts.count(&II))
1199 return;
1200
1201 if (!IsOffsetKnown)
1202 return PI.setAborted(&II);
1203
1204 // This side of the transfer is completely out-of-bounds, and so we can
1205 // nuke the entire transfer. However, we also need to nuke the other side
1206 // if already added to our partitions.
1207 // FIXME: Yet another place we really should bypass this when
1208 // instrumenting for ASan.
1209 if (Offset.uge(AllocSize)) {
1210 SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
1211 MemTransferSliceMap.find(&II);
1212 if (MTPI != MemTransferSliceMap.end())
1213 AS.Slices[MTPI->second].kill();
1214 return markAsDead(II);
1215 }
1216
1217 uint64_t RawOffset = Offset.getLimitedValue();
1218 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1219
1220 // Check for the special case where the same exact value is used for both
1221 // source and dest.
1222 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1223 // For non-volatile transfers this is a no-op.
1224 if (!II.isVolatile())
1225 return markAsDead(II);
1226
1227 return insertUse(II, Offset, Size, /*IsSplittable=*/false);
1228 }
1229
1230 // If we have seen both source and destination for a mem transfer, then
1231 // they both point to the same alloca.
1232 bool Inserted;
1233 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1234 std::tie(MTPI, Inserted) =
1235 MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
1236 unsigned PrevIdx = MTPI->second;
1237 if (!Inserted) {
1238 Slice &PrevP = AS.Slices[PrevIdx];
1239
1240 // Check if the begin offsets match and this is a non-volatile transfer.
1241 // In that case, we can completely elide the transfer.
1242 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1243 PrevP.kill();
1244 return markAsDead(II);
1245 }
1246
1247 // Otherwise we have an offset transfer within the same alloca. We can't
1248 // split those.
1249 PrevP.makeUnsplittable();
1250 }
1251
1252 // Insert the use now that we've fixed up the splittable nature.
1253 insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1254
1255 // Check that we ended up with a valid index in the map.
1256 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1257 "Map index doesn't point back to a slice with this user.");
1258 }
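// For example (illustrative): a non-volatile memcpy whose source and
// destination operands are the exact same pointer is dead and dropped; a
// memcpy between two different offsets of the same alloca is kept, but both
// of its slices are marked unsplittable so the transfer stays intact.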
1259
1260 // Disable SROA for any intrinsics except for lifetime invariants.
1261 // FIXME: What about debug intrinsics? This matches old behavior, but
1262 // doesn't make sense.
1263 void visitIntrinsicInst(IntrinsicInst &II) {
1264 if (II.isDroppable()) {
1265 AS.DeadUseIfPromotable.push_back(U);
1266 return;
1267 }
1268
1269 if (!IsOffsetKnown)
1270 return PI.setAborted(&II);
1271
1272 if (II.isLifetimeStartOrEnd()) {
1273 insertUse(II, Offset, AllocSize, true);
1274 return;
1275 }
1276
1277 Base::visitIntrinsicInst(II);
1278 }
1279
1280 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1281 // We consider any PHI or select that results in a direct load or store of
1282 // the same offset to be a viable use for slicing purposes. These uses
1283 // are considered unsplittable and the size is the maximum loaded or stored
1284 // size.
1285 SmallPtrSet<Instruction *, 4> Visited;
1286 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1287 Visited.insert(Root);
1288 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
1289 const DataLayout &DL = Root->getDataLayout();
1290 // If there are no loads or stores, the access is dead. We mark that as
1291 // a size zero access.
1292 Size = 0;
1293 do {
1294 Instruction *I, *UsedI;
1295 std::tie(UsedI, I) = Uses.pop_back_val();
1296
1297 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
1298 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
1299 if (LoadSize.isScalable()) {
1300 PI.setAborted(LI);
1301 return nullptr;
1302 }
1303 Size = std::max(Size, LoadSize.getFixedValue());
1304 continue;
1305 }
1306 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
1307 Value *Op = SI->getOperand(0);
1308 if (Op == UsedI)
1309 return SI;
1310 TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
1311 if (StoreSize.isScalable()) {
1312 PI.setAborted(SI);
1313 return nullptr;
1314 }
1315 Size = std::max(Size, StoreSize.getFixedValue());
1316 continue;
1317 }
1318
1319 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
1320 if (!GEP->hasAllZeroIndices())
1321 return GEP;
1322 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
1323 !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
1324 return I;
1325 }
1326
1327 for (User *U : I->users())
1328 if (Visited.insert(cast<Instruction>(U)).second)
1329 Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1330 } while (!Uses.empty());
1331
1332 return nullptr;
1333 }
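// For example (illustrative): a PHI whose only transitive uses are an i32
// load and an i64 load is safe and yields Size = 8, while a store of the PHI
// pointer itself is unsafe and is returned as the offending instruction.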
1334
1335 void visitPHINodeOrSelectInst(Instruction &I) {
1336 assert(isa<PHINode>(I) || isa<SelectInst>(I));
1337 if (I.use_empty())
1338 return markAsDead(I);
1339
1340 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1341 // instructions in this BB, which may be required during rewriting. Bail out
1342 // on these cases.
1343 if (isa<PHINode>(I) &&
1344 I.getParent()->getFirstInsertionPt() == I.getParent()->end())
1345 return PI.setAborted(&I);
1346
1347 // TODO: We could use simplifyInstruction here to fold PHINodes and
1348 // SelectInsts. However, doing so requires to change the current
1349 // dead-operand-tracking mechanism. For instance, suppose neither loading
1350 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1351 // trap either. However, if we simply replace %U with undef using the
1352 // current dead-operand-tracking mechanism, "load (select undef, undef,
1353 // %other)" may trap because the select may return the first operand
1354 // "undef".
1355 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1356 if (Result == *U)
1357 // If the result of the constant fold will be the pointer, recurse
1358 // through the PHI/select as if we had RAUW'ed it.
1359 enqueueUsers(I);
1360 else
1361 // Otherwise the operand to the PHI/select is dead, and we can replace
1362 // it with poison.
1363 AS.DeadOperands.push_back(U);
1364
1365 return;
1366 }
1367
1368 if (!IsOffsetKnown)
1369 return PI.setAborted(&I);
1370
1371 // See if we already have computed info on this node.
1372 uint64_t &Size = PHIOrSelectSizes[&I];
1373 if (!Size) {
1374 // This is a new PHI/Select, check for an unsafe use of it.
1375 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1376 return PI.setAborted(UnsafeI);
1377 }
1378
1379 // For PHI and select operands outside the alloca, we can't nuke the entire
1380 // phi or select -- the other side might still be relevant, so we special
1381 // case them here and use a separate structure to track the operands
1382 // themselves which should be replaced with poison.
1383 // FIXME: This should instead be escaped in the event we're instrumenting
1384 // for address sanitization.
1385 if (Offset.uge(AllocSize)) {
1386 AS.DeadOperands.push_back(U);
1387 return;
1388 }
1389
1390 insertUse(I, Offset, Size);
1391 }
1392
1393 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1394
1395 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1396
1397 /// Disable SROA entirely if there are unhandled users of the alloca.
1398 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1399
1400 void visitCallBase(CallBase &CB) {
1401 // If the call operand is read-only and only does a read-only or address
1402 // capture, then we mark it as EscapedReadOnly.
1403 if (CB.isDataOperand(U) &&
1404 !capturesFullProvenance(CB.getCaptureInfo(U->getOperandNo())) &&
1405 CB.onlyReadsMemory(U->getOperandNo())) {
1406 PI.setEscapedReadOnly(&CB);
1407 return;
1408 }
1409
1410 Base::visitCallBase(CB);
1411 }
1412};
1413
1414AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1415 :
1416#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1417 AI(AI),
1418#endif
1419 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1420 SliceBuilder PB(DL, AI, *this);
1421 SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1422 if (PtrI.isEscaped() || PtrI.isAborted()) {
1423 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1424 // possibly by just storing the PtrInfo in the AllocaSlices.
1425 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1426 : PtrI.getAbortingInst();
1427 assert(PointerEscapingInstr && "Did not track a bad instruction");
1428 return;
1429 }
1430 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1431
1432 llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1433
1434 // Sort the uses. This arranges for the offsets to be in ascending order,
1435 // and the sizes to be in descending order.
1436 llvm::stable_sort(Slices);
1437}
1438
1439#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1440
1441void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1442 StringRef Indent) const {
1443 printSlice(OS, I, Indent);
1444 OS << "\n";
1445 printUse(OS, I, Indent);
1446}
1447
1448void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1449 StringRef Indent) const {
1450 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1451 << " slice #" << (I - begin())
1452 << (I->isSplittable() ? " (splittable)" : "");
1453}
1454
1455void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1456 StringRef Indent) const {
1457 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1458}
1459
1460void AllocaSlices::print(raw_ostream &OS) const {
1461 if (PointerEscapingInstr) {
1462 OS << "Can't analyze slices for alloca: " << AI << "\n"
1463 << " A pointer to this alloca escaped by:\n"
1464 << " " << *PointerEscapingInstr << "\n";
1465 return;
1466 }
1467
1468 if (PointerEscapingInstrReadOnly)
1469 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1470
1471 OS << "Slices of alloca: " << AI << "\n";
1472 for (const_iterator I = begin(), E = end(); I != E; ++I)
1473 print(OS, I);
1474}
1475
1476LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1477 print(dbgs(), I);
1478}
1479LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1480
1481#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1482
1483/// Walk the range of a partitioning looking for a common type to cover this
1484/// sequence of slices.
1485static std::pair<Type *, IntegerType *>
1486findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1487 uint64_t EndOffset) {
1488 Type *Ty = nullptr;
1489 bool TyIsCommon = true;
1490 IntegerType *ITy = nullptr;
1491
1492 // Note that we need to look at *every* alloca slice's Use to ensure we
1493 // always get consistent results regardless of the order of slices.
1494 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1495 Use *U = I->getUse();
1496 if (isa<IntrinsicInst>(*U->getUser()))
1497 continue;
1498 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1499 continue;
1500
1501 Type *UserTy = nullptr;
1502 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1503 UserTy = LI->getType();
1504 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1505 UserTy = SI->getValueOperand()->getType();
1506 }
1507
1508 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1509 // If the type is larger than the partition, skip it. We only encounter
1510 // this for split integer operations where we want to use the type of the
1511 // entity causing the split. Also skip if the type is not a byte width
1512 // multiple.
1513 if (UserITy->getBitWidth() % 8 != 0 ||
1514 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1515 continue;
1516
1517 // Track the largest bitwidth integer type used in this way in case there
1518 // is no common type.
1519 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1520 ITy = UserITy;
1521 }
1522
1523 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1524 // depend on types skipped above.
1525 if (!UserTy || (Ty && Ty != UserTy))
1526 TyIsCommon = false; // Give up on anything but an iN type.
1527 else
1528 Ty = UserTy;
1529 }
1530
1531 return {TyIsCommon ? Ty : nullptr, ITy};
1532}
1533
1534/// PHI instructions that use an alloca and are subsequently loaded can be
1535/// rewritten to load both input pointers in the pred blocks and then PHI the
1536/// results, allowing the load of the alloca to be promoted.
1537/// From this:
1538/// %P2 = phi [i32* %Alloca, i32* %Other]
1539/// %V = load i32* %P2
1540/// to:
1541/// %V1 = load i32* %Alloca -> will be mem2reg'd
1542/// ...
1543/// %V2 = load i32* %Other
1544/// ...
1545/// %V = phi [i32 %V1, i32 %V2]
1546///
1547/// We can do this to a select if its only uses are loads and if the operands
1548/// to the select can be loaded unconditionally.
1549///
1550/// FIXME: This should be hoisted into a generic utility, likely in
1551/// Transforms/Util/Local.h
1552 static bool isSafePHIToSpeculate(PHINode &PN) {
1553 const DataLayout &DL = PN.getDataLayout();
1554
1555 // For now, we can only do this promotion if the load is in the same block
1556 // as the PHI, and if there are no stores between the phi and load.
1557 // TODO: Allow recursive phi users.
1558 // TODO: Allow stores.
1559 BasicBlock *BB = PN.getParent();
1560 Align MaxAlign;
1561 uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1562 Type *LoadType = nullptr;
1563 for (User *U : PN.users()) {
1564 LoadInst *LI = dyn_cast<LoadInst>(U);
1565 if (!LI || !LI->isSimple())
1566 return false;
1567
1568 // For now we only allow loads in the same block as the PHI. This is
1569 // a common case that happens when instcombine merges two loads through
1570 // a PHI.
1571 if (LI->getParent() != BB)
1572 return false;
1573
1574 if (LoadType) {
1575 if (LoadType != LI->getType())
1576 return false;
1577 } else {
1578 LoadType = LI->getType();
1579 }
1580
1581 // Ensure that there are no instructions between the PHI and the load that
1582 // could store.
1583 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1584 if (BBI->mayWriteToMemory())
1585 return false;
1586
1587 MaxAlign = std::max(MaxAlign, LI->getAlign());
1588 }
1589
1590 if (!LoadType)
1591 return false;
1592
1593 APInt LoadSize =
1594 APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());
1595
1596 // We can only transform this if it is safe to push the loads into the
1597 // predecessor blocks. The only thing to watch out for is that we can't put
1598 // a possibly trapping load in the predecessor if it is a critical edge.
1599 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1600 Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1601 Value *InVal = PN.getIncomingValue(Idx);
1602
1603 // If the value is produced by the terminator of the predecessor (an
1604 // invoke) or it has side-effects, there is no valid place to put a load
1605 // in the predecessor.
1606 if (TI == InVal || TI->mayHaveSideEffects())
1607 return false;
1608
1609 // If the predecessor has a single successor, then the edge isn't
1610 // critical.
1611 if (TI->getNumSuccessors() == 1)
1612 continue;
1613
1614 // If this pointer is always safe to load, or if we can prove that there
1615 // is already a load in the block, then we can move the load to the pred
1616 // block.
1617 if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1618 continue;
1619
1620 return false;
1621 }
1622
1623 return true;
1624}
1625
1626static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1627 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1628
1629 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1630 Type *LoadTy = SomeLoad->getType();
1631 IRB.SetInsertPoint(&PN);
1632 PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1633 PN.getName() + ".sroa.speculated");
1634
1635 // Get the AA tags and alignment to use from one of the loads. It does not
1636 // matter which one we pick or whether any of them differ.
1637 AAMDNodes AATags = SomeLoad->getAAMetadata();
1638 Align Alignment = SomeLoad->getAlign();
1639
1640 // Rewrite all loads of the PN to use the new PHI.
1641 while (!PN.use_empty()) {
1642 LoadInst *LI = cast<LoadInst>(PN.user_back());
1643 LI->replaceAllUsesWith(NewPN);
1644 LI->eraseFromParent();
1645 }
1646
1647 // Inject loads into all of the pred blocks.
1648 DenseMap<BasicBlock *, Value *> InjectedLoads;
1649 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1650 BasicBlock *Pred = PN.getIncomingBlock(Idx);
1651 Value *InVal = PN.getIncomingValue(Idx);
1652
1653 // A PHI node is allowed to have multiple (duplicated) entries for the same
1654 // basic block, as long as the value is the same. So if we already injected
1655 // a load in the predecessor, then we should reuse the same load for all
1656 // duplicated entries.
1657 if (Value *V = InjectedLoads.lookup(Pred)) {
1658 NewPN->addIncoming(V, Pred);
1659 continue;
1660 }
1661
1662 Instruction *TI = Pred->getTerminator();
1663 IRB.SetInsertPoint(TI);
1664
1665 LoadInst *Load = IRB.CreateAlignedLoad(
1666 LoadTy, InVal, Alignment,
1667 (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1668 ++NumLoadsSpeculated;
1669 if (AATags)
1670 Load->setAAMetadata(AATags);
1671 NewPN->addIncoming(Load, Pred);
1672 InjectedLoads[Pred] = Load;
1673 }
1674
1675 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1676 PN.eraseFromParent();
1677}
1678
1679SelectHandSpeculativity &
1680SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1681 if (isTrueVal)
1682 Bitfield::set<SelectHandSpeculativity::TrueVal>(Storage, true);
1683 else
1684 Bitfield::set<SelectHandSpeculativity::FalseVal>(Storage, true);
1685 return *this;
1686}
1687
1688bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1689 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
1690 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
1691}
1692
1693bool SelectHandSpeculativity::areAllSpeculatable() const {
1694 return isSpeculatable(/*isTrueVal=*/true) &&
1695 isSpeculatable(/*isTrueVal=*/false);
1696}
1697
1698bool SelectHandSpeculativity::areAnySpeculatable() const {
1699 return isSpeculatable(/*isTrueVal=*/true) ||
1700 isSpeculatable(/*isTrueVal=*/false);
1701}
1702bool SelectHandSpeculativity::areNoneSpeculatable() const {
1703 return !areAnySpeculatable();
1704}
1705
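/// Determine which hands of a select feeding a simple load are safe to
/// speculate. An illustrative sketch (not from a test case): for
///   %p = select i1 %c, ptr %a, ptr %b
///   %v = load i32, ptr %p
/// the true hand is marked speculatable when %a is known to be unconditionally
/// loadable at the load (and likewise for %b); with PreserveCFG set, we stop
/// at the first hand that cannot be proven safe.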
1706static SelectHandSpeculativity
1707 isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1708 assert(LI.isSimple() && "Only for simple loads");
1709 SelectHandSpeculativity Spec;
1710
1711 const DataLayout &DL = SI.getDataLayout();
1712 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1713 if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL,
1714 &LI))
1715 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1716 else if (PreserveCFG)
1717 return Spec;
1718
1719 return Spec;
1720}
1721
1722std::optional<RewriteableMemOps>
1723SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1724 RewriteableMemOps Ops;
1725
1726 for (User *U : SI.users()) {
1727 if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
1728 U = *BC->user_begin();
1729
1730 if (auto *Store = dyn_cast<StoreInst>(U)) {
1731 // Note that atomic stores can be transformed; atomic semantics do not
1732 // have any meaning for a local alloca. Stores are not speculatable,
1733 // however, so if we can't turn it into a predicated store, we are done.
1734 if (Store->isVolatile() || PreserveCFG)
1735 return {}; // Give up on this `select`.
1736 Ops.emplace_back(Store);
1737 continue;
1738 }
1739
1740 auto *LI = dyn_cast<LoadInst>(U);
1741
1742 // Note that atomic loads can be transformed;
1743 // atomic semantics do not have any meaning for a local alloca.
1744 if (!LI || LI->isVolatile())
1745 return {}; // Give up on this `select`.
1746
1747 PossiblySpeculatableLoad Load(LI);
1748 if (!LI->isSimple()) {
1749 // If the `load` is not simple, we can't speculatively execute it,
1750 // but we could handle this via a CFG modification. But can we?
1751 if (PreserveCFG)
1752 return {}; // Give up on this `select`.
1753 Ops.emplace_back(Load);
1754 continue;
1755 }
1756
1757 SelectHandSpeculativity Spec =
1758 isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG);
1759 if (PreserveCFG && !Spec.areAllSpeculatable())
1760 return {}; // Give up on this `select`.
1761
1762 Load.setInt(Spec);
1763 Ops.emplace_back(Load);
1764 }
1765
1766 return Ops;
1767}
1768
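/// Speculate loads of a select by loading both hands up front. Roughly (an
/// illustrative sketch with made-up value names, not from a test case):
///   %p = select i1 %c, ptr %a, ptr %b
///   %v = load i32, ptr %p
/// becomes
///   %v.true = load i32, ptr %a
///   %v.false = load i32, ptr %b
///   %v = select i1 %c, i32 %v.true, i32 %v.false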
1769 static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1770 IRBuilderTy &IRB) {
1771 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1772
1773 Value *TV = SI.getTrueValue();
1774 Value *FV = SI.getFalseValue();
1775 // Replace the given load of the select with a select of two loads.
1776
1777 assert(LI.isSimple() && "We only speculate simple loads");
1778
1779 IRB.SetInsertPoint(&LI);
1780
1781 LoadInst *TL =
1782 IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
1783 LI.getName() + ".sroa.speculate.load.true");
1784 LoadInst *FL =
1785 IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
1786 LI.getName() + ".sroa.speculate.load.false");
1787 NumLoadsSpeculated += 2;
1788
1789 // Transfer alignment and AA info if present.
1790 TL->setAlignment(LI.getAlign());
1791 FL->setAlignment(LI.getAlign());
1792
1793 AAMDNodes Tags = LI.getAAMetadata();
1794 if (Tags) {
1795 TL->setAAMetadata(Tags);
1796 FL->setAAMetadata(Tags);
1797 }
1798
1799 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1800 LI.getName() + ".sroa.speculated",
1801 ProfcheckDisableMetadataFixes ? nullptr : &SI);
1802
1803 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1804 LI.replaceAllUsesWith(V);
1805}
1806
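/// Rewrite a load or store of a select into a conditional memory operation.
/// A rough sketch of the CFG rewrite when neither hand is speculatable
/// (illustrative only, not from a test case):
///   store i32 %v, ptr %p            ; %p = select i1 %c, ptr %a, ptr %b
/// becomes
///   br i1 %c, label %head.then, label %head.else
///   head.then:  store i32 %v, ptr %a ; br label %head.cont
///   head.else:  store i32 %v, ptr %b ; br label %head.cont
///   head.cont:  ...
/// Loads are additionally merged back with a phi in the tail block.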
1807template <typename T>
1808 static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1809 SelectHandSpeculativity Spec,
1810 DomTreeUpdater &DTU) {
1811 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1812 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1813 BasicBlock *Head = I.getParent();
1814 Instruction *ThenTerm = nullptr;
1815 Instruction *ElseTerm = nullptr;
1816 if (Spec.areNoneSpeculatable())
1817 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1818 SI.getMetadata(LLVMContext::MD_prof), &DTU);
1819 else {
1820 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1821 SI.getMetadata(LLVMContext::MD_prof), &DTU,
1822 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1823 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1824 cast<BranchInst>(Head->getTerminator())->swapSuccessors();
1825 }
1826 auto *HeadBI = cast<BranchInst>(Head->getTerminator());
1827 Spec = {}; // Do not use `Spec` beyond this point.
1828 BasicBlock *Tail = I.getParent();
1829 Tail->setName(Head->getName() + ".cont");
1830 PHINode *PN;
1831 if (isa<LoadInst>(I))
1832 PN = PHINode::Create(I.getType(), 2, "", I.getIterator());
1833 for (BasicBlock *SuccBB : successors(Head)) {
1834 bool IsThen = SuccBB == HeadBI->getSuccessor(0);
1835 int SuccIdx = IsThen ? 0 : 1;
1836 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1837 auto &CondMemOp = cast<T>(*I.clone());
1838 if (NewMemOpBB != Head) {
1839 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1840 if (isa<LoadInst>(I))
1841 ++NumLoadsPredicated;
1842 else
1843 ++NumStoresPredicated;
1844 } else {
1845 CondMemOp.dropUBImplyingAttrsAndMetadata();
1846 ++NumLoadsSpeculated;
1847 }
1848 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1849 Value *Ptr = SI.getOperand(1 + SuccIdx);
1850 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1851 if (isa<LoadInst>(I)) {
1852 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1853 PN->addIncoming(&CondMemOp, NewMemOpBB);
1854 } else
1855 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1856 }
1857 if (isa<LoadInst>(I)) {
1858 PN->takeName(&I);
1859 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1860 I.replaceAllUsesWith(PN);
1861 }
1862}
1863
1864 static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1865 SelectHandSpeculativity Spec,
1866 DomTreeUpdater &DTU) {
1867 if (auto *LI = dyn_cast<LoadInst>(&I))
1868 rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
1869 else if (auto *SI = dyn_cast<StoreInst>(&I))
1870 rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
1871 else
1872 llvm_unreachable_internal("Only for load and store.");
1873}
1874
1875 static bool rewriteSelectInstMemOps(SelectInst &SI,
1876 const RewriteableMemOps &Ops,
1877 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1878 bool CFGChanged = false;
1879 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1880
1881 for (const RewriteableMemOp &Op : Ops) {
1882 SelectHandSpeculativity Spec;
1883 Instruction *I;
1884 if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
1885 I = *US;
1886 } else {
1887 auto PSL = std::get<PossiblySpeculatableLoad>(Op);
1888 I = PSL.getPointer();
1889 Spec = PSL.getInt();
1890 }
1891 if (Spec.areAllSpeculatable()) {
1892 speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB);
1893 } else {
1894 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1895 rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
1896 CFGChanged = true;
1897 }
1898 I->eraseFromParent();
1899 }
1900
1901 for (User *U : make_early_inc_range(SI.users()))
1902 cast<BitCastInst>(U)->eraseFromParent();
1903 SI.eraseFromParent();
1904 return CFGChanged;
1905}
1906
1907/// Compute an adjusted pointer from Ptr by Offset bytes where the
1908/// resulting pointer has PointerTy.
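/// For example (illustrative only, with a hypothetical name prefix), with
/// Offset == 8 this emits something like:
///   %x.sroa_idx = getelementptr inbounds i8, ptr %ptr, i64 8
/// plus a pointer cast only when PointerTy is in a different address space.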
1909static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1910 APInt Offset, Type *PointerTy,
1911 const Twine &NamePrefix) {
1912 if (Offset != 0)
1913 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
1914 NamePrefix + "sroa_idx");
1915 return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1916 NamePrefix + "sroa_cast");
1917}
1918
1919/// Compute the adjusted alignment for a load or store from an offset.
1920 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1921 return commonAlignment(getLoadStoreAlignment(I), Offset);
1922 }
1923
1924/// Test whether we can convert a value from the old to the new type.
1925///
1926/// This predicate should be used to guard calls to convertValue in order to
1927/// ensure that we only try to convert viable values. The strategy is that we
1928/// will peel off single element struct and array wrappings to get to an
1929/// underlying value, and convert that value.
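/// A few illustrative cases (assuming a typical 64-bit layout with only
/// integral address spaces): i64 <-> ptr and <2 x i32> <-> i64 are convertible
/// because the sizes match, while i32 -> i64 is not, and a non-integral
/// pointer never converts to or from an integer.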
1930static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1931 unsigned VScale = 0) {
1932 if (OldTy == NewTy)
1933 return true;
1934
1935 // For integer types, we can't handle any bit-width differences. This would
1936 // break both vector conversions with extension and introduce endianness
1937 // issues when in conjunction with loads and stores.
1938 if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1939 assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1940 cast<IntegerType>(NewTy)->getBitWidth() &&
1941 "We can't have the same bitwidth for different int types");
1942 return false;
1943 }
1944
1945 TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
1946 TypeSize OldSize = DL.getTypeSizeInBits(OldTy);
1947
1948 if ((isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) ||
1949 (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy))) {
1950 // Conversion is only possible when the size of scalable vectors is known.
1951 if (!VScale)
1952 return false;
1953
1954 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1955 // a single domain (either fixed or scalable). Any additional conversion
1956 // between fixed and scalable types is handled through integer types.
1957 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1958 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1959
1960 if (isa<ScalableVectorType>(NewTy)) {
1962 return false;
1963
1964 NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
1965 } else {
1967 return false;
1968
1969 OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
1970 }
1971 }
1972
1973 if (NewSize != OldSize)
1974 return false;
1975 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1976 return false;
1977
1978 // We can convert pointers to integers and vice-versa. Same for vectors
1979 // of pointers and integers.
1980 OldTy = OldTy->getScalarType();
1981 NewTy = NewTy->getScalarType();
1982 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1983 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1984 unsigned OldAS = OldTy->getPointerAddressSpace();
1985 unsigned NewAS = NewTy->getPointerAddressSpace();
1986 // Convert pointers if they are pointers from the same address space or
1987 // different integral (not non-integral) address spaces with the same
1988 // pointer size.
1989 return OldAS == NewAS ||
1990 (!DL.isNonIntegralAddressSpace(OldAS) &&
1991 !DL.isNonIntegralAddressSpace(NewAS) &&
1992 DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1993 }
1994
1995 // We can convert integers to integral pointers, but not to non-integral
1996 // pointers.
1997 if (OldTy->isIntegerTy())
1998 return !DL.isNonIntegralPointerType(NewTy);
1999
2000 // We can convert integral pointers to integers, but non-integral pointers
2001 // need to remain pointers.
2002 if (!DL.isNonIntegralPointerType(OldTy))
2003 return NewTy->isIntegerTy();
2004
2005 return false;
2006 }
2007
2008 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
2009 return false;
2010
2011 return true;
2012}
2013
2014/// Generic routine to convert an SSA value to a value of a different
2015/// type.
2016///
2017/// This will try various different casting techniques, such as bitcasts,
2018/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
2019/// two types for viability with this routine.
2020static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2021 Type *NewTy) {
2022 Type *OldTy = V->getType();
2023
2024#ifndef NDEBUG
2025 BasicBlock *BB = IRB.GetInsertBlock();
2026 assert(BB && BB->getParent() && "VScale unknown!");
2027 unsigned VScale = BB->getParent()->getVScaleValue();
2028 assert(canConvertValue(DL, OldTy, NewTy, VScale) &&
2029 "Value not convertable to type");
2030#endif
2031
2032 if (OldTy == NewTy)
2033 return V;
2034
2035 assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
2036 "Integer types must be the exact same to convert.");
2037
2038 // A variant of bitcast that supports a mixture of fixed and scalable types
2039 // that are known to have the same size.
2040 auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * {
2041 Type *InTy = In->getType();
2042 if (InTy == Ty)
2043 return In;
2044
2046 // For vscale_range(2) expand <4 x i32> to <vscale x 4 x i16> -->
2047 // <4 x i32> to <vscale x 2 x i32> to <vscale x 4 x i16>
2049 return IRB.CreateBitCast(IRB.CreateInsertVector(VTy,
2050 PoisonValue::get(VTy), In,
2051 IRB.getInt64(0)),
2052 Ty);
2053 }
2054
2056 // For vscale_range(2) expand <vscale x 4 x i16> to <4 x i32> -->
2057 // <vscale x 4 x i16> to <vscale x 2 x i32> to <4 x i32>
2059 return IRB.CreateExtractVector(Ty, IRB.CreateBitCast(In, VTy),
2060 IRB.getInt64(0));
2061 }
2062
2063 return IRB.CreateBitCast(In, Ty);
2064 };
2065
2066 // See if we need inttoptr for this type pair. May require additional bitcast.
2067 if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2068 // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
2069 // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
2070 // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
2071 // Directly handle i64 to i8*
2072 return IRB.CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)),
2073 NewTy);
2074 }
2075
2076 // See if we need ptrtoint for this type pair. May require additional bitcast.
2077 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
2078 // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
2079 // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
2080 // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
2081 // Expand i8* to i64 --> i8* to i64 to i64
2082 return CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2083 NewTy);
2084 }
2085
2086 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2087 unsigned OldAS = OldTy->getPointerAddressSpace();
2088 unsigned NewAS = NewTy->getPointerAddressSpace();
2089 // To convert pointers with different address spaces (they have already been
2090 // checked to be convertible, i.e. they have the same pointer size), so far
2091 // we cannot use `bitcast` (which is restricted to the same address space) or
2092 // `addrspacecast` (which is not always a no-op cast). Instead, use a pair
2093 // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
2094 // size.
2095 if (OldAS != NewAS) {
2096 assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
2097 return IRB.CreateIntToPtr(
2098 CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2099 DL.getIntPtrType(NewTy)),
2100 NewTy);
2101 }
2102 }
2103
2104 return CreateBitCastLike(V, NewTy);
2105}
2106
2107/// Test whether the given slice use can be promoted to a vector.
2108///
2109/// This function is called to test each entry in a partition which is slated
2110/// for a single slice.
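/// For instance (illustrative only), with a <4 x float> candidate and a 4-byte
/// element size, a slice covering bytes [4,12) of the partition maps to
/// elements [1,3) and is viable if its load or store type can be converted to
/// <2 x float>; a slice covering bytes [2,6) is rejected because it does not
/// start and end on element boundaries.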
2111static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2112 VectorType *Ty,
2113 uint64_t ElementSize,
2114 const DataLayout &DL,
2115 unsigned VScale) {
2116 // First validate the slice offsets.
2117 uint64_t BeginOffset =
2118 std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
2119 uint64_t BeginIndex = BeginOffset / ElementSize;
2120 if (BeginIndex * ElementSize != BeginOffset ||
2121 BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
2122 return false;
2123 uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
2124 uint64_t EndIndex = EndOffset / ElementSize;
2125 if (EndIndex * ElementSize != EndOffset ||
2126 EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
2127 return false;
2128
2129 assert(EndIndex > BeginIndex && "Empty vector!");
2130 uint64_t NumElements = EndIndex - BeginIndex;
2131 Type *SliceTy = (NumElements == 1)
2132 ? Ty->getElementType()
2133 : FixedVectorType::get(Ty->getElementType(), NumElements);
2134
2135 Type *SplitIntTy =
2136 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
2137
2138 Use *U = S.getUse();
2139
2140 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2141 if (MI->isVolatile())
2142 return false;
2143 if (!S.isSplittable())
2144 return false; // Skip any unsplittable intrinsics.
2145 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2146 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2147 return false;
2148 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2149 if (LI->isVolatile())
2150 return false;
2151 Type *LTy = LI->getType();
2152 // Disable vector promotion when there are loads or stores of an FCA.
2153 if (LTy->isStructTy())
2154 return false;
2155 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2156 assert(LTy->isIntegerTy());
2157 LTy = SplitIntTy;
2158 }
2159 if (!canConvertValue(DL, SliceTy, LTy, VScale))
2160 return false;
2161 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2162 if (SI->isVolatile())
2163 return false;
2164 Type *STy = SI->getValueOperand()->getType();
2165 // Disable vector promotion when there are loads or stores of an FCA.
2166 if (STy->isStructTy())
2167 return false;
2168 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2169 assert(STy->isIntegerTy());
2170 STy = SplitIntTy;
2171 }
2172 if (!canConvertValue(DL, STy, SliceTy, VScale))
2173 return false;
2174 } else {
2175 return false;
2176 }
2177
2178 return true;
2179}
2180
2181/// Test whether any vector type in \p CandidateTys is viable for promotion.
2182///
2183/// This implements the necessary checking for \c isVectorPromotionViable over
2184/// all slices of the alloca for the given VectorType.
2185static VectorType *
2186 checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2187 SmallVectorImpl<VectorType *> &CandidateTys,
2188 bool HaveCommonEltTy, Type *CommonEltTy,
2189 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2190 VectorType *CommonVecPtrTy, unsigned VScale) {
2191 // If we didn't find a vector type, nothing to do here.
2192 if (CandidateTys.empty())
2193 return nullptr;
2194
2195 // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
2196 // then we should choose it, not some other alternative.
2197 // But, we can't perform a no-op pointer address space change via bitcast,
2198 // so if we didn't have a common pointer element type, bail.
2199 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2200 return nullptr;
2201
2202 // Try to pick the "best" element type out of the choices.
2203 if (!HaveCommonEltTy && HaveVecPtrTy) {
2204 // If there was a pointer element type, there's really only one choice.
2205 CandidateTys.clear();
2206 CandidateTys.push_back(CommonVecPtrTy);
2207 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2208 // Integer-ify vector types.
2209 for (VectorType *&VTy : CandidateTys) {
2210 if (!VTy->getElementType()->isIntegerTy())
2211 VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
2212 VTy->getContext(), VTy->getScalarSizeInBits())));
2213 }
2214
2215 // Rank the remaining candidate vector types. This is easy because we know
2216 // they're all integer vectors. We sort by ascending number of elements.
2217 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2218 (void)DL;
2219 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2220 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2221 "Cannot have vector types of different sizes!");
2222 assert(RHSTy->getElementType()->isIntegerTy() &&
2223 "All non-integer types eliminated!");
2224 assert(LHSTy->getElementType()->isIntegerTy() &&
2225 "All non-integer types eliminated!");
2226 return cast<FixedVectorType>(RHSTy)->getNumElements() <
2227 cast<FixedVectorType>(LHSTy)->getNumElements();
2228 };
2229 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2230 (void)DL;
2231 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2232 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2233 "Cannot have vector types of different sizes!");
2234 assert(RHSTy->getElementType()->isIntegerTy() &&
2235 "All non-integer types eliminated!");
2236 assert(LHSTy->getElementType()->isIntegerTy() &&
2237 "All non-integer types eliminated!");
2238 return cast<FixedVectorType>(RHSTy)->getNumElements() ==
2239 cast<FixedVectorType>(LHSTy)->getNumElements();
2240 };
2241 llvm::sort(CandidateTys, RankVectorTypesComp);
2242 CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq),
2243 CandidateTys.end());
2244 } else {
2245// The only way to have the same element type in every vector type is to
2246// have the same vector type. Check that and remove all but one.
2247#ifndef NDEBUG
2248 for (VectorType *VTy : CandidateTys) {
2249 assert(VTy->getElementType() == CommonEltTy &&
2250 "Unaccounted for element type!");
2251 assert(VTy == CandidateTys[0] &&
2252 "Different vector types with the same element type!");
2253 }
2254#endif
2255 CandidateTys.resize(1);
2256 }
2257
2258 // FIXME: hack. Do we have a named constant for this?
2259 // SDAG SDNode can't have more than 65535 operands.
2260 llvm::erase_if(CandidateTys, [](VectorType *VTy) {
2261 return cast<FixedVectorType>(VTy)->getNumElements() >
2262 std::numeric_limits<unsigned short>::max();
2263 });
2264
2265 // Find a vector type viable for promotion by iterating over all slices.
2266 auto *VTy = llvm::find_if(CandidateTys, [&](VectorType *VTy) -> bool {
2267 uint64_t ElementSize =
2268 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2269
2270 // While LLVM vectors are defined as bit-packed, we don't support element
2271 // sizes that aren't byte sized.
2272 if (ElementSize % 8)
2273 return false;
2274 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2275 "vector size not a multiple of element size?");
2276 ElementSize /= 8;
2277
2278 for (const Slice &S : P)
2279 if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
2280 return false;
2281
2282 for (const Slice *S : P.splitSliceTails())
2283 if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
2284 return false;
2285
2286 return true;
2287 });
2288 return VTy != CandidateTys.end() ? *VTy : nullptr;
2289}
2290
2291 static VectorType *createAndCheckVectorTypesForPromotion(
2292 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2293 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2294 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2295 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2296 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2297 [[maybe_unused]] VectorType *OriginalElt =
2298 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2299 // Consider additional vector types where the element type size is a
2300 // multiple of load/store element size.
2301 for (Type *Ty : OtherTys) {
2302 if (!VectorType::isValidElementType(Ty))
2303 continue;
2304 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2305 // Make a copy of CandidateTys and iterate through it, because we
2306 // might append to CandidateTys in the loop.
2307 for (VectorType *const VTy : CandidateTysCopy) {
2308 // The elements in the copy should remain invariant throughout the loop
2309 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2310 unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
2311 unsigned ElementSize =
2312 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2313 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2314 VectorSize % TypeSize == 0) {
2315 VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
2316 CheckCandidateType(NewVTy);
2317 }
2318 }
2319 }
2320
2321 return checkVectorTypesForPromotion(
2322 P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2323 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2324}
2325
2326/// Test whether the given alloca partitioning and range of slices can be
2327/// promoted to a vector.
2328///
2329/// This is a quick test to check whether we can rewrite a particular alloca
2330/// partition (and its newly formed alloca) into a vector alloca with only
2331/// whole-vector loads and stores such that it could be promoted to a vector
2332/// SSA value. We only can ensure this for a limited set of operations, and we
2333/// don't want to do the rewrites unless we are confident that the result will
2334/// be promotable, so we have an early test here.
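/// As a rough example (not from a test case), whole-partition <4 x float> and
/// <8 x i16> accesses produce two 128-bit candidate types; since there is no
/// common element type and no pointer elements, the float vector is
/// re-expressed as <4 x i32> and the candidate with fewer elements is tried
/// first against every slice.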
2335 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2336 unsigned VScale) {
2337 // Collect the candidate types for vector-based promotion. Also track whether
2338 // we have different element types.
2339 SmallVector<VectorType *, 4> CandidateTys;
2340 SetVector<Type *> LoadStoreTys;
2341 SetVector<Type *> DeferredTys;
2342 Type *CommonEltTy = nullptr;
2343 VectorType *CommonVecPtrTy = nullptr;
2344 bool HaveVecPtrTy = false;
2345 bool HaveCommonEltTy = true;
2346 bool HaveCommonVecPtrTy = true;
2347 auto CheckCandidateType = [&](Type *Ty) {
2348 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2349 // Discard all candidates if this vector type differs from them in total size in bits.
2350 if (!CandidateTys.empty()) {
2351 VectorType *V = CandidateTys[0];
2352 if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
2353 DL.getTypeSizeInBits(V).getFixedValue()) {
2354 CandidateTys.clear();
2355 return;
2356 }
2357 }
2358 CandidateTys.push_back(VTy);
2359 Type *EltTy = VTy->getElementType();
2360
2361 if (!CommonEltTy)
2362 CommonEltTy = EltTy;
2363 else if (CommonEltTy != EltTy)
2364 HaveCommonEltTy = false;
2365
2366 if (EltTy->isPointerTy()) {
2367 HaveVecPtrTy = true;
2368 if (!CommonVecPtrTy)
2369 CommonVecPtrTy = VTy;
2370 else if (CommonVecPtrTy != VTy)
2371 HaveCommonVecPtrTy = false;
2372 }
2373 }
2374 };
2375
2376 // Put load and store types into a set for de-duplication.
2377 for (const Slice &S : P) {
2378 Type *Ty;
2379 if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
2380 Ty = LI->getType();
2381 else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
2382 Ty = SI->getValueOperand()->getType();
2383 else
2384 continue;
2385
2386 auto CandTy = Ty->getScalarType();
2387 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2388 S.endOffset() != P.endOffset())) {
2389 DeferredTys.insert(Ty);
2390 continue;
2391 }
2392
2393 LoadStoreTys.insert(Ty);
2394 // Consider any loads or stores that are the exact size of the slice.
2395 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2396 CheckCandidateType(Ty);
2397 }
2398
2399 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2400 if (auto *VTy = createAndCheckVectorTypesForPromotion(
2401 LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2402 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2403 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2404 return VTy;
2405
2406 CandidateTys.clear();
2407 return createAndCheckVectorTypesForPromotion(
2408 DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2409 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2410 CommonVecPtrTy, VScale);
2411}
2412
2413/// Test whether a slice of an alloca is valid for integer widening.
2414///
2415/// This implements the necessary checking for the \c isIntegerWideningViable
2416/// test below on a single slice of the alloca.
2417static bool isIntegerWideningViableForSlice(const Slice &S,
2418 uint64_t AllocBeginOffset,
2419 Type *AllocaTy,
2420 const DataLayout &DL,
2421 bool &WholeAllocaOp) {
2422 uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();
2423
2424 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2425 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2426
2427 Use *U = S.getUse();
2428
2429 // Lifetime intrinsics operate over the whole alloca, whose size is usually
2430 // larger than other load/store slices (RelEnd > Size). But lifetime markers
2431 // are always promotable and should not impact the promotability of the
2432 // partition's other slices.
2433 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2434 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2435 return true;
2436 }
2437
2438 // We can't reasonably handle cases where the load or store extends past
2439 // the end of the alloca's type and into its padding.
2440 if (RelEnd > Size)
2441 return false;
2442
2443 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2444 if (LI->isVolatile())
2445 return false;
2446 // We can't handle loads that extend past the allocated memory.
2447 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
2448 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2449 return false;
2450 // So far, AllocaSliceRewriter does not support widening split slice tails
2451 // in rewriteIntegerLoad.
2452 if (S.beginOffset() < AllocBeginOffset)
2453 return false;
2454 // Note that we don't count vector loads or stores as whole-alloca
2455 // operations which enable integer widening because we would prefer to use
2456 // vector widening instead.
2457 if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2458 WholeAllocaOp = true;
2459 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2460 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2461 return false;
2462 } else if (RelBegin != 0 || RelEnd != Size ||
2463 !canConvertValue(DL, AllocaTy, LI->getType())) {
2464 // Non-integer loads need to be convertible from the alloca type so that
2465 // they are promotable.
2466 return false;
2467 }
2468 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2469 Type *ValueTy = SI->getValueOperand()->getType();
2470 if (SI->isVolatile())
2471 return false;
2472 // We can't handle stores that extend past the allocated memory.
2473 TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
2474 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2475 return false;
2476 // So far, AllocaSliceRewriter does not support widening split slice tails
2477 // in rewriteIntegerStore.
2478 if (S.beginOffset() < AllocBeginOffset)
2479 return false;
2480 // Note that we don't count vector loads or stores as whole-alloca
2481 // operations which enable integer widening because we would prefer to use
2482 // vector widening instead.
2483 if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2484 WholeAllocaOp = true;
2485 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2486 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2487 return false;
2488 } else if (RelBegin != 0 || RelEnd != Size ||
2489 !canConvertValue(DL, ValueTy, AllocaTy)) {
2490 // Non-integer stores need to be convertible to the alloca type so that
2491 // they are promotable.
2492 return false;
2493 }
2494 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2495 if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2496 return false;
2497 if (!S.isSplittable())
2498 return false; // Skip any unsplittable intrinsics.
2499 } else {
2500 return false;
2501 }
2502
2503 return true;
2504}
2505
2506/// Test whether the given alloca partition's integer operations can be
2507/// widened to promotable ones.
2508///
2509/// This is a quick test to check whether we can rewrite the integer loads and
2510/// stores to a particular alloca into wider loads and stores and be able to
2511/// promote the resulting alloca.
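/// For example (illustrative only), an 8-byte partition with i32 loads at
/// offsets 0 and 4 plus one i64 store covering the whole partition is viable:
/// the i64 store acts as the covering whole-alloca operation, and the partial
/// loads are later rewritten as shifts and truncations of the wide value.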
2512static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2513 const DataLayout &DL) {
2514 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
2515 // Don't create integer types larger than the maximum bitwidth.
2516 if (SizeInBits > IntegerType::MAX_INT_BITS)
2517 return false;
2518
2519 // Don't try to handle allocas with bit-padding.
2520 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
2521 return false;
2522
2523 // We need to ensure that an integer type with the appropriate bitwidth can
2524 // be converted to the alloca type, whatever that is. We don't want to force
2525 // the alloca itself to have an integer type if there is a more suitable one.
2526 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2527 if (!canConvertValue(DL, AllocaTy, IntTy) ||
2528 !canConvertValue(DL, IntTy, AllocaTy))
2529 return false;
2530
2531 // While examining uses, we ensure that the alloca has a covering load or
2532 // store. We don't want to widen the integer operations only to fail to
2533 // promote due to some other unsplittable entry (which we may make splittable
2534 // later). However, if there are only splittable uses, go ahead and assume
2535 // that we cover the alloca.
2536 // FIXME: We shouldn't consider split slices that happen to start in the
2537 // partition here...
2538 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2539
2540 for (const Slice &S : P)
2541 if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2542 WholeAllocaOp))
2543 return false;
2544
2545 for (const Slice *S : P.splitSliceTails())
2546 if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2547 WholeAllocaOp))
2548 return false;
2549
2550 return WholeAllocaOp;
2551}
2552
2553static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2554 IntegerType *Ty, uint64_t Offset,
2555 const Twine &Name) {
2556 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2557 IntegerType *IntTy = cast<IntegerType>(V->getType());
2558 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2559 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2560 "Element extends past full value");
2561 uint64_t ShAmt = 8 * Offset;
2562 if (DL.isBigEndian())
2563 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2564 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2565 if (ShAmt) {
2566 V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2567 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2568 }
2569 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2570 "Cannot extract to a larger integer!");
2571 if (Ty != IntTy) {
2572 V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2573 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2574 }
2575 return V;
2576}
2577
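// A worked example of the bit math below (little-endian, illustrative only):
// inserting an i8 value at Offset 1 into an i32 gives ShAmt = 8 and
// Mask = 0xFFFF00FF, so the result is (Old & 0xFFFF00FF) | (zext(V) << 8);
// on a big-endian target the shift becomes 8 * (4 - 1 - 1) = 16 instead.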
2578static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2579 Value *V, uint64_t Offset, const Twine &Name) {
2580 IntegerType *IntTy = cast<IntegerType>(Old->getType());
2581 IntegerType *Ty = cast<IntegerType>(V->getType());
2582 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2583 "Cannot insert a larger integer!");
2584 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2585 if (Ty != IntTy) {
2586 V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2587 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2588 }
2589 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2590 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2591 "Element store outside of alloca store");
2592 uint64_t ShAmt = 8 * Offset;
2593 if (DL.isBigEndian())
2594 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2595 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2596 if (ShAmt) {
2597 V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2598 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2599 }
2600
2601 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2602 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2603 Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2604 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2605 V = IRB.CreateOr(Old, V, Name + ".insert");
2606 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2607 }
2608 return V;
2609}
2610
2611static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2612 unsigned EndIndex, const Twine &Name) {
2613 auto *VecTy = cast<FixedVectorType>(V->getType());
2614 unsigned NumElements = EndIndex - BeginIndex;
2615 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2616
2617 if (NumElements == VecTy->getNumElements())
2618 return V;
2619
2620 if (NumElements == 1) {
2621 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2622 Name + ".extract");
2623 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2624 return V;
2625 }
2626
2627 auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2628 V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2629 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2630 return V;
2631}
2632
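// Illustrative example (not from a test case): inserting a <2 x i32> value at
// BeginIndex 2 into a <4 x i32> first widens it with the shuffle mask
// <poison, poison, 0, 1> and then selects lanes with <0, 0, 1, 1>, keeping the
// old vector's lanes 0-1 and the new value's elements in lanes 2-3.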
2633static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2634 unsigned BeginIndex, const Twine &Name) {
2635 VectorType *VecTy = cast<VectorType>(Old->getType());
2636 assert(VecTy && "Can only insert a vector into a vector");
2637
2638 VectorType *Ty = dyn_cast<VectorType>(V->getType());
2639 if (!Ty) {
2640 // Single element to insert.
2641 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2642 Name + ".insert");
2643 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2644 return V;
2645 }
2646
2649 "Too many elements!");
2652 assert(V->getType() == VecTy && "Vector type mismatch");
2653 return V;
2654 }
2655 unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2656
2657 // When inserting a smaller vector into the larger one to store, we first
2658 // use a shufflevector to widen it with poison elements, and then
2659 // a second shufflevector to select between the loaded vector and the
2660 // incoming vector.
2661 SmallVector<int, 8> Mask;
2662 Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2663 for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2664 if (i >= BeginIndex && i < EndIndex)
2665 Mask.push_back(i - BeginIndex);
2666 else
2667 Mask.push_back(-1);
2668 V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2669 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2670
2671 SmallVector<Constant *, 8> Mask2;
2672 Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2673 for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2674 Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2675
2676 // No profiling support for vector selects.
2677 V = IRB.CreateSelectWithUnknownProfile(ConstantVector::get(Mask2), V, Old,
2678 DEBUG_TYPE, Name + "blend");
2679
2680 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2681 return V;
2682}
2683
2684/// This function takes two vector values and combines them into a single vector
2685/// by concatenating their elements. The function handles:
2686///
2687/// 1. Element type mismatch: If either vector's element type differs from
2688 /// NewAIEltTy, the function bitcasts the vector to use NewAIEltTy while
2689/// preserving the total bit width (adjusting the number of elements
2690/// accordingly).
2691///
2692/// 2. Size mismatch: After transforming the vectors to have the desired element
2693/// type, if the two vectors have different numbers of elements, the smaller
2694/// vector is extended with poison values to match the size of the larger
2695/// vector before concatenation.
2696///
2697/// 3. Concatenation: The vectors are merged using a shuffle operation that
2698/// places all elements of V0 first, followed by all elements of V1.
2699///
2700/// \param V0 The first vector to merge (must be a vector type)
2701/// \param V1 The second vector to merge (must be a vector type)
2702/// \param DL The data layout for size calculations
2703/// \param NewAIEltTy The desired element type for the result vector
2704/// \param Builder IRBuilder for creating new instructions
2705/// \return A new vector containing all elements from V0 followed by all
2706/// elements from V1
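///
/// Illustrative example (not from a test case): merging a <2 x float> V0 with
/// a <4 x float> V1 when NewAIEltTy is float first widens V0 to <4 x float>
/// with poison lanes, then concatenates with the shuffle mask
/// <0, 1, 4, 5, 6, 7>, producing a <6 x float> of V0's elements followed by
/// V1's.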
2708 Type *NewAIEltTy, IRBuilder<> &Builder) {
2709 // V0 and V1 are vectors
2710 // Create a new vector type with combined elements
2711 // Use ShuffleVector to concatenate the vectors
2712 auto *VecType0 = cast<FixedVectorType>(V0->getType());
2713 auto *VecType1 = cast<FixedVectorType>(V1->getType());
2714
2715 // If V0/V1 element types are different from NewAIEltTy,
2716 // we need to introduce bitcasts before merging them
2717 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2718 const char *DebugName) {
2719 Type *EltType = VecType->getElementType();
2720 if (EltType != NewAIEltTy) {
2721 // Calculate new number of elements to maintain same bit width
2722 unsigned TotalBits =
2723 VecType->getNumElements() * DL.getTypeSizeInBits(EltType);
2724 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
2725
2726 auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts);
2727 V = Builder.CreateBitCast(V, NewVecType);
2728 VecType = NewVecType;
2729 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2730 }
2731 };
2732
2733 BitcastIfNeeded(V0, VecType0, "V0");
2734 BitcastIfNeeded(V1, VecType1, "V1");
2735
2736 unsigned NumElts0 = VecType0->getNumElements();
2737 unsigned NumElts1 = VecType1->getNumElements();
2738
2739 SmallVector<int, 16> ShuffleMask;
2740
2741 if (NumElts0 == NumElts1) {
2742 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2743 ShuffleMask.push_back(i);
2744 } else {
2745 // If two vectors have different sizes, we need to extend
2746 // the smaller vector to the size of the larger vector.
2747 unsigned SmallSize = std::min(NumElts0, NumElts1);
2748 unsigned LargeSize = std::max(NumElts0, NumElts1);
2749 bool IsV0Smaller = NumElts0 < NumElts1;
2750 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2751 SmallVector<int, 16> ExtendMask;
2752 for (unsigned i = 0; i < SmallSize; ++i)
2753 ExtendMask.push_back(i);
2754 for (unsigned i = SmallSize; i < LargeSize; ++i)
2755 ExtendMask.push_back(PoisonMaskElem);
2756 ExtendedVec = Builder.CreateShuffleVector(
2757 ExtendedVec, PoisonValue::get(ExtendedVec->getType()), ExtendMask);
2758 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2759 for (unsigned i = 0; i < NumElts0; ++i)
2760 ShuffleMask.push_back(i);
2761 for (unsigned i = 0; i < NumElts1; ++i)
2762 ShuffleMask.push_back(LargeSize + i);
2763 }
2764
2765 return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2766}
2767
2768namespace {
2769
2770 /// Visitor to rewrite instructions using a particular slice of an alloca
2771/// to use a new alloca.
2772///
2773/// Also implements the rewriting to vector-based accesses when the partition
2774/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2775/// lives here.
2776class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2777 // Befriend the base class so it can delegate to private visit methods.
2778 friend class InstVisitor<AllocaSliceRewriter, bool>;
2779
2780 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2781
2782 const DataLayout &DL;
2783 AllocaSlices &AS;
2784 SROA &Pass;
2785 AllocaInst &OldAI, &NewAI;
2786 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2787 Type *NewAllocaTy;
2788
2789 // This is a convenience and flag variable that will be null unless the new
2790 // alloca's integer operations should be widened to this integer type due to
2791 // passing isIntegerWideningViable above. If it is non-null, the desired
2792 // integer type will be stored here for easy access during rewriting.
2793 IntegerType *IntTy;
2794
2795 // If we are rewriting an alloca partition which can be written as pure
2796 // vector operations, we stash extra information here. When VecTy is
2797 // non-null, we have some strict guarantees about the rewritten alloca:
2798 // - The new alloca is exactly the size of the vector type here.
2799 // - The accesses all either map to the entire vector or to a single
2800 // element.
2801 // - The set of accessing instructions is only one of those handled above
2802 // in isVectorPromotionViable. Generally these are the same access kinds
2803 // which are promotable via mem2reg.
2804 VectorType *VecTy;
2805 Type *ElementTy;
2806 uint64_t ElementSize;
2807
2808 // The original offset of the slice currently being rewritten relative to
2809 // the original alloca.
2810 uint64_t BeginOffset = 0;
2811 uint64_t EndOffset = 0;
2812
2813 // The new offsets of the slice currently being rewritten relative to the
2814 // original alloca.
2815 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2816
2817 uint64_t SliceSize = 0;
2818 bool IsSplittable = false;
2819 bool IsSplit = false;
2820 Use *OldUse = nullptr;
2821 Instruction *OldPtr = nullptr;
2822
2823 // Track post-rewrite users which are PHI nodes and Selects.
2824 SmallSetVector<PHINode *, 8> &PHIUsers;
2825 SmallSetVector<SelectInst *, 8> &SelectUsers;
2826
2827 // Utility IR builder, whose name prefix is set up for each visited use, and
2828 // the insertion point is set to point to the user.
2829 IRBuilderTy IRB;
2830
2831 // Return the new alloca, addrspacecasted if required to avoid changing the
2832 // addrspace of a volatile access.
2833 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2834 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2835 return &NewAI;
2836
2837 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2838 return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
2839 }
2840
2841public:
2842 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2843 AllocaInst &OldAI, AllocaInst &NewAI,
2844 uint64_t NewAllocaBeginOffset,
2845 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2846 VectorType *PromotableVecTy,
2847 SmallSetVector<PHINode *, 8> &PHIUsers,
2848 SmallSetVector<SelectInst *, 8> &SelectUsers)
2849 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2850 NewAllocaBeginOffset(NewAllocaBeginOffset),
2851 NewAllocaEndOffset(NewAllocaEndOffset),
2852 NewAllocaTy(NewAI.getAllocatedType()),
2853 IntTy(
2854 IsIntegerPromotable
2855 ? Type::getIntNTy(NewAI.getContext(),
2856 DL.getTypeSizeInBits(NewAI.getAllocatedType())
2857 .getFixedValue())
2858 : nullptr),
2859 VecTy(PromotableVecTy),
2860 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2861 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
2862 : 0),
2863 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2864 IRB(NewAI.getContext(), ConstantFolder()) {
2865 if (VecTy) {
2866 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2867 "Only multiple-of-8 sized vector elements are viable");
2868 ++NumVectorized;
2869 }
2870 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2871 }
2872
2873 bool visit(AllocaSlices::const_iterator I) {
2874 bool CanSROA = true;
2875 BeginOffset = I->beginOffset();
2876 EndOffset = I->endOffset();
2877 IsSplittable = I->isSplittable();
2878 IsSplit =
2879 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2880 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2881 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2882 LLVM_DEBUG(dbgs() << "\n");
2883
2884 // Compute the intersecting offset range.
2885 assert(BeginOffset < NewAllocaEndOffset);
2886 assert(EndOffset > NewAllocaBeginOffset);
2887 NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2888 NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2889
2890 SliceSize = NewEndOffset - NewBeginOffset;
2891 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2892 << ") NewBegin:(" << NewBeginOffset << ", "
2893 << NewEndOffset << ") NewAllocaBegin:("
2894 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2895 << ")\n");
2896 assert(IsSplit || NewBeginOffset == BeginOffset);
2897 OldUse = I->getUse();
2898 OldPtr = cast<Instruction>(OldUse->get());
2899
2900 Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2901 IRB.SetInsertPoint(OldUserI);
2902 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2903 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2904 Twine(BeginOffset) + ".");
2905
2906 CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2907 if (VecTy || IntTy)
2908 assert(CanSROA);
2909 return CanSROA;
2910 }
2911
2912 /// Attempts to rewrite a partition using tree-structured merge optimization.
2913 ///
2914 /// This function analyzes a partition to determine if it can be optimized
2915 /// using a tree-structured merge pattern, where multiple non-overlapping
2916 /// stores completely fill an alloca and there is no load from the alloca in
2917 /// the middle of the stores. Such patterns can be optimized by eliminating
2918 /// the intermediate stores and directly constructing the final vector by
2919 /// using shufflevectors.
2920 ///
2921 /// Example transformation:
2922 /// Before: (stores do not have to be in order)
2923 /// %alloca = alloca <8 x float>
2924 /// store <2 x float> %val0, ptr %alloca ; offset 0-1
2925 /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2926 /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2927 /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2928 ///
2929 /// After:
2930 /// %alloca = alloca <8 x float>
2931 /// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
2932 /// i32 3>
2933 /// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
2934 /// i32 3>
2935 /// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
2936 /// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2937 /// store %shuffle2, ptr %alloca
2938 ///
2939 /// The optimization looks for partitions that:
2940 /// 1. Have no overlapping split slice tails
2941 /// 2. Contain non-overlapping stores that cover the entire alloca
2942 /// 3. Have exactly one load that reads the complete alloca structure and is
2943 /// not in the middle of the stores (TODO: maybe we can relax the constraint
2944 /// about reading the entire alloca structure)
2945 ///
2946 /// \param P The partition to analyze and potentially rewrite
2947 /// \return An optional vector of values that were deleted during the rewrite
2948 /// process, or std::nullopt if the partition cannot be optimized
2949 /// using tree-structured merge
2950 std::optional<SmallVector<Value *, 4>>
2951 rewriteTreeStructuredMerge(Partition &P) {
2952 // No tail slices that overlap with the partition
2953 if (P.splitSliceTails().size() > 0)
2954 return std::nullopt;
2955
2956 SmallVector<Value *, 4> DeletedValues;
2957 LoadInst *TheLoad = nullptr;
2958
2959 // Structure to hold store information
2960 struct StoreInfo {
2961 StoreInst *Store;
2962 uint64_t BeginOffset;
2963 uint64_t EndOffset;
2964 Value *StoredValue;
2965 StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
2966 : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
2967 };
2968
2969 SmallVector<StoreInfo, 4> StoreInfos;
2970
2971 // If the new alloca is a fixed vector type, we use its element type as the
2972 // allocated element type, otherwise we use i8 as the allocated element type.
2973 Type *AllocatedEltTy =
2974 isa<FixedVectorType>(NewAI.getAllocatedType())
2975 ? cast<FixedVectorType>(NewAI.getAllocatedType())->getElementType()
2976 : Type::getInt8Ty(NewAI.getContext());
2977 unsigned AllocatedEltTySize = DL.getTypeSizeInBits(AllocatedEltTy);
2978
2979 // Helper to check that a type is
2980 // 1. A fixed vector type
2981 // 2. The element type is not a pointer
2982 // 3. The element type size is a whole number of bytes
2983 // We only handle loads/stores whose value types meet these conditions
2984 auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
2985 auto *FixedVecTy = dyn_cast<FixedVectorType>(Ty);
2986 return FixedVecTy &&
2987 DL.getTypeSizeInBits(FixedVecTy->getElementType()) % 8 == 0 &&
2988 !FixedVecTy->getElementType()->isPointerTy();
2989 };
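// Illustrative examples (hypothetical types, not tied to any particular
// input): the helper above accepts <2 x float> and <8 x i8> (non-pointer
// elements whose size is a whole number of bytes) but rejects <4 x i1>
// (1-bit elements), <2 x ptr> (pointer elements), and scalable or
// non-vector types.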
2990
2991 for (Slice &S : P) {
2992 auto *User = cast<Instruction>(S.getUse()->getUser());
2993 if (auto *LI = dyn_cast<LoadInst>(User)) {
2994 // Do not handle the case if
2995 // 1. There is more than one load
2996 // 2. The load is volatile
2997 // 3. The load does not read the entire alloca structure
2998 // 4. The load does not meet the conditions in the helper function
2999 if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) ||
3000 S.beginOffset() != NewAllocaBeginOffset ||
3001 S.endOffset() != NewAllocaEndOffset || LI->isVolatile())
3002 return std::nullopt;
3003 TheLoad = LI;
3004 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
3005 // Do not handle the case if
3006 // 1. The store does not meet the conditions in the helper function
3007 // 2. The store is volatile
3008 // 3. The total store size is not a multiple of the allocated element
3009 // type size
3010 if (!IsTypeValidForTreeStructuredMerge(
3011 SI->getValueOperand()->getType()) ||
3012 SI->isVolatile())
3013 return std::nullopt;
3014 auto *VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
3015 unsigned NumElts = VecTy->getNumElements();
3016 unsigned EltSize = DL.getTypeSizeInBits(VecTy->getElementType());
3017 if (NumElts * EltSize % AllocatedEltTySize != 0)
3018 return std::nullopt;
3019 StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
3020 SI->getValueOperand());
3021 } else {
3022 // If we have instructions other than load and store, we cannot do the
3023 // tree structured merge
3024 return std::nullopt;
3025 }
3026 }
3027 // If we do not have any load, we cannot do the tree structured merge
3028 if (!TheLoad)
3029 return std::nullopt;
3030
3031 // If we do not have multiple stores, we cannot do the tree structured merge
3032 if (StoreInfos.size() < 2)
3033 return std::nullopt;
3034
3035 // Stores should not overlap and should cover the whole alloca
3036 // Sort by begin offset
3037 llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
3038 return A.BeginOffset < B.BeginOffset;
3039 });
3040
3041 // Check for overlaps and coverage
3042 uint64_t ExpectedStart = NewAllocaBeginOffset;
3043 for (auto &StoreInfo : StoreInfos) {
3044 uint64_t BeginOff = StoreInfo.BeginOffset;
3045 uint64_t EndOff = StoreInfo.EndOffset;
3046
3047 // Check for gap or overlap
3048 if (BeginOff != ExpectedStart)
3049 return std::nullopt;
3050
3051 ExpectedStart = EndOff;
3052 }
3053 // Check that stores cover the entire alloca
3054 if (ExpectedStart != NewAllocaEndOffset)
3055 return std::nullopt;
3056
3057 // Stores should be in the same basic block
3058 // The load should not be in the middle of the stores
3059 // Note:
3060 // If the load is in a different basic block than the stores, we can still
3061 // do the tree structured merge, because we do not perform
3062 // store->load forwarding here. The merged vector will be stored back to
3063 // NewAI and the new load will load from NewAI. The forwarding will be
3064 // handled later when we try to promote NewAI.
3065 BasicBlock *LoadBB = TheLoad->getParent();
3066 BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
3067
3068 for (auto &StoreInfo : StoreInfos) {
3069 if (StoreInfo.Store->getParent() != StoreBB)
3070 return std::nullopt;
3071 if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(TheLoad))
3072 return std::nullopt;
3073 }
3074
3075 // If we reach here, the partition can be merged with a tree structured
3076 // merge
3077 LLVM_DEBUG({
3078 dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
3079 << "\n Ordered stores:\n";
3080 for (auto [i, Info] : enumerate(StoreInfos))
3081 dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
3082 << Info.EndOffset << ") \tStore: " << *Info.Store
3083 << "\tValue: " << *Info.StoredValue << "\n";
3084 });
3085
3086 // Instead of having these stores, we merge all the stored values into a
3087 // vector and store the merged value into the alloca
3088 std::queue<Value *> VecElements;
3089 IRBuilder<> Builder(StoreInfos.back().Store);
3090 for (const auto &Info : StoreInfos) {
3091 DeletedValues.push_back(Info.Store);
3092 VecElements.push(Info.StoredValue);
3093 }
3094
3095 LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
3096 while (VecElements.size() > 1) {
3097 const auto NumElts = VecElements.size();
3098 for ([[maybe_unused]] const auto _ : llvm::seq(NumElts / 2)) {
3099 Value *V0 = VecElements.front();
3100 VecElements.pop();
3101 Value *V1 = VecElements.front();
3102 VecElements.pop();
3103 Value *Merged = mergeTwoVectors(V0, V1, DL, AllocatedEltTy, Builder);
3104 LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
3105 VecElements.push(Merged);
3106 }
3107 if (NumElts % 2 == 1) {
3108 Value *V = VecElements.front();
3109 VecElements.pop();
3110 VecElements.push(V);
3111 }
3112 }
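// Worked example of the loop above (hypothetical odd-sized queue): with
// three values {v0, v1, v2}, the first round merges v0 and v1 and rotates
// v2 to the back, and the second round merges shuffle(v0, v1) with v2,
// leaving a single fully merged vector in the queue.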
3113
3114 // Store the merged value into the alloca
3115 Value *MergedValue = VecElements.front();
3116 Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());
3117
3118 IRBuilder<> LoadBuilder(TheLoad);
3119 TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
3120 TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
3121 TheLoad->getName() + ".sroa.new.load"));
3122 DeletedValues.push_back(TheLoad);
3123
3124 return DeletedValues;
3125 }
3126
3127private:
3128 // Make sure the other visit overloads are visible.
3129 using Base::visit;
3130
3131 // Every instruction which can end up as a user must have a rewrite rule.
3132 bool visitInstruction(Instruction &I) {
3133 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
3134 llvm_unreachable("No rewrite rule for this instruction!");
3135 }
3136
3137 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3138 // Note that the offset computation can use BeginOffset or NewBeginOffset
3139 // interchangeably for unsplit slices.
3140 assert(IsSplit || BeginOffset == NewBeginOffset);
3141 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3142
3143 StringRef OldName = OldPtr->getName();
3144 // Skip through the last '.sroa.' component of the name.
3145 size_t LastSROAPrefix = OldName.rfind(".sroa.");
3146 if (LastSROAPrefix != StringRef::npos) {
3147 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
3148 // Look for an SROA slice index.
3149 size_t IndexEnd = OldName.find_first_not_of("0123456789");
3150 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3151 // Strip the index and look for the offset.
3152 OldName = OldName.substr(IndexEnd + 1);
3153 size_t OffsetEnd = OldName.find_first_not_of("0123456789");
3154 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3155 // Strip the offset.
3156 OldName = OldName.substr(OffsetEnd + 1);
3157 }
3158 }
3159 // Strip any SROA suffixes as well.
3160 OldName = OldName.substr(0, OldName.find(".sroa_"));
3161
3162 return getAdjustedPtr(IRB, DL, &NewAI,
3163 APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
3164 PointerTy, Twine(OldName) + ".");
3165 }
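// Naming example (hypothetical value name): an old pointer called
// "a.sroa.3.16.copyload" is stripped down to "copyload" above, which is
// then used as the name hint ("copyload.") for the adjusted pointer
// returned here.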
3166
3167 /// Compute suitable alignment to access this slice of the *new*
3168 /// alloca.
3169 ///
3170 /// You can optionally pass a type to this routine and if that type's ABI
3171 /// alignment is itself suitable, this will return zero.
3172 Align getSliceAlign() {
3173 return commonAlignment(NewAI.getAlign(),
3174 NewBeginOffset - NewAllocaBeginOffset);
3175 }
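// For example, assuming NewAI is 16-byte aligned: a slice starting 8 bytes
// into the new alloca gets commonAlignment(16, 8) == 8, while slices
// starting at offset 0 or 32 keep the full 16-byte alignment.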
3176
3177 unsigned getIndex(uint64_t Offset) {
3178 assert(VecTy && "Can only call getIndex when rewriting a vector");
3179 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3180 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3181 uint32_t Index = RelOffset / ElementSize;
3182 assert(Index * ElementSize == RelOffset);
3183 return Index;
3184 }
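// For example, assuming a <4 x i32> alloca (ElementSize == 4 bytes)
// beginning at offset 0: an offset of 8 maps to index 2, and a misaligned
// offset such as 10 would trip the assertion above.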
3185
3186 void deleteIfTriviallyDead(Value *V) {
3187 Instruction *I = cast<Instruction>(V);
3188 if (isInstructionTriviallyDead(I))
3189 Pass.DeadInsts.push_back(I);
3190 }
3191
3192 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3193 unsigned BeginIndex = getIndex(NewBeginOffset);
3194 unsigned EndIndex = getIndex(NewEndOffset);
3195 assert(EndIndex > BeginIndex && "Empty vector!");
3196
3197 LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3198 NewAI.getAlign(), "load");
3199
3200 Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3201 LLVMContext::MD_access_group});
3202 return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
3203 }
3204
3205 Value *rewriteIntegerLoad(LoadInst &LI) {
3206 assert(IntTy && "We cannot insert an integer to the alloca");
3207 assert(!LI.isVolatile());
3208 Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3209 NewAI.getAlign(), "load");
3210 V = convertValue(DL, IRB, V, IntTy);
3211 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3212 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3213 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
3214 IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
3215 V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
3216 }
3217 // It is possible that the extracted type is not the load type. This
3218 // happens if there is a load past the end of the alloca, and as
3219 // a consequence the slice is narrower but still a candidate for integer
3220 // lowering. To handle this case, we just zero extend the extracted
3221 // integer.
3222 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
3223 "Can only handle an extract for an overly wide load");
3224 if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
3225 V = IRB.CreateZExt(V, LI.getType());
3226 return V;
3227 }
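// For example, assuming an i64-wide integer alloca (IntTy == i64): a
// 2-byte slice at relative offset 4 loads the whole i64, extracts an i16
// at offset 4, and zero-extends the result if the original load type was
// wider than the slice (e.g. an i32 load running past the end).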
3228
3229 bool visitLoadInst(LoadInst &LI) {
3230 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3231 Value *OldOp = LI.getOperand(0);
3232 assert(OldOp == OldPtr);
3233
3234 AAMDNodes AATags = LI.getAAMetadata();
3235
3236 unsigned AS = LI.getPointerAddressSpace();
3237
3238 Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
3239 : LI.getType();
3240 bool IsPtrAdjusted = false;
3241 Value *V;
3242 if (VecTy) {
3243 V = rewriteVectorizedLoadInst(LI);
3244 } else if (IntTy && LI.getType()->isIntegerTy()) {
3245 V = rewriteIntegerLoad(LI);
3246 } else if (NewBeginOffset == NewAllocaBeginOffset &&
3247 NewEndOffset == NewAllocaEndOffset &&
3248 (canConvertValue(DL, NewAllocaTy, TargetTy) ||
3249 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
3250 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
3251 !LI.isVolatile()))) {
3252 Value *NewPtr =
3253 getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
3254 LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr,
3255 NewAI.getAlign(), LI.isVolatile(),
3256 LI.getName());
3257 if (LI.isVolatile())
3258 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3259 if (NewLI->isAtomic())
3260 NewLI->setAlignment(LI.getAlign());
3261
3262 // Copy any metadata that is valid for the new load. This may require
3263 // conversion to a different kind of metadata, e.g. !nonnull might change
3264 // to !range or vice versa.
3265 copyMetadataForLoad(*NewLI, LI);
3266
3267 // Do this after copyMetadataForLoad() to preserve the TBAA shift.
3268 if (AATags)
3269 NewLI->setAAMetadata(AATags.adjustForAccess(
3270 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3271
3272 // Try to preserve nonnull metadata
3273 V = NewLI;
3274
3275 // If this is an integer load past the end of the slice (which means the
3276 // bytes outside the slice are undef or this load is dead) just forcibly
3277 // fix the integer size with correct handling of endianness.
3278 if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
3279 if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
3280 if (AITy->getBitWidth() < TITy->getBitWidth()) {
3281 V = IRB.CreateZExt(V, TITy, "load.ext");
3282 if (DL.isBigEndian())
3283 V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
3284 "endian_shift");
3285 }
3286 } else {
3287 Type *LTy = IRB.getPtrTy(AS);
3288 LoadInst *NewLI =
3289 IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
3290 getSliceAlign(), LI.isVolatile(), LI.getName());
3291
3292 if (AATags)
3293 NewLI->setAAMetadata(AATags.adjustForAccess(
3294 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3295
3296 if (LI.isVolatile())
3297 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3298 NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3299 LLVMContext::MD_access_group});
3300
3301 V = NewLI;
3302 IsPtrAdjusted = true;
3303 }
3304 V = convertValue(DL, IRB, V, TargetTy);
3305
3306 if (IsSplit) {
3307 assert(!LI.isVolatile());
3308 assert(LI.getType()->isIntegerTy() &&
3309 "Only integer type loads and stores are split");
3310 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3311 "Split load isn't smaller than original load");
3312 assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3313 "Non-byte-multiple bit width");
3314 // Move the insertion point just past the load so that we can refer to it.
3315 BasicBlock::iterator LIIt = std::next(LI.getIterator());
3316 // Ensure the insertion point comes before any debug-info immediately
3317 // after the load, so that variable values referring to the load are
3318 // dominated by it.
3319 LIIt.setHeadBit(true);
3320 IRB.SetInsertPoint(LI.getParent(), LIIt);
3321 // Create a placeholder value with the same type as LI to use as the
3322 // basis for the new value. This allows us to replace the uses of LI with
3323 // the computed value, and then replace the placeholder with LI, leaving
3324 // LI only used for this computation.
3325 Value *Placeholder =
3326 new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
3327 false, Align(1));
3328 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
3329 "insert");
3330 LI.replaceAllUsesWith(V);
3331 Placeholder->replaceAllUsesWith(&LI);
3332 Placeholder->deleteValue();
3333 } else {
3334 LI.replaceAllUsesWith(V);
3335 }
3336
3337 Pass.DeadInsts.push_back(&LI);
3338 deleteIfTriviallyDead(OldOp);
3339 LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3340 return !LI.isVolatile() && !IsPtrAdjusted;
3341 }
3342
3343 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3344 AAMDNodes AATags) {
3345 // Capture V for the purpose of debug-info accounting once it's converted
3346 // to a vector store.
3347 Value *OrigV = V;
3348 if (V->getType() != VecTy) {
3349 unsigned BeginIndex = getIndex(NewBeginOffset);
3350 unsigned EndIndex = getIndex(NewEndOffset);
3351 assert(EndIndex > BeginIndex && "Empty vector!");
3352 unsigned NumElements = EndIndex - BeginIndex;
3353 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3354 "Too many elements!");
3355 Type *SliceTy = (NumElements == 1)
3356 ? ElementTy
3357 : FixedVectorType::get(ElementTy, NumElements);
3358 if (V->getType() != SliceTy)
3359 V = convertValue(DL, IRB, V, SliceTy);
3360
3361 // Mix in the existing elements.
3362 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3363 NewAI.getAlign(), "load");
3364 V = insertVector(IRB, Old, V, BeginIndex, "vec");
3365 }
3366 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3367 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3368 LLVMContext::MD_access_group});
3369 if (AATags)
3370 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3371 V->getType(), DL));
3372 Pass.DeadInsts.push_back(&SI);
3373
3374 // NOTE: Careful to use OrigV rather than V.
3375 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3376 Store, Store->getPointerOperand(), OrigV, DL);
3377 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3378 return true;
3379 }
3380
3381 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3382 assert(IntTy && "We cannot extract an integer from the alloca");
3383 assert(!SI.isVolatile());
3384 if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
3385 IntTy->getBitWidth()) {
3386 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3387 NewAI.getAlign(), "oldload");
3388 Old = convertValue(DL, IRB, Old, IntTy);
3389 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3390 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3391 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
3392 }
3393 V = convertValue(DL, IRB, V, NewAllocaTy);
3394 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3395 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3396 LLVMContext::MD_access_group});
3397 if (AATags)
3398 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3399 V->getType(), DL));
3400
3401 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3402 Store, Store->getPointerOperand(),
3403 Store->getValueOperand(), DL);
3404
3405 Pass.DeadInsts.push_back(&SI);
3406 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3407 return true;
3408 }
3409
3410 bool visitStoreInst(StoreInst &SI) {
3411 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3412 Value *OldOp = SI.getOperand(1);
3413 assert(OldOp == OldPtr);
3414
3415 AAMDNodes AATags = SI.getAAMetadata();
3416 Value *V = SI.getValueOperand();
3417
3418 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3419 // alloca that should be re-examined after promoting this alloca.
3420 if (V->getType()->isPointerTy())
3421 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
3422 Pass.PostPromotionWorklist.insert(AI);
3423
3424 TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
3425 if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3426 assert(!SI.isVolatile());
3427 assert(V->getType()->isIntegerTy() &&
3428 "Only integer type loads and stores are split");
3429 assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3430 "Non-byte-multiple bit width");
3431 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
3432 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
3433 "extract");
3434 }
3435
3436 if (VecTy)
3437 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3438 if (IntTy && V->getType()->isIntegerTy())
3439 return rewriteIntegerStore(V, SI, AATags);
3440
3441 StoreInst *NewSI;
3442 if (NewBeginOffset == NewAllocaBeginOffset &&
3443 NewEndOffset == NewAllocaEndOffset &&
3444 canConvertValue(DL, V->getType(), NewAllocaTy)) {
3445 V = convertValue(DL, IRB, V, NewAllocaTy);
3446 Value *NewPtr =
3447 getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
3448
3449 NewSI =
3450 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
3451 } else {
3452 unsigned AS = SI.getPointerAddressSpace();
3453 Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
3454 NewSI =
3455 IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
3456 }
3457 NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3458 LLVMContext::MD_access_group});
3459 if (AATags)
3460 NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3461 V->getType(), DL));
3462 if (SI.isVolatile())
3463 NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
3464 if (NewSI->isAtomic())
3465 NewSI->setAlignment(SI.getAlign());
3466
3467 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3468 NewSI, NewSI->getPointerOperand(),
3469 NewSI->getValueOperand(), DL);
3470
3471 Pass.DeadInsts.push_back(&SI);
3472 deleteIfTriviallyDead(OldOp);
3473
3474 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3475 return NewSI->getPointerOperand() == &NewAI &&
3476 NewSI->getValueOperand()->getType() == NewAllocaTy &&
3477 !SI.isVolatile();
3478 }
3479
3480 /// Compute an integer value from splatting an i8 across the given
3481 /// number of bytes.
3482 ///
3483 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3484 /// call this routine.
3485 /// FIXME: Heed the advice above.
3486 ///
3487 /// \param V The i8 value to splat.
3488 /// \param Size The number of bytes in the output (assuming i8 is one byte)
3489 Value *getIntegerSplat(Value *V, unsigned Size) {
3490 assert(Size > 0 && "Expected a positive number of bytes.");
3491 IntegerType *VTy = cast<IntegerType>(V->getType());
3492 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3493 if (Size == 1)
3494 return V;
3495
3496 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
3497 V = IRB.CreateMul(
3498 IRB.CreateZExt(V, SplatIntTy, "zext"),
3499 IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
3500 IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
3501 SplatIntTy)),
3502 "isplat");
3503 return V;
3504 }
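// Worked example (hypothetical operands): splatting the byte 0xAB across
// Size == 4 computes 0xFFFFFFFF / 0xFF == 0x01010101 (an unsigned
// division) and multiplies it by the zero-extended byte, yielding
// 0xABABABAB.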
3505
3506 /// Compute a vector splat for a given element value.
3507 Value *getVectorSplat(Value *V, unsigned NumElements) {
3508 V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
3509 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3510 return V;
3511 }
3512
3513 bool visitMemSetInst(MemSetInst &II) {
3514 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3515 assert(II.getRawDest() == OldPtr);
3516
3517 AAMDNodes AATags = II.getAAMetadata();
3518
3519 // If the memset has a variable size, it cannot be split, just adjust the
3520 // pointer to the new alloca.
3521 if (!isa<ConstantInt>(II.getLength())) {
3522 assert(!IsSplit);
3523 assert(NewBeginOffset == BeginOffset);
3524 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
3525 II.setDestAlignment(getSliceAlign());
3526 // In theory we should call migrateDebugInfo here. However, we do not
3527 // emit dbg.assign intrinsics for mem intrinsics storing through non-
3528 // constant geps, or storing a variable number of bytes.
3530 "AT: Unexpected link to non-const GEP");
3531 deleteIfTriviallyDead(OldPtr);
3532 return false;
3533 }
3534
3535 // Record this instruction for deletion.
3536 Pass.DeadInsts.push_back(&II);
3537
3538 Type *AllocaTy = NewAI.getAllocatedType();
3539 Type *ScalarTy = AllocaTy->getScalarType();
3540
3541 const bool CanContinue = [&]() {
3542 if (VecTy || IntTy)
3543 return true;
3544 if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3545 return false;
3546 // Length must be in range for FixedVectorType.
3547 auto *C = cast<ConstantInt>(II.getLength());
3548 const uint64_t Len = C->getLimitedValue();
3549 if (Len > std::numeric_limits<unsigned>::max())
3550 return false;
3551 auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
3552 auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
3553 return canConvertValue(DL, SrcTy, AllocaTy) &&
3554 DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
3555 }();
3556
3557 // If this doesn't map cleanly onto the alloca type, and that type isn't
3558 // a single value type, just emit a memset.
3559 if (!CanContinue) {
3560 Type *SizeTy = II.getLength()->getType();
3561 unsigned Sz = NewEndOffset - NewBeginOffset;
3562 Constant *Size = ConstantInt::get(SizeTy, Sz);
3563 MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
3564 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
3565 MaybeAlign(getSliceAlign()), II.isVolatile()));
3566 if (AATags)
3567 New->setAAMetadata(
3568 AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz));
3569
3570 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3571 New, New->getRawDest(), nullptr, DL);
3572
3573 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3574 return false;
3575 }
3576
3577 // If we can represent this as a simple value, we have to build the actual
3578 // value to store, which requires expanding the byte present in memset to
3579 // a sensible representation for the alloca type. This is essentially
3580 // splatting the byte to a sufficiently wide integer, splatting it across
3581 // any desired vector width, and bitcasting to the final type.
3582 Value *V;
3583
3584 if (VecTy) {
3585 // If this is a memset of a vectorized alloca, insert it.
3586 assert(ElementTy == ScalarTy);
3587
3588 unsigned BeginIndex = getIndex(NewBeginOffset);
3589 unsigned EndIndex = getIndex(NewEndOffset);
3590 assert(EndIndex > BeginIndex && "Empty vector!");
3591 unsigned NumElements = EndIndex - BeginIndex;
3592 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3593 "Too many elements!");
3594
3595 Value *Splat = getIntegerSplat(
3596 II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
3597 Splat = convertValue(DL, IRB, Splat, ElementTy);
3598 if (NumElements > 1)
3599 Splat = getVectorSplat(Splat, NumElements);
3600
3601 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3602 NewAI.getAlign(), "oldload");
3603 V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
3604 } else if (IntTy) {
3605 // If this is a memset on an alloca where we can widen stores, insert the
3606 // set integer.
3607 assert(!II.isVolatile());
3608
3609 uint64_t Size = NewEndOffset - NewBeginOffset;
3610 V = getIntegerSplat(II.getValue(), Size);
3611
3612 if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
3613 EndOffset != NewAllocaEndOffset)) {
3614 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3615 NewAI.getAlign(), "oldload");
3616 Old = convertValue(DL, IRB, Old, IntTy);
3617 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3618 V = insertInteger(DL, IRB, Old, V, Offset, "insert");
3619 } else {
3620 assert(V->getType() == IntTy &&
3621 "Wrong type for an alloca wide integer!");
3622 }
3623 V = convertValue(DL, IRB, V, AllocaTy);
3624 } else {
3625 // Established these invariants above.
3626 assert(NewBeginOffset == NewAllocaBeginOffset);
3627 assert(NewEndOffset == NewAllocaEndOffset);
3628
3629 V = getIntegerSplat(II.getValue(),
3630 DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
3631 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
3632 V = getVectorSplat(
3633 V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
3634
3635 V = convertValue(DL, IRB, V, AllocaTy);
3636 }
3637
3638 Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3639 StoreInst *New =
3640 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
3641 New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3642 LLVMContext::MD_access_group});
3643 if (AATags)
3644 New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3645 V->getType(), DL));
3646
3647 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3648 New, New->getPointerOperand(), V, DL);
3649
3650 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3651 return !II.isVolatile();
3652 }
3653
3654 bool visitMemTransferInst(MemTransferInst &II) {
3655 // Rewriting of memory transfer instructions can be a bit tricky. We break
3656 // them into two categories: split intrinsics and unsplit intrinsics.
3657
3658 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3659
3660 AAMDNodes AATags = II.getAAMetadata();
3661
3662 bool IsDest = &II.getRawDestUse() == OldUse;
3663 assert((IsDest && II.getRawDest() == OldPtr) ||
3664 (!IsDest && II.getRawSource() == OldPtr));
3665
3666 Align SliceAlign = getSliceAlign();
3667 // For unsplit intrinsics, we simply modify the source and destination
3668 // pointers in place. This isn't just an optimization, it is a matter of
3669 // correctness. With unsplit intrinsics we may be dealing with transfers
3670 // within a single alloca before SROA ran, or with transfers that have
3671 // a variable length. We may also be dealing with memmove instead of
3672 // memcpy, and so simply updating the pointers is necessary for us to
3673 // update both source and dest of a single call.
3674 if (!IsSplittable) {
3675 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3676 if (IsDest) {
3677 // Update the address component of linked dbg.assigns.
3678 for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) {
3679 if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3680 DbgAssign->getAddress() == II.getDest())
3681 DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3682 }
3683 II.setDest(AdjustedPtr);
3684 II.setDestAlignment(SliceAlign);
3685 } else {
3686 II.setSource(AdjustedPtr);
3687 II.setSourceAlignment(SliceAlign);
3688 }
3689
3690 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3691 deleteIfTriviallyDead(OldPtr);
3692 return false;
3693 }
3694 // For split transfer intrinsics we have an incredibly useful assurance:
3695 // the source and destination do not reside within the same alloca, and at
3696 // least one of them does not escape. This means that we can replace
3697 // memmove with memcpy, and we don't need to worry about all manner of
3698 // downsides to splitting and transforming the operations.
3699
3700 // If this doesn't map cleanly onto the alloca type, and that type isn't
3701 // a single value type, just emit a memcpy.
3702 bool EmitMemCpy =
3703 !VecTy && !IntTy &&
3704 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3705 SliceSize !=
3706 DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() ||
3707 !DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) ||
3708 !NewAI.getAllocatedType()->isSingleValueType());
3709
3710 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3711 // size hasn't been shrunk based on analysis of the viable range, this is
3712 // a no-op.
3713 if (EmitMemCpy && &OldAI == &NewAI) {
3714 // Ensure the start lines up.
3715 assert(NewBeginOffset == BeginOffset);
3716
3717 // Rewrite the size as needed.
3718 if (NewEndOffset != EndOffset)
3719 II.setLength(NewEndOffset - NewBeginOffset);
3720 return false;
3721 }
3722 // Record this instruction for deletion.
3723 Pass.DeadInsts.push_back(&II);
3724
3725 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3726 // alloca that should be re-examined after rewriting this instruction.
3727 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3728 if (AllocaInst *AI =
3729 dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
3730 assert(AI != &OldAI && AI != &NewAI &&
3731 "Splittable transfers cannot reach the same alloca on both ends.");
3732 Pass.Worklist.insert(AI);
3733 }
3734
3735 Type *OtherPtrTy = OtherPtr->getType();
3736 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3737
3738 // Compute the relative offset for the other pointer within the transfer.
3739 unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
3740 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3741 Align OtherAlign =
3742 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3743 OtherAlign =
3744 commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
3745
3746 if (EmitMemCpy) {
3747 // Compute the other pointer, folding as much as possible to produce
3748 // a single, simple GEP in most cases.
3749 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3750 OtherPtr->getName() + ".");
3751
3752 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3753 Type *SizeTy = II.getLength()->getType();
3754 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3755
3756 Value *DestPtr, *SrcPtr;
3757 MaybeAlign DestAlign, SrcAlign;
3758 // Note: IsDest is true iff we're copying into the new alloca slice
3759 if (IsDest) {
3760 DestPtr = OurPtr;
3761 DestAlign = SliceAlign;
3762 SrcPtr = OtherPtr;
3763 SrcAlign = OtherAlign;
3764 } else {
3765 DestPtr = OtherPtr;
3766 DestAlign = OtherAlign;
3767 SrcPtr = OurPtr;
3768 SrcAlign = SliceAlign;
3769 }
3770 CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3771 Size, II.isVolatile());
3772 if (AATags)
3773 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3774
3775 APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
3776 if (IsDest) {
3777 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
3778 &II, New, DestPtr, nullptr, DL);
3779 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3780 OtherPtr->stripAndAccumulateConstantOffsets(
3781 DL, Offset, /*AllowNonInbounds*/ true))) {
3782 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
3783 SliceSize * 8, &II, New, DestPtr, nullptr, DL);
3784 }
3785 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3786 return false;
3787 }
3788
3789 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3790 NewEndOffset == NewAllocaEndOffset;
3791 uint64_t Size = NewEndOffset - NewBeginOffset;
3792 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3793 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3794 unsigned NumElements = EndIndex - BeginIndex;
3795 IntegerType *SubIntTy =
3796 IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3797
3798 // Reset the other pointer type to match the register type we're going to
3799 // use, but using the address space of the original other pointer.
3800 Type *OtherTy;
3801 if (VecTy && !IsWholeAlloca) {
3802 if (NumElements == 1)
3803 OtherTy = VecTy->getElementType();
3804 else
3805 OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3806 } else if (IntTy && !IsWholeAlloca) {
3807 OtherTy = SubIntTy;
3808 } else {
3809 OtherTy = NewAllocaTy;
3810 }
3811
3812 Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3813 OtherPtr->getName() + ".");
3814 MaybeAlign SrcAlign = OtherAlign;
3815 MaybeAlign DstAlign = SliceAlign;
3816 if (!IsDest)
3817 std::swap(SrcAlign, DstAlign);
3818
3819 Value *SrcPtr;
3820 Value *DstPtr;
3821
3822 if (IsDest) {
3823 DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3824 SrcPtr = AdjPtr;
3825 } else {
3826 DstPtr = AdjPtr;
3827 SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
3828 }
3829
3830 Value *Src;
3831 if (VecTy && !IsWholeAlloca && !IsDest) {
3832 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3833 NewAI.getAlign(), "load");
3834 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3835 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3836 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3837 NewAI.getAlign(), "load");
3838 Src = convertValue(DL, IRB, Src, IntTy);
3839 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3840 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3841 } else {
3842 LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3843 II.isVolatile(), "copyload");
3844 Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3845 LLVMContext::MD_access_group});
3846 if (AATags)
3847 Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3848 Load->getType(), DL));
3849 Src = Load;
3850 }
3851
3852 if (VecTy && !IsWholeAlloca && IsDest) {
3853 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3854 NewAI.getAlign(), "oldload");
3855 Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3856 } else if (IntTy && !IsWholeAlloca && IsDest) {
3857 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3858 NewAI.getAlign(), "oldload");
3859 Old = convertValue(DL, IRB, Old, IntTy);
3860 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3861 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3862 Src = convertValue(DL, IRB, Src, NewAllocaTy);
3863 }
3864
3865 StoreInst *Store = cast<StoreInst>(
3866 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3867 Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3868 LLVMContext::MD_access_group});
3869 if (AATags)
3870 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3871 Src->getType(), DL));
3872
3873 APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
3874 if (IsDest) {
3875
3876 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3877 Store, DstPtr, Src, DL);
3878 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3879 OtherPtr->stripAndAccumulateConstantOffsets(
3880 DL, Offset, /*AllowNonInbounds*/ true))) {
3881 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
3882 &II, Store, DstPtr, Src, DL);
3883 }
3884
3885 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3886 return !II.isVolatile();
3887 }
3888
3889 bool visitIntrinsicInst(IntrinsicInst &II) {
3890 assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3891 "Unexpected intrinsic!");
3892 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3893
3894 // Record this instruction for deletion.
3895 Pass.DeadInsts.push_back(&II);
3896
3897 if (II.isDroppable()) {
3898 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3899 // TODO For now we forget assumed information, this can be improved.
3900 OldPtr->dropDroppableUsesIn(II);
3901 return true;
3902 }
3903
3904 assert(II.getArgOperand(0) == OldPtr);
3905 Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
3906 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3907 Value *New;
3908 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3909 New = IRB.CreateLifetimeStart(Ptr);
3910 else
3911 New = IRB.CreateLifetimeEnd(Ptr);
3912
3913 (void)New;
3914 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3915
3916 return true;
3917 }
3918
3919 void fixLoadStoreAlign(Instruction &Root) {
3920 // This algorithm implements the same visitor loop as
3921 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3922 // or store found.
3923 SmallPtrSet<Instruction *, 4> Visited;
3924 SmallVector<Instruction *, 4> Uses;
3925 Visited.insert(&Root);
3926 Uses.push_back(&Root);
3927 do {
3928 Instruction *I = Uses.pop_back_val();
3929
3930 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3931 LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3932 continue;
3933 }
3934 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3935 SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3936 continue;
3937 }
3938
3939 assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3940 isa<PHINode>(I) || isa<SelectInst>(I) ||
3941 isa<GetElementPtrInst>(I));
3942 for (User *U : I->users())
3943 if (Visited.insert(cast<Instruction>(U)).second)
3944 Uses.push_back(cast<Instruction>(U));
3945 } while (!Uses.empty());
3946 }
3947
3948 bool visitPHINode(PHINode &PN) {
3949 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3950 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3951 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3952
3953 // We would like to compute a new pointer in only one place, but have it be
3954 // as local as possible to the PHI. To do that, we re-use the location of
3955 // the old pointer, which necessarily must be in the right position to
3956 // dominate the PHI.
3957 IRBuilderBase::InsertPointGuard Guard(IRB);
3958 if (isa<PHINode>(OldPtr))
3959 IRB.SetInsertPoint(OldPtr->getParent(),
3960 OldPtr->getParent()->getFirstInsertionPt());
3961 else
3962 IRB.SetInsertPoint(OldPtr);
3963 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3964
3965 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3966 // Replace the operands which were using the old pointer.
3967 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3968
3969 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3970 deleteIfTriviallyDead(OldPtr);
3971
3972 // Fix the alignment of any loads or stores using this PHI node.
3973 fixLoadStoreAlign(PN);
3974
3975 // PHIs can't be promoted on their own, but often can be speculated. We
3976 // check the speculation outside of the rewriter so that we see the
3977 // fully-rewritten alloca.
3978 PHIUsers.insert(&PN);
3979 return true;
3980 }
3981
3982 bool visitSelectInst(SelectInst &SI) {
3983 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3984 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3985 "Pointer isn't an operand!");
3986 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3987 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3988
3989 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3990 // Replace the operands which were using the old pointer.
3991 if (SI.getOperand(1) == OldPtr)
3992 SI.setOperand(1, NewPtr);
3993 if (SI.getOperand(2) == OldPtr)
3994 SI.setOperand(2, NewPtr);
3995
3996 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3997 deleteIfTriviallyDead(OldPtr);
3998
3999 // Fix the alignment of any loads or stores using this select.
4000 fixLoadStoreAlign(SI);
4001
4002 // Selects can't be promoted on their own, but often can be speculated. We
4003 // check the speculation outside of the rewriter so that we see the
4004 // fully-rewritten alloca.
4005 SelectUsers.insert(&SI);
4006 return true;
4007 }
4008};
4009
4010/// Visitor to rewrite aggregate loads and stores as scalar.
4011///
4012/// This pass aggressively rewrites all aggregate loads and stores on
4013/// a particular pointer (or any pointer derived from it which we can identify)
4014/// with scalar loads and stores.
4015class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
4016 // Befriend the base class so it can delegate to private visit methods.
4017 friend class InstVisitor<AggLoadStoreRewriter, bool>;
4018
4019 /// Queue of pointer uses to analyze and potentially rewrite.
4020 SmallVector<Use *, 8> Queue;
4021
4022 /// Set to prevent us from cycling with phi nodes and loops.
4023 SmallPtrSet<User *, 8> Visited;
4024
4025 /// The current pointer use being rewritten. This is used to dig up the used
4026 /// value (as opposed to the user).
4027 Use *U = nullptr;
4028
4029 /// Used to calculate offsets, and hence alignment, of subobjects.
4030 const DataLayout &DL;
4031
4032 IRBuilderTy &IRB;
4033
4034public:
4035 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
4036 : DL(DL), IRB(IRB) {}
4037
4038 /// Rewrite loads and stores through a pointer and all pointers derived from
4039 /// it.
4040 bool rewrite(Instruction &I) {
4041 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
4042 enqueueUsers(I);
4043 bool Changed = false;
4044 while (!Queue.empty()) {
4045 U = Queue.pop_back_val();
4046 Changed |= visit(cast<Instruction>(U->getUser()));
4047 }
4048 return Changed;
4049 }
4050
4051private:
4052 /// Enqueue all the users of the given instruction for further processing.
4053 /// This uses a set to de-duplicate users.
4054 void enqueueUsers(Instruction &I) {
4055 for (Use &U : I.uses())
4056 if (Visited.insert(U.getUser()).second)
4057 Queue.push_back(&U);
4058 }
4059
4060 // Conservative default is to not rewrite anything.
4061 bool visitInstruction(Instruction &I) { return false; }
4062
4063 /// Generic recursive split emission class.
4064 template <typename Derived> class OpSplitter {
4065 protected:
4066 /// The builder used to form new instructions.
4067 IRBuilderTy &IRB;
4068
4069 /// The indices to be used with insert- or extractvalue to select the
4070 /// appropriate value within the aggregate.
4071 SmallVector<unsigned, 4> Indices;
4072
4073 /// The indices to a GEP instruction which will move Ptr to the correct slot
4074 /// within the aggregate.
4075 SmallVector<Value *, 4> GEPIndices;
4076
4077 /// The base pointer of the original op, used as a base for GEPing the
4078 /// split operations.
4079 Value *Ptr;
4080
4081 /// The base pointee type being GEPed into.
4082 Type *BaseTy;
4083
4084 /// Known alignment of the base pointer.
4085 Align BaseAlign;
4086
4087 /// To calculate offset of each component so we can correctly deduce
4088 /// alignments.
4089 const DataLayout &DL;
4090
4091 /// Initialize the splitter with an insertion point, Ptr and start with a
4092 /// single zero GEP index.
4093 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4094 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
4095 : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
4096 BaseAlign(BaseAlign), DL(DL) {
4097 IRB.SetInsertPoint(InsertionPoint);
4098 }
4099
4100 public:
4101 /// Generic recursive split emission routine.
4102 ///
4103 /// This method recursively splits an aggregate op (load or store) into
4104 /// scalar or vector ops. It splits recursively until it hits a single value
4105 /// and emits that single value operation via the template argument.
4106 ///
4107 /// The logic of this routine relies on GEPs and insertvalue and
4108 /// extractvalue all operating with the same fundamental index list, merely
4109 /// formatted differently (GEPs need actual values).
4110 ///
4111 /// \param Ty The type being split recursively into smaller ops.
4112 /// \param Agg The aggregate value being built up or stored, depending on
4113 /// whether this is splitting a load or a store respectively.
4114 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
4115 if (Ty->isSingleValueType()) {
4116 unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
4117 return static_cast<Derived *>(this)->emitFunc(
4118 Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
4119 }
4120
4121 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
4122 unsigned OldSize = Indices.size();
4123 (void)OldSize;
4124 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
4125 ++Idx) {
4126 assert(Indices.size() == OldSize && "Did not return to the old size");
4127 Indices.push_back(Idx);
4128 GEPIndices.push_back(IRB.getInt32(Idx));
4129 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
4130 GEPIndices.pop_back();
4131 Indices.pop_back();
4132 }
4133 return;
4134 }
4135
4136 if (StructType *STy = dyn_cast<StructType>(Ty)) {
4137 unsigned OldSize = Indices.size();
4138 (void)OldSize;
4139 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
4140 ++Idx) {
4141 assert(Indices.size() == OldSize && "Did not return to the old size");
4142 Indices.push_back(Idx);
4143 GEPIndices.push_back(IRB.getInt32(Idx));
4144 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
4145 GEPIndices.pop_back();
4146 Indices.pop_back();
4147 }
4148 return;
4149 }
4150
4151 llvm_unreachable("Only arrays and structs are aggregate loadable types");
4152 }
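// For example, splitting an op on the hypothetical aggregate type
// { i32, [2 x float] } emits three leaf operations with GEP index paths
// (0,0), (0,1,0) and (0,1,1) and name suffixes ".0", ".1.0" and ".1.1".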
4153 };
4154
4155 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
4156 AAMDNodes AATags;
4157 // A vector to hold the split components that we want to emit
4158 // separate fake uses for.
4159 SmallVector<Value *, 4> Components;
4160 // A vector to hold all the fake uses of the struct that we are splitting.
4161 // Usually there should only be one, but we are handling the general case.
4162 SmallVector<Instruction *, 4> FakeUses;
4163
4164 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4165 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
4166 IRBuilderTy &IRB)
4167 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
4168 IRB),
4169 AATags(AATags) {}
4170
4171 /// Emit a leaf load of a single value. This is called at the leaves of the
4172 /// recursive emission to actually load values.
4173 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4174 assert(Ty->isSingleValueType());
4175 // Load the single value and insert it using the indices.
4176 Value *GEP =
4177 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4178 LoadInst *Load =
4179 IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
4180
4181 APInt Offset(
4182 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4183 if (AATags &&
4184 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
4185 Load->setAAMetadata(
4186 AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL));
4187 // Record the load so we can generate a fake use for this aggregate
4188 // component.
4189 Components.push_back(Load);
4190
4191 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
4192 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
4193 }
4194
4195 // Stash the fake uses that use the value generated by this instruction.
4196 void recordFakeUses(LoadInst &LI) {
4197 for (Use &U : LI.uses())
4198 if (auto *II = dyn_cast<IntrinsicInst>(U.getUser()))
4199 if (II->getIntrinsicID() == Intrinsic::fake_use)
4200 FakeUses.push_back(II);
4201 }
4202
4203 // Replace all fake uses of the aggregate with a series of fake uses, one
4204 // for each split component.
4205 void emitFakeUses() {
4206 for (Instruction *I : FakeUses) {
4207 IRB.SetInsertPoint(I);
4208 for (auto *V : Components)
4209 IRB.CreateIntrinsic(Intrinsic::fake_use, {V});
4210 I->eraseFromParent();
4211 }
4212 }
4213 };
4214
4215 bool visitLoadInst(LoadInst &LI) {
4216 assert(LI.getPointerOperand() == *U);
4217 if (!LI.isSimple() || LI.getType()->isSingleValueType())
4218 return false;
4219
4220 // We have an aggregate being loaded, split it apart.
4221 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4222 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4223 getAdjustedAlignment(&LI, 0), DL, IRB);
4224 Splitter.recordFakeUses(LI);
4225 Value *V = PoisonValue::get(LI.getType());
4226 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
4227 Splitter.emitFakeUses();
4228 Visited.erase(&LI);
4229 LI.replaceAllUsesWith(V);
4230 LI.eraseFromParent();
4231 return true;
4232 }
4233
4234 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4235 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4236 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4237 const DataLayout &DL, IRBuilderTy &IRB)
4238 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4239 DL, IRB),
4240 AATags(AATags), AggStore(AggStore) {}
4241 AAMDNodes AATags;
4242 StoreInst *AggStore;
4243 /// Emit a leaf store of a single value. This is called at the leaves of the
4244 /// recursive emission to actually produce stores.
4245 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4246 assert(Ty->isSingleValueType());
4247 // Extract the single value and store it using the indices.
4248 //
4249 // The gep and extractvalue values are factored out of the CreateStore
4250 // call to make the output independent of the argument evaluation order.
4251 Value *ExtractValue =
4252 IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
4253 Value *InBoundsGEP =
4254 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4255 StoreInst *Store =
4256 IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
4257
4258 APInt Offset(
4259 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4260 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
4261 if (AATags) {
4262 Store->setAAMetadata(AATags.adjustForAccess(
4263 Offset.getZExtValue(), ExtractValue->getType(), DL));
4264 }
4265
4266 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4267 // If we cannot (because there's an intervening non-const or unbounded
4268 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4269 // this instruction.
4270 Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4271 if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
4272 uint64_t SizeInBits =
4273 DL.getTypeSizeInBits(Store->getValueOperand()->getType());
4274 migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
4275 SizeInBits, AggStore, Store,
4276 Store->getPointerOperand(), Store->getValueOperand(),
4277 DL);
4278 } else {
4280 "AT: unexpected debug.assign linked to store through "
4281 "unbounded GEP");
4282 }
4283 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4284 }
4285 };
4286
4287 bool visitStoreInst(StoreInst &SI) {
4288 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4289 return false;
4290 Value *V = SI.getValueOperand();
4291 if (V->getType()->isSingleValueType())
4292 return false;
4293
4294 // We have an aggregate being stored, split it apart.
4295 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4296 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4297 getAdjustedAlignment(&SI, 0), DL, IRB);
4298 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
4299 Visited.erase(&SI);
4300 // The stores replacing SI each have markers describing fragments of the
4301 // assignment so delete the assignment markers linked to SI.
4302 at::deleteAssignmentMarkers(&SI);
4303 SI.eraseFromParent();
4304 return true;
4305 }
4306
4307 bool visitBitCastInst(BitCastInst &BC) {
4308 enqueueUsers(BC);
4309 return false;
4310 }
4311
4312 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4313 enqueueUsers(ASC);
4314 return false;
4315 }
4316
4317 // Unfold gep (select cond, ptr1, ptr2), idx
4318 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4319 // and gep ptr, (select cond, idx1, idx2)
4320 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
4321 // We also allow for i1 zext indices, which are equivalent to selects.
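// For example (hypothetical IR):
//   %p = select i1 %c, ptr %a, ptr %b
//   %g = getelementptr i32, ptr %p, i64 1
// becomes
//   %g.t = getelementptr i32, ptr %a, i64 1
//   %g.f = getelementptr i32, ptr %b, i64 1
//   %g = select i1 %c, ptr %g.t, ptr %g.f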
4322 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4323 // Check whether the GEP has exactly one select operand and all indices
4324 // will become constant after the transform.
4325 Instruction *Sel = dyn_cast<SelectInst>(GEPI.getPointerOperand());
4326 for (Value *Op : GEPI.indices()) {
4327 if (auto *SI = dyn_cast<SelectInst>(Op)) {
4328 if (Sel)
4329 return false;
4330
4331 Sel = SI;
4332 if (!isa<ConstantInt>(SI->getTrueValue()) ||
4333 !isa<ConstantInt>(SI->getFalseValue()))
4334 return false;
4335 continue;
4336 }
4337 if (auto *ZI = dyn_cast<ZExtInst>(Op)) {
4338 if (Sel)
4339 return false;
4340 Sel = ZI;
4341 if (!ZI->getSrcTy()->isIntegerTy(1))
4342 return false;
4343 continue;
4344 }
4345
4346 if (!isa<ConstantInt>(Op))
4347 return false;
4348 }
4349
4350 if (!Sel)
4351 return false;
4352
4353 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4354 dbgs() << " original: " << *Sel << "\n";
4355 dbgs() << " " << GEPI << "\n";);
4356
4357 auto GetNewOps = [&](Value *SelOp) {
4358 SmallVector<Value *> NewOps;
4359 for (Value *Op : GEPI.operands())
4360 if (Op == Sel)
4361 NewOps.push_back(SelOp);
4362 else
4363 NewOps.push_back(Op);
4364 return NewOps;
4365 };
4366
4367 Value *Cond, *True, *False;
4368 Instruction *MDFrom = nullptr;
4369 if (auto *SI = dyn_cast<SelectInst>(Sel)) {
4370 Cond = SI->getCondition();
4371 True = SI->getTrueValue();
4372 False = SI->getFalseValue();
4373 if (!ProfcheckDisableMetadataFixes)
4374 MDFrom = SI;
4375 } else {
4376 Cond = Sel->getOperand(0);
4377 True = ConstantInt::get(Sel->getType(), 1);
4378 False = ConstantInt::get(Sel->getType(), 0);
4379 }
4380 SmallVector<Value *> TrueOps = GetNewOps(True);
4381 SmallVector<Value *> FalseOps = GetNewOps(False);
4382
4383 IRB.SetInsertPoint(&GEPI);
4384 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4385
4386 Type *Ty = GEPI.getSourceElementType();
4387 Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
4388 True->getName() + ".sroa.gep", NW);
4389
4390 Value *NFalse =
4391 IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
4392 False->getName() + ".sroa.gep", NW);
4393
4394 Value *NSel = MDFrom
4395 ? IRB.CreateSelect(Cond, NTrue, NFalse,
4396 Sel->getName() + ".sroa.sel", MDFrom)
4397 : IRB.CreateSelectWithUnknownProfile(
4398 Cond, NTrue, NFalse, DEBUG_TYPE,
4399 Sel->getName() + ".sroa.sel");
4400 Visited.erase(&GEPI);
4401 GEPI.replaceAllUsesWith(NSel);
4402 GEPI.eraseFromParent();
4403 Instruction *NSelI = cast<Instruction>(NSel);
4404 Visited.insert(NSelI);
4405 enqueueUsers(*NSelI);
4406
4407 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4408 dbgs() << " " << *NFalse << "\n";
4409 dbgs() << " " << *NSel << "\n";);
4410
4411 return true;
4412 }
4413
4414 // Unfold gep (phi ptr1, ptr2), idx
4415 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4416 // and gep ptr, (phi idx1, idx2)
4417 // => phi ((gep ptr, idx1), (gep ptr, idx2))
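// For example (hypothetical IR), with %a and %b available in the entry
// block:
//   %p = phi ptr [ %a, %bb0 ], [ %b, %bb1 ]
//   %g = getelementptr i32, ptr %p, i64 2
// becomes (the new GEPs are emitted at the end of the entry block)
//   %g.a = getelementptr i32, ptr %a, i64 2
//   %g.b = getelementptr i32, ptr %b, i64 2
//   %g = phi ptr [ %g.a, %bb0 ], [ %g.b, %bb1 ]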
4418 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4419 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4420 // operand (looking through the phi if it is the phi we want to unfold) is
4421 // an instruction besides a static alloca.
4422 PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
4423 auto IsInvalidPointerOperand = [](Value *V) {
4424 if (!isa<Instruction>(V))
4425 return false;
4426 if (auto *AI = dyn_cast<AllocaInst>(V))
4427 return !AI->isStaticAlloca();
4428 return true;
4429 };
4430 if (Phi) {
4431 if (any_of(Phi->operands(), IsInvalidPointerOperand))
4432 return false;
4433 } else {
4434 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4435 return false;
4436 }
4437 // Check whether the GEP has exactly one phi operand (including the pointer
4438 // operand) and all indices will become constant after the transform.
4439 for (Value *Op : GEPI.indices()) {
4440 if (auto *SI = dyn_cast<PHINode>(Op)) {
4441 if (Phi)
4442 return false;
4443
4444 Phi = SI;
4445 if (!all_of(Phi->incoming_values(),
4446 [](Value *V) { return isa<ConstantInt>(V); }))
4447 return false;
4448 continue;
4449 }
4450
4451 if (!isa<ConstantInt>(Op))
4452 return false;
4453 }
4454
4455 if (!Phi)
4456 return false;
4457
4458 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4459 dbgs() << " original: " << *Phi << "\n";
4460 dbgs() << " " << GEPI << "\n";);
4461
4462 auto GetNewOps = [&](Value *PhiOp) {
4463 SmallVector<Value *> NewOps;
4464 for (Value *Op : GEPI.operands())
4465 if (Op == Phi)
4466 NewOps.push_back(PhiOp);
4467 else
4468 NewOps.push_back(Op);
4469 return NewOps;
4470 };
4471
4472 IRB.SetInsertPoint(Phi);
4473 PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
4474 Phi->getName() + ".sroa.phi");
4475
4476 Type *SourceTy = GEPI.getSourceElementType();
4477 // We only handle arguments, constants, and static allocas here, so we can
4478 // insert GEPs at the end of the entry block.
4479 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4480 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4481 Value *Op = Phi->getIncomingValue(I);
4482 BasicBlock *BB = Phi->getIncomingBlock(I);
4483 Value *NewGEP;
4484 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4485 NewGEP = NewPhi->getIncomingValue(NI);
4486 } else {
4487 SmallVector<Value *> NewOps = GetNewOps(Op);
4488 NewGEP =
4489 IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
4490 Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags());
4491 }
4492 NewPhi->addIncoming(NewGEP, BB);
4493 }
4494
4495 Visited.erase(&GEPI);
4496 GEPI.replaceAllUsesWith(NewPhi);
4497 GEPI.eraseFromParent();
4498 Visited.insert(NewPhi);
4499 enqueueUsers(*NewPhi);
4500
4501 LLVM_DEBUG(dbgs() << " to: ";
4502 for (Value *In
4503 : NewPhi->incoming_values()) dbgs()
4504 << "\n " << *In;
4505 dbgs() << "\n " << *NewPhi << '\n');
4506
4507 return true;
4508 }
4509
4510 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4511 if (unfoldGEPSelect(GEPI))
4512 return true;
4513
4514 if (unfoldGEPPhi(GEPI))
4515 return true;
4516
4517 enqueueUsers(GEPI);
4518 return false;
4519 }
4520
4521 bool visitPHINode(PHINode &PN) {
4522 enqueueUsers(PN);
4523 return false;
4524 }
4525
4526 bool visitSelectInst(SelectInst &SI) {
4527 enqueueUsers(SI);
4528 return false;
4529 }
4530};
4531
4532} // end anonymous namespace
4533
4534/// Strip aggregate type wrapping.
4535///
4536/// This removes no-op aggregate types wrapping an underlying type. It will
4537/// strip as many layers of types as it can without changing either the type
4538/// size or the allocated size.
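/// For example, [1 x float] and { { float } } are both stripped down to the
/// inner float, while { float, i32 } is left unchanged because the float at
/// offset zero does not cover the struct's full size.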
4539 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4540 if (Ty->isSingleValueType())
4541 return Ty;
4542
4543 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4544 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4545
4546 Type *InnerTy;
4547 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
4548 InnerTy = ArrTy->getElementType();
4549 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
4550 const StructLayout *SL = DL.getStructLayout(STy);
4551 unsigned Index = SL->getElementContainingOffset(0);
4552 InnerTy = STy->getElementType(Index);
4553 } else {
4554 return Ty;
4555 }
4556
4557 if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
4558 TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
4559 return Ty;
4560
4561 return stripAggregateTypeWrapping(DL, InnerTy);
4562}
4563
4564/// Try to find a partition of the aggregate type passed in for a given
4565/// offset and size.
4566///
4567/// This recurses through the aggregate type and tries to compute a subtype
4568/// based on the offset and size. When the offset and size span a sub-section
4569/// of an array, it will even compute a new array type for that sub-section,
4570/// and the same for structs.
4571///
4572/// Note that this routine is very strict and tries to find a partition of the
4573/// type which produces the *exact* right offset and size. It is not forgiving
4574 /// when the size or offset causes either end of the type-based partition to be off.
4575/// Also, this is a best-effort routine. It is reasonable to give up and not
4576/// return a type if necessary.
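/// For example, given %S = type { i32, i32, i32, i32 }, an offset of 4 with a
/// size of 8 yields the sub-struct { i32, i32 }, while an offset of 2 with a
/// size of 4 straddles two elements and yields no type (nullptr).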
4577 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4578 uint64_t Size) {
4579 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4580 return stripAggregateTypeWrapping(DL, Ty);
4581 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4582 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4583 return nullptr;
4584
4585 if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
4586 Type *ElementTy;
4587 uint64_t TyNumElements;
4588 if (auto *AT = dyn_cast<ArrayType>(Ty)) {
4589 ElementTy = AT->getElementType();
4590 TyNumElements = AT->getNumElements();
4591 } else {
4592 // FIXME: This isn't right for vectors with non-byte-sized or
4593 // non-power-of-two sized elements.
4594 auto *VT = cast<FixedVectorType>(Ty);
4595 ElementTy = VT->getElementType();
4596 TyNumElements = VT->getNumElements();
4597 }
4598 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4599 uint64_t NumSkippedElements = Offset / ElementSize;
4600 if (NumSkippedElements >= TyNumElements)
4601 return nullptr;
4602 Offset -= NumSkippedElements * ElementSize;
4603
4604 // First check if we need to recurse.
4605 if (Offset > 0 || Size < ElementSize) {
4606 // Bail if the partition ends in a different array element.
4607 if ((Offset + Size) > ElementSize)
4608 return nullptr;
4609 // Recurse through the element type trying to peel off offset bytes.
4610 return getTypePartition(DL, ElementTy, Offset, Size);
4611 }
4612 assert(Offset == 0);
4613
4614 if (Size == ElementSize)
4615 return stripAggregateTypeWrapping(DL, ElementTy);
4616 assert(Size > ElementSize);
4617 uint64_t NumElements = Size / ElementSize;
4618 if (NumElements * ElementSize != Size)
4619 return nullptr;
4620 return ArrayType::get(ElementTy, NumElements);
4621 }
4622
4623 StructType *STy = dyn_cast<StructType>(Ty);
4624 if (!STy)
4625 return nullptr;
4626
4627 const StructLayout *SL = DL.getStructLayout(STy);
4628
4629 if (SL->getSizeInBits().isScalable())
4630 return nullptr;
4631
4632 if (Offset >= SL->getSizeInBytes())
4633 return nullptr;
4634 uint64_t EndOffset = Offset + Size;
4635 if (EndOffset > SL->getSizeInBytes())
4636 return nullptr;
4637
4638 unsigned Index = SL->getElementContainingOffset(Offset);
4639 Offset -= SL->getElementOffset(Index);
4640
4641 Type *ElementTy = STy->getElementType(Index);
4642 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4643 if (Offset >= ElementSize)
4644 return nullptr; // The offset points into alignment padding.
4645
4646 // See if any partition must be contained by the element.
4647 if (Offset > 0 || Size < ElementSize) {
4648 if ((Offset + Size) > ElementSize)
4649 return nullptr;
4650 return getTypePartition(DL, ElementTy, Offset, Size);
4651 }
4652 assert(Offset == 0);
4653
4654 if (Size == ElementSize)
4655 return stripAggregateTypeWrapping(DL, ElementTy);
4656
4657 StructType::element_iterator EI = STy->element_begin() + Index,
4658 EE = STy->element_end();
4659 if (EndOffset < SL->getSizeInBytes()) {
4660 unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
4661 if (Index == EndIndex)
4662 return nullptr; // Within a single element and its padding.
4663
4664 // Don't try to form "natural" types if the elements don't line up with the
4665 // expected size.
4666 // FIXME: We could potentially recurse down through the last element in the
4667 // sub-struct to find a natural end point.
4668 if (SL->getElementOffset(EndIndex) != EndOffset)
4669 return nullptr;
4670
4671 assert(Index < EndIndex);
4672 EE = STy->element_begin() + EndIndex;
4673 }
4674
4675 // Try to build up a sub-structure.
4676 StructType *SubTy =
4677 StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
4678 const StructLayout *SubSL = DL.getStructLayout(SubTy);
4679 if (Size != SubSL->getSizeInBytes())
4680 return nullptr; // The sub-struct doesn't have quite the size needed.
4681
4682 return SubTy;
4683}
4684
4685/// Pre-split loads and stores to simplify rewriting.
4686///
4687/// We want to break up the splittable load+store pairs as much as
4688/// possible. This is important to do as a preprocessing step, as once we
4689/// start rewriting the accesses to partitions of the alloca we lose the
4690/// necessary information to correctly split apart paired loads and stores
4691/// which both point into this alloca. The case to consider is something like
4692/// the following:
4693///
4694/// %a = alloca [12 x i8]
4695/// %gep1 = getelementptr i8, ptr %a, i32 0
4696/// %gep2 = getelementptr i8, ptr %a, i32 4
4697/// %gep3 = getelementptr i8, ptr %a, i32 8
4698/// store float 0.0, ptr %gep1
4699/// store float 1.0, ptr %gep2
4700/// %v = load i64, ptr %gep1
4701/// store i64 %v, ptr %gep2
4702/// %f1 = load float, ptr %gep2
4703/// %f2 = load float, ptr %gep3
4704///
4705/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4706/// promote everything so we recover the 2 SSA values that should have been
4707/// there all along.
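///
/// Concretely (IR value names are illustrative), pre-splitting rewrites the
/// paired i64 load and store above into two i32-sized halves, roughly:
///
///   %v.lo = load i32, ptr %gep1
///   %v.hi = load i32, ptr %gep2
///   store i32 %v.lo, ptr %gep2
///   store i32 %v.hi, ptr %gep3
///
/// after which every access lines up exactly with one 4-byte partition.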
4708///
4709/// \returns true if any changes are made.
4710bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4711 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4712
4713 // Track the loads and stores which are candidates for pre-splitting here, in
4714 // the order they first appear during the partition scan. These give stable
4715 // iteration order and a basis for tracking which loads and stores we
4716 // actually split.
4717 SmallVector<LoadInst *, 4> Loads;
4718 SmallVector<StoreInst *, 4> Stores;
4719
4720 // We need to accumulate the splits required of each load or store where we
4721 // can find them via a direct lookup. This is important to cross-check loads
4722 // and stores against each other. We also track the slice so that we can kill
4723 // all the slices that end up split.
4724 struct SplitOffsets {
4725 Slice *S;
4726 std::vector<uint64_t> Splits;
4727 };
4728 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
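// For the example in the comment above this function, the splittable i64
// load spans the partitions [0, 4) and [4, 8), so its SplitOffsets entry
// ends up with Splits == {4}: only interior split points are recorded,
// relative to the slice's begin offset; the final end point is implied by
// the slice's own size.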
4729
4730 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4731 // This is important as we also cannot pre-split stores of those loads!
4732 // FIXME: This is all pretty gross. It means that we can be more aggressive
4733 // in pre-splitting when the load feeding the store happens to come from
4734 // a separate alloca. Put another way, the effectiveness of SROA would be
4735 // decreased by a frontend which just concatenated all of its local allocas
4736 // into one big flat alloca. But defeating such patterns is exactly the job
4737 // SROA is tasked with! Sadly, to not have this discrepancy we would have
4738 // to change store pre-splitting to actually force pre-splitting of the load
4739 // that feeds it *and all stores*. That makes pre-splitting much harder, but
4740 // maybe it would make it more principled?
4741 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4742
4743 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4744 for (auto &P : AS.partitions()) {
4745 for (Slice &S : P) {
4746 Instruction *I = cast<Instruction>(S.getUse()->getUser());
4747 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4748 // If this is a load we have to track that it can't participate in any
4749 // pre-splitting. If this is a store of a load we have to track that
4750 // that load also can't participate in any pre-splitting.
4751 if (auto *LI = dyn_cast<LoadInst>(I))
4752 UnsplittableLoads.insert(LI);
4753 else if (auto *SI = dyn_cast<StoreInst>(I))
4754 if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
4755 UnsplittableLoads.insert(LI);
4756 continue;
4757 }
4758 assert(P.endOffset() > S.beginOffset() &&
4759 "Empty or backwards partition!");
4760
4761 // Determine if this is a pre-splittable slice.
4762 if (auto *LI = dyn_cast<LoadInst>(I)) {
4763 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4764
4765 // The load must be used exclusively to store into other pointers for
4766 // us to be able to arbitrarily pre-split it. The stores must also be
4767 // simple to avoid changing semantics.
4768 auto IsLoadSimplyStored = [](LoadInst *LI) {
4769 for (User *LU : LI->users()) {
4770 auto *SI = dyn_cast<StoreInst>(LU);
4771 if (!SI || !SI->isSimple())
4772 return false;
4773 }
4774 return true;
4775 };
4776 if (!IsLoadSimplyStored(LI)) {
4777 UnsplittableLoads.insert(LI);
4778 continue;
4779 }
4780
4781 Loads.push_back(LI);
4782 } else if (auto *SI = dyn_cast<StoreInst>(I)) {
4783 if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
4784 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4785 continue;
4786 auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
4787 if (!StoredLoad || !StoredLoad->isSimple())
4788 continue;
4789 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4790
4791 Stores.push_back(SI);
4792 } else {
4793 // Other uses cannot be pre-split.
4794 continue;
4795 }
4796
4797 // Record the initial split.
4798 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4799 auto &Offsets = SplitOffsetsMap[I];
4800 assert(Offsets.Splits.empty() &&
4801 "Should not have splits the first time we see an instruction!");
4802 Offsets.S = &S;
4803 Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
4804 }
4805
4806 // Now scan the already split slices, and add a split for any of them which
4807 // we're going to pre-split.
4808 for (Slice *S : P.splitSliceTails()) {
4809 auto SplitOffsetsMapI =
4810 SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
4811 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4812 continue;
4813 auto &Offsets = SplitOffsetsMapI->second;
4814
4815 assert(Offsets.S == S && "Found a mismatched slice!");
4816 assert(!Offsets.Splits.empty() &&
4817 "Cannot have an empty set of splits on the second partition!");
4818 assert(Offsets.Splits.back() ==
4819 P.beginOffset() - Offsets.S->beginOffset() &&
4820 "Previous split does not end where this one begins!");
4821
4822 // Record each split. The last partition's end isn't needed as the size
4823 // of the slice dictates that.
4824 if (S->endOffset() > P.endOffset())
4825 Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
4826 }
4827 }
4828
4829 // We may have split loads where some of their stores are split stores. For
4830 // such loads and stores, we can only pre-split them if their splits exactly
4831 // match relative to their starting offset. We have to verify this prior to
4832 // any rewriting.
4833 llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4834 // Lookup the load we are storing in our map of split
4835 // offsets.
4836 auto *LI = cast<LoadInst>(SI->getValueOperand());
4837 // If it was completely unsplittable, then we're done,
4838 // and this store can't be pre-split.
4839 if (UnsplittableLoads.count(LI))
4840 return true;
4841
4842 auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4843 if (LoadOffsetsI == SplitOffsetsMap.end())
4844 return false; // Unrelated loads are definitely safe.
4845 auto &LoadOffsets = LoadOffsetsI->second;
4846
4847 // Now lookup the store's offsets.
4848 auto &StoreOffsets = SplitOffsetsMap[SI];
4849
4850 // If the relative offsets of each split in the load and
4851 // store match exactly, then we can split them and we
4852 // don't need to remove them here.
4853 if (LoadOffsets.Splits == StoreOffsets.Splits)
4854 return false;
4855
4856 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4857 << " " << *LI << "\n"
4858 << " " << *SI << "\n");
4859
4860 // We've found a store and load that we need to split
4861 // with mismatched relative splits. Just give up on them
4862 // and remove both instructions from our list of
4863 // candidates.
4864 UnsplittableLoads.insert(LI);
4865 return true;
4866 });
4867 // Now we have to go *back* through all the stores, because a later store may
4868 // have caused an earlier store's load to become unsplittable and if it is
4869 // unsplittable for the later store, then we can't rely on it being split in
4870 // the earlier store either.
4871 llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4872 auto *LI = cast<LoadInst>(SI->getValueOperand());
4873 return UnsplittableLoads.count(LI);
4874 });
4875 // Once we've established all the loads that can't be split for some reason,
4876 // filter out any that made it into our list.
4877 llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
4878 return UnsplittableLoads.count(LI);
4879 });
4880
4881 // If no loads or stores are left, there is no pre-splitting to be done for
4882 // this alloca.
4883 if (Loads.empty() && Stores.empty())
4884 return false;
4885
4886 // From here on, we can't fail and will be building new accesses, so rig up
4887 // an IR builder.
4888 IRBuilderTy IRB(&AI);
4889
4890 // Collect the new slices which we will merge into the alloca slices.
4891 SmallVector<Slice, 4> NewSlices;
4892
4893 // Track any allocas we end up splitting loads and stores for so we iterate
4894 // on them.
4895 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4896
4897 // At this point, we have collected all of the loads and stores we can
4898 // pre-split, and the specific splits needed for them. We actually do the
4899 // splitting in a specific order so that we can handle the case when one of
4900 // the loads is the value operand of one of the stores.
4901 //
4902 // First, we rewrite all of the split loads, and just accumulate each split
4903 // load in a parallel structure. We also build the slices for them and append
4904 // them to the alloca slices.
4905 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4906 std::vector<LoadInst *> SplitLoads;
4907 const DataLayout &DL = AI.getDataLayout();
4908 for (LoadInst *LI : Loads) {
4909 SplitLoads.clear();
4910
4911 auto &Offsets = SplitOffsetsMap[LI];
4912 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4913 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4914 "Load must have type size equal to store size");
4915 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4916 "Load must be >= slice size");
4917
4918 uint64_t BaseOffset = Offsets.S->beginOffset();
4919 assert(BaseOffset + SliceSize > BaseOffset &&
4920 "Cannot represent alloca access size using 64-bit integers!");
4921
4922 Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
4923 IRB.SetInsertPoint(LI);
4924
4925 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4926
4927 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4928 int Idx = 0, Size = Offsets.Splits.size();
4929 for (;;) {
4930 auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
4931 auto AS = LI->getPointerAddressSpace();
4932 auto *PartPtrTy = LI->getPointerOperandType();
4933 LoadInst *PLoad = IRB.CreateAlignedLoad(
4934 PartTy,
4935 getAdjustedPtr(IRB, DL, BasePtr,
4936 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4937 PartPtrTy, BasePtr->getName() + "."),
4938 getAdjustedAlignment(LI, PartOffset),
4939 /*IsVolatile*/ false, LI->getName());
4940 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4941 LLVMContext::MD_access_group});
4942
4943 // Append this load onto the list of split loads so we can find it later
4944 // to rewrite the stores.
4945 SplitLoads.push_back(PLoad);
4946
4947 // Now build a new slice for the alloca.
4948 NewSlices.push_back(
4949 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4950 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4951 /*IsSplittable*/ false));
4952 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4953 << ", " << NewSlices.back().endOffset()
4954 << "): " << *PLoad << "\n");
4955
4956 // See if we've handled all the splits.
4957 if (Idx >= Size)
4958 break;
4959
4960 // Setup the next partition.
4961 PartOffset = Offsets.Splits[Idx];
4962 ++Idx;
4963 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4964 }
4965
4966 // Now that we have the split loads, do the slow walk over all uses of the
4967 // load and rewrite them as split stores, or save the split loads to use
4968 // below if the store is going to be split there anyway.
4969 bool DeferredStores = false;
4970 for (User *LU : LI->users()) {
4971 StoreInst *SI = cast<StoreInst>(LU);
4972 if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4973 DeferredStores = true;
4974 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4975 << "\n");
4976 continue;
4977 }
4978
4979 Value *StoreBasePtr = SI->getPointerOperand();
4980 IRB.SetInsertPoint(SI);
4981 AAMDNodes AATags = SI->getAAMetadata();
4982
4983 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4984
4985 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4986 LoadInst *PLoad = SplitLoads[Idx];
4987 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4988 auto *PartPtrTy = SI->getPointerOperandType();
4989
4990 auto AS = SI->getPointerAddressSpace();
4991 StoreInst *PStore = IRB.CreateAlignedStore(
4992 PLoad,
4993 getAdjustedPtr(IRB, DL, StoreBasePtr,
4994 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4995 PartPtrTy, StoreBasePtr->getName() + "."),
4996 getAdjustedAlignment(SI, PartOffset),
4997 /*IsVolatile*/ false);
4998 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4999 LLVMContext::MD_access_group,
5000 LLVMContext::MD_DIAssignID});
5001
5002 if (AATags)
5003 PStore->setAAMetadata(
5004 AATags.adjustForAccess(PartOffset, PLoad->getType(), DL));
5005 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
5006 }
5007
5008 // We want to immediately iterate on any allocas impacted by splitting
5009 // this store, and we have to track any promotable alloca (indicated by
5010 // a direct store) as needing to be resplit because it is no longer
5011 // promotable.
5012 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
5013 ResplitPromotableAllocas.insert(OtherAI);
5014 Worklist.insert(OtherAI);
5015 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5016 StoreBasePtr->stripInBoundsOffsets())) {
5017 Worklist.insert(OtherAI);
5018 }
5019
5020 // Mark the original store as dead.
5021 DeadInsts.push_back(SI);
5022 }
5023
5024 // Save the split loads if there are deferred stores among the users.
5025 if (DeferredStores)
5026 SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
5027
5028 // Mark the original load as dead and kill the original slice.
5029 DeadInsts.push_back(LI);
5030 Offsets.S->kill();
5031 }
5032
5033 // Second, we rewrite all of the split stores. At this point, we know that
5034 // all loads from this alloca have been split already. For stores of such
5035 // loads, we can simply look up the pre-existing split loads. For stores of
5036 // other loads, we split those loads first and then write split stores of
5037 // them.
5038 for (StoreInst *SI : Stores) {
5039 auto *LI = cast<LoadInst>(SI->getValueOperand());
5040 IntegerType *Ty = cast<IntegerType>(LI->getType());
5041 assert(Ty->getBitWidth() % 8 == 0);
5042 uint64_t StoreSize = Ty->getBitWidth() / 8;
5043 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
5044
5045 auto &Offsets = SplitOffsetsMap[SI];
5046 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
5047 "Slice size should always match load size exactly!");
5048 uint64_t BaseOffset = Offsets.S->beginOffset();
5049 assert(BaseOffset + StoreSize > BaseOffset &&
5050 "Cannot represent alloca access size using 64-bit integers!");
5051
5052 Value *LoadBasePtr = LI->getPointerOperand();
5053 Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
5054
5055 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
5056
5057 // Check whether we have an already split load.
5058 auto SplitLoadsMapI = SplitLoadsMap.find(LI);
5059 std::vector<LoadInst *> *SplitLoads = nullptr;
5060 if (SplitLoadsMapI != SplitLoadsMap.end()) {
5061 SplitLoads = &SplitLoadsMapI->second;
5062 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
5063 "Too few split loads for the number of splits in the store!");
5064 } else {
5065 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
5066 }
5067
5068 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
5069 int Idx = 0, Size = Offsets.Splits.size();
5070 for (;;) {
5071 auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
5072 auto *LoadPartPtrTy = LI->getPointerOperandType();
5073 auto *StorePartPtrTy = SI->getPointerOperandType();
5074
5075 // Either lookup a split load or create one.
5076 LoadInst *PLoad;
5077 if (SplitLoads) {
5078 PLoad = (*SplitLoads)[Idx];
5079 } else {
5080 IRB.SetInsertPoint(LI);
5081 auto AS = LI->getPointerAddressSpace();
5082 PLoad = IRB.CreateAlignedLoad(
5083 PartTy,
5084 getAdjustedPtr(IRB, DL, LoadBasePtr,
5085 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5086 LoadPartPtrTy, LoadBasePtr->getName() + "."),
5087 getAdjustedAlignment(LI, PartOffset),
5088 /*IsVolatile*/ false, LI->getName());
5089 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
5090 LLVMContext::MD_access_group});
5091 }
5092
5093 // And store this partition.
5094 IRB.SetInsertPoint(SI);
5095 auto AS = SI->getPointerAddressSpace();
5096 StoreInst *PStore = IRB.CreateAlignedStore(
5097 PLoad,
5098 getAdjustedPtr(IRB, DL, StoreBasePtr,
5099 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5100 StorePartPtrTy, StoreBasePtr->getName() + "."),
5101 getAdjustedAlignment(SI, PartOffset),
5102 /*IsVolatile*/ false);
5103 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5104 LLVMContext::MD_access_group});
5105
5106 // Now build a new slice for the alloca.
5107 NewSlices.push_back(
5108 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5109 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
5110 /*IsSplittable*/ false));
5111 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5112 << ", " << NewSlices.back().endOffset()
5113 << "): " << *PStore << "\n");
5114 if (!SplitLoads) {
5115 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5116 }
5117
5118 // See if we've finished all the splits.
5119 if (Idx >= Size)
5120 break;
5121
5122 // Setup the next partition.
5123 PartOffset = Offsets.Splits[Idx];
5124 ++Idx;
5125 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
5126 }
5127
5128 // We want to immediately iterate on any allocas impacted by splitting
5129 // this load, which is only relevant if it isn't a load of this alloca and
5130 // thus we didn't already split the loads above. We also have to keep track
5131 // of any promotable allocas we split loads on as they can no longer be
5132 // promoted.
5133 if (!SplitLoads) {
5134 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
5135 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5136 ResplitPromotableAllocas.insert(OtherAI);
5137 Worklist.insert(OtherAI);
5138 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5139 LoadBasePtr->stripInBoundsOffsets())) {
5140 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5141 Worklist.insert(OtherAI);
5142 }
5143 }
5144
5145 // Mark the original store as dead now that we've split it up and kill its
5146 // slice. Note that we leave the original load in place unless this store
5147 // was its only use. It may in turn be split up if it is an alloca load
5148 // for some other alloca, but it may be a normal load. This may introduce
5149 // redundant loads, but where those can be merged the rest of the optimizer
5150 // should handle the merging, and this uncovers SSA splits which is more
5151 // important. In practice, the original loads will almost always be fully
5152 // split and removed eventually, and the splits will be merged by any
5153 // trivial CSE, including instcombine.
5154 if (LI->hasOneUse()) {
5155 assert(*LI->user_begin() == SI && "Single use isn't this store!");
5156 DeadInsts.push_back(LI);
5157 }
5158 DeadInsts.push_back(SI);
5159 Offsets.S->kill();
5160 }
5161
5162 // Remove the killed slices that have been pre-split.
5163 llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
5164
5165 // Insert our new slices. This will sort and merge them into the sorted
5166 // sequence.
5167 AS.insert(NewSlices);
5168
5169 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5170#ifndef NDEBUG
5171 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5172 LLVM_DEBUG(AS.print(dbgs(), I, " "));
5173#endif
5174
5175 // Finally, don't try to promote any allocas that now require re-splitting.
5176 // They have already been added to the worklist above.
5177 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5178
5179 return true;
5180}
5181
5182/// Rewrite an alloca partition's users.
5183///
5184/// This routine drives both of the rewriting goals of the SROA pass. It tries
5185/// to rewrite uses of an alloca partition to be conducive for SSA value
5186/// promotion. If the partition needs a new, more refined alloca, this will
5187/// build that new alloca, preserving as much type information as possible, and
5188/// rewrite the uses of the old alloca to point at the new one and have the
5189/// appropriate new offsets. It also evaluates how successful the rewrite was
5190/// at enabling promotion and if it was successful queues the alloca to be
5191/// promoted.
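/// For example (types illustrative), an alloca of { i64, i64 } whose two
/// halves are only ever loaded and stored independently is typically split
/// into two i64 allocas here, each of which can then be promoted to an SSA
/// value.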
5192AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
5193 Partition &P) {
5194 // Try to compute a friendly type for this partition of the alloca. This
5195 // won't always succeed, in which case we fall back to a legal integer type
5196 // or an i8 array of an appropriate size.
5197 Type *SliceTy = nullptr;
5198 const DataLayout &DL = AI.getDataLayout();
5199 unsigned VScale = AI.getFunction()->getVScaleValue();
5200
5201 std::pair<Type *, IntegerType *> CommonUseTy =
5202 findCommonType(P.begin(), P.end(), P.endOffset());
5203 // Do all uses operate on the same type?
5204 if (CommonUseTy.first) {
5205 TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
5206 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size())
5207 SliceTy = CommonUseTy.first;
5208 }
5209 // If not, can we find an appropriate subtype in the original allocated type?
5210 if (!SliceTy)
5211 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5212 P.beginOffset(), P.size()))
5213 SliceTy = TypePartitionTy;
5214
5215 // If still not, can we use the largest bitwidth integer type used?
5216 if (!SliceTy && CommonUseTy.second)
5217 if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size())
5218 SliceTy = CommonUseTy.second;
5219 if ((!SliceTy || (SliceTy->isArrayTy() &&
5220 SliceTy->getArrayElementType()->isIntegerTy())) &&
5221 DL.isLegalInteger(P.size() * 8)) {
5222 SliceTy = Type::getIntNTy(*C, P.size() * 8);
5223 }
5224
5225 if (!SliceTy)
5226 SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
5227 assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
5228
5229 bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
5230
5231 VectorType *VecTy =
5232 IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
5233 if (VecTy)
5234 SliceTy = VecTy;
5235
5236 // Check for the case where we're going to rewrite to a new alloca of the
5237 // exact same type as the original, and with the same access offsets. In that
5238 // case, re-use the existing alloca, but still run through the rewriter to
5239 // perform phi and select speculation.
5240 // P.beginOffset() can be non-zero even with the same type in a case with
5241 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5242 AllocaInst *NewAI;
5243 if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
5244 NewAI = &AI;
5245 // FIXME: We should be able to bail at this point with "nothing changed".
5246 // FIXME: We might want to defer PHI speculation until after here.
5247 // FIXME: return nullptr;
5248 } else {
5249 // Make sure the alignment is compatible with P.beginOffset().
5250 const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
5251 // If we will get at least this much alignment from the type alone, leave
5252 // the alloca's alignment unconstrained.
5253 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
5254 NewAI = new AllocaInst(
5255 SliceTy, AI.getAddressSpace(), nullptr,
5256 IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
5257 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5258 AI.getIterator());
5259 // Copy the old AI debug location over to the new one.
5260 NewAI->setDebugLoc(AI.getDebugLoc());
5261 ++NumNewAllocas;
5262 }
5263
5264 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5265 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5266
5267 // Track the high watermark on the worklist as it is only relevant for
5268 // promoted allocas. We will reset it to this point if the alloca is not in
5269 // fact scheduled for promotion.
5270 unsigned PPWOldSize = PostPromotionWorklist.size();
5271 unsigned NumUses = 0;
5272 SmallSetVector<PHINode *, 8> PHIUsers;
5273 SmallSetVector<SelectInst *, 8> SelectUsers;
5274
5275 AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
5276 P.endOffset(), IsIntegerPromotable, VecTy,
5277 PHIUsers, SelectUsers);
5278 bool Promotable = true;
5279 // Check whether we can have tree-structured merge.
5280 if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
5281 NumUses += DeletedValues->size() + 1;
5282 for (Value *V : *DeletedValues)
5283 DeadInsts.push_back(V);
5284 } else {
5285 for (Slice *S : P.splitSliceTails()) {
5286 Promotable &= Rewriter.visit(S);
5287 ++NumUses;
5288 }
5289 for (Slice &S : P) {
5290 Promotable &= Rewriter.visit(&S);
5291 ++NumUses;
5292 }
5293 }
5294
5295 NumAllocaPartitionUses += NumUses;
5296 MaxUsesPerAllocaPartition.updateMax(NumUses);
5297
5298 // Now that we've processed all the slices in the new partition, check if any
5299 // PHIs or Selects would block promotion.
5300 for (PHINode *PHI : PHIUsers)
5301 if (!isSafePHIToSpeculate(*PHI)) {
5302 Promotable = false;
5303 PHIUsers.clear();
5304 SelectUsers.clear();
5305 break;
5306 }
5307
5308 SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
5309 NewSelectsToRewrite;
5310 NewSelectsToRewrite.reserve(SelectUsers.size());
5311 for (SelectInst *Sel : SelectUsers) {
5312 std::optional<RewriteableMemOps> Ops =
5313 isSafeSelectToSpeculate(*Sel, PreserveCFG);
5314 if (!Ops) {
5315 Promotable = false;
5316 PHIUsers.clear();
5317 SelectUsers.clear();
5318 NewSelectsToRewrite.clear();
5319 break;
5320 }
5321 NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
5322 }
5323
5324 if (Promotable) {
5325 for (Use *U : AS.getDeadUsesIfPromotable()) {
5326 auto *OldInst = dyn_cast<Instruction>(U->get());
5327 Value::dropDroppableUse(*U);
5328 if (OldInst)
5329 if (isInstructionTriviallyDead(OldInst))
5330 DeadInsts.push_back(OldInst);
5331 }
5332 if (PHIUsers.empty() && SelectUsers.empty()) {
5333 // Promote the alloca.
5334 PromotableAllocas.insert(NewAI);
5335 } else {
5336 // If we have either PHIs or Selects to speculate, add them to those
5337 // worklists and re-queue the new alloca so that we promote it on the
5338 // next iteration.
5339 SpeculatablePHIs.insert_range(PHIUsers);
5340 SelectsToRewrite.reserve(SelectsToRewrite.size() +
5341 NewSelectsToRewrite.size());
5342 for (auto &&KV : llvm::make_range(
5343 std::make_move_iterator(NewSelectsToRewrite.begin()),
5344 std::make_move_iterator(NewSelectsToRewrite.end())))
5345 SelectsToRewrite.insert(std::move(KV));
5346 Worklist.insert(NewAI);
5347 }
5348 } else {
5349 // Drop any post-promotion work items if promotion didn't happen.
5350 while (PostPromotionWorklist.size() > PPWOldSize)
5351 PostPromotionWorklist.pop_back();
5352
5353 // We couldn't promote and we didn't create a new partition, nothing
5354 // happened.
5355 if (NewAI == &AI)
5356 return nullptr;
5357
5358 // If we can't promote the alloca, iterate on it to check for new
5359 // refinements exposed by splitting the current alloca. Don't iterate on an
5360 // alloca which didn't actually change and didn't get promoted.
5361 Worklist.insert(NewAI);
5362 }
5363
5364 return NewAI;
5365}
5366
5367// There isn't a shared interface to get the "address" parts out of a
5368// dbg.declare and dbg.assign, so provide some wrappers.
5369 static bool isKillAddress(const DbgVariableRecord *DVR) {
5370 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5371 return DVR->isKillAddress();
5372 return DVR->isKillLocation();
5373}
5374
5375 static const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5376 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5377 return DVR->getAddressExpression();
5378 return DVR->getExpression();
5379}
5380
5381/// Create or replace an existing fragment in a DIExpression with \p Frag.
5382/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5383/// operation, add \p BitExtractOffset to the offset part.
5384///
5385/// Returns the new expression, or nullptr if this fails (see details below).
5386///
5387/// This function is similar to DIExpression::createFragmentExpression except
5388/// for 3 important distinctions:
5389/// 1. The new fragment isn't relative to an existing fragment.
5390/// 2. It assumes the computed location is a memory location. This means we
5391/// don't need to perform checks that creating the fragment preserves the
5392/// expression semantics.
5393/// 3. Existing extract_bits are modified independently of fragment changes
5394/// using \p BitExtractOffset. A change to the fragment offset or size
5395/// may affect a bit extract. But a bit extract offset can change
5396/// independently of the fragment dimensions.
5397///
5398/// Returns the new expression, or nullptr if one couldn't be created.
5399/// Ideally this is only used to signal that a bit-extract has become
5400/// zero-sized (and thus the new debug record has no size and can be
5401/// dropped), however, it fails for other reasons too - see the FIXME below.
5402///
5403/// FIXME: To keep the change that introduces this function NFC it bails
5404 /// in some situations unnecessarily, e.g. when fragment and bit extract
5405/// sizes differ.
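/// For example, replacing the fragment in
///   !DIExpression(DW_OP_LLVM_fragment, 0, 64)
/// with \p Frag = (OffsetInBits: 32, SizeInBits: 32) yields
///   !DIExpression(DW_OP_LLVM_fragment, 32, 32),
/// whereas an expression containing DW_OP_LLVM_extract_bits_[sz]ext keeps the
/// bit extract (with \p BitExtractOffset folded into its offset) and has no
/// fragment appended to it.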
5406 static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5407 DIExpression::FragmentInfo Frag,
5408 int64_t BitExtractOffset) {
5409 SmallVector<uint64_t, 8> Ops;
5410 bool HasFragment = false;
5411 bool HasBitExtract = false;
5412
5413 for (auto &Op : Expr->expr_ops()) {
5414 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5415 HasFragment = true;
5416 continue;
5417 }
5418 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5419 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5420 HasBitExtract = true;
5421 int64_t ExtractOffsetInBits = Op.getArg(0);
5422 int64_t ExtractSizeInBits = Op.getArg(1);
5423
5424 // DIExpression::createFragmentExpression doesn't know how to handle
5425 // a fragment that is smaller than the extract. Copy the behaviour
5426 // (bail) to avoid non-NFC changes.
5427 // FIXME: Don't do this.
5428 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5429 return nullptr;
5430
5431 assert(BitExtractOffset <= 0);
5432 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5433
5434 // DIExpression::createFragmentExpression doesn't know what to do
5435 // if the new extract starts "outside" the existing one. Copy the
5436 // behaviour (bail) to avoid non-NFC changes.
5437 // FIXME: Don't do this.
5438 if (AdjustedOffset < 0)
5439 return nullptr;
5440
5441 Ops.push_back(Op.getOp());
5442 Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
5443 Ops.push_back(ExtractSizeInBits);
5444 continue;
5445 }
5446 Op.appendToVector(Ops);
5447 }
5448
5449 // Unsupported by createFragmentExpression, so don't support it here yet to
5450 // preserve NFC-ness.
5451 if (HasFragment && HasBitExtract)
5452 return nullptr;
5453
5454 if (!HasBitExtract) {
5455 Ops.push_back(dwarf::DW_OP_LLVM_fragment);
5456 Ops.push_back(Frag.OffsetInBits);
5457 Ops.push_back(Frag.SizeInBits);
5458 }
5459 return DIExpression::get(Expr->getContext(), Ops);
5460}
5461
5462/// Insert a new DbgRecord.
5463/// \p Orig Original to copy record type, debug loc and variable from, and
5464/// additionally value and value expression for dbg_assign records.
5465/// \p NewAddr Location's new base address.
5466/// \p NewAddrExpr New expression to apply to address.
5467/// \p BeforeInst Insert position.
5468/// \p NewFragment New fragment (absolute, non-relative).
5469/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5470static void
5471 insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5472 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5473 std::optional<DIExpression::FragmentInfo> NewFragment,
5474 int64_t BitExtractAdjustment) {
5475 (void)DIB;
5476
5477 // A dbg_assign puts fragment info in the value expression only. The address
5478 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5479 // new fragment info into NewAddrExpr (as it only has one expression).
5480 DIExpression *NewFragmentExpr =
5481 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5482 if (NewFragment)
5483 NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5484 BitExtractAdjustment);
5485 if (!NewFragmentExpr)
5486 return;
5487
5488 if (Orig->isDbgDeclare()) {
5489 DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5490 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5491 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5492 BeforeInst->getIterator());
5493 return;
5494 }
5495
5496 if (Orig->isDbgValue()) {
5497 DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5498 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5499 // Drop debug information if the expression doesn't start with a
5500 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5501 // describes the address of the alloca rather than the value inside it.
5502 if (!NewFragmentExpr->startsWithDeref())
5503 DVR->setKillAddress();
5504 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5505 BeforeInst->getIterator());
5506 return;
5507 }
5508
5509 // Apply a DIAssignID to the store if it doesn't already have it.
5510 if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5511 NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5512 DIAssignID::getDistinct(NewAddr->getContext()));
5513 }
5514
5515 DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5516 NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5517 NewAddrExpr, Orig->getDebugLoc());
5518 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5519 (void)NewAssign;
5520}
5521
5522/// Walks the slices of an alloca and form partitions based on them,
5523/// rewriting each of their uses.
5524bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5525 if (AS.begin() == AS.end())
5526 return false;
5527
5528 unsigned NumPartitions = 0;
5529 bool Changed = false;
5530 const DataLayout &DL = AI.getModule()->getDataLayout();
5531
5532 // First try to pre-split loads and stores.
5533 Changed |= presplitLoadsAndStores(AI, AS);
5534
5535 // Now that we have identified any pre-splitting opportunities,
5536 // mark loads and stores unsplittable except for the following case.
5537 // We leave a slice splittable if all other slices are disjoint or fully
5538 // included in the slice, such as whole-alloca loads and stores.
5539 // If we fail to split these during pre-splitting, we want to force them
5540 // to be rewritten into a partition.
5541 bool IsSorted = true;
5542
5543 uint64_t AllocaSize =
5544 DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue();
5545 const uint64_t MaxBitVectorSize = 1024;
5546 if (AllocaSize <= MaxBitVectorSize) {
5547 // If a byte boundary is included in any load or store, a slice starting or
5548 // ending at the boundary is not splittable.
5549 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5550 for (Slice &S : AS)
5551 for (unsigned O = S.beginOffset() + 1;
5552 O < S.endOffset() && O < AllocaSize; O++)
5553 SplittableOffset.reset(O);
5554
5555 for (Slice &S : AS) {
5556 if (!S.isSplittable())
5557 continue;
5558
5559 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5560 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5561 continue;
5562
5563 if (isa<LoadInst>(S.getUse()->getUser()) ||
5564 isa<StoreInst>(S.getUse()->getUser())) {
5565 S.makeUnsplittable();
5566 IsSorted = false;
5567 }
5568 }
5569 } else {
5570 // We only allow whole-alloca splittable loads and stores
5571 // for a large alloca to avoid creating too large BitVector.
5572 for (Slice &S : AS) {
5573 if (!S.isSplittable())
5574 continue;
5575
5576 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5577 continue;
5578
5579 if (isa<LoadInst>(S.getUse()->getUser()) ||
5580 isa<StoreInst>(S.getUse()->getUser())) {
5581 S.makeUnsplittable();
5582 IsSorted = false;
5583 }
5584 }
5585 }
5586
5587 if (!IsSorted)
5588 llvm::stable_sort(AS);
5589
5590 /// Describes the allocas introduced by rewritePartition in order to migrate
5591 /// the debug info.
5592 struct Fragment {
5593 AllocaInst *Alloca;
5594 uint64_t Offset;
5595 uint64_t Size;
5596 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5597 : Alloca(AI), Offset(O), Size(S) {}
5598 };
5599 SmallVector<Fragment, 4> Fragments;
5600
5601 // Rewrite each partition.
5602 for (auto &P : AS.partitions()) {
5603 if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
5604 Changed = true;
5605 if (NewAI != &AI) {
5606 uint64_t SizeOfByte = 8;
5607 uint64_t AllocaSize =
5608 DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue();
5609 // Don't include any padding.
5610 uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
5611 Fragments.push_back(
5612 Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5613 }
5614 }
5615 ++NumPartitions;
5616 }
5617
5618 NumAllocaPartitions += NumPartitions;
5619 MaxPartitionsPerAlloca.updateMax(NumPartitions);
5620
5621 // Migrate debug information from the old alloca to the new alloca(s)
5622 // and the individual partitions.
5623 auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
5624 // Can't overlap with undef memory.
5625 if (isKillAddress(DbgVariable))
5626 return;
5627
5628 const Value *DbgPtr = DbgVariable->getAddress();
5629 DIExpression::FragmentInfo VarFrag =
5630 DbgVariable->getFragmentOrEntireVariable();
5631 // Get the address expression constant offset if one exists and the ops
5632 // that come after it.
5633 int64_t CurrentExprOffsetInBytes = 0;
5634 SmallVector<uint64_t> PostOffsetOps;
5635 if (!getAddressExpression(DbgVariable)
5636 ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5637 return; // Couldn't interpret this DIExpression - drop the var.
5638
5639 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5640 int64_t ExtractOffsetInBits = 0;
5641 for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5642 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5643 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5644 ExtractOffsetInBits = Op.getArg(0);
5645 break;
5646 }
5647 }
5648
5649 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5650 for (auto Fragment : Fragments) {
5651 int64_t OffsetFromLocationInBits;
5652 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5653 // Find the variable fragment that the new alloca slice covers.
5654 // Drop debug info for this variable fragment if we can't compute an
5655 // intersect between it and the alloca slice.
5656 if (!DIExpression::calculateFragmentIntersect(
5657 DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
5658 CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
5659 NewDbgFragment, OffsetFromLocationInBits))
5660 continue; // Do not migrate this fragment to this slice.
5661
5662 // Zero sized fragment indicates there's no intersect between the variable
5663 // fragment and the alloca slice. Skip this slice for this variable
5664 // fragment.
5665 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5666 continue; // Do not migrate this fragment to this slice.
5667
5668 // No fragment indicates DbgVariable's variable or fragment exactly
5669 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5670 if (!NewDbgFragment)
5671 NewDbgFragment = DbgVariable->getFragment();
5672
5673 // Reduce the new expression offset by the bit-extract offset since
5674 // we'll be keeping that.
5675 int64_t OffsetFromNewAllocaInBits =
5676 OffsetFromLocationInBits - ExtractOffsetInBits;
5677 // We need to adjust an existing bit extract if the offset expression
5678 // can't eat the slack (i.e., if the new offset would be negative).
5679 int64_t BitExtractOffset =
5680 std::min<int64_t>(0, OffsetFromNewAllocaInBits);
5681 // The magnitude of a negative value indicates the number of bits into
5682 // the existing variable fragment that the memory region begins. The new
5683 // variable fragment already excludes those bits - the new DbgPtr offset
5684 // only needs to be applied if it's positive.
5685 OffsetFromNewAllocaInBits =
5686 std::max(int64_t(0), OffsetFromNewAllocaInBits);
5687
5688 // Rebuild the expression:
5689 // {Offset(OffsetFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5690 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5691 // address expression but the value expression instead.
5692 DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
5693 if (OffsetFromNewAllocaInBits > 0) {
5694 int64_t OffsetInBytes = (OffsetFromNewAllocaInBits + 7) / 8;
5695 NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
5696 }
5697
5698 // Remove any existing intrinsics on the new alloca describing
5699 // the variable fragment.
5700 auto RemoveOne = [DbgVariable](auto *OldDII) {
5701 auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5702 return LHS->getVariable() == RHS->getVariable() &&
5703 LHS->getDebugLoc()->getInlinedAt() ==
5704 RHS->getDebugLoc()->getInlinedAt();
5705 };
5706 if (SameVariableFragment(OldDII, DbgVariable))
5707 OldDII->eraseFromParent();
5708 };
5709 for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
5710 for_each(findDVRValues(Fragment.Alloca), RemoveOne);
5711 insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
5712 NewDbgFragment, BitExtractOffset);
5713 }
5714 };
5715
5716 // Migrate debug information from the old alloca to the new alloca(s)
5717 // and the individual partitions.
5718 for_each(findDVRDeclares(&AI), MigrateOne);
5719 for_each(findDVRValues(&AI), MigrateOne);
5720 for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
5721
5722 return Changed;
5723}
5724
5725/// Clobber a use with poison, deleting the used value if it becomes dead.
5726void SROA::clobberUse(Use &U) {
5727 Value *OldV = U;
5728 // Replace the use with a poison value.
5729 U = PoisonValue::get(OldV->getType());
5730
5731 // Check for this making an instruction dead. We have to garbage collect
5732 // all the dead instructions to ensure the uses of any alloca end up being
5733 // minimal.
5734 if (Instruction *OldI = dyn_cast<Instruction>(OldV))
5735 if (isInstructionTriviallyDead(OldI)) {
5736 DeadInsts.push_back(OldI);
5737 }
5738}
5739
5740/// A basic LoadAndStorePromoter that does not remove store nodes.
5741 class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
5742 public:
5743 BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
5744 Type *ZeroType)
5745 : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
5746 bool shouldDelete(Instruction *I) const override {
5747 return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
5748 }
5749
5751 return UndefValue::get(ZeroType);
5752 }
5753
5754private:
5755 Type *ZeroType;
5756};
5757
5758bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
5759 // Look through each "partition", looking for slices with the same start/end
5760 // that do not overlap with any before them. The slices are sorted by
5761 // increasing beginOffset. We don't use AS.partitions(), as it will use a more
5762 // sophisticated algorithm that takes splittable slices into account.
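// For instance (illustrative), if every slice of the alloca is a simple i32
// load or store over the exact same [begin, end) range, the loads can be
// rewritten to use the stored values directly, even though the alloca itself
// escapes through a read-only capture and must be kept alive.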
5763 LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
5764 bool AllSameAndValid = true;
5765 Type *PartitionType = nullptr;
5766 SmallVector<Instruction *, 4> Insts;
5767 uint64_t BeginOffset = 0;
5768 uint64_t EndOffset = 0;
5769
5770 auto Flush = [&]() {
5771 if (AllSameAndValid && !Insts.empty()) {
5772 LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
5773 << EndOffset << ")\n");
5774 SmallVector<PHINode *, 4> NewPHIs;
5775 SSAUpdater SSA(&NewPHIs);
5776 Insts.push_back(&AI);
5777 BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
5778 Promoter.run(Insts);
5779 }
5780 AllSameAndValid = true;
5781 PartitionType = nullptr;
5782 Insts.clear();
5783 };
5784
5785 for (Slice &S : AS) {
5786 auto *User = cast<Instruction>(S.getUse()->getUser());
5787 if (isAssumeLikeIntrinsic(User)) {
5788 LLVM_DEBUG({
5789 dbgs() << "Ignoring slice: ";
5790 AS.print(dbgs(), &S);
5791 });
5792 continue;
5793 }
5794 if (S.beginOffset() >= EndOffset) {
5795 Flush();
5796 BeginOffset = S.beginOffset();
5797 EndOffset = S.endOffset();
5798 } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
5799 if (AllSameAndValid) {
5800 LLVM_DEBUG({
5801 dbgs() << "Slice does not match range [" << BeginOffset << ", "
5802 << EndOffset << ")";
5803 AS.print(dbgs(), &S);
5804 });
5805 AllSameAndValid = false;
5806 }
5807 EndOffset = std::max(EndOffset, S.endOffset());
5808 continue;
5809 }
5810
5811 if (auto *LI = dyn_cast<LoadInst>(User)) {
5812 Type *UserTy = LI->getType();
5813 // LoadAndStorePromoter requires all the types to be the same.
5814 if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
5815 AllSameAndValid = false;
5816 PartitionType = UserTy;
5817 Insts.push_back(User);
5818 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
5819 Type *UserTy = SI->getValueOperand()->getType();
5820 if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
5821 AllSameAndValid = false;
5822 PartitionType = UserTy;
5823 Insts.push_back(User);
5824 } else {
5825 AllSameAndValid = false;
5826 }
5827 }
5828
5829 Flush();
5830 return true;
5831}
5832
5833/// Analyze an alloca for SROA.
5834///
5835/// This analyzes the alloca to ensure we can reason about it, builds
5836/// the slices of the alloca, and then hands it off to be split and
5837/// rewritten as needed.
5838std::pair<bool /*Changed*/, bool /*CFGChanged*/>
5839SROA::runOnAlloca(AllocaInst &AI) {
5840 bool Changed = false;
5841 bool CFGChanged = false;
5842
5843 LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5844 ++NumAllocasAnalyzed;
5845
5846 // Special case dead allocas, as they're trivial.
5847 if (AI.use_empty()) {
5848 AI.eraseFromParent();
5849 Changed = true;
5850 return {Changed, CFGChanged};
5851 }
5852 const DataLayout &DL = AI.getDataLayout();
5853
5854 // Skip alloca forms that this analysis can't handle.
5855 auto *AT = AI.getAllocatedType();
5856 TypeSize Size = DL.getTypeAllocSize(AT);
5857 if (AI.isArrayAllocation() || !AT->isSized() || Size.isScalable() ||
5858 Size.getFixedValue() == 0)
5859 return {Changed, CFGChanged};
5860
5861 // First, split any FCA loads and stores touching this alloca to promote
5862 // better splitting and promotion opportunities.
5863 IRBuilderTy IRB(&AI);
5864 AggLoadStoreRewriter AggRewriter(DL, IRB);
5865 Changed |= AggRewriter.rewrite(AI);
5866
5867 // Build the slices using a recursive instruction-visiting builder.
5868 AllocaSlices AS(DL, AI);
5869 LLVM_DEBUG(AS.print(dbgs()));
5870 if (AS.isEscaped())
5871 return {Changed, CFGChanged};
5872
5873 if (AS.isEscapedReadOnly()) {
5874 Changed |= propagateStoredValuesToLoads(AI, AS);
5875 return {Changed, CFGChanged};
5876 }
5877
5878 // Delete all the dead users of this alloca before splitting and rewriting it.
5879 for (Instruction *DeadUser : AS.getDeadUsers()) {
5880 // Free up everything used by this instruction.
5881 for (Use &DeadOp : DeadUser->operands())
5882 clobberUse(DeadOp);
5883
5884 // Now replace the uses of this instruction.
5885 DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
5886
5887 // And mark it for deletion.
5888 DeadInsts.push_back(DeadUser);
5889 Changed = true;
5890 }
5891 for (Use *DeadOp : AS.getDeadOperands()) {
5892 clobberUse(*DeadOp);
5893 Changed = true;
5894 }
5895
5896 // No slices to split. Leave the dead alloca for a later pass to clean up.
5897 if (AS.begin() == AS.end())
5898 return {Changed, CFGChanged};
5899
5900 Changed |= splitAlloca(AI, AS);
5901
5902 LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
5903 while (!SpeculatablePHIs.empty())
5904 speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
5905
5906 LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
5907 auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5908 while (!RemainingSelectsToRewrite.empty()) {
5909 const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5910 CFGChanged |=
5911 rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
5912 }
5913
5914 return {Changed, CFGChanged};
5915}
5916
5917/// Delete the dead instructions accumulated in this run.
5918///
5919/// Recursively deletes the dead instructions we've accumulated. This is done
5920/// at the very end to maximize locality of the recursive delete and to
5921/// minimize the problems of invalidated instruction pointers as such pointers
5922/// are used heavily in the intermediate stages of the algorithm.
5923///
5924/// We also record the alloca instructions deleted here so that they aren't
5925/// subsequently handed to mem2reg to promote.
5926bool SROA::deleteDeadInstructions(
5927 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
5928 bool Changed = false;
5929 while (!DeadInsts.empty()) {
5930 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
5931 if (!I)
5932 continue;
5933 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
5934
5935 // If the instruction is an alloca, find the possible dbg.declare connected
5936 // to it, and remove it too. We must do this before calling RAUW or we will
5937 // not be able to find it.
5938 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5939 DeletedAllocas.insert(AI);
5940 for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
5941 OldDII->eraseFromParent();
5942 }
5943
5944 at::deleteAssignmentMarkers(I);
5945 I->replaceAllUsesWith(UndefValue::get(I->getType()));
5946
5947 for (Use &Operand : I->operands())
5948 if (Instruction *U = dyn_cast<Instruction>(Operand)) {
5949 // Zero out the operand and see if it becomes trivially dead.
5950 Operand = nullptr;
5951 if (isInstructionTriviallyDead(U))
5952 DeadInsts.push_back(U);
5953 }
5954
5954
5955 ++NumDeleted;
5956 I->eraseFromParent();
5957 Changed = true;
5958 }
5959 return Changed;
5960}
5961/// Promote the allocas, using the best available technique.
5962///
5963/// This attempts to promote whatever allocas have been identified as viable in
5964/// the PromotableAllocas list. If that list is empty, there is nothing to do.
5965/// This function returns whether any promotion occurred.
5966bool SROA::promoteAllocas() {
5967 if (PromotableAllocas.empty())
5968 return false;
5969
5970 if (SROASkipMem2Reg) {
5971 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
5972 } else {
5973 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
5974 NumPromoted += PromotableAllocas.size();
5975 PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
5976 }
5977
5978 PromotableAllocas.clear();
5979 return true;
5980}
5981
5982std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
5983 LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
5984
5985 const DataLayout &DL = F.getDataLayout();
5986 BasicBlock &EntryBB = F.getEntryBlock();
5987 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
5988 I != E; ++I) {
5989 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5990 if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() &&
5991 isAllocaPromotable(AI))
5992 PromotableAllocas.insert(AI);
5993 else
5994 Worklist.insert(AI);
5995 }
5996 }
5997
5998 bool Changed = false;
5999 bool CFGChanged = false;
6000 // A set of deleted alloca instruction pointers which should be removed from
6001 // the list of promotable allocas.
6002 SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
6003
6004 do {
6005 while (!Worklist.empty()) {
6006 auto [IterationChanged, IterationCFGChanged] =
6007 runOnAlloca(*Worklist.pop_back_val());
6008 Changed |= IterationChanged;
6009 CFGChanged |= IterationCFGChanged;
6010
6011 Changed |= deleteDeadInstructions(DeletedAllocas);
6012
6013 // Remove the deleted allocas from various lists so that we don't try to
6014 // continue processing them.
6015 if (!DeletedAllocas.empty()) {
6016 Worklist.set_subtract(DeletedAllocas);
6017 PostPromotionWorklist.set_subtract(DeletedAllocas);
6018 PromotableAllocas.set_subtract(DeletedAllocas);
6019 DeletedAllocas.clear();
6020 }
6021 }
6022
6023 Changed |= promoteAllocas();
6024
6025 Worklist = PostPromotionWorklist;
6026 PostPromotionWorklist.clear();
6027 } while (!Worklist.empty());
6028
6029 assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
6030 assert((!CFGChanged || !PreserveCFG) &&
6031 "Should not have modified the CFG when told to preserve it.");
6032
6033 if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
6034 for (auto &BB : F) {
6035 RemoveRedundantDbgInstrs(&BB);
6036 }
6037 }
6038
6039 return {Changed, CFGChanged};
6040}
6041
6042PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
6043 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
6044 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
6045 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6046 auto [Changed, CFGChanged] =
6047 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6048 if (!Changed)
6049 return PreservedAnalyses::all();
6050 PreservedAnalyses PA;
6051 if (!CFGChanged)
6052 PA.preserveSet<CFGAnalyses>();
6053 PA.preserve<DominatorTreeAnalysis>();
6054 return PA;
6055}
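Editor's note, not part of SROA.cpp: the entry point above pulls its DominatorTree and AssumptionCache out of the FunctionAnalysisManager, so a standalone driver has to register the standard analyses before scheduling the pass. The following is a minimal sketch of one way to do that with the new pass manager; it assumes a valid llvm::Module M, and the helper name runSROAOnModule is invented for the illustration.

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/SROA.h"

static void runSROAOnModule(llvm::Module &M) {
  // Build and cross-wire the analysis managers so SROAPass::run can query
  // DominatorTreeAnalysis and AssumptionAnalysis through the proxies.
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // Schedule SROA on every function, asking it to leave the CFG intact.
  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::SROAPass(llvm::SROAOptions::PreserveCFG));
  llvm::ModulePassManager MPM;
  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}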
6056
6057void SROAPass::printPipeline(
6058 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6059 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
6060 OS, MapClassName2PassName);
6061 OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
6062 : "<modify-cfg>");
6063}
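Editor's note: the two textual forms printed above, sroa<preserve-cfg> and sroa<modify-cfg>, are also the spellings used when naming the pass in a new-pass-manager pipeline string, e.g. opt -passes='sroa<preserve-cfg>' -S input.ll; a bare sroa selects the default behavior. This usage note is inferred from the strings emitted by printPipeline and is not part of the source file.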
6064
6065SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
6066
6067namespace {
6068
6069/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
6070class SROALegacyPass : public FunctionPass {
6071 SROAOptions PreserveCFG;
6072
6073public:
6074 static char ID;
6075
6076 SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
6077 : FunctionPass(ID), PreserveCFG(PreserveCFG) {
6078 initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
6079 }
6080
6081 bool runOnFunction(Function &F) override {
6082 if (skipFunction(F))
6083 return false;
6084
6085 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
6086 AssumptionCache &AC =
6087 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
6088 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6089 auto [Changed, _] =
6090 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6091 return Changed;
6092 }
6093
6094 void getAnalysisUsage(AnalysisUsage &AU) const override {
6095 AU.addRequired<AssumptionCacheTracker>();
6096 AU.addRequired<DominatorTreeWrapperPass>();
6097 AU.addPreserved<GlobalsAAWrapperPass>();
6098 AU.addPreserved<DominatorTreeWrapperPass>();
6099 }
6100
6101 StringRef getPassName() const override { return "SROA"; }
6102};
6103
6104} // end anonymous namespace
6105
6106char SROALegacyPass::ID = 0;
6107
6108FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
6109 return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
6110 : SROAOptions::ModifyCFG);
6111}
6112
6113INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
6114 "Scalar Replacement Of Aggregates", false, false)
6115INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6116INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6117INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
6118 false, false)