LLVM 23.0.0git
SROA.cpp
Go to the documentation of this file.
1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
19/// Because this also performs alloca promotion, it can be thought of as also
20/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
22///
23//===----------------------------------------------------------------------===//
24
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
44#include "llvm/Analysis/Loads.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/LLVMContext.h"
66#include "llvm/IR/Metadata.h"
67#include "llvm/IR/Module.h"
68#include "llvm/IR/Operator.h"
69#include "llvm/IR/PassManager.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
76#include "llvm/Pass.h"
80#include "llvm/Support/Debug.h"
88#include <algorithm>
89#include <cassert>
90#include <cstddef>
91#include <cstdint>
92#include <cstring>
93#include <iterator>
94#include <string>
95#include <tuple>
96#include <utility>
97#include <variant>
98#include <vector>
99
100using namespace llvm;
101
102#define DEBUG_TYPE "sroa"
103
104STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
105STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
106STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
107STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
108STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
109STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
110STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
111STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
112STATISTIC(NumLoadsPredicated,
113 "Number of loads rewritten into predicated loads to allow promotion");
115 NumStoresPredicated,
116 "Number of stores rewritten into predicated loads to allow promotion");
117STATISTIC(NumDeleted, "Number of instructions deleted");
118STATISTIC(NumVectorized, "Number of vectorized aggregates");
119
120namespace llvm {
121/// Disable running mem2reg during SROA in order to test or debug SROA.
122static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
123 cl::Hidden);
125} // namespace llvm
126
127namespace {
128
129class AllocaSliceRewriter;
130class AllocaSlices;
131class Partition;
132
133class SelectHandSpeculativity {
134 unsigned char Storage = 0; // None are speculatable by default.
135 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
136 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
137public:
138 SelectHandSpeculativity() = default;
139 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
140 bool isSpeculatable(bool isTrueVal) const;
141 bool areAllSpeculatable() const;
142 bool areAnySpeculatable() const;
143 bool areNoneSpeculatable() const;
144 // For interop as int half of PointerIntPair.
145 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
146 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
147};
148static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
149
150using PossiblySpeculatableLoad =
152using UnspeculatableStore = StoreInst *;
153using RewriteableMemOp =
154 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
155using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
156
157/// An optimization pass providing Scalar Replacement of Aggregates.
158///
159/// This pass takes allocations which can be completely analyzed (that is, they
160/// don't escape) and tries to turn them into scalar SSA values. There are
161/// a few steps to this process.
162///
163/// 1) It takes allocations of aggregates and analyzes the ways in which they
164/// are used to try to split them into smaller allocations, ideally of
165/// a single scalar data type. It will split up memcpy and memset accesses
166/// as necessary and try to isolate individual scalar accesses.
167/// 2) It will transform accesses into forms which are suitable for SSA value
168/// promotion. This can be replacing a memset with a scalar store of an
169/// integer value, or it can involve speculating operations on a PHI or
170/// select to be a PHI or select of the results.
171/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
172/// onto insert and extract operations on a vector value, and convert them to
173/// this form. By doing so, it will enable promotion of vector aggregates to
174/// SSA vector values.
175class SROA {
176 LLVMContext *const C;
177 DomTreeUpdater *const DTU;
178 AssumptionCache *const AC;
179 const bool PreserveCFG;
180 const bool AggregateToVector;
181
182 /// Worklist of alloca instructions to simplify.
183 ///
184 /// Each alloca in the function is added to this. Each new alloca formed gets
185 /// added to it as well to recursively simplify unless that alloca can be
186 /// directly promoted. Finally, each time we rewrite a use of an alloca other
187 /// the one being actively rewritten, we add it back onto the list if not
188 /// already present to ensure it is re-visited.
189 SmallSetVector<AllocaInst *, 16> Worklist;
190
191 /// A collection of instructions to delete.
192 /// We try to batch deletions to simplify code and make things a bit more
193 /// efficient. We also make sure there is no dangling pointers.
194 SmallVector<WeakVH, 8> DeadInsts;
195
196 /// Post-promotion worklist.
197 ///
198 /// Sometimes we discover an alloca which has a high probability of becoming
199 /// viable for SROA after a round of promotion takes place. In those cases,
200 /// the alloca is enqueued here for re-processing.
201 ///
202 /// Note that we have to be very careful to clear allocas out of this list in
203 /// the event they are deleted.
204 SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;
205
206 /// A collection of alloca instructions we can directly promote.
207 SetVector<AllocaInst *, SmallVector<AllocaInst *>,
208 SmallPtrSet<AllocaInst *, 16>, 16>
209 PromotableAllocas;
210
211 /// A worklist of PHIs to speculate prior to promoting allocas.
212 ///
213 /// All of these PHIs have been checked for the safety of speculation and by
214 /// being speculated will allow promoting allocas currently in the promotable
215 /// queue.
216 SmallSetVector<PHINode *, 8> SpeculatablePHIs;
217
218 /// A worklist of select instructions to rewrite prior to promoting
219 /// allocas.
220 SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
221
222 /// Select instructions that use an alloca and are subsequently loaded can be
223 /// rewritten to load both input pointers and then select between the result,
224 /// allowing the load of the alloca to be promoted.
225 /// From this:
226 /// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
227 /// %V = load <type>, ptr %P2
228 /// to:
229 /// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
230 /// %V2 = load <type>, ptr %Other
231 /// %V = select i1 %cond, <type> %V1, <type> %V2
232 ///
233 /// We can do this to a select if its only uses are loads
234 /// and if either the operand to the select can be loaded unconditionally,
235 /// or if we are allowed to perform CFG modifications.
236 /// If found an intervening bitcast with a single use of the load,
237 /// allow the promotion.
238 static std::optional<RewriteableMemOps>
239 isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
240
241public:
242 SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
243 SROAOptions Options)
244 : C(C), DTU(DTU), AC(AC),
245 PreserveCFG(Options.CFG == SROAOptions::PreserveCFG),
246 AggregateToVector(Options.AggregateToVector) {}
247
248 /// Main run method used by both the SROAPass and by the legacy pass.
249 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);
250
251private:
252 friend class AllocaSliceRewriter;
253
254 bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
255 std::pair<AllocaInst *, uint64_t>
256 rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
257 bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
258 bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
259 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
260 void clobberUse(Use &U);
261 bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
262 bool promoteAllocas();
263};
264
265} // end anonymous namespace
266
267/// Calculate the fragment of a variable to use when slicing a store
268/// based on the slice dimensions, existing fragment, and base storage
269/// fragment.
270/// Results:
271/// UseFrag - Use Target as the new fragment.
272/// UseNoFrag - The new slice already covers the whole variable.
273/// Skip - The new alloca slice doesn't include this variable.
274/// FIXME: Can we use calculateFragmentIntersect instead?
275namespace {
276enum FragCalcResult { UseFrag, UseNoFrag, Skip };
277}
278static FragCalcResult
280 uint64_t NewStorageSliceOffsetInBits,
281 uint64_t NewStorageSliceSizeInBits,
282 std::optional<DIExpression::FragmentInfo> StorageFragment,
283 std::optional<DIExpression::FragmentInfo> CurrentFragment,
285 // If the base storage describes part of the variable apply the offset and
286 // the size constraint.
287 if (StorageFragment) {
288 Target.SizeInBits =
289 std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
290 Target.OffsetInBits =
291 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
292 } else {
293 Target.SizeInBits = NewStorageSliceSizeInBits;
294 Target.OffsetInBits = NewStorageSliceOffsetInBits;
295 }
296
297 // If this slice extracts the entirety of an independent variable from a
298 // larger alloca, do not produce a fragment expression, as the variable is
299 // not fragmented.
300 if (!CurrentFragment) {
301 if (auto Size = Variable->getSizeInBits()) {
302 // Treat the current fragment as covering the whole variable.
303 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
304 if (Target == CurrentFragment)
305 return UseNoFrag;
306 }
307 }
308
309 // No additional work to do if there isn't a fragment already, or there is
310 // but it already exactly describes the new assignment.
311 if (!CurrentFragment || *CurrentFragment == Target)
312 return UseFrag;
313
314 // Reject the target fragment if it doesn't fit wholly within the current
315 // fragment. TODO: We could instead chop up the target to fit in the case of
316 // a partial overlap.
317 if (Target.startInBits() < CurrentFragment->startInBits() ||
318 Target.endInBits() > CurrentFragment->endInBits())
319 return Skip;
320
321 // Target fits within the current fragment, return it.
322 return UseFrag;
323}
324
326 return DebugVariable(DVR->getVariable(), std::nullopt,
327 DVR->getDebugLoc().getInlinedAt());
328}
329
330/// Find linked dbg.assign and generate a new one with the correct
331/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
332/// value component is copied from the old dbg.assign to the new.
333/// \param OldAlloca Alloca for the variable before splitting.
334/// \param IsSplit True if the store (not necessarily alloca)
335/// is being split.
336/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
337/// \param SliceSizeInBits New number of bits being written to.
338/// \param OldInst Instruction that is being split.
339/// \param Inst New instruction performing this part of the
340/// split store.
341/// \param Dest Store destination.
342/// \param Value Stored value.
343/// \param DL Datalayout.
344static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
345 uint64_t OldAllocaOffsetInBits,
346 uint64_t SliceSizeInBits, Instruction *OldInst,
347 Instruction *Inst, Value *Dest, Value *Value,
348 const DataLayout &DL) {
349 // If we want allocas to be migrated using this helper then we need to ensure
350 // that the BaseFragments map code still works. A simple solution would be
351 // to choose to always clone alloca dbg_assigns (rather than sometimes
352 // "stealing" them).
353 assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");
354
355 auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
356 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
357 if (DVRAssignMarkerRange.empty())
358 return;
359
360 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
361 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
362 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
363 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
364 << "\n");
365 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
366 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
367 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
368 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
369 if (Value)
370 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
371
372 /// Map of aggregate variables to their fragment associated with OldAlloca.
374 BaseFragments;
375 for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
376 BaseFragments[getAggregateVariable(DVR)] =
377 DVR->getExpression()->getFragmentInfo();
378
379 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
380 // one). It shouldn't already have one: assert this assumption.
381 assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
382 DIAssignID *NewID = nullptr;
383 auto &Ctx = Inst->getContext();
384 DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
385 assert(OldAlloca->isStaticAlloca());
386
387 auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
388 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
389 << "\n");
390 auto *Expr = DbgAssign->getExpression();
391 bool SetKillLocation = false;
392
393 if (IsSplit) {
394 std::optional<DIExpression::FragmentInfo> BaseFragment;
395 {
396 auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
397 if (R == BaseFragments.end())
398 return;
399 BaseFragment = R->second;
400 }
401 std::optional<DIExpression::FragmentInfo> CurrentFragment =
402 Expr->getFragmentInfo();
403 DIExpression::FragmentInfo NewFragment;
404 FragCalcResult Result = calculateFragment(
405 DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
406 BaseFragment, CurrentFragment, NewFragment);
407
408 if (Result == Skip)
409 return;
410 if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
411 if (CurrentFragment) {
412 // Rewrite NewFragment to be relative to the existing one (this is
413 // what createFragmentExpression wants). CalculateFragment has
414 // already resolved the size for us. FIXME: Should it return the
415 // relative fragment too?
416 NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
417 }
418 // Add the new fragment info to the existing expression if possible.
420 Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
421 Expr = *E;
422 } else {
423 // Otherwise, add the new fragment info to an empty expression and
424 // discard the value component of this dbg.assign as the value cannot
425 // be computed with the new fragment.
427 DIExpression::get(Expr->getContext(), {}),
428 NewFragment.OffsetInBits, NewFragment.SizeInBits);
429 SetKillLocation = true;
430 }
431 }
432 }
433
434 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
435 if (!NewID) {
436 NewID = DIAssignID::getDistinct(Ctx);
437 Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
438 }
439
440 DbgVariableRecord *NewAssign;
441 if (IsSplit) {
442 ::Value *NewValue = Value ? Value : DbgAssign->getValue();
444 DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
445 Dest, DIExpression::get(Expr->getContext(), {}),
446 DbgAssign->getDebugLoc())));
447 } else {
448 // The store is not split, simply steal the existing dbg_assign.
449 NewAssign = DbgAssign;
450 NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
451 NewAssign->setAddress(Dest);
452 if (Value)
453 NewAssign->replaceVariableLocationOp(0u, Value);
454 assert(Expr == NewAssign->getExpression());
455 }
456
457 // If we've updated the value but the original dbg.assign has an arglist
458 // then kill it now - we can't use the requested new value.
459 // We can't replace the DIArgList with the new value as it'd leave
460 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
461 // an arglist). And we can't keep the DIArgList in case the linked store
462 // is being split - in which case the DIArgList + expression may no longer
463 // be computing the correct value.
464 // This should be a very rare situation as it requires the value being
465 // stored to differ from the dbg.assign (i.e., the value has been
466 // represented differently in the debug intrinsic for some reason).
467 SetKillLocation |=
468 Value && (DbgAssign->hasArgList() ||
469 !DbgAssign->getExpression()->isSingleLocationExpression());
470 if (SetKillLocation)
471 NewAssign->setKillLocation();
472
473 // We could use more precision here at the cost of some additional (code)
474 // complexity - if the original dbg.assign was adjacent to its store, we
475 // could position this new dbg.assign adjacent to its store rather than the
476 // old dbg.assgn. That would result in interleaved dbg.assigns rather than
477 // what we get now:
478 // split store !1
479 // split store !2
480 // dbg.assign !1
481 // dbg.assign !2
482 // This (current behaviour) results results in debug assignments being
483 // noted as slightly offset (in code) from the store. In practice this
484 // should have little effect on the debugging experience due to the fact
485 // that all the split stores should get the same line number.
486 if (NewAssign != DbgAssign) {
487 NewAssign->moveBefore(DbgAssign->getIterator());
488 NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
489 }
490 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
491 };
492
493 for_each(DVRAssignMarkerRange, MigrateDbgAssign);
494}
495
496namespace {
497
498/// A custom IRBuilder inserter which prefixes all names, but only in
499/// Assert builds.
500class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
501 std::string Prefix;
502
503 Twine getNameWithPrefix(const Twine &Name) const {
504 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
505 }
506
507public:
508 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
509
510 void InsertHelper(Instruction *I, const Twine &Name,
511 BasicBlock::iterator InsertPt) const override {
512 IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name),
513 InsertPt);
514 }
515};
516
517/// Provide a type for IRBuilder that drops names in release builds.
519
520/// A used slice of an alloca.
521///
522/// This structure represents a slice of an alloca used by some instruction. It
523/// stores both the begin and end offsets of this use, a pointer to the use
524/// itself, and a flag indicating whether we can classify the use as splittable
525/// or not when forming partitions of the alloca.
526class Slice {
527 /// The beginning offset of the range.
528 uint64_t BeginOffset = 0;
529
530 /// The ending offset, not included in the range.
531 uint64_t EndOffset = 0;
532
533 /// Storage for both the use of this slice and whether it can be
534 /// split.
535 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
536
537public:
538 Slice() = default;
539
540 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
541 : BeginOffset(BeginOffset), EndOffset(EndOffset),
542 UseAndIsSplittable(U, IsSplittable) {}
543
544 uint64_t beginOffset() const { return BeginOffset; }
545 uint64_t endOffset() const { return EndOffset; }
546
547 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
548 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
549
550 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
551
552 bool isDead() const { return getUse() == nullptr; }
553 void kill() { UseAndIsSplittable.setPointer(nullptr); }
554
555 /// Support for ordering ranges.
556 ///
557 /// This provides an ordering over ranges such that start offsets are
558 /// always increasing, and within equal start offsets, the end offsets are
559 /// decreasing. Thus the spanning range comes first in a cluster with the
560 /// same start position.
561 bool operator<(const Slice &RHS) const {
562 if (beginOffset() < RHS.beginOffset())
563 return true;
564 if (beginOffset() > RHS.beginOffset())
565 return false;
566 if (isSplittable() != RHS.isSplittable())
567 return !isSplittable();
568 if (endOffset() > RHS.endOffset())
569 return true;
570 return false;
571 }
572
573 /// Support comparison with a single offset to allow binary searches.
574 [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
575 return LHS.beginOffset() < RHSOffset;
576 }
577 [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
578 return LHSOffset < RHS.beginOffset();
579 }
580
581 bool operator==(const Slice &RHS) const {
582 return isSplittable() == RHS.isSplittable() &&
583 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
584 }
585 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
586};
587
588/// Representation of the alloca slices.
589///
590/// This class represents the slices of an alloca which are formed by its
591/// various uses. If a pointer escapes, we can't fully build a representation
592/// for the slices used and we reflect that in this structure. The uses are
593/// stored, sorted by increasing beginning offset and with unsplittable slices
594/// starting at a particular offset before splittable slices.
595class AllocaSlices {
596public:
597 /// Construct the slices of a particular alloca.
598 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
599
600 /// Test whether a pointer to the allocation escapes our analysis.
601 ///
602 /// If this is true, the slices are never fully built and should be
603 /// ignored.
604 bool isEscaped() const { return PointerEscapingInstr; }
605 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
606
607 /// Support for iterating over the slices.
608 /// @{
609 using iterator = SmallVectorImpl<Slice>::iterator;
610 using range = iterator_range<iterator>;
611
612 iterator begin() { return Slices.begin(); }
613 iterator end() { return Slices.end(); }
614
615 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
616 using const_range = iterator_range<const_iterator>;
617
618 const_iterator begin() const { return Slices.begin(); }
619 const_iterator end() const { return Slices.end(); }
620 /// @}
621
622 /// Erase a range of slices.
623 void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
624
625 /// Insert new slices for this alloca.
626 ///
627 /// This moves the slices into the alloca's slices collection, and re-sorts
628 /// everything so that the usual ordering properties of the alloca's slices
629 /// hold.
630 void insert(ArrayRef<Slice> NewSlices) {
631 int OldSize = Slices.size();
632 Slices.append(NewSlices.begin(), NewSlices.end());
633 auto SliceI = Slices.begin() + OldSize;
634 std::stable_sort(SliceI, Slices.end());
635 std::inplace_merge(Slices.begin(), SliceI, Slices.end());
636 }
637
638 // Forward declare the iterator and range accessor for walking the
639 // partitions.
640 class partition_iterator;
642
643 /// Access the dead users for this alloca.
644 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
645
646 /// Access Uses that should be dropped if the alloca is promotable.
647 ArrayRef<Use *> getDeadUsesIfPromotable() const {
648 return DeadUseIfPromotable;
649 }
650
651 /// Access the dead operands referring to this alloca.
652 ///
653 /// These are operands which have cannot actually be used to refer to the
654 /// alloca as they are outside its range and the user doesn't correct for
655 /// that. These mostly consist of PHI node inputs and the like which we just
656 /// need to replace with undef.
657 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
658
659#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
660 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
661 void printSlice(raw_ostream &OS, const_iterator I,
662 StringRef Indent = " ") const;
663 void printUse(raw_ostream &OS, const_iterator I,
664 StringRef Indent = " ") const;
665 void print(raw_ostream &OS) const;
666 void dump(const_iterator I) const;
667 void dump() const;
668#endif
669
670private:
671 template <typename DerivedT, typename RetT = void> class BuilderBase;
672 class SliceBuilder;
673
674 friend class AllocaSlices::SliceBuilder;
675
676#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
677 /// Handle to alloca instruction to simplify method interfaces.
678 AllocaInst &AI;
679#endif
680
681 /// The instruction responsible for this alloca not having a known set
682 /// of slices.
683 ///
684 /// When an instruction (potentially) escapes the pointer to the alloca, we
685 /// store a pointer to that here and abort trying to form slices of the
686 /// alloca. This will be null if the alloca slices are analyzed successfully.
687 Instruction *PointerEscapingInstr;
688 Instruction *PointerEscapingInstrReadOnly;
689
690 /// The slices of the alloca.
691 ///
692 /// We store a vector of the slices formed by uses of the alloca here. This
693 /// vector is sorted by increasing begin offset, and then the unsplittable
694 /// slices before the splittable ones. See the Slice inner class for more
695 /// details.
697
698 /// Instructions which will become dead if we rewrite the alloca.
699 ///
700 /// Note that these are not separated by slice. This is because we expect an
701 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
702 /// all these instructions can simply be removed and replaced with poison as
703 /// they come from outside of the allocated space.
704 SmallVector<Instruction *, 8> DeadUsers;
705
706 /// Uses which will become dead if can promote the alloca.
707 SmallVector<Use *, 8> DeadUseIfPromotable;
708
709 /// Operands which will become dead if we rewrite the alloca.
710 ///
711 /// These are operands that in their particular use can be replaced with
712 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
713 /// to PHI nodes and the like. They aren't entirely dead (there might be
714 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
715 /// want to swap this particular input for poison to simplify the use lists of
716 /// the alloca.
717 SmallVector<Use *, 8> DeadOperands;
718};
719
720/// A partition of the slices.
721///
722/// An ephemeral representation for a range of slices which can be viewed as
723/// a partition of the alloca. This range represents a span of the alloca's
724/// memory which cannot be split, and provides access to all of the slices
725/// overlapping some part of the partition.
726///
727/// Objects of this type are produced by traversing the alloca's slices, but
728/// are only ephemeral and not persistent.
729class Partition {
730private:
731 friend class AllocaSlices;
732 friend class AllocaSlices::partition_iterator;
733
734 using iterator = AllocaSlices::iterator;
735
736 /// The beginning and ending offsets of the alloca for this
737 /// partition.
738 uint64_t BeginOffset = 0, EndOffset = 0;
739
740 /// The start and end iterators of this partition.
741 iterator SI, SJ;
742
743 /// A collection of split slice tails overlapping the partition.
744 SmallVector<Slice *, 4> SplitTails;
745
746 /// Raw constructor builds an empty partition starting and ending at
747 /// the given iterator.
748 Partition(iterator SI) : SI(SI), SJ(SI) {}
749
750public:
751 /// The start offset of this partition.
752 ///
753 /// All of the contained slices start at or after this offset.
754 uint64_t beginOffset() const { return BeginOffset; }
755
756 /// The end offset of this partition.
757 ///
758 /// All of the contained slices end at or before this offset.
759 uint64_t endOffset() const { return EndOffset; }
760
761 /// The size of the partition.
762 ///
763 /// Note that this can never be zero.
764 uint64_t size() const {
765 assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
766 return EndOffset - BeginOffset;
767 }
768
769 /// Test whether this partition contains no slices, and merely spans
770 /// a region occupied by split slices.
771 bool empty() const { return SI == SJ; }
772
773 /// \name Iterate slices that start within the partition.
774 /// These may be splittable or unsplittable. They have a begin offset >= the
775 /// partition begin offset.
776 /// @{
777 // FIXME: We should probably define a "concat_iterator" helper and use that
778 // to stitch together pointee_iterators over the split tails and the
779 // contiguous iterators of the partition. That would give a much nicer
780 // interface here. We could then additionally expose filtered iterators for
781 // split, unsplit, and unsplittable splices based on the usage patterns.
782 iterator begin() const { return SI; }
783 iterator end() const { return SJ; }
784 /// @}
785
786 /// Get the sequence of split slice tails.
787 ///
788 /// These tails are of slices which start before this partition but are
789 /// split and overlap into the partition. We accumulate these while forming
790 /// partitions.
791 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
792};
793
794} // end anonymous namespace
795
796/// An iterator over partitions of the alloca's slices.
797///
798/// This iterator implements the core algorithm for partitioning the alloca's
799/// slices. It is a forward iterator as we don't support backtracking for
800/// efficiency reasons, and re-use a single storage area to maintain the
801/// current set of split slices.
802///
803/// It is templated on the slice iterator type to use so that it can operate
804/// with either const or non-const slice iterators.
806 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
807 Partition> {
808 friend class AllocaSlices;
809
810 /// Most of the state for walking the partitions is held in a class
811 /// with a nice interface for examining them.
812 Partition P;
813
814 /// We need to keep the end of the slices to know when to stop.
815 AllocaSlices::iterator SE;
816
817 /// We also need to keep track of the maximum split end offset seen.
818 /// FIXME: Do we really?
819 uint64_t MaxSplitSliceEndOffset = 0;
820
821 /// Sets the partition to be empty at given iterator, and sets the
822 /// end iterator.
823 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
824 : P(SI), SE(SE) {
825 // If not already at the end, advance our state to form the initial
826 // partition.
827 if (SI != SE)
828 advance();
829 }
830
831 /// Advance the iterator to the next partition.
832 ///
833 /// Requires that the iterator not be at the end of the slices.
///
/// Each call ends in one of four states: the end iterator (no slices left and
/// no pending split tails), an empty partition that covers only the tails of
/// previously split slices, a partition built from an unsplittable slice plus
/// every slice overlapping it, or a partition built from a run of overlapping
/// splittable slices that is cut short at the first overlapping unsplittable
/// slice.
834 void advance() {
835 assert((P.SI != SE || !P.SplitTails.empty()) &&
836 "Cannot advance past the end of the slices!");
837
838 // Clear out any split uses which have ended.
839 if (!P.SplitTails.empty()) {
840 if (P.EndOffset >= MaxSplitSliceEndOffset) {
841 // If we've finished all splits, this is easy.
842 P.SplitTails.clear();
843 MaxSplitSliceEndOffset = 0;
844 } else {
845 // Remove the uses which have ended in the prior partition. This
846 // cannot change the max split slice end because we just checked that
847 // the prior partition ended prior to that max.
848 llvm::erase_if(P.SplitTails,
849 [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
850 assert(llvm::any_of(P.SplitTails,
851 [&](Slice *S) {
852 return S->endOffset() == MaxSplitSliceEndOffset;
853 }) &&
854 "Could not find the current max split slice offset!");
855 assert(llvm::all_of(P.SplitTails,
856 [&](Slice *S) {
857 return S->endOffset() <= MaxSplitSliceEndOffset;
858 }) &&
859 "Max split slice end offset is not actually the max!");
860 }
861 }
862
863 // If P.SI is already at the end, then we've cleared the split tail and
864 // now have an end iterator.
865 if (P.SI == SE) {
866 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
867 return;
868 }
869
870 // If we had a non-empty partition previously, set up the state for
871 // subsequent partitions.
872 if (P.SI != P.SJ) {
873 // Accumulate all the splittable slices which started in the old
874 // partition into the split list.
875 for (Slice &S : P)
876 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
877 P.SplitTails.push_back(&S);
878 MaxSplitSliceEndOffset =
879 std::max(S.endOffset(), MaxSplitSliceEndOffset);
880 }
881
882 // Start from the end of the previous partition.
883 P.SI = P.SJ;
884
885 // If P.SI is now at the end, we at most have a tail of split slices.
886 if (P.SI == SE) {
887 P.BeginOffset = P.EndOffset;
888 P.EndOffset = MaxSplitSliceEndOffset;
889 return;
890 }
891
892 // If we have split slices, and the next slice is after a gap and is not
893 // splittable, immediately form an empty partition for the split slices
894 // up until the next slice begins.
895 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
896 !P.SI->isSplittable()) {
897 P.BeginOffset = P.EndOffset;
898 P.EndOffset = P.SI->beginOffset();
899 return;
900 }
901 }
902
903 // OK, we need to consume new slices. Set the end offset based on the
904 // current slice, and step SJ past it. The beginning offset of the
905 // partition is the beginning offset of the next slice unless we have
906 // pre-existing split slices that are continuing, in which case we begin
907 // at the prior end offset.
908 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
909 P.EndOffset = P.SI->endOffset();
910 ++P.SJ;
911
912 // There are two strategies to form a partition based on whether the
913 // partition starts with an unsplittable slice or a splittable slice.
914 if (!P.SI->isSplittable()) {
915 // When we're forming an unsplittable region, it must always start at
916 // the first slice and will extend through its end.
917 assert(P.BeginOffset == P.SI->beginOffset());
918
919 // Form a partition including all of the overlapping slices with this
920 // unsplittable slice.
// Only unsplittable slices push the end offset outward; overlapping
// splittable slices are stepped over without extending the partition.
921 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
922 if (!P.SJ->isSplittable())
923 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
924 ++P.SJ;
925 }
926
927 // We have a partition across a set of overlapping unsplittable
928 // partitions.
929 return;
930 }
931
932 // If we're starting with a splittable slice, then we need to form
933 // a synthetic partition spanning it and any other overlapping splittable
934 // slices.
935 assert(P.SI->isSplittable() && "Forming a splittable partition!");
936
937 // Collect all of the overlapping splittable slices.
938 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
939 P.SJ->isSplittable()) {
940 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
941 ++P.SJ;
942 }
943
944 // Back up P.EndOffset if we ended the span early when encountering an
945 // unsplittable slice. This synthesizes the early end offset of
946 // a partition spanning only splittable slices.
947 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
948 assert(!P.SJ->isSplittable());
949 P.EndOffset = P.SJ->beginOffset();
950 }
951 }
952
953public:
954 bool operator==(const partition_iterator &RHS) const {
955 assert(SE == RHS.SE &&
956 "End iterators don't match between compared partition iterators!");
957
958 // The observed positions of partitions is marked by the P.SI iterator and
959 // the emptiness of the split slices. The latter is only relevant when
960 // P.SI == SE, as the end iterator will additionally have an empty split
961 // slices list, but the prior may have the same P.SI and a tail of split
962 // slices.
963 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
964 assert(P.SJ == RHS.P.SJ &&
965 "Same set of slices formed two different sized partitions!");
966 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
967 "Same slice position with differently sized non-empty split "
968 "slice tails!");
969 return true;
970 }
971 return false;
972 }
973
// Pre-increment: form the next partition via advance() and return this
// iterator.
974 partition_iterator &operator++() {
975 advance();
976 return *this;
977 }
978
// Access the current partition. The reference is to the iterator's own
// Partition object, which advance() updates in place.
979 Partition &operator*() { return P; }
980};
981
982/// A forward range over the partitions of the alloca's slices.
983///
984/// This accesses an iterator range over the partitions of the alloca's
985/// slices. It computes these partitions on the fly based on the overlapping
986/// offsets of the slices and the ability to split them. It will visit "empty"
987/// partitions to cover regions of the alloca only accessed via split
988/// slices.
989iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
990 return make_range(partition_iterator(begin(), end()),
991 partition_iterator(end(), end()));
992}
993
995 // If the condition being selected on is a constant or the same value is
996 // being selected between, fold the select. Yes this does (rarely) happen
997 // early on.
998 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
999 return SI.getOperand(1 + CI->isZero());
1000 if (SI.getOperand(1) == SI.getOperand(2))
1001 return SI.getOperand(1);
1002
1003 return nullptr;
1004}
1005
1006/// A helper that folds a PHI node or a select.
1008 if (PHINode *PN = dyn_cast<PHINode>(&I)) {
1009 // If PN merges together the same value, return that value.
1010 return PN->hasConstantValue();
1011 }
1013}
1014
1015/// Builder for the alloca slices.
1016///
1017/// This class builds a set of alloca slices by recursively visiting the uses
1018/// of an alloca and making a slice for each load and store at each offset.
1019class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1020 friend class PtrUseVisitor<SliceBuilder>;
1021 friend class InstVisitor<SliceBuilder>;
1022
1023 using Base = PtrUseVisitor<SliceBuilder>;
1024
1025 const uint64_t AllocSize;
1026 AllocaSlices &AS;
1027
1028 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1030
1031 /// Set to de-duplicate dead instructions found in the use walk.
1032 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1033
1034public:
1035 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1037 AllocSize(AI.getAllocationSize(DL)->getFixedValue()), AS(AS) {}
1038
1039private:
1040 void markAsDead(Instruction &I) {
1041 if (VisitedDeadInsts.insert(&I).second)
1042 AS.DeadUsers.push_back(&I);
1043 }
1044
1045 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1046 bool IsSplittable = false) {
1047 // Completely skip uses which have a zero size or start either before or
1048 // past the end of the allocation.
1049 if (Size == 0 || Offset.uge(AllocSize)) {
1050 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1051 << Offset
1052 << " which has zero size or starts outside of the "
1053 << AllocSize << " byte alloca:\n"
1054 << " alloca: " << AS.AI << "\n"
1055 << " use: " << I << "\n");
1056 return markAsDead(I);
1057 }
1058
1059 uint64_t BeginOffset = Offset.getZExtValue();
1060 uint64_t EndOffset = BeginOffset + Size;
1061
1062 // Clamp the end offset to the end of the allocation. Note that this is
1063 // formulated to handle even the case where "BeginOffset + Size" overflows.
1064 // This may appear superficially to be something we could ignore entirely,
1065 // but that is not so! There may be widened loads or PHI-node uses where
1066 // some instructions are dead but not others. We can't completely ignore
1067 // them, and so have to record at least the information here.
1068 assert(AllocSize >= BeginOffset); // Established above.
1069 if (Size > AllocSize - BeginOffset) {
1070 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1071 << Offset << " to remain within the " << AllocSize
1072 << " byte alloca:\n"
1073 << " alloca: " << AS.AI << "\n"
1074 << " use: " << I << "\n");
1075 EndOffset = AllocSize;
1076 }
1077
1078 AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
1079 }
1080
1081 void visitBitCastInst(BitCastInst &BC) {
1082 if (BC.use_empty())
1083 return markAsDead(BC);
1084
1085 return Base::visitBitCastInst(BC);
1086 }
1087
1088 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1089 if (ASC.use_empty())
1090 return markAsDead(ASC);
1091
1092 return Base::visitAddrSpaceCastInst(ASC);
1093 }
1094
1095 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1096 if (GEPI.use_empty())
1097 return markAsDead(GEPI);
1098
1099 return Base::visitGetElementPtrInst(GEPI);
1100 }
1101
1102 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1103 uint64_t Size, bool IsVolatile) {
1104 // We allow splitting of non-volatile loads and stores where the type is an
1105 // integer type. These may be used to implement 'memcpy' or other "transfer
1106 // of bits" patterns.
1107 bool IsSplittable =
1108 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1109
1110 insertUse(I, Offset, Size, IsSplittable);
1111 }
1112
1113 void visitLoadInst(LoadInst &LI) {
1114 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1115 "All simple FCA loads should have been pre-split");
1116
1117 // If there is a load with an unknown offset, we can still perform store
1118 // to load forwarding for other known-offset loads.
1119 if (!IsOffsetKnown)
1120 return PI.setEscapedReadOnly(&LI);
1121
1122 TypeSize Size = DL.getTypeStoreSize(LI.getType());
1123 if (Size.isScalable()) {
1124 unsigned VScale = LI.getFunction()->getVScaleValue();
1125 if (!VScale)
1126 return PI.setAborted(&LI);
1127
1128 Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
1129 }
1130
1131 return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
1132 LI.isVolatile());
1133 }
1134
1135 void visitStoreInst(StoreInst &SI) {
1136 Value *ValOp = SI.getValueOperand();
1137 if (ValOp == *U)
1138 return PI.setEscapedAndAborted(&SI);
1139 if (!IsOffsetKnown)
1140 return PI.setAborted(&SI);
1141
1142 TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
1143 if (StoreSize.isScalable()) {
1144 unsigned VScale = SI.getFunction()->getVScaleValue();
1145 if (!VScale)
1146 return PI.setAborted(&SI);
1147
1148 StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
1149 }
1150
1151 uint64_t Size = StoreSize.getFixedValue();
1152
1153 // If this memory access can be shown to *statically* extend outside the
1154 // bounds of the allocation, it's behavior is undefined, so simply
1155 // ignore it. Note that this is more strict than the generic clamping
1156 // behavior of insertUse. We also try to handle cases which might run the
1157 // risk of overflow.
1158 // FIXME: We should instead consider the pointer to have escaped if this
1159 // function is being instrumented for addressing bugs or race conditions.
1160 if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
1161 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1162 << Offset << " which extends past the end of the "
1163 << AllocSize << " byte alloca:\n"
1164 << " alloca: " << AS.AI << "\n"
1165 << " use: " << SI << "\n");
1166 return markAsDead(SI);
1167 }
1168
1169 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1170 "All simple FCA stores should have been pre-split");
1171 handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
1172 }
1173
1174 void visitMemSetInst(MemSetInst &II) {
1175 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1176 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1177 if ((Length && Length->getValue() == 0) ||
1178 (IsOffsetKnown && Offset.uge(AllocSize)))
1179 // Zero-length mem transfer intrinsics can be ignored entirely.
1180 return markAsDead(II);
1181
1182 if (!IsOffsetKnown)
1183 return PI.setAborted(&II);
1184
1185 insertUse(II, Offset,
1186 Length ? Length->getLimitedValue()
1187 : AllocSize - Offset.getLimitedValue(),
1188 (bool)Length);
1189 }
1190
// Record a slice for one operand (source or destination) of a memory
// transfer intrinsic.
//
// A transfer whose source and destination both derive from this alloca is
// visited once per operand. MemTransferSliceMap remembers the index of the
// slice created on the first visit so that the second visit can kill that
// slice or mark it unsplittable.
1191 void visitMemTransferInst(MemTransferInst &II) {
1192 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1193 if (Length && Length->getValue() == 0)
1194 // Zero-length mem transfer intrinsics can be ignored entirely.
1195 return markAsDead(II);
1196
1197 // Because we can visit these intrinsics twice, also check to see if the
1198 // first time marked this instruction as dead. If so, skip it.
1199 if (VisitedDeadInsts.count(&II))
1200 return;
1201
1202 if (!IsOffsetKnown)
1203 return PI.setAborted(&II);
1204
1205 // This side of the transfer is completely out-of-bounds, and so we can
1206 // nuke the entire transfer. However, we also need to nuke the other side
1207 // if already added to our partitions.
1208 // FIXME: Yet another place we really should bypass this when
1209 // instrumenting for ASan.
1210 if (Offset.uge(AllocSize)) {
1211 auto MTPI = MemTransferSliceMap.find(&II);
1212 if (MTPI != MemTransferSliceMap.end())
1213 AS.Slices[MTPI->second].kill();
1214 return markAsDead(II);
1215 }
1216
1217 uint64_t RawOffset = Offset.getLimitedValue();
// With a non-constant length, cover everything from RawOffset to the end of
// the allocation.
1218 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1219
1220 // Check for the special case where the same exact value is used for both
1221 // source and dest.
1222 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1223 // For non-volatile transfers this is a no-op.
1224 if (!II.isVolatile())
1225 return markAsDead(II);
1226
1227 return insertUse(II, Offset, Size, /*IsSplittable=*/false);
1228 }
1229
1230 // If we have seen both source and destination for a mem transfer, then
1231 // they both point to the same alloca.
1232 bool Inserted;
1233 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
// Tentatively map II to the index the slice inserted below will get. If II
// was already present, the map keeps the index from the first visit.
1234 std::tie(MTPI, Inserted) =
1235 MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
1236 unsigned PrevIdx = MTPI->second;
1237 if (!Inserted) {
1238 Slice &PrevP = AS.Slices[PrevIdx];
1239
1240 // Check if the begin offsets match and this is a non-volatile transfer.
1241 // In that case, we can completely elide the transfer.
1242 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1243 PrevP.kill();
1244 return markAsDead(II);
1245 }
1246
1247 // Otherwise we have an offset transfer within the same alloca. We can't
1248 // split those.
1249 PrevP.makeUnsplittable();
1250 }
1251
1252 // Insert the use now that we've fixed up the splittable nature.
// The slice is splittable only on the first visit and only when the length
// is a constant.
1253 insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1254
1255 // Check that we ended up with a valid index in the map.
1256 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1257 "Map index doesn't point back to a slice with this user.");
1258 }
1259
1260 // Disable SRoA for any intrinsics except for lifetime invariants.
1261 // FIXME: What about debug intrinsics? This matches old behavior, but
1262 // doesn't make sense.
1263 void visitIntrinsicInst(IntrinsicInst &II) {
1264 if (II.isDroppable()) {
1265 AS.DeadUseIfPromotable.push_back(U);
1266 return;
1267 }
1268
1269 if (!IsOffsetKnown)
1270 return PI.setAborted(&II);
1271
1272 if (II.isLifetimeStartOrEnd()) {
1273 insertUse(II, Offset, AllocSize, true);
1274 return;
1275 }
1276
1277 Base::visitIntrinsicInst(II);
1278 }
1279
// Walk the transitive users of the PHI or select \p Root looking for a use
// that prevents slicing.
//
// \p Size is reset to zero and then raised to the largest store size of any
// load or store reached by the walk.
//
// Returns the first offending instruction found, or nullptr if there is
// none. nullptr is also returned, after calling PI.setAborted(), when a
// reached load or store has a scalable size.
1280 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1281 // We consider any PHI or select that results in a direct load or store of
1282 // the same offset to be a viable use for slicing purposes. These uses
1283 // are considered unsplittable and the size is the maximum loaded or stored
1284 // size.
1285 SmallPtrSet<Instruction *, 4> Visited;
1287 Visited.insert(Root);
// Worklist entries are (used instruction, user) pairs; the first entry is
// the user of the current use paired with Root.
1288 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
1289 const DataLayout &DL = Root->getDataLayout();
1290 // If there are no loads or stores, the access is dead. We mark that as
1291 // a size zero access.
1292 Size = 0;
1293 do {
1294 Instruction *I, *UsedI;
1295 std::tie(UsedI, I) = Uses.pop_back_val();
1296
1297 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
1298 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
1299 if (LoadSize.isScalable()) {
1300 PI.setAborted(LI);
1301 return nullptr;
1302 }
1303 Size = std::max(Size, LoadSize.getFixedValue());
1304 continue;
1305 }
1306 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
1307 Value *Op = SI->getOperand(0);
// The pointer is the value being stored rather than the address: unsafe.
1308 if (Op == UsedI)
1309 return SI;
1310 TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
1311 if (StoreSize.isScalable()) {
1312 PI.setAborted(SI);
1313 return nullptr;
1314 }
1315 Size = std::max(Size, StoreSize.getFixedValue());
1316 continue;
1317 }
1318
1319 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
// Only GEPs whose indices are all zero are walked through.
1320 if (!GEP->hasAllZeroIndices())
1321 return GEP;
1322 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
1324 return I;
1325 }
1326
// Queue each not-yet-visited user of I.
1327 for (User *U : I->users())
1328 if (Visited.insert(cast<Instruction>(U)).second)
1329 Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1330 } while (!Uses.empty());
1331
1332 return nullptr;
1333 }
1334
// Shared handling for a PHI node or select that uses the alloca pointer.
//
// Depending on what is found, the instruction is marked dead, the walk is
// aborted, the walk continues through a trivially folded PHI/select, the
// current operand is recorded as dead, or an unsplittable slice is inserted
// whose size comes from hasUnsafePHIOrSelectUse().
1335 void visitPHINodeOrSelectInst(Instruction &I) {
1337 if (I.use_empty())
1338 return markAsDead(I);
1339
1340 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1341 // instructions in this BB, which may be required during rewriting. Bail out
1342 // on these cases.
1343 if (isa<PHINode>(I) && !I.getParent()->hasInsertionPt())
1344 return PI.setAborted(&I);
1345
1346 // TODO: We could use simplifyInstruction here to fold PHINodes and
1347 // SelectInsts. However, doing so requires to change the current
1348 // dead-operand-tracking mechanism. For instance, suppose neither loading
1349 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1350 // trap either. However, if we simply replace %U with undef using the
1351 // current dead-operand-tracking mechanism, "load (select undef, undef,
1352 // %other)" may trap because the select may return the first operand
1353 // "undef".
1354 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1355 if (Result == *U)
1356 // If the result of the constant fold will be the pointer, recurse
1357 // through the PHI/select as if we had RAUW'ed it.
1358 enqueueUsers(I);
1359 else
1360 // Otherwise the operand to the PHI/select is dead, and we can replace
1361 // it with poison.
1362 AS.DeadOperands.push_back(U);
1363
1364 return;
1365 }
1366
1367 if (!IsOffsetKnown)
1368 return PI.setAborted(&I);
1369
1370 // See if we already have computed info on this node.
// A zero entry means either "not analyzed yet" or "analyzed and found no
// loads or stores"; the walk below runs in both cases.
1371 uint64_t &Size = PHIOrSelectSizes[&I];
1372 if (!Size) {
1373 // This is a new PHI/Select, check for an unsafe use of it.
1374 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1375 return PI.setAborted(UnsafeI);
1376 }
1377
1378 // For PHI and select operands outside the alloca, we can't nuke the entire
1379 // phi or select -- the other side might still be relevant, so we special
1380 // case them here and use a separate structure to track the operands
1381 // themselves which should be replaced with poison.
1382 // FIXME: This should instead be escaped in the event we're instrumenting
1383 // for address sanitization.
1384 if (Offset.uge(AllocSize)) {
1385 AS.DeadOperands.push_back(U);
1386 return;
1387 }
1388
1389 insertUse(I, Offset, Size);
1390 }
1391
// PHI nodes are handled by the shared PHI/select logic.
1392 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1393
// Selects are handled by the shared PHI/select logic.
1394 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1395
1396 /// Disable SROA entirely if there are unhandled users of the alloca.
///
/// Records \p I in PI as the instruction that aborted the walk.
1397 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1398
1399 void visitCallBase(CallBase &CB) {
1400 // If the call operand is read-only and only does a read-only or address
1401 // capture, then we mark it as EscapedReadOnly.
1402 if (CB.isDataOperand(U) &&
1403 !capturesFullProvenance(CB.getCaptureInfo(U->getOperandNo())) &&
1404 CB.onlyReadsMemory(U->getOperandNo())) {
1405 PI.setEscapedReadOnly(&CB);
1406 return;
1407 }
1408
1409 Base::visitCallBase(CB);
1410 }
1411};
1412
1413AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1414 :
1415#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1416 AI(AI),
1417#endif
1418 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1419 SliceBuilder PB(DL, AI, *this);
1420 SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1421 if (PtrI.isEscaped() || PtrI.isAborted()) {
1422 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1423 // possibly by just storing the PtrInfo in the AllocaSlices.
1424 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1425 : PtrI.getAbortingInst();
1426 assert(PointerEscapingInstr && "Did not track a bad instruction");
1427 return;
1428 }
1429 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1430
1431 llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1432
1433 // Sort the uses. This arranges for the offsets to be in ascending order,
1434 // and the sizes to be in descending order.
1435 llvm::stable_sort(Slices);
1436}
1437
1438#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1439
// Print one slice to OS: its description via printSlice(), a newline, then
// its user via printUse(). Indent is passed through to both helpers.
1440 void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1441 StringRef Indent) const {
1442 printSlice(OS, I, Indent);
1443 OS << "\n";
1444 printUse(OS, I, Indent);
1445 }
1446
1447void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1448 StringRef Indent) const {
1449 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1450 << " slice #" << (I - begin())
1451 << (I->isSplittable() ? " (splittable)" : "");
1452}
1453
// Print the user of slice I's use to OS, prefixed by Indent and followed by
// a newline.
1454 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1455 StringRef Indent) const {
1456 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1457 }
1458
1459void AllocaSlices::print(raw_ostream &OS) const {
1460 if (PointerEscapingInstr) {
1461 OS << "Can't analyze slices for alloca: " << AI << "\n"
1462 << " A pointer to this alloca escaped by:\n"
1463 << " " << *PointerEscapingInstr << "\n";
1464 return;
1465 }
1466
1467 if (PointerEscapingInstrReadOnly)
1468 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1469
1470 OS << "Slices of alloca: " << AI << "\n";
1471 for (const_iterator I = begin(), E = end(); I != E; ++I)
1472 print(OS, I);
1473}
1474
// Print slice I to the debug stream.
1475 LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1476 print(dbgs(), I);
1477 }
// Print all slices to the debug stream.
1478 LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1479
1480#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1481
1482/// Walk the range of a partitioning looking for a common type to cover this
1483/// sequence of slices.
1484static std::pair<Type *, IntegerType *>
1485findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1486 uint64_t EndOffset) {
1487 Type *Ty = nullptr;
1488 bool TyIsCommon = true;
1489 IntegerType *ITy = nullptr;
1490
1491 // Note that we need to look at *every* alloca slice's Use to ensure we
1492 // always get consistent results regardless of the order of slices.
1493 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1494 Use *U = I->getUse();
1495 if (isa<IntrinsicInst>(*U->getUser()))
1496 continue;
1497 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1498 continue;
1499
1500 Type *UserTy = nullptr;
1501 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1502 UserTy = LI->getType();
1503 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1504 UserTy = SI->getValueOperand()->getType();
1505 }
1506
1507 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1508 // If the type is larger than the partition, skip it. We only encounter
1509 // this for split integer operations where we want to use the type of the
1510 // entity causing the split. Also skip if the type is not a byte width
1511 // multiple.
1512 if (UserITy->getBitWidth() % 8 != 0 ||
1513 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1514 continue;
1515
1516 // Track the largest bitwidth integer type used in this way in case there
1517 // is no common type.
1518 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1519 ITy = UserITy;
1520 }
1521
1522 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1523 // depend on types skipped above.
1524 if (!UserTy || (Ty && Ty != UserTy))
1525 TyIsCommon = false; // Give up on anything but an iN type.
1526 else
1527 Ty = UserTy;
1528 }
1529
1530 return {TyIsCommon ? Ty : nullptr, ITy};
1531}
1532
1533/// PHI instructions that use an alloca and are subsequently loaded can be
1534/// rewritten to load both input pointers in the pred blocks and then PHI the
1535/// results, allowing the load of the alloca to be promoted.
1536/// From this:
1537/// %P2 = phi [i32* %Alloca, i32* %Other]
1538/// %V = load i32* %P2
1539/// to:
1540/// %V1 = load i32* %Alloca -> will be mem2reg'd
1541/// ...
1542/// %V2 = load i32* %Other
1543/// ...
1544/// %V = phi [i32 %V1, i32 %V2]
1545///
1546/// We can do this to a select if its only uses are loads and if the operands
1547/// to the select can be loaded unconditionally.
1548///
1549/// FIXME: This should be hoisted into a generic utility, likely in
1550/// Transforms/Util/Local.h
1552 const DataLayout &DL = PN.getDataLayout();
1553
1554 // For now, we can only do this promotion if the load is in the same block
1555 // as the PHI, and if there are no stores between the phi and load.
1556 // TODO: Allow recursive phi users.
1557 // TODO: Allow stores.
1558 BasicBlock *BB = PN.getParent();
1559 Align MaxAlign;
1560 uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1561 Type *LoadType = nullptr;
1562 for (User *U : PN.users()) {
1564 if (!LI || !LI->isSimple())
1565 return false;
1566
1567 // For now we only allow loads in the same block as the PHI. This is
1568 // a common case that happens when instcombine merges two loads through
1569 // a PHI.
1570 if (LI->getParent() != BB)
1571 return false;
1572
1573 if (LoadType) {
1574 if (LoadType != LI->getType())
1575 return false;
1576 } else {
1577 LoadType = LI->getType();
1578 }
1579
1580 // Ensure that there are no instructions between the PHI and the load that
1581 // could store.
1582 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1583 if (BBI->mayWriteToMemory())
1584 return false;
1585
1586 MaxAlign = std::max(MaxAlign, LI->getAlign());
1587 }
1588
1589 if (!LoadType)
1590 return false;
1591
1592 APInt LoadSize =
1593 APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());
1594
1595 // We can only transform this if it is safe to push the loads into the
1596 // predecessor blocks. The only thing to watch out for is that we can't put
1597 // a possibly trapping load in the predecessor if it is a critical edge.
1598 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1600 Value *InVal = PN.getIncomingValue(Idx);
1601
1602 // If the value is produced by the terminator of the predecessor (an
1603 // invoke) or it has side-effects, there is no valid place to put a load
1604 // in the predecessor.
1605 if (TI == InVal || TI->mayHaveSideEffects())
1606 return false;
1607
1608 // If the predecessor has a single successor, then the edge isn't
1609 // critical.
1610 if (TI->getNumSuccessors() == 1)
1611 continue;
1612
1613 // If this pointer is always safe to load, or if we can prove that there
1614 // is already a load in the block, then we can move the load to the pred
1615 // block.
1616 if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1617 continue;
1618
1619 return false;
1620 }
1621
1622 return true;
1623}
1624
1625static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1626 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1627
1628 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1629 Type *LoadTy = SomeLoad->getType();
1630 IRB.SetInsertPoint(&PN);
1631 PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1632 PN.getName() + ".sroa.speculated");
1633
1634 // Get the AA tags and alignment to use from one of the loads. It does not
1635 // matter which one we get and if any differ.
1636 AAMDNodes AATags = SomeLoad->getAAMetadata();
1637 Align Alignment = SomeLoad->getAlign();
1638
1639 // Rewrite all loads of the PN to use the new PHI.
1640 while (!PN.use_empty()) {
1641 LoadInst *LI = cast<LoadInst>(PN.user_back());
1642 LI->replaceAllUsesWith(NewPN);
1643 LI->eraseFromParent();
1644 }
1645
1646 // Inject loads into all of the pred blocks.
1647 DenseMap<BasicBlock *, Value *> InjectedLoads;
1648 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1649 BasicBlock *Pred = PN.getIncomingBlock(Idx);
1650 Value *InVal = PN.getIncomingValue(Idx);
1651
1652 // A PHI node is allowed to have multiple (duplicated) entries for the same
1653 // basic block, as long as the value is the same. So if we already injected
1654 // a load in the predecessor, then we should reuse the same load for all
1655 // duplicated entries.
1656 if (Value *V = InjectedLoads.lookup(Pred)) {
1657 NewPN->addIncoming(V, Pred);
1658 continue;
1659 }
1660
1661 Instruction *TI = Pred->getTerminator();
1662 IRB.SetInsertPoint(TI);
1663
1664 LoadInst *Load = IRB.CreateAlignedLoad(
1665 LoadTy, InVal, Alignment,
1666 (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1667 ++NumLoadsSpeculated;
1668 if (AATags)
1669 Load->setAAMetadata(AATags);
1670 NewPN->addIncoming(Load, Pred);
1671 InjectedLoads[Pred] = Load;
1672 }
1673
1674 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1675 PN.eraseFromParent();
1676}
1677
1678SelectHandSpeculativity &
1679SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1680 if (isTrueVal)
1682 else
1684 return *this;
1685}
1686
1687bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1688 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
1689 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
1690}
1691
1692bool SelectHandSpeculativity::areAllSpeculatable() const {
1693 return isSpeculatable(/*isTrueVal=*/true) &&
1694 isSpeculatable(/*isTrueVal=*/false);
1695}
1696
1697bool SelectHandSpeculativity::areAnySpeculatable() const {
1698 return isSpeculatable(/*isTrueVal=*/true) ||
1699 isSpeculatable(/*isTrueVal=*/false);
1700}
1701bool SelectHandSpeculativity::areNoneSpeculatable() const {
1702 return !areAnySpeculatable();
1703}
1704
// For a simple load `LI` that reads through select `SI`, work out which of the
// select's two hands pass the per-hand check and return the result as a
// SelectHandSpeculativity.
// NOTE(review): this listing skips source line 1706 (the function name and
// parameter list) and line 1712 (the head of the `if` whose condition ends in
// `&LI))` below) -- restore both from the upstream file before building.
1705static SelectHandSpeculativity
1707 assert(LI.isSimple() && "Only for simple loads");
1708 SelectHandSpeculativity Spec;
1709
1710 const DataLayout &DL = SI.getDataLayout();
 // The true hand is visited first, then the false hand.
1711 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1713 &LI))
1714 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
 // With PreserveCFG set, the first hand that fails the check ends the scan;
 // whatever was recorded up to that point is returned.
1715 else if (PreserveCFG)
1716 return Spec;
1717
1718 return Spec;
1719}
1720
1721std::optional<RewriteableMemOps>
1722SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1723 RewriteableMemOps Ops;
1724
1725 for (User *U : SI.users()) {
1726 if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
1727 U = *BC->user_begin();
1728
1729 if (auto *Store = dyn_cast<StoreInst>(U)) {
1730 // Note that atomic stores can be transformed; atomic semantics do not
1731 // have any meaning for a local alloca. Stores are not speculatable,
1732 // however, so if we can't turn it into a predicated store, we are done.
1733 if (Store->isVolatile() || PreserveCFG)
1734 return {}; // Give up on this `select`.
1735 Ops.emplace_back(Store);
1736 continue;
1737 }
1738
1739 auto *LI = dyn_cast<LoadInst>(U);
1740
1741 // Note that atomic loads can be transformed;
1742 // atomic semantics do not have any meaning for a local alloca.
1743 if (!LI || LI->isVolatile())
1744 return {}; // Give up on this `select`.
1745
1746 PossiblySpeculatableLoad Load(LI);
1747 if (!LI->isSimple()) {
1748 // If the `load` is not simple, we can't speculatively execute it,
1749 // but we could handle this via a CFG modification. But can we?
1750 if (PreserveCFG)
1751 return {}; // Give up on this `select`.
1752 Ops.emplace_back(Load);
1753 continue;
1754 }
1755
1756 SelectHandSpeculativity Spec =
1757 isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG);
1758 if (PreserveCFG && !Spec.areAllSpeculatable())
1759 return {}; // Give up on this `select`.
1760
1761 Load.setInt(Spec);
1762 Ops.emplace_back(Load);
1763 }
1764
1765 return Ops;
1766}
1767
// Replace a simple load `LI` whose pointer is select `SI` by two loads, one
// from each hand of the select, combined by a new select on the same
// condition. All uses of `LI` are redirected to the new select; `LI` itself is
// not erased here.
// NOTE(review): this listing skips source line 1768 (return type, function
// name and the leading parameters `SI` / `LI` that the body uses) -- restore
// it from the upstream file.
1769 IRBuilderTy &IRB) {
 // NOTE(review): the label says "load" but the value streamed is `SI`;
 // confirm whether `LI` was intended.
1770 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1771
1772 Value *TV = SI.getTrueValue();
1773 Value *FV = SI.getFalseValue();
1774 // Replace the given load of the select with a select of two loads.
1775
1776 assert(LI.isSimple() && "We only speculate simple loads");
1777
 // Both new loads are emitted immediately before the original load.
1778 IRB.SetInsertPoint(&LI);
1779
1780 LoadInst *TL =
1781 IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
1782 LI.getName() + ".sroa.speculate.load.true");
1783 LoadInst *FL =
1784 IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
1785 LI.getName() + ".sroa.speculate.load.false");
1786 NumLoadsSpeculated += 2;
1787
1788 // Transfer alignment and AA info if present.
 // The alignment passed here is the same LI.getAlign() already given to
 // CreateAlignedLoad above.
1789 TL->setAlignment(LI.getAlign());
1790 FL->setAlignment(LI.getAlign());
1791
1792 AAMDNodes Tags = LI.getAAMetadata();
1793 if (Tags) {
1794 TL->setAAMetadata(Tags);
1795 FL->setAAMetadata(Tags);
1796 }
1797
 // The last argument hands `SI` to CreateSelect unless
 // ProfcheckDisableMetadataFixes is set -- presumably as the metadata source;
 // verify against the IRBuilder API.
1798 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1799 LI.getName() + ".sroa.speculated",
1800 ProfcheckDisableMetadataFixes ? nullptr : &SI);
1801
1802 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1803 LI.replaceAllUsesWith(V);
1804}
1805
// Rewrite a load or store `I` of select `SI` by branching on the select's
// condition. The block containing `I` is split before `I`; a clone of `I` is
// placed in each successor of the head block (or in the head block itself for
// the successor that is the tail), with its pointer operand replaced by the
// matching hand of the select. For loads, a two-entry PHI in the tail merges
// the clones and replaces all uses of `I`. `I` itself is not erased here.
// NOTE(review): this listing skips source line 1807 (return type, function
// name and the leading parameters `SI` / `I` that the body uses) -- restore it
// from the upstream file.
1806template <typename T>
1808 SelectHandSpeculativity Spec,
1809 DomTreeUpdater &DTU) {
1810 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1811 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1812 BasicBlock *Head = I.getParent();
1813 Instruction *ThenTerm = nullptr;
1814 Instruction *ElseTerm = nullptr;
 // Neither hand flagged: build a full then/else diamond. Otherwise only one
 // side block is created.
1815 if (Spec.areNoneSpeculatable())
1816 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1817 SI.getMetadata(LLVMContext::MD_prof), &DTU);
1818 else {
1819 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1820 SI.getMetadata(LLVMContext::MD_prof), &DTU,
1821 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
 // When the true hand is the flagged one, the branch targets are swapped --
 // presumably so the new side block is reached on a false condition; confirm
 // against SplitBlockAndInsertIfThen's contract.
1822 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1823 cast<CondBrInst>(Head->getTerminator())->swapSuccessors();
1824 }
1825 auto *HeadBI = cast<CondBrInst>(Head->getTerminator());
1826 Spec = {}; // Do not use `Spec` beyond this point.
 // After the split, `I` is in a different block than `Head`; that block is
 // named as the continuation.
1827 BasicBlock *Tail = I.getParent();
1828 Tail->setName(Head->getName() + ".cont");
 // PN is left uninitialized for stores; every later use is guarded by
 // isa<LoadInst>(I).
1829 PHINode *PN;
1830 if (isa<LoadInst>(I))
1831 PN = PHINode::Create(I.getType(), 2, "", I.getIterator());
1832 for (BasicBlock *SuccBB : successors(Head)) {
1833 bool IsThen = SuccBB == HeadBI->getSuccessor(0);
1834 int SuccIdx = IsThen ? 0 : 1;
 // A successor equal to Tail means there is no side block on that edge, so
 // the clone goes into Head.
1835 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1836 auto &CondMemOp = cast<T>(*I.clone());
1837 if (NewMemOpBB != Head) {
1838 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1839 if (isa<LoadInst>(I))
1840 ++NumLoadsPredicated;
1841 else
1842 ++NumStoresPredicated;
1843 } else {
 // The clone placed in Head is no longer guarded by the condition, so
 // UB-implying attributes and metadata are dropped from it.
1844 CondMemOp.dropUBImplyingAttrsAndMetadata();
1845 ++NumLoadsSpeculated;
1846 }
1847 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
 // Select operand 0 is the condition; operands 1 and 2 are the two hands.
1848 Value *Ptr = SI.getOperand(1 + SuccIdx);
1849 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1850 if (isa<LoadInst>(I)) {
1851 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1852 PN->addIncoming(&CondMemOp, NewMemOpBB);
1853 } else
1854 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1855 }
1856 if (isa<LoadInst>(I)) {
1857 PN->takeName(&I);
1858 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1859 I.replaceAllUsesWith(PN);
1860 }
1861}
1862
// Dispatch on the dynamic kind of `I`: loads and stores are forwarded to the
// typed rewriteMemOpOfSelect overload; any other instruction is a programming
// error.
// NOTE(review): this listing skips source line 1863 (return type, function
// name and the leading parameters `SelInst` / `I` that the body uses) --
// restore it from the upstream file.
1864 SelectHandSpeculativity Spec,
1865 DomTreeUpdater &DTU) {
1866 if (auto *LI = dyn_cast<LoadInst>(&I))
1867 rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
1868 else if (auto *SI = dyn_cast<StoreInst>(&I))
1869 rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
1870 else
1871 llvm_unreachable_internal("Only for load and store.");
1872}
1873
// Rewrite every recorded memory operation of select `SI`, erase each rewritten
// instruction, then erase the select's remaining users (which must all be
// bitcasts) and the select itself. Returns true when at least one operation
// took the CFG-modifying path.
// NOTE(review): this listing skips source line 1874 (return type, function
// name and the leading parameter `SI`) and line 1891 (the body of the
// all-speculatable branch below) -- restore both from the upstream file.
1875 const RewriteableMemOps &Ops,
1876 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1877 bool CFGChanged = false;
1878 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1879
1880 for (const RewriteableMemOp &Op : Ops) {
 // Spec stays default-constructed for stores; only loads carry a recorded
 // per-hand result.
1881 SelectHandSpeculativity Spec;
1882 Instruction *I;
1883 if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
1884 I = *US;
1885 } else {
1886 auto PSL = std::get<PossiblySpeculatableLoad>(Op);
1887 I = PSL.getPointer();
1888 Spec = PSL.getInt();
1889 }
1890 if (Spec.areAllSpeculatable()) {
1892 } else {
1893 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1894 rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
1895 CFGChanged = true;
1896 }
1897 I->eraseFromParent();
1898 }
1899
 // cast<> (not dyn_cast<>) is used, so any user left at this point is required
 // to be a BitCastInst. Early-inc iteration allows erasing while walking the
 // user list.
1900 for (User *U : make_early_inc_range(SI.users()))
1901 cast<BitCastInst>(U)->eraseFromParent();
1902 SI.eraseFromParent();
1903 return CFGChanged;
1904}
1905
1906/// Compute an adjusted pointer from Ptr by Offset bytes where the
1907/// resulting pointer has PointerTy.
///
/// A zero Offset emits no pointer arithmetic; the cast helper at the end is
/// always called.
// NOTE(review): this listing skips source line 1909 (the parameters `Offset`
// and `PointerTy` that the body uses) -- restore it from the upstream file.
1908static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1910 const Twine &NamePrefix) {
1911 if (Offset != 0)
1912 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
1913 NamePrefix + "sroa_idx");
1914 return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1915 NamePrefix + "sroa_cast");
1916}
1917
1918/// Compute the adjusted alignment for a load or store from an offset.
1922
1923/// Test whether we can convert a value from the old to the new type.
1924///
1925/// This predicate should be used to guard calls to convertValue in order to
1926/// ensure that we only try to convert viable values. The strategy is that we
1927/// will peel off single element struct and array wrappings to get to an
1928/// underlying value, and convert that value.
///
/// \p VScale defaults to 0, which makes any fixed/scalable vector mix
/// non-convertible.
// NOTE(review): this listing skips source lines 1938 (the head of the assert
// in the integer branch), 1960 and 1965 (the `if` heads guarding the two
// `return false;` statements in the fixed/scalable branch; `OldVTy` and
// `NewVTy` have no other visible use, so they are presumably consumed there)
// -- restore them from the upstream file.
1929static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1930 unsigned VScale = 0) {
1931 if (OldTy == NewTy)
1932 return true;
1933
1934 // For integer types, we can't handle any bit-width differences. This would
1935 // break both vector conversions with extension and introduce endianness
1936 // issues when in conjunction with loads and stores.
1937 if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1939 cast<IntegerType>(NewTy)->getBitWidth() &&
1940 "We can't have the same bitwidth for different int types");
1941 return false;
1942 }
1943
1944 TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
1945 TypeSize OldSize = DL.getTypeSizeInBits(OldTy);
1946
1947 if ((isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) ||
1948 (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy))) {
1949 // Conversion is only possible when the size of scalable vectors is known.
1950 if (!VScale)
1951 return false;
1952
1953 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1954 // a single domain (either fixed or scalable). Any additional conversion
1955 // between fixed and scalable types is handled through integer types.
1956 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1957 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1958
 // Whichever side is scalable has its minimum size multiplied by VScale so
 // the size comparison below is between two fixed sizes.
1959 if (isa<ScalableVectorType>(NewTy)) {
1961 return false;
1962
1963 NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
1964 } else {
1966 return false;
1967
1968 OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
1969 }
1970 }
1971
1972 if (NewSize != OldSize)
1973 return false;
1974 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1975 return false;
1976
1977 // We can convert pointers to integers and vice-versa. Same for vectors
1978 // of pointers and integers.
 // From here on both types are reduced to their scalar element type.
1979 OldTy = OldTy->getScalarType();
1980 NewTy = NewTy->getScalarType();
1981 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1982 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1983 unsigned OldAS = OldTy->getPointerAddressSpace();
1984 unsigned NewAS = NewTy->getPointerAddressSpace();
1985 // Convert pointers if they are pointers from the same address space or
1986 // different integral (not non-integral) address spaces with the same
1987 // pointer size.
1988 return OldAS == NewAS ||
1989 (!DL.isNonIntegralAddressSpace(OldAS) &&
1990 !DL.isNonIntegralAddressSpace(NewAS) &&
1991 DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1992 }
1993
1994 // We can convert integers to integral pointers, but not to non-integral
1995 // pointers.
1996 if (OldTy->isIntegerTy())
1997 return !DL.isNonIntegralPointerType(NewTy);
1998
1999 // We can convert integral pointers to integers, but non-integral pointers
2000 // need to remain pointers.
2001 if (!DL.isNonIntegralPointerType(OldTy))
2002 return NewTy->isIntegerTy();
2003
2004 return false;
2005 }
2006
2007 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
2008 return false;
2009
2010 return true;
2011}
2012
2013/// Test whether the given slice use can be promoted to a vector.
2014///
2015/// This function is called to test each entry in a partition which is slated
2016/// for a single slice.
2017static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2018 VectorType *Ty,
2019 uint64_t ElementSize,
2020 const DataLayout &DL,
2021 unsigned VScale) {
2022 // First validate the slice offsets.
2023 uint64_t BeginOffset =
2024 std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
2025 uint64_t BeginIndex = BeginOffset / ElementSize;
2026 if (BeginIndex * ElementSize != BeginOffset ||
2027 BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
2028 return false;
2029 uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
2030 uint64_t EndIndex = EndOffset / ElementSize;
2031 if (EndIndex * ElementSize != EndOffset ||
2032 EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
2033 return false;
2034
2035 assert(EndIndex > BeginIndex && "Empty vector!");
2036 uint64_t NumElements = EndIndex - BeginIndex;
2037 Type *SliceTy = (NumElements == 1)
2038 ? Ty->getElementType()
2039 : FixedVectorType::get(Ty->getElementType(), NumElements);
2040
2041 Type *SplitIntTy =
2042 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
2043
2044 Use *U = S.getUse();
2045
2046 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2047 if (MI->isVolatile())
2048 return false;
2049 if (!S.isSplittable())
2050 return false; // Skip any unsplittable intrinsics.
2051 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2052 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2053 return false;
2054 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2055 if (LI->isVolatile())
2056 return false;
2057 Type *LTy = LI->getType();
2058 // Disable vector promotion when there are loads or stores of an FCA.
2059 if (LTy->isStructTy())
2060 return false;
2061 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2062 assert(LTy->isIntegerTy());
2063 LTy = SplitIntTy;
2064 }
2065 if (!canConvertValue(DL, SliceTy, LTy, VScale))
2066 return false;
2067 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2068 if (SI->isVolatile())
2069 return false;
2070 Type *STy = SI->getValueOperand()->getType();
2071 // Disable vector promotion when there are loads or stores of an FCA.
2072 if (STy->isStructTy())
2073 return false;
2074 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2075 assert(STy->isIntegerTy());
2076 STy = SplitIntTy;
2077 }
2078 if (!canConvertValue(DL, STy, SliceTy, VScale))
2079 return false;
2080 } else {
2081 return false;
2082 }
2083
2084 return true;
2085}
2086
2087/// Test whether any vector type in \p CandidateTys is viable for promotion.
2088///
2089/// This implements the necessary checking for \c isVectorPromotionViable over
2090/// all slices of the alloca for the given VectorType.
///
/// \p CandidateTys is modified in place (cleared, rewritten to integer
/// vectors, de-duplicated or truncated) before the slices are scanned.
/// \returns the first surviving candidate that every slice and split slice
/// tail accepts, or nullptr.
// NOTE(review): this listing skips source line 2092 (the function name and the
// leading parameters `P` / `DL` that the body uses) -- restore it from the
// upstream file.
2091static VectorType *
2093 SmallVectorImpl<VectorType *> &CandidateTys,
2094 bool HaveCommonEltTy, Type *CommonEltTy,
2095 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2096 VectorType *CommonVecPtrTy, unsigned VScale) {
2097 // If we didn't find a vector type, nothing to do here.
2098 if (CandidateTys.empty())
2099 return nullptr;
2100
2101 // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
2102 // then we should choose it, not some other alternative.
2103 // But, we can't perform a no-op pointer address space change via bitcast,
2104 // so if we didn't have a common pointer element type, bail.
2105 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2106 return nullptr;
2107
2108 // Try to pick the "best" element type out of the choices.
2109 if (!HaveCommonEltTy && HaveVecPtrTy) {
2110 // If there was a pointer element type, there's really only one choice.
2111 CandidateTys.clear();
2112 CandidateTys.push_back(CommonVecPtrTy);
2113 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2114 // Integer-ify vector types.
2115 for (VectorType *&VTy : CandidateTys) {
2116 if (!VTy->getElementType()->isIntegerTy())
2117 VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
2118 VTy->getContext(), VTy->getScalarSizeInBits())));
2119 }
2120
2121 // Rank the remaining candidate vector types. This is easy because we know
2122 // they're all integer vectors. We sort by ascending number of elements.
 // DL is captured only for the asserts; the (void) cast keeps builds without
 // asserts free of unused-capture warnings.
2123 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2124 (void)DL;
2125 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2126 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2127 "Cannot have vector types of different sizes!");
2128 assert(RHSTy->getElementType()->isIntegerTy() &&
2129 "All non-integer types eliminated!");
2130 assert(LHSTy->getElementType()->isIntegerTy() &&
2131 "All non-integer types eliminated!");
2132 return cast<FixedVectorType>(RHSTy)->getNumElements() <
2133 cast<FixedVectorType>(LHSTy)->getNumElements();
2134 };
 // Equality counterpart of the comparator above, used to drop duplicates
 // after sorting.
2135 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2136 (void)DL;
2137 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2138 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2139 "Cannot have vector types of different sizes!");
2140 assert(RHSTy->getElementType()->isIntegerTy() &&
2141 "All non-integer types eliminated!");
2142 assert(LHSTy->getElementType()->isIntegerTy() &&
2143 "All non-integer types eliminated!");
2144 return cast<FixedVectorType>(RHSTy)->getNumElements() ==
2145 cast<FixedVectorType>(LHSTy)->getNumElements();
2146 };
2147 llvm::sort(CandidateTys, RankVectorTypesComp);
2148 CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq),
2149 CandidateTys.end());
2150 } else {
2151// The only way to have the same element type in every vector type is to
2152// have the same vector type. Check that and remove all but one.
2153#ifndef NDEBUG
2154 for (VectorType *VTy : CandidateTys) {
2155 assert(VTy->getElementType() == CommonEltTy &&
2156 "Unaccounted for element type!");
2157 assert(VTy == CandidateTys[0] &&
2158 "Different vector types with the same element type!");
2159 }
2160#endif
2161 CandidateTys.resize(1);
2162 }
2163
2164 // FIXME: hack. Do we have a named constant for this?
2165 // SDAG SDNode can't have more than 65535 operands.
2166 llvm::erase_if(CandidateTys, [](VectorType *VTy) {
2167 return cast<FixedVectorType>(VTy)->getNumElements() >
2168 std::numeric_limits<unsigned short>::max();
2169 });
2170
2171 // Find a vector type viable for promotion by iterating over all slices.
2172 auto *VTy = llvm::find_if(CandidateTys, [&](VectorType *VTy) -> bool {
2173 uint64_t ElementSize =
2174 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2175
2176 // While the definition of LLVM vectors is bitpacked, we don't support sizes
2177 // that aren't byte sized.
2178 if (ElementSize % 8)
2179 return false;
2180 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2181 "vector size not a multiple of element size?");
 // Bits to bytes: the per-slice check takes the element size in bytes.
2182 ElementSize /= 8;
2183
2184 for (const Slice &S : P)
2185 if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
2186 return false;
2187
2188 for (const Slice *S : P.splitSliceTails())
2189 if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
2190 return false;
2191
2192 return true;
2193 });
2194 return VTy != CandidateTys.end() ? *VTy : nullptr;
2195}
2196
// For each type in `OtherTys`, derive extra fixed-vector candidates sized to
// match the vectors in `CandidateTysCopy`, feed each one to
// `CheckCandidateType`, and then pass the accumulated state on (the argument
// list at the end of the body).
// NOTE(review): this listing skips source lines 2197 (return type and function
// name), 2208 (the `if` head guarding the `continue` below) and 2227 (the head
// of the final statement whose arguments start with `P, DL, CandidateTys`) --
// restore them from the upstream file.
2198 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2199 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2200 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2201 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2202 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
 // Only referenced by the assert in the inner loop, hence [[maybe_unused]].
2203 [[maybe_unused]] VectorType *OriginalElt =
2204 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2205 // Consider additional vector types where the element type size is a
2206 // multiple of load/store element size.
2207 for (Type *Ty : OtherTys) {
2209 continue;
2210 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2211 // Make a copy of CandidateTys and iterate through it, because we
2212 // might append to CandidateTys in the loop.
2213 for (VectorType *const VTy : CandidateTysCopy) {
2214 // The elements in the copy should remain invariant throughout the loop
2215 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2216 unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
2217 unsigned ElementSize =
2218 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
 // Only sizes that differ from both the whole vector and its element, and
 // that divide the vector size evenly, yield a new candidate.
2219 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2220 VectorSize % TypeSize == 0) {
2221 VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
2222 CheckCandidateType(NewVTy);
2223 }
2224 }
2225 }
2226
2228 P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2229 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2230}
2231
2232/// Test whether the given alloca partitioning and range of slices can be
2233/// promoted to a vector.
2234///
2235/// This is a quick test to check whether we can rewrite a particular alloca
2236/// partition (and its newly formed alloca) into a vector alloca with only
2237/// whole-vector loads and stores such that it could be promoted to a vector
2238/// SSA value. We only can ensure this for a limited set of operations, and we
2239/// don't want to do the rewrites unless we are confident that the result will
2240/// be promotable, so we have an early test here.
// NOTE(review): this listing skips source lines 2241 (return type, function
// name and the leading parameters `P` / `DL` that the body uses), 2306 (the
// head of the `if` that binds `VTy` and returns it) and 2313 (the head of the
// final statement) -- restore them from the upstream file.
2242 unsigned VScale) {
2243 // Collect the candidate types for vector-based promotion. Also track whether
2244 // we have different element types.
2245 SmallVector<VectorType *, 4> CandidateTys;
2246 SetVector<Type *> LoadStoreTys;
2247 SetVector<Type *> DeferredTys;
2248 Type *CommonEltTy = nullptr;
2249 VectorType *CommonVecPtrTy = nullptr;
2250 bool HaveVecPtrTy = false;
2251 bool HaveCommonEltTy = true;
2252 bool HaveCommonVecPtrTy = true;
 // Records a fixed-vector type as a candidate and updates the common-type
 // tracking flags above. Non-fixed-vector types are ignored.
2253 auto CheckCandidateType = [&](Type *Ty) {
2254 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2255 // Return if bitcast to vectors is different for total size in bits.
 // A size mismatch against the first candidate discards every candidate
 // collected so far, not just the new one.
2256 if (!CandidateTys.empty()) {
2257 VectorType *V = CandidateTys[0];
2258 if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
2259 DL.getTypeSizeInBits(V).getFixedValue()) {
2260 CandidateTys.clear();
2261 return;
2262 }
2263 }
2264 CandidateTys.push_back(VTy);
2265 Type *EltTy = VTy->getElementType();
2266
2267 if (!CommonEltTy)
2268 CommonEltTy = EltTy;
2269 else if (CommonEltTy != EltTy)
2270 HaveCommonEltTy = false;
2271
2272 if (EltTy->isPointerTy()) {
2273 HaveVecPtrTy = true;
2274 if (!CommonVecPtrTy)
2275 CommonVecPtrTy = VTy;
2276 else if (CommonVecPtrTy != VTy)
2277 HaveCommonVecPtrTy = false;
2278 }
2279 }
2280 };
2281
2282 // Put load and store types into a set for de-duplication.
2283 for (const Slice &S : P) {
2284 Type *Ty;
2285 if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
2286 Ty = LI->getType();
2287 else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
2288 Ty = SI->getValueOperand()->getType();
2289 else
2290 continue;
2291
 // Pointer-typed accesses that do not span the whole partition are set aside
 // in DeferredTys and only used by the second attempt at the end.
2292 auto CandTy = Ty->getScalarType();
2293 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2294 S.endOffset() != P.endOffset())) {
2295 DeferredTys.insert(Ty);
2296 continue;
2297 }
2298
2299 LoadStoreTys.insert(Ty);
2300 // Consider any loads or stores that are the exact size of the slice.
2301 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2302 CheckCandidateType(Ty);
2303 }
2304
 // Snapshot taken before the first attempt; the same snapshot is passed to
 // both attempts below.
2305 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2307 LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2308 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2309 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2310 return VTy;
2311
 // Second attempt starts from an empty candidate list and uses the deferred
 // types.
2312 CandidateTys.clear();
2314 DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2315 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2316 CommonVecPtrTy, VScale);
2317}
2318
2319/// Test whether a slice of an alloca is valid for integer widening.
2320///
2321/// This implements the necessary checking for the \c isIntegerWideningViable
2322/// test below on a single slice of the alloca.
2323static bool isIntegerWideningViableForSlice(const Slice &S,
2324 uint64_t AllocBeginOffset,
2325 Type *AllocaTy,
2326 const DataLayout &DL,
2327 bool &WholeAllocaOp) {
2328 uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();
2329
2330 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2331 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2332
2333 Use *U = S.getUse();
2334
2335 // Lifetime intrinsics operate over the whole alloca whose sizes are usually
2336 // larger than other load/store slices (RelEnd > Size). But lifetime are
2337 // always promotable and should not impact other slices' promotability of the
2338 // partition.
2339 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2340 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2341 return true;
2342 }
2343
2344 // We can't reasonably handle cases where the load or store extends past
2345 // the end of the alloca's type and into its padding.
2346 if (RelEnd > Size)
2347 return false;
2348
2349 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2350 if (LI->isVolatile())
2351 return false;
2352 // We can't handle loads that extend past the allocated memory.
2353 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
2354 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2355 return false;
2356 // So far, AllocaSliceRewriter does not support widening split slice tails
2357 // in rewriteIntegerLoad.
2358 if (S.beginOffset() < AllocBeginOffset)
2359 return false;
2360 // Note that we don't count vector loads or stores as whole-alloca
2361 // operations which enable integer widening because we would prefer to use
2362 // vector widening instead.
2363 if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2364 WholeAllocaOp = true;
2365 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2366 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2367 return false;
2368 } else if (RelBegin != 0 || RelEnd != Size ||
2369 !canConvertValue(DL, AllocaTy, LI->getType())) {
2370 // Non-integer loads need to be convertible from the alloca type so that
2371 // they are promotable.
2372 return false;
2373 }
2374 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2375 Type *ValueTy = SI->getValueOperand()->getType();
2376 if (SI->isVolatile())
2377 return false;
2378 // We can't handle stores that extend past the allocated memory.
2379 TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
2380 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2381 return false;
2382 // So far, AllocaSliceRewriter does not support widening split slice tails
2383 // in rewriteIntegerStore.
2384 if (S.beginOffset() < AllocBeginOffset)
2385 return false;
2386 // Note that we don't count vector loads or stores as whole-alloca
2387 // operations which enable integer widening because we would prefer to use
2388 // vector widening instead.
2389 if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2390 WholeAllocaOp = true;
2391 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2392 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2393 return false;
2394 } else if (RelBegin != 0 || RelEnd != Size ||
2395 !canConvertValue(DL, ValueTy, AllocaTy)) {
2396 // Non-integer stores need to be convertible to the alloca type so that
2397 // they are promotable.
2398 return false;
2399 }
2400 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2401 if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2402 return false;
2403 if (!S.isSplittable())
2404 return false; // Skip any unsplittable intrinsics.
2405 } else {
2406 return false;
2407 }
2408
2409 return true;
2410}
2411
2412/// Test whether the given alloca partition's integer operations can be
2413/// widened to promotable ones.
2414///
2415/// This is a quick test to check whether we can rewrite the integer loads and
2416/// stores to a particular alloca into wider loads and stores and be able to
2417/// promote the resulting alloca.
2418static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2419 const DataLayout &DL) {
2420 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
2421 // Don't create integer types larger than the maximum bitwidth.
2422 if (SizeInBits > IntegerType::MAX_INT_BITS)
2423 return false;
2424
2425 // Don't try to handle allocas with bit-padding.
2426 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
2427 return false;
2428
2429 // We need to ensure that an integer type with the appropriate bitwidth can
2430 // be converted to the alloca type, whatever that is. We don't want to force
2431 // the alloca itself to have an integer type if there is a more suitable one.
2432 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2433 if (!canConvertValue(DL, AllocaTy, IntTy) ||
2434 !canConvertValue(DL, IntTy, AllocaTy))
2435 return false;
2436
2437 // While examining uses, we ensure that the alloca has a covering load or
2438 // store. We don't want to widen the integer operations only to fail to
2439 // promote due to some other unsplittable entry (which we may make splittable
2440 // later). However, if there are only splittable uses, go ahead and assume
2441 // that we cover the alloca.
2442 // FIXME: We shouldn't consider split slices that happen to start in the
2443 // partition here...
2444 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2445
2446 for (const Slice &S : P)
2447 if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2448 WholeAllocaOp))
2449 return false;
2450
2451 for (const Slice *S : P.splitSliceTails())
2452 if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2453 WholeAllocaOp))
2454 return false;
2455
2456 return WholeAllocaOp;
2457}
2458
// Extract an integer of type `Ty` located `Offset` bytes into the wider
// integer value `V`: shift the wanted bytes down to the low end, then truncate
// to `Ty`. The shift and the truncate are each skipped when not needed.
// NOTE(review): this listing skips source line 2460 (the parameters `Ty` and
// `Offset` that the body uses) -- restore it from the upstream file.
2459static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2461 const Twine &Name) {
2462 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2463 IntegerType *IntTy = cast<IntegerType>(V->getType());
2464 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2465 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2466 "Element extends past full value");
 // Little-endian: the byte offset maps directly to a bit shift. Big-endian:
 // the shift is counted from the other end of the wide value.
2467 uint64_t ShAmt = 8 * Offset;
2468 if (DL.isBigEndian())
2469 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2470 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2471 if (ShAmt) {
2472 V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2473 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2474 }
2475 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2476 "Cannot extract to a larger integer!");
2477 if (Ty != IntTy) {
2478 V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2479 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2480 }
2481 return V;
2482}
2483
2484static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2485 Value *V, uint64_t Offset, const Twine &Name) {
2486 IntegerType *IntTy = cast<IntegerType>(Old->getType());
2487 IntegerType *Ty = cast<IntegerType>(V->getType());
2488 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2489 "Cannot insert a larger integer!");
2490 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2491 if (Ty != IntTy) {
2492 V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2493 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2494 }
2495 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2496 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2497 "Element store outside of alloca store");
2498 uint64_t ShAmt = 8 * Offset;
2499 if (DL.isBigEndian())
2500 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2501 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2502 if (ShAmt) {
2503 V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2504 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2505 }
2506
2507 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2508 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2509 Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2510 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2511 V = IRB.CreateOr(Old, V, Name + ".insert");
2512 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2513 }
2514 return V;
2515}
2516
2517static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2518 unsigned EndIndex, const Twine &Name) {
2519 auto *VecTy = cast<FixedVectorType>(V->getType());
2520 unsigned NumElements = EndIndex - BeginIndex;
2521 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2522
2523 if (NumElements == VecTy->getNumElements())
2524 return V;
2525
2526 if (NumElements == 1) {
2527 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2528 Name + ".extract");
2529 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2530 return V;
2531 }
2532
2533 auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2534 V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2535 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2536 return V;
2537}
2538
2539static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2540 unsigned BeginIndex, const Twine &Name) {
2541 VectorType *VecTy = cast<VectorType>(Old->getType());
2542 assert(VecTy && "Can only insert a vector into a vector");
2543
2544 VectorType *Ty = dyn_cast<VectorType>(V->getType());
2545 if (!Ty) {
2546 // Single element to insert.
2547 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2548 Name + ".insert");
2549 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2550 return V;
2551 }
2552
2553 unsigned NumSubElements = cast<FixedVectorType>(Ty)->getNumElements();
2554 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
2555
2556 assert(NumSubElements <= NumElements && "Too many elements!");
2557 if (NumSubElements == NumElements) {
2558 assert(V->getType() == VecTy && "Vector type mismatch");
2559 return V;
2560 }
2561 unsigned EndIndex = BeginIndex + NumSubElements;
2562
2563 // When inserting a smaller vector into the larger to store, we first
2564 // use a shuffle vector to widen it with undef elements, and then
2565 // a second shuffle vector to select between the loaded vector and the
2566 // incoming vector.
2568 Mask.reserve(NumElements);
2569 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2570 if (Idx >= BeginIndex && Idx < EndIndex)
2571 Mask.push_back(Idx - BeginIndex);
2572 else
2573 Mask.push_back(-1);
2574 V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2575 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2576
2577 Mask.clear();
2578 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2579 if (Idx >= BeginIndex && Idx < EndIndex)
2580 Mask.push_back(Idx);
2581 else
2582 Mask.push_back(Idx + NumElements);
2583 V = IRB.CreateShuffleVector(V, Old, Mask, Name + "blend");
2584 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2585 return V;
2586}
2587
2588/// This function takes two vector values and combines them into a single vector
2589/// by concatenating their elements. The function handles:
2590///
2591/// 1. Element type mismatch: If either vector's element type differs from
2592/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while
2593/// preserving the total bit width (adjusting the number of elements
2594/// accordingly).
2595///
2596/// 2. Size mismatch: After transforming the vectors to have the desired element
2597/// type, if the two vectors have different numbers of elements, the smaller
2598/// vector is extended with poison values to match the size of the larger
2599/// vector before concatenation.
2600///
2601/// 3. Concatenation: The vectors are merged using a shuffle operation that
2602/// places all elements of V0 first, followed by all elements of V1.
2603///
2604/// \param V0 The first vector to merge (must be a vector type)
2605/// \param V1 The second vector to merge (must be a vector type)
2606/// \param DL The data layout for size calculations
2607/// \param NewAIEltTy The desired element type for the result vector
2608/// \param Builder IRBuilder for creating new instructions
2609/// \return A new vector containing all elements from V0 followed by all
2610/// elements from V1
2612 Type *NewAIEltTy, IRBuilder<> &Builder) {
2613 // V0 and V1 are vectors
2614 // Create a new vector type with combined elements
2615 // Use ShuffleVector to concatenate the vectors
2616 auto *VecType0 = cast<FixedVectorType>(V0->getType());
2617 auto *VecType1 = cast<FixedVectorType>(V1->getType());
2618
2619 // If V0/V1 element types are different from NewAllocaElementType,
2620 // we need to introduce bitcasts before merging them
2621 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2622 const char *DebugName) {
2623 Type *EltType = VecType->getElementType();
2624 if (EltType != NewAIEltTy) {
2625 // Calculate new number of elements to maintain same bit width
2626 unsigned TotalBits =
2627 VecType->getNumElements() * DL.getTypeSizeInBits(EltType);
2628 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
2629
2630 auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts);
2631 V = Builder.CreateBitCast(V, NewVecType);
2632 VecType = NewVecType;
2633 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2634 }
2635 };
2636
2637 BitcastIfNeeded(V0, VecType0, "V0");
2638 BitcastIfNeeded(V1, VecType1, "V1");
2639
2640 unsigned NumElts0 = VecType0->getNumElements();
2641 unsigned NumElts1 = VecType1->getNumElements();
2642
2643 SmallVector<int, 16> ShuffleMask;
2644
2645 if (NumElts0 == NumElts1) {
2646 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2647 ShuffleMask.push_back(i);
2648 } else {
2649 // If two vectors have different sizes, we need to extend
2650 // the smaller vector to the size of the larger vector.
2651 unsigned SmallSize = std::min(NumElts0, NumElts1);
2652 unsigned LargeSize = std::max(NumElts0, NumElts1);
2653 bool IsV0Smaller = NumElts0 < NumElts1;
2654 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2655 SmallVector<int, 16> ExtendMask;
2656 for (unsigned i = 0; i < SmallSize; ++i)
2657 ExtendMask.push_back(i);
2658 for (unsigned i = SmallSize; i < LargeSize; ++i)
2659 ExtendMask.push_back(PoisonMaskElem);
2660 ExtendedVec = Builder.CreateShuffleVector(
2661 ExtendedVec, PoisonValue::get(ExtendedVec->getType()), ExtendMask);
2662 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2663 for (unsigned i = 0; i < NumElts0; ++i)
2664 ShuffleMask.push_back(i);
2665 for (unsigned i = 0; i < NumElts1; ++i)
2666 ShuffleMask.push_back(LargeSize + i);
2667 }
2668
2669 return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2670}
2671
2672namespace {
2673
/// Visitor to rewrite instructions using a particular slice of an alloca
2675/// to use a new alloca.
2676///
2677/// Also implements the rewriting to vector-based accesses when the partition
2678/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2679/// lives here.
2680class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2681 // Befriend the base class so it can delegate to private visit methods.
2682 friend class InstVisitor<AllocaSliceRewriter, bool>;
2683
2684 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2685
2686 const DataLayout &DL;
2687 AllocaSlices &AS;
2688 SROA &Pass;
2689 AllocaInst &OldAI, &NewAI;
2690 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2691 Type *NewAllocaTy;
2692
2693 // This is a convenience and flag variable that will be null unless the new
2694 // alloca's integer operations should be widened to this integer type due to
2695 // passing isIntegerWideningViable above. If it is non-null, the desired
2696 // integer type will be stored here for easy access during rewriting.
2697 IntegerType *IntTy;
2698
2699 // If we are rewriting an alloca partition which can be written as pure
2700 // vector operations, we stash extra information here. When VecTy is
2701 // non-null, we have some strict guarantees about the rewritten alloca:
2702 // - The new alloca is exactly the size of the vector type here.
2703 // - The accesses all either map to the entire vector or to a single
2704 // element.
2705 // - The set of accessing instructions is only one of those handled above
2706 // in isVectorPromotionViable. Generally these are the same access kinds
2707 // which are promotable via mem2reg.
2708 VectorType *VecTy;
2709 Type *ElementTy;
2710 uint64_t ElementSize;
2711
2712 // The original offset of the slice currently being rewritten relative to
2713 // the original alloca.
2714 uint64_t BeginOffset = 0;
2715 uint64_t EndOffset = 0;
2716
2717 // The new offsets of the slice currently being rewritten relative to the
2718 // original alloca.
2719 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2720
2721 uint64_t SliceSize = 0;
2722 bool IsSplittable = false;
2723 bool IsSplit = false;
2724 Use *OldUse = nullptr;
2725 Instruction *OldPtr = nullptr;
2726
2727 // Track post-rewrite users which are PHI nodes and Selects.
2728 SmallSetVector<PHINode *, 8> &PHIUsers;
2729 SmallSetVector<SelectInst *, 8> &SelectUsers;
2730
2731 // Utility IR builder, whose name prefix is setup for each visited use, and
2732 // the insertion point is set to point to the user.
2733 IRBuilderTy IRB;
2734
2735 // Return the new alloca, addrspacecasted if required to avoid changing the
2736 // addrspace of a volatile access.
2737 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2738 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2739 return &NewAI;
2740
2741 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2742 return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
2743 }
2744
2745public:
2746 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2747 AllocaInst &OldAI, AllocaInst &NewAI, Type *NewAllocaTy,
2748 uint64_t NewAllocaBeginOffset,
2749 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2750 VectorType *PromotableVecTy,
2751 SmallSetVector<PHINode *, 8> &PHIUsers,
2752 SmallSetVector<SelectInst *, 8> &SelectUsers)
2753 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2754 NewAllocaBeginOffset(NewAllocaBeginOffset),
2755 NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAllocaTy),
2756 IntTy(IsIntegerPromotable
2757 ? Type::getIntNTy(
2758 NewAI.getContext(),
2759 DL.getTypeSizeInBits(NewAllocaTy).getFixedValue())
2760 : nullptr),
2761 VecTy(PromotableVecTy),
2762 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2763 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
2764 : 0),
2765 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2766 IRB(NewAI.getContext(), ConstantFolder()) {
2767 if (VecTy) {
2768 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2769 "Only multiple-of-8 sized vector elements are viable");
2770 ++NumVectorized;
2771 }
2772 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2773 }
2774
2775 bool visit(AllocaSlices::const_iterator I) {
2776 bool CanSROA = true;
2777 BeginOffset = I->beginOffset();
2778 EndOffset = I->endOffset();
2779 IsSplittable = I->isSplittable();
2780 IsSplit =
2781 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2782 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2783 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2784 LLVM_DEBUG(dbgs() << "\n");
2785
2786 // Compute the intersecting offset range.
2787 assert(BeginOffset < NewAllocaEndOffset);
2788 assert(EndOffset > NewAllocaBeginOffset);
2789 NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2790 NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2791
2792 SliceSize = NewEndOffset - NewBeginOffset;
2793 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2794 << ") NewBegin:(" << NewBeginOffset << ", "
2795 << NewEndOffset << ") NewAllocaBegin:("
2796 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2797 << ")\n");
2798 assert(IsSplit || NewBeginOffset == BeginOffset);
2799 OldUse = I->getUse();
2800 OldPtr = cast<Instruction>(OldUse->get());
2801
2802 Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2803 IRB.SetInsertPoint(OldUserI);
2804 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2805 // Avoid materializing the name prefix when it is discarded anyway.
2806 if (!IRB.getContext().shouldDiscardValueNames())
2807 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2808 Twine(BeginOffset) + ".");
2809
2810 CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2811 if (VecTy || IntTy)
2812 assert(CanSROA);
2813 return CanSROA;
2814 }
2815
2816 /// Attempts to rewrite a partition using tree-structured merge optimization.
2817 ///
2818 /// This function handles two patterns. Both produce an O(log n) tree of
2819 /// shufflevectors in place of the linear expand+blend chain that SROA would
2820 /// otherwise emit for each partial store.
2821 ///
2822 /// Pattern 1 (stores-only):
2823 /// Multiple non-overlapping partial stores completely fill the alloca
2824 /// and there is exactly one full-width load coming after the stores.
2825 /// The stores are tree-merged into a single vector and stored once.
2826 ///
2827 /// Example transformation:
2828 /// Before: (stores do not have to be in order)
2829 /// %alloca = alloca <8 x float>
2830 /// store <2 x float> %val0, ptr %alloca ; offset 0-1
2831 /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2832 /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2833 /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2834 /// %r = load <8 x float>, ptr %alloca
2835 ///
2836 /// After: tree of shufflevectors producing <8 x float> directly.
2837 ///
2838 /// Pattern 2 (init + RMW, possibly multi-round):
2839 /// A single full-width init store, followed by partial loads and
2840 /// partial stores that read-modify-write the alloca one or more
2841 /// times, optionally followed by a full-width load. The only
2842 /// structural requirement is that the distinct [begin, end) ranges
2843 /// touched by the partial loads and stores, taken together, tile
2844 /// the alloca disjointly.
2845 ///
2846 /// We keep a map from each slice range to the SSA value that
2847 /// currently lives there, `SliceValues[r] -> Value*`:
2848 /// - initialize each entry to the corresponding piece of the
2849 /// init store's value (via a shufflevector picking the
2850 /// range's elements out of the init value),
2851 /// - walk partial loads and stores in block order,
2852 /// - for a partial load at range r: RAUW with `SliceValues[r]`,
2853 /// - for a partial store at range r: update `SliceValues[r]` to
2854 /// the stored value and drop the store.
2855 /// At the end, the final `SliceValues[r]` entries are tree-merged
2856 /// (in range order) into a single store to the alloca, and the
2857 /// optional full-width load is replaced by a load of the alloca.
2858 ///
2859 /// Because the ranges are disjoint by construction, a store at one
2860 /// range cannot affect another range's tracked value, so a single
2861 /// block-order walk correctly tracks the memory state at each
2862 /// range. The algorithm handles multi-round RMW, partial loads
2863 /// and stores interleaved in any order, read-only slices (the
2864 /// tracked value stays at the init extract), and write-only
2865 /// slices (the tracked value never flows into a load).
2866 ///
2867 /// \param P The partition to analyze and potentially rewrite
2868 /// \return An optional vector of values that were deleted during the
2869 /// rewrite, or std::nullopt if the partition cannot be optimized.
2870 std::optional<SmallVector<Value *, 4>>
2871 rewriteTreeStructuredMerge(Partition &P) {
2872 // No tail slices that overlap with the partition
2873 if (P.splitSliceTails().size() > 0)
2874 return std::nullopt;
2875
2876 // Structure to hold store information
2877 struct StoreInfo {
2878 StoreInst *Store;
2879 uint64_t BeginOffset;
2880 uint64_t EndOffset;
2881 Value *StoredValue;
2882 StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
2883 : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
2884 };
2885 struct LoadInfo {
2886 LoadInst *Load;
2887 uint64_t BeginOffset;
2888 uint64_t EndOffset;
2889 };
2890
2891 SmallVector<StoreInfo, 4> StoreInfos; // partial stores only
2892 SmallVector<LoadInfo, 4> LoadInfos; // partial loads only
2893 LoadInst *FullLoad = nullptr; // optional full-width load
2894 StoreInst *InitStore = nullptr; // optional full-width init store
2895
2896 // If the new alloca is a fixed vector type, we use its element type as the
2897 // allocated element type, otherwise we use i8 as the allocated element
2898 Type *AllocatedEltTy =
2899 isa<FixedVectorType>(NewAllocaTy)
2900 ? cast<FixedVectorType>(NewAllocaTy)->getElementType()
2901 : Type::getInt8Ty(NewAI.getContext());
2902 unsigned AllocatedEltTySize = DL.getTypeSizeInBits(AllocatedEltTy);
2903
2904 // Helper to check if a type is
2905 // 1. A fixed vector type
2906 // 2. The element type is not a pointer
2907 // 3. The element type size is byte-aligned
2908 // We only handle the cases that the ld/st meet these conditions
2909 auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
2910 auto *FixedVecTy = dyn_cast<FixedVectorType>(Ty);
2911 return FixedVecTy &&
2912 DL.getTypeSizeInBits(FixedVecTy->getElementType()) % 8 == 0 &&
2913 !FixedVecTy->getElementType()->isPointerTy();
2914 };
2915
2916 for (Slice &S : P) {
2917 auto *User = cast<Instruction>(S.getUse()->getUser());
2918 // A "full-width" slice spans the entire alloca; it's either the single
2919 // init store (Pattern 2) or the single final load (both patterns).
2920 bool IsFullWidth = (S.beginOffset() == NewAllocaBeginOffset &&
2921 S.endOffset() == NewAllocaEndOffset);
2922 if (auto *LI = dyn_cast<LoadInst>(User)) {
2923 // Only handle simple (non-volatile, non-atomic) loads.
2924 if (!LI->isSimple() ||
2925 !IsTypeValidForTreeStructuredMerge(LI->getType()))
2926 return std::nullopt;
2927 if (IsFullWidth) {
2928 // We accept at most one full-width load (the "final" load, after
2929 // all the partial stores).
2930 if (FullLoad)
2931 return std::nullopt;
2932 FullLoad = LI;
2933 } else {
2934 // Partial load (RMW pattern only).
2935 LoadInfos.push_back({LI, S.beginOffset(), S.endOffset()});
2936 }
2937 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
2938 // Do not handle the case if
2939 // 1. The store does not meet the conditions in the helper function
2940 // 2. The store is not simple — we drop stores as part of the
2941 // rewrite, so volatile stores (which must be kept) and atomic
2942 // stores (which carry memory-ordering semantics) are unsound
2943 // to replace with SSA bookkeeping.
2944 // 3. The total store size is not a multiple of the allocated
2945 // element type size (required so the tree merge can produce a
2946 // vector whose element type matches the alloca).
2947 if (!SI->isSimple() || !IsTypeValidForTreeStructuredMerge(
2948 SI->getValueOperand()->getType()))
2949 return std::nullopt;
2950 auto *StVecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
2951 unsigned NumElts = StVecTy->getNumElements();
2952 unsigned EltSize = DL.getTypeSizeInBits(StVecTy->getElementType());
2953 if (NumElts * EltSize % AllocatedEltTySize != 0)
2954 return std::nullopt;
2955 if (IsFullWidth) {
2956 // At most one full-width store is allowed — it's the init store
2957 // for the RMW pattern.
2958 if (InitStore)
2959 return std::nullopt;
2960 InitStore = SI;
2961 } else {
2962 StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
2963 SI->getValueOperand());
2964 }
2965 } else {
2966 // If we have instructions other than load and store, we cannot do
2967 // the tree structured merge.
2968 return std::nullopt;
2969 }
2970 }
2971
2972 // Need at least two partial stores to benefit from tree-merging; a
2973 // single store is already optimal as-is. This applies to both patterns
2974 // below, so check it before classifying.
2975 if (StoreInfos.size() < 2)
2976 return std::nullopt;
2977
2978 // Classify the pattern by looking at what we collected:
2979 // Pattern 1 (stores-only): only partial stores + exactly one full load.
2980 // Pattern 2 (RMW): one full init store + partial loads + partial stores
2981 // (+ optional full final load). RMW also needs VecTy to be set
2982 // because we use getIndex() to convert byte offsets to element
2983 // indices, which requires a promoted vector alloca.
2984 bool IsRMWPattern = InitStore && VecTy && !LoadInfos.empty();
2985 bool IsStoresOnlyPattern = !InitStore && FullLoad && LoadInfos.empty();
2986 if (!IsRMWPattern && !IsStoresOnlyPattern)
2987 return std::nullopt;
2988
2989 // All partial stores must live in the same basic block — the tree merge
2990 // is built in a single BB using block-order ordering (comesBefore).
2991 BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
2992 for (auto &Info : StoreInfos)
2993 if (Info.Store->getParent() != StoreBB)
2994 return std::nullopt;
2995
2996 SmallVector<Value *, 4> DeletedValues;
2997
2998 // Helper: pairwise tree-merge a list of vectors into a single vector.
2999 // At each iteration we merge each adjacent pair via mergeTwoVectors,
3000 // collect the merged values into Next, and (if Vals had odd length)
3001 // carry the trailing element through unchanged. Loop until one value
3002 // remains — the fully-merged vector.
3003 auto TreeMerge = [&](SmallVectorImpl<Value *> &Vals,
3004 IRBuilder<> &B) -> Value * {
3005 LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
3006 while (Vals.size() > 1) {
3007 SmallVector<Value *, 8> Next;
3008 for (unsigned I = 0, E = Vals.size(); I + 1 < E; I += 2) {
3009 Value *M =
3010 mergeTwoVectors(Vals[I], Vals[I + 1], DL, AllocatedEltTy, B);
3011 LLVM_DEBUG(dbgs() << " shufflevector: " << *M << "\n");
3012 Next.push_back(M);
3013 }
3014 if (Vals.size() % 2 == 1)
3015 Next.push_back(Vals.back());
3016 Vals = std::move(Next);
3017 }
3018 return Vals[0];
3019 };
3020
3021 // Replace a full-width load with a load of the freshly-merged alloca.
3022 // The merge stored a value of type Merged->getType() into NewAI; we load
3023 // that same type back so every access to NewAI stays consistently typed
3024 // (otherwise the alloca is no longer promotable).
3025 auto ReplaceFullLoad = [&](LoadInst *LoadToReplace, Value *Merged) {
3026 IRBuilder<> LoadBuilder(LoadToReplace);
3027 Value *NewLoad = LoadBuilder.CreateAlignedLoad(
3028 Merged->getType(), &NewAI, getSliceAlign(),
3029 LoadToReplace->isVolatile(),
3030 LoadToReplace->getName() + ".sroa.new.load");
3031 if (NewLoad->getType() != LoadToReplace->getType())
3032 NewLoad = LoadBuilder.CreateBitCast(NewLoad, LoadToReplace->getType());
3033 LoadToReplace->replaceAllUsesWith(NewLoad);
3034 DeletedValues.push_back(LoadToReplace);
3035 };
3036
3037 if (IsStoresOnlyPattern) {
3038 // Stores should not overlap and should cover the whole alloca.
3039 // Sort by begin offset to verify this with a single linear scan.
3040 llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
3041 return A.BeginOffset < B.BeginOffset;
3042 });
3043 // Check for gap or overlap: each begin offset must equal the previous
3044 // end offset, i.e. the store ranges must tile [NewAllocaBeginOffset,
3045 // NewAllocaEndOffset) exactly.
3046 uint64_t Expected = NewAllocaBeginOffset;
3047 for (auto &Info : StoreInfos) {
3048 if (Info.BeginOffset != Expected)
3049 return std::nullopt;
3050 Expected = Info.EndOffset;
3051 }
3052 // Stores cover the entire alloca (no trailing gap either).
3053 if (Expected != NewAllocaEndOffset)
3054 return std::nullopt;
3055
3056 // The load should not be in the middle of the stores.
3057 // Note:
3058 // If the load is in a different basic block from the stores, we can
3059 // still do the tree-structured merge. We don't have store->load
3060 // forwarding here — the merged vector is stored back to NewAI and
3061 // the new load loads from NewAI. The forwarding will be handled
3062 // later when NewAI is promoted.
3063 BasicBlock *LoadBB = FullLoad->getParent();
3064 if (LoadBB == StoreBB) {
3065 for (auto &Info : StoreInfos)
3066 if (!Info.Store->comesBefore(FullLoad))
3067 return std::nullopt;
3068 }
3069
3070 LLVM_DEBUG({
3071 dbgs() << "Tree structured merge rewrite (stores-only):\n";
3072 dbgs() << " Load: " << *FullLoad << "\n Ordered stores:\n";
3073 for (auto [I, Info] : enumerate(StoreInfos)) {
3074 dbgs() << " [" << I << "] Range[" << Info.BeginOffset << ", "
3075 << Info.EndOffset << ") \tStore: " << *Info.Store
3076 << "\tValue: " << *Info.StoredValue << "\n";
3077 }
3078 });
3079
3080 // StoreInfos is sorted by offset, not by block order. Anchoring to
3081 // StoreInfos.back().Store (last by offset) can place shuffles before
3082 // operands that appear later in the block (invalid SSA). Insert before
3083 // FullLoad when it shares the store block (after all stores, before
3084 // any later IR in that block). Otherwise insert before the store
3085 // block's terminator so the merge runs after every store and any
3086 // trailing instructions in that block.
3087 IRBuilder<> Builder(LoadBB == StoreBB ? cast<Instruction>(FullLoad)
3088 : StoreBB->getTerminator());
3089 SmallVector<Value *, 8> Vals;
3090 for (const auto &Info : StoreInfos) {
3091 DeletedValues.push_back(Info.Store);
3092 Vals.push_back(Info.StoredValue);
3093 }
3094 // Merge all stored values and store the merged value into the alloca.
3095 Value *Merged = TreeMerge(Vals, Builder);
3096 Builder.CreateAlignedStore(Merged, &NewAI, getSliceAlign());
3097
3098 // Replace the original load with a load of the newly-merged alloca.
3099 ReplaceFullLoad(FullLoad, Merged);
3100 return DeletedValues;
3101 }
3102
3103 // RMW pattern handling starts from here.
3104 // Like StoreBB above: keep the init store, all partial loads and all
3105 // partial stores in one basic block so we can reason about ordering
3106 // with comesBefore and build SSA without PHIs.
3107 if (InitStore->getParent() != StoreBB)
3108 return std::nullopt;
3109 if (any_of(LoadInfos, [&](const LoadInfo &I) {
3110 return I.Load->getParent() != StoreBB;
3111 }))
3112 return std::nullopt;
3113 // FullLoad (if any) is allowed to live in a different basic block. See
3114 // the note on the stores-only path: we don't do store->load forwarding
3115 // directly — the merged vector is stored to NewAI and the new load
3116 // loads from NewAI, so cross-BB ordering is resolved later when NewAI
3117 // is promoted.
3118
3119 // Collect the combined partial-load/partial-store accesses sorted
3120 // by block order. Used both for ordering checks and for the rewrite
3121 // walk below.
3122 struct Access {
3123 Instruction *Inst;
3124 uint64_t BeginOffset, EndOffset;
3125 bool IsStore;
3126 };
3128 Accesses.reserve(LoadInfos.size() + StoreInfos.size());
3129 for (const auto &L : LoadInfos)
3130 Accesses.push_back({L.Load, L.BeginOffset, L.EndOffset, false});
3131 for (const auto &S : StoreInfos)
3132 Accesses.push_back({S.Store, S.BeginOffset, S.EndOffset, true});
3133 llvm::sort(Accesses, [](const Access &A, const Access &B) {
3134 return A.Inst->comesBefore(B.Inst);
3135 });
3136
3137 // Ordering constraint 1: InitStore must come before every partial
3138 // access — they read/write the RMW state initialised by InitStore.
3139 // Accesses is sorted by block order, so the first element is the
3140 // earliest; checking it is enough.
3141 if (!InitStore->comesBefore(Accesses.front().Inst))
3142 return std::nullopt;
3143 // Ordering constraint 2: when FullLoad shares the block with the
3144 // partial accesses, it must come after every one of them — otherwise
3145 // it could read a stale value. Accesses is sorted, so the last
3146 // element is the latest; checking it is enough. If FullLoad is in
3147 // another block, mem2reg forwards the merged store to it.
3148 if (FullLoad && FullLoad->getParent() == StoreBB &&
3149 !Accesses.back().Inst->comesBefore(FullLoad))
3150 return std::nullopt;
3151
3152 // Coverage check: the distinct [begin, end) ranges touched by the
3153 // partial loads and stores must tile the alloca disjointly. That is
3154 // the only precondition the per-range SliceValues tracking below
3155 // needs — a disjoint tile guarantees the entries don't alias each
3156 // other. We don't check per-range load/store counts: a range with
3157 // only loads ends with SliceValues[r] = the init extract
3158 // (contributed to the final tree-merge), and a range with only
3159 // stores ends with SliceValues[r] = its last stored value. Both are
3160 // correct.
3161 using SliceRange = std::pair<uint64_t, uint64_t>;
3162 SmallVector<SliceRange, 8> SortedRanges;
3163 SortedRanges.reserve(Accesses.size());
3164 for (auto &Acc : Accesses)
3165 SortedRanges.emplace_back(Acc.BeginOffset, Acc.EndOffset);
3166 llvm::sort(SortedRanges);
3167 SortedRanges.erase(llvm::unique(SortedRanges), SortedRanges.end());
3168 // Disjoint + contiguous tile of the whole alloca.
3169 uint64_t Expected = NewAllocaBeginOffset;
3170 for (auto &Range : SortedRanges) {
3171 if (Range.first != Expected)
3172 return std::nullopt;
3173 Expected = Range.second;
3174 }
3175 if (Expected != NewAllocaEndOffset)
3176 return std::nullopt;
3177
3178 LLVM_DEBUG({
3179 dbgs() << "Tree structured merge rewrite (RMW):\n";
3180 dbgs() << " Init store: " << *InitStore << "\n";
3181 if (FullLoad)
3182 dbgs() << " Final load: " << *FullLoad << "\n";
3183 dbgs() << " Slice ranges (" << SortedRanges.size() << "):\n";
3184 for (auto &Range : SortedRanges)
3185 dbgs() << " [" << Range.first << ", " << Range.second << ")\n";
3186 });
3187
3188 // Initialize SliceValues: one SSA value per slice range, tracking
3189 // the value the alloca currently holds at that range. Each entry
3190 // starts at the corresponding piece of the init store, obtained by
3191 // bitcasting the init value to the alloca's vector type (if needed)
3192 // and extracting the slice's sub-range.
3193 IRB.SetInsertPoint(InitStore->getNextNode());
3194 Value *InitVec = InitStore->getValueOperand();
3195 if (InitVec->getType() != NewAllocaTy)
3196 InitVec = IRB.CreateBitCast(InitVec, NewAllocaTy, "init.cast");
3197 DenseMap<SliceRange, Value *> SliceValues;
3198 for (auto &Range : SortedRanges) {
3199 unsigned BeginIdx = getIndex(Range.first);
3200 unsigned EndIdx = getIndex(Range.second);
3201 SliceValues[Range] = IRB.CreateShuffleVector(
3202 InitVec, createSequentialMask(BeginIdx, EndIdx - BeginIdx, 0),
3203 "init.extract");
3204 }
3205 // The init store itself becomes dead — its value is consumed via the
3206 // extracts above.
3207 DeletedValues.push_back(InitStore);
3208
3209 // Walk accesses in block order:
3210 // - partial load at range r: replace with SliceValues[r] (bitcast
3211 // if the load's type differs from the current tracked value's
3212 // type, e.g. because a previous store wrote a vector with a
3213 // different element type);
3214 // - partial store at range r: update SliceValues[r] to the stored
3215 // value and drop the store.
3216 for (auto &Acc : Accesses) {
3217 SliceRange Range{Acc.BeginOffset, Acc.EndOffset};
3218 if (!Acc.IsStore) {
3219 Value *V = SliceValues[Range];
3220 if (V->getType() != Acc.Inst->getType()) {
3221 IRB.SetInsertPoint(cast<LoadInst>(Acc.Inst));
3222 V = IRB.CreateBitCast(V, Acc.Inst->getType());
3223 }
3224 Acc.Inst->replaceAllUsesWith(V);
3225 } else {
3226 SliceValues[Range] = cast<StoreInst>(Acc.Inst)->getValueOperand();
3227 }
3228 DeletedValues.push_back(Acc.Inst);
3229 }
3230
3231 // Tree-merge the final per-range values (in range order) into the
3232 // alloca's final vector value. Anchor the IRBuilder to FullLoad (when it
3233 // shares the partial-access block) or otherwise to the block's
3234 // terminator — never to a partial access, since those are queued for
3235 // deletion. Both anchors are guaranteed to dominate every SliceValues
3236 // entry: each one is either an init extract (before any access) or a
3237 // stored value defined before its (now-deleted) store.
3238 IRBuilder<> Builder(FullLoad && FullLoad->getParent() == StoreBB
3239 ? cast<Instruction>(FullLoad)
3240 : StoreBB->getTerminator());
3241 SmallVector<Value *, 8> Vals;
3242 for (auto &Range : SortedRanges)
3243 Vals.push_back(SliceValues[Range]);
3244 Value *Merged = TreeMerge(Vals, Builder);
3245 Builder.CreateAlignedStore(Merged, &NewAI, getSliceAlign());
3246
3247 // Replace the optional final full-width load with a load of the newly
3248 // merged alloca. Later promotion will forward the store above to it.
3249 if (FullLoad)
3250 ReplaceFullLoad(FullLoad, Merged);
3251
3252 return DeletedValues;
3253 }
3254
3255private:
3256 // Make sure the other visit overloads are visible.
3257 using Base::visit;
3258
3259 // Every instruction which can end up as a user must have a rewrite rule.
3260 bool visitInstruction(Instruction &I) {
3261 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
3262 llvm_unreachable("No rewrite rule for this instruction!");
3263 }
3264
3265 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3266 // Note that the offset computation can use BeginOffset or NewBeginOffset
3267 // interchangeably for unsplit slices.
3268 assert(IsSplit || BeginOffset == NewBeginOffset);
3269 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3270
3271 StringRef OldName = OldPtr->getName();
3272 // Skip through the last '.sroa.' component of the name.
3273 size_t LastSROAPrefix = OldName.rfind(".sroa.");
3274 if (LastSROAPrefix != StringRef::npos) {
3275 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
3276 // Look for an SROA slice index.
3277 size_t IndexEnd = OldName.find_first_not_of("0123456789");
3278 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3279 // Strip the index and look for the offset.
3280 OldName = OldName.substr(IndexEnd + 1);
3281 size_t OffsetEnd = OldName.find_first_not_of("0123456789");
3282 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3283 // Strip the offset.
3284 OldName = OldName.substr(OffsetEnd + 1);
3285 }
3286 }
3287 // Strip any SROA suffixes as well.
3288 OldName = OldName.substr(0, OldName.find(".sroa_"));
3289
3290 return getAdjustedPtr(IRB, DL, &NewAI,
3291 APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
3292 PointerTy, Twine(OldName) + ".");
3293 }
3294
3295 /// Compute suitable alignment to access this slice of the *new*
3296 /// alloca.
3297 ///
3298 /// You can optionally pass a type to this routine and if that type's ABI
3299 /// alignment is itself suitable, this will return zero.
3300 Align getSliceAlign() {
3301 return commonAlignment(NewAI.getAlign(),
3302 NewBeginOffset - NewAllocaBeginOffset);
3303 }
3304
3305 unsigned getIndex(uint64_t Offset) {
3306 assert(VecTy && "Can only call getIndex when rewriting a vector");
3307 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3308 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3309 uint32_t Index = RelOffset / ElementSize;
3310 assert(Index * ElementSize == RelOffset);
3311 return Index;
3312 }
3313
3314 void deleteIfTriviallyDead(Value *V) {
3317 Pass.DeadInsts.push_back(I);
3318 }
3319
3320 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3321 unsigned BeginIndex = getIndex(NewBeginOffset);
3322 unsigned EndIndex = getIndex(NewEndOffset);
3323 assert(EndIndex > BeginIndex && "Empty vector!");
3324
3325 LoadInst *Load =
3326 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3327
3328 Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3329 LLVMContext::MD_access_group});
3330 return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
3331 }
3332
3333 Value *rewriteIntegerLoad(LoadInst &LI) {
3334 assert(IntTy && "We cannot insert an integer to the alloca");
3335 assert(!LI.isVolatile());
3336 Value *V =
3337 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3338 V = IRB.CreateBitPreservingCastChain(DL, V, IntTy);
3339 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3340 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3341 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
3342 IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
3343 V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
3344 }
3345 // It is possible that the extracted type is not the load type. This
3346 // happens if there is a load past the end of the alloca, and as
3347 // a consequence the slice is narrower but still a candidate for integer
3348 // lowering. To handle this case, we just zero extend the extracted
3349 // integer.
3350 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
3351 "Can only handle an extract for an overly wide load");
3352 if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
3353 V = IRB.CreateZExt(V, LI.getType());
3354 return V;
3355 }
3356
3357 bool visitLoadInst(LoadInst &LI) {
3358 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3359 Value *OldOp = LI.getOperand(0);
3360 assert(OldOp == OldPtr);
3361
3362 AAMDNodes AATags = LI.getAAMetadata();
3363
3364 unsigned AS = LI.getPointerAddressSpace();
3365
3366 Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
3367 : LI.getType();
3368 bool IsPtrAdjusted = false;
3369 Value *V;
3370 if (VecTy) {
3371 V = rewriteVectorizedLoadInst(LI);
3372 } else if (IntTy && LI.getType()->isIntegerTy()) {
3373 V = rewriteIntegerLoad(LI);
3374 } else if (NewBeginOffset == NewAllocaBeginOffset &&
3375 NewEndOffset == NewAllocaEndOffset &&
3376 (canConvertValue(DL, NewAllocaTy, TargetTy) ||
3377 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
3378 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
3379 !LI.isVolatile()))) {
3380 Value *NewPtr =
3381 getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
3382 LoadInst *NewLI = IRB.CreateAlignedLoad(
3383 NewAllocaTy, NewPtr, NewAI.getAlign(), LI.isVolatile(), LI.getName());
3384 if (LI.isVolatile())
3385 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3386 if (NewLI->isAtomic())
3387 NewLI->setAlignment(LI.getAlign());
3388
3389 // Copy any metadata that is valid for the new load. This may require
3390 // conversion to a different kind of metadata, e.g. !nonnull might change
3391 // to !range or vice versa.
3392 copyMetadataForLoad(*NewLI, LI);
3393
3394 // Do this after copyMetadataForLoad() to preserve the TBAA shift.
3395 if (AATags)
3396 NewLI->setAAMetadata(AATags.adjustForAccess(
3397 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3398
3399 // Try to preserve nonnull metadata
3400 V = NewLI;
3401
3402 // If this is an integer load past the end of the slice (which means the
3403 // bytes outside the slice are undef or this load is dead) just forcibly
3404 // fix the integer size with correct handling of endianness.
3405 if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
3406 if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
3407 if (AITy->getBitWidth() < TITy->getBitWidth()) {
3408 V = IRB.CreateZExt(V, TITy, "load.ext");
3409 if (DL.isBigEndian())
3410 V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
3411 "endian_shift");
3412 }
3413 } else {
3414 Type *LTy = IRB.getPtrTy(AS);
3415 LoadInst *NewLI =
3416 IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
3417 getSliceAlign(), LI.isVolatile(), LI.getName());
3418
3419 if (AATags)
3420 NewLI->setAAMetadata(AATags.adjustForAccess(
3421 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3422
3423 if (LI.isVolatile())
3424 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3425 NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3426 LLVMContext::MD_access_group});
3427
3428 V = NewLI;
3429 IsPtrAdjusted = true;
3430 }
3431 V = IRB.CreateBitPreservingCastChain(DL, V, TargetTy);
3432
3433 if (IsSplit) {
3434 assert(!LI.isVolatile());
3435 assert(LI.getType()->isIntegerTy() &&
3436 "Only integer type loads and stores are split");
3437 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3438 "Split load isn't smaller than original load");
3439 assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3440 "Non-byte-multiple bit width");
3441 // Move the insertion point just past the load so that we can refer to it.
3442 BasicBlock::iterator LIIt = std::next(LI.getIterator());
3443 // Ensure the insertion point comes before any debug-info immediately
3444 // after the load, so that variable values referring to the load are
3445 // dominated by it.
3446 LIIt.setHeadBit(true);
3447 IRB.SetInsertPoint(LI.getParent(), LIIt);
3448 // Create a placeholder value with the same type as LI to use as the
3449 // basis for the new value. This allows us to replace the uses of LI with
3450 // the computed value, and then replace the placeholder with LI, leaving
3451 // LI only used for this computation.
3452 Value *Placeholder =
3453 new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
3454 false, Align(1));
3455 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
3456 "insert");
3457 LI.replaceAllUsesWith(V);
3458 Placeholder->replaceAllUsesWith(&LI);
3459 Placeholder->deleteValue();
3460 } else {
3461 LI.replaceAllUsesWith(V);
3462 }
3463
3464 Pass.DeadInsts.push_back(&LI);
3465 deleteIfTriviallyDead(OldOp);
3466 LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3467 return !LI.isVolatile() && !IsPtrAdjusted;
3468 }
3469
3470 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3471 AAMDNodes AATags) {
3472 // Capture V for the purpose of debug-info accounting once it's converted
3473 // to a vector store.
3474 Value *OrigV = V;
3475 if (V->getType() != VecTy) {
3476 unsigned BeginIndex = getIndex(NewBeginOffset);
3477 unsigned EndIndex = getIndex(NewEndOffset);
3478 assert(EndIndex > BeginIndex && "Empty vector!");
3479 unsigned NumElements = EndIndex - BeginIndex;
3480 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3481 "Too many elements!");
3482 Type *SliceTy = (NumElements == 1)
3483 ? ElementTy
3484 : FixedVectorType::get(ElementTy, NumElements);
3485 if (V->getType() != SliceTy)
3486 V = IRB.CreateBitPreservingCastChain(DL, V, SliceTy);
3487
3488 // Mix in the existing elements.
3489 Value *Old =
3490 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3491 V = insertVector(IRB, Old, V, BeginIndex, "vec");
3492 }
3493 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3494 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3495 LLVMContext::MD_access_group});
3496 if (AATags)
3497 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3498 V->getType(), DL));
3499 Pass.DeadInsts.push_back(&SI);
3500
3501 // NOTE: Careful to use OrigV rather than V.
3502 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3503 Store, Store->getPointerOperand(), OrigV, DL);
3504 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3505 return true;
3506 }
3507
3508 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3509 assert(IntTy && "We cannot extract an integer from the alloca");
3510 assert(!SI.isVolatile());
3511 if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
3512 IntTy->getBitWidth()) {
3513 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3514 "oldload");
3515 Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
3516 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3517 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3518 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
3519 }
3520 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3521 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3522 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3523 LLVMContext::MD_access_group});
3524 if (AATags)
3525 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3526 V->getType(), DL));
3527
3528 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3529 Store, Store->getPointerOperand(),
3530 Store->getValueOperand(), DL);
3531
3532 Pass.DeadInsts.push_back(&SI);
3533 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3534 return true;
3535 }
3536
3537 bool visitStoreInst(StoreInst &SI) {
3538 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3539 Value *OldOp = SI.getOperand(1);
3540 assert(OldOp == OldPtr);
3541
3542 AAMDNodes AATags = SI.getAAMetadata();
3543 Value *V = SI.getValueOperand();
3544
3545 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3546 // alloca that should be re-examined after promoting this alloca.
3547 if (V->getType()->isPointerTy())
3548 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
3549 Pass.PostPromotionWorklist.insert(AI);
3550
3551 TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
3552 if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3553 assert(!SI.isVolatile());
3554 assert(V->getType()->isIntegerTy() &&
3555 "Only integer type loads and stores are split");
3556 assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3557 "Non-byte-multiple bit width");
3558 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
3559 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
3560 "extract");
3561 }
3562
3563 if (VecTy)
3564 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3565 if (IntTy && V->getType()->isIntegerTy())
3566 return rewriteIntegerStore(V, SI, AATags);
3567
3568 StoreInst *NewSI;
3569 if (NewBeginOffset == NewAllocaBeginOffset &&
3570 NewEndOffset == NewAllocaEndOffset &&
3571 canConvertValue(DL, V->getType(), NewAllocaTy)) {
3572 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3573 Value *NewPtr =
3574 getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
3575
3576 NewSI =
3577 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
3578 } else {
3579 unsigned AS = SI.getPointerAddressSpace();
3580 Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
3581 NewSI =
3582 IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
3583 }
3584 NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3585 LLVMContext::MD_access_group});
3586 if (AATags)
3587 NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3588 V->getType(), DL));
3589 if (SI.isVolatile())
3590 NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
3591 if (NewSI->isAtomic())
3592 NewSI->setAlignment(SI.getAlign());
3593
3594 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3595 NewSI, NewSI->getPointerOperand(),
3596 NewSI->getValueOperand(), DL);
3597
3598 Pass.DeadInsts.push_back(&SI);
3599 deleteIfTriviallyDead(OldOp);
3600
3601 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3602 return NewSI->getPointerOperand() == &NewAI &&
3603 NewSI->getValueOperand()->getType() == NewAllocaTy &&
3604 !SI.isVolatile();
3605 }
3606
3607 /// Compute an integer value from splatting an i8 across the given
3608 /// number of bytes.
3609 ///
3610 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3611 /// call this routine.
3612 /// FIXME: Heed the advice above.
3613 ///
3614 /// \param V The i8 value to splat.
3615 /// \param Size The number of bytes in the output (assuming i8 is one byte)
3616 Value *getIntegerSplat(Value *V, unsigned Size) {
3617 assert(Size > 0 && "Expected a positive number of bytes.");
3618 IntegerType *VTy = cast<IntegerType>(V->getType());
3619 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3620 if (Size == 1)
3621 return V;
3622
3623 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
3624 V = IRB.CreateMul(
3625 IRB.CreateZExt(V, SplatIntTy, "zext"),
3626 IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
3627 IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
3628 SplatIntTy)),
3629 "isplat");
3630 return V;
3631 }
3632
3633 /// Compute a vector splat for a given element value.
3634 Value *getVectorSplat(Value *V, unsigned NumElements) {
3635 V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
3636 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3637 return V;
3638 }
3639
3640 bool visitMemSetInst(MemSetInst &II) {
3641 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3642 assert(II.getRawDest() == OldPtr);
3643
3644 AAMDNodes AATags = II.getAAMetadata();
3645
3646 // If the memset has a variable size, it cannot be split, just adjust the
3647 // pointer to the new alloca.
3648 if (!isa<ConstantInt>(II.getLength())) {
3649 assert(!IsSplit);
3650 assert(NewBeginOffset == BeginOffset);
3651 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
3652 II.setDestAlignment(getSliceAlign());
3653 // In theory we should call migrateDebugInfo here. However, we do not
3654 // emit dbg.assign intrinsics for mem intrinsics storing through non-
3655 // constant geps, or storing a variable number of bytes.
3657 "AT: Unexpected link to non-const GEP");
3658 deleteIfTriviallyDead(OldPtr);
3659 return false;
3660 }
3661
3662 // Record this instruction for deletion.
3663 Pass.DeadInsts.push_back(&II);
3664
3665 Type *ScalarTy = NewAllocaTy->getScalarType();
3666
3667 const bool CanContinue = [&]() {
3668 if (VecTy || IntTy)
3669 return true;
3670 if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3671 return false;
3672 // Length must be in range for FixedVectorType.
3673 auto *C = cast<ConstantInt>(II.getLength());
3674 const uint64_t Len = C->getLimitedValue();
3675 if (Len > std::numeric_limits<unsigned>::max())
3676 return false;
3677 auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
3678 auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
3679 return canConvertValue(DL, SrcTy, NewAllocaTy) &&
3680 DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
3681 }();
3682
3683 // If this doesn't map cleanly onto the alloca type, and that type isn't
3684 // a single value type, just emit a memset.
3685 if (!CanContinue) {
3686 Type *SizeTy = II.getLength()->getType();
3687 unsigned Sz = NewEndOffset - NewBeginOffset;
3688 Constant *Size = ConstantInt::get(SizeTy, Sz);
3689 MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
3690 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
3691 MaybeAlign(getSliceAlign()), II.isVolatile()));
3692 if (AATags)
3693 New->setAAMetadata(
3694 AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz));
3695
3696 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3697 New, New->getRawDest(), nullptr, DL);
3698
3699 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3700 return false;
3701 }
3702
3703 // If we can represent this as a simple value, we have to build the actual
3704 // value to store, which requires expanding the byte present in memset to
3705 // a sensible representation for the alloca type. This is essentially
3706 // splatting the byte to a sufficiently wide integer, splatting it across
3707 // any desired vector width, and bitcasting to the final type.
3708 Value *V;
3709
3710 if (VecTy) {
3711 // If this is a memset of a vectorized alloca, insert it.
3712 assert(ElementTy == ScalarTy);
3713
3714 unsigned BeginIndex = getIndex(NewBeginOffset);
3715 unsigned EndIndex = getIndex(NewEndOffset);
3716 assert(EndIndex > BeginIndex && "Empty vector!");
3717 unsigned NumElements = EndIndex - BeginIndex;
3718 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3719 "Too many elements!");
3720
3721 Value *Splat = getIntegerSplat(
3722 II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
3723 Splat = IRB.CreateBitPreservingCastChain(DL, Splat, ElementTy);
3724 if (NumElements > 1)
3725 Splat = getVectorSplat(Splat, NumElements);
3726
3727 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3728 "oldload");
3729 V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
3730 } else if (IntTy) {
3731 // If this is a memset on an alloca where we can widen stores, insert the
3732 // set integer.
3733 assert(!II.isVolatile());
3734
3735 uint64_t Size = NewEndOffset - NewBeginOffset;
3736 V = getIntegerSplat(II.getValue(), Size);
3737
3738 if (IntTy && (NewBeginOffset != NewAllocaBeginOffset ||
3739 NewEndOffset != NewAllocaEndOffset)) {
3740 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI,
3741 NewAI.getAlign(), "oldload");
3742 Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
3743 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3744 V = insertInteger(DL, IRB, Old, V, Offset, "insert");
3745 } else {
3746 assert(V->getType() == IntTy &&
3747 "Wrong type for an alloca wide integer!");
3748 }
3749 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3750 } else {
3751 // Established these invariants above.
3752 assert(NewBeginOffset == NewAllocaBeginOffset);
3753 assert(NewEndOffset == NewAllocaEndOffset);
3754
3755 V = getIntegerSplat(II.getValue(),
3756 DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
3757 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(NewAllocaTy))
3758 V = getVectorSplat(
3759 V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
3760
3761 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3762 }
3763
3764 Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3765 StoreInst *New =
3766 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
3767 New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3768 LLVMContext::MD_access_group});
3769 if (AATags)
3770 New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3771 V->getType(), DL));
3772
3773 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3774 New, New->getPointerOperand(), V, DL);
3775
3776 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3777 return !II.isVolatile();
3778 }
3779
3780 bool visitMemTransferInst(MemTransferInst &II) {
3781 // Rewriting of memory transfer instructions can be a bit tricky. We break
3782 // them into two categories: split intrinsics and unsplit intrinsics.
3783
3784 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3785
3786 AAMDNodes AATags = II.getAAMetadata();
3787
3788 bool IsDest = &II.getRawDestUse() == OldUse;
3789 assert((IsDest && II.getRawDest() == OldPtr) ||
3790 (!IsDest && II.getRawSource() == OldPtr));
3791
3792 Align SliceAlign = getSliceAlign();
3793 // For unsplit intrinsics, we simply modify the source and destination
3794 // pointers in place. This isn't just an optimization, it is a matter of
3795 // correctness. With unsplit intrinsics we may be dealing with transfers
3796 // within a single alloca before SROA ran, or with transfers that have
3797 // a variable length. We may also be dealing with memmove instead of
3798 // memcpy, and so simply updating the pointers is the necessary for us to
3799 // update both source and dest of a single call.
3800 if (!IsSplittable) {
3801 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3802 if (IsDest) {
3803 // Update the address component of linked dbg.assigns.
3804 for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) {
3805 if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3806 DbgAssign->getAddress() == II.getDest())
3807 DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3808 }
3809 II.setDest(AdjustedPtr);
3810 II.setDestAlignment(SliceAlign);
3811 } else {
3812 II.setSource(AdjustedPtr);
3813 II.setSourceAlignment(SliceAlign);
3814 }
3815
3816 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3817 deleteIfTriviallyDead(OldPtr);
3818 return false;
3819 }
3820 // For split transfer intrinsics we have an incredibly useful assurance:
3821 // the source and destination do not reside within the same alloca, and at
3822 // least one of them does not escape. This means that we can replace
3823 // memmove with memcpy, and we don't need to worry about all manner of
3824 // downsides to splitting and transforming the operations.
3825
3826 // If this doesn't map cleanly onto the alloca type, and that type isn't
3827 // a single value type, just emit a memcpy.
3828 bool EmitMemCpy =
3829 !VecTy && !IntTy &&
3830 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3831 SliceSize != DL.getTypeStoreSize(NewAllocaTy).getFixedValue() ||
3832 !DL.typeSizeEqualsStoreSize(NewAllocaTy) ||
3833 !NewAllocaTy->isSingleValueType());
3834
3835 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3836 // size hasn't been shrunk based on analysis of the viable range, this is
3837 // a no-op.
3838 if (EmitMemCpy && &OldAI == &NewAI) {
3839 // Ensure the start lines up.
3840 assert(NewBeginOffset == BeginOffset);
3841
3842 // Rewrite the size as needed.
3843 if (NewEndOffset != EndOffset)
3844 II.setLength(NewEndOffset - NewBeginOffset);
3845 return false;
3846 }
3847 // Record this instruction for deletion.
3848 Pass.DeadInsts.push_back(&II);
3849
3850 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3851 // alloca that should be re-examined after rewriting this instruction.
3852 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3853 if (AllocaInst *AI =
3855 assert(AI != &OldAI && AI != &NewAI &&
3856 "Splittable transfers cannot reach the same alloca on both ends.");
3857 Pass.Worklist.insert(AI);
3858 }
3859
3860 Type *OtherPtrTy = OtherPtr->getType();
3861 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3862
3863 // Compute the relative offset for the other pointer within the transfer.
3864 unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
3865 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3866 Align OtherAlign =
3867 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3868 OtherAlign =
3869 commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
3870
3871 if (EmitMemCpy) {
3872 // Compute the other pointer, folding as much as possible to produce
3873 // a single, simple GEP in most cases.
3874 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3875 OtherPtr->getName() + ".");
3876
3877 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3878 Type *SizeTy = II.getLength()->getType();
3879 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3880
3881 Value *DestPtr, *SrcPtr;
3882 MaybeAlign DestAlign, SrcAlign;
3883 // Note: IsDest is true iff we're copying into the new alloca slice
3884 if (IsDest) {
3885 DestPtr = OurPtr;
3886 DestAlign = SliceAlign;
3887 SrcPtr = OtherPtr;
3888 SrcAlign = OtherAlign;
3889 } else {
3890 DestPtr = OtherPtr;
3891 DestAlign = OtherAlign;
3892 SrcPtr = OurPtr;
3893 SrcAlign = SliceAlign;
3894 }
3895 CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3896 Size, II.isVolatile());
3897 if (AATags)
3898 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3899
3900 APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
3901 if (IsDest) {
3902 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
3903 &II, New, DestPtr, nullptr, DL);
3904 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3906 DL, Offset, /*AllowNonInbounds*/ true))) {
3907 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
3908 SliceSize * 8, &II, New, DestPtr, nullptr, DL);
3909 }
3910 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3911 return false;
3912 }
3913
3914 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3915 NewEndOffset == NewAllocaEndOffset;
3916 uint64_t Size = NewEndOffset - NewBeginOffset;
3917 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3918 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3919 unsigned NumElements = EndIndex - BeginIndex;
3920 IntegerType *SubIntTy =
3921 IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3922
3923 // Reset the other pointer type to match the register type we're going to
3924 // use, but using the address space of the original other pointer.
3925 Type *OtherTy;
3926 if (VecTy && !IsWholeAlloca) {
3927 if (NumElements == 1)
3928 OtherTy = VecTy->getElementType();
3929 else
3930 OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3931 } else if (IntTy && !IsWholeAlloca) {
3932 OtherTy = SubIntTy;
3933 } else {
3934 OtherTy = NewAllocaTy;
3935 }
3936
3937 Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3938 OtherPtr->getName() + ".");
3939 MaybeAlign SrcAlign = OtherAlign;
3940 MaybeAlign DstAlign = SliceAlign;
3941 if (!IsDest)
3942 std::swap(SrcAlign, DstAlign);
3943
3944 Value *SrcPtr;
3945 Value *DstPtr;
3946
3947 if (IsDest) {
3948 DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3949 SrcPtr = AdjPtr;
3950 } else {
3951 DstPtr = AdjPtr;
3952 SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
3953 }
3954
3955 Value *Src;
3956 if (VecTy && !IsWholeAlloca && !IsDest) {
3957 Src =
3958 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3959 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3960 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3961 Src =
3962 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3963 Src = IRB.CreateBitPreservingCastChain(DL, Src, IntTy);
3964 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3965 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3966 } else {
3967 LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3968 II.isVolatile(), "copyload");
3969 Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3970 LLVMContext::MD_access_group});
3971 if (AATags)
3972 Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3973 Load->getType(), DL));
3974 Src = Load;
3975 }
3976
3977 if (VecTy && !IsWholeAlloca && IsDest) {
3978 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3979 "oldload");
3980 Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3981 } else if (IntTy && !IsWholeAlloca && IsDest) {
3982 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3983 "oldload");
3984 Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
3985 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3986 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3987 Src = IRB.CreateBitPreservingCastChain(DL, Src, NewAllocaTy);
3988 }
3989
3990 StoreInst *Store = cast<StoreInst>(
3991 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3992 Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3993 LLVMContext::MD_access_group});
3994 if (AATags)
3995 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3996 Src->getType(), DL));
3997
3998 APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
3999 if (IsDest) {
4000
4001 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
4002 Store, DstPtr, Src, DL);
4003 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
4005 DL, Offset, /*AllowNonInbounds*/ true))) {
4006 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
4007 &II, Store, DstPtr, Src, DL);
4008 }
4009
4010 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4011 return !II.isVolatile();
4012 }
4013
4014 bool visitIntrinsicInst(IntrinsicInst &II) {
4015 assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
4016 "Unexpected intrinsic!");
4017 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
4018
4019 // Record this instruction for deletion.
4020 Pass.DeadInsts.push_back(&II);
4021
4022 if (II.isDroppable()) {
4023 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
4024 // TODO For now we forget assumed information, this can be improved.
4025 OldPtr->dropDroppableUsesIn(II);
4026 return true;
4027 }
4028
4029 assert(II.getArgOperand(0) == OldPtr);
4030 Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
4031 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
4032 Value *New;
4033 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
4034 New = IRB.CreateLifetimeStart(Ptr);
4035 else
4036 New = IRB.CreateLifetimeEnd(Ptr);
4037
4038 (void)New;
4039 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
4040
4041 return true;
4042 }
4043
4044 void fixLoadStoreAlign(Instruction &Root) {
4045 // This algorithm implements the same visitor loop as
4046 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
4047 // or store found.
4048 SmallPtrSet<Instruction *, 4> Visited;
4049 SmallVector<Instruction *, 4> Uses;
4050 Visited.insert(&Root);
4051 Uses.push_back(&Root);
4052 do {
4053 Instruction *I = Uses.pop_back_val();
4054
4055 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
4056 LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
4057 continue;
4058 }
4059 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
4060 SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
4061 continue;
4062 }
4063
4067 for (User *U : I->users())
4068 if (Visited.insert(cast<Instruction>(U)).second)
4069 Uses.push_back(cast<Instruction>(U));
4070 } while (!Uses.empty());
4071 }
4072
4073 bool visitPHINode(PHINode &PN) {
4074 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
4075 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
4076 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
4077
4078 // We would like to compute a new pointer in only one place, but have it be
4079 // as local as possible to the PHI. To do that, we re-use the location of
4080 // the old pointer, which necessarily must be in the right position to
4081 // dominate the PHI.
4082 IRBuilderBase::InsertPointGuard Guard(IRB);
4083 if (isa<PHINode>(OldPtr))
4084 IRB.SetInsertPoint(OldPtr->getParent(),
4085 OldPtr->getParent()->getFirstInsertionPt());
4086 else
4087 IRB.SetInsertPoint(OldPtr);
4088 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
4089
4090 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
4091 // Replace the operands which were using the old pointer.
4092 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
4093
4094 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
4095 deleteIfTriviallyDead(OldPtr);
4096
4097 // Fix the alignment of any loads or stores using this PHI node.
4098 fixLoadStoreAlign(PN);
4099
4100 // PHIs can't be promoted on their own, but often can be speculated. We
4101 // check the speculation outside of the rewriter so that we see the
4102 // fully-rewritten alloca.
4103 PHIUsers.insert(&PN);
4104 return true;
4105 }
4106
4107 bool visitSelectInst(SelectInst &SI) {
4108 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4109 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
4110 "Pointer isn't an operand!");
4111 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
4112 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
4113
4114 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
4115 // Replace the operands which were using the old pointer.
4116 if (SI.getOperand(1) == OldPtr)
4117 SI.setOperand(1, NewPtr);
4118 if (SI.getOperand(2) == OldPtr)
4119 SI.setOperand(2, NewPtr);
4120
4121 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
4122 deleteIfTriviallyDead(OldPtr);
4123
4124 // Fix the alignment of any loads or stores using this select.
4125 fixLoadStoreAlign(SI);
4126
4127 // Selects can't be promoted on their own, but often can be speculated. We
4128 // check the speculation outside of the rewriter so that we see the
4129 // fully-rewritten alloca.
4130 SelectUsers.insert(&SI);
4131 return true;
4132 }
4133};
4134
4135/// Visitor to rewrite aggregate loads and stores as scalar.
4136///
4137/// This pass aggressively rewrites all aggregate loads and stores on
4138/// a particular pointer (or any pointer derived from it which we can identify)
4139/// with scalar loads and stores.
4140class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
4141 // Befriend the base class so it can delegate to private visit methods.
4142 friend class InstVisitor<AggLoadStoreRewriter, bool>;
4143
4144 /// Queue of pointer uses to analyze and potentially rewrite.
4146
4147 /// Set to prevent us from cycling with phi nodes and loops.
4148 SmallPtrSet<User *, 8> Visited;
4149
4150 /// The current pointer use being rewritten. This is used to dig up the used
4151 /// value (as opposed to the user).
4152 Use *U = nullptr;
4153
4154 /// Used to calculate offsets, and hence alignment, of subobjects.
4155 const DataLayout &DL;
4156
4157 IRBuilderTy &IRB;
4158
4159public:
4160 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
4161 : DL(DL), IRB(IRB) {}
4162
4163 /// Rewrite loads and stores through a pointer and all pointers derived from
4164 /// it.
4165 bool rewrite(Instruction &I) {
4166 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
4167 enqueueUsers(I);
4168 bool Changed = false;
4169 while (!Queue.empty()) {
4170 U = Queue.pop_back_val();
4171 Changed |= visit(cast<Instruction>(U->getUser()));
4172 }
4173 return Changed;
4174 }
4175
4176private:
4177 /// Enqueue all the users of the given instruction for further processing.
4178 /// This uses a set to de-duplicate users.
4179 void enqueueUsers(Instruction &I) {
4180 for (Use &U : I.uses())
4181 if (Visited.insert(U.getUser()).second)
4182 Queue.push_back(&U);
4183 }
4184
4185 // Conservative default is to not rewrite anything.
4186 bool visitInstruction(Instruction &I) { return false; }
4187
4188 /// Generic recursive split emission class.
4189 template <typename Derived> class OpSplitter {
4190 protected:
4191 /// The builder used to form new instructions.
4192 IRBuilderTy &IRB;
4193
4194 /// The indices which to be used with insert- or extractvalue to select the
4195 /// appropriate value within the aggregate.
4196 SmallVector<unsigned, 4> Indices;
4197
4198 /// The indices to a GEP instruction which will move Ptr to the correct slot
4199 /// within the aggregate.
4200 SmallVector<Value *, 4> GEPIndices;
4201
4202 /// The base pointer of the original op, used as a base for GEPing the
4203 /// split operations.
4204 Value *Ptr;
4205
4206 /// The base pointee type being GEPed into.
4207 Type *BaseTy;
4208
4209 /// Known alignment of the base pointer.
4210 Align BaseAlign;
4211
4212 /// To calculate offset of each component so we can correctly deduce
4213 /// alignments.
4214 const DataLayout &DL;
4215
4216 /// Initialize the splitter with an insertion point, Ptr and start with a
4217 /// single zero GEP index.
4218 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4219 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
4220 : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
4221 BaseAlign(BaseAlign), DL(DL) {
4222 IRB.SetInsertPoint(InsertionPoint);
4223 }
4224
4225 public:
4226 /// Generic recursive split emission routine.
4227 ///
4228 /// This method recursively splits an aggregate op (load or store) into
4229 /// scalar or vector ops. It splits recursively until it hits a single value
4230 /// and emits that single value operation via the template argument.
4231 ///
4232 /// The logic of this routine relies on GEPs and insertvalue and
4233 /// extractvalue all operating with the same fundamental index list, merely
4234 /// formatted differently (GEPs need actual values).
4235 ///
4236 /// \param Ty The type being split recursively into smaller ops.
4237 /// \param Agg The aggregate value being built up or stored, depending on
4238 /// whether this is splitting a load or a store respectively.
4239 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
4240 if (Ty->isSingleValueType()) {
4241 unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
4242 return static_cast<Derived *>(this)->emitFunc(
4243 Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
4244 }
4245
4246 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
4247 unsigned OldSize = Indices.size();
4248 (void)OldSize;
4249 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
4250 ++Idx) {
4251 assert(Indices.size() == OldSize && "Did not return to the old size");
4252 Indices.push_back(Idx);
4253 GEPIndices.push_back(IRB.getInt32(Idx));
4254 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
4255 GEPIndices.pop_back();
4256 Indices.pop_back();
4257 }
4258 return;
4259 }
4260
4261 if (StructType *STy = dyn_cast<StructType>(Ty)) {
4262 unsigned OldSize = Indices.size();
4263 (void)OldSize;
4264 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
4265 ++Idx) {
4266 assert(Indices.size() == OldSize && "Did not return to the old size");
4267 Indices.push_back(Idx);
4268 GEPIndices.push_back(IRB.getInt32(Idx));
4269 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
4270 GEPIndices.pop_back();
4271 Indices.pop_back();
4272 }
4273 return;
4274 }
4275
4276 llvm_unreachable("Only arrays and structs are aggregate loadable types");
4277 }
4278 };
4279
4280 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
4281 AAMDNodes AATags;
4282 // A vector to hold the split components that we want to emit
4283 // separate fake uses for.
4284 SmallVector<Value *, 4> Components;
4285 // A vector to hold all the fake uses of the struct that we are splitting.
4286 // Usually there should only be one, but we are handling the general case.
4288
4289 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4290 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
4291 IRBuilderTy &IRB)
4292 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
4293 IRB),
4294 AATags(AATags) {}
4295
4296 /// Emit a leaf load of a single value. This is called at the leaves of the
4297 /// recursive emission to actually load values.
4298 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4300 // Load the single value and insert it using the indices.
4301 Value *GEP =
4302 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4303 LoadInst *Load =
4304 IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
4305
4306 APInt Offset(
4307 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4308 if (AATags &&
4309 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
4310 Load->setAAMetadata(
4311 AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL));
4312 // Record the load so we can generate a fake use for this aggregate
4313 // component.
4314 Components.push_back(Load);
4315
4316 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
4317 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
4318 }
4319
4320 // Stash the fake uses that use the value generated by this instruction.
4321 void recordFakeUses(LoadInst &LI) {
4322 for (Use &U : LI.uses())
4323 if (auto *II = dyn_cast<IntrinsicInst>(U.getUser()))
4324 if (II->getIntrinsicID() == Intrinsic::fake_use)
4325 FakeUses.push_back(II);
4326 }
4327
4328 // Replace all fake uses of the aggregate with a series of fake uses, one
4329 // for each split component.
4330 void emitFakeUses() {
4331 for (Instruction *I : FakeUses) {
4332 IRB.SetInsertPoint(I);
4333 for (auto *V : Components)
4334 IRB.CreateIntrinsic(Intrinsic::fake_use, {V});
4335 I->eraseFromParent();
4336 }
4337 }
4338 };
4339
4340 bool visitLoadInst(LoadInst &LI) {
4341 assert(LI.getPointerOperand() == *U);
4342 if (!LI.isSimple() || LI.getType()->isSingleValueType())
4343 return false;
4344
4345 // We have an aggregate being loaded, split it apart.
4346 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4347 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4348 getAdjustedAlignment(&LI, 0), DL, IRB);
4349 Splitter.recordFakeUses(LI);
4351 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
4352 Splitter.emitFakeUses();
4353 Visited.erase(&LI);
4354 LI.replaceAllUsesWith(V);
4355 LI.eraseFromParent();
4356 return true;
4357 }
4358
4359 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4360 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4361 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4362 const DataLayout &DL, IRBuilderTy &IRB)
4363 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4364 DL, IRB),
4365 AATags(AATags), AggStore(AggStore) {}
4366 AAMDNodes AATags;
4367 StoreInst *AggStore;
4368 /// Emit a leaf store of a single value. This is called at the leaves of the
4369 /// recursive emission to actually produce stores.
4370 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4372 // Extract the single value and store it using the indices.
4373 //
4374 // The gep and extractvalue values are factored out of the CreateStore
4375 // call to make the output independent of the argument evaluation order.
4376 Value *ExtractValue =
4377 IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
4378 Value *InBoundsGEP =
4379 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4380 StoreInst *Store =
4381 IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
4382
4383 APInt Offset(
4384 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4385 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
4386 if (AATags) {
4387 Store->setAAMetadata(AATags.adjustForAccess(
4388 Offset.getZExtValue(), ExtractValue->getType(), DL));
4389 }
4390
4391 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4392 // If we cannot (because there's an intervening non-const or unbounded
4393 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4394 // this instruction.
4396 if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
4397 uint64_t SizeInBits =
4398 DL.getTypeSizeInBits(Store->getValueOperand()->getType());
4399 migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
4400 SizeInBits, AggStore, Store,
4401 Store->getPointerOperand(), Store->getValueOperand(),
4402 DL);
4403 } else {
4405 "AT: unexpected debug.assign linked to store through "
4406 "unbounded GEP");
4407 }
4408 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4409 }
4410 };
4411
4412 bool visitStoreInst(StoreInst &SI) {
4413 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4414 return false;
4415 Value *V = SI.getValueOperand();
4416 if (V->getType()->isSingleValueType())
4417 return false;
4418
4419 // We have an aggregate being stored, split it apart.
4420 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4421 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4422 getAdjustedAlignment(&SI, 0), DL, IRB);
4423 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
4424 Visited.erase(&SI);
4425 // The stores replacing SI each have markers describing fragments of the
4426 // assignment so delete the assignment markers linked to SI.
4428 SI.eraseFromParent();
4429 return true;
4430 }
4431
4432 bool visitBitCastInst(BitCastInst &BC) {
4433 enqueueUsers(BC);
4434 return false;
4435 }
4436
4437 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4438 enqueueUsers(ASC);
4439 return false;
4440 }
4441
4442 // Unfold gep (select cond, ptr1, ptr2), idx
4443 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4444 // and gep ptr, (select cond, idx1, idx2)
4445 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
4446 // We also allow for i1 zext indices, which are equivalent to selects.
4447 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4448 // Check whether the GEP has exactly one select operand and all indices
4449 // will become constant after the transform.
4451 for (Value *Op : GEPI.indices()) {
4452 if (auto *SI = dyn_cast<SelectInst>(Op)) {
4453 if (Sel)
4454 return false;
4455
4456 Sel = SI;
4457 if (!isa<ConstantInt>(SI->getTrueValue()) ||
4458 !isa<ConstantInt>(SI->getFalseValue()))
4459 return false;
4460 continue;
4461 }
4462 if (auto *ZI = dyn_cast<ZExtInst>(Op)) {
4463 if (Sel)
4464 return false;
4465 Sel = ZI;
4466 if (!ZI->getSrcTy()->isIntegerTy(1))
4467 return false;
4468 continue;
4469 }
4470
4471 if (!isa<ConstantInt>(Op))
4472 return false;
4473 }
4474
4475 if (!Sel)
4476 return false;
4477
4478 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4479 dbgs() << " original: " << *Sel << "\n";
4480 dbgs() << " " << GEPI << "\n";);
4481
4482 auto GetNewOps = [&](Value *SelOp) {
4483 SmallVector<Value *> NewOps;
4484 for (Value *Op : GEPI.operands())
4485 if (Op == Sel)
4486 NewOps.push_back(SelOp);
4487 else
4488 NewOps.push_back(Op);
4489 return NewOps;
4490 };
4491
4492 Value *Cond, *True, *False;
4493 Instruction *MDFrom = nullptr;
4494 if (auto *SI = dyn_cast<SelectInst>(Sel)) {
4495 Cond = SI->getCondition();
4496 True = SI->getTrueValue();
4497 False = SI->getFalseValue();
4499 MDFrom = SI;
4500 } else {
4501 Cond = Sel->getOperand(0);
4502 True = ConstantInt::get(Sel->getType(), 1);
4503 False = ConstantInt::get(Sel->getType(), 0);
4504 }
4505 SmallVector<Value *> TrueOps = GetNewOps(True);
4506 SmallVector<Value *> FalseOps = GetNewOps(False);
4507
4508 IRB.SetInsertPoint(&GEPI);
4509 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4510
4511 Type *Ty = GEPI.getSourceElementType();
4512 Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
4513 True->getName() + ".sroa.gep", NW);
4514
4515 Value *NFalse =
4516 IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
4517 False->getName() + ".sroa.gep", NW);
4518
4519 Value *NSel = MDFrom
4520 ? IRB.CreateSelect(Cond, NTrue, NFalse,
4521 Sel->getName() + ".sroa.sel", MDFrom)
4522 : IRB.CreateSelectWithUnknownProfile(
4523 Cond, NTrue, NFalse, DEBUG_TYPE,
4524 Sel->getName() + ".sroa.sel");
4525 Visited.erase(&GEPI);
4526 GEPI.replaceAllUsesWith(NSel);
4527 GEPI.eraseFromParent();
4528 Instruction *NSelI = cast<Instruction>(NSel);
4529 Visited.insert(NSelI);
4530 enqueueUsers(*NSelI);
4531
4532 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4533 dbgs() << " " << *NFalse << "\n";
4534 dbgs() << " " << *NSel << "\n";);
4535
4536 return true;
4537 }
4538
4539 // Unfold gep (phi ptr1, ptr2), idx
4540 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4541 // and gep ptr, (phi idx1, idx2)
4542 // => phi ((gep ptr, idx1), (gep ptr, idx2))
4543 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4544 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4545 // operand (looking through the phi if it is the phi we want to unfold) is
4546 // an instruction besides a static alloca.
4547 PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
4548 auto IsInvalidPointerOperand = [](Value *V) {
4549 if (!isa<Instruction>(V))
4550 return false;
4551 if (auto *AI = dyn_cast<AllocaInst>(V))
4552 return !AI->isStaticAlloca();
4553 return true;
4554 };
4555 if (Phi) {
4556 if (any_of(Phi->operands(), IsInvalidPointerOperand))
4557 return false;
4558 } else {
4559 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4560 return false;
4561 }
4562 // Check whether the GEP has exactly one phi operand (including the pointer
4563 // operand) and all indices will become constant after the transform.
4564 for (Value *Op : GEPI.indices()) {
4565 if (auto *SI = dyn_cast<PHINode>(Op)) {
4566 if (Phi)
4567 return false;
4568
4569 Phi = SI;
4570 if (!all_of(Phi->incoming_values(),
4571 [](Value *V) { return isa<ConstantInt>(V); }))
4572 return false;
4573 continue;
4574 }
4575
4576 if (!isa<ConstantInt>(Op))
4577 return false;
4578 }
4579
4580 if (!Phi)
4581 return false;
4582
4583 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4584 dbgs() << " original: " << *Phi << "\n";
4585 dbgs() << " " << GEPI << "\n";);
4586
4587 auto GetNewOps = [&](Value *PhiOp) {
4588 SmallVector<Value *> NewOps;
4589 for (Value *Op : GEPI.operands())
4590 if (Op == Phi)
4591 NewOps.push_back(PhiOp);
4592 else
4593 NewOps.push_back(Op);
4594 return NewOps;
4595 };
4596
4597 IRB.SetInsertPoint(Phi);
4598 PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
4599 Phi->getName() + ".sroa.phi");
4600
4601 Type *SourceTy = GEPI.getSourceElementType();
4602 // We only handle arguments, constants, and static allocas here, so we can
4603 // insert GEPs at the end of the entry block.
4604 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4605 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4606 Value *Op = Phi->getIncomingValue(I);
4607 BasicBlock *BB = Phi->getIncomingBlock(I);
4608 Value *NewGEP;
4609 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4610 NewGEP = NewPhi->getIncomingValue(NI);
4611 } else {
4612 SmallVector<Value *> NewOps = GetNewOps(Op);
4613 NewGEP =
4614 IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
4615 Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags());
4616 }
4617 NewPhi->addIncoming(NewGEP, BB);
4618 }
4619
4620 Visited.erase(&GEPI);
4621 GEPI.replaceAllUsesWith(NewPhi);
4622 GEPI.eraseFromParent();
4623 Visited.insert(NewPhi);
4624 enqueueUsers(*NewPhi);
4625
4626 LLVM_DEBUG(dbgs() << " to: ";
4627 for (Value *In
4628 : NewPhi->incoming_values()) dbgs()
4629 << "\n " << *In;
4630 dbgs() << "\n " << *NewPhi << '\n');
4631
4632 return true;
4633 }
4634
4635 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4636 if (unfoldGEPSelect(GEPI))
4637 return true;
4638
4639 if (unfoldGEPPhi(GEPI))
4640 return true;
4641
4642 enqueueUsers(GEPI);
4643 return false;
4644 }
4645
4646 bool visitPHINode(PHINode &PN) {
4647 enqueueUsers(PN);
4648 return false;
4649 }
4650
4651 bool visitSelectInst(SelectInst &SI) {
4652 enqueueUsers(SI);
4653 return false;
4654 }
4655};
4656
4657} // end anonymous namespace
4658
4659/// Strip aggregate type wrapping.
4660///
4661/// This removes no-op aggregate types wrapping an underlying type. It will
4662/// strip as many layers of types as it can without changing either the type
4663/// size or the allocated size.
4665 if (Ty->isSingleValueType())
4666 return Ty;
4667
4668 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4669 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4670
4671 Type *InnerTy;
4672 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
4673 InnerTy = ArrTy->getElementType();
4674 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
4675 const StructLayout *SL = DL.getStructLayout(STy);
4676 unsigned Index = SL->getElementContainingOffset(0);
4677 InnerTy = STy->getElementType(Index);
4678 } else {
4679 return Ty;
4680 }
4681
4682 if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
4683 TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
4684 return Ty;
4685
4686 return stripAggregateTypeWrapping(DL, InnerTy);
4687}
4688
4689/// Try to find a partition of the aggregate type passed in for a given
4690/// offset and size.
4691///
4692/// This recurses through the aggregate type and tries to compute a subtype
4693/// based on the offset and size. When the offset and size span a sub-section
4694/// of an array, it will even compute a new array type for that sub-section,
4695/// and the same for structs.
4696///
4697/// Note that this routine is very strict and tries to find a partition of the
4698/// type which produces the *exact* right offset and size. It is not forgiving
4699/// when the size or offset cause either end of type-based partition to be off.
4700/// Also, this is a best-effort routine. It is reasonable to give up and not
4701/// return a type if necessary.
4703 uint64_t Size) {
4704 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4705 return stripAggregateTypeWrapping(DL, Ty);
4706 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4707 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4708 return nullptr;
4709
4710 if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
4711 Type *ElementTy;
4712 uint64_t TyNumElements;
4713 if (auto *AT = dyn_cast<ArrayType>(Ty)) {
4714 ElementTy = AT->getElementType();
4715 TyNumElements = AT->getNumElements();
4716 } else {
4717 // FIXME: This isn't right for vectors with non-byte-sized or
4718 // non-power-of-two sized elements.
4719 auto *VT = cast<FixedVectorType>(Ty);
4720 ElementTy = VT->getElementType();
4721 TyNumElements = VT->getNumElements();
4722 }
4723 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4724 uint64_t NumSkippedElements = Offset / ElementSize;
4725 if (NumSkippedElements >= TyNumElements)
4726 return nullptr;
4727 Offset -= NumSkippedElements * ElementSize;
4728
4729 // First check if we need to recurse.
4730 if (Offset > 0 || Size < ElementSize) {
4731 // Bail if the partition ends in a different array element.
4732 if ((Offset + Size) > ElementSize)
4733 return nullptr;
4734 // Recurse through the element type trying to peel off offset bytes.
4735 return getTypePartition(DL, ElementTy, Offset, Size);
4736 }
4737 assert(Offset == 0);
4738
4739 if (Size == ElementSize)
4740 return stripAggregateTypeWrapping(DL, ElementTy);
4741 assert(Size > ElementSize);
4742 uint64_t NumElements = Size / ElementSize;
4743 if (NumElements * ElementSize != Size)
4744 return nullptr;
4745 return ArrayType::get(ElementTy, NumElements);
4746 }
4747
4749 if (!STy)
4750 return nullptr;
4751
4752 const StructLayout *SL = DL.getStructLayout(STy);
4753
4754 if (SL->getSizeInBits().isScalable())
4755 return nullptr;
4756
4757 if (Offset >= SL->getSizeInBytes())
4758 return nullptr;
4759 uint64_t EndOffset = Offset + Size;
4760 if (EndOffset > SL->getSizeInBytes())
4761 return nullptr;
4762
4763 unsigned Index = SL->getElementContainingOffset(Offset);
4764 Offset -= SL->getElementOffset(Index);
4765
4766 Type *ElementTy = STy->getElementType(Index);
4767 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4768 if (Offset >= ElementSize)
4769 return nullptr; // The offset points into alignment padding.
4770
4771 // See if any partition must be contained by the element.
4772 if (Offset > 0 || Size < ElementSize) {
4773 if ((Offset + Size) > ElementSize)
4774 return nullptr;
4775 return getTypePartition(DL, ElementTy, Offset, Size);
4776 }
4777 assert(Offset == 0);
4778
4779 if (Size == ElementSize)
4780 return stripAggregateTypeWrapping(DL, ElementTy);
4781
4782 StructType::element_iterator EI = STy->element_begin() + Index,
4783 EE = STy->element_end();
4784 if (EndOffset < SL->getSizeInBytes()) {
4785 unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
4786 if (Index == EndIndex)
4787 return nullptr; // Within a single element and its padding.
4788
4789 // Don't try to form "natural" types if the elements don't line up with the
4790 // expected size.
4791 // FIXME: We could potentially recurse down through the last element in the
4792 // sub-struct to find a natural end point.
4793 if (SL->getElementOffset(EndIndex) != EndOffset)
4794 return nullptr;
4795
4796 assert(Index < EndIndex);
4797 EE = STy->element_begin() + EndIndex;
4798 }
4799
4800 // Try to build up a sub-structure.
4801 StructType *SubTy =
4802 StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
4803 const StructLayout *SubSL = DL.getStructLayout(SubTy);
4804 if (Size != SubSL->getSizeInBytes())
4805 return nullptr; // The sub-struct doesn't have quite the size needed.
4806
4807 return SubTy;
4808}
4809
4810/// Pre-split loads and stores to simplify rewriting.
4811///
4812/// We want to break up the splittable load+store pairs as much as
4813/// possible. This is important to do as a preprocessing step, as once we
4814/// start rewriting the accesses to partitions of the alloca we lose the
4815/// necessary information to correctly split apart paired loads and stores
4816/// which both point into this alloca. The case to consider is something like
4817/// the following:
4818///
4819/// %a = alloca [12 x i8]
4820/// %gep1 = getelementptr i8, ptr %a, i32 0
4821/// %gep2 = getelementptr i8, ptr %a, i32 4
4822/// %gep3 = getelementptr i8, ptr %a, i32 8
4823/// store float 0.0, ptr %gep1
4824/// store float 1.0, ptr %gep2
4825/// %v = load i64, ptr %gep1
4826/// store i64 %v, ptr %gep2
4827/// %f1 = load float, ptr %gep2
4828/// %f2 = load float, ptr %gep3
4829///
4830/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4831/// promote everything so we recover the 2 SSA values that should have been
4832/// there all along.
4833///
4834/// \returns true if any changes are made.
4835bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4836 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4837
4838 // Track the loads and stores which are candidates for pre-splitting here, in
4839 // the order they first appear during the partition scan. These give stable
4840 // iteration order and a basis for tracking which loads and stores we
4841 // actually split.
4844
4845 // We need to accumulate the splits required of each load or store where we
4846 // can find them via a direct lookup. This is important to cross-check loads
4847 // and stores against each other. We also track the slice so that we can kill
4848 // all the slices that end up split.
4849 struct SplitOffsets {
4850 Slice *S;
4851 std::vector<uint64_t> Splits;
4852 };
4853 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4854
4855 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4856 // This is important as we also cannot pre-split stores of those loads!
4857 // FIXME: This is all pretty gross. It means that we can be more aggressive
4858 // in pre-splitting when the load feeding the store happens to come from
4859 // a separate alloca. Put another way, the effectiveness of SROA would be
4860 // decreased by a frontend which just concatenated all of its local allocas
4861 // into one big flat alloca. But defeating such patterns is exactly the job
4862 // SROA is tasked with! Sadly, to not have this discrepancy we would have
4863 // to change store pre-splitting to actually force pre-splitting of the load
4864 // that feeds it *and all stores*. That makes pre-splitting much harder, but
4865 // maybe it would make it more principled?
4866 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4867
4868 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4869 for (auto &P : AS.partitions()) {
4870 for (Slice &S : P) {
4871 Instruction *I = cast<Instruction>(S.getUse()->getUser());
4872 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4873 // If this is a load we have to track that it can't participate in any
4874 // pre-splitting. If this is a store of a load we have to track that
4875 // that load also can't participate in any pre-splitting.
4876 if (auto *LI = dyn_cast<LoadInst>(I))
4877 UnsplittableLoads.insert(LI);
4878 else if (auto *SI = dyn_cast<StoreInst>(I))
4879 if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
4880 UnsplittableLoads.insert(LI);
4881 continue;
4882 }
4883 assert(P.endOffset() > S.beginOffset() &&
4884 "Empty or backwards partition!");
4885
4886 // Determine if this is a pre-splittable slice.
4887 if (auto *LI = dyn_cast<LoadInst>(I)) {
4888 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4889
4890 // The load must be used exclusively to store into other pointers for
4891 // us to be able to arbitrarily pre-split it. The stores must also be
4892 // simple to avoid changing semantics.
4893 auto IsLoadSimplyStored = [](LoadInst *LI) {
4894 for (User *LU : LI->users()) {
4895 auto *SI = dyn_cast<StoreInst>(LU);
4896 if (!SI || !SI->isSimple())
4897 return false;
4898 }
4899 return true;
4900 };
4901 if (!IsLoadSimplyStored(LI)) {
4902 UnsplittableLoads.insert(LI);
4903 continue;
4904 }
4905
4906 Loads.push_back(LI);
4907 } else if (auto *SI = dyn_cast<StoreInst>(I)) {
4908 if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
4909 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4910 continue;
4911 auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
4912 if (!StoredLoad || !StoredLoad->isSimple())
4913 continue;
4914 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4915
4916 Stores.push_back(SI);
4917 } else {
4918 // Other uses cannot be pre-split.
4919 continue;
4920 }
4921
4922 // Record the initial split.
4923 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4924 auto &Offsets = SplitOffsetsMap[I];
4925 assert(Offsets.Splits.empty() &&
4926 "Should not have splits the first time we see an instruction!");
4927 Offsets.S = &S;
4928 Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
4929 }
4930
4931 // Now scan the already split slices, and add a split for any of them which
4932 // we're going to pre-split.
4933 for (Slice *S : P.splitSliceTails()) {
4934 auto SplitOffsetsMapI =
4935 SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
4936 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4937 continue;
4938 auto &Offsets = SplitOffsetsMapI->second;
4939
4940 assert(Offsets.S == S && "Found a mismatched slice!");
4941 assert(!Offsets.Splits.empty() &&
4942 "Cannot have an empty set of splits on the second partition!");
4943 assert(Offsets.Splits.back() ==
4944 P.beginOffset() - Offsets.S->beginOffset() &&
4945 "Previous split does not end where this one begins!");
4946
4947 // Record each split. The last partition's end isn't needed as the size
4948 // of the slice dictates that.
4949 if (S->endOffset() > P.endOffset())
4950 Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
4951 }
4952 }
4953
4954 // We may have split loads where some of their stores are split stores. For
4955 // such loads and stores, we can only pre-split them if their splits exactly
4956 // match relative to their starting offset. We have to verify this prior to
4957 // any rewriting.
4958 llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4959 // Lookup the load we are storing in our map of split
4960 // offsets.
4961 auto *LI = cast<LoadInst>(SI->getValueOperand());
4962 // If it was completely unsplittable, then we're done,
4963 // and this store can't be pre-split.
4964 if (UnsplittableLoads.count(LI))
4965 return true;
4966
4967 auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4968 if (LoadOffsetsI == SplitOffsetsMap.end())
4969 return false; // Unrelated loads are definitely safe.
4970 auto &LoadOffsets = LoadOffsetsI->second;
4971
4972 // Now lookup the store's offsets.
4973 auto &StoreOffsets = SplitOffsetsMap[SI];
4974
4975 // If the relative offsets of each split in the load and
4976 // store match exactly, then we can split them and we
4977 // don't need to remove them here.
4978 if (LoadOffsets.Splits == StoreOffsets.Splits)
4979 return false;
4980
4981 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4982 << " " << *LI << "\n"
4983 << " " << *SI << "\n");
4984
4985 // We've found a store and load that we need to split
4986 // with mismatched relative splits. Just give up on them
4987 // and remove both instructions from our list of
4988 // candidates.
4989 UnsplittableLoads.insert(LI);
4990 return true;
4991 });
4992 // Now we have to go *back* through all the stores, because a later store may
4993 // have caused an earlier store's load to become unsplittable and if it is
4994 // unsplittable for the later store, then we can't rely on it being split in
4995 // the earlier store either.
4996 llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4997 auto *LI = cast<LoadInst>(SI->getValueOperand());
4998 return UnsplittableLoads.count(LI);
4999 });
5000 // Once we've established all the loads that can't be split for some reason,
5001 // filter any that made it into our list out.
5002 llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
5003 return UnsplittableLoads.count(LI);
5004 });
5005
5006 // If no loads or stores are left, there is no pre-splitting to be done for
5007 // this alloca.
5008 if (Loads.empty() && Stores.empty())
5009 return false;
5010
5011 // From here on, we can't fail and will be building new accesses, so rig up
5012 // an IR builder.
5013 IRBuilderTy IRB(&AI);
5014
5015 // Collect the new slices which we will merge into the alloca slices.
5016 SmallVector<Slice, 4> NewSlices;
5017
5018 // Track any allocas we end up splitting loads and stores for so we iterate
5019 // on them.
5020 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
5021
5022 // At this point, we have collected all of the loads and stores we can
5023 // pre-split, and the specific splits needed for them. We actually do the
5024 // splitting in a specific order in order to handle when one of the loads is
5025 // the value operand to one of the stores.
5026 //
5027 // First, we rewrite all of the split loads, and just accumulate each split
5028 // load in a parallel structure. We also build the slices for them and append
5029 // them to the alloca slices.
5030 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
5031 std::vector<LoadInst *> SplitLoads;
5032 const DataLayout &DL = AI.getDataLayout();
5033 for (LoadInst *LI : Loads) {
5034 SplitLoads.clear();
5035
5036 auto &Offsets = SplitOffsetsMap[LI];
5037 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
5038 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
5039 "Load must have type size equal to store size");
5040 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
5041 "Load must be >= slice size");
5042
5043 uint64_t BaseOffset = Offsets.S->beginOffset();
5044 assert(BaseOffset + SliceSize > BaseOffset &&
5045 "Cannot represent alloca access size using 64-bit integers!");
5046
5048 IRB.SetInsertPoint(LI);
5049
5050 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
5051
5052 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
5053 int Idx = 0, Size = Offsets.Splits.size();
5054 for (;;) {
5055 auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
5056 auto AS = LI->getPointerAddressSpace();
5057 auto *PartPtrTy = LI->getPointerOperandType();
5058 LoadInst *PLoad = IRB.CreateAlignedLoad(
5059 PartTy,
5060 getAdjustedPtr(IRB, DL, BasePtr,
5061 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5062 PartPtrTy, BasePtr->getName() + "."),
5063 getAdjustedAlignment(LI, PartOffset),
5064 /*IsVolatile*/ false, LI->getName());
5065 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
5066 LLVMContext::MD_access_group});
5067
5068 // Append this load onto the list of split loads so we can find it later
5069 // to rewrite the stores.
5070 SplitLoads.push_back(PLoad);
5071
5072 // Now build a new slice for the alloca.
5073 NewSlices.push_back(
5074 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5075 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
5076 /*IsSplittable*/ false));
5077 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5078 << ", " << NewSlices.back().endOffset()
5079 << "): " << *PLoad << "\n");
5080
5081 // See if we've handled all the splits.
5082 if (Idx >= Size)
5083 break;
5084
5085 // Setup the next partition.
5086 PartOffset = Offsets.Splits[Idx];
5087 ++Idx;
5088 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
5089 }
5090
5091 // Now that we have the split loads, do the slow walk over all uses of the
5092 // load and rewrite them as split stores, or save the split loads to use
5093 // below if the store is going to be split there anyways.
5094 bool DeferredStores = false;
5095 for (User *LU : LI->users()) {
5096 StoreInst *SI = cast<StoreInst>(LU);
5097 if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
5098 DeferredStores = true;
5099 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
5100 << "\n");
5101 continue;
5102 }
5103
5104 Value *StoreBasePtr = SI->getPointerOperand();
5105 IRB.SetInsertPoint(SI);
5106 AAMDNodes AATags = SI->getAAMetadata();
5107
5108 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
5109
5110 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
5111 LoadInst *PLoad = SplitLoads[Idx];
5112 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
5113 auto *PartPtrTy = SI->getPointerOperandType();
5114
5115 auto AS = SI->getPointerAddressSpace();
5116 StoreInst *PStore = IRB.CreateAlignedStore(
5117 PLoad,
5118 getAdjustedPtr(IRB, DL, StoreBasePtr,
5119 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5120 PartPtrTy, StoreBasePtr->getName() + "."),
5121 getAdjustedAlignment(SI, PartOffset),
5122 /*IsVolatile*/ false);
5123 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5124 LLVMContext::MD_access_group,
5125 LLVMContext::MD_DIAssignID});
5126
5127 if (AATags)
5128 PStore->setAAMetadata(
5129 AATags.adjustForAccess(PartOffset, PLoad->getType(), DL));
5130 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
5131 }
5132
5133 // We want to immediately iterate on any allocas impacted by splitting
5134 // this store, and we have to track any promotable alloca (indicated by
5135 // a direct store) as needing to be resplit because it is no longer
5136 // promotable.
5137 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
5138 ResplitPromotableAllocas.insert(OtherAI);
5139 Worklist.insert(OtherAI);
5140 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5141 StoreBasePtr->stripInBoundsOffsets())) {
5142 Worklist.insert(OtherAI);
5143 }
5144
5145 // Mark the original store as dead.
5146 DeadInsts.push_back(SI);
5147 }
5148
5149 // Save the split loads if there are deferred stores among the users.
5150 if (DeferredStores)
5151 SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
5152
5153 // Mark the original load as dead and kill the original slice.
5154 DeadInsts.push_back(LI);
5155 Offsets.S->kill();
5156 }
5157
5158 // Second, we rewrite all of the split stores. At this point, we know that
5159 // all loads from this alloca have been split already. For stores of such
5160 // loads, we can simply look up the pre-existing split loads. For stores of
5161 // other loads, we split those loads first and then write split stores of
5162 // them.
5163 for (StoreInst *SI : Stores) {
5164 auto *LI = cast<LoadInst>(SI->getValueOperand());
5165 IntegerType *Ty = cast<IntegerType>(LI->getType());
5166 assert(Ty->getBitWidth() % 8 == 0);
5167 uint64_t StoreSize = Ty->getBitWidth() / 8;
5168 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
5169
5170 auto &Offsets = SplitOffsetsMap[SI];
5171 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
5172 "Slice size should always match load size exactly!");
5173 uint64_t BaseOffset = Offsets.S->beginOffset();
5174 assert(BaseOffset + StoreSize > BaseOffset &&
5175 "Cannot represent alloca access size using 64-bit integers!");
5176
5177 Value *LoadBasePtr = LI->getPointerOperand();
5178 Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
5179
5180 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
5181
5182 // Check whether we have an already split load.
5183 auto SplitLoadsMapI = SplitLoadsMap.find(LI);
5184 std::vector<LoadInst *> *SplitLoads = nullptr;
5185 if (SplitLoadsMapI != SplitLoadsMap.end()) {
5186 SplitLoads = &SplitLoadsMapI->second;
5187 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
5188 "Too few split loads for the number of splits in the store!");
5189 } else {
5190 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
5191 }
5192
5193 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
5194 int Idx = 0, Size = Offsets.Splits.size();
5195 for (;;) {
5196 auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
5197 auto *LoadPartPtrTy = LI->getPointerOperandType();
5198 auto *StorePartPtrTy = SI->getPointerOperandType();
5199
5200 // Either lookup a split load or create one.
5201 LoadInst *PLoad;
5202 if (SplitLoads) {
5203 PLoad = (*SplitLoads)[Idx];
5204 } else {
5205 IRB.SetInsertPoint(LI);
5206 auto AS = LI->getPointerAddressSpace();
5207 PLoad = IRB.CreateAlignedLoad(
5208 PartTy,
5209 getAdjustedPtr(IRB, DL, LoadBasePtr,
5210 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5211 LoadPartPtrTy, LoadBasePtr->getName() + "."),
5212 getAdjustedAlignment(LI, PartOffset),
5213 /*IsVolatile*/ false, LI->getName());
5214 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
5215 LLVMContext::MD_access_group});
5216 }
5217
5218 // And store this partition.
5219 IRB.SetInsertPoint(SI);
5220 auto AS = SI->getPointerAddressSpace();
5221 StoreInst *PStore = IRB.CreateAlignedStore(
5222 PLoad,
5223 getAdjustedPtr(IRB, DL, StoreBasePtr,
5224 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5225 StorePartPtrTy, StoreBasePtr->getName() + "."),
5226 getAdjustedAlignment(SI, PartOffset),
5227 /*IsVolatile*/ false);
5228 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5229 LLVMContext::MD_access_group});
5230
5231 // Now build a new slice for the alloca.
5232 NewSlices.push_back(
5233 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5234 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
5235 /*IsSplittable*/ false));
5236 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5237 << ", " << NewSlices.back().endOffset()
5238 << "): " << *PStore << "\n");
5239 if (!SplitLoads) {
5240 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5241 }
5242
5243 // See if we've finished all the splits.
5244 if (Idx >= Size)
5245 break;
5246
5247 // Setup the next partition.
5248 PartOffset = Offsets.Splits[Idx];
5249 ++Idx;
5250 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
5251 }
5252
5253 // We want to immediately iterate on any allocas impacted by splitting
5254 // this load, which is only relevant if it isn't a load of this alloca and
5255 // thus we didn't already split the loads above. We also have to keep track
5256 // of any promotable allocas we split loads on as they can no longer be
5257 // promoted.
5258 if (!SplitLoads) {
5259 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
5260 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5261 ResplitPromotableAllocas.insert(OtherAI);
5262 Worklist.insert(OtherAI);
5263 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5264 LoadBasePtr->stripInBoundsOffsets())) {
5265 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5266 Worklist.insert(OtherAI);
5267 }
5268 }
5269
5270 // Mark the original store as dead now that we've split it up and kill its
5271 // slice. Note that we leave the original load in place unless this store
5272 // was its only use. It may in turn be split up if it is an alloca load
5273 // for some other alloca, but it may be a normal load. This may introduce
5274 // redundant loads, but where those can be merged the rest of the optimizer
5275 // should handle the merging, and this uncovers SSA splits which is more
5276 // important. In practice, the original loads will almost always be fully
5277 // split and removed eventually, and the splits will be merged by any
5278 // trivial CSE, including instcombine.
5279 if (LI->hasOneUse()) {
5280 assert(*LI->user_begin() == SI && "Single use isn't this store!");
5281 DeadInsts.push_back(LI);
5282 }
5283 DeadInsts.push_back(SI);
5284 Offsets.S->kill();
5285 }
5286
5287 // Remove the killed slices that have been pre-split.
5288 llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
5289
5290 // Insert our new slices. This will sort and merge them into the sorted
5291 // sequence.
5292 AS.insert(NewSlices);
5293
5294 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5295#ifndef NDEBUG
5296 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5297 LLVM_DEBUG(AS.print(dbgs(), I, " "));
5298#endif
5299
5300 // Finally, don't try to promote any allocas that now require re-splitting.
5301 // They have already been added to the worklist above.
5302 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5303
5304 return true;
5305}
5306
5307/// Try to canonicalize a homogeneous struct partition to a vector type.
5308///
5309/// We can do this if all the elements of the struct are the same and tightly
5310/// packed. This can sometimes eliminate allocas because structs cannot get
5311/// promoted to LLVM values, but vectors can.
5312///
5313/// We only apply this transformation when all users of the alloca are memory
5314/// intrinsics. Otherwise, if there is a load or store of some other type to the
5315/// partition, SROA would select that type.
5316///
5317/// Applying this transformation too early may hinder memcpyopt, which may
5318/// generate better code when eliminating allocas. For example, see
5319/// `struct-to-vector-fp-store-only-tail.ll`, which demonstrates that applying
5320/// this before memcpyopt can initialize previously uninitialized memory when
5321/// the alloca gets promoted to an SSA value. For another example, see
5322/// `struct-to-vector-before-memcpyopt.ll`, which demonstrates that applying
5323/// this before memcpyopt can result in promoting an alloca so that we load a
5324/// temporary value instead of copying the temporary value into memory, whereas
5325/// memcpyopt eliminates the temporary altogether.
5326///
5327/// As such, we only apply this transformation after memcpyopt has run. We gate
5328/// this transformation by the "AggregateToVector" pass option.
///
/// \returns A fixed vector type built from the struct's element type and
/// element count, or nullptr when the struct does not qualify.
5330 Partition &P,
5331 const DataLayout &DL) {
5332 unsigned NumElts = STy->getNumElements();
5333
5334 Type *EltTy = STy->getElementType(0);
  // Every element must have the same type for the struct to be homogeneous.
5335 if (!llvm::all_equal(STy->elements()))
5336 return nullptr;
5337
  // Only integer, floating-point, and integral pointer element types are
  // accepted; non-integral pointers are rejected.
5338 bool IsIntegralPointerTy =
5339 EltTy->isPointerTy() && !DL.isNonIntegralPointerType(EltTy);
5340 if (!EltTy->isIntegerTy() && !EltTy->isFloatingPointTy() &&
5341 !IsIntegralPointerTy)
5342 return nullptr;
5343
  // The candidate vector must occupy exactly as many bytes as the struct
  // layout does; otherwise the two types are not interchangeable here.
5344 auto *VTy = FixedVectorType::get(EltTy, NumElts);
5345 TypeSize StructSize = DL.getStructLayout(STy)->getSizeInBytes();
5346 TypeSize VectorSize = DL.getTypeAllocSize(VTy);
5347 if (StructSize != VectorSize)
5348 return nullptr;
5349
  // Inspect the users of the partition's slices. Dead slices and slices with
  // no use are ignored.
5350 for (const Slice &S : P) {
5351 if (S.isDead())
5352 continue;
5353 auto *U = S.getUse();
5354 if (!U)
5355 continue;
5356
5357 User *Usr = U->getUser();
5359 continue;
5360
  // Any remaining user that is not a memory intrinsic blocks the
  // canonicalization.
5361 if (!isa<MemIntrinsic>(Usr))
5362 return nullptr;
5363 }
5364
5365 return VTy;
5366}
5367
5368/// Select a partition type for an alloca partition.
5369///
5370/// Try to compute a friendly type for this partition of the alloca. This
5371/// won't always succeed, in which case we fall back to a legal integer type
5372/// or an i8 array of an appropriate size.
5373///
/// When \p AggregateToVector is set, a homogeneous struct partition type may
/// additionally be canonicalized to a vector type as a late fallback.
///
5374/// \returns A tuple with the following elements:
5375/// - PartitionType: The computed type for this partition.
5376/// - IsIntegerWideningViable: True if integer widening promotion is used.
5377/// - VectorType: The vector type if vector promotion is used, otherwise
5378/// nullptr.
5379static std::tuple<Type *, bool, VectorType *>
5381 LLVMContext &C, bool AggregateToVector) {
  // Debug-only helper: prints the selection path taken together with the
  // function name, alloca name, partition range/size, the allocation size (if
  // known) and the chosen types.
5382 auto LogSelection = [&](StringRef Path, Type *SelectedTy,
5383 VectorType *SelectedVecTy, bool SelectedIntWidening) {
5384 LLVM_DEBUG({
5385 dbgs() << "selectPartitionType path=" << Path
5386 << " func=" << AI.getFunction()->getName() << " alloca=";
5387 if (AI.hasName())
5388 dbgs() << AI.getName();
5389 else
5390 dbgs() << "<unnamed>";
5391 dbgs() << " partition=[" << P.beginOffset() << "," << P.endOffset()
5392 << ") size=" << P.size();
5393 if (std::optional<TypeSize> AllocSize = AI.getAllocationSize(DL))
5394 dbgs() << " alloc-size=" << AllocSize->getKnownMinValue();
5395 if (SelectedTy)
5396 dbgs() << " chosen=" << *SelectedTy;
5397 if (SelectedVecTy)
5398 dbgs() << " vec=" << *SelectedVecTy;
5399 dbgs() << " intwiden=" << SelectedIntWidening << "\n";
5400 });
5401 };
5402 // First check if the partition is viable for vector promotion.
5403 //
5404 // We prefer vector promotion over integer widening promotion when:
5405 // - The vector element type is a floating-point type.
5406 // - All the loads/stores to the alloca are vector loads/stores to the
5407 // entire alloca or load/store a single element of the vector.
5408 //
5409 // Otherwise when there is an integer vector with mixed type loads/stores we
5410 // prefer integer widening promotion because it's more likely the user is
5411 // doing bitwise arithmetic and we generate better code.
5412 VectorType *VecTy =
5414 // If the vector element type is a floating-point type, we prefer vector
5415 // promotion. If the vector has one element, let the below code select
5416 // whether we promote with the vector or scalar.
5417 if (VecTy && VecTy->getElementType()->isFloatingPointTy() &&
5418 VecTy->getElementCount().getFixedValue() > 1) {
5419 LogSelection("direct-fp-vecty", VecTy, VecTy, false);
5420 return {VecTy, false, VecTy};
5421 }
5422
5423 // Check if there is a common type that all slices of the partition use that
5424 // spans the partition.
5425 auto [CommonUseTy, LargestIntTy] =
5426 findCommonType(P.begin(), P.end(), P.endOffset());
5427 if (CommonUseTy) {
  // The common type is only usable if its alloc size is fixed and covers the
  // whole partition.
5428 TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
5429 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
5430 // We prefer vector promotion here because if vector promotion is viable
5431 // and there is a common type used, then it implies the second listed
5432 // condition for preferring vector promotion is true.
5433 if (VecTy) {
5434 LogSelection("common-type-vecty", VecTy, VecTy, false);
5435 return {VecTy, false, VecTy};
5436 }
5437 bool IntWiden = isIntegerWideningViable(P, CommonUseTy, DL);
5438 LogSelection("common-type", CommonUseTy, nullptr, IntWiden);
5439 return {CommonUseTy, IntWiden, nullptr};
5440 }
5441 }
5442
5443 // Can we find an appropriate subtype in the original allocated
5444 // type?
5445 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5446 P.beginOffset(), P.size())) {
5447 // If the partition is an integer array that can be spanned by a legal
5448 // integer type, prefer to represent it as a legal integer type because
5449 // it's more likely to be promotable.
5450 if (TypePartitionTy->isArrayTy() &&
5451 TypePartitionTy->getArrayElementType()->isIntegerTy() &&
5452 DL.isLegalInteger(P.size() * 8))
5453 TypePartitionTy = Type::getIntNTy(C, P.size() * 8);
5454 // There was no common type used, so we prefer integer widening promotion.
5455 if (isIntegerWideningViable(P, TypePartitionTy, DL)) {
5456 LogSelection("type-partition-int-widen", TypePartitionTy, nullptr, true);
5457 return {TypePartitionTy, true, nullptr};
5458 }
5459 if (VecTy) {
5460 LogSelection("type-partition-vecty", VecTy, VecTy, false);
5461 return {VecTy, false, VecTy};
5462 }
5463 // If we couldn't promote with TypePartitionTy, try with the largest
5464 // integer type used.
5465 if (LargestIntTy &&
5466 DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() &&
5467 isIntegerWideningViable(P, LargestIntTy, DL)) {
5468 LogSelection("largest-int-int-widen", LargestIntTy, nullptr, true);
5469 return {LargestIntTy, true, nullptr};
5470 }
5471
5472 // Try homogeneous struct to vector canonicalization when requested. Running
5473 // this too early can hide memcpy chains from MemCpyOpt.
5474 if (AggregateToVector) {
5475 if (auto *STy = dyn_cast<StructType>(TypePartitionTy)) {
5476 if (auto *VTy = tryCanonicalizeStructToVector(STy, P, DL)) {
  // Note: VTy is returned only as the partition type. The third tuple
  // element stays nullptr, so this path does not report vector promotion.
5477 LogSelection("struct-fallback-vecty", VTy, nullptr, false);
5478 return {VTy, false, nullptr};
5479 }
5480 }
5481 }
5482
5483 // Fallback to TypePartitionTy and we probably won't promote.
5484 LogSelection("type-partition-fallback", TypePartitionTy, nullptr, false);
5485 return {TypePartitionTy, false, nullptr};
5486 }
5487
  // The remaining fallbacks all return with integer widening and vector
  // promotion disabled.
5488 // Select the largest integer type used if it spans the partition.
5489 if (LargestIntTy &&
5490 DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size()) {
5491 LogSelection("largest-int-fallback", LargestIntTy, nullptr, false);
5492 return {LargestIntTy, false, nullptr};
5493 }
5494
5495 // Select a legal integer type if it spans the partition.
5496 if (DL.isLegalInteger(P.size() * 8)) {
5497 Type *IntTy = Type::getIntNTy(C, P.size() * 8);
5498 LogSelection("legal-int-fallback", IntTy, nullptr, false);
5499 return {IntTy, false, nullptr};
5500 }
5501
5502 // Fallback to an i8 array.
5503 Type *ArrayTy = ArrayType::get(Type::getInt8Ty(C), P.size());
5504 LogSelection("byte-array-fallback", ArrayTy, nullptr, false);
5505 return {ArrayTy, false, nullptr};
5506}
5507
5508/// Rewrite an alloca partition's users.
5509///
5510/// This routine drives both of the rewriting goals of the SROA pass. It tries
5511/// to rewrite uses of an alloca partition to be conducive for SSA value
5512/// promotion. If the partition needs a new, more refined alloca, this will
5513/// build that new alloca, preserving as much type information as possible, and
5514/// rewrite the uses of the old alloca to point at the new one and have the
5515/// appropriate new offsets. It also evaluates how successful the rewrite was
5516/// at enabling promotion and if it was successful queues the alloca to be
5517/// promoted.
///
/// \returns The alloca now backing the partition (either \p AI itself or a
/// newly created alloca) paired with the size in bits of the selected
/// partition type, or {nullptr, 0} when the partition is not promotable and no
/// new alloca was created.
5518std::pair<AllocaInst *, uint64_t>
5519SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) {
5520 const DataLayout &DL = AI.getDataLayout();
5521 // Select the type for the new alloca that spans the partition.
5522 auto [PartitionTy, IsIntegerWideningViable, VecTy] =
5523 selectPartitionType(P, DL, AI, *C, AggregateToVector);
5524
5525 // Check for the case where we're going to rewrite to a new alloca of the
5526 // exact same type as the original, and with the same access offsets. In that
5527 // case, re-use the existing alloca, but still run through the rewriter to
5528 // perform phi and select speculation.
5529 // P.beginOffset() can be non-zero even with the same type in a case with
5530 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5531 AllocaInst *NewAI;
5532 if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == 0) {
5533 NewAI = &AI;
5534 // FIXME: We should be able to bail at this point with "nothing changed".
5535 // FIXME: We might want to defer PHI speculation until after here.
5536 // FIXME: return nullptr;
5537 } else {
5538 // Make sure the alignment is compatible with P.beginOffset().
5539 const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
5540 // If we will get at least this much alignment from the type alone, leave
5541 // the alloca's alignment unconstrained.
5542 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(PartitionTy);
  // The new alloca is inserted at the position of the original one and is
  // named after it with a ".sroa.<index>" suffix, where the index is the
  // partition's first slice position within AS.
5543 NewAI = new AllocaInst(
5544 PartitionTy, AI.getAddressSpace(), nullptr,
5545 IsUnconstrained ? DL.getPrefTypeAlign(PartitionTy) : Alignment,
5546 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5547 AI.getIterator());
5548 // Copy the old AI debug location over to the new one.
5549 NewAI->setDebugLoc(AI.getDebugLoc());
5550 ++NumNewAllocas;
5551 }
5552
5553 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5554 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5555
5556 // Track the high watermark on the worklist as it is only relevant for
5557 // promoted allocas. We will reset it to this point if the alloca is not in
5558 // fact scheduled for promotion.
5559 unsigned PPWOldSize = PostPromotionWorklist.size();
5560 unsigned NumUses = 0;
5561 SmallSetVector<PHINode *, 8> PHIUsers;
5562 SmallSetVector<SelectInst *, 8> SelectUsers;
5563
5564 AllocaSliceRewriter Rewriter(
5565 DL, AS, *this, AI, *NewAI, PartitionTy, P.beginOffset(), P.endOffset(),
5566 IsIntegerWideningViable, VecTy, PHIUsers, SelectUsers);
5567 bool Promotable = true;
5568 // Check whether we can have tree-structured merge.
5569 if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
5570 NumUses += DeletedValues->size() + 1;
5571 for (Value *V : *DeletedValues)
5572 DeadInsts.push_back(V);
5573 } else {
  // Otherwise rewrite slice by slice: first the tails of slices split by an
  // earlier partition, then the slices that start in this partition. Any
  // visit returning false makes the partition non-promotable.
5574 for (Slice *S : P.splitSliceTails()) {
5575 Promotable &= Rewriter.visit(S);
5576 ++NumUses;
5577 }
5578 for (Slice &S : P) {
5579 Promotable &= Rewriter.visit(&S);
5580 ++NumUses;
5581 }
5582 }
5583
5584 NumAllocaPartitionUses += NumUses;
5585 MaxUsesPerAllocaPartition.updateMax(NumUses);
5586
5587 // Now that we've processed all the slices in the new partition, check if any
5588 // PHIs or Selects would block promotion.
  // A single unsafe PHI discards all collected PHI and select users; with
  // SelectUsers emptied, the select loop below does nothing.
5589 for (PHINode *PHI : PHIUsers)
5590 if (!isSafePHIToSpeculate(*PHI)) {
5591 Promotable = false;
5592 PHIUsers.clear();
5593 SelectUsers.clear();
5594 break;
5595 }
5596
5598 NewSelectsToRewrite;
5599 NewSelectsToRewrite.reserve(SelectUsers.size());
5600 for (SelectInst *Sel : SelectUsers) {
5601 std::optional<RewriteableMemOps> Ops =
5602 isSafeSelectToSpeculate(*Sel, PreserveCFG);
5603 if (!Ops) {
  // One unsafe select likewise abandons all pending speculation work.
5604 Promotable = false;
5605 PHIUsers.clear();
5606 SelectUsers.clear();
5607 NewSelectsToRewrite.clear();
5608 break;
5609 }
5610 NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
5611 }
5612
5613 if (Promotable) {
  // Drop the droppable uses recorded as dead-if-promotable, and queue the
  // value each one referenced for deletion if it is now trivially dead.
5614 for (Use *U : AS.getDeadUsesIfPromotable()) {
5615 auto *OldInst = dyn_cast<Instruction>(U->get());
5616 Value::dropDroppableUse(*U);
5617 if (OldInst)
5618 if (isInstructionTriviallyDead(OldInst))
5619 DeadInsts.push_back(OldInst);
5620 }
5621 if (PHIUsers.empty() && SelectUsers.empty()) {
5622 // Promote the alloca.
5623 PromotableAllocas.insert(NewAI);
5624 } else {
5625 // If we have either PHIs or Selects to speculate, add them to those
5626 // worklists and re-queue the new alloca so that we promote it on the
5627 // next iteration.
5628 SpeculatablePHIs.insert_range(PHIUsers);
5629 SelectsToRewrite.reserve(SelectsToRewrite.size() +
5630 NewSelectsToRewrite.size());
5631 for (auto &&KV : llvm::make_range(
5632 std::make_move_iterator(NewSelectsToRewrite.begin()),
5633 std::make_move_iterator(NewSelectsToRewrite.end())))
5634 SelectsToRewrite.insert(std::move(KV));
5635 Worklist.insert(NewAI);
5636 }
5637 } else {
5638 // Drop any post-promotion work items if promotion didn't happen.
5639 while (PostPromotionWorklist.size() > PPWOldSize)
5640 PostPromotionWorklist.pop_back();
5641
5642 // We couldn't promote and we didn't create a new partition, nothing
5643 // happened.
5644 if (NewAI == &AI)
5645 return {nullptr, 0};
5646
5647 // If we can't promote the alloca, iterate on it to check for new
5648 // refinements exposed by splitting the current alloca. Don't iterate on an
5649 // alloca which didn't actually change and didn't get promoted.
5650 Worklist.insert(NewAI);
5651 }
5652
5653 return {NewAI, DL.getTypeSizeInBits(PartitionTy).getFixedValue()};
5654}
5655
5656// There isn't a shared interface to get the "address" parts out of a
5657// dbg.declare and dbg.assign, so provide some wrappers.
5660 return DVR->isKillAddress();
5661 return DVR->isKillLocation();
5662}
5663
5666 return DVR->getAddressExpression();
5667 return DVR->getExpression();
5668}
5669
5670/// Create or replace an existing fragment in a DIExpression with \p Frag.
5671/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5672/// operation, add \p BitExtractOffset to the offset part.
5673///
5674/// Returns the new expression, or nullptr if this fails (see details below).
5675///
5676/// This function is similar to DIExpression::createFragmentExpression except
5677/// for 3 important distinctions:
5678/// 1. The new fragment isn't relative to an existing fragment.
5679/// 2. It assumes the computed location is a memory location. This means we
5680/// don't need to perform checks that creating the fragment preserves the
5681/// expression semantics.
5682/// 3. Existing extract_bits are modified independently of fragment changes
5683/// using \p BitExtractOffset. A change to the fragment offset or size
5684/// may affect a bit extract. But a bit extract offset can change
5685/// independently of the fragment dimensions.
5686///
5687/// Returns the new expression, or nullptr if one couldn't be created.
5688/// Ideally this is only used to signal that a bit-extract has become
5689/// zero-sized (and thus the new debug record has no size and can be
5690/// dropped), however, it fails for other reasons too - see the FIXME below.
5691///
5692/// FIXME: To keep the change that introduces this function NFC it bails
5693/// in some situations unnecessarily, e.g. when fragment and bit extract
5694/// sizes differ.
5697 int64_t BitExtractOffset) {
5699 bool HasFragment = false;
5700 bool HasBitExtract = false;
5701
  // Walk the existing expression and rebuild its operand list in Ops.
5702 for (auto &Op : Expr->expr_ops()) {
  // An existing fragment op is remembered but not copied into Ops.
5703 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5704 HasFragment = true;
5705 continue;
5706 }
5707 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5709 HasBitExtract = true;
5710 int64_t ExtractOffsetInBits = Op.getArg(0);
5711 int64_t ExtractSizeInBits = Op.getArg(1);
5712
5713 // DIExpression::createFragmentExpression doesn't know how to handle
5714 // a fragment that is smaller than the extract. Copy the behaviour
5715 // (bail) to avoid non-NFC changes.
5716 // FIXME: Don't do this.
5717 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5718 return nullptr;
5719
  // The adjustment is expected to be zero or negative, so it can only move
  // the extract offset towards zero.
5720 assert(BitExtractOffset <= 0);
5721 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5722
5723 // DIExpression::createFragmentExpression doesn't know what to do
5724 // if the new extract starts "outside" the existing one. Copy the
5725 // behaviour (bail) to avoid non-NFC changes.
5726 // FIXME: Don't do this.
5727 if (AdjustedOffset < 0)
5728 return nullptr;
5729
  // Re-emit the extract op with the adjusted offset and the original size.
  // AdjustedOffset is already non-negative here because of the early return
  // above, so the max() does not change the value.
5730 Ops.push_back(Op.getOp());
5731 Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
5732 Ops.push_back(ExtractSizeInBits);
5733 continue;
5734 }
  // All other ops are copied through unchanged.
5735 Op.appendToVector(Ops);
5736 }
5737
5738 // Unsupported by createFragmentExpression, so don't support it here yet to
5739 // preserve NFC-ness.
5740 if (HasFragment && HasBitExtract)
5741 return nullptr;
5742
  // Only when the expression has no bit extract are the new fragment's offset
  // and size appended.
5743 if (!HasBitExtract) {
5745 Ops.push_back(Frag.OffsetInBits);
5746 Ops.push_back(Frag.SizeInBits);
5747 }
5748 return DIExpression::get(Expr->getContext(), Ops);
5749}
5750
5751/// Insert a new DbgRecord.
5752/// \p Orig Original to copy record type, debug loc and variable from, and
5753/// additionally value and value expression for dbg_assign records.
5754/// \p NewAddr Location's new base address.
5755/// \p NewAddrExpr New expression to apply to address.
5756/// \p BeforeInst Insert position.
5757/// \p NewFragment New fragment (absolute, non-relative).
5758/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5759static void
5761 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5762 std::optional<DIExpression::FragmentInfo> NewFragment,
5763 int64_t BitExtractAdjustment) {
5764 (void)DIB;
5765
5766 // A dbg_assign puts fragment info in the value expression only. The address
5767 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5768 // new fragment info into NewAddrExpr (as it only has one expression).
5769 DIExpression *NewFragmentExpr =
5770 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5771 if (NewFragment)
5772 NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5773 BitExtractAdjustment);
5774 if (!NewFragmentExpr)
5775 return;
5776
5777 if (Orig->isDbgDeclare()) {
5779 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5780 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5781 BeforeInst->getIterator());
5782 return;
5783 }
5784
5785 if (Orig->isDbgValue()) {
5787 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5788 // Drop debug information if the expression doesn't start with a
5789 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5790 // describes the address of alloca rather than the value inside the alloca.
5791 if (!NewFragmentExpr->startsWithDeref())
5792 DVR->setKillAddress();
5793 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5794 BeforeInst->getIterator());
5795 return;
5796 }
5797
5798 // Apply a DIAssignID to the store if it doesn't already have it.
5799 if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5800 NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5802 }
5803
5805 NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5806 NewAddrExpr, Orig->getDebugLoc());
5807 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5808 (void)NewAssign;
5809}
5810
5811/// Walks the slices of an alloca and form partitions based on them,
5812/// rewriting each of their uses.
5813bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5814 if (AS.begin() == AS.end())
5815 return false;
5816
5817 unsigned NumPartitions = 0;
5818 bool Changed = false;
5819 const DataLayout &DL = AI.getModule()->getDataLayout();
5820
5821 // First try to pre-split loads and stores.
5822 Changed |= presplitLoadsAndStores(AI, AS);
5823
5824 // Now that we have identified any pre-splitting opportunities,
5825 // mark loads and stores unsplittable except for the following case.
5826 // We leave a slice splittable if all other slices are disjoint or fully
5827 // included in the slice, such as whole-alloca loads and stores.
5828 // If we fail to split these during pre-splitting, we want to force them
5829 // to be rewritten into a partition.
5830 bool IsSorted = true;
5831
5832 uint64_t AllocaSize = AI.getAllocationSize(DL)->getFixedValue();
5833 const uint64_t MaxBitVectorSize = 1024;
5834 if (AllocaSize <= MaxBitVectorSize) {
5835 // If a byte boundary is included in any load or store, a slice starting or
5836 // ending at the boundary is not splittable.
5837 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5838 for (Slice &S : AS)
5839 for (unsigned O = S.beginOffset() + 1;
5840 O < S.endOffset() && O < AllocaSize; O++)
5841 SplittableOffset.reset(O);
5842
5843 for (Slice &S : AS) {
5844 if (!S.isSplittable())
5845 continue;
5846
5847 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5848 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5849 continue;
5850
5851 if (isa<LoadInst>(S.getUse()->getUser()) ||
5852 isa<StoreInst>(S.getUse()->getUser())) {
5853 S.makeUnsplittable();
5854 IsSorted = false;
5855 }
5856 }
5857 } else {
5858 // We only allow whole-alloca splittable loads and stores
5859 // for a large alloca to avoid creating too large BitVector.
5860 for (Slice &S : AS) {
5861 if (!S.isSplittable())
5862 continue;
5863
5864 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5865 continue;
5866
5867 if (isa<LoadInst>(S.getUse()->getUser()) ||
5868 isa<StoreInst>(S.getUse()->getUser())) {
5869 S.makeUnsplittable();
5870 IsSorted = false;
5871 }
5872 }
5873 }
5874
5875 if (!IsSorted)
5877
5878 /// Describes the allocas introduced by rewritePartition in order to migrate
5879 /// the debug info.
5880 struct Fragment {
5881 AllocaInst *Alloca;
5882 uint64_t Offset;
5883 uint64_t Size;
5884 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5885 : Alloca(AI), Offset(O), Size(S) {}
5886 };
5887 SmallVector<Fragment, 4> Fragments;
5888
5889 // Rewrite each partition.
5890 for (auto &P : AS.partitions()) {
5891 auto [NewAI, ActiveBits] = rewritePartition(AI, AS, P);
5892 if (NewAI) {
5893 Changed = true;
5894 if (NewAI != &AI) {
5895 uint64_t SizeOfByte = 8;
5896 // Don't include any padding.
5897 uint64_t Size = std::min(ActiveBits, P.size() * SizeOfByte);
5898 Fragments.push_back(
5899 Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5900 }
5901 }
5902 ++NumPartitions;
5903 }
5904
5905 NumAllocaPartitions += NumPartitions;
5906 MaxPartitionsPerAlloca.updateMax(NumPartitions);
5907
5908 // Migrate debug information from the old alloca to the new alloca(s)
5909 // and the individual partitions.
5910 auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
5911 // Can't overlap with undef memory.
5912 if (isKillAddress(DbgVariable))
5913 return;
5914
5915 const Value *DbgPtr = DbgVariable->getAddress();
5917 DbgVariable->getFragmentOrEntireVariable();
5918 // Get the address expression constant offset if one exists and the ops
5919 // that come after it.
5920 int64_t CurrentExprOffsetInBytes = 0;
5921 SmallVector<uint64_t> PostOffsetOps;
5922 if (!getAddressExpression(DbgVariable)
5923 ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5924 return; // Couldn't interpret this DIExpression - drop the var.
5925
5926 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5927 int64_t ExtractOffsetInBits = 0;
5928 for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5929 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5931 ExtractOffsetInBits = Op.getArg(0);
5932 break;
5933 }
5934 }
5935
5936 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5937 for (auto Fragment : Fragments) {
5938 int64_t OffsetFromLocationInBits;
5939 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5940 // Find the variable fragment that the new alloca slice covers.
5941 // Drop debug info for this variable fragment if we can't compute an
5942 // intersect between it and the alloca slice.
5944 DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
5945 CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
5946 NewDbgFragment, OffsetFromLocationInBits))
5947 continue; // Do not migrate this fragment to this slice.
5948
5949 // Zero sized fragment indicates there's no intersect between the variable
5950 // fragment and the alloca slice. Skip this slice for this variable
5951 // fragment.
5952 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5953 continue; // Do not migrate this fragment to this slice.
5954
5955 // No fragment indicates DbgVariable's variable or fragment exactly
5956 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5957 if (!NewDbgFragment)
5958 NewDbgFragment = DbgVariable->getFragment();
5959
5960 // Reduce the new expression offset by the bit-extract offset since
5961 // we'll be keeping that.
5962 int64_t OffestFromNewAllocaInBits =
5963 OffsetFromLocationInBits - ExtractOffsetInBits;
5964 // We need to adjust an existing bit extract if the offset expression
5965 // can't eat the slack (i.e., if the new offset would be negative).
5966 int64_t BitExtractOffset =
5967 std::min<int64_t>(0, OffestFromNewAllocaInBits);
5968 // The magnitude of a negative value indicates the number of bits into
5969 // the existing variable fragment that the memory region begins. The new
5970 // variable fragment already excludes those bits - the new DbgPtr offset
5971 // only needs to be applied if it's positive.
5972 OffestFromNewAllocaInBits =
5973 std::max(int64_t(0), OffestFromNewAllocaInBits);
5974
5975 // Rebuild the expression:
5976 // {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5977 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5978 // address expression but the value expression instead.
5979 DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
5980 if (OffestFromNewAllocaInBits > 0) {
5981 int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8;
5982 NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
5983 }
5984
5985 // Remove any existing intrinsics on the new alloca describing
5986 // the variable fragment.
5987 auto RemoveOne = [DbgVariable](auto *OldDII) {
5988 auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5989 return LHS->getVariable() == RHS->getVariable() &&
5990 LHS->getDebugLoc()->getInlinedAt() ==
5991 RHS->getDebugLoc()->getInlinedAt();
5992 };
5993 if (SameVariableFragment(OldDII, DbgVariable))
5994 OldDII->eraseFromParent();
5995 };
5996 for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
5997 for_each(findDVRValues(Fragment.Alloca), RemoveOne);
5998 insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
5999 NewDbgFragment, BitExtractOffset);
6000 }
6001 };
6002
6003 // Migrate debug information from the old alloca to the new alloca(s)
6004 // and the individual partitions.
6005 for_each(findDVRDeclares(&AI), MigrateOne);
6006 for_each(findDVRValues(&AI), MigrateOne);
6007 for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
6008
6009 return Changed;
6010}
6011
6012/// Clobber a use with poison, deleting the used value if it becomes dead.
6013void SROA::clobberUse(Use &U) {
6014 Value *OldV = U;
6015 // Replace the use with an poison value.
6016 U = PoisonValue::get(OldV->getType());
6017
6018 // Check for this making an instruction dead. We have to garbage collect
6019 // all the dead instructions to ensure the uses of any alloca end up being
6020 // minimal.
6021 if (Instruction *OldI = dyn_cast<Instruction>(OldV))
6022 if (isInstructionTriviallyDead(OldI)) {
6023 DeadInsts.push_back(OldI);
6024 }
6025}
6026
6027/// A basic LoadAndStorePromoter that does not remove store nodes.
6029public:
6031 Type *ZeroType)
6032 : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
6033 bool shouldDelete(Instruction *I) const override {
6034 return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
6035 }
6036
6038 return UndefValue::get(ZeroType);
6039 }
6040
6041private:
6042 Type *ZeroType;
6043};
6044
6045bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
6046 // Look through each "partition", looking for slices with the same start/end
6047 // that do not overlap with any before them. The slices are sorted by
6048 // increasing beginOffset. We don't use AS.partitions(), as it will use a more
6049 // sophisticated algorithm that takes splittable slices into account.
6050 LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
6051 bool AllSameAndValid = true;
6052 Type *PartitionType = nullptr;
6054 uint64_t BeginOffset = 0;
6055 uint64_t EndOffset = 0;
6056
6057 auto Flush = [&]() {
6058 if (AllSameAndValid && !Insts.empty()) {
6059 LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
6060 << EndOffset << ")\n");
6062 SSAUpdater SSA(&NewPHIs);
6063 Insts.push_back(&AI);
6064 BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
6065 Promoter.run(Insts);
6066 }
6067 AllSameAndValid = true;
6068 PartitionType = nullptr;
6069 Insts.clear();
6070 };
6071
6072 for (Slice &S : AS) {
6073 auto *User = cast<Instruction>(S.getUse()->getUser());
6074 if (isAssumeLikeIntrinsic(User)) {
6075 LLVM_DEBUG({
6076 dbgs() << "Ignoring slice: ";
6077 AS.print(dbgs(), &S);
6078 });
6079 continue;
6080 }
6081 if (S.beginOffset() >= EndOffset) {
6082 Flush();
6083 BeginOffset = S.beginOffset();
6084 EndOffset = S.endOffset();
6085 } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
6086 if (AllSameAndValid) {
6087 LLVM_DEBUG({
6088 dbgs() << "Slice does not match range [" << BeginOffset << ", "
6089 << EndOffset << ")";
6090 AS.print(dbgs(), &S);
6091 });
6092 AllSameAndValid = false;
6093 }
6094 EndOffset = std::max(EndOffset, S.endOffset());
6095 continue;
6096 }
6097
6098 if (auto *LI = dyn_cast<LoadInst>(User)) {
6099 Type *UserTy = LI->getType();
6100 // LoadAndStorePromoter requires all the types to be the same.
6101 if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
6102 AllSameAndValid = false;
6103 PartitionType = UserTy;
6104 Insts.push_back(User);
6105 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
6106 Type *UserTy = SI->getValueOperand()->getType();
6107 if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
6108 AllSameAndValid = false;
6109 PartitionType = UserTy;
6110 Insts.push_back(User);
6111 } else {
6112 AllSameAndValid = false;
6113 }
6114 }
6115
6116 Flush();
6117 return true;
6118}
6119
6120/// Analyze an alloca for SROA.
6121///
6122/// This analyzes the alloca to ensure we can reason about it, builds
6123/// the slices of the alloca, and then hands it off to be split and
6124/// rewritten as needed.
6125std::pair<bool /*Changed*/, bool /*CFGChanged*/>
6126SROA::runOnAlloca(AllocaInst &AI) {
6127 bool Changed = false;
6128 bool CFGChanged = false;
6129
6130 LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
6131 ++NumAllocasAnalyzed;
6132
6133 // Special case dead allocas, as they're trivial.
6134 if (AI.use_empty()) {
6135 AI.eraseFromParent();
6136 Changed = true;
6137 return {Changed, CFGChanged};
6138 }
6139 const DataLayout &DL = AI.getDataLayout();
6140
6141 // Skip alloca forms that this analysis can't handle.
6142 std::optional<TypeSize> Size = AI.getAllocationSize(DL);
6143 if (AI.isArrayAllocation() || !Size || Size->isScalable() || Size->isZero())
6144 return {Changed, CFGChanged};
6145
6146 // First, split any FCA loads and stores touching this alloca to promote
6147 // better splitting and promotion opportunities.
6148 IRBuilderTy IRB(&AI);
6149 AggLoadStoreRewriter AggRewriter(DL, IRB);
6150 Changed |= AggRewriter.rewrite(AI);
6151
6152 // Build the slices using a recursive instruction-visiting builder.
6153 AllocaSlices AS(DL, AI);
6154 LLVM_DEBUG(AS.print(dbgs()));
6155 if (AS.isEscaped())
6156 return {Changed, CFGChanged};
6157
6158 if (AS.isEscapedReadOnly()) {
6159 Changed |= propagateStoredValuesToLoads(AI, AS);
6160 return {Changed, CFGChanged};
6161 }
6162
6163 // Delete all the dead users of this alloca before splitting and rewriting it.
6164 for (Instruction *DeadUser : AS.getDeadUsers()) {
6165 // Free up everything used by this instruction.
6166 for (Use &DeadOp : DeadUser->operands())
6167 clobberUse(DeadOp);
6168
6169 // Now replace the uses of this instruction.
6170 DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
6171
6172 // And mark it for deletion.
6173 DeadInsts.push_back(DeadUser);
6174 Changed = true;
6175 }
6176 for (Use *DeadOp : AS.getDeadOperands()) {
6177 clobberUse(*DeadOp);
6178 Changed = true;
6179 }
6180
6181 // No slices to split. Leave the dead alloca for a later pass to clean up.
6182 if (AS.begin() == AS.end())
6183 return {Changed, CFGChanged};
6184
6185 Changed |= splitAlloca(AI, AS);
6186
6187 LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
6188 while (!SpeculatablePHIs.empty())
6189 speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
6190
6191 LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
6192 auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
6193 while (!RemainingSelectsToRewrite.empty()) {
6194 const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
6195 CFGChanged |=
6196 rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
6197 }
6198
6199 return {Changed, CFGChanged};
6200}
6201
6202/// Delete the dead instructions accumulated in this run.
6203///
6204/// Recursively deletes the dead instructions we've accumulated. This is done
6205/// at the very end to maximize locality of the recursive delete and to
6206/// minimize the problems of invalidated instruction pointers as such pointers
6207/// are used heavily in the intermediate stages of the algorithm.
6208///
6209/// We also record the alloca instructions deleted here so that they aren't
6210/// subsequently handed to mem2reg to promote.
6211bool SROA::deleteDeadInstructions(
6212 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
6213 bool Changed = false;
6214 while (!DeadInsts.empty()) {
6215 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
6216 if (!I)
6217 continue;
6218 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
6219
6220 // If the instruction is an alloca, find the possible dbg.declare connected
6221 // to it, and remove it too. We must do this before calling RAUW or we will
6222 // not be able to find it.
6223 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
6224 DeletedAllocas.insert(AI);
6225 for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
6226 OldDII->eraseFromParent();
6227 }
6228
6230 I->replaceAllUsesWith(UndefValue::get(I->getType()));
6231
6232 for (Use &Operand : I->operands())
6233 if (Instruction *U = dyn_cast<Instruction>(Operand)) {
6234 // Zero out the operand and see if it becomes trivially dead.
6235 Operand = nullptr;
6237 DeadInsts.push_back(U);
6238 }
6239
6240 ++NumDeleted;
6241 I->eraseFromParent();
6242 Changed = true;
6243 }
6244 return Changed;
6245}
6246/// Promote the allocas, using the best available technique.
6247///
6248/// This attempts to promote whatever allocas have been identified as viable in
6249/// the PromotableAllocas list. If that list is empty, there is nothing to do.
6250/// This function returns whether any promotion occurred.
6251bool SROA::promoteAllocas() {
6252 if (PromotableAllocas.empty())
6253 return false;
6254
6255 if (SROASkipMem2Reg) {
6256 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
6257 } else {
6258 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
6259 NumPromoted += PromotableAllocas.size();
6260 PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
6261 }
6262
6263 PromotableAllocas.clear();
6264 return true;
6265}
6266
6267std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
6268 LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
6269
6270 const DataLayout &DL = F.getDataLayout();
6271 BasicBlock &EntryBB = F.getEntryBlock();
6272 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
6273 I != E; ++I) {
6274 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
6275 std::optional<TypeSize> Size = AI->getAllocationSize(DL);
6276 if (Size && Size->isScalable() && isAllocaPromotable(AI))
6277 PromotableAllocas.insert(AI);
6278 else
6279 Worklist.insert(AI);
6280 }
6281 }
6282
6283 bool Changed = false;
6284 bool CFGChanged = false;
6285 // A set of deleted alloca instruction pointers which should be removed from
6286 // the list of promotable allocas.
6287 SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
6288
6289 do {
6290 while (!Worklist.empty()) {
6291 auto [IterationChanged, IterationCFGChanged] =
6292 runOnAlloca(*Worklist.pop_back_val());
6293 Changed |= IterationChanged;
6294 CFGChanged |= IterationCFGChanged;
6295
6296 Changed |= deleteDeadInstructions(DeletedAllocas);
6297
6298 // Remove the deleted allocas from various lists so that we don't try to
6299 // continue processing them.
6300 if (!DeletedAllocas.empty()) {
6301 Worklist.set_subtract(DeletedAllocas);
6302 PostPromotionWorklist.set_subtract(DeletedAllocas);
6303 PromotableAllocas.set_subtract(DeletedAllocas);
6304 DeletedAllocas.clear();
6305 }
6306 }
6307
6308 Changed |= promoteAllocas();
6309
6310 Worklist = PostPromotionWorklist;
6311 PostPromotionWorklist.clear();
6312 } while (!Worklist.empty());
6313
6314 assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
6315 assert((!CFGChanged || !PreserveCFG) &&
6316 "Should not have modified the CFG when told to preserve it.");
6317
6318 if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
6319 for (auto &BB : F) {
6321 }
6322 }
6323
6324 return {Changed, CFGChanged};
6325}
6326
6330 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6331 auto [Changed, CFGChanged] =
6332 SROA(&F.getContext(), &DTU, &AC, Options).runSROA(F);
6333 if (!Changed)
6334 return PreservedAnalyses::all();
6336 if (!CFGChanged)
6339 return PA;
6340}
6341
6343 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6344 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
6345 OS, MapClassName2PassName);
6346 OS << '<'
6347 << (Options.CFG == SROAOptions::PreserveCFG ? "preserve-cfg"
6348 : "modify-cfg");
6349 if (Options.AggregateToVector)
6350 OS << ";aggregate-to-vector";
6351 OS << '>';
6352}
6353
6354SROAPass::SROAPass(SROAOptions Options) : Options(Options) {}
6355
6356namespace {
6357
6358/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
6359class SROALegacyPass : public FunctionPass {
6361
6362public:
6363 static char ID;
6364
6368 }
6369
6370 bool runOnFunction(Function &F) override {
6371 if (skipFunction(F))
6372 return false;
6373
6374 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
6375 AssumptionCache &AC =
6376 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
6377 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6378 auto [Changed, _] = SROA(&F.getContext(), &DTU, &AC, Options).runSROA(F);
6379 return Changed;
6380 }
6381
6382 void getAnalysisUsage(AnalysisUsage &AU) const override {
6383 AU.addRequired<AssumptionCacheTracker>();
6384 AU.addRequired<DominatorTreeWrapperPass>();
6385 AU.addPreserved<GlobalsAAWrapperPass>();
6386 AU.addPreserved<DominatorTreeWrapperPass>();
6387 }
6388
6389 StringRef getPassName() const override { return "SROA"; }
6390};
6391
6392} // end anonymous namespace
6393
6394char SROALegacyPass::ID = 0;
6395
6396FunctionPass *llvm::createSROAPass(bool PreserveCFG, bool AggregateToVector) {
6397 return new SROALegacyPass(SROAOptions(PreserveCFG ? SROAOptions::PreserveCFG
6399 AggregateToVector));
6400}
6401
6402INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
6403 "Scalar Replacement Of Aggregates", false, false)
6406INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:663
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Forward Handle Accesses
DXIL Resource Access
This file defines the DenseMap class.
static bool runOnFunction(Function &F, bool PostInlining)
Flatten the CFG
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This header defines various interfaces for pass management in LLVM.
This defines the Use class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
print mir2vec MIR2Vec Vocabulary Printer Pass
Definition MIR2Vec.cpp:598
This file implements a map that provides insertion order iteration.
static std::optional< AllocFnsTy > getAllocationSize(const CallBase *CB, const TargetLibraryInfo *TLI)
static std::optional< uint64_t > getSizeInBytes(std::optional< uint64_t > SizeInBits)
Memory SSA
Definition MemorySSA.cpp:72
This file contains the declarations for metadata subclasses.
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
if(PassOpts->AAPipeline)
PassBuilder PB(Machine, PassOpts->PTO, std::nullopt, &PIC)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PointerIntPair class.
This file provides a collection of visitors which walk the (instruction) uses of a pointer.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static unsigned getNumElements(Type *Ty)
bool isDead(const MachineInstr &MI, const MachineRegisterInfo &MRI)
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, uint64_t OldAllocaOffsetInBits, uint64_t SliceSizeInBits, Instruction *OldInst, Instruction *Inst, Value *Dest, Value *Value, const DataLayout &DL)
Find linked dbg.assign and generate a new one with the correct FragmentInfo.
Definition SROA.cpp:344
static VectorType * isVectorPromotionViable(Partition &P, const DataLayout &DL, unsigned VScale)
Test whether the given alloca partitioning and range of slices can be promoted to a vector.
Definition SROA.cpp:2241
static Align getAdjustedAlignment(Instruction *I, uint64_t Offset)
Compute the adjusted alignment for a load or store from an offset.
Definition SROA.cpp:1919
static VectorType * checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, SmallVectorImpl< VectorType * > &CandidateTys, bool HaveCommonEltTy, Type *CommonEltTy, bool HaveVecPtrTy, bool HaveCommonVecPtrTy, VectorType *CommonVecPtrTy, unsigned VScale)
Test whether any vector type in CandidateTys is viable for promotion.
Definition SROA.cpp:2092
static std::pair< Type *, IntegerType * > findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, uint64_t EndOffset)
Walk the range of a partitioning looking for a common type to cover this sequence of slices.
Definition SROA.cpp:1485
static Type * stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty)
Strip aggregate type wrapping.
Definition SROA.cpp:4664
static FragCalcResult calculateFragment(DILocalVariable *Variable, uint64_t NewStorageSliceOffsetInBits, uint64_t NewStorageSliceSizeInBits, std::optional< DIExpression::FragmentInfo > StorageFragment, std::optional< DIExpression::FragmentInfo > CurrentFragment, DIExpression::FragmentInfo &Target)
Definition SROA.cpp:279
static DIExpression * createOrReplaceFragment(const DIExpression *Expr, DIExpression::FragmentInfo Frag, int64_t BitExtractOffset)
Create or replace an existing fragment in a DIExpression with Frag.
Definition SROA.cpp:5695
static Value * insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, Value *V, uint64_t Offset, const Twine &Name)
Definition SROA.cpp:2484
static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, VectorType *Ty, uint64_t ElementSize, const DataLayout &DL, unsigned VScale)
Test whether the given slice use can be promoted to a vector.
Definition SROA.cpp:2017
static Value * getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *PointerTy, const Twine &NamePrefix)
Compute an adjusted pointer from Ptr by Offset bytes where the resulting pointer has PointerTy.
Definition SROA.cpp:1908
static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t AllocBeginOffset, Type *AllocaTy, const DataLayout &DL, bool &WholeAllocaOp)
Test whether a slice of an alloca is valid for integer widening.
Definition SROA.cpp:2323
static Value * extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, unsigned EndIndex, const Twine &Name)
Definition SROA.cpp:2517
static Value * foldPHINodeOrSelectInst(Instruction &I)
A helper that folds a PHI node or a select.
Definition SROA.cpp:1007
static bool rewriteSelectInstMemOps(SelectInst &SI, const RewriteableMemOps &Ops, IRBuilderTy &IRB, DomTreeUpdater *DTU)
Definition SROA.cpp:1874
static void rewriteMemOpOfSelect(SelectInst &SI, T &I, SelectHandSpeculativity Spec, DomTreeUpdater &DTU)
Definition SROA.cpp:1807
static Value * foldSelectInst(SelectInst &SI)
Definition SROA.cpp:994
bool isKillAddress(const DbgVariableRecord *DVR)
Definition SROA.cpp:5658
static Value * insertVector(IRBuilderTy &IRB, Value *Old, Value *V, unsigned BeginIndex, const Twine &Name)
Definition SROA.cpp:2539
static bool isIntegerWideningViable(Partition &P, Type *AllocaTy, const DataLayout &DL)
Test whether the given alloca partition's integer operations can be widened to promotable ones.
Definition SROA.cpp:2418
static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN)
Definition SROA.cpp:1625
static VectorType * createAndCheckVectorTypesForPromotion(SetVector< Type * > &OtherTys, ArrayRef< VectorType * > CandidateTysCopy, function_ref< void(Type *)> CheckCandidateType, Partition &P, const DataLayout &DL, SmallVectorImpl< VectorType * > &CandidateTys, bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy, bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale)
Definition SROA.cpp:2197
static DebugVariable getAggregateVariable(DbgVariableRecord *DVR)
Definition SROA.cpp:325
static std::tuple< Type *, bool, VectorType * > selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI, LLVMContext &C, bool AggregateToVector)
Select a partition type for an alloca partition.
Definition SROA.cpp:5380
static bool isSafePHIToSpeculate(PHINode &PN)
PHI instructions that use an alloca and are subsequently loaded can be rewritten to load both input p...
Definition SROA.cpp:1551
static FixedVectorType * tryCanonicalizeStructToVector(StructType *STy, Partition &P, const DataLayout &DL)
Try to canonicalize a homogeneous struct partition to a vector type.
Definition SROA.cpp:5329
static Value * extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, IntegerType *Ty, uint64_t Offset, const Twine &Name)
Definition SROA.cpp:2459
static void insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr, DIExpression *NewAddrExpr, Instruction *BeforeInst, std::optional< DIExpression::FragmentInfo > NewFragment, int64_t BitExtractAdjustment)
Insert a new DbgRecord.
Definition SROA.cpp:5760
static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI, IRBuilderTy &IRB)
Definition SROA.cpp:1768
static Value * mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL, Type *NewAIEltTy, IRBuilder<> &Builder)
This function takes two vector values and combines them into a single vector by concatenating their e...
Definition SROA.cpp:2611
const DIExpression * getAddressExpression(const DbgVariableRecord *DVR)
Definition SROA.cpp:5664
static Type * getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, uint64_t Size)
Try to find a partition of the aggregate type passed in for a given offset and size.
Definition SROA.cpp:4702
static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy, unsigned VScale=0)
Test whether we can convert a value from the old to the new type.
Definition SROA.cpp:1929
static SelectHandSpeculativity isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG)
Definition SROA.cpp:1706
This file provides the interface for LLVM's Scalar Replacement of Aggregates pass.
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Virtual Register Rewriter
Value * RHS
Value * LHS
Builder for the alloca slices.
Definition SROA.cpp:1019
SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
Definition SROA.cpp:1035
An iterator over partitions of the alloca's slices.
Definition SROA.cpp:807
bool operator==(const partition_iterator &RHS) const
Definition SROA.cpp:954
partition_iterator & operator++()
Definition SROA.cpp:974
bool shouldDelete(Instruction *I) const override
Return false if a sub-class wants to keep one of the loads/stores after the SSA construction.
Definition SROA.cpp:6033
BasicLoadAndStorePromoter(ArrayRef< const Instruction * > Insts, SSAUpdater &S, Type *ZeroType)
Definition SROA.cpp:6030
Value * getValueToUseForAlloca(Instruction *I) const override
Return the value to use for the point in the code that the alloca is positioned.
Definition SROA.cpp:6037
Class for arbitrary precision integers.
Definition APInt.h:78
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:130
size_t size() const
Get the array size.
Definition ArrayRef.h:141
iterator begin() const
Definition ArrayRef.h:129
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
LLVM_ABI CaptureInfo getCaptureInfo(unsigned OpNo) const
Return which pointer components this operand may capture.
bool onlyReadsMemory(unsigned OpNo) const
bool isDataOperand(const Use *U) const
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static DIAssignID * getDistinct(LLVMContext &Context)
LLVM_ABI DbgInstPtr insertDbgAssign(Instruction *LinkedInstr, Value *Val, DILocalVariable *SrcVar, DIExpression *ValExpr, Value *Addr, DIExpression *AddrExpr, const DILocation *DL)
Insert a new llvm.dbg.assign intrinsic call.
DWARF expression.
iterator_range< expr_op_iterator > expr_ops() const
DbgVariableFragmentInfo FragmentInfo
LLVM_ABI bool startsWithDeref() const
Return whether the first element is a DW_OP_deref.
static LLVM_ABI bool calculateFragmentIntersect(const DataLayout &DL, const Value *SliceStart, uint64_t SliceOffsetInBits, uint64_t SliceSizeInBits, const Value *DbgPtr, int64_t DbgPtrOffsetInBits, int64_t DbgExtractOffsetInBits, DIExpression::FragmentInfo VarFrag, std::optional< DIExpression::FragmentInfo > &Result, int64_t &OffsetFromLocationInBits)
Computes a fragment, bit-extract operation if needed, and new constant offset to describe a part of a...
static LLVM_ABI std::optional< DIExpression * > createFragmentExpression(const DIExpression *Expr, unsigned OffsetInBits, unsigned SizeInBits)
Create a DIExpression to describe one part of an aggregate variable that is fragmented across multipl...
static LLVM_ABI DIExpression * prepend(const DIExpression *Expr, uint8_t Flags, int64_t Offset=0)
Prepend DIExpr with a deref and offset operation and optionally turn it into a stack value or/and an ...
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI void moveBefore(DbgRecord *MoveBefore)
DebugLoc getDebugLoc() const
void setDebugLoc(DebugLoc Loc)
Record of a variable value-assignment, aka a non instruction representation of the dbg....
LLVM_ABI void setKillAddress()
Kill the address component.
LLVM_ABI bool isKillLocation() const
LLVM_ABI bool isKillAddress() const
Check whether this kills the address component.
LLVM_ABI void replaceVariableLocationOp(Value *OldValue, Value *NewValue, bool AllowEmpty=false)
Value * getValue(unsigned OpIdx=0) const
static LLVM_ABI DbgVariableRecord * createLinkedDVRAssign(Instruction *LinkedInstr, Value *Val, DILocalVariable *Variable, DIExpression *Expression, Value *Address, DIExpression *AddressExpression, const DILocation *DI)
LLVM_ABI void setAssignId(DIAssignID *New)
DIExpression * getExpression() const
static LLVM_ABI DbgVariableRecord * createDVRDeclare(Value *Address, DILocalVariable *DV, DIExpression *Expr, const DILocation *DI)
static LLVM_ABI DbgVariableRecord * createDbgVariableRecord(Value *Location, DILocalVariable *DV, DIExpression *Expr, const DILocation *DI)
DILocalVariable * getVariable() const
DIExpression * getAddressExpression() const
LLVM_ABI DILocation * getInlinedAt() const
Definition DebugLoc.cpp:55
Identifies a unique instance of a variable.
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:252
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:221
iterator end()
Definition DenseMap.h:143
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
Analysis pass which computes a DominatorTree.
Definition Dominators.h:270
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:306
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
Class to represent fixed width SIMD vectors.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
unsigned getVScaleValue() const
Return the value for vscale based on the vscale_range attribute or 0 when unknown.
const BasicBlock & getEntryBlock() const
Definition Function.h:809
LLVM_ABI bool accumulateConstantOffset(const DataLayout &DL, APInt &Offset, function_ref< bool(Value &, APInt &)> ExternalAnalysis=nullptr) const
Accumulate the constant address offset of this GEP if possible.
Definition Operator.cpp:126
iterator_range< op_iterator > indices()
Type * getSourceElementType() const
LLVM_ABI GEPNoWrapFlags getNoWrapFlags() const
Get the nowrap flags for the GEP instruction.
This provides the default implementation of the IRBuilder 'InsertHelper' method that is called whenev...
Definition IRBuilder.h:61
virtual void InsertHelper(Instruction *I, const Twine &Name, BasicBlock::iterator InsertPt) const
Definition IRBuilder.h:65
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2900
Base class for instruction visitors.
Definition InstVisitor.h:78
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI void setAAMetadata(const AAMDNodes &N)
Sets the AA metadata on this instruction from the AAMDNodes structure.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Class to represent integer types.
@ MAX_INT_BITS
Maximum number of bits that can be specified.
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI LoadAndStorePromoter(ArrayRef< const Instruction * > Insts, SSAUpdater &S, StringRef Name=StringRef())
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Value * getPointerOperand()
bool isVolatile() const
Return true if this is a load from a volatile memory location.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this load instruction.
Type * getPointerOperandType() const
static unsigned getPointerOperandIndex()
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this load instruction.
bool isSimple() const
Align getAlign() const
Return the alignment of the access that is being performed.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1561
LLVMContext & getContext() const
Definition Metadata.h:1233
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
This is the common base class for memset/memcpy/memmove.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
PointerIntPair - This class implements a pair of a pointer and small integer.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
PtrUseVisitor(const DataLayout &DL)
LLVM_ABI SROAPass(SROAOptions Options)
If PreserveCFG is set, then the pass is not allowed to modify CFG in any way, even if it would update...
Definition SROA.cpp:6354
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Run the pass over the function.
Definition SROA.cpp:6327
LLVM_ABI void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
Definition SROA.cpp:6342
Helper class for SSA formation on a set of values defined in multiple blocks.
Definition SSAUpdater.h:39
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
Value * getValueOperand()
static unsigned getPointerOperandIndex()
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
static constexpr size_t npos
Definition StringRef.h:58
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition StringRef.h:591
size_t rfind(char C, size_t From=npos) const
Search for the last character C in the string.
Definition StringRef.h:365
size_t find(char C, size_t From=0) const
Search for the first character C in the string.
Definition StringRef.h:290
LLVM_ABI size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C or npos if not found.
Used to lazily calculate structure layout information for a target machine, based on the DataLayout s...
Definition DataLayout.h:743
TypeSize getSizeInBytes() const
Definition DataLayout.h:752
LLVM_ABI unsigned getElementContainingOffset(uint64_t FixedOffset) const
Given a valid byte offset into the structure, returns the structure index that contains it.
TypeSize getElementOffset(unsigned Idx) const
Definition DataLayout.h:774
TypeSize getSizeInBits() const
Definition DataLayout.h:754
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:479
element_iterator element_end() const
ArrayRef< Type * > elements() const
element_iterator element_begin() const
bool isPacked() const
unsigned getNumElements() const
Random access to the elements.
Type * getElementType(unsigned N) const
Type::subtype_iterator element_iterator
Target - Wrapper for Target specific information.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getIntegerBitWidth() const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:311
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
bool isTargetExtTy() const
Return true if this is a target extension type.
Definition Type.h:205
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:285
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
op_iterator op_begin()
Definition User.h:259
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
op_iterator op_end()
Definition User.h:261
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
LLVM_ABI const Value * stripInBoundsOffsets(function_ref< void(const Value *)> Func=[](const Value *) {}) const
Strip off pointer casts and inbounds GEPs.
Definition Value.cpp:828
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void dropDroppableUsesIn(User &Usr)
Remove every use of this value in User that can safely be removed.
Definition Value.cpp:215
LLVM_ABI const Value * stripAndAccumulateConstantOffsets(const DataLayout &DL, APInt &Offset, bool AllowNonInbounds, bool AllowInvariantGroup=false, function_ref< bool(Value &Value, APInt &Offset)> ExternalAnalysis=nullptr, bool LookThroughIntToPtr=false) const
Accumulate the constant offset this value has compared to a base pointer.
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:400
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static VectorType * getWithSizeAndScalar(VectorType *SizeTy, Type *EltTy)
This static method attempts to construct a VectorType with the same size-in-bits as SizeTy but with a...
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
CRTP base class which implements the entire standard iterator facade in terms of a minimal subset of ...
Definition iterator.h:80
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
Offsets
Offsets in bytes from the start of the input buffer.
SmallVector< DbgVariableRecord * > getDVRAssignmentMarkers(const Instruction *Inst)
Return a range of dbg_assign records for which Inst performs the assignment they encode.
Definition DebugInfo.h:204
LLVM_ABI void deleteAssignmentMarkers(const Instruction *Inst)
Delete the llvm.dbg.assign intrinsics linked to Inst.
initializer< Ty > init(const Ty &Val)
@ DW_OP_LLVM_extract_bits_zext
Only used in LLVM metadata.
Definition Dwarf.h:151
@ DW_OP_LLVM_fragment
Only used in LLVM metadata.
Definition Dwarf.h:144
@ DW_OP_LLVM_extract_bits_sext
Only used in LLVM metadata.
Definition Dwarf.h:150
@ User
could "use" a pointer
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
bool empty() const
Definition BasicBlock.h:101
Context & getContext() const
Definition BasicBlock.h:99
iterator end() const
Definition BasicBlock.h:89
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
static cl::opt< bool > SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false), cl::Hidden)
Disable running mem2reg during SROA in order to test or debug SROA.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:558
@ Length
Definition DWP.cpp:558
bool operator<(int64_t V1, const APSInt &V2)
Definition APSInt.h:360
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2115
LLVM_ABI bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
LLVM_ABI cl::opt< bool > ProfcheckDisableMetadataFixes
Definition LoopInfo.cpp:60
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
LLVM_ABI void PromoteMemToReg(ArrayRef< AllocaInst * > Allocas, DominatorTree &DT, AssumptionCache *AC=nullptr)
Promote the specified list of alloca instructions into scalar registers, inserting PHI nodes as appro...
LLVM_ABI bool isAssumeLikeIntrinsic(const Instruction *I)
Return true if it is an intrinsic that cannot be speculated but also cannot trap.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2142
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI std::optional< RegOrConstant > getVectorSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI)
Definition Utils.cpp:1460
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
void * PointerTy
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2133
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI bool isAllocaPromotable(const AllocaInst *AI)
Return true if this alloca is legal for promotion.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2199
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
bool capturesFullProvenance(CaptureComponents CC)
Definition ModRef.h:396
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:449
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void initializeSROALegacyPassPass(PassRegistry &)
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
LLVM_ABI TinyPtrVector< DbgVariableRecord * > findDVRValues(Value *V)
As above, for DVRValues.
Definition DebugInfo.cpp:82
LLVM_ABI void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI FunctionPass * createSROAPass(bool PreserveCFG=true, bool AggregateToVector=false)
Definition SROA.cpp:6396
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if, which is equivalent to: C.erase(remove_if(C, pred), C.end());
Definition STLExtras.h:2191
LLVM_ABI TinyPtrVector< DbgVariableRecord * > findDVRDeclares(Value *V)
Finds dbg.declare records declaring local variables as living in the memory that 'V' points to.
Definition DebugInfo.cpp:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2165
LLVM_ABI Instruction * SplitBlockAndInsertIfThen(Value *Cond, BasicBlock::iterator SplitBefore, bool Unreachable, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, BasicBlock *ThenBlock=nullptr)
Split the containing block at the specified instruction - everything before SplitBefore stays in the old basic block, and the rest of the instructions in the BB are moved to a new block.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define NDEBUG
Definition regutils.h:48
A collection of metadata nodes that might be associated with a memory access used by the alias-analysis infrastructure.
Definition Metadata.h:763
AAMDNodes shift(size_t Offset) const
Create a new AAMDNode that describes this AAMDNode after applying a constant offset to the start of the pointer.
Definition Metadata.h:822
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Describes an element of a Bitfield.
Definition Bitfields.h:176
static Bitfield::Type get(StorageType Packed)
Unpacks the field from the Packed value.
Definition Bitfields.h:207
static void set(StorageType &Packed, typename Bitfield::Type Value)
Sets the typed value in the provided Packed value.
Definition Bitfields.h:223
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:89