SROA.cpp
1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
19/// Because this also performs alloca promotion, it can be thought of as also
20/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
22///
23//===----------------------------------------------------------------------===//
24
25#include "llvm/Transforms/Scalar/SROA.h"
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
44#include "llvm/Analysis/Loads.h"
47#include "llvm/Config/llvm-config.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/LLVMContext.h"
66#include "llvm/IR/Metadata.h"
67#include "llvm/IR/Module.h"
68#include "llvm/IR/Operator.h"
69#include "llvm/IR/PassManager.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
76#include "llvm/Pass.h"
80#include "llvm/Support/Debug.h"
88#include <algorithm>
89#include <cassert>
90#include <cstddef>
91#include <cstdint>
92#include <cstring>
93#include <iterator>
94#include <queue>
95#include <string>
96#include <tuple>
97#include <utility>
98#include <variant>
99#include <vector>
100
101using namespace llvm;
102
103#define DEBUG_TYPE "sroa"
104
105STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
106STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
107STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
108STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
109STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
110STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
111STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
112STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
113STATISTIC(NumLoadsPredicated,
114 "Number of loads rewritten into predicated loads to allow promotion");
115STATISTIC(
116 NumStoresPredicated,
117 "Number of stores rewritten into predicated stores to allow promotion");
118STATISTIC(NumDeleted, "Number of instructions deleted");
119STATISTIC(NumVectorized, "Number of vectorized aggregates");
120
121namespace llvm {
122/// Disable running mem2reg during SROA in order to test or debug SROA.
123static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
124 cl::Hidden);
126} // namespace llvm
127
128namespace {
129
130class AllocaSliceRewriter;
131class AllocaSlices;
132class Partition;
133
134class SelectHandSpeculativity {
135 unsigned char Storage = 0; // None are speculatable by default.
136 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
137 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
138public:
139 SelectHandSpeculativity() = default;
140 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
141 bool isSpeculatable(bool isTrueVal) const;
142 bool areAllSpeculatable() const;
143 bool areAnySpeculatable() const;
144 bool areNoneSpeculatable() const;
145 // For interop as int half of PointerIntPair.
146 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
147 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
148};
149static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
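// Illustrative usage sketch (not from the original source): the two Bitfield
// elements pack one "safe to speculate" flag per select hand into the single
// Storage byte, e.g.
//   SelectHandSpeculativity Spec;
//   Spec.setAsSpeculatable(/*isTrueVal=*/true);  // sets bit 0
//   Spec.isSpeculatable(/*isTrueVal=*/true);     // -> true
//   Spec.isSpeculatable(/*isTrueVal=*/false);    // -> false (bit 1 not set)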
150
151using PossiblySpeculatableLoad =
152 PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
153using UnspeculatableStore = StoreInst *;
154using RewriteableMemOp =
155 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
156using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
157
158/// An optimization pass providing Scalar Replacement of Aggregates.
159///
160/// This pass takes allocations which can be completely analyzed (that is, they
161/// don't escape) and tries to turn them into scalar SSA values. There are
162/// a few steps to this process.
163///
164/// 1) It takes allocations of aggregates and analyzes the ways in which they
165/// are used to try to split them into smaller allocations, ideally of
166/// a single scalar data type. It will split up memcpy and memset accesses
167/// as necessary and try to isolate individual scalar accesses.
168/// 2) It will transform accesses into forms which are suitable for SSA value
169/// promotion. This can be replacing a memset with a scalar store of an
170/// integer value, or it can involve speculating operations on a PHI or
171/// select to be a PHI or select of the results.
172/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
173/// onto insert and extract operations on a vector value, and convert them to
174/// this form. By doing so, it will enable promotion of vector aggregates to
175/// SSA vector values.
176class SROA {
177 LLVMContext *const C;
178 DomTreeUpdater *const DTU;
179 AssumptionCache *const AC;
180 const bool PreserveCFG;
181
182 /// Worklist of alloca instructions to simplify.
183 ///
184 /// Each alloca in the function is added to this. Each new alloca formed gets
185 /// added to it as well to recursively simplify unless that alloca can be
186 /// directly promoted. Finally, each time we rewrite a use of an alloca other
187 /// the one being actively rewritten, we add it back onto the list if not
188 /// already present to ensure it is re-visited.
189 SmallSetVector<AllocaInst *, 16> Worklist;
190
191 /// A collection of instructions to delete.
192 /// We try to batch deletions to simplify code and make things a bit more
193 /// efficient. We also make sure there are no dangling pointers.
194 SmallVector<WeakVH, 8> DeadInsts;
195
196 /// Post-promotion worklist.
197 ///
198 /// Sometimes we discover an alloca which has a high probability of becoming
199 /// viable for SROA after a round of promotion takes place. In those cases,
200 /// the alloca is enqueued here for re-processing.
201 ///
202 /// Note that we have to be very careful to clear allocas out of this list in
203 /// the event they are deleted.
204 SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;
205
206 /// A collection of alloca instructions we can directly promote.
207 SetVector<AllocaInst *, SmallVector<AllocaInst *>,
208 SmallPtrSet<AllocaInst *, 16>, 16>
209 PromotableAllocas;
210
211 /// A worklist of PHIs to speculate prior to promoting allocas.
212 ///
213 /// All of these PHIs have been checked for the safety of speculation and by
214 /// being speculated will allow promoting allocas currently in the promotable
215 /// queue.
216 SmallSetVector<PHINode *, 8> SpeculatablePHIs;
217
218 /// A worklist of select instructions to rewrite prior to promoting
219 /// allocas.
220 SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
221
222 /// Select instructions that use an alloca and are subsequently loaded can be
223 /// rewritten to load both input pointers and then select between the results,
224 /// allowing the load of the alloca to be promoted.
225 /// From this:
226 /// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
227 /// %V = load <type>, ptr %P2
228 /// to:
229 /// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
230 /// %V2 = load <type>, ptr %Other
231 /// %V = select i1 %cond, <type> %V1, <type> %V2
232 ///
233 /// We can do this to a select if its only uses are loads
234 /// and if either the operand to the select can be loaded unconditionally,
235 /// or if we are allowed to perform CFG modifications.
236 /// If an intervening bitcast with a single use is found between the select
237 /// and the load, the promotion is still allowed.
238 static std::optional<RewriteableMemOps>
239 isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
240
241public:
242 SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
243 SROAOptions PreserveCFG_)
244 : C(C), DTU(DTU), AC(AC),
245 PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
246
247 /// Main run method used by both the SROAPass and by the legacy pass.
248 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);
249
250private:
251 friend class AllocaSliceRewriter;
252
253 bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
254 std::pair<AllocaInst *, uint64_t>
255 rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
256 bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
257 bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
258 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
259 void clobberUse(Use &U);
260 bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
261 bool promoteAllocas();
262};
263
264} // end anonymous namespace
265
266/// Calculate the fragment of a variable to use when slicing a store
267/// based on the slice dimensions, existing fragment, and base storage
268/// fragment.
269/// Results:
270/// UseFrag - Use Target as the new fragment.
271/// UseNoFrag - The new slice already covers the whole variable.
272/// Skip - The new alloca slice doesn't include this variable.
273/// FIXME: Can we use calculateFragmentIntersect instead?
274namespace {
275enum FragCalcResult { UseFrag, UseNoFrag, Skip };
276}
277static FragCalcResult
278calculateFragment(DILocalVariable *Variable,
279 uint64_t NewStorageSliceOffsetInBits,
280 uint64_t NewStorageSliceSizeInBits,
281 std::optional<DIExpression::FragmentInfo> StorageFragment,
282 std::optional<DIExpression::FragmentInfo> CurrentFragment,
283 DIExpression::FragmentInfo &Target) {
284 // If the base storage describes part of the variable, apply the offset and
285 // the size constraint.
286 if (StorageFragment) {
287 Target.SizeInBits =
288 std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
289 Target.OffsetInBits =
290 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
291 } else {
292 Target.SizeInBits = NewStorageSliceSizeInBits;
293 Target.OffsetInBits = NewStorageSliceOffsetInBits;
294 }
295
296 // If this slice extracts the entirety of an independent variable from a
297 // larger alloca, do not produce a fragment expression, as the variable is
298 // not fragmented.
299 if (!CurrentFragment) {
300 if (auto Size = Variable->getSizeInBits()) {
301 // Treat the current fragment as covering the whole variable.
302 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
303 if (Target == CurrentFragment)
304 return UseNoFrag;
305 }
306 }
307
308 // No additional work to do if there isn't a fragment already, or there is
309 // but it already exactly describes the new assignment.
310 if (!CurrentFragment || *CurrentFragment == Target)
311 return UseFrag;
312
313 // Reject the target fragment if it doesn't fit wholly within the current
314 // fragment. TODO: We could instead chop up the target to fit in the case of
315 // a partial overlap.
316 if (Target.startInBits() < CurrentFragment->startInBits() ||
317 Target.endInBits() > CurrentFragment->endInBits())
318 return Skip;
319
320 // Target fits within the current fragment, return it.
321 return UseFrag;
322}
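// Worked example (illustrative, not from the original source): suppose a
// 128-bit variable is backed by the whole alloca (StorageFragment is empty)
// and the store being split covers bits [64, 128) of it, with no existing
// fragment on the dbg_assign (CurrentFragment is empty). Target becomes
// {OffsetInBits=64, SizeInBits=64}; the variable is treated as the
// whole-variable fragment (offset 0, size 128), Target fits inside it, so
// UseFrag is returned and the migrated dbg_assign is given the fragment
// (offset 64, size 64). Had the slice covered all of [0, 128), Target would
// equal the whole-variable fragment and UseNoFrag would be returned instead.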
323
324static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
325 return DebugVariable(DVR->getVariable(), std::nullopt,
326 DVR->getDebugLoc().getInlinedAt());
327}
328
329/// Find linked dbg.assign and generate a new one with the correct
330/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
331/// value component is copied from the old dbg.assign to the new.
332/// \param OldAlloca Alloca for the variable before splitting.
333/// \param IsSplit True if the store (not necessarily alloca)
334/// is being split.
335/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
336/// \param SliceSizeInBits New number of bits being written to.
337/// \param OldInst Instruction that is being split.
338/// \param Inst New instruction performing this part of the
339/// split store.
340/// \param Dest Store destination.
341/// \param Value Stored value.
342/// \param DL Datalayout.
343static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
344 uint64_t OldAllocaOffsetInBits,
345 uint64_t SliceSizeInBits, Instruction *OldInst,
346 Instruction *Inst, Value *Dest, Value *Value,
347 const DataLayout &DL) {
348 // If we want allocas to be migrated using this helper then we need to ensure
349 // that the BaseFragments map code still works. A simple solution would be
350 // to choose to always clone alloca dbg_assigns (rather than sometimes
351 // "stealing" them).
352 assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");
353
354 auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
355 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
356 if (DVRAssignMarkerRange.empty())
357 return;
358
359 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
360 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
361 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
362 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
363 << "\n");
364 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
365 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
366 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
367 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
368 if (Value)
369 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
370
371 /// Map of aggregate variables to their fragment associated with OldAlloca.
372 SmallDenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
373 BaseFragments;
374 for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
375 BaseFragments[getAggregateVariable(DVR)] =
376 DVR->getExpression()->getFragmentInfo();
377
378 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
379 // one). It shouldn't already have one: assert this assumption.
380 assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
381 DIAssignID *NewID = nullptr;
382 auto &Ctx = Inst->getContext();
383 DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
384 assert(OldAlloca->isStaticAlloca());
385
386 auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
387 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
388 << "\n");
389 auto *Expr = DbgAssign->getExpression();
390 bool SetKillLocation = false;
391
392 if (IsSplit) {
393 std::optional<DIExpression::FragmentInfo> BaseFragment;
394 {
395 auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
396 if (R == BaseFragments.end())
397 return;
398 BaseFragment = R->second;
399 }
400 std::optional<DIExpression::FragmentInfo> CurrentFragment =
401 Expr->getFragmentInfo();
402 DIExpression::FragmentInfo NewFragment;
403 FragCalcResult Result = calculateFragment(
404 DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
405 BaseFragment, CurrentFragment, NewFragment);
406
407 if (Result == Skip)
408 return;
409 if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
410 if (CurrentFragment) {
411 // Rewrite NewFragment to be relative to the existing one (this is
412 // what createFragmentExpression wants). CalculateFragment has
413 // already resolved the size for us. FIXME: Should it return the
414 // relative fragment too?
415 NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
416 }
417 // Add the new fragment info to the existing expression if possible.
418 if (auto E = DIExpression::createFragmentExpression(
419 Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
420 Expr = *E;
421 } else {
422 // Otherwise, add the new fragment info to an empty expression and
423 // discard the value component of this dbg.assign as the value cannot
424 // be computed with the new fragment.
425 Expr = *DIExpression::createFragmentExpression(
426 DIExpression::get(Expr->getContext(), {}),
427 NewFragment.OffsetInBits, NewFragment.SizeInBits);
428 SetKillLocation = true;
429 }
430 }
431 }
432
433 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
434 if (!NewID) {
435 NewID = DIAssignID::getDistinct(Ctx);
436 Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
437 }
438
439 DbgVariableRecord *NewAssign;
440 if (IsSplit) {
441 ::Value *NewValue = Value ? Value : DbgAssign->getValue();
442 NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
443 DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
444 Dest, DIExpression::get(Expr->getContext(), {}),
445 DbgAssign->getDebugLoc())));
446 } else {
447 // The store is not split, simply steal the existing dbg_assign.
448 NewAssign = DbgAssign;
449 NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
450 NewAssign->setAddress(Dest);
451 if (Value)
452 NewAssign->replaceVariableLocationOp(0u, Value);
453 assert(Expr == NewAssign->getExpression());
454 }
455
456 // If we've updated the value but the original dbg.assign has an arglist
457 // then kill it now - we can't use the requested new value.
458 // We can't replace the DIArgList with the new value as it'd leave
459 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
460 // an arglist). And we can't keep the DIArgList in case the linked store
461 // is being split - in which case the DIArgList + expression may no longer
462 // be computing the correct value.
463 // This should be a very rare situation as it requires the value being
464 // stored to differ from the dbg.assign (i.e., the value has been
465 // represented differently in the debug intrinsic for some reason).
466 SetKillLocation |=
467 Value && (DbgAssign->hasArgList() ||
468 !DbgAssign->getExpression()->isSingleLocationExpression());
469 if (SetKillLocation)
470 NewAssign->setKillLocation();
471
472 // We could use more precision here at the cost of some additional (code)
473 // complexity - if the original dbg.assign was adjacent to its store, we
474 // could position this new dbg.assign adjacent to its store rather than the
475 // old dbg.assign. That would result in interleaved dbg.assigns rather than
476 // what we get now:
477 // split store !1
478 // split store !2
479 // dbg.assign !1
480 // dbg.assign !2
481 // This (current behaviour) results in debug assignments being
482 // noted as slightly offset (in code) from the store. In practice this
483 // should have little effect on the debugging experience due to the fact
484 // that all the split stores should get the same line number.
485 if (NewAssign != DbgAssign) {
486 NewAssign->moveBefore(DbgAssign->getIterator());
487 NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
488 }
489 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
490 };
491
492 for_each(DVRAssignMarkerRange, MigrateDbgAssign);
493}
494
495namespace {
496
497/// A custom IRBuilder inserter which prefixes all names, but only in
498/// Assert builds.
499class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
500 std::string Prefix;
501
502 Twine getNameWithPrefix(const Twine &Name) const {
503 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
504 }
505
506public:
507 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
508
509 void InsertHelper(Instruction *I, const Twine &Name,
510 BasicBlock::iterator InsertPt) const override {
511 IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name),
512 InsertPt);
513 }
514};
515
516/// Provide a type for IRBuilder that drops names in release builds.
517using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
518
519/// A used slice of an alloca.
520///
521/// This structure represents a slice of an alloca used by some instruction. It
522/// stores both the begin and end offsets of this use, a pointer to the use
523/// itself, and a flag indicating whether we can classify the use as splittable
524/// or not when forming partitions of the alloca.
525class Slice {
526 /// The beginning offset of the range.
527 uint64_t BeginOffset = 0;
528
529 /// The ending offset, not included in the range.
530 uint64_t EndOffset = 0;
531
532 /// Storage for both the use of this slice and whether it can be
533 /// split.
534 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
535
536public:
537 Slice() = default;
538
539 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
540 : BeginOffset(BeginOffset), EndOffset(EndOffset),
541 UseAndIsSplittable(U, IsSplittable) {}
542
543 uint64_t beginOffset() const { return BeginOffset; }
544 uint64_t endOffset() const { return EndOffset; }
545
546 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
547 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
548
549 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
550
551 bool isDead() const { return getUse() == nullptr; }
552 void kill() { UseAndIsSplittable.setPointer(nullptr); }
553
554 /// Support for ordering ranges.
555 ///
556 /// This provides an ordering over ranges such that start offsets are
557 /// always increasing, and within equal start offsets, the end offsets are
558 /// decreasing. Thus the spanning range comes first in a cluster with the
559 /// same start position.
560 bool operator<(const Slice &RHS) const {
561 if (beginOffset() < RHS.beginOffset())
562 return true;
563 if (beginOffset() > RHS.beginOffset())
564 return false;
565 if (isSplittable() != RHS.isSplittable())
566 return !isSplittable();
567 if (endOffset() > RHS.endOffset())
568 return true;
569 return false;
570 }
571
572 /// Support comparison with a single offset to allow binary searches.
573 [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
574 return LHS.beginOffset() < RHSOffset;
575 }
576 [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
577 return LHSOffset < RHS.beginOffset();
578 }
579
580 bool operator==(const Slice &RHS) const {
581 return isSplittable() == RHS.isSplittable() &&
582 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
583 }
584 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
585};
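// Ordering example (illustrative, not from the original source): for the
// slices [0,16) unsplittable, [0,8) splittable and [4,12) splittable, the
// sorted order is [0,16), [0,8), [4,12): slices are ordered by increasing
// begin offset; at equal begin offsets an unsplittable slice sorts before a
// splittable one, and otherwise the longer (spanning) slice comes first.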
586
587/// Representation of the alloca slices.
588///
589/// This class represents the slices of an alloca which are formed by its
590/// various uses. If a pointer escapes, we can't fully build a representation
591/// for the slices used and we reflect that in this structure. The uses are
592/// stored, sorted by increasing beginning offset and with unsplittable slices
593/// starting at a particular offset before splittable slices.
594class AllocaSlices {
595public:
596 /// Construct the slices of a particular alloca.
597 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
598
599 /// Test whether a pointer to the allocation escapes our analysis.
600 ///
601 /// If this is true, the slices are never fully built and should be
602 /// ignored.
603 bool isEscaped() const { return PointerEscapingInstr; }
604 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
605
606 /// Support for iterating over the slices.
607 /// @{
608 using iterator = SmallVectorImpl<Slice>::iterator;
609 using range = iterator_range<iterator>;
610
611 iterator begin() { return Slices.begin(); }
612 iterator end() { return Slices.end(); }
613
614 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
615 using const_range = iterator_range<const_iterator>;
616
617 const_iterator begin() const { return Slices.begin(); }
618 const_iterator end() const { return Slices.end(); }
619 /// @}
620
621 /// Erase a range of slices.
622 void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
623
624 /// Insert new slices for this alloca.
625 ///
626 /// This moves the slices into the alloca's slices collection, and re-sorts
627 /// everything so that the usual ordering properties of the alloca's slices
628 /// hold.
629 void insert(ArrayRef<Slice> NewSlices) {
630 int OldSize = Slices.size();
631 Slices.append(NewSlices.begin(), NewSlices.end());
632 auto SliceI = Slices.begin() + OldSize;
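 // Sort just the newly appended slices, then merge them with the
 // already-sorted prefix; this restores the canonical ordering without
 // re-sorting the slices that were already in place.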
633 std::stable_sort(SliceI, Slices.end());
634 std::inplace_merge(Slices.begin(), SliceI, Slices.end());
635 }
636
637 // Forward declare the iterator and range accessor for walking the
638 // partitions.
639 class partition_iterator;
640 iterator_range<partition_iterator> partitions();
641
642 /// Access the dead users for this alloca.
643 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
644
645 /// Access Uses that should be dropped if the alloca is promotable.
646 ArrayRef<Use *> getDeadUsesIfPromotable() const {
647 return DeadUseIfPromotable;
648 }
649
650 /// Access the dead operands referring to this alloca.
651 ///
652 /// These are operands which cannot actually be used to refer to the
653 /// alloca as they are outside its range and the user doesn't correct for
654 /// that. These mostly consist of PHI node inputs and the like which we just
655 /// need to replace with undef.
656 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
657
658#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
659 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
660 void printSlice(raw_ostream &OS, const_iterator I,
661 StringRef Indent = " ") const;
662 void printUse(raw_ostream &OS, const_iterator I,
663 StringRef Indent = " ") const;
664 void print(raw_ostream &OS) const;
665 void dump(const_iterator I) const;
666 void dump() const;
667#endif
668
669private:
670 template <typename DerivedT, typename RetT = void> class BuilderBase;
671 class SliceBuilder;
672
673 friend class AllocaSlices::SliceBuilder;
674
675#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
676 /// Handle to alloca instruction to simplify method interfaces.
677 AllocaInst &AI;
678#endif
679
680 /// The instruction responsible for this alloca not having a known set
681 /// of slices.
682 ///
683 /// When an instruction (potentially) escapes the pointer to the alloca, we
684 /// store a pointer to that here and abort trying to form slices of the
685 /// alloca. This will be null if the alloca slices are analyzed successfully.
686 Instruction *PointerEscapingInstr;
687 Instruction *PointerEscapingInstrReadOnly;
688
689 /// The slices of the alloca.
690 ///
691 /// We store a vector of the slices formed by uses of the alloca here. This
692 /// vector is sorted by increasing begin offset, and then the unsplittable
693 /// slices before the splittable ones. See the Slice inner class for more
694 /// details.
695 SmallVector<Slice, 8> Slices;
696
697 /// Instructions which will become dead if we rewrite the alloca.
698 ///
699 /// Note that these are not separated by slice. This is because we expect an
700 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
701 /// all these instructions can simply be removed and replaced with poison as
702 /// they come from outside of the allocated space.
703 SmallVector<Instruction *, 8> DeadUsers;
704
705 /// Uses which will become dead if can promote the alloca.
706 SmallVector<Use *, 8> DeadUseIfPromotable;
707
708 /// Operands which will become dead if we rewrite the alloca.
709 ///
710 /// These are operands that in their particular use can be replaced with
711 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
712 /// to PHI nodes and the like. They aren't entirely dead (there might be
713 /// a GEP back into the bounds using it elsewhere), nor is the PHI, but we
714 /// want to swap this particular input for poison to simplify the use lists of
715 /// the alloca.
716 SmallVector<Use *, 8> DeadOperands;
717};
718
719/// A partition of the slices.
720///
721/// An ephemeral representation for a range of slices which can be viewed as
722/// a partition of the alloca. This range represents a span of the alloca's
723/// memory which cannot be split, and provides access to all of the slices
724/// overlapping some part of the partition.
725///
726/// Objects of this type are produced by traversing the alloca's slices, but
727/// are only ephemeral and not persistent.
728class Partition {
729private:
730 friend class AllocaSlices;
731 friend class AllocaSlices::partition_iterator;
732
733 using iterator = AllocaSlices::iterator;
734
735 /// The beginning and ending offsets of the alloca for this
736 /// partition.
737 uint64_t BeginOffset = 0, EndOffset = 0;
738
739 /// The start and end iterators of this partition.
740 iterator SI, SJ;
741
742 /// A collection of split slice tails overlapping the partition.
743 SmallVector<Slice *, 4> SplitTails;
744
745 /// Raw constructor builds an empty partition starting and ending at
746 /// the given iterator.
747 Partition(iterator SI) : SI(SI), SJ(SI) {}
748
749public:
750 /// The start offset of this partition.
751 ///
752 /// All of the contained slices start at or after this offset.
753 uint64_t beginOffset() const { return BeginOffset; }
754
755 /// The end offset of this partition.
756 ///
757 /// All of the contained slices end at or before this offset.
758 uint64_t endOffset() const { return EndOffset; }
759
760 /// The size of the partition.
761 ///
762 /// Note that this can never be zero.
763 uint64_t size() const {
764 assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
765 return EndOffset - BeginOffset;
766 }
767
768 /// Test whether this partition contains no slices, and merely spans
769 /// a region occupied by split slices.
770 bool empty() const { return SI == SJ; }
771
772 /// \name Iterate slices that start within the partition.
773 /// These may be splittable or unsplittable. They have a begin offset >= the
774 /// partition begin offset.
775 /// @{
776 // FIXME: We should probably define a "concat_iterator" helper and use that
777 // to stitch together pointee_iterators over the split tails and the
778 // contiguous iterators of the partition. That would give a much nicer
779 // interface here. We could then additionally expose filtered iterators for
780 // split, unsplit, and unsplittable slices based on the usage patterns.
781 iterator begin() const { return SI; }
782 iterator end() const { return SJ; }
783 /// @}
784
785 /// Get the sequence of split slice tails.
786 ///
787 /// These tails are of slices which start before this partition but are
788 /// split and overlap into the partition. We accumulate these while forming
789 /// partitions.
790 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
791};
792
793} // end anonymous namespace
794
795/// An iterator over partitions of the alloca's slices.
796///
797/// This iterator implements the core algorithm for partitioning the alloca's
798/// slices. It is a forward iterator as we don't support backtracking for
799/// efficiency reasons, and re-use a single storage area to maintain the
800/// current set of split slices.
801///
802/// It is templated on the slice iterator type to use so that it can operate
803/// with either const or non-const slice iterators.
804class AllocaSlices::partition_iterator
805 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
806 Partition> {
807 friend class AllocaSlices;
808
809 /// Most of the state for walking the partitions is held in a class
810 /// with a nice interface for examining them.
811 Partition P;
812
813 /// We need to keep the end of the slices to know when to stop.
814 AllocaSlices::iterator SE;
815
816 /// We also need to keep track of the maximum split end offset seen.
817 /// FIXME: Do we really?
818 uint64_t MaxSplitSliceEndOffset = 0;
819
820 /// Sets the partition to be empty at the given iterator, and sets the
821 /// end iterator.
822 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
823 : P(SI), SE(SE) {
824 // If not already at the end, advance our state to form the initial
825 // partition.
826 if (SI != SE)
827 advance();
828 }
829
830 /// Advance the iterator to the next partition.
831 ///
832 /// Requires that the iterator not be at the end of the slices.
833 void advance() {
834 assert((P.SI != SE || !P.SplitTails.empty()) &&
835 "Cannot advance past the end of the slices!");
836
837 // Clear out any split uses which have ended.
838 if (!P.SplitTails.empty()) {
839 if (P.EndOffset >= MaxSplitSliceEndOffset) {
840 // If we've finished all splits, this is easy.
841 P.SplitTails.clear();
842 MaxSplitSliceEndOffset = 0;
843 } else {
844 // Remove the uses which have ended in the prior partition. This
845 // cannot change the max split slice end because we just checked that
846 // the prior partition ended prior to that max.
847 llvm::erase_if(P.SplitTails,
848 [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
849 assert(llvm::any_of(P.SplitTails,
850 [&](Slice *S) {
851 return S->endOffset() == MaxSplitSliceEndOffset;
852 }) &&
853 "Could not find the current max split slice offset!");
854 assert(llvm::all_of(P.SplitTails,
855 [&](Slice *S) {
856 return S->endOffset() <= MaxSplitSliceEndOffset;
857 }) &&
858 "Max split slice end offset is not actually the max!");
859 }
860 }
861
862 // If P.SI is already at the end, then we've cleared the split tail and
863 // now have an end iterator.
864 if (P.SI == SE) {
865 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
866 return;
867 }
868
869 // If we had a non-empty partition previously, set up the state for
870 // subsequent partitions.
871 if (P.SI != P.SJ) {
872 // Accumulate all the splittable slices which started in the old
873 // partition into the split list.
874 for (Slice &S : P)
875 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
876 P.SplitTails.push_back(&S);
877 MaxSplitSliceEndOffset =
878 std::max(S.endOffset(), MaxSplitSliceEndOffset);
879 }
880
881 // Start from the end of the previous partition.
882 P.SI = P.SJ;
883
884 // If P.SI is now at the end, we at most have a tail of split slices.
885 if (P.SI == SE) {
886 P.BeginOffset = P.EndOffset;
887 P.EndOffset = MaxSplitSliceEndOffset;
888 return;
889 }
890
891 // If we have split slices and the next slice is after a gap and is
892 // not splittable, immediately form an empty partition for the split
893 // slices up until the next slice begins.
894 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
895 !P.SI->isSplittable()) {
896 P.BeginOffset = P.EndOffset;
897 P.EndOffset = P.SI->beginOffset();
898 return;
899 }
900 }
901
902 // OK, we need to consume new slices. Set the end offset based on the
903 // current slice, and step SJ past it. The beginning offset of the
904 // partition is the beginning offset of the next slice unless we have
905 // pre-existing split slices that are continuing, in which case we begin
906 // at the prior end offset.
907 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
908 P.EndOffset = P.SI->endOffset();
909 ++P.SJ;
910
911 // There are two strategies to form a partition based on whether the
912 // partition starts with an unsplittable slice or a splittable slice.
913 if (!P.SI->isSplittable()) {
914 // When we're forming an unsplittable region, it must always start at
915 // the first slice and will extend through its end.
916 assert(P.BeginOffset == P.SI->beginOffset());
917
918 // Form a partition including all of the overlapping slices with this
919 // unsplittable slice.
920 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
921 if (!P.SJ->isSplittable())
922 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
923 ++P.SJ;
924 }
925
926 // We have a partition across a set of overlapping unsplittable
927 // slices.
928 return;
929 }
930
931 // If we're starting with a splittable slice, then we need to form
932 // a synthetic partition spanning it and any other overlapping splittable
933 // slices.
934 assert(P.SI->isSplittable() && "Forming a splittable partition!");
935
936 // Collect all of the overlapping splittable slices.
937 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
938 P.SJ->isSplittable()) {
939 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
940 ++P.SJ;
941 }
942
943 // Back up P.EndOffset if we ended the span early when encountering an
944 // unsplittable slice. This synthesizes the early end offset of
945 // a partition spanning only splittable slices.
946 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
947 assert(!P.SJ->isSplittable());
948 P.EndOffset = P.SJ->beginOffset();
949 }
950 }
951
952public:
953 bool operator==(const partition_iterator &RHS) const {
954 assert(SE == RHS.SE &&
955 "End iterators don't match between compared partition iterators!");
956
957 // The observed positions of partitions are marked by the P.SI iterator and
958 // the emptiness of the split slices. The latter is only relevant when
959 // P.SI == SE, as the end iterator will additionally have an empty split
960 // slices list, but the prior may have the same P.SI and a tail of split
961 // slices.
962 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
963 assert(P.SJ == RHS.P.SJ &&
964 "Same set of slices formed two different sized partitions!");
965 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
966 "Same slice position with differently sized non-empty split "
967 "slice tails!");
968 return true;
969 }
970 return false;
971 }
972
973 partition_iterator &operator++() {
974 advance();
975 return *this;
976 }
977
978 Partition &operator*() { return P; }
979};
980
981/// A forward range over the partitions of the alloca's slices.
982///
983/// This accesses an iterator range over the partitions of the alloca's
984/// slices. It computes these partitions on the fly based on the overlapping
985/// offsets of the slices and the ability to split them. It will visit "empty"
986/// partitions to cover regions of the alloca only accessed via split
987/// slices.
988iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
989 return make_range(partition_iterator(begin(), end()),
990 partition_iterator(end(), end()));
991}
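// Illustrative walk (not from the original source): given the sorted slices
// [0,8) unsplittable, [0,16) splittable and [12,16) unsplittable, the
// iterator yields three partitions: [0,8) containing the two slices that
// start at offset 0, an "empty" partition [8,12) covered only by the split
// tail of [0,16), and finally [12,16).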
992
993static Value *foldSelectInst(SelectInst &SI) {
994 // If the condition being selected on is a constant or the same value is
995 // being selected between, fold the select. Yes this does (rarely) happen
996 // early on.
997 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
998 return SI.getOperand(1 + CI->isZero());
999 if (SI.getOperand(1) == SI.getOperand(2))
1000 return SI.getOperand(1);
1001
1002 return nullptr;
1003}
1004
1005/// A helper that folds a PHI node or a select.
1006static Value *foldPHINodeOrSelectInst(Instruction &I) {
1007 if (PHINode *PN = dyn_cast<PHINode>(&I)) {
1008 // If PN merges together the same value, return that value.
1009 return PN->hasConstantValue();
1010 }
1011 return foldSelectInst(cast<SelectInst>(I));
1012}
1013
1014/// Builder for the alloca slices.
1015///
1016/// This class builds a set of alloca slices by recursively visiting the uses
1017/// of an alloca and making a slice for each load and store at each offset.
1018class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1019 friend class PtrUseVisitor<SliceBuilder>;
1020 friend class InstVisitor<SliceBuilder>;
1021
1022 using Base = PtrUseVisitor<SliceBuilder>;
1023
1024 const uint64_t AllocSize;
1025 AllocaSlices &AS;
1026
1027 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1028 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1029
1030 /// Set to de-duplicate dead instructions found in the use walk.
1031 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1032
1033public:
1034 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1035 : PtrUseVisitor<SliceBuilder>(DL),
1036 AllocSize(AI.getAllocationSize(DL)->getFixedValue()), AS(AS) {}
1037
1038private:
1039 void markAsDead(Instruction &I) {
1040 if (VisitedDeadInsts.insert(&I).second)
1041 AS.DeadUsers.push_back(&I);
1042 }
1043
1044 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1045 bool IsSplittable = false) {
1046 // Completely skip uses which have a zero size or start either before or
1047 // past the end of the allocation.
1048 if (Size == 0 || Offset.uge(AllocSize)) {
1049 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1050 << Offset
1051 << " which has zero size or starts outside of the "
1052 << AllocSize << " byte alloca:\n"
1053 << " alloca: " << AS.AI << "\n"
1054 << " use: " << I << "\n");
1055 return markAsDead(I);
1056 }
1057
1058 uint64_t BeginOffset = Offset.getZExtValue();
1059 uint64_t EndOffset = BeginOffset + Size;
1060
1061 // Clamp the end offset to the end of the allocation. Note that this is
1062 // formulated to handle even the case where "BeginOffset + Size" overflows.
1063 // This may appear superficially to be something we could ignore entirely,
1064 // but that is not so! There may be widened loads or PHI-node uses where
1065 // some instructions are dead but not others. We can't completely ignore
1066 // them, and so have to record at least the information here.
1067 assert(AllocSize >= BeginOffset); // Established above.
1068 if (Size > AllocSize - BeginOffset) {
1069 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1070 << Offset << " to remain within the " << AllocSize
1071 << " byte alloca:\n"
1072 << " alloca: " << AS.AI << "\n"
1073 << " use: " << I << "\n");
1074 EndOffset = AllocSize;
1075 }
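 // For example (illustrative): a 16 byte use at offset 24 of a 32 byte
 // alloca is recorded as the clamped slice [24, 32) rather than [24, 40).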
1076
1077 AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
1078 }
1079
1080 void visitBitCastInst(BitCastInst &BC) {
1081 if (BC.use_empty())
1082 return markAsDead(BC);
1083
1084 return Base::visitBitCastInst(BC);
1085 }
1086
1087 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1088 if (ASC.use_empty())
1089 return markAsDead(ASC);
1090
1091 return Base::visitAddrSpaceCastInst(ASC);
1092 }
1093
1094 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1095 if (GEPI.use_empty())
1096 return markAsDead(GEPI);
1097
1098 return Base::visitGetElementPtrInst(GEPI);
1099 }
1100
1101 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1102 uint64_t Size, bool IsVolatile) {
1103 // We allow splitting of non-volatile loads and stores where the type is an
1104 // integer type. These may be used to implement 'memcpy' or other "transfer
1105 // of bits" patterns.
1106 bool IsSplittable =
1107 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1108
1109 insertUse(I, Offset, Size, IsSplittable);
1110 }
1111
1112 void visitLoadInst(LoadInst &LI) {
1113 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1114 "All simple FCA loads should have been pre-split");
1115
1116 // If there is a load with an unknown offset, we can still perform store
1117 // to load forwarding for other known-offset loads.
1118 if (!IsOffsetKnown)
1119 return PI.setEscapedReadOnly(&LI);
1120
1121 TypeSize Size = DL.getTypeStoreSize(LI.getType());
1122 if (Size.isScalable()) {
1123 unsigned VScale = LI.getFunction()->getVScaleValue();
1124 if (!VScale)
1125 return PI.setAborted(&LI);
1126
1127 Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
1128 }
1129
1130 return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
1131 LI.isVolatile());
1132 }
1133
1134 void visitStoreInst(StoreInst &SI) {
1135 Value *ValOp = SI.getValueOperand();
1136 if (ValOp == *U)
1137 return PI.setEscapedAndAborted(&SI);
1138 if (!IsOffsetKnown)
1139 return PI.setAborted(&SI);
1140
1141 TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
1142 if (StoreSize.isScalable()) {
1143 unsigned VScale = SI.getFunction()->getVScaleValue();
1144 if (!VScale)
1145 return PI.setAborted(&SI);
1146
1147 StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
1148 }
1149
1150 uint64_t Size = StoreSize.getFixedValue();
1151
1152 // If this memory access can be shown to *statically* extend outside the
1153 // bounds of the allocation, its behavior is undefined, so simply
1154 // ignore it. Note that this is more strict than the generic clamping
1155 // behavior of insertUse. We also try to handle cases which might run the
1156 // risk of overflow.
1157 // FIXME: We should instead consider the pointer to have escaped if this
1158 // function is being instrumented for addressing bugs or race conditions.
1159 if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
1160 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1161 << Offset << " which extends past the end of the "
1162 << AllocSize << " byte alloca:\n"
1163 << " alloca: " << AS.AI << "\n"
1164 << " use: " << SI << "\n");
1165 return markAsDead(SI);
1166 }
1167
1168 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1169 "All simple FCA stores should have been pre-split");
1170 handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
1171 }
1172
1173 void visitMemSetInst(MemSetInst &II) {
1174 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1175 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1176 if ((Length && Length->getValue() == 0) ||
1177 (IsOffsetKnown && Offset.uge(AllocSize)))
1178 // Zero-length mem transfer intrinsics can be ignored entirely.
1179 return markAsDead(II);
1180
1181 if (!IsOffsetKnown)
1182 return PI.setAborted(&II);
1183
1184 insertUse(II, Offset,
1185 Length ? Length->getLimitedValue()
1186 : AllocSize - Offset.getLimitedValue(),
1187 (bool)Length);
1188 }
1189
1190 void visitMemTransferInst(MemTransferInst &II) {
1191 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1192 if (Length && Length->getValue() == 0)
1193 // Zero-length mem transfer intrinsics can be ignored entirely.
1194 return markAsDead(II);
1195
1196 // Because we can visit these intrinsics twice, also check to see if the
1197 // first time marked this instruction as dead. If so, skip it.
1198 if (VisitedDeadInsts.count(&II))
1199 return;
1200
1201 if (!IsOffsetKnown)
1202 return PI.setAborted(&II);
1203
1204 // This side of the transfer is completely out-of-bounds, and so we can
1205 // nuke the entire transfer. However, we also need to nuke the other side
1206 // if already added to our partitions.
1207 // FIXME: Yet another place we really should bypass this when
1208 // instrumenting for ASan.
1209 if (Offset.uge(AllocSize)) {
1210 auto MTPI = MemTransferSliceMap.find(&II);
1211 if (MTPI != MemTransferSliceMap.end())
1212 AS.Slices[MTPI->second].kill();
1213 return markAsDead(II);
1214 }
1215
1216 uint64_t RawOffset = Offset.getLimitedValue();
1217 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1218
1219 // Check for the special case where the same exact value is used for both
1220 // source and dest.
1221 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1222 // For non-volatile transfers this is a no-op.
1223 if (!II.isVolatile())
1224 return markAsDead(II);
1225
1226 return insertUse(II, Offset, Size, /*IsSplittable=*/false);
1227 }
1228
1229 // If we have seen both source and destination for a mem transfer, then
1230 // they both point to the same alloca.
1231 bool Inserted;
1232 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1233 std::tie(MTPI, Inserted) =
1234 MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
1235 unsigned PrevIdx = MTPI->second;
1236 if (!Inserted) {
1237 Slice &PrevP = AS.Slices[PrevIdx];
1238
1239 // Check if the begin offsets match and this is a non-volatile transfer.
1240 // In that case, we can completely elide the transfer.
1241 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1242 PrevP.kill();
1243 return markAsDead(II);
1244 }
1245
1246 // Otherwise we have an offset transfer within the same alloca. We can't
1247 // split those.
1248 PrevP.makeUnsplittable();
1249 }
1250
1251 // Insert the use now that we've fixed up the splittable nature.
1252 insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1253
1254 // Check that we ended up with a valid index in the map.
1255 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1256 "Map index doesn't point back to a slice with this user.");
1257 }
1258
1259 // Disable SRoA for any intrinsics except for lifetime invariants.
1260 // FIXME: What about debug intrinsics? This matches old behavior, but
1261 // doesn't make sense.
1262 void visitIntrinsicInst(IntrinsicInst &II) {
1263 if (II.isDroppable()) {
1264 AS.DeadUseIfPromotable.push_back(U);
1265 return;
1266 }
1267
1268 if (!IsOffsetKnown)
1269 return PI.setAborted(&II);
1270
1271 if (II.isLifetimeStartOrEnd()) {
1272 insertUse(II, Offset, AllocSize, true);
1273 return;
1274 }
1275
1276 Base::visitIntrinsicInst(II);
1277 }
1278
1279 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1280 // We consider any PHI or select that results in a direct load or store of
1281 // the same offset to be a viable use for slicing purposes. These uses
1282 // are considered unsplittable and the size is the maximum loaded or stored
1283 // size.
1284 SmallPtrSet<Instruction *, 4> Visited;
1285 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1286 Visited.insert(Root);
1287 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
1288 const DataLayout &DL = Root->getDataLayout();
1289 // If there are no loads or stores, the access is dead. We mark that as
1290 // a size zero access.
1291 Size = 0;
1292 do {
1293 Instruction *I, *UsedI;
1294 std::tie(UsedI, I) = Uses.pop_back_val();
1295
1296 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
1297 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
1298 if (LoadSize.isScalable()) {
1299 PI.setAborted(LI);
1300 return nullptr;
1301 }
1302 Size = std::max(Size, LoadSize.getFixedValue());
1303 continue;
1304 }
1305 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
1306 Value *Op = SI->getOperand(0);
1307 if (Op == UsedI)
1308 return SI;
1309 TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
1310 if (StoreSize.isScalable()) {
1311 PI.setAborted(SI);
1312 return nullptr;
1313 }
1314 Size = std::max(Size, StoreSize.getFixedValue());
1315 continue;
1316 }
1317
1318 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
1319 if (!GEP->hasAllZeroIndices())
1320 return GEP;
1321 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
1322 !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
1323 return I;
1324 }
1325
1326 for (User *U : I->users())
1327 if (Visited.insert(cast<Instruction>(U)).second)
1328 Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1329 } while (!Uses.empty());
1330
1331 return nullptr;
1332 }
1333
1334 void visitPHINodeOrSelectInst(Instruction &I) {
1335 assert(isa<PHINode>(I) || isa<SelectInst>(I));
1336 if (I.use_empty())
1337 return markAsDead(I);
1338
1339 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1340 // instructions in this BB, which may be required during rewriting. Bail out
1341 // on these cases.
1342 if (isa<PHINode>(I) && !I.getParent()->hasInsertionPt())
1343 return PI.setAborted(&I);
1344
1345 // TODO: We could use simplifyInstruction here to fold PHINodes and
1346 // SelectInsts. However, doing so requires to change the current
1347 // dead-operand-tracking mechanism. For instance, suppose neither loading
1348 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1349 // trap either. However, if we simply replace %U with undef using the
1350 // current dead-operand-tracking mechanism, "load (select undef, undef,
1351 // %other)" may trap because the select may return the first operand
1352 // "undef".
1353 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1354 if (Result == *U)
1355 // If the result of the constant fold will be the pointer, recurse
1356 // through the PHI/select as if we had RAUW'ed it.
1357 enqueueUsers(I);
1358 else
1359 // Otherwise the operand to the PHI/select is dead, and we can replace
1360 // it with poison.
1361 AS.DeadOperands.push_back(U);
1362
1363 return;
1364 }
1365
1366 if (!IsOffsetKnown)
1367 return PI.setAborted(&I);
1368
1369 // See if we already have computed info on this node.
1370 uint64_t &Size = PHIOrSelectSizes[&I];
1371 if (!Size) {
1372 // This is a new PHI/Select, check for an unsafe use of it.
1373 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1374 return PI.setAborted(UnsafeI);
1375 }
1376
1377 // For PHI and select operands outside the alloca, we can't nuke the entire
1378 // phi or select -- the other side might still be relevant, so we special
1379 // case them here and use a separate structure to track the operands
1380 // themselves which should be replaced with poison.
1381 // FIXME: This should instead be escaped in the event we're instrumenting
1382 // for address sanitization.
1383 if (Offset.uge(AllocSize)) {
1384 AS.DeadOperands.push_back(U);
1385 return;
1386 }
1387
1388 insertUse(I, Offset, Size);
1389 }
1390
1391 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1392
1393 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1394
1395 /// Disable SROA entirely if there are unhandled users of the alloca.
1396 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1397
1398 void visitCallBase(CallBase &CB) {
1399 // If the call operand is read-only and only does a read-only or address
1400 // capture, then we mark it as EscapedReadOnly.
1401 if (CB.isDataOperand(U) &&
1402 !capturesFullProvenance(CB.getCaptureInfo(U->getOperandNo())) &&
1403 CB.onlyReadsMemory(U->getOperandNo())) {
1404 PI.setEscapedReadOnly(&CB);
1405 return;
1406 }
1407
1408 Base::visitCallBase(CB);
1409 }
1410};
1411
1412AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1413 :
1414#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1415 AI(AI),
1416#endif
1417 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1418 SliceBuilder PB(DL, AI, *this);
1419 SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1420 if (PtrI.isEscaped() || PtrI.isAborted()) {
1421 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1422 // possibly by just storing the PtrInfo in the AllocaSlices.
1423 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1424 : PtrI.getAbortingInst();
1425 assert(PointerEscapingInstr && "Did not track a bad instruction");
1426 return;
1427 }
1428 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1429
1430 llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1431
1432 // Sort the uses. This arranges for the offsets to be in ascending order,
1433 // and the sizes to be in descending order.
1434 llvm::stable_sort(Slices);
1435}
1436
1437#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1438
1439void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1440 StringRef Indent) const {
1441 printSlice(OS, I, Indent);
1442 OS << "\n";
1443 printUse(OS, I, Indent);
1444}
1445
1446void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1447 StringRef Indent) const {
1448 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1449 << " slice #" << (I - begin())
1450 << (I->isSplittable() ? " (splittable)" : "");
1451}
1452
1453void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1454 StringRef Indent) const {
1455 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1456}
1457
1458void AllocaSlices::print(raw_ostream &OS) const {
1459 if (PointerEscapingInstr) {
1460 OS << "Can't analyze slices for alloca: " << AI << "\n"
1461 << " A pointer to this alloca escaped by:\n"
1462 << " " << *PointerEscapingInstr << "\n";
1463 return;
1464 }
1465
1466 if (PointerEscapingInstrReadOnly)
1467 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1468
1469 OS << "Slices of alloca: " << AI << "\n";
1470 for (const_iterator I = begin(), E = end(); I != E; ++I)
1471 print(OS, I);
1472}
1473
1474LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1475 print(dbgs(), I);
1476}
1477LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1478
1479#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1480
1481/// Walk the range of a partitioning looking for a common type to cover this
1482/// sequence of slices.
1483static std::pair<Type *, IntegerType *>
1484findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1485 uint64_t EndOffset) {
1486 Type *Ty = nullptr;
1487 bool TyIsCommon = true;
1488 IntegerType *ITy = nullptr;
1489
1490 // Note that we need to look at *every* alloca slice's Use to ensure we
1491 // always get consistent results regardless of the order of slices.
1492 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1493 Use *U = I->getUse();
1494 if (isa<IntrinsicInst>(*U->getUser()))
1495 continue;
1496 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1497 continue;
1498
1499 Type *UserTy = nullptr;
1500 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1501 UserTy = LI->getType();
1502 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1503 UserTy = SI->getValueOperand()->getType();
1504 }
1505
1506 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1507 // If the type is larger than the partition, skip it. We only encounter
1508 // this for split integer operations where we want to use the type of the
1509 // entity causing the split. Also skip if the type is not a byte width
1510 // multiple.
1511 if (UserITy->getBitWidth() % 8 != 0 ||
1512 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1513 continue;
1514
1515 // Track the largest bitwidth integer type used in this way in case there
1516 // is no common type.
1517 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1518 ITy = UserITy;
1519 }
1520
1521 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1522 // depend on types skipped above.
1523 if (!UserTy || (Ty && Ty != UserTy))
1524 TyIsCommon = false; // Give up on anything but an iN type.
1525 else
1526 Ty = UserTy;
1527 }
1528
1529 return {TyIsCommon ? Ty : nullptr, ITy};
1530}
1531
1532/// PHI instructions that use an alloca and are subsequently loaded can be
1533/// rewritten to load both input pointers in the pred blocks and then PHI the
1534/// results, allowing the load of the alloca to be promoted.
1535/// From this:
1536/// %P2 = phi [i32* %Alloca, i32* %Other]
1537/// %V = load i32* %P2
1538/// to:
1539/// %V1 = load i32* %Alloca -> will be mem2reg'd
1540/// ...
1541/// %V2 = load i32* %Other
1542/// ...
1543/// %V = phi [i32 %V1, i32 %V2]
1544///
1545/// We can do this to a select if its only uses are loads and if the operands
1546/// to the select can be loaded unconditionally.
1547///
1548/// FIXME: This should be hoisted into a generic utility, likely in
1549/// Transforms/Util/Local.h
1550 static bool isSafePHIToSpeculate(PHINode &PN) {
1551 const DataLayout &DL = PN.getDataLayout();
1552
1553 // For now, we can only do this promotion if the load is in the same block
1554 // as the PHI, and if there are no stores between the phi and load.
1555 // TODO: Allow recursive phi users.
1556 // TODO: Allow stores.
1557 BasicBlock *BB = PN.getParent();
1558 Align MaxAlign;
1559 uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1560 Type *LoadType = nullptr;
1561 for (User *U : PN.users()) {
1562 LoadInst *LI = dyn_cast<LoadInst>(U);
1563 if (!LI || !LI->isSimple())
1564 return false;
1565
1566 // For now we only allow loads in the same block as the PHI. This is
1567 // a common case that happens when instcombine merges two loads through
1568 // a PHI.
1569 if (LI->getParent() != BB)
1570 return false;
1571
1572 if (LoadType) {
1573 if (LoadType != LI->getType())
1574 return false;
1575 } else {
1576 LoadType = LI->getType();
1577 }
1578
1579 // Ensure that there are no instructions between the PHI and the load that
1580 // could store.
1581 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1582 if (BBI->mayWriteToMemory())
1583 return false;
1584
1585 MaxAlign = std::max(MaxAlign, LI->getAlign());
1586 }
1587
1588 if (!LoadType)
1589 return false;
1590
1591 APInt LoadSize =
1592 APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());
1593
1594 // We can only transform this if it is safe to push the loads into the
1595 // predecessor blocks. The only thing to watch out for is that we can't put
1596 // a possibly trapping load in the predecessor if it is a critical edge.
1597 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1598 Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1599 Value *InVal = PN.getIncomingValue(Idx);
1600
1601 // If the value is produced by the terminator of the predecessor (an
1602 // invoke) or it has side-effects, there is no valid place to put a load
1603 // in the predecessor.
1604 if (TI == InVal || TI->mayHaveSideEffects())
1605 return false;
1606
1607 // If the predecessor has a single successor, then the edge isn't
1608 // critical.
1609 if (TI->getNumSuccessors() == 1)
1610 continue;
1611
1612 // If this pointer is always safe to load, or if we can prove that there
1613 // is already a load in the block, then we can move the load to the pred
1614 // block.
1615 if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1616 continue;
1617
1618 return false;
1619 }
1620
1621 return true;
1622}
1623
1624static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1625 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1626
1627 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1628 Type *LoadTy = SomeLoad->getType();
1629 IRB.SetInsertPoint(&PN);
1630 PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1631 PN.getName() + ".sroa.speculated");
1632
1633 // Get the AA tags and alignment to use from one of the loads. It does not
1634 // matter which one we get and if any differ.
1635 AAMDNodes AATags = SomeLoad->getAAMetadata();
1636 Align Alignment = SomeLoad->getAlign();
1637
1638 // Rewrite all loads of the PN to use the new PHI.
1639 while (!PN.use_empty()) {
1640 LoadInst *LI = cast<LoadInst>(PN.user_back());
1641 LI->replaceAllUsesWith(NewPN);
1642 LI->eraseFromParent();
1643 }
1644
1645 // Inject loads into all of the pred blocks.
1646 DenseMap<BasicBlock *, Value *> InjectedLoads;
1647 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1648 BasicBlock *Pred = PN.getIncomingBlock(Idx);
1649 Value *InVal = PN.getIncomingValue(Idx);
1650
1651 // A PHI node is allowed to have multiple (duplicated) entries for the same
1652 // basic block, as long as the value is the same. So if we already injected
1653 // a load in the predecessor, then we should reuse the same load for all
1654 // duplicated entries.
1655 if (Value *V = InjectedLoads.lookup(Pred)) {
1656 NewPN->addIncoming(V, Pred);
1657 continue;
1658 }
1659
1660 Instruction *TI = Pred->getTerminator();
1661 IRB.SetInsertPoint(TI);
1662
1663 LoadInst *Load = IRB.CreateAlignedLoad(
1664 LoadTy, InVal, Alignment,
1665 (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1666 ++NumLoadsSpeculated;
1667 if (AATags)
1668 Load->setAAMetadata(AATags);
1669 NewPN->addIncoming(Load, Pred);
1670 InjectedLoads[Pred] = Load;
1671 }
1672
1673 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1674 PN.eraseFromParent();
1675}
1676
1677SelectHandSpeculativity &
1678SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1679 if (isTrueVal)
1680 Bitfield::set<SelectHandSpeculativity::TrueVal>(Storage, true);
1681 else
1682 Bitfield::set<SelectHandSpeculativity::FalseVal>(Storage, true);
1683 return *this;
1684}
1685
1686bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1687 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
1688 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
1689}
1690
1691bool SelectHandSpeculativity::areAllSpeculatable() const {
1692 return isSpeculatable(/*isTrueVal=*/true) &&
1693 isSpeculatable(/*isTrueVal=*/false);
1694}
1695
1696bool SelectHandSpeculativity::areAnySpeculatable() const {
1697 return isSpeculatable(/*isTrueVal=*/true) ||
1698 isSpeculatable(/*isTrueVal=*/false);
1699}
1700bool SelectHandSpeculativity::areNoneSpeculatable() const {
1701 return !areAnySpeculatable();
1702}
1703
1704static SelectHandSpeculativity
1705 isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1706 assert(LI.isSimple() && "Only for simple loads");
1707 SelectHandSpeculativity Spec;
1708
1709 const DataLayout &DL = SI.getDataLayout();
1710 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1711 if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL,
1712 &LI))
1713 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1714 else if (PreserveCFG)
1715 return Spec;
1716
1717 return Spec;
1718}
1719
1720std::optional<RewriteableMemOps>
1721SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1722 RewriteableMemOps Ops;
1723
1724 for (User *U : SI.users()) {
1725 if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
1726 U = *BC->user_begin();
1727
1728 if (auto *Store = dyn_cast<StoreInst>(U)) {
1729 // Note that atomic stores can be transformed; atomic semantics do not
1730 // have any meaning for a local alloca. Stores are not speculatable,
1731 // however, so if we can't turn it into a predicated store, we are done.
1732 if (Store->isVolatile() || PreserveCFG)
1733 return {}; // Give up on this `select`.
1734 Ops.emplace_back(Store);
1735 continue;
1736 }
1737
1738 auto *LI = dyn_cast<LoadInst>(U);
1739
1740 // Note that atomic loads can be transformed;
1741 // atomic semantics do not have any meaning for a local alloca.
1742 if (!LI || LI->isVolatile())
1743 return {}; // Give up on this `select`.
1744
1745 PossiblySpeculatableLoad Load(LI);
1746 if (!LI->isSimple()) {
1747 // If the `load` is not simple, we can't speculatively execute it,
1748 // but we could handle this via a CFG modification. But can we?
1749 if (PreserveCFG)
1750 return {}; // Give up on this `select`.
1751 Ops.emplace_back(Load);
1752 continue;
1753 }
1754
1755 SelectHandSpeculativity Spec =
1756 isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG);
1757 if (PreserveCFG && !Spec.areAllSpeculatable())
1758 return {}; // Give up on this `select`.
1759
1760 Load.setInt(Spec);
1761 Ops.emplace_back(Load);
1762 }
1763
1764 return Ops;
1765}
1766
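/// Sketch of the rewrite performed by the function below (illustrative IR,
/// value names invented for this comment):
///   %p = select i1 %c, ptr %a, ptr %b
///   %v = load i32, ptr %p
/// becomes
///   %v.t = load i32, ptr %a
///   %v.f = load i32, ptr %b
///   %v = select i1 %c, i32 %v.t, i32 %v.f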
1767 static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1768 IRBuilderTy &IRB) {
1769 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1770
1771 Value *TV = SI.getTrueValue();
1772 Value *FV = SI.getFalseValue();
1773 // Replace the given load of the select with a select of two loads.
1774
1775 assert(LI.isSimple() && "We only speculate simple loads");
1776
1777 IRB.SetInsertPoint(&LI);
1778
1779 LoadInst *TL =
1780 IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
1781 LI.getName() + ".sroa.speculate.load.true");
1782 LoadInst *FL =
1783 IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
1784 LI.getName() + ".sroa.speculate.load.false");
1785 NumLoadsSpeculated += 2;
1786
1787 // Transfer alignment and AA info if present.
1788 TL->setAlignment(LI.getAlign());
1789 FL->setAlignment(LI.getAlign());
1790
1791 AAMDNodes Tags = LI.getAAMetadata();
1792 if (Tags) {
1793 TL->setAAMetadata(Tags);
1794 FL->setAAMetadata(Tags);
1795 }
1796
1797 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1798 LI.getName() + ".sroa.speculated",
1799 ProfcheckDisableMetadataFixes ? nullptr : &SI);
1800
1801 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1802 LI.replaceAllUsesWith(V);
1803}
1804
1805template <typename T>
1806 static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1807 SelectHandSpeculativity Spec,
1808 DomTreeUpdater &DTU) {
1809 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1810 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1811 BasicBlock *Head = I.getParent();
1812 Instruction *ThenTerm = nullptr;
1813 Instruction *ElseTerm = nullptr;
1814 if (Spec.areNoneSpeculatable())
1815 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1816 SI.getMetadata(LLVMContext::MD_prof), &DTU);
1817 else {
1818 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1819 SI.getMetadata(LLVMContext::MD_prof), &DTU,
1820 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1821 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1822 cast<CondBrInst>(Head->getTerminator())->swapSuccessors();
1823 }
1824 auto *HeadBI = cast<CondBrInst>(Head->getTerminator());
1825 Spec = {}; // Do not use `Spec` beyond this point.
1826 BasicBlock *Tail = I.getParent();
1827 Tail->setName(Head->getName() + ".cont");
1828 PHINode *PN;
1829 if (isa<LoadInst>(I))
1830 PN = PHINode::Create(I.getType(), 2, "", I.getIterator());
1831 for (BasicBlock *SuccBB : successors(Head)) {
1832 bool IsThen = SuccBB == HeadBI->getSuccessor(0);
1833 int SuccIdx = IsThen ? 0 : 1;
1834 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1835 auto &CondMemOp = cast<T>(*I.clone());
1836 if (NewMemOpBB != Head) {
1837 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1838 if (isa<LoadInst>(I))
1839 ++NumLoadsPredicated;
1840 else
1841 ++NumStoresPredicated;
1842 } else {
1843 CondMemOp.dropUBImplyingAttrsAndMetadata();
1844 ++NumLoadsSpeculated;
1845 }
1846 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1847 Value *Ptr = SI.getOperand(1 + SuccIdx);
1848 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1849 if (isa<LoadInst>(I)) {
1850 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1851 PN->addIncoming(&CondMemOp, NewMemOpBB);
1852 } else
1853 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1854 }
1855 if (isa<LoadInst>(I)) {
1856 PN->takeName(&I);
1857 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1858 I.replaceAllUsesWith(PN);
1859 }
1860}
1861
1862 static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1863 SelectHandSpeculativity Spec,
1864 DomTreeUpdater &DTU) {
1865 if (auto *LI = dyn_cast<LoadInst>(&I))
1866 rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
1867 else if (auto *SI = dyn_cast<StoreInst>(&I))
1868 rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
1869 else
1870 llvm_unreachable_internal("Only for load and store.");
1871}
1872
1873 static bool rewriteSelectInstMemOps(SelectInst &SI,
1874 const RewriteableMemOps &Ops,
1875 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1876 bool CFGChanged = false;
1877 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1878
1879 for (const RewriteableMemOp &Op : Ops) {
1880 SelectHandSpeculativity Spec;
1881 Instruction *I;
1882 if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
1883 I = *US;
1884 } else {
1885 auto PSL = std::get<PossiblySpeculatableLoad>(Op);
1886 I = PSL.getPointer();
1887 Spec = PSL.getInt();
1888 }
1889 if (Spec.areAllSpeculatable()) {
1890 speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB);
1891 } else {
1892 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1893 rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
1894 CFGChanged = true;
1895 }
1896 I->eraseFromParent();
1897 }
1898
1899 for (User *U : make_early_inc_range(SI.users()))
1900 cast<BitCastInst>(U)->eraseFromParent();
1901 SI.eraseFromParent();
1902 return CFGChanged;
1903}
1904
1905/// Compute an adjusted pointer from Ptr by Offset bytes where the
1906/// resulting pointer has PointerTy.
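/// For instance (a sketch; the actual value names depend on NamePrefix): with
/// Offset = 8 this emits
///   %<prefix>sroa_idx = getelementptr inbounds i8, ptr %Ptr, i64 8
/// and adds an addrspacecast only when PointerTy lives in a different address
/// space than the adjusted pointer.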
1907static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1908 APInt Offset, Type *PointerTy,
1909 const Twine &NamePrefix) {
1910 if (Offset != 0)
1911 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
1912 NamePrefix + "sroa_idx");
1913 return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1914 NamePrefix + "sroa_cast");
1915}
1916
1917/// Compute the adjusted alignment for a load or store from an offset.
1918 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1919 return commonAlignment(getLoadStoreAlignment(I), Offset);
1920 }
1921
1922/// Test whether we can convert a value from the old to the new type.
1923///
1924/// This predicate should be used to guard calls to convertValue in order to
1925/// ensure that we only try to convert viable values. The strategy is that we
1926/// will peel off single element struct and array wrappings to get to an
1927/// underlying value, and convert that value.
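/// A rough intuition (sketch, not an exhaustive list): with a 64-bit
/// DataLayout, i64 <-> double and i64 <-> ptr (in an integral address space
/// with 64-bit pointers) are convertible because the sizes match, while
/// i32 -> i64 and anything involving non-integral pointers is rejected.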
1928static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1929 unsigned VScale = 0) {
1930 if (OldTy == NewTy)
1931 return true;
1932
1933 // For integer types, we can't handle any bit-width differences. This would
1934 // break both vector conversions with extension and introduce endianness
1935 // issues when in conjunction with loads and stores.
1936 if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1937 assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1938 cast<IntegerType>(NewTy)->getBitWidth() &&
1939 "We can't have the same bitwidth for different int types");
1940 return false;
1941 }
1942
1943 TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
1944 TypeSize OldSize = DL.getTypeSizeInBits(OldTy);
1945
1946 if ((isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) ||
1947 (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy))) {
1948 // Conversion is only possible when the size of scalable vectors is known.
1949 if (!VScale)
1950 return false;
1951
1952 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1953 // a single domain (either fixed or scalable). Any additional conversion
1954 // between fixed and scalable types is handled through integer types.
1955 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1956 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1957
1958 if (isa<ScalableVectorType>(NewTy)) {
1960 return false;
1961
1962 NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
1963 } else {
1965 return false;
1966
1967 OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
1968 }
1969 }
1970
1971 if (NewSize != OldSize)
1972 return false;
1973 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1974 return false;
1975
1976 // We can convert pointers to integers and vice-versa. Same for vectors
1977 // of pointers and integers.
1978 OldTy = OldTy->getScalarType();
1979 NewTy = NewTy->getScalarType();
1980 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1981 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1982 unsigned OldAS = OldTy->getPointerAddressSpace();
1983 unsigned NewAS = NewTy->getPointerAddressSpace();
1984 // Convert pointers if they are pointers from the same address space or
1985 // different integral (not non-integral) address spaces with the same
1986 // pointer size.
1987 return OldAS == NewAS ||
1988 (!DL.isNonIntegralAddressSpace(OldAS) &&
1989 !DL.isNonIntegralAddressSpace(NewAS) &&
1990 DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1991 }
1992
1993 // We can convert integers to integral pointers, but not to non-integral
1994 // pointers.
1995 if (OldTy->isIntegerTy())
1996 return !DL.isNonIntegralPointerType(NewTy);
1997
1998 // We can convert integral pointers to integers, but non-integral pointers
1999 // need to remain pointers.
2000 if (!DL.isNonIntegralPointerType(OldTy))
2001 return NewTy->isIntegerTy();
2002
2003 return false;
2004 }
2005
2006 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
2007 return false;
2008
2009 return true;
2010}
2011
2012/// Test whether the given slice use can be promoted to a vector.
2013///
2014/// This function is called to test each entry in a partition which is slated
2015/// for a single slice.
2016static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2017 VectorType *Ty,
2018 uint64_t ElementSize,
2019 const DataLayout &DL,
2020 unsigned VScale) {
2021 // First validate the slice offsets.
2022 uint64_t BeginOffset =
2023 std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
2024 uint64_t BeginIndex = BeginOffset / ElementSize;
2025 if (BeginIndex * ElementSize != BeginOffset ||
2026 BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
2027 return false;
2028 uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
2029 uint64_t EndIndex = EndOffset / ElementSize;
2030 if (EndIndex * ElementSize != EndOffset ||
2031 EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
2032 return false;
2033
2034 assert(EndIndex > BeginIndex && "Empty vector!");
2035 uint64_t NumElements = EndIndex - BeginIndex;
2036 Type *SliceTy = (NumElements == 1)
2037 ? Ty->getElementType()
2038 : FixedVectorType::get(Ty->getElementType(), NumElements);
2039
2040 Type *SplitIntTy =
2041 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
2042
2043 Use *U = S.getUse();
2044
2045 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2046 if (MI->isVolatile())
2047 return false;
2048 if (!S.isSplittable())
2049 return false; // Skip any unsplittable intrinsics.
2050 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2051 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2052 return false;
2053 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2054 if (LI->isVolatile())
2055 return false;
2056 Type *LTy = LI->getType();
2057 // Disable vector promotion when there are loads or stores of an FCA.
2058 if (LTy->isStructTy())
2059 return false;
2060 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2061 assert(LTy->isIntegerTy());
2062 LTy = SplitIntTy;
2063 }
2064 if (!canConvertValue(DL, SliceTy, LTy, VScale))
2065 return false;
2066 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2067 if (SI->isVolatile())
2068 return false;
2069 Type *STy = SI->getValueOperand()->getType();
2070 // Disable vector promotion when there are loads or stores of an FCA.
2071 if (STy->isStructTy())
2072 return false;
2073 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2074 assert(STy->isIntegerTy());
2075 STy = SplitIntTy;
2076 }
2077 if (!canConvertValue(DL, STy, SliceTy, VScale))
2078 return false;
2079 } else {
2080 return false;
2081 }
2082
2083 return true;
2084}
2085
2086/// Test whether any vector type in \p CandidateTys is viable for promotion.
2087///
2088/// This implements the necessary checking for \c isVectorPromotionViable over
2089/// all slices of the alloca for the given VectorType.
2090static VectorType *
2091 checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2092 SmallVectorImpl<VectorType *> &CandidateTys,
2093 bool HaveCommonEltTy, Type *CommonEltTy,
2094 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2095 VectorType *CommonVecPtrTy, unsigned VScale) {
2096 // If we didn't find a vector type, nothing to do here.
2097 if (CandidateTys.empty())
2098 return nullptr;
2099
2100 // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
2101 // then we should choose it, not some other alternative.
2102 // But, we can't perform a no-op pointer address space change via bitcast,
2103 // so if we didn't have a common pointer element type, bail.
2104 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2105 return nullptr;
2106
2107 // Try to pick the "best" element type out of the choices.
2108 if (!HaveCommonEltTy && HaveVecPtrTy) {
2109 // If there was a pointer element type, there's really only one choice.
2110 CandidateTys.clear();
2111 CandidateTys.push_back(CommonVecPtrTy);
2112 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2113 // Integer-ify vector types.
2114 for (VectorType *&VTy : CandidateTys) {
2115 if (!VTy->getElementType()->isIntegerTy())
2116 VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
2117 VTy->getContext(), VTy->getScalarSizeInBits())));
2118 }
2119
2120 // Rank the remaining candidate vector types. This is easy because we know
2121 // they're all integer vectors. We sort by ascending number of elements.
2122 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2123 (void)DL;
2124 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2125 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2126 "Cannot have vector types of different sizes!");
2127 assert(RHSTy->getElementType()->isIntegerTy() &&
2128 "All non-integer types eliminated!");
2129 assert(LHSTy->getElementType()->isIntegerTy() &&
2130 "All non-integer types eliminated!");
2131 return cast<FixedVectorType>(RHSTy)->getNumElements() <
2132 cast<FixedVectorType>(LHSTy)->getNumElements();
2133 };
2134 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2135 (void)DL;
2136 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2137 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2138 "Cannot have vector types of different sizes!");
2139 assert(RHSTy->getElementType()->isIntegerTy() &&
2140 "All non-integer types eliminated!");
2141 assert(LHSTy->getElementType()->isIntegerTy() &&
2142 "All non-integer types eliminated!");
2143 return cast<FixedVectorType>(RHSTy)->getNumElements() ==
2144 cast<FixedVectorType>(LHSTy)->getNumElements();
2145 };
2146 llvm::sort(CandidateTys, RankVectorTypesComp);
2147 CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq),
2148 CandidateTys.end());
2149 } else {
2150// The only way to have the same element type in every vector type is to
2151// have the same vector type. Check that and remove all but one.
2152#ifndef NDEBUG
2153 for (VectorType *VTy : CandidateTys) {
2154 assert(VTy->getElementType() == CommonEltTy &&
2155 "Unaccounted for element type!");
2156 assert(VTy == CandidateTys[0] &&
2157 "Different vector types with the same element type!");
2158 }
2159#endif
2160 CandidateTys.resize(1);
2161 }
2162
2163 // FIXME: hack. Do we have a named constant for this?
2164 // SDAG SDNode can't have more than 65535 operands.
2165 llvm::erase_if(CandidateTys, [](VectorType *VTy) {
2166 return cast<FixedVectorType>(VTy)->getNumElements() >
2167 std::numeric_limits<unsigned short>::max();
2168 });
2169
2170 // Find a vector type viable for promotion by iterating over all slices.
2171 auto *VTy = llvm::find_if(CandidateTys, [&](VectorType *VTy) -> bool {
2172 uint64_t ElementSize =
2173 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2174
2175 // While the definition of LLVM vectors is bitpacked, we don't support sizes
2176 // that aren't byte sized.
2177 if (ElementSize % 8)
2178 return false;
2179 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2180 "vector size not a multiple of element size?");
2181 ElementSize /= 8;
2182
2183 for (const Slice &S : P)
2184 if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
2185 return false;
2186
2187 for (const Slice *S : P.splitSliceTails())
2188 if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
2189 return false;
2190
2191 return true;
2192 });
2193 return VTy != CandidateTys.end() ? *VTy : nullptr;
2194}
2195
2196 static VectorType *createAndCheckVectorTypesForPromotion(
2197 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2198 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2199 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2200 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2201 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2202 [[maybe_unused]] VectorType *OriginalElt =
2203 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2204 // Consider additional vector types where the element type size is a
2205 // multiple of load/store element size.
2206 for (Type *Ty : OtherTys) {
2207 if (!VectorType::isValidElementType(Ty))
2208 continue;
2209 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2210 // Make a copy of CandidateTys and iterate through it, because we
2211 // might append to CandidateTys in the loop.
2212 for (VectorType *const VTy : CandidateTysCopy) {
2213 // The elements in the copy should remain invariant throughout the loop
2214 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2215 unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
2216 unsigned ElementSize =
2217 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2218 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2219 VectorSize % TypeSize == 0) {
2220 VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
2221 CheckCandidateType(NewVTy);
2222 }
2223 }
2224 }
2225
2226 return checkVectorTypesForPromotion(
2227 P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2228 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2229}
2230
2231/// Test whether the given alloca partitioning and range of slices can be
2232/// promoted to a vector.
2233///
2234/// This is a quick test to check whether we can rewrite a particular alloca
2235/// partition (and its newly formed alloca) into a vector alloca with only
2236/// whole-vector loads and stores such that it could be promoted to a vector
2237/// SSA value. We only can ensure this for a limited set of operations, and we
2238/// don't want to do the rewrites unless we are confident that the result will
2239/// be promotable, so we have an early test here.
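/// The shape this is looking for, roughly (illustrative IR, names invented):
///   %a = alloca <4 x float>
///   store <4 x float> %v, ptr %a
///   %p = getelementptr inbounds float, ptr %a, i64 2
///   %x = load float, ptr %p
/// Every access maps onto <4 x float> as either the whole vector or a single
/// element, so the partition can be rewritten with whole-vector loads/stores
/// plus extractelement/insertelement and then promoted to an SSA value.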
2240 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2241 unsigned VScale) {
2242 // Collect the candidate types for vector-based promotion. Also track whether
2243 // we have different element types.
2244 SmallVector<VectorType *, 4> CandidateTys;
2245 SetVector<Type *> LoadStoreTys;
2246 SetVector<Type *> DeferredTys;
2247 Type *CommonEltTy = nullptr;
2248 VectorType *CommonVecPtrTy = nullptr;
2249 bool HaveVecPtrTy = false;
2250 bool HaveCommonEltTy = true;
2251 bool HaveCommonVecPtrTy = true;
2252 auto CheckCandidateType = [&](Type *Ty) {
2253 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2254 // Bail out if this candidate's total size in bits differs from the others'.
2255 if (!CandidateTys.empty()) {
2256 VectorType *V = CandidateTys[0];
2257 if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
2258 DL.getTypeSizeInBits(V).getFixedValue()) {
2259 CandidateTys.clear();
2260 return;
2261 }
2262 }
2263 CandidateTys.push_back(VTy);
2264 Type *EltTy = VTy->getElementType();
2265
2266 if (!CommonEltTy)
2267 CommonEltTy = EltTy;
2268 else if (CommonEltTy != EltTy)
2269 HaveCommonEltTy = false;
2270
2271 if (EltTy->isPointerTy()) {
2272 HaveVecPtrTy = true;
2273 if (!CommonVecPtrTy)
2274 CommonVecPtrTy = VTy;
2275 else if (CommonVecPtrTy != VTy)
2276 HaveCommonVecPtrTy = false;
2277 }
2278 }
2279 };
2280
2281 // Put load and store types into a set for de-duplication.
2282 for (const Slice &S : P) {
2283 Type *Ty;
2284 if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
2285 Ty = LI->getType();
2286 else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
2287 Ty = SI->getValueOperand()->getType();
2288 else
2289 continue;
2290
2291 auto CandTy = Ty->getScalarType();
2292 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2293 S.endOffset() != P.endOffset())) {
2294 DeferredTys.insert(Ty);
2295 continue;
2296 }
2297
2298 LoadStoreTys.insert(Ty);
2299 // Consider any loads or stores that are the exact size of the slice.
2300 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2301 CheckCandidateType(Ty);
2302 }
2303
2304 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2305 if (auto *VTy = createAndCheckVectorTypesForPromotion(
2306 LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2307 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2308 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2309 return VTy;
2310
2311 CandidateTys.clear();
2312 return createAndCheckVectorTypesForPromotion(
2313 DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2314 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2315 CommonVecPtrTy, VScale);
2316}
2317
2318/// Test whether a slice of an alloca is valid for integer widening.
2319///
2320/// This implements the necessary checking for the \c isIntegerWideningViable
2321/// test below on a single slice of the alloca.
2322static bool isIntegerWideningViableForSlice(const Slice &S,
2323 uint64_t AllocBeginOffset,
2324 Type *AllocaTy,
2325 const DataLayout &DL,
2326 bool &WholeAllocaOp) {
2327 uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();
2328
2329 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2330 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2331
2332 Use *U = S.getUse();
2333
2334 // Lifetime intrinsics operate over the whole alloca, whose size is usually
2335 // larger than other load/store slices (RelEnd > Size). But lifetime
2336 // intrinsics are always promotable and should not impact the promotability
2337 // of the other slices in the partition.
2338 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2339 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2340 return true;
2341 }
2342
2343 // We can't reasonably handle cases where the load or store extends past
2344 // the end of the alloca's type and into its padding.
2345 if (RelEnd > Size)
2346 return false;
2347
2348 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2349 if (LI->isVolatile())
2350 return false;
2351 // We can't handle loads that extend past the allocated memory.
2352 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
2353 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2354 return false;
2355 // So far, AllocaSliceRewriter does not support widening split slice tails
2356 // in rewriteIntegerLoad.
2357 if (S.beginOffset() < AllocBeginOffset)
2358 return false;
2359 // Note that we don't count vector loads or stores as whole-alloca
2360 // operations which enable integer widening because we would prefer to use
2361 // vector widening instead.
2362 if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2363 WholeAllocaOp = true;
2364 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2365 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2366 return false;
2367 } else if (RelBegin != 0 || RelEnd != Size ||
2368 !canConvertValue(DL, AllocaTy, LI->getType())) {
2369 // Non-integer loads need to be convertible from the alloca type so that
2370 // they are promotable.
2371 return false;
2372 }
2373 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2374 Type *ValueTy = SI->getValueOperand()->getType();
2375 if (SI->isVolatile())
2376 return false;
2377 // We can't handle stores that extend past the allocated memory.
2378 TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
2379 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2380 return false;
2381 // So far, AllocaSliceRewriter does not support widening split slice tails
2382 // in rewriteIntegerStore.
2383 if (S.beginOffset() < AllocBeginOffset)
2384 return false;
2385 // Note that we don't count vector loads or stores as whole-alloca
2386 // operations which enable integer widening because we would prefer to use
2387 // vector widening instead.
2388 if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2389 WholeAllocaOp = true;
2390 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2391 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2392 return false;
2393 } else if (RelBegin != 0 || RelEnd != Size ||
2394 !canConvertValue(DL, ValueTy, AllocaTy)) {
2395 // Non-integer stores need to be convertible to the alloca type so that
2396 // they are promotable.
2397 return false;
2398 }
2399 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2400 if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2401 return false;
2402 if (!S.isSplittable())
2403 return false; // Skip any unsplittable intrinsics.
2404 } else {
2405 return false;
2406 }
2407
2408 return true;
2409}
2410
2411/// Test whether the given alloca partition's integer operations can be
2412/// widened to promotable ones.
2413///
2414/// This is a quick test to check whether we can rewrite the integer loads and
2415/// stores to a particular alloca into wider loads and stores and be able to
2416/// promote the resulting alloca.
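/// Roughly the pattern being enabled (illustrative IR, names invented):
///   %a = alloca i64
///   store i64 %whole, ptr %a           ; covering store
///   %p = getelementptr inbounds i8, ptr %a, i64 4
///   %hi = load i32, ptr %p             ; narrow load of bytes [4,8)
/// The narrow load can be rewritten as a whole-alloca i64 load followed by a
/// shift and truncate, after which the alloca is promotable.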
2417static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2418 const DataLayout &DL) {
2419 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
2420 // Don't create integer types larger than the maximum bitwidth.
2421 if (SizeInBits > IntegerType::MAX_INT_BITS)
2422 return false;
2423
2424 // Don't try to handle allocas with bit-padding.
2425 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
2426 return false;
2427
2428 // We need to ensure that an integer type with the appropriate bitwidth can
2429 // be converted to the alloca type, whatever that is. We don't want to force
2430 // the alloca itself to have an integer type if there is a more suitable one.
2431 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2432 if (!canConvertValue(DL, AllocaTy, IntTy) ||
2433 !canConvertValue(DL, IntTy, AllocaTy))
2434 return false;
2435
2436 // While examining uses, we ensure that the alloca has a covering load or
2437 // store. We don't want to widen the integer operations only to fail to
2438 // promote due to some other unsplittable entry (which we may make splittable
2439 // later). However, if there are only splittable uses, go ahead and assume
2440 // that we cover the alloca.
2441 // FIXME: We shouldn't consider split slices that happen to start in the
2442 // partition here...
2443 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2444
2445 for (const Slice &S : P)
2446 if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2447 WholeAllocaOp))
2448 return false;
2449
2450 for (const Slice *S : P.splitSliceTails())
2451 if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2452 WholeAllocaOp))
2453 return false;
2454
2455 return WholeAllocaOp;
2456}
2457
2458static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2459 IntegerType *Ty, uint64_t Offset,
2460 const Twine &Name) {
2461 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2462 IntegerType *IntTy = cast<IntegerType>(V->getType());
2463 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2464 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2465 "Element extends past full value");
2466 uint64_t ShAmt = 8 * Offset;
2467 if (DL.isBigEndian())
2468 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2469 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2470 if (ShAmt) {
2471 V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2472 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2473 }
2474 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2475 "Cannot extract to a larger integer!");
2476 if (Ty != IntTy) {
2477 V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2478 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2479 }
2480 return V;
2481}
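// Worked example for the endian-aware shift above (a sketch): extracting an
// i16 at byte offset 2 from an i64 uses ShAmt = 16 on little-endian targets
// and ShAmt = 8 * (8 - 2 - 2) = 32 on big-endian targets, followed by a trunc
// to i16. insertInteger below mirrors this with a zext, shl, and mask.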
2482
2483static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2484 Value *V, uint64_t Offset, const Twine &Name) {
2485 IntegerType *IntTy = cast<IntegerType>(Old->getType());
2486 IntegerType *Ty = cast<IntegerType>(V->getType());
2487 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2488 "Cannot insert a larger integer!");
2489 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2490 if (Ty != IntTy) {
2491 V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2492 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2493 }
2494 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2495 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2496 "Element store outside of alloca store");
2497 uint64_t ShAmt = 8 * Offset;
2498 if (DL.isBigEndian())
2499 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2500 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2501 if (ShAmt) {
2502 V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2503 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2504 }
2505
2506 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2507 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2508 Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2509 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2510 V = IRB.CreateOr(Old, V, Name + ".insert");
2511 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2512 }
2513 return V;
2514}
2515
2516static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2517 unsigned EndIndex, const Twine &Name) {
2518 auto *VecTy = cast<FixedVectorType>(V->getType());
2519 unsigned NumElements = EndIndex - BeginIndex;
2520 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2521
2522 if (NumElements == VecTy->getNumElements())
2523 return V;
2524
2525 if (NumElements == 1) {
2526 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2527 Name + ".extract");
2528 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2529 return V;
2530 }
2531
2532 auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2533 V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2534 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2535 return V;
2536}
2537
2538static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2539 unsigned BeginIndex, const Twine &Name) {
2540 VectorType *VecTy = cast<VectorType>(Old->getType());
2541 assert(VecTy && "Can only insert a vector into a vector");
2542
2543 VectorType *Ty = dyn_cast<VectorType>(V->getType());
2544 if (!Ty) {
2545 // Single element to insert.
2546 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2547 Name + ".insert");
2548 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2549 return V;
2550 }
2551
2552 unsigned NumSubElements = cast<FixedVectorType>(Ty)->getNumElements();
2553 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
2554
2555 assert(NumSubElements <= NumElements && "Too many elements!");
2556 if (NumSubElements == NumElements) {
2557 assert(V->getType() == VecTy && "Vector type mismatch");
2558 return V;
2559 }
2560 unsigned EndIndex = BeginIndex + NumSubElements;
2561
2562 // When inserting a smaller vector into the larger one to store, we first
2563 // use a shuffle vector to widen it with undef elements, and then
2564 // a second shuffle vector to select between the loaded vector and the
2565 // incoming vector.
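// Worked example (a sketch): inserting a <2 x i32> at BeginIndex 1 of a
// <4 x i32> uses a first mask of <-1, 0, 1, -1> to widen the incoming value
// and a second mask of <4, 1, 2, 7> to blend it with Old, producing
// <old0, new0, new1, old3>.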
2566 SmallVector<int, 8> Mask;
2567 Mask.reserve(NumElements);
2568 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2569 if (Idx >= BeginIndex && Idx < EndIndex)
2570 Mask.push_back(Idx - BeginIndex);
2571 else
2572 Mask.push_back(-1);
2573 V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2574 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2575
2576 Mask.clear();
2577 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2578 if (Idx >= BeginIndex && Idx < EndIndex)
2579 Mask.push_back(Idx);
2580 else
2581 Mask.push_back(Idx + NumElements);
2582 V = IRB.CreateShuffleVector(V, Old, Mask, Name + "blend");
2583 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2584 return V;
2585}
2586
2587/// This function takes two vector values and combines them into a single vector
2588/// by concatenating their elements. The function handles:
2589///
2590/// 1. Element type mismatch: If either vector's element type differs from
2591/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while
2592/// preserving the total bit width (adjusting the number of elements
2593/// accordingly).
2594///
2595/// 2. Size mismatch: After transforming the vectors to have the desired element
2596/// type, if the two vectors have different numbers of elements, the smaller
2597/// vector is extended with poison values to match the size of the larger
2598/// vector before concatenation.
2599///
2600/// 3. Concatenation: The vectors are merged using a shuffle operation that
2601/// places all elements of V0 first, followed by all elements of V1.
2602///
2603/// \param V0 The first vector to merge (must be a vector type)
2604/// \param V1 The second vector to merge (must be a vector type)
2605/// \param DL The data layout for size calculations
2606/// \param NewAIEltTy The desired element type for the result vector
2607/// \param Builder IRBuilder for creating new instructions
2608/// \return A new vector containing all elements from V0 followed by all
2609/// elements from V1
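/// A sketch of the common case (illustrative IR, names invented): merging two
/// <2 x float> values with a float element type produces
///   %merged = shufflevector <2 x float> %v0, <2 x float> %v1,
///                           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// When the element types differ from NewAIEltTy, bitcasts are emitted first;
/// when the element counts differ, the smaller operand is widened with poison
/// lanes before the final shuffle.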
2610 static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL,
2611 Type *NewAIEltTy, IRBuilder<> &Builder) {
2612 // V0 and V1 are vectors
2613 // Create a new vector type with combined elements
2614 // Use ShuffleVector to concatenate the vectors
2615 auto *VecType0 = cast<FixedVectorType>(V0->getType());
2616 auto *VecType1 = cast<FixedVectorType>(V1->getType());
2617
2618 // If V0/V1 element types are different from NewAllocaElementType,
2619 // we need to introduce bitcasts before merging them
2620 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2621 const char *DebugName) {
2622 Type *EltType = VecType->getElementType();
2623 if (EltType != NewAIEltTy) {
2624 // Calculate new number of elements to maintain same bit width
2625 unsigned TotalBits =
2626 VecType->getNumElements() * DL.getTypeSizeInBits(EltType);
2627 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
2628
2629 auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts);
2630 V = Builder.CreateBitCast(V, NewVecType);
2631 VecType = NewVecType;
2632 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2633 }
2634 };
2635
2636 BitcastIfNeeded(V0, VecType0, "V0");
2637 BitcastIfNeeded(V1, VecType1, "V1");
2638
2639 unsigned NumElts0 = VecType0->getNumElements();
2640 unsigned NumElts1 = VecType1->getNumElements();
2641
2642 SmallVector<int, 16> ShuffleMask;
2643
2644 if (NumElts0 == NumElts1) {
2645 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2646 ShuffleMask.push_back(i);
2647 } else {
2648 // If two vectors have different sizes, we need to extend
2649 // the smaller vector to the size of the larger vector.
2650 unsigned SmallSize = std::min(NumElts0, NumElts1);
2651 unsigned LargeSize = std::max(NumElts0, NumElts1);
2652 bool IsV0Smaller = NumElts0 < NumElts1;
2653 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2654 SmallVector<int, 16> ExtendMask;
2655 for (unsigned i = 0; i < SmallSize; ++i)
2656 ExtendMask.push_back(i);
2657 for (unsigned i = SmallSize; i < LargeSize; ++i)
2658 ExtendMask.push_back(PoisonMaskElem);
2659 ExtendedVec = Builder.CreateShuffleVector(
2660 ExtendedVec, PoisonValue::get(ExtendedVec->getType()), ExtendMask);
2661 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2662 for (unsigned i = 0; i < NumElts0; ++i)
2663 ShuffleMask.push_back(i);
2664 for (unsigned i = 0; i < NumElts1; ++i)
2665 ShuffleMask.push_back(LargeSize + i);
2666 }
2667
2668 return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2669}
2670
2671namespace {
2672
2673 /// Visitor to rewrite instructions using a particular slice of an alloca
2674/// to use a new alloca.
2675///
2676/// Also implements the rewriting to vector-based accesses when the partition
2677/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2678/// lives here.
2679class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2680 // Befriend the base class so it can delegate to private visit methods.
2681 friend class InstVisitor<AllocaSliceRewriter, bool>;
2682
2683 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2684
2685 const DataLayout &DL;
2686 AllocaSlices &AS;
2687 SROA &Pass;
2688 AllocaInst &OldAI, &NewAI;
2689 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2690 Type *NewAllocaTy;
2691
2692 // This is a convenience and flag variable that will be null unless the new
2693 // alloca's integer operations should be widened to this integer type due to
2694 // passing isIntegerWideningViable above. If it is non-null, the desired
2695 // integer type will be stored here for easy access during rewriting.
2696 IntegerType *IntTy;
2697
2698 // If we are rewriting an alloca partition which can be written as pure
2699 // vector operations, we stash extra information here. When VecTy is
2700 // non-null, we have some strict guarantees about the rewritten alloca:
2701 // - The new alloca is exactly the size of the vector type here.
2702 // - The accesses all either map to the entire vector or to a single
2703 // element.
2704 // - The set of accessing instructions is only one of those handled above
2705 // in isVectorPromotionViable. Generally these are the same access kinds
2706 // which are promotable via mem2reg.
2707 VectorType *VecTy;
2708 Type *ElementTy;
2709 uint64_t ElementSize;
2710
2711 // The original offset of the slice currently being rewritten relative to
2712 // the original alloca.
2713 uint64_t BeginOffset = 0;
2714 uint64_t EndOffset = 0;
2715
2716 // The new offsets of the slice currently being rewritten relative to the
2717 // original alloca.
2718 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2719
2720 uint64_t SliceSize = 0;
2721 bool IsSplittable = false;
2722 bool IsSplit = false;
2723 Use *OldUse = nullptr;
2724 Instruction *OldPtr = nullptr;
2725
2726 // Track post-rewrite users which are PHI nodes and Selects.
2727 SmallSetVector<PHINode *, 8> &PHIUsers;
2728 SmallSetVector<SelectInst *, 8> &SelectUsers;
2729
2730 // Utility IR builder, whose name prefix is setup for each visited use, and
2731 // the insertion point is set to point to the user.
2732 IRBuilderTy IRB;
2733
2734 // Return the new alloca, addrspacecasted if required to avoid changing the
2735 // addrspace of a volatile access.
2736 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2737 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2738 return &NewAI;
2739
2740 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2741 return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
2742 }
2743
2744public:
2745 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2746 AllocaInst &OldAI, AllocaInst &NewAI, Type *NewAllocaTy,
2747 uint64_t NewAllocaBeginOffset,
2748 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2749 VectorType *PromotableVecTy,
2750 SmallSetVector<PHINode *, 8> &PHIUsers,
2751 SmallSetVector<SelectInst *, 8> &SelectUsers)
2752 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2753 NewAllocaBeginOffset(NewAllocaBeginOffset),
2754 NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAllocaTy),
2755 IntTy(IsIntegerPromotable
2756 ? Type::getIntNTy(
2757 NewAI.getContext(),
2758 DL.getTypeSizeInBits(NewAllocaTy).getFixedValue())
2759 : nullptr),
2760 VecTy(PromotableVecTy),
2761 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2762 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
2763 : 0),
2764 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2765 IRB(NewAI.getContext(), ConstantFolder()) {
2766 if (VecTy) {
2767 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2768 "Only multiple-of-8 sized vector elements are viable");
2769 ++NumVectorized;
2770 }
2771 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2772 }
2773
2774 bool visit(AllocaSlices::const_iterator I) {
2775 bool CanSROA = true;
2776 BeginOffset = I->beginOffset();
2777 EndOffset = I->endOffset();
2778 IsSplittable = I->isSplittable();
2779 IsSplit =
2780 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2781 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2782 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2783 LLVM_DEBUG(dbgs() << "\n");
2784
2785 // Compute the intersecting offset range.
2786 assert(BeginOffset < NewAllocaEndOffset);
2787 assert(EndOffset > NewAllocaBeginOffset);
2788 NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2789 NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2790
2791 SliceSize = NewEndOffset - NewBeginOffset;
2792 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2793 << ") NewBegin:(" << NewBeginOffset << ", "
2794 << NewEndOffset << ") NewAllocaBegin:("
2795 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2796 << ")\n");
2797 assert(IsSplit || NewBeginOffset == BeginOffset);
2798 OldUse = I->getUse();
2799 OldPtr = cast<Instruction>(OldUse->get());
2800
2801 Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2802 IRB.SetInsertPoint(OldUserI);
2803 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2804 // Avoid materializing the name prefix when it is discarded anyway.
2805 if (!IRB.getContext().shouldDiscardValueNames())
2806 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2807 Twine(BeginOffset) + ".");
2808
2809 CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2810 if (VecTy || IntTy)
2811 assert(CanSROA);
2812 return CanSROA;
2813 }
2814
2815 /// Attempts to rewrite a partition using tree-structured merge optimization.
2816 ///
2817 /// This function analyzes a partition to determine if it can be optimized
2818 /// using a tree-structured merge pattern, where multiple non-overlapping
2819 /// stores completely fill an alloca and there is no load from the alloca in
2820 /// the middle of the stores. Such patterns can be optimized by eliminating
2821 /// the intermediate stores and directly constructing the final vector by
2822 /// using shufflevectors.
2823 ///
2824 /// Example transformation:
2825 /// Before: (stores do not have to be in order)
2826 /// %alloca = alloca <8 x float>
2827 /// store <2 x float> %val0, ptr %alloca ; offset 0-1
2828 /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2829 /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2830 /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2831 ///
2832 /// After:
2833 /// %alloca = alloca <8 x float>
2834 /// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
2835 /// i32 3>
2836 /// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
2837 /// i32 3>
2838 /// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
2839 /// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2840 /// store %shuffle2, ptr %alloca
2841 ///
2842 /// The optimization looks for partitions that:
2843 /// 1. Have no overlapping split slice tails
2844 /// 2. Contain non-overlapping stores that cover the entire alloca
2845 /// 3. Have exactly one load that reads the complete alloca structure and is
2846 /// not in the middle of the stores (TODO: maybe we can relax the constraint
2847 /// about reading the entire alloca structure)
2848 ///
2849 /// \param P The partition to analyze and potentially rewrite
2850 /// \return An optional vector of values that were deleted during the rewrite
2851 /// process, or std::nullopt if the partition cannot be optimized
2852 /// using tree-structured merge
2853 std::optional<SmallVector<Value *, 4>>
2854 rewriteTreeStructuredMerge(Partition &P) {
2855 // No tail slices that overlap with the partition
2856 if (P.splitSliceTails().size() > 0)
2857 return std::nullopt;
2858
2859 SmallVector<Value *, 4> DeletedValues;
2860 LoadInst *TheLoad = nullptr;
2861
2862 // Structure to hold store information
2863 struct StoreInfo {
2864 StoreInst *Store;
2865 uint64_t BeginOffset;
2866 uint64_t EndOffset;
2867 Value *StoredValue;
2868 StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
2869 : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
2870 };
2871
2872 SmallVector<StoreInfo, 4> StoreInfos;
2873
2874 // If the new alloca is a fixed vector type, we use its element type as the
2875 // allocated element type; otherwise we use i8 as the allocated element type.
2876 Type *AllocatedEltTy =
2877 isa<FixedVectorType>(NewAllocaTy)
2878 ? cast<FixedVectorType>(NewAllocaTy)->getElementType()
2879 : Type::getInt8Ty(NewAI.getContext());
2880 unsigned AllocatedEltTySize = DL.getTypeSizeInBits(AllocatedEltTy);
2881
2882 // Helper to check that a type is:
2883 // 1. A fixed vector type
2884 // 2. Its element type is not a pointer
2885 // 3. Its element type size is byte-aligned
2886 // We only handle loads/stores whose types meet these conditions.
2887 auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
2888 auto *FixedVecTy = dyn_cast<FixedVectorType>(Ty);
2889 return FixedVecTy &&
2890 DL.getTypeSizeInBits(FixedVecTy->getElementType()) % 8 == 0 &&
2891 !FixedVecTy->getElementType()->isPointerTy();
2892 };
2893
2894 for (Slice &S : P) {
2895 auto *User = cast<Instruction>(S.getUse()->getUser());
2896 if (auto *LI = dyn_cast<LoadInst>(User)) {
2897 // Do not handle the case if
2898 // 1. There is more than one load
2899 // 2. The load is volatile
2900 // 3. The load does not read the entire alloca structure
2901 // 4. The load does not meet the conditions in the helper function
2902 if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) ||
2903 S.beginOffset() != NewAllocaBeginOffset ||
2904 S.endOffset() != NewAllocaEndOffset || LI->isVolatile())
2905 return std::nullopt;
2906 TheLoad = LI;
2907 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
2908 // Do not handle the case if
2909 // 1. The store does not meet the conditions in the helper function
2910 // 2. The store is volatile
2911 // 3. The total store size is not a multiple of the allocated element
2912 // type size
2913 if (!IsTypeValidForTreeStructuredMerge(
2914 SI->getValueOperand()->getType()) ||
2915 SI->isVolatile())
2916 return std::nullopt;
2917 auto *VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
2918 unsigned NumElts = VecTy->getNumElements();
2919 unsigned EltSize = DL.getTypeSizeInBits(VecTy->getElementType());
2920 if (NumElts * EltSize % AllocatedEltTySize != 0)
2921 return std::nullopt;
2922 StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
2923 SI->getValueOperand());
2924 } else {
2925 // If we have instructions other than load and store, we cannot do the
2926 // tree structured merge
2927 return std::nullopt;
2928 }
2929 }
2930 // If we do not have any load, we cannot do the tree structured merge
2931 if (!TheLoad)
2932 return std::nullopt;
2933
2934 // If we do not have multiple stores, we cannot do the tree structured merge
2935 if (StoreInfos.size() < 2)
2936 return std::nullopt;
2937
2938 // Stores should not overlap and should cover the whole alloca
2939 // Sort by begin offset
2940 llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
2941 return A.BeginOffset < B.BeginOffset;
2942 });
2943
2944 // Check for overlaps and coverage
2945 uint64_t ExpectedStart = NewAllocaBeginOffset;
2946 for (auto &StoreInfo : StoreInfos) {
2947 uint64_t BeginOff = StoreInfo.BeginOffset;
2948 uint64_t EndOff = StoreInfo.EndOffset;
2949
2950 // Check for gap or overlap
2951 if (BeginOff != ExpectedStart)
2952 return std::nullopt;
2953
2954 ExpectedStart = EndOff;
2955 }
2956 // Check that stores cover the entire alloca
2957 if (ExpectedStart != NewAllocaEndOffset)
2958 return std::nullopt;
2959
2960 // Stores should be in the same basic block
2961 // The load should not be in the middle of the stores
2962 // Note:
2963   // If the load is in a different basic block from the stores, we can still
2964 // do the tree structured merge. This is because we do not have the
2965 // store->load forwarding here. The merged vector will be stored back to
2966 // NewAI and the new load will load from NewAI. The forwarding will be
2967 // handled later when we try to promote NewAI.
2968 BasicBlock *LoadBB = TheLoad->getParent();
2969 BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
2970
2971 for (auto &StoreInfo : StoreInfos) {
2972 if (StoreInfo.Store->getParent() != StoreBB)
2973 return std::nullopt;
2974 if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(TheLoad))
2975 return std::nullopt;
2976 }
2977
2978 // If we reach here, the partition can be merged with a tree structured
2979 // merge
2980 LLVM_DEBUG({
2981 dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
2982 << "\n Ordered stores:\n";
2983 for (auto [i, Info] : enumerate(StoreInfos))
2984 dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
2985 << Info.EndOffset << ") \tStore: " << *Info.Store
2986 << "\tValue: " << *Info.StoredValue << "\n";
2987 });
2988
2989 // Instead of having these stores, we merge all the stored values into a
2990 // vector and store the merged value into the alloca
2991 std::queue<Value *> VecElements;
2992 // StoreInfos is sorted by offset, not by block order. Anchoring to
2993 // StoreInfos.back().Store (last by offset) can place shuffles before
2994 // operands that appear later in the block (invalid SSA). Insert before
2995 // TheLoad when it shares the store block (after all stores, before any
2996 // later IR in that block). Otherwise insert before the store block's
2997 // terminator so the merge runs after every store and any trailing
2998 // instructions in that block.
2999 IRBuilder<> Builder(LoadBB == StoreBB ? TheLoad : StoreBB->getTerminator());
3000 for (const auto &Info : StoreInfos) {
3001 DeletedValues.push_back(Info.Store);
3002 VecElements.push(Info.StoredValue);
3003 }
3004
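  // Rough sketch (assuming mergeTwoVectors, defined earlier, concatenates its
  // two operands into one wider vector): four stores of <2 x float> at offsets
  // 0, 8, 16 and 24 are first merged pairwise into two <4 x float> values,
  // which are then merged into a single <8 x float> that is stored once into
  // the new alloca.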
3005 LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
3006 while (VecElements.size() > 1) {
3007 const auto NumElts = VecElements.size();
3008 for ([[maybe_unused]] const auto _ : llvm::seq(NumElts / 2)) {
3009 Value *V0 = VecElements.front();
3010 VecElements.pop();
3011 Value *V1 = VecElements.front();
3012 VecElements.pop();
3013 Value *Merged = mergeTwoVectors(V0, V1, DL, AllocatedEltTy, Builder);
3014 LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
3015 VecElements.push(Merged);
3016 }
3017 if (NumElts % 2 == 1) {
3018 Value *V = VecElements.front();
3019 VecElements.pop();
3020 VecElements.push(V);
3021 }
3022 }
3023
3024 // Store the merged value into the alloca
3025 Value *MergedValue = VecElements.front();
3026 Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());
3027
3028 IRBuilder<> LoadBuilder(TheLoad);
3029 TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
3030 TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
3031 TheLoad->getName() + ".sroa.new.load"));
3032 DeletedValues.push_back(TheLoad);
3033
3034 return DeletedValues;
3035 }
3036
3037private:
3038 // Make sure the other visit overloads are visible.
3039 using Base::visit;
3040
3041 // Every instruction which can end up as a user must have a rewrite rule.
3042 bool visitInstruction(Instruction &I) {
3043 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
3044 llvm_unreachable("No rewrite rule for this instruction!");
3045 }
3046
3047 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3048 // Note that the offset computation can use BeginOffset or NewBeginOffset
3049 // interchangeably for unsplit slices.
3050 assert(IsSplit || BeginOffset == NewBeginOffset);
3051 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3052
3053 StringRef OldName = OldPtr->getName();
3054 // Skip through the last '.sroa.' component of the name.
3055 size_t LastSROAPrefix = OldName.rfind(".sroa.");
3056 if (LastSROAPrefix != StringRef::npos) {
3057 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
3058 // Look for an SROA slice index.
3059 size_t IndexEnd = OldName.find_first_not_of("0123456789");
3060 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3061 // Strip the index and look for the offset.
3062 OldName = OldName.substr(IndexEnd + 1);
3063 size_t OffsetEnd = OldName.find_first_not_of("0123456789");
3064 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3065 // Strip the offset.
3066 OldName = OldName.substr(OffsetEnd + 1);
3067 }
3068 }
3069 // Strip any SROA suffixes as well.
3070 OldName = OldName.substr(0, OldName.find(".sroa_"));
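    // E.g. an old pointer named "x.sroa.1.8.copyload" would typically be
    // reduced to "copyload", which is then used (with a trailing '.') as the
    // name prefix for the adjusted pointer built below.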
3071
3072 return getAdjustedPtr(IRB, DL, &NewAI,
3073 APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
3074 PointerTy, Twine(OldName) + ".");
3075 }
3076
3077 /// Compute suitable alignment to access this slice of the *new*
3078 /// alloca.
3079 ///
3080   /// The result is simply the common alignment of the new alloca's alignment
3081   /// and this slice's offset within it.
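  /// For example, a 16-byte-aligned new alloca accessed at slice offset 4
  /// yields Align(4).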
3082 Align getSliceAlign() {
3083 return commonAlignment(NewAI.getAlign(),
3084 NewBeginOffset - NewAllocaBeginOffset);
3085 }
3086
3087 unsigned getIndex(uint64_t Offset) {
3088 assert(VecTy && "Can only call getIndex when rewriting a vector");
3089 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3090 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3091 uint32_t Index = RelOffset / ElementSize;
3092 assert(Index * ElementSize == RelOffset);
3093 return Index;
3094 }
3095
3096 void deleteIfTriviallyDead(Value *V) {
3097     Instruction *I = cast<Instruction>(V);
3098     if (isInstructionTriviallyDead(I))
3099       Pass.DeadInsts.push_back(I);
3100 }
3101
3102 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3103 unsigned BeginIndex = getIndex(NewBeginOffset);
3104 unsigned EndIndex = getIndex(NewEndOffset);
3105 assert(EndIndex > BeginIndex && "Empty vector!");
3106
3107 LoadInst *Load =
3108 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3109
3110 Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3111 LLVMContext::MD_access_group});
3112 return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
3113 }
3114
3115 Value *rewriteIntegerLoad(LoadInst &LI) {
3116 assert(IntTy && "We cannot insert an integer to the alloca");
3117 assert(!LI.isVolatile());
3118 Value *V =
3119 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3120 V = IRB.CreateBitPreservingCastChain(DL, V, IntTy);
3121 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3122 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3123 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
3124 IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
3125 V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
3126 }
3127 // It is possible that the extracted type is not the load type. This
3128 // happens if there is a load past the end of the alloca, and as
3129 // a consequence the slice is narrower but still a candidate for integer
3130 // lowering. To handle this case, we just zero extend the extracted
3131 // integer.
3132 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
3133 "Can only handle an extract for an overly wide load");
3134 if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
3135 V = IRB.CreateZExt(V, LI.getType());
3136 return V;
3137 }
3138
3139 bool visitLoadInst(LoadInst &LI) {
3140 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3141 Value *OldOp = LI.getOperand(0);
3142 assert(OldOp == OldPtr);
3143
3144 AAMDNodes AATags = LI.getAAMetadata();
3145
3146 unsigned AS = LI.getPointerAddressSpace();
3147
3148 Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
3149 : LI.getType();
3150 bool IsPtrAdjusted = false;
3151 Value *V;
3152 if (VecTy) {
3153 V = rewriteVectorizedLoadInst(LI);
3154 } else if (IntTy && LI.getType()->isIntegerTy()) {
3155 V = rewriteIntegerLoad(LI);
3156 } else if (NewBeginOffset == NewAllocaBeginOffset &&
3157 NewEndOffset == NewAllocaEndOffset &&
3158 (canConvertValue(DL, NewAllocaTy, TargetTy) ||
3159 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
3160 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
3161 !LI.isVolatile()))) {
3162 Value *NewPtr =
3163 getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
3164 LoadInst *NewLI = IRB.CreateAlignedLoad(
3165 NewAllocaTy, NewPtr, NewAI.getAlign(), LI.isVolatile(), LI.getName());
3166 if (LI.isVolatile())
3167 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3168 if (NewLI->isAtomic())
3169 NewLI->setAlignment(LI.getAlign());
3170
3171 // Copy any metadata that is valid for the new load. This may require
3172 // conversion to a different kind of metadata, e.g. !nonnull might change
3173 // to !range or vice versa.
3174 copyMetadataForLoad(*NewLI, LI);
3175
3176 // Do this after copyMetadataForLoad() to preserve the TBAA shift.
3177 if (AATags)
3178 NewLI->setAAMetadata(AATags.adjustForAccess(
3179 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3180
3181 // Try to preserve nonnull metadata
3182 V = NewLI;
3183
3184 // If this is an integer load past the end of the slice (which means the
3185 // bytes outside the slice are undef or this load is dead) just forcibly
3186 // fix the integer size with correct handling of endianness.
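    // E.g. with an i16 new alloca type and an i32 target type on a big-endian
    // target, the loaded value is zero-extended and then shifted left by 16 so
    // the defined bytes keep their original (most significant) positions.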
3187 if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
3188 if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
3189 if (AITy->getBitWidth() < TITy->getBitWidth()) {
3190 V = IRB.CreateZExt(V, TITy, "load.ext");
3191 if (DL.isBigEndian())
3192 V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
3193 "endian_shift");
3194 }
3195 } else {
3196 Type *LTy = IRB.getPtrTy(AS);
3197 LoadInst *NewLI =
3198 IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
3199 getSliceAlign(), LI.isVolatile(), LI.getName());
3200
3201 if (AATags)
3202 NewLI->setAAMetadata(AATags.adjustForAccess(
3203 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3204
3205 if (LI.isVolatile())
3206 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3207 NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3208 LLVMContext::MD_access_group});
3209
3210 V = NewLI;
3211 IsPtrAdjusted = true;
3212 }
3213 V = IRB.CreateBitPreservingCastChain(DL, V, TargetTy);
3214
3215 if (IsSplit) {
3216 assert(!LI.isVolatile());
3217 assert(LI.getType()->isIntegerTy() &&
3218 "Only integer type loads and stores are split");
3219 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3220 "Split load isn't smaller than original load");
3221 assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3222 "Non-byte-multiple bit width");
3223 // Move the insertion point just past the load so that we can refer to it.
3224 BasicBlock::iterator LIIt = std::next(LI.getIterator());
3225 // Ensure the insertion point comes before any debug-info immediately
3226 // after the load, so that variable values referring to the load are
3227 // dominated by it.
3228 LIIt.setHeadBit(true);
3229 IRB.SetInsertPoint(LI.getParent(), LIIt);
3230 // Create a placeholder value with the same type as LI to use as the
3231 // basis for the new value. This allows us to replace the uses of LI with
3232 // the computed value, and then replace the placeholder with LI, leaving
3233 // LI only used for this computation.
3234 Value *Placeholder =
3235 new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
3236 false, Align(1));
3237 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
3238 "insert");
3239 LI.replaceAllUsesWith(V);
3240 Placeholder->replaceAllUsesWith(&LI);
3241 Placeholder->deleteValue();
3242 } else {
3243 LI.replaceAllUsesWith(V);
3244 }
3245
3246 Pass.DeadInsts.push_back(&LI);
3247 deleteIfTriviallyDead(OldOp);
3248 LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3249 return !LI.isVolatile() && !IsPtrAdjusted;
3250 }
3251
3252 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3253 AAMDNodes AATags) {
3254 // Capture V for the purpose of debug-info accounting once it's converted
3255 // to a vector store.
3256 Value *OrigV = V;
3257 if (V->getType() != VecTy) {
3258 unsigned BeginIndex = getIndex(NewBeginOffset);
3259 unsigned EndIndex = getIndex(NewEndOffset);
3260 assert(EndIndex > BeginIndex && "Empty vector!");
3261 unsigned NumElements = EndIndex - BeginIndex;
3262 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3263 "Too many elements!");
3264 Type *SliceTy = (NumElements == 1)
3265 ? ElementTy
3266 : FixedVectorType::get(ElementTy, NumElements);
3267 if (V->getType() != SliceTy)
3268 V = IRB.CreateBitPreservingCastChain(DL, V, SliceTy);
3269
3270 // Mix in the existing elements.
3271 Value *Old =
3272 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3273 V = insertVector(IRB, Old, V, BeginIndex, "vec");
3274 }
3275 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3276 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3277 LLVMContext::MD_access_group});
3278 if (AATags)
3279 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3280 V->getType(), DL));
3281 Pass.DeadInsts.push_back(&SI);
3282
3283 // NOTE: Careful to use OrigV rather than V.
3284 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3285 Store, Store->getPointerOperand(), OrigV, DL);
3286 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3287 return true;
3288 }
3289
3290 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3291 assert(IntTy && "We cannot extract an integer from the alloca");
3292 assert(!SI.isVolatile());
3293 if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
3294 IntTy->getBitWidth()) {
3295 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3296 "oldload");
3297 Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
3298 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3299 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3300 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
3301 }
3302 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3303 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3304 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3305 LLVMContext::MD_access_group});
3306 if (AATags)
3307 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3308 V->getType(), DL));
3309
3310 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3311 Store, Store->getPointerOperand(),
3312 Store->getValueOperand(), DL);
3313
3314 Pass.DeadInsts.push_back(&SI);
3315 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3316 return true;
3317 }
3318
3319 bool visitStoreInst(StoreInst &SI) {
3320 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3321 Value *OldOp = SI.getOperand(1);
3322 assert(OldOp == OldPtr);
3323
3324 AAMDNodes AATags = SI.getAAMetadata();
3325 Value *V = SI.getValueOperand();
3326
3327 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3328 // alloca that should be re-examined after promoting this alloca.
3329 if (V->getType()->isPointerTy())
3330 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
3331 Pass.PostPromotionWorklist.insert(AI);
3332
3333 TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
3334 if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3335 assert(!SI.isVolatile());
3336 assert(V->getType()->isIntegerTy() &&
3337 "Only integer type loads and stores are split");
3338 assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3339 "Non-byte-multiple bit width");
3340 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
3341 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
3342 "extract");
3343 }
3344
3345 if (VecTy)
3346 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3347 if (IntTy && V->getType()->isIntegerTy())
3348 return rewriteIntegerStore(V, SI, AATags);
3349
3350 StoreInst *NewSI;
3351 if (NewBeginOffset == NewAllocaBeginOffset &&
3352 NewEndOffset == NewAllocaEndOffset &&
3353 canConvertValue(DL, V->getType(), NewAllocaTy)) {
3354 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3355 Value *NewPtr =
3356 getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
3357
3358 NewSI =
3359 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
3360 } else {
3361 unsigned AS = SI.getPointerAddressSpace();
3362 Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
3363 NewSI =
3364 IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
3365 }
3366 NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3367 LLVMContext::MD_access_group});
3368 if (AATags)
3369 NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3370 V->getType(), DL));
3371 if (SI.isVolatile())
3372 NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
3373 if (NewSI->isAtomic())
3374 NewSI->setAlignment(SI.getAlign());
3375
3376 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3377 NewSI, NewSI->getPointerOperand(),
3378 NewSI->getValueOperand(), DL);
3379
3380 Pass.DeadInsts.push_back(&SI);
3381 deleteIfTriviallyDead(OldOp);
3382
3383 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3384 return NewSI->getPointerOperand() == &NewAI &&
3385 NewSI->getValueOperand()->getType() == NewAllocaTy &&
3386 !SI.isVolatile();
3387 }
3388
3389 /// Compute an integer value from splatting an i8 across the given
3390 /// number of bytes.
3391 ///
3392 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3393 /// call this routine.
3394 /// FIXME: Heed the advice above.
3395 ///
3396 /// \param V The i8 value to splat.
3397 /// \param Size The number of bytes in the output (assuming i8 is one byte)
3398 Value *getIntegerSplat(Value *V, unsigned Size) {
3399 assert(Size > 0 && "Expected a positive number of bytes.");
3400 IntegerType *VTy = cast<IntegerType>(V->getType());
3401 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3402 if (Size == 1)
3403 return V;
3404
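    // E.g. splatting V = i8 0x2A across Size = 4 bytes: the zext gives
    // 0x0000002A, the udiv of the two all-ones values gives 0x01010101, and
    // the multiply produces 0x2A2A2A2A.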
3405 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
3406 V = IRB.CreateMul(
3407 IRB.CreateZExt(V, SplatIntTy, "zext"),
3408 IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
3409 IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
3410 SplatIntTy)),
3411 "isplat");
3412 return V;
3413 }
3414
3415 /// Compute a vector splat for a given element value.
3416 Value *getVectorSplat(Value *V, unsigned NumElements) {
3417 V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
3418 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3419 return V;
3420 }
3421
3422 bool visitMemSetInst(MemSetInst &II) {
3423 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3424 assert(II.getRawDest() == OldPtr);
3425
3426 AAMDNodes AATags = II.getAAMetadata();
3427
3428 // If the memset has a variable size, it cannot be split, just adjust the
3429 // pointer to the new alloca.
3430 if (!isa<ConstantInt>(II.getLength())) {
3431 assert(!IsSplit);
3432 assert(NewBeginOffset == BeginOffset);
3433 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
3434 II.setDestAlignment(getSliceAlign());
3435 // In theory we should call migrateDebugInfo here. However, we do not
3436 // emit dbg.assign intrinsics for mem intrinsics storing through non-
3437 // constant geps, or storing a variable number of bytes.
3438       assert(at::getDVRAssignmentMarkers(&II).empty() &&
3439              "AT: Unexpected link to non-const GEP");
3440 deleteIfTriviallyDead(OldPtr);
3441 return false;
3442 }
3443
3444 // Record this instruction for deletion.
3445 Pass.DeadInsts.push_back(&II);
3446
3447 Type *ScalarTy = NewAllocaTy->getScalarType();
3448
3449 const bool CanContinue = [&]() {
3450 if (VecTy || IntTy)
3451 return true;
3452 if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3453 return false;
3454 // Length must be in range for FixedVectorType.
3455 auto *C = cast<ConstantInt>(II.getLength());
3456 const uint64_t Len = C->getLimitedValue();
3457 if (Len > std::numeric_limits<unsigned>::max())
3458 return false;
3459 auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
3460 auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
3461 return canConvertValue(DL, SrcTy, NewAllocaTy) &&
3462 DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
3463 }();
3464
3465 // If this doesn't map cleanly onto the alloca type, and that type isn't
3466 // a single value type, just emit a memset.
3467 if (!CanContinue) {
3468 Type *SizeTy = II.getLength()->getType();
3469 unsigned Sz = NewEndOffset - NewBeginOffset;
3470 Constant *Size = ConstantInt::get(SizeTy, Sz);
3471 MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
3472 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
3473 MaybeAlign(getSliceAlign()), II.isVolatile()));
3474 if (AATags)
3475 New->setAAMetadata(
3476 AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz));
3477
3478 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3479 New, New->getRawDest(), nullptr, DL);
3480
3481 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3482 return false;
3483 }
3484
3485 // If we can represent this as a simple value, we have to build the actual
3486 // value to store, which requires expanding the byte present in memset to
3487 // a sensible representation for the alloca type. This is essentially
3488 // splatting the byte to a sufficiently wide integer, splatting it across
3489 // any desired vector width, and bitcasting to the final type.
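    // E.g. a memset of byte 0xAB fully covering a <4 x i16> alloca is rewritten
    // (roughly) as an i16 splat of 0xABAB, broadcast to a <4 x i16> vector, and
    // stored once into the new alloca.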
3490 Value *V;
3491
3492 if (VecTy) {
3493 // If this is a memset of a vectorized alloca, insert it.
3494 assert(ElementTy == ScalarTy);
3495
3496 unsigned BeginIndex = getIndex(NewBeginOffset);
3497 unsigned EndIndex = getIndex(NewEndOffset);
3498 assert(EndIndex > BeginIndex && "Empty vector!");
3499 unsigned NumElements = EndIndex - BeginIndex;
3500 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3501 "Too many elements!");
3502
3503 Value *Splat = getIntegerSplat(
3504 II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
3505 Splat = IRB.CreateBitPreservingCastChain(DL, Splat, ElementTy);
3506 if (NumElements > 1)
3507 Splat = getVectorSplat(Splat, NumElements);
3508
3509 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3510 "oldload");
3511 V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
3512 } else if (IntTy) {
3513 // If this is a memset on an alloca where we can widen stores, insert the
3514 // set integer.
3515 assert(!II.isVolatile());
3516
3517 uint64_t Size = NewEndOffset - NewBeginOffset;
3518 V = getIntegerSplat(II.getValue(), Size);
3519
3520 if (IntTy && (NewBeginOffset != NewAllocaBeginOffset ||
3521 NewEndOffset != NewAllocaEndOffset)) {
3522 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI,
3523 NewAI.getAlign(), "oldload");
3524 Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
3525 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3526 V = insertInteger(DL, IRB, Old, V, Offset, "insert");
3527 } else {
3528 assert(V->getType() == IntTy &&
3529 "Wrong type for an alloca wide integer!");
3530 }
3531 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3532 } else {
3533 // Established these invariants above.
3534 assert(NewBeginOffset == NewAllocaBeginOffset);
3535 assert(NewEndOffset == NewAllocaEndOffset);
3536
3537 V = getIntegerSplat(II.getValue(),
3538 DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
3539 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(NewAllocaTy))
3540 V = getVectorSplat(
3541 V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
3542
3543 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3544 }
3545
3546 Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3547 StoreInst *New =
3548 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
3549 New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3550 LLVMContext::MD_access_group});
3551 if (AATags)
3552 New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3553 V->getType(), DL));
3554
3555 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3556 New, New->getPointerOperand(), V, DL);
3557
3558 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3559 return !II.isVolatile();
3560 }
3561
3562 bool visitMemTransferInst(MemTransferInst &II) {
3563 // Rewriting of memory transfer instructions can be a bit tricky. We break
3564 // them into two categories: split intrinsics and unsplit intrinsics.
3565
3566 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3567
3568 AAMDNodes AATags = II.getAAMetadata();
3569
3570 bool IsDest = &II.getRawDestUse() == OldUse;
3571 assert((IsDest && II.getRawDest() == OldPtr) ||
3572 (!IsDest && II.getRawSource() == OldPtr));
3573
3574 Align SliceAlign = getSliceAlign();
3575 // For unsplit intrinsics, we simply modify the source and destination
3576 // pointers in place. This isn't just an optimization, it is a matter of
3577 // correctness. With unsplit intrinsics we may be dealing with transfers
3578 // within a single alloca before SROA ran, or with transfers that have
3579 // a variable length. We may also be dealing with memmove instead of
3580     // memcpy, and so simply updating the pointers is all that is necessary to
3581 // update both source and dest of a single call.
3582 if (!IsSplittable) {
3583 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3584 if (IsDest) {
3585 // Update the address component of linked dbg.assigns.
3586 for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) {
3587 if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3588 DbgAssign->getAddress() == II.getDest())
3589 DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3590 }
3591 II.setDest(AdjustedPtr);
3592 II.setDestAlignment(SliceAlign);
3593 } else {
3594 II.setSource(AdjustedPtr);
3595 II.setSourceAlignment(SliceAlign);
3596 }
3597
3598 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3599 deleteIfTriviallyDead(OldPtr);
3600 return false;
3601 }
3602 // For split transfer intrinsics we have an incredibly useful assurance:
3603 // the source and destination do not reside within the same alloca, and at
3604 // least one of them does not escape. This means that we can replace
3605 // memmove with memcpy, and we don't need to worry about all manner of
3606 // downsides to splitting and transforming the operations.
3607
3608 // If this doesn't map cleanly onto the alloca type, and that type isn't
3609 // a single value type, just emit a memcpy.
3610 bool EmitMemCpy =
3611 !VecTy && !IntTy &&
3612 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3613 SliceSize != DL.getTypeStoreSize(NewAllocaTy).getFixedValue() ||
3614 !DL.typeSizeEqualsStoreSize(NewAllocaTy) ||
3615 !NewAllocaTy->isSingleValueType());
3616
3617 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3618 // size hasn't been shrunk based on analysis of the viable range, this is
3619 // a no-op.
3620 if (EmitMemCpy && &OldAI == &NewAI) {
3621 // Ensure the start lines up.
3622 assert(NewBeginOffset == BeginOffset);
3623
3624 // Rewrite the size as needed.
3625 if (NewEndOffset != EndOffset)
3626 II.setLength(NewEndOffset - NewBeginOffset);
3627 return false;
3628 }
3629 // Record this instruction for deletion.
3630 Pass.DeadInsts.push_back(&II);
3631
3632 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3633 // alloca that should be re-examined after rewriting this instruction.
3634 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3635     if (AllocaInst *AI =
3636             dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
3637 assert(AI != &OldAI && AI != &NewAI &&
3638 "Splittable transfers cannot reach the same alloca on both ends.");
3639 Pass.Worklist.insert(AI);
3640 }
3641
3642 Type *OtherPtrTy = OtherPtr->getType();
3643 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3644
3645 // Compute the relative offset for the other pointer within the transfer.
3646 unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
3647 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3648 Align OtherAlign =
3649 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3650 OtherAlign =
3651 commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
3652
3653 if (EmitMemCpy) {
3654 // Compute the other pointer, folding as much as possible to produce
3655 // a single, simple GEP in most cases.
3656 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3657 OtherPtr->getName() + ".");
3658
3659 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3660 Type *SizeTy = II.getLength()->getType();
3661 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3662
3663 Value *DestPtr, *SrcPtr;
3664 MaybeAlign DestAlign, SrcAlign;
3665 // Note: IsDest is true iff we're copying into the new alloca slice
3666 if (IsDest) {
3667 DestPtr = OurPtr;
3668 DestAlign = SliceAlign;
3669 SrcPtr = OtherPtr;
3670 SrcAlign = OtherAlign;
3671 } else {
3672 DestPtr = OtherPtr;
3673 DestAlign = OtherAlign;
3674 SrcPtr = OurPtr;
3675 SrcAlign = SliceAlign;
3676 }
3677 CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3678 Size, II.isVolatile());
3679 if (AATags)
3680 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3681
3682 APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
3683 if (IsDest) {
3684 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
3685 &II, New, DestPtr, nullptr, DL);
3686 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3687                    DestPtr->stripAndAccumulateConstantOffsets(
3688                        DL, Offset, /*AllowNonInbounds*/ true))) {
3689 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
3690 SliceSize * 8, &II, New, DestPtr, nullptr, DL);
3691 }
3692 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3693 return false;
3694 }
3695
3696 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3697 NewEndOffset == NewAllocaEndOffset;
3698 uint64_t Size = NewEndOffset - NewBeginOffset;
3699 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3700 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3701 unsigned NumElements = EndIndex - BeginIndex;
3702 IntegerType *SubIntTy =
3703 IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3704
3705 // Reset the other pointer type to match the register type we're going to
3706 // use, but using the address space of the original other pointer.
3707 Type *OtherTy;
3708 if (VecTy && !IsWholeAlloca) {
3709 if (NumElements == 1)
3710 OtherTy = VecTy->getElementType();
3711 else
3712 OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3713 } else if (IntTy && !IsWholeAlloca) {
3714 OtherTy = SubIntTy;
3715 } else {
3716 OtherTy = NewAllocaTy;
3717 }
3718
3719 Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3720 OtherPtr->getName() + ".");
3721 MaybeAlign SrcAlign = OtherAlign;
3722 MaybeAlign DstAlign = SliceAlign;
3723 if (!IsDest)
3724 std::swap(SrcAlign, DstAlign);
3725
3726 Value *SrcPtr;
3727 Value *DstPtr;
3728
3729 if (IsDest) {
3730 DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3731 SrcPtr = AdjPtr;
3732 } else {
3733 DstPtr = AdjPtr;
3734 SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
3735 }
3736
3737 Value *Src;
3738 if (VecTy && !IsWholeAlloca && !IsDest) {
3739 Src =
3740 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3741 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3742 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3743 Src =
3744 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3745 Src = IRB.CreateBitPreservingCastChain(DL, Src, IntTy);
3746 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3747 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3748 } else {
3749 LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3750 II.isVolatile(), "copyload");
3751 Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3752 LLVMContext::MD_access_group});
3753 if (AATags)
3754 Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3755 Load->getType(), DL));
3756 Src = Load;
3757 }
3758
3759 if (VecTy && !IsWholeAlloca && IsDest) {
3760 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3761 "oldload");
3762 Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3763 } else if (IntTy && !IsWholeAlloca && IsDest) {
3764 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3765 "oldload");
3766 Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
3767 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3768 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3769 Src = IRB.CreateBitPreservingCastChain(DL, Src, NewAllocaTy);
3770 }
3771
3772 StoreInst *Store = cast<StoreInst>(
3773 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3774 Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3775 LLVMContext::MD_access_group});
3776 if (AATags)
3777 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3778 Src->getType(), DL));
3779
3780 APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
3781 if (IsDest) {
3782
3783 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3784 Store, DstPtr, Src, DL);
3785 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3786                    DstPtr->stripAndAccumulateConstantOffsets(
3787                        DL, Offset, /*AllowNonInbounds*/ true))) {
3788 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
3789 &II, Store, DstPtr, Src, DL);
3790 }
3791
3792 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3793 return !II.isVolatile();
3794 }
3795
3796 bool visitIntrinsicInst(IntrinsicInst &II) {
3797 assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3798 "Unexpected intrinsic!");
3799 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3800
3801 // Record this instruction for deletion.
3802 Pass.DeadInsts.push_back(&II);
3803
3804 if (II.isDroppable()) {
3805 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3806 // TODO For now we forget assumed information, this can be improved.
3807 OldPtr->dropDroppableUsesIn(II);
3808 return true;
3809 }
3810
3811 assert(II.getArgOperand(0) == OldPtr);
3812 Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
3813 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3814 Value *New;
3815 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3816 New = IRB.CreateLifetimeStart(Ptr);
3817 else
3818 New = IRB.CreateLifetimeEnd(Ptr);
3819
3820 (void)New;
3821 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3822
3823 return true;
3824 }
3825
3826 void fixLoadStoreAlign(Instruction &Root) {
3827 // This algorithm implements the same visitor loop as
3828 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3829 // or store found.
3830 SmallPtrSet<Instruction *, 4> Visited;
3831 SmallVector<Instruction *, 4> Uses;
3832 Visited.insert(&Root);
3833 Uses.push_back(&Root);
3834 do {
3835 Instruction *I = Uses.pop_back_val();
3836
3837 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3838 LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3839 continue;
3840 }
3841 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3842 SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3843 continue;
3844 }
3845
3846       assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3847              isa<PHINode>(I) || isa<SelectInst>(I) ||
3848              isa<GetElementPtrInst>(I));
3849       for (User *U : I->users())
3850 if (Visited.insert(cast<Instruction>(U)).second)
3851 Uses.push_back(cast<Instruction>(U));
3852 } while (!Uses.empty());
3853 }
3854
3855 bool visitPHINode(PHINode &PN) {
3856 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3857 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3858 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3859
3860 // We would like to compute a new pointer in only one place, but have it be
3861 // as local as possible to the PHI. To do that, we re-use the location of
3862 // the old pointer, which necessarily must be in the right position to
3863 // dominate the PHI.
3864 IRBuilderBase::InsertPointGuard Guard(IRB);
3865 if (isa<PHINode>(OldPtr))
3866 IRB.SetInsertPoint(OldPtr->getParent(),
3867 OldPtr->getParent()->getFirstInsertionPt());
3868 else
3869 IRB.SetInsertPoint(OldPtr);
3870 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3871
3872 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3873 // Replace the operands which were using the old pointer.
3874 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3875
3876 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3877 deleteIfTriviallyDead(OldPtr);
3878
3879 // Fix the alignment of any loads or stores using this PHI node.
3880 fixLoadStoreAlign(PN);
3881
3882 // PHIs can't be promoted on their own, but often can be speculated. We
3883 // check the speculation outside of the rewriter so that we see the
3884 // fully-rewritten alloca.
3885 PHIUsers.insert(&PN);
3886 return true;
3887 }
3888
3889 bool visitSelectInst(SelectInst &SI) {
3890 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3891 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3892 "Pointer isn't an operand!");
3893 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3894 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3895
3896 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3897 // Replace the operands which were using the old pointer.
3898 if (SI.getOperand(1) == OldPtr)
3899 SI.setOperand(1, NewPtr);
3900 if (SI.getOperand(2) == OldPtr)
3901 SI.setOperand(2, NewPtr);
3902
3903 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3904 deleteIfTriviallyDead(OldPtr);
3905
3906 // Fix the alignment of any loads or stores using this select.
3907 fixLoadStoreAlign(SI);
3908
3909 // Selects can't be promoted on their own, but often can be speculated. We
3910 // check the speculation outside of the rewriter so that we see the
3911 // fully-rewritten alloca.
3912 SelectUsers.insert(&SI);
3913 return true;
3914 }
3915};
3916
3917/// Visitor to rewrite aggregate loads and stores as scalar.
3918///
3919/// This pass aggressively rewrites all aggregate loads and stores on
3920/// a particular pointer (or any pointer derived from it which we can identify)
3921/// with scalar loads and stores.
3922class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3923 // Befriend the base class so it can delegate to private visit methods.
3924 friend class InstVisitor<AggLoadStoreRewriter, bool>;
3925
3926 /// Queue of pointer uses to analyze and potentially rewrite.
3927   SmallVector<Use *, 8> Queue;
3928
3929 /// Set to prevent us from cycling with phi nodes and loops.
3930 SmallPtrSet<User *, 8> Visited;
3931
3932 /// The current pointer use being rewritten. This is used to dig up the used
3933 /// value (as opposed to the user).
3934 Use *U = nullptr;
3935
3936 /// Used to calculate offsets, and hence alignment, of subobjects.
3937 const DataLayout &DL;
3938
3939 IRBuilderTy &IRB;
3940
3941public:
3942 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
3943 : DL(DL), IRB(IRB) {}
3944
3945 /// Rewrite loads and stores through a pointer and all pointers derived from
3946 /// it.
3947 bool rewrite(Instruction &I) {
3948 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3949 enqueueUsers(I);
3950 bool Changed = false;
3951 while (!Queue.empty()) {
3952 U = Queue.pop_back_val();
3953 Changed |= visit(cast<Instruction>(U->getUser()));
3954 }
3955 return Changed;
3956 }
3957
3958private:
3959 /// Enqueue all the users of the given instruction for further processing.
3960 /// This uses a set to de-duplicate users.
3961 void enqueueUsers(Instruction &I) {
3962 for (Use &U : I.uses())
3963 if (Visited.insert(U.getUser()).second)
3964 Queue.push_back(&U);
3965 }
3966
3967 // Conservative default is to not rewrite anything.
3968 bool visitInstruction(Instruction &I) { return false; }
3969
3970 /// Generic recursive split emission class.
3971 template <typename Derived> class OpSplitter {
3972 protected:
3973 /// The builder used to form new instructions.
3974 IRBuilderTy &IRB;
3975
3976     /// The indices to be used with insert- or extractvalue to select the
3977 /// appropriate value within the aggregate.
3978 SmallVector<unsigned, 4> Indices;
3979
3980 /// The indices to a GEP instruction which will move Ptr to the correct slot
3981 /// within the aggregate.
3982 SmallVector<Value *, 4> GEPIndices;
3983
3984 /// The base pointer of the original op, used as a base for GEPing the
3985 /// split operations.
3986 Value *Ptr;
3987
3988 /// The base pointee type being GEPed into.
3989 Type *BaseTy;
3990
3991 /// Known alignment of the base pointer.
3992 Align BaseAlign;
3993
3994     /// Used to calculate the offset of each component so we can correctly deduce
3995 /// alignments.
3996 const DataLayout &DL;
3997
3998 /// Initialize the splitter with an insertion point, Ptr and start with a
3999 /// single zero GEP index.
4000 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4001 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
4002 : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
4003 BaseAlign(BaseAlign), DL(DL) {
4004 IRB.SetInsertPoint(InsertionPoint);
4005 }
4006
4007 public:
4008 /// Generic recursive split emission routine.
4009 ///
4010 /// This method recursively splits an aggregate op (load or store) into
4011 /// scalar or vector ops. It splits recursively until it hits a single value
4012 /// and emits that single value operation via the template argument.
4013 ///
4014 /// The logic of this routine relies on GEPs and insertvalue and
4015 /// extractvalue all operating with the same fundamental index list, merely
4016 /// formatted differently (GEPs need actual values).
4017 ///
4018 /// \param Ty The type being split recursively into smaller ops.
4019 /// \param Agg The aggregate value being built up or stored, depending on
4020 /// whether this is splitting a load or a store respectively.
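    /// Illustrative example: splitting a load of {i32, float} emits a GEP and a
    /// scalar load for index 0 and for index 1, and the two loaded values are
    /// reassembled into the aggregate with insertvalue (stores work
    /// symmetrically with extractvalue).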
4021 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
4022 if (Ty->isSingleValueType()) {
4023 unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
4024 return static_cast<Derived *>(this)->emitFunc(
4025 Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
4026 }
4027
4028 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
4029 unsigned OldSize = Indices.size();
4030 (void)OldSize;
4031 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
4032 ++Idx) {
4033 assert(Indices.size() == OldSize && "Did not return to the old size");
4034 Indices.push_back(Idx);
4035 GEPIndices.push_back(IRB.getInt32(Idx));
4036 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
4037 GEPIndices.pop_back();
4038 Indices.pop_back();
4039 }
4040 return;
4041 }
4042
4043 if (StructType *STy = dyn_cast<StructType>(Ty)) {
4044 unsigned OldSize = Indices.size();
4045 (void)OldSize;
4046 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
4047 ++Idx) {
4048 assert(Indices.size() == OldSize && "Did not return to the old size");
4049 Indices.push_back(Idx);
4050 GEPIndices.push_back(IRB.getInt32(Idx));
4051 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
4052 GEPIndices.pop_back();
4053 Indices.pop_back();
4054 }
4055 return;
4056 }
4057
4058 llvm_unreachable("Only arrays and structs are aggregate loadable types");
4059 }
4060 };
4061
4062 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
4063 AAMDNodes AATags;
4064 // A vector to hold the split components that we want to emit
4065 // separate fake uses for.
4066 SmallVector<Value *, 4> Components;
4067 // A vector to hold all the fake uses of the struct that we are splitting.
4068 // Usually there should only be one, but we are handling the general case.
4069     SmallVector<Instruction *, 1> FakeUses;
4070
4071 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4072 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
4073 IRBuilderTy &IRB)
4074 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
4075 IRB),
4076 AATags(AATags) {}
4077
4078 /// Emit a leaf load of a single value. This is called at the leaves of the
4079 /// recursive emission to actually load values.
4080 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4081       assert(Ty->isSingleValueType());
4082       // Load the single value and insert it using the indices.
4083 Value *GEP =
4084 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4085 LoadInst *Load =
4086 IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
4087
4088 APInt Offset(
4089 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4090 if (AATags &&
4091 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
4092 Load->setAAMetadata(
4093 AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL));
4094 // Record the load so we can generate a fake use for this aggregate
4095 // component.
4096 Components.push_back(Load);
4097
4098 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
4099 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
4100 }
4101
4102 // Stash the fake uses that use the value generated by this instruction.
4103 void recordFakeUses(LoadInst &LI) {
4104 for (Use &U : LI.uses())
4105 if (auto *II = dyn_cast<IntrinsicInst>(U.getUser()))
4106 if (II->getIntrinsicID() == Intrinsic::fake_use)
4107 FakeUses.push_back(II);
4108 }
4109
4110 // Replace all fake uses of the aggregate with a series of fake uses, one
4111 // for each split component.
4112 void emitFakeUses() {
4113 for (Instruction *I : FakeUses) {
4114 IRB.SetInsertPoint(I);
4115 for (auto *V : Components)
4116 IRB.CreateIntrinsic(Intrinsic::fake_use, {V});
4117 I->eraseFromParent();
4118 }
4119 }
4120 };
4121
4122 bool visitLoadInst(LoadInst &LI) {
4123 assert(LI.getPointerOperand() == *U);
4124 if (!LI.isSimple() || LI.getType()->isSingleValueType())
4125 return false;
4126
4127 // We have an aggregate being loaded, split it apart.
4128 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4129 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4130 getAdjustedAlignment(&LI, 0), DL, IRB);
4131 Splitter.recordFakeUses(LI);
4132     Value *V = PoisonValue::get(LI.getType());
4133     Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
4134 Splitter.emitFakeUses();
4135 Visited.erase(&LI);
4136 LI.replaceAllUsesWith(V);
4137 LI.eraseFromParent();
4138 return true;
4139 }
4140
4141 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4142 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4143 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4144 const DataLayout &DL, IRBuilderTy &IRB)
4145 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4146 DL, IRB),
4147 AATags(AATags), AggStore(AggStore) {}
4148 AAMDNodes AATags;
4149 StoreInst *AggStore;
4150 /// Emit a leaf store of a single value. This is called at the leaves of the
4151 /// recursive emission to actually produce stores.
4152 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4153       assert(Ty->isSingleValueType());
4154       // Extract the single value and store it using the indices.
4155 //
4156 // The gep and extractvalue values are factored out of the CreateStore
4157 // call to make the output independent of the argument evaluation order.
4158 Value *ExtractValue =
4159 IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
4160 Value *InBoundsGEP =
4161 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4162 StoreInst *Store =
4163 IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
4164
4165 APInt Offset(
4166 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4167 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
4168 if (AATags) {
4169 Store->setAAMetadata(AATags.adjustForAccess(
4170 Offset.getZExtValue(), ExtractValue->getType(), DL));
4171 }
4172
4173 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4174 // If we cannot (because there's an intervening non-const or unbounded
4175 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4176 // this instruction.
4177       Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4178       if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
4179 uint64_t SizeInBits =
4180 DL.getTypeSizeInBits(Store->getValueOperand()->getType());
4181 migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
4182 SizeInBits, AggStore, Store,
4183 Store->getPointerOperand(), Store->getValueOperand(),
4184 DL);
4185 } else {
4186         assert(at::getDVRAssignmentMarkers(AggStore).empty() &&
4187                "AT: unexpected debug.assign linked to store through "
4188 "unbounded GEP");
4189 }
4190 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4191 }
4192 };
4193
4194 bool visitStoreInst(StoreInst &SI) {
4195 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4196 return false;
4197 Value *V = SI.getValueOperand();
4198 if (V->getType()->isSingleValueType())
4199 return false;
4200
4201 // We have an aggregate being stored, split it apart.
4202 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4203 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4204 getAdjustedAlignment(&SI, 0), DL, IRB);
4205 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
4206 Visited.erase(&SI);
4207 // The stores replacing SI each have markers describing fragments of the
4208 // assignment so delete the assignment markers linked to SI.
4209     at::deleteAssignmentMarkers(&SI);
4210     SI.eraseFromParent();
4211 return true;
4212 }
4213
4214 bool visitBitCastInst(BitCastInst &BC) {
4215 enqueueUsers(BC);
4216 return false;
4217 }
4218
4219 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4220 enqueueUsers(ASC);
4221 return false;
4222 }
4223
4224 // Unfold gep (select cond, ptr1, ptr2), idx
4225 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4226 // and gep ptr, (select cond, idx1, idx2)
4227 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
4228 // We also allow for i1 zext indices, which are equivalent to selects.
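  // Illustrative IR (names only suggest the ".sroa.gep"/".sroa.sel" suffixes
  // used below):
  //   %sel = select i1 %c, ptr %a, ptr %b
  //   %p   = getelementptr i32, ptr %sel, i64 1
  // becomes roughly
  //   %a.sroa.gep   = getelementptr i32, ptr %a, i64 1
  //   %b.sroa.gep   = getelementptr i32, ptr %b, i64 1
  //   %sel.sroa.sel = select i1 %c, ptr %a.sroa.gep, ptr %b.sroa.gep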
4229 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4230 // Check whether the GEP has exactly one select operand and all indices
4231 // will become constant after the transform.
4232     Instruction *Sel = nullptr;
4233     for (Value *Op : GEPI.indices()) {
4234 if (auto *SI = dyn_cast<SelectInst>(Op)) {
4235 if (Sel)
4236 return false;
4237
4238 Sel = SI;
4239 if (!isa<ConstantInt>(SI->getTrueValue()) ||
4240 !isa<ConstantInt>(SI->getFalseValue()))
4241 return false;
4242 continue;
4243 }
4244 if (auto *ZI = dyn_cast<ZExtInst>(Op)) {
4245 if (Sel)
4246 return false;
4247 Sel = ZI;
4248 if (!ZI->getSrcTy()->isIntegerTy(1))
4249 return false;
4250 continue;
4251 }
4252
4253 if (!isa<ConstantInt>(Op))
4254 return false;
4255 }
4256
4257 if (!Sel)
4258 return false;
4259
4260 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4261 dbgs() << " original: " << *Sel << "\n";
4262 dbgs() << " " << GEPI << "\n";);
4263
4264 auto GetNewOps = [&](Value *SelOp) {
4265 SmallVector<Value *> NewOps;
4266 for (Value *Op : GEPI.operands())
4267 if (Op == Sel)
4268 NewOps.push_back(SelOp);
4269 else
4270 NewOps.push_back(Op);
4271 return NewOps;
4272 };
4273
4274 Value *Cond, *True, *False;
4275 Instruction *MDFrom = nullptr;
4276 if (auto *SI = dyn_cast<SelectInst>(Sel)) {
4277 Cond = SI->getCondition();
4278 True = SI->getTrueValue();
4279 False = SI->getFalseValue();
4281 MDFrom = SI;
4282 } else {
4283 Cond = Sel->getOperand(0);
4284 True = ConstantInt::get(Sel->getType(), 1);
4285 False = ConstantInt::get(Sel->getType(), 0);
4286 }
4287 SmallVector<Value *> TrueOps = GetNewOps(True);
4288 SmallVector<Value *> FalseOps = GetNewOps(False);
4289
4290 IRB.SetInsertPoint(&GEPI);
4291 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4292
4293 Type *Ty = GEPI.getSourceElementType();
4294 Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
4295 True->getName() + ".sroa.gep", NW);
4296
4297 Value *NFalse =
4298 IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
4299 False->getName() + ".sroa.gep", NW);
4300
4301 Value *NSel = MDFrom
4302 ? IRB.CreateSelect(Cond, NTrue, NFalse,
4303 Sel->getName() + ".sroa.sel", MDFrom)
4304 : IRB.CreateSelectWithUnknownProfile(
4305 Cond, NTrue, NFalse, DEBUG_TYPE,
4306 Sel->getName() + ".sroa.sel");
4307 Visited.erase(&GEPI);
4308 GEPI.replaceAllUsesWith(NSel);
4309 GEPI.eraseFromParent();
4310 Instruction *NSelI = cast<Instruction>(NSel);
4311 Visited.insert(NSelI);
4312 enqueueUsers(*NSelI);
4313
4314 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4315 dbgs() << " " << *NFalse << "\n";
4316 dbgs() << " " << *NSel << "\n";);
4317
4318 return true;
4319 }
4320
4321 // Unfold gep (phi ptr1, ptr2), idx
4322 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4323 // and gep ptr, (phi idx1, idx2)
4324 // => phi ((gep ptr, idx1), (gep ptr, idx2))
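  // Illustrative IR (roughly; the new GEPs are placed at the end of the entry
  // block):
  //   %p   = phi ptr [ %a, %bb1 ], [ %b, %bb2 ]
  //   %gep = getelementptr i32, ptr %p, i64 1
  // becomes
  //   %p.sroa.gep  = getelementptr i32, ptr %a, i64 1
  //   %p.sroa.gep1 = getelementptr i32, ptr %b, i64 1
  //   %p.sroa.phi  = phi ptr [ %p.sroa.gep, %bb1 ], [ %p.sroa.gep1, %bb2 ]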
4325 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4326 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4327 // operand (looking through the phi if it is the phi we want to unfold) is
4328 // an instruction besides a static alloca.
4329 PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
4330 auto IsInvalidPointerOperand = [](Value *V) {
4331 if (!isa<Instruction>(V))
4332 return false;
4333 if (auto *AI = dyn_cast<AllocaInst>(V))
4334 return !AI->isStaticAlloca();
4335 return true;
4336 };
4337 if (Phi) {
4338 if (any_of(Phi->operands(), IsInvalidPointerOperand))
4339 return false;
4340 } else {
4341 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4342 return false;
4343 }
4344 // Check whether the GEP has exactly one phi operand (including the pointer
4345 // operand) and all indices will become constant after the transform.
4346 for (Value *Op : GEPI.indices()) {
4347 if (auto *SI = dyn_cast<PHINode>(Op)) {
4348 if (Phi)
4349 return false;
4350
4351 Phi = SI;
4352 if (!all_of(Phi->incoming_values(),
4353 [](Value *V) { return isa<ConstantInt>(V); }))
4354 return false;
4355 continue;
4356 }
4357
4358 if (!isa<ConstantInt>(Op))
4359 return false;
4360 }
4361
4362 if (!Phi)
4363 return false;
4364
4365 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4366 dbgs() << " original: " << *Phi << "\n";
4367 dbgs() << " " << GEPI << "\n";);
4368
4369 auto GetNewOps = [&](Value *PhiOp) {
4370 SmallVector<Value *> NewOps;
4371 for (Value *Op : GEPI.operands())
4372 if (Op == Phi)
4373 NewOps.push_back(PhiOp);
4374 else
4375 NewOps.push_back(Op);
4376 return NewOps;
4377 };
4378
4379 IRB.SetInsertPoint(Phi);
4380 PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
4381 Phi->getName() + ".sroa.phi");
4382
4383 Type *SourceTy = GEPI.getSourceElementType();
4384 // We only handle arguments, constants, and static allocas here, so we can
4385 // insert GEPs at the end of the entry block.
4386 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4387 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4388 Value *Op = Phi->getIncomingValue(I);
4389 BasicBlock *BB = Phi->getIncomingBlock(I);
4390 Value *NewGEP;
4391 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4392 NewGEP = NewPhi->getIncomingValue(NI);
4393 } else {
4394 SmallVector<Value *> NewOps = GetNewOps(Op);
4395 NewGEP =
4396 IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
4397 Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags());
4398 }
4399 NewPhi->addIncoming(NewGEP, BB);
4400 }
4401
4402 Visited.erase(&GEPI);
4403 GEPI.replaceAllUsesWith(NewPhi);
4404 GEPI.eraseFromParent();
4405 Visited.insert(NewPhi);
4406 enqueueUsers(*NewPhi);
4407
4408 LLVM_DEBUG(dbgs() << " to: ";
4409 for (Value *In
4410 : NewPhi->incoming_values()) dbgs()
4411 << "\n " << *In;
4412 dbgs() << "\n " << *NewPhi << '\n');
4413
4414 return true;
4415 }
4416
4417 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4418 if (unfoldGEPSelect(GEPI))
4419 return true;
4420
4421 if (unfoldGEPPhi(GEPI))
4422 return true;
4423
4424 enqueueUsers(GEPI);
4425 return false;
4426 }
4427
4428 bool visitPHINode(PHINode &PN) {
4429 enqueueUsers(PN);
4430 return false;
4431 }
4432
4433 bool visitSelectInst(SelectInst &SI) {
4434 enqueueUsers(SI);
4435 return false;
4436 }
4437};
4438
4439} // end anonymous namespace
4440
4441/// Strip aggregate type wrapping.
4442///
4443/// This removes no-op aggregate types wrapping an underlying type. It will
4444/// strip as many layers of types as it can without changing either the type
4445/// size or the allocated size.
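/// For example, {{i32}} and [1 x i32] are both stripped down to i32, while a
/// struct with padding or more than one member is left unchanged.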
4446 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4447   if (Ty->isSingleValueType())
4448 return Ty;
4449
4450 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4451 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4452
4453 Type *InnerTy;
4454 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
4455 InnerTy = ArrTy->getElementType();
4456 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
4457 const StructLayout *SL = DL.getStructLayout(STy);
4458 unsigned Index = SL->getElementContainingOffset(0);
4459 InnerTy = STy->getElementType(Index);
4460 } else {
4461 return Ty;
4462 }
4463
4464 if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
4465 TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
4466 return Ty;
4467
4468 return stripAggregateTypeWrapping(DL, InnerTy);
4469}
4470
4471/// Try to find a partition of the aggregate type passed in for a given
4472/// offset and size.
4473///
4474/// This recurses through the aggregate type and tries to compute a subtype
4475/// based on the offset and size. When the offset and size span a sub-section
4476/// of an array, it will even compute a new array type for that sub-section,
4477/// and the same for structs.
4478///
4479/// Note that this routine is very strict and tries to find a partition of the
4480/// type which produces the *exact* right offset and size. It is not forgiving
4481/// when the size or offset causes either end of the type-based partition to be off.
4482/// Also, this is a best-effort routine. It is reasonable to give up and not
4483/// return a type if necessary.
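///
/// Illustrative example (an annotation, not part of the original comment):
/// for an allocated type of `{ i32, i32, i32 }`, an (Offset, Size) of (4, 8)
/// yields the sub-struct `{ i32, i32 }`, (4, 4) yields `i32`, and (2, 4)
/// yields nullptr because the range straddles two elements.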
4484static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4485 uint64_t Size) {
4486 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4487 return stripAggregateTypeWrapping(DL, Ty);
4488 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4489 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4490 return nullptr;
4491
4492 if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
4493 Type *ElementTy;
4494 uint64_t TyNumElements;
4495 if (auto *AT = dyn_cast<ArrayType>(Ty)) {
4496 ElementTy = AT->getElementType();
4497 TyNumElements = AT->getNumElements();
4498 } else {
4499 // FIXME: This isn't right for vectors with non-byte-sized or
4500 // non-power-of-two sized elements.
4501 auto *VT = cast<FixedVectorType>(Ty);
4502 ElementTy = VT->getElementType();
4503 TyNumElements = VT->getNumElements();
4504 }
4505 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4506 uint64_t NumSkippedElements = Offset / ElementSize;
4507 if (NumSkippedElements >= TyNumElements)
4508 return nullptr;
4509 Offset -= NumSkippedElements * ElementSize;
4510
4511 // First check if we need to recurse.
4512 if (Offset > 0 || Size < ElementSize) {
4513 // Bail if the partition ends in a different array element.
4514 if ((Offset + Size) > ElementSize)
4515 return nullptr;
4516 // Recurse through the element type trying to peel off offset bytes.
4517 return getTypePartition(DL, ElementTy, Offset, Size);
4518 }
4519 assert(Offset == 0);
4520
4521 if (Size == ElementSize)
4522 return stripAggregateTypeWrapping(DL, ElementTy);
4523 assert(Size > ElementSize);
4524 uint64_t NumElements = Size / ElementSize;
4525 if (NumElements * ElementSize != Size)
4526 return nullptr;
4527 return ArrayType::get(ElementTy, NumElements);
4528 }
4529
4530 StructType *STy = dyn_cast<StructType>(Ty);
4531 if (!STy)
4532 return nullptr;
4533
4534 const StructLayout *SL = DL.getStructLayout(STy);
4535
4536 if (SL->getSizeInBits().isScalable())
4537 return nullptr;
4538
4539 if (Offset >= SL->getSizeInBytes())
4540 return nullptr;
4541 uint64_t EndOffset = Offset + Size;
4542 if (EndOffset > SL->getSizeInBytes())
4543 return nullptr;
4544
4545 unsigned Index = SL->getElementContainingOffset(Offset);
4546 Offset -= SL->getElementOffset(Index);
4547
4548 Type *ElementTy = STy->getElementType(Index);
4549 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4550 if (Offset >= ElementSize)
4551 return nullptr; // The offset points into alignment padding.
4552
4553 // See if any partition must be contained by the element.
4554 if (Offset > 0 || Size < ElementSize) {
4555 if ((Offset + Size) > ElementSize)
4556 return nullptr;
4557 return getTypePartition(DL, ElementTy, Offset, Size);
4558 }
4559 assert(Offset == 0);
4560
4561 if (Size == ElementSize)
4562 return stripAggregateTypeWrapping(DL, ElementTy);
4563
4564 StructType::element_iterator EI = STy->element_begin() + Index,
4565 EE = STy->element_end();
4566 if (EndOffset < SL->getSizeInBytes()) {
4567 unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
4568 if (Index == EndIndex)
4569 return nullptr; // Within a single element and its padding.
4570
4571 // Don't try to form "natural" types if the elements don't line up with the
4572 // expected size.
4573 // FIXME: We could potentially recurse down through the last element in the
4574 // sub-struct to find a natural end point.
4575 if (SL->getElementOffset(EndIndex) != EndOffset)
4576 return nullptr;
4577
4578 assert(Index < EndIndex);
4579 EE = STy->element_begin() + EndIndex;
4580 }
4581
4582 // Try to build up a sub-structure.
4583 StructType *SubTy =
4584 StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
4585 const StructLayout *SubSL = DL.getStructLayout(SubTy);
4586 if (Size != SubSL->getSizeInBytes())
4587 return nullptr; // The sub-struct doesn't have quite the size needed.
4588
4589 return SubTy;
4590}
4591
4592/// Pre-split loads and stores to simplify rewriting.
4593///
4594/// We want to break up the splittable load+store pairs as much as
4595/// possible. This is important to do as a preprocessing step, as once we
4596/// start rewriting the accesses to partitions of the alloca we lose the
4597/// necessary information to correctly split apart paired loads and stores
4598/// which both point into this alloca. The case to consider is something like
4599/// the following:
4600///
4601/// %a = alloca [12 x i8]
4602/// %gep1 = getelementptr i8, ptr %a, i32 0
4603/// %gep2 = getelementptr i8, ptr %a, i32 4
4604/// %gep3 = getelementptr i8, ptr %a, i32 8
4605/// store float 0.0, ptr %gep1
4606/// store float 1.0, ptr %gep2
4607/// %v = load i64, ptr %gep1
4608/// store i64 %v, ptr %gep2
4609/// %f1 = load float, ptr %gep2
4610/// %f2 = load float, ptr %gep3
4611///
4612/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4613/// promote everything so we recover the 2 SSA values that should have been
4614/// there all along.
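///
/// (Annotation, not part of the original comment: in the example above the
/// wide i64 load and store would each be pre-split into two 4-byte accesses
/// that line up with the partition boundaries, so each partition is then only
/// touched by 4-byte operations and can be promoted independently.)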
4615///
4616/// \returns true if any changes are made.
4617bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4618 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4619
4620 // Track the loads and stores which are candidates for pre-splitting here, in
4621 // the order they first appear during the partition scan. These give stable
4622 // iteration order and a basis for tracking which loads and stores we
4623 // actually split.
4624 SmallVector<LoadInst *, 4> Loads;
4625 SmallVector<StoreInst *, 4> Stores;
4626
4627 // We need to accumulate the splits required of each load or store where we
4628 // can find them via a direct lookup. This is important to cross-check loads
4629 // and stores against each other. We also track the slice so that we can kill
4630 // all the slices that end up split.
4631 struct SplitOffsets {
4632 Slice *S;
4633 std::vector<uint64_t> Splits;
4634 };
4635 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4636
4637 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4638 // This is important as we also cannot pre-split stores of those loads!
4639 // FIXME: This is all pretty gross. It means that we can be more aggressive
4640 // in pre-splitting when the load feeding the store happens to come from
4641 // a separate alloca. Put another way, the effectiveness of SROA would be
4642 // decreased by a frontend which just concatenated all of its local allocas
4643 // into one big flat alloca. But defeating such patterns is exactly the job
4644 // SROA is tasked with! Sadly, to not have this discrepancy we would have to
4645 // change store pre-splitting to actually force pre-splitting of the load
4646 // that feeds it *and all stores*. That makes pre-splitting much harder, but
4647 // maybe it would make it more principled?
4648 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4649
4650 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4651 for (auto &P : AS.partitions()) {
4652 for (Slice &S : P) {
4653 Instruction *I = cast<Instruction>(S.getUse()->getUser());
4654 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4655 // If this is a load we have to track that it can't participate in any
4656 // pre-splitting. If this is a store of a load we have to track that
4657 // that load also can't participate in any pre-splitting.
4658 if (auto *LI = dyn_cast<LoadInst>(I))
4659 UnsplittableLoads.insert(LI);
4660 else if (auto *SI = dyn_cast<StoreInst>(I))
4661 if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
4662 UnsplittableLoads.insert(LI);
4663 continue;
4664 }
4665 assert(P.endOffset() > S.beginOffset() &&
4666 "Empty or backwards partition!");
4667
4668 // Determine if this is a pre-splittable slice.
4669 if (auto *LI = dyn_cast<LoadInst>(I)) {
4670 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4671
4672 // The load must be used exclusively to store into other pointers for
4673 // us to be able to arbitrarily pre-split it. The stores must also be
4674 // simple to avoid changing semantics.
4675 auto IsLoadSimplyStored = [](LoadInst *LI) {
4676 for (User *LU : LI->users()) {
4677 auto *SI = dyn_cast<StoreInst>(LU);
4678 if (!SI || !SI->isSimple())
4679 return false;
4680 }
4681 return true;
4682 };
4683 if (!IsLoadSimplyStored(LI)) {
4684 UnsplittableLoads.insert(LI);
4685 continue;
4686 }
4687
4688 Loads.push_back(LI);
4689 } else if (auto *SI = dyn_cast<StoreInst>(I)) {
4690 if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
4691 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4692 continue;
4693 auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
4694 if (!StoredLoad || !StoredLoad->isSimple())
4695 continue;
4696 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4697
4698 Stores.push_back(SI);
4699 } else {
4700 // Other uses cannot be pre-split.
4701 continue;
4702 }
4703
4704 // Record the initial split.
4705 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4706 auto &Offsets = SplitOffsetsMap[I];
4707 assert(Offsets.Splits.empty() &&
4708 "Should not have splits the first time we see an instruction!");
4709 Offsets.S = &S;
4710 Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
4711 }
4712
4713 // Now scan the already split slices, and add a split for any of them which
4714 // we're going to pre-split.
4715 for (Slice *S : P.splitSliceTails()) {
4716 auto SplitOffsetsMapI =
4717 SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
4718 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4719 continue;
4720 auto &Offsets = SplitOffsetsMapI->second;
4721
4722 assert(Offsets.S == S && "Found a mismatched slice!");
4723 assert(!Offsets.Splits.empty() &&
4724 "Cannot have an empty set of splits on the second partition!");
4725 assert(Offsets.Splits.back() ==
4726 P.beginOffset() - Offsets.S->beginOffset() &&
4727 "Previous split does not end where this one begins!");
4728
4729 // Record each split. The last partition's end isn't needed as the size
4730 // of the slice dictates that.
4731 if (S->endOffset() > P.endOffset())
4732 Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
4733 }
4734 }
4735
4736 // We may have split loads where some of their stores are split stores. For
4737 // such loads and stores, we can only pre-split them if their splits exactly
4738 // match relative to their starting offset. We have to verify this prior to
4739 // any rewriting.
4740 llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4741 // Lookup the load we are storing in our map of split
4742 // offsets.
4743 auto *LI = cast<LoadInst>(SI->getValueOperand());
4744 // If it was completely unsplittable, then we're done,
4745 // and this store can't be pre-split.
4746 if (UnsplittableLoads.count(LI))
4747 return true;
4748
4749 auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4750 if (LoadOffsetsI == SplitOffsetsMap.end())
4751 return false; // Unrelated loads are definitely safe.
4752 auto &LoadOffsets = LoadOffsetsI->second;
4753
4754 // Now lookup the store's offsets.
4755 auto &StoreOffsets = SplitOffsetsMap[SI];
4756
4757 // If the relative offsets of each split in the load and
4758 // store match exactly, then we can split them and we
4759 // don't need to remove them here.
4760 if (LoadOffsets.Splits == StoreOffsets.Splits)
4761 return false;
4762
4763 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4764 << " " << *LI << "\n"
4765 << " " << *SI << "\n");
4766
4767 // We've found a store and load that we need to split
4768 // with mismatched relative splits. Just give up on them
4769 // and remove both instructions from our list of
4770 // candidates.
4771 UnsplittableLoads.insert(LI);
4772 return true;
4773 });
4774 // Now we have to go *back* through all the stores, because a later store may
4775 // have caused an earlier store's load to become unsplittable and if it is
4776 // unsplittable for the later store, then we can't rely on it being split in
4777 // the earlier store either.
4778 llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4779 auto *LI = cast<LoadInst>(SI->getValueOperand());
4780 return UnsplittableLoads.count(LI);
4781 });
4782 // Once we've established all the loads that can't be split for some reason,
4783 // filter any that made it into our list out.
4784 llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
4785 return UnsplittableLoads.count(LI);
4786 });
4787
4788 // If no loads or stores are left, there is no pre-splitting to be done for
4789 // this alloca.
4790 if (Loads.empty() && Stores.empty())
4791 return false;
4792
4793 // From here on, we can't fail and will be building new accesses, so rig up
4794 // an IR builder.
4795 IRBuilderTy IRB(&AI);
4796
4797 // Collect the new slices which we will merge into the alloca slices.
4798 SmallVector<Slice, 4> NewSlices;
4799
4800 // Track any allocas we end up splitting loads and stores for so we iterate
4801 // on them.
4802 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4803
4804 // At this point, we have collected all of the loads and stores we can
4805 // pre-split, and the specific splits needed for them. We actually do the
4806 // splitting in a specific order so we can handle the case where one of the
4807 // loads is the value operand to one of the stores.
4808 //
4809 // First, we rewrite all of the split loads, and just accumulate each split
4810 // load in a parallel structure. We also build the slices for them and append
4811 // them to the alloca slices.
4812 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4813 std::vector<LoadInst *> SplitLoads;
4814 const DataLayout &DL = AI.getDataLayout();
4815 for (LoadInst *LI : Loads) {
4816 SplitLoads.clear();
4817
4818 auto &Offsets = SplitOffsetsMap[LI];
4819 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4820 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4821 "Load must have type size equal to store size");
4822 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4823 "Load must be >= slice size");
4824
4825 uint64_t BaseOffset = Offsets.S->beginOffset();
4826 assert(BaseOffset + SliceSize > BaseOffset &&
4827 "Cannot represent alloca access size using 64-bit integers!");
4828
4829 Value *BasePtr = LI->getPointerOperand();
4830 IRB.SetInsertPoint(LI);
4831
4832 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4833
4834 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4835 int Idx = 0, Size = Offsets.Splits.size();
4836 for (;;) {
4837 auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
4838 auto AS = LI->getPointerAddressSpace();
4839 auto *PartPtrTy = LI->getPointerOperandType();
4840 LoadInst *PLoad = IRB.CreateAlignedLoad(
4841 PartTy,
4842 getAdjustedPtr(IRB, DL, BasePtr,
4843 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4844 PartPtrTy, BasePtr->getName() + "."),
4845 getAdjustedAlignment(LI, PartOffset),
4846 /*IsVolatile*/ false, LI->getName());
4847 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4848 LLVMContext::MD_access_group});
4849
4850 // Append this load onto the list of split loads so we can find it later
4851 // to rewrite the stores.
4852 SplitLoads.push_back(PLoad);
4853
4854 // Now build a new slice for the alloca.
4855 NewSlices.push_back(
4856 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4857 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4858 /*IsSplittable*/ false));
4859 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4860 << ", " << NewSlices.back().endOffset()
4861 << "): " << *PLoad << "\n");
4862
4863 // See if we've handled all the splits.
4864 if (Idx >= Size)
4865 break;
4866
4867 // Setup the next partition.
4868 PartOffset = Offsets.Splits[Idx];
4869 ++Idx;
4870 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4871 }
4872
4873 // Now that we have the split loads, do the slow walk over all uses of the
4874 // load and rewrite them as split stores, or save the split loads to use
4875 // below if the store is going to be split there anyways.
4876 bool DeferredStores = false;
4877 for (User *LU : LI->users()) {
4878 StoreInst *SI = cast<StoreInst>(LU);
4879 if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4880 DeferredStores = true;
4881 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4882 << "\n");
4883 continue;
4884 }
4885
4886 Value *StoreBasePtr = SI->getPointerOperand();
4887 IRB.SetInsertPoint(SI);
4888 AAMDNodes AATags = SI->getAAMetadata();
4889
4890 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4891
4892 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4893 LoadInst *PLoad = SplitLoads[Idx];
4894 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4895 auto *PartPtrTy = SI->getPointerOperandType();
4896
4897 auto AS = SI->getPointerAddressSpace();
4898 StoreInst *PStore = IRB.CreateAlignedStore(
4899 PLoad,
4900 getAdjustedPtr(IRB, DL, StoreBasePtr,
4901 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4902 PartPtrTy, StoreBasePtr->getName() + "."),
4903 getAdjustedAlignment(SI, PartOffset),
4904 /*IsVolatile*/ false);
4905 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4906 LLVMContext::MD_access_group,
4907 LLVMContext::MD_DIAssignID});
4908
4909 if (AATags)
4910 PStore->setAAMetadata(
4911 AATags.adjustForAccess(PartOffset, PLoad->getType(), DL));
4912 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4913 }
4914
4915 // We want to immediately iterate on any allocas impacted by splitting
4916 // this store, and we have to track any promotable alloca (indicated by
4917 // a direct store) as needing to be resplit because it is no longer
4918 // promotable.
4919 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
4920 ResplitPromotableAllocas.insert(OtherAI);
4921 Worklist.insert(OtherAI);
4922 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4923 StoreBasePtr->stripInBoundsOffsets())) {
4924 Worklist.insert(OtherAI);
4925 }
4926
4927 // Mark the original store as dead.
4928 DeadInsts.push_back(SI);
4929 }
4930
4931 // Save the split loads if there are deferred stores among the users.
4932 if (DeferredStores)
4933 SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
4934
4935 // Mark the original load as dead and kill the original slice.
4936 DeadInsts.push_back(LI);
4937 Offsets.S->kill();
4938 }
4939
4940 // Second, we rewrite all of the split stores. At this point, we know that
4941 // all loads from this alloca have been split already. For stores of such
4942 // loads, we can simply look up the pre-existing split loads. For stores of
4943 // other loads, we split those loads first and then write split stores of
4944 // them.
4945 for (StoreInst *SI : Stores) {
4946 auto *LI = cast<LoadInst>(SI->getValueOperand());
4947 IntegerType *Ty = cast<IntegerType>(LI->getType());
4948 assert(Ty->getBitWidth() % 8 == 0);
4949 uint64_t StoreSize = Ty->getBitWidth() / 8;
4950 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4951
4952 auto &Offsets = SplitOffsetsMap[SI];
4953 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4954 "Slice size should always match load size exactly!");
4955 uint64_t BaseOffset = Offsets.S->beginOffset();
4956 assert(BaseOffset + StoreSize > BaseOffset &&
4957 "Cannot represent alloca access size using 64-bit integers!");
4958
4959 Value *LoadBasePtr = LI->getPointerOperand();
4960 Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
4961
4962 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4963
4964 // Check whether we have an already split load.
4965 auto SplitLoadsMapI = SplitLoadsMap.find(LI);
4966 std::vector<LoadInst *> *SplitLoads = nullptr;
4967 if (SplitLoadsMapI != SplitLoadsMap.end()) {
4968 SplitLoads = &SplitLoadsMapI->second;
4969 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
4970 "Too few split loads for the number of splits in the store!");
4971 } else {
4972 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
4973 }
4974
4975 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4976 int Idx = 0, Size = Offsets.Splits.size();
4977 for (;;) {
4978 auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
4979 auto *LoadPartPtrTy = LI->getPointerOperandType();
4980 auto *StorePartPtrTy = SI->getPointerOperandType();
4981
4982 // Either lookup a split load or create one.
4983 LoadInst *PLoad;
4984 if (SplitLoads) {
4985 PLoad = (*SplitLoads)[Idx];
4986 } else {
4987 IRB.SetInsertPoint(LI);
4988 auto AS = LI->getPointerAddressSpace();
4989 PLoad = IRB.CreateAlignedLoad(
4990 PartTy,
4991 getAdjustedPtr(IRB, DL, LoadBasePtr,
4992 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4993 LoadPartPtrTy, LoadBasePtr->getName() + "."),
4994 getAdjustedAlignment(LI, PartOffset),
4995 /*IsVolatile*/ false, LI->getName());
4996 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4997 LLVMContext::MD_access_group});
4998 }
4999
5000 // And store this partition.
5001 IRB.SetInsertPoint(SI);
5002 auto AS = SI->getPointerAddressSpace();
5003 StoreInst *PStore = IRB.CreateAlignedStore(
5004 PLoad,
5005 getAdjustedPtr(IRB, DL, StoreBasePtr,
5006 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5007 StorePartPtrTy, StoreBasePtr->getName() + "."),
5008 getAdjustedAlignment(SI, PartOffset),
5009 /*IsVolatile*/ false);
5010 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5011 LLVMContext::MD_access_group});
5012
5013 // Now build a new slice for the alloca.
5014 NewSlices.push_back(
5015 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5016 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
5017 /*IsSplittable*/ false));
5018 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5019 << ", " << NewSlices.back().endOffset()
5020 << "): " << *PStore << "\n");
5021 if (!SplitLoads) {
5022 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5023 }
5024
5025 // See if we've finished all the splits.
5026 if (Idx >= Size)
5027 break;
5028
5029 // Setup the next partition.
5030 PartOffset = Offsets.Splits[Idx];
5031 ++Idx;
5032 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
5033 }
5034
5035 // We want to immediately iterate on any allocas impacted by splitting
5036 // this load, which is only relevant if it isn't a load of this alloca and
5037 // thus we didn't already split the loads above. We also have to keep track
5038 // of any promotable allocas we split loads on as they can no longer be
5039 // promoted.
5040 if (!SplitLoads) {
5041 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
5042 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5043 ResplitPromotableAllocas.insert(OtherAI);
5044 Worklist.insert(OtherAI);
5045 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5046 LoadBasePtr->stripInBoundsOffsets())) {
5047 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5048 Worklist.insert(OtherAI);
5049 }
5050 }
5051
5052 // Mark the original store as dead now that we've split it up and kill its
5053 // slice. Note that we leave the original load in place unless this store
5054 // was its only use. It may in turn be split up if it is an alloca load
5055 // for some other alloca, but it may be a normal load. This may introduce
5056 // redundant loads, but where those can be merged the rest of the optimizer
5057 // should handle the merging, and this uncovers SSA splits which is more
5058 // important. In practice, the original loads will almost always be fully
5059 // split and removed eventually, and the splits will be merged by any
5060 // trivial CSE, including instcombine.
5061 if (LI->hasOneUse()) {
5062 assert(*LI->user_begin() == SI && "Single use isn't this store!");
5063 DeadInsts.push_back(LI);
5064 }
5065 DeadInsts.push_back(SI);
5066 Offsets.S->kill();
5067 }
5068
5069 // Remove the killed slices that have been pre-split.
5070 llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
5071
5072 // Insert our new slices. This will sort and merge them into the sorted
5073 // sequence.
5074 AS.insert(NewSlices);
5075
5076 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5077#ifndef NDEBUG
5078 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5079 LLVM_DEBUG(AS.print(dbgs(), I, " "));
5080#endif
5081
5082 // Finally, don't try to promote any allocas that now require re-splitting.
5083 // They have already been added to the worklist above.
5084 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5085
5086 return true;
5087}
5088
5089/// Select a partition type for an alloca partition.
5090///
5091/// Try to compute a friendly type for this partition of the alloca. This
5092/// won't always succeed, in which case we fall back to a legal integer type
5093/// or an i8 array of an appropriate size.
5094///
5095/// \returns A tuple with the following elements:
5096/// - PartitionType: The computed type for this partition.
5097/// - IsIntegerWideningViable: True if integer widening promotion is used.
5098/// - VectorType: The vector type if vector promotion is used, otherwise
5099/// nullptr.
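///
/// Illustrative example (an annotation, not part of the original comment):
/// a 16-byte partition accessed only through `<4 x float>` loads and stores
/// would be returned as that vector type, while a partition with mixed-width
/// integer accesses typically falls back to a single integer spanning the
/// partition, or to an i8 array when no such legal integer exists.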
5100static std::tuple<Type *, bool, VectorType *>
5101selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI,
5102 LLVMContext &C) {
5103 // First check if the partition is viable for vector promotion.
5104 //
5105 // We prefer vector promotion over integer widening promotion when:
5106 // - The vector element type is a floating-point type.
5107 // - All the loads/stores to the alloca are vector loads/stores to the
5108 // entire alloca or load/store a single element of the vector.
5109 //
5110 // Otherwise when there is an integer vector with mixed type loads/stores we
5111 // prefer integer widening promotion because it's more likely the user is
5112 // doing bitwise arithmetic and we generate better code.
5113 VectorType *VecTy =
5114 isVectorPromotionViable(P, DL);
5115 // If the vector element type is a floating-point type, we prefer vector
5116 // promotion. If the vector has one element, let the below code select
5117 // whether we promote with the vector or scalar.
5118 if (VecTy && VecTy->getElementType()->isFloatingPointTy() &&
5119 VecTy->getElementCount().getFixedValue() > 1)
5120 return {VecTy, false, VecTy};
5121
5122 // Check if there is a common type that all slices of the partition use that
5123 // spans the partition.
5124 auto [CommonUseTy, LargestIntTy] =
5125 findCommonType(P.begin(), P.end(), P.endOffset());
5126 if (CommonUseTy) {
5127 TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
5128 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
5129 // We prefer vector promotion here because if vector promotion is viable
5130 // and there is a common type used, then it implies the second listed
5131 // condition for preferring vector promotion is true.
5132 if (VecTy)
5133 return {VecTy, false, VecTy};
5134 return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL),
5135 nullptr};
5136 }
5137 }
5138
5139 // Can we find an appropriate subtype in the original allocated
5140 // type?
5141 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5142 P.beginOffset(), P.size())) {
5143 // If the partition is an integer array that can be spanned by a legal
5144 // integer type, prefer to represent it as a legal integer type because
5145 // it's more likely to be promotable.
5146 if (TypePartitionTy->isArrayTy() &&
5147 TypePartitionTy->getArrayElementType()->isIntegerTy() &&
5148 DL.isLegalInteger(P.size() * 8))
5149 TypePartitionTy = Type::getIntNTy(C, P.size() * 8);
5150 // There was no common type used, so we prefer integer widening promotion.
5151 if (isIntegerWideningViable(P, TypePartitionTy, DL))
5152 return {TypePartitionTy, true, nullptr};
5153 if (VecTy)
5154 return {VecTy, false, VecTy};
5155 // If we couldn't promote with TypePartitionTy, try with the largest
5156 // integer type used.
5157 if (LargestIntTy &&
5158 DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() &&
5159 isIntegerWideningViable(P, LargestIntTy, DL))
5160 return {LargestIntTy, true, nullptr};
5161
5162 // Fallback to TypePartitionTy and we probably won't promote.
5163 return {TypePartitionTy, false, nullptr};
5164 }
5165
5166 // Select the largest integer type used if it spans the partition.
5167 if (LargestIntTy &&
5168 DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size())
5169 return {LargestIntTy, false, nullptr};
5170
5171 // Select a legal integer type if it spans the partition.
5172 if (DL.isLegalInteger(P.size() * 8))
5173 return {Type::getIntNTy(C, P.size() * 8), false, nullptr};
5174
5175 // Fallback to an i8 array.
5176 return {ArrayType::get(Type::getInt8Ty(C), P.size()), false, nullptr};
5177}
5178
5179/// Rewrite an alloca partition's users.
5180///
5181/// This routine drives both of the rewriting goals of the SROA pass. It tries
5182/// to rewrite uses of an alloca partition to be conducive for SSA value
5183/// promotion. If the partition needs a new, more refined alloca, this will
5184/// build that new alloca, preserving as much type information as possible, and
5185/// rewrite the uses of the old alloca to point at the new one and have the
5186/// appropriate new offsets. It also evaluates how successful the rewrite was
5187/// at enabling promotion and if it was successful queues the alloca to be
5188/// promoted.
5189std::pair<AllocaInst *, uint64_t>
5190SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) {
5191 const DataLayout &DL = AI.getDataLayout();
5192 // Select the type for the new alloca that spans the partition.
5193 auto [PartitionTy, IsIntegerWideningViable, VecTy] =
5194 selectPartitionType(P, DL, AI, *C);
5195
5196 // Check for the case where we're going to rewrite to a new alloca of the
5197 // exact same type as the original, and with the same access offsets. In that
5198 // case, re-use the existing alloca, but still run through the rewriter to
5199 // perform phi and select speculation.
5200 // P.beginOffset() can be non-zero even with the same type in a case with
5201 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5202 AllocaInst *NewAI;
5203 if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == 0) {
5204 NewAI = &AI;
5205 // FIXME: We should be able to bail at this point with "nothing changed".
5206 // FIXME: We might want to defer PHI speculation until after here.
5207 // FIXME: return nullptr;
5208 } else {
5209 // Make sure the alignment is compatible with P.beginOffset().
5210 const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
5211 // If we will get at least this much alignment from the type alone, leave
5212 // the alloca's alignment unconstrained.
5213 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(PartitionTy);
5214 NewAI = new AllocaInst(
5215 PartitionTy, AI.getAddressSpace(), nullptr,
5216 IsUnconstrained ? DL.getPrefTypeAlign(PartitionTy) : Alignment,
5217 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5218 AI.getIterator());
5219 // Copy the old AI debug location over to the new one.
5220 NewAI->setDebugLoc(AI.getDebugLoc());
5221 ++NumNewAllocas;
5222 }
5223
5224 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5225 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5226
5227 // Track the high watermark on the worklist as it is only relevant for
5228 // promoted allocas. We will reset it to this point if the alloca is not in
5229 // fact scheduled for promotion.
5230 unsigned PPWOldSize = PostPromotionWorklist.size();
5231 unsigned NumUses = 0;
5232 SmallSetVector<PHINode *, 8> PHIUsers;
5233 SmallSetVector<SelectInst *, 8> SelectUsers;
5234
5235 AllocaSliceRewriter Rewriter(
5236 DL, AS, *this, AI, *NewAI, PartitionTy, P.beginOffset(), P.endOffset(),
5237 IsIntegerWideningViable, VecTy, PHIUsers, SelectUsers);
5238 bool Promotable = true;
5239 // Check whether we can have tree-structured merge.
5240 if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
5241 NumUses += DeletedValues->size() + 1;
5242 for (Value *V : *DeletedValues)
5243 DeadInsts.push_back(V);
5244 } else {
5245 for (Slice *S : P.splitSliceTails()) {
5246 Promotable &= Rewriter.visit(S);
5247 ++NumUses;
5248 }
5249 for (Slice &S : P) {
5250 Promotable &= Rewriter.visit(&S);
5251 ++NumUses;
5252 }
5253 }
5254
5255 NumAllocaPartitionUses += NumUses;
5256 MaxUsesPerAllocaPartition.updateMax(NumUses);
5257
5258 // Now that we've processed all the slices in the new partition, check if any
5259 // PHIs or Selects would block promotion.
5260 for (PHINode *PHI : PHIUsers)
5261 if (!isSafePHIToSpeculate(*PHI)) {
5262 Promotable = false;
5263 PHIUsers.clear();
5264 SelectUsers.clear();
5265 break;
5266 }
5267
5269 NewSelectsToRewrite;
5270 NewSelectsToRewrite.reserve(SelectUsers.size());
5271 for (SelectInst *Sel : SelectUsers) {
5272 std::optional<RewriteableMemOps> Ops =
5273 isSafeSelectToSpeculate(*Sel, PreserveCFG);
5274 if (!Ops) {
5275 Promotable = false;
5276 PHIUsers.clear();
5277 SelectUsers.clear();
5278 NewSelectsToRewrite.clear();
5279 break;
5280 }
5281 NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
5282 }
5283
5284 if (Promotable) {
5285 for (Use *U : AS.getDeadUsesIfPromotable()) {
5286 auto *OldInst = dyn_cast<Instruction>(U->get());
5287 Value::dropDroppableUse(*U);
5288 if (OldInst)
5289 if (isInstructionTriviallyDead(OldInst))
5290 DeadInsts.push_back(OldInst);
5291 }
5292 if (PHIUsers.empty() && SelectUsers.empty()) {
5293 // Promote the alloca.
5294 PromotableAllocas.insert(NewAI);
5295 } else {
5296 // If we have either PHIs or Selects to speculate, add them to those
5297 // worklists and re-queue the new alloca so that we promote it on the
5298 // next iteration.
5299 SpeculatablePHIs.insert_range(PHIUsers);
5300 SelectsToRewrite.reserve(SelectsToRewrite.size() +
5301 NewSelectsToRewrite.size());
5302 for (auto &&KV : llvm::make_range(
5303 std::make_move_iterator(NewSelectsToRewrite.begin()),
5304 std::make_move_iterator(NewSelectsToRewrite.end())))
5305 SelectsToRewrite.insert(std::move(KV));
5306 Worklist.insert(NewAI);
5307 }
5308 } else {
5309 // Drop any post-promotion work items if promotion didn't happen.
5310 while (PostPromotionWorklist.size() > PPWOldSize)
5311 PostPromotionWorklist.pop_back();
5312
5313 // We couldn't promote and we didn't create a new partition, nothing
5314 // happened.
5315 if (NewAI == &AI)
5316 return {nullptr, 0};
5317
5318 // If we can't promote the alloca, iterate on it to check for new
5319 // refinements exposed by splitting the current alloca. Don't iterate on an
5320 // alloca which didn't actually change and didn't get promoted.
5321 Worklist.insert(NewAI);
5322 }
5323
5324 return {NewAI, DL.getTypeSizeInBits(PartitionTy).getFixedValue()};
5325}
5326
5327// There isn't a shared interface to get the "address" parts out of a
5328// dbg.declare and dbg.assign, so provide some wrappers.
5329static bool isKillAddress(const DbgVariableRecord *DVR) {
5330 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5331 return DVR->isKillAddress();
5332 return DVR->isKillLocation();
5333}
5334
5335static const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5336 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5337 return DVR->getAddressExpression();
5338 return DVR->getExpression();
5339}
5340
5341/// Create or replace an existing fragment in a DIExpression with \p Frag.
5342/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5343/// operation, add \p BitExtractOffset to the offset part.
5344///
5345/// Returns the new expression, or nullptr if this fails (see details below).
5346///
5347/// This function is similar to DIExpression::createFragmentExpression except
5348/// for 3 important distinctions:
5349/// 1. The new fragment isn't relative to an existing fragment.
5350/// 2. It assumes the computed location is a memory location. This means we
5351/// don't need to perform checks that creating the fragment preserves the
5352/// expression semantics.
5353/// 3. Existing extract_bits are modified independently of fragment changes
5354/// using \p BitExtractOffset. A change to the fragment offset or size
5355/// may affect a bit extract. But a bit extract offset can change
5356/// independently of the fragment dimensions.
5357///
5358/// Returns the new expression, or nullptr if one couldn't be created.
5359/// Ideally this is only used to signal that a bit-extract has become
5360/// zero-sized (and thus the new debug record has no size and can be
5361/// dropped), however, it fails for other reasons too - see the FIXME below.
5362///
5363/// FIXME: To keep the change that introduces this function NFC it bails
5364 /// in some situations unnecessarily, e.g. when fragment and bit extract
5365/// sizes differ.
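///
/// Illustrative example (an annotation, not part of the original comment):
/// replacing the fragment in an expression holding only
/// `DW_OP_LLVM_fragment, 0, 32` with \p Frag = (16, 8) yields
/// `DW_OP_LLVM_fragment, 16, 8`, while an expression holding only
/// `DW_OP_LLVM_extract_bits_zext, 8, 4` combined with a \p BitExtractOffset
/// of -8 keeps the extract, rebases it to bit 0, and appends no fragment.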
5366static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5367 DIExpression::FragmentInfo Frag,
5368 int64_t BitExtractOffset) {
5369 SmallVector<uint64_t, 8> Ops;
5370 bool HasFragment = false;
5371 bool HasBitExtract = false;
5372
5373 for (auto &Op : Expr->expr_ops()) {
5374 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5375 HasFragment = true;
5376 continue;
5377 }
5378 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5379 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5380 HasBitExtract = true;
5381 int64_t ExtractOffsetInBits = Op.getArg(0);
5382 int64_t ExtractSizeInBits = Op.getArg(1);
5383
5384 // DIExpression::createFragmentExpression doesn't know how to handle
5385 // a fragment that is smaller than the extract. Copy the behaviour
5386 // (bail) to avoid non-NFC changes.
5387 // FIXME: Don't do this.
5388 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5389 return nullptr;
5390
5391 assert(BitExtractOffset <= 0);
5392 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5393
5394 // DIExpression::createFragmentExpression doesn't know what to do
5395 // if the new extract starts "outside" the existing one. Copy the
5396 // behaviour (bail) to avoid non-NFC changes.
5397 // FIXME: Don't do this.
5398 if (AdjustedOffset < 0)
5399 return nullptr;
5400
5401 Ops.push_back(Op.getOp());
5402 Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
5403 Ops.push_back(ExtractSizeInBits);
5404 continue;
5405 }
5406 Op.appendToVector(Ops);
5407 }
5408
5409 // Unsupported by createFragmentExpression, so don't support it here yet to
5410 // preserve NFC-ness.
5411 if (HasFragment && HasBitExtract)
5412 return nullptr;
5413
5414 if (!HasBitExtract) {
5415 Ops.push_back(dwarf::DW_OP_LLVM_fragment);
5416 Ops.push_back(Frag.OffsetInBits);
5417 Ops.push_back(Frag.SizeInBits);
5418 }
5419 return DIExpression::get(Expr->getContext(), Ops);
5420}
5421
5422/// Insert a new DbgRecord.
5423/// \p Orig Original to copy record type, debug loc and variable from, and
5424/// additionally value and value expression for dbg_assign records.
5425/// \p NewAddr Location's new base address.
5426/// \p NewAddrExpr New expression to apply to address.
5427/// \p BeforeInst Insert position.
5428/// \p NewFragment New fragment (absolute, non-relative).
5429/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5430static void
5431insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5432 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5433 std::optional<DIExpression::FragmentInfo> NewFragment,
5434 int64_t BitExtractAdjustment) {
5435 (void)DIB;
5436
5437 // A dbg_assign puts fragment info in the value expression only. The address
5438 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5439 // new fragment info into NewAddrExpr (as it only has one expression).
5440 DIExpression *NewFragmentExpr =
5441 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5442 if (NewFragment)
5443 NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5444 BitExtractAdjustment);
5445 if (!NewFragmentExpr)
5446 return;
5447
5448 if (Orig->isDbgDeclare()) {
5449 DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5450 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5451 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5452 BeforeInst->getIterator());
5453 return;
5454 }
5455
5456 if (Orig->isDbgValue()) {
5457 DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5458 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5459 // Drop debug information if the expression doesn't start with a
5460 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5461 // describes the address of alloca rather than the value inside the alloca.
5462 if (!NewFragmentExpr->startsWithDeref())
5463 DVR->setKillAddress();
5464 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5465 BeforeInst->getIterator());
5466 return;
5467 }
5468
5469 // Apply a DIAssignID to the store if it doesn't already have it.
5470 if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5471 NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5472 DIAssignID::getDistinct(NewAddr->getContext()));
5473 }
5474
5475 DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5476 NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5477 NewAddrExpr, Orig->getDebugLoc());
5478 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5479 (void)NewAssign;
5480}
5481
5482/// Walks the slices of an alloca and form partitions based on them,
5483/// rewriting each of their uses.
5484bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5485 if (AS.begin() == AS.end())
5486 return false;
5487
5488 unsigned NumPartitions = 0;
5489 bool Changed = false;
5490 const DataLayout &DL = AI.getModule()->getDataLayout();
5491
5492 // First try to pre-split loads and stores.
5493 Changed |= presplitLoadsAndStores(AI, AS);
5494
5495 // Now that we have identified any pre-splitting opportunities,
5496 // mark loads and stores unsplittable except for the following case.
5497 // We leave a slice splittable if all other slices are disjoint or fully
5498 // included in the slice, such as whole-alloca loads and stores.
5499 // If we fail to split these during pre-splitting, we want to force them
5500 // to be rewritten into a partition.
5501 bool IsSorted = true;
5502
5503 uint64_t AllocaSize = AI.getAllocationSize(DL)->getFixedValue();
5504 const uint64_t MaxBitVectorSize = 1024;
5505 if (AllocaSize <= MaxBitVectorSize) {
5506 // If a byte boundary is included in any load or store, a slice starting or
5507 // ending at the boundary is not splittable.
5508 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5509 for (Slice &S : AS)
5510 for (unsigned O = S.beginOffset() + 1;
5511 O < S.endOffset() && O < AllocaSize; O++)
5512 SplittableOffset.reset(O);
5513
5514 for (Slice &S : AS) {
5515 if (!S.isSplittable())
5516 continue;
5517
5518 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5519 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5520 continue;
5521
5522 if (isa<LoadInst>(S.getUse()->getUser()) ||
5523 isa<StoreInst>(S.getUse()->getUser())) {
5524 S.makeUnsplittable();
5525 IsSorted = false;
5526 }
5527 }
5528 } else {
5529 // We only allow whole-alloca splittable loads and stores
5530 // for a large alloca to avoid creating an overly large BitVector.
5531 for (Slice &S : AS) {
5532 if (!S.isSplittable())
5533 continue;
5534
5535 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5536 continue;
5537
5538 if (isa<LoadInst>(S.getUse()->getUser()) ||
5539 isa<StoreInst>(S.getUse()->getUser())) {
5540 S.makeUnsplittable();
5541 IsSorted = false;
5542 }
5543 }
5544 }
5545
5546 if (!IsSorted)
5547 llvm::stable_sort(AS);
5548
5549 /// Describes the allocas introduced by rewritePartition in order to migrate
5550 /// the debug info.
5551 struct Fragment {
5552 AllocaInst *Alloca;
5553 uint64_t Offset;
5554 uint64_t Size;
5555 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5556 : Alloca(AI), Offset(O), Size(S) {}
5557 };
5558 SmallVector<Fragment, 4> Fragments;
5559
5560 // Rewrite each partition.
5561 for (auto &P : AS.partitions()) {
5562 auto [NewAI, ActiveBits] = rewritePartition(AI, AS, P);
5563 if (NewAI) {
5564 Changed = true;
5565 if (NewAI != &AI) {
5566 uint64_t SizeOfByte = 8;
5567 // Don't include any padding.
5568 uint64_t Size = std::min(ActiveBits, P.size() * SizeOfByte);
5569 Fragments.push_back(
5570 Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5571 }
5572 }
5573 ++NumPartitions;
5574 }
5575
5576 NumAllocaPartitions += NumPartitions;
5577 MaxPartitionsPerAlloca.updateMax(NumPartitions);
5578
5579 // Migrate debug information from the old alloca to the new alloca(s)
5580 // and the individual partitions.
5581 auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
5582 // Can't overlap with undef memory.
5583 if (isKillAddress(DbgVariable))
5584 return;
5585
5586 const Value *DbgPtr = DbgVariable->getAddress();
5587 DIExpression::FragmentInfo VarFrag =
5588 DbgVariable->getFragmentOrEntireVariable();
5589 // Get the address expression constant offset if one exists and the ops
5590 // that come after it.
5591 int64_t CurrentExprOffsetInBytes = 0;
5592 SmallVector<uint64_t> PostOffsetOps;
5593 if (!getAddressExpression(DbgVariable)
5594 ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5595 return; // Couldn't interpret this DIExpression - drop the var.
5596
5597 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5598 int64_t ExtractOffsetInBits = 0;
5599 for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5600 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5601 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5602 ExtractOffsetInBits = Op.getArg(0);
5603 break;
5604 }
5605 }
5606
5607 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5608 for (auto Fragment : Fragments) {
5609 int64_t OffsetFromLocationInBits;
5610 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5611 // Find the variable fragment that the new alloca slice covers.
5612 // Drop debug info for this variable fragment if we can't compute an
5613 // intersect between it and the alloca slice.
5615 DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
5616 CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
5617 NewDbgFragment, OffsetFromLocationInBits))
5618 continue; // Do not migrate this fragment to this slice.
5619
5620 // Zero sized fragment indicates there's no intersect between the variable
5621 // fragment and the alloca slice. Skip this slice for this variable
5622 // fragment.
5623 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5624 continue; // Do not migrate this fragment to this slice.
5625
5626 // No fragment indicates DbgVariable's variable or fragment exactly
5627 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5628 if (!NewDbgFragment)
5629 NewDbgFragment = DbgVariable->getFragment();
5630
5631 // Reduce the new expression offset by the bit-extract offset since
5632 // we'll be keeping that.
5633 int64_t OffsetFromNewAllocaInBits =
5634 OffsetFromLocationInBits - ExtractOffsetInBits;
5635 // We need to adjust an existing bit extract if the offset expression
5636 // can't eat the slack (i.e., if the new offset would be negative).
5637 int64_t BitExtractOffset =
5638 std::min<int64_t>(0, OffsetFromNewAllocaInBits);
5639 // The magnitude of a negative value indicates the number of bits into
5640 // the existing variable fragment that the memory region begins. The new
5641 // variable fragment already excludes those bits - the new DbgPtr offset
5642 // only needs to be applied if it's positive.
5643 OffsetFromNewAllocaInBits =
5644 std::max(int64_t(0), OffsetFromNewAllocaInBits);
5645
5646 // Rebuild the expression:
5647 // {Offset(OffsetFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5648 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5649 // address expression but the value expression instead.
5650 DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
5651 if (OffsetFromNewAllocaInBits > 0) {
5652 int64_t OffsetInBytes = (OffsetFromNewAllocaInBits + 7) / 8;
5653 NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
5654 }
5655
5656 // Remove any existing intrinsics on the new alloca describing
5657 // the variable fragment.
5658 auto RemoveOne = [DbgVariable](auto *OldDII) {
5659 auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5660 return LHS->getVariable() == RHS->getVariable() &&
5661 LHS->getDebugLoc()->getInlinedAt() ==
5662 RHS->getDebugLoc()->getInlinedAt();
5663 };
5664 if (SameVariableFragment(OldDII, DbgVariable))
5665 OldDII->eraseFromParent();
5666 };
5667 for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
5668 for_each(findDVRValues(Fragment.Alloca), RemoveOne);
5669 insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
5670 NewDbgFragment, BitExtractOffset);
5671 }
5672 };
5673
5674 // Migrate debug information from the old alloca to the new alloca(s)
5675 // and the individual partitions.
5676 for_each(findDVRDeclares(&AI), MigrateOne);
5677 for_each(findDVRValues(&AI), MigrateOne);
5678 for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
5679
5680 return Changed;
5681}
5682
5683/// Clobber a use with poison, deleting the used value if it becomes dead.
5684void SROA::clobberUse(Use &U) {
5685 Value *OldV = U;
5686 // Replace the use with a poison value.
5687 U = PoisonValue::get(OldV->getType());
5688
5689 // Check for this making an instruction dead. We have to garbage collect
5690 // all the dead instructions to ensure the uses of any alloca end up being
5691 // minimal.
5692 if (Instruction *OldI = dyn_cast<Instruction>(OldV))
5693 if (isInstructionTriviallyDead(OldI)) {
5694 DeadInsts.push_back(OldI);
5695 }
5696}
5697
5698/// A basic LoadAndStorePromoter that does not remove store nodes.
5699class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
5700public:
5701 BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
5702 Type *ZeroType)
5703 : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
5704 bool shouldDelete(Instruction *I) const override {
5705 return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
5706 }
5707
5708 Value *getValueToUseForAlloca(Instruction *I) const override {
5709 return UndefValue::get(ZeroType);
5710 }
5711
5712private:
5713 Type *ZeroType;
5714};
5715
5716bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
5717 // Look through each "partition", looking for slices with the same start/end
5718 // that do not overlap with any before them. The slices are sorted by
5719 // increasing beginOffset. We don't use AS.partitions(), as it will use a more
5720 // sophisticated algorithm that takes splittable slices into account.
5721 LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
5722 bool AllSameAndValid = true;
5723 Type *PartitionType = nullptr;
5724 SmallVector<Instruction *, 4> Insts;
5725 uint64_t BeginOffset = 0;
5726 uint64_t EndOffset = 0;
5727
5728 auto Flush = [&]() {
5729 if (AllSameAndValid && !Insts.empty()) {
5730 LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
5731 << EndOffset << ")\n");
5732 SmallVector<PHINode *, 4> NewPHIs;
5733 SSAUpdater SSA(&NewPHIs);
5734 Insts.push_back(&AI);
5735 BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
5736 Promoter.run(Insts);
5737 }
5738 AllSameAndValid = true;
5739 PartitionType = nullptr;
5740 Insts.clear();
5741 };
5742
5743 for (Slice &S : AS) {
5744 auto *User = cast<Instruction>(S.getUse()->getUser());
5745 if (isAssumeLikeIntrinsic(User)) {
5746 LLVM_DEBUG({
5747 dbgs() << "Ignoring slice: ";
5748 AS.print(dbgs(), &S);
5749 });
5750 continue;
5751 }
5752 if (S.beginOffset() >= EndOffset) {
5753 Flush();
5754 BeginOffset = S.beginOffset();
5755 EndOffset = S.endOffset();
5756 } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
5757 if (AllSameAndValid) {
5758 LLVM_DEBUG({
5759 dbgs() << "Slice does not match range [" << BeginOffset << ", "
5760 << EndOffset << ")";
5761 AS.print(dbgs(), &S);
5762 });
5763 AllSameAndValid = false;
5764 }
5765 EndOffset = std::max(EndOffset, S.endOffset());
5766 continue;
5767 }
5768
5769 if (auto *LI = dyn_cast<LoadInst>(User)) {
5770 Type *UserTy = LI->getType();
5771 // LoadAndStorePromoter requires all the types to be the same.
5772 if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
5773 AllSameAndValid = false;
5774 PartitionType = UserTy;
5775 Insts.push_back(User);
5776 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
5777 Type *UserTy = SI->getValueOperand()->getType();
5778 if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
5779 AllSameAndValid = false;
5780 PartitionType = UserTy;
5781 Insts.push_back(User);
5782 } else {
5783 AllSameAndValid = false;
5784 }
5785 }
5786
5787 Flush();
5788 return true;
5789}
5790
5791/// Analyze an alloca for SROA.
5792///
5793/// This analyzes the alloca to ensure we can reason about it, builds
5794/// the slices of the alloca, and then hands it off to be split and
5795/// rewritten as needed.
5796std::pair<bool /*Changed*/, bool /*CFGChanged*/>
5797SROA::runOnAlloca(AllocaInst &AI) {
5798 bool Changed = false;
5799 bool CFGChanged = false;
5800
5801 LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5802 ++NumAllocasAnalyzed;
5803
5804 // Special case dead allocas, as they're trivial.
5805 if (AI.use_empty()) {
5806 AI.eraseFromParent();
5807 Changed = true;
5808 return {Changed, CFGChanged};
5809 }
5810 const DataLayout &DL = AI.getDataLayout();
5811
5812 // Skip alloca forms that this analysis can't handle.
5813 std::optional<TypeSize> Size = AI.getAllocationSize(DL);
5814 if (AI.isArrayAllocation() || !Size || Size->isScalable() || Size->isZero())
5815 return {Changed, CFGChanged};
5816
5817 // First, split any FCA loads and stores touching this alloca to promote
5818 // better splitting and promotion opportunities.
5819 IRBuilderTy IRB(&AI);
5820 AggLoadStoreRewriter AggRewriter(DL, IRB);
5821 Changed |= AggRewriter.rewrite(AI);
5822
5823 // Build the slices using a recursive instruction-visiting builder.
5824 AllocaSlices AS(DL, AI);
5825 LLVM_DEBUG(AS.print(dbgs()));
5826 if (AS.isEscaped())
5827 return {Changed, CFGChanged};
5828
5829 if (AS.isEscapedReadOnly()) {
5830 Changed |= propagateStoredValuesToLoads(AI, AS);
5831 return {Changed, CFGChanged};
5832 }
5833
5834 // Delete all the dead users of this alloca before splitting and rewriting it.
5835 for (Instruction *DeadUser : AS.getDeadUsers()) {
5836 // Free up everything used by this instruction.
5837 for (Use &DeadOp : DeadUser->operands())
5838 clobberUse(DeadOp);
5839
5840 // Now replace the uses of this instruction.
5841 DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
5842
5843 // And mark it for deletion.
5844 DeadInsts.push_back(DeadUser);
5845 Changed = true;
5846 }
5847 for (Use *DeadOp : AS.getDeadOperands()) {
5848 clobberUse(*DeadOp);
5849 Changed = true;
5850 }
5851
5852 // No slices to split. Leave the dead alloca for a later pass to clean up.
5853 if (AS.begin() == AS.end())
5854 return {Changed, CFGChanged};
5855
5856 Changed |= splitAlloca(AI, AS);
5857
5858 LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
5859 while (!SpeculatablePHIs.empty())
5860 speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
5861
5862 LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
5863 auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5864 while (!RemainingSelectsToRewrite.empty()) {
5865 const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5866 CFGChanged |=
5867 rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
5868 }
5869
5870 return {Changed, CFGChanged};
5871}
5872
5873/// Delete the dead instructions accumulated in this run.
5874///
5875/// Recursively deletes the dead instructions we've accumulated. This is done
5876/// at the very end to maximize locality of the recursive delete and to
5877/// minimize the problems of invalidated instruction pointers as such pointers
5878/// are used heavily in the intermediate stages of the algorithm.
5879///
5880/// We also record the alloca instructions deleted here so that they aren't
5881/// subsequently handed to mem2reg to promote.
5882bool SROA::deleteDeadInstructions(
5883 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
5884 bool Changed = false;
5885 while (!DeadInsts.empty()) {
5886 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
5887 if (!I)
5888 continue;
5889 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
5890
5891 // If the instruction is an alloca, find the possible dbg.declare connected
5892 // to it, and remove it too. We must do this before calling RAUW or we will
5893 // not be able to find it.
5894 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5895 DeletedAllocas.insert(AI);
5896 for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
5897 OldDII->eraseFromParent();
5898 }
5899
5900 at::deleteAssignmentMarkers(I);
5901 I->replaceAllUsesWith(UndefValue::get(I->getType()));
5902
5903 for (Use &Operand : I->operands())
5904 if (Instruction *U = dyn_cast<Instruction>(Operand)) {
5905 // Zero out the operand and see if it becomes trivially dead.
5906 Operand = nullptr;
5907 if (isInstructionTriviallyDead(U))
5908 DeadInsts.push_back(U);
5909 }
5910
5911 ++NumDeleted;
5912 I->eraseFromParent();
5913 Changed = true;
5914 }
5915 return Changed;
5916}
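// For contrast only: a caller that does not need to record deleted allocas or
// scrub debug records could use the generic cascade-deletion helper declared
// in llvm/Transforms/Utils/Local.h instead of the hand-rolled loop above. A
// hedged sketch; the wrapper name is illustrative, the API it wraps is real.
static bool cascadeDeleteSketch(Instruction *I) {
  // Deletes I if it is trivially dead, then recursively deletes operands that
  // become trivially dead as a result; returns true if anything was removed.
  return RecursivelyDeleteTriviallyDeadInstructions(I);
}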
5917/// Promote the allocas, using the best available technique.
5918///
5919/// This attempts to promote whatever allocas have been identified as viable in
5920/// the PromotableAllocas list. If that list is empty, there is nothing to do.
5921/// This function returns whether any promotion occurred.
5922bool SROA::promoteAllocas() {
5923 if (PromotableAllocas.empty())
5924 return false;
5925
5926 if (SROASkipMem2Reg) {
5927 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
5928 } else {
5929 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
5930 NumPromoted += PromotableAllocas.size();
5931 PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
5932 }
5933
5934 PromotableAllocas.clear();
5935 return true;
5936}
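// A minimal, self-contained sketch of the same promotion step outside SROA,
// assuming the caller already has a DominatorTree and (optionally) an
// AssumptionCache. Only the entry block is scanned, mirroring what the
// standalone mem2reg pass does; the helper name is illustrative.
static void promoteEntryBlockAllocasSketch(Function &F, DominatorTree &DT,
                                           AssumptionCache *AC) {
  SmallVector<AllocaInst *, 8> Allocas;
  for (Instruction &I : F.getEntryBlock())
    if (auto *AI = dyn_cast<AllocaInst>(&I))
      if (isAllocaPromotable(AI))
        Allocas.push_back(AI);
  if (!Allocas.empty())
    PromoteMemToReg(Allocas, DT, AC);
}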
5937
5938std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
5939 LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
5940
5941 const DataLayout &DL = F.getDataLayout();
5942 BasicBlock &EntryBB = F.getEntryBlock();
5943 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
5944 I != E; ++I) {
5945 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5946 std::optional<TypeSize> Size = AI->getAllocationSize(DL);
5947 if (Size && Size->isScalable() && isAllocaPromotable(AI))
5948 PromotableAllocas.insert(AI);
5949 else
5950 Worklist.insert(AI);
5951 }
5952 }
5953
5954 bool Changed = false;
5955 bool CFGChanged = false;
5956 // A set of deleted alloca instruction pointers which should be removed from
5957 // the list of promotable allocas.
5958 SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
5959
5960 do {
5961 while (!Worklist.empty()) {
5962 auto [IterationChanged, IterationCFGChanged] =
5963 runOnAlloca(*Worklist.pop_back_val());
5964 Changed |= IterationChanged;
5965 CFGChanged |= IterationCFGChanged;
5966
5967 Changed |= deleteDeadInstructions(DeletedAllocas);
5968
5969 // Remove the deleted allocas from various lists so that we don't try to
5970 // continue processing them.
5971 if (!DeletedAllocas.empty()) {
5972 Worklist.set_subtract(DeletedAllocas);
5973 PostPromotionWorklist.set_subtract(DeletedAllocas);
5974 PromotableAllocas.set_subtract(DeletedAllocas);
5975 DeletedAllocas.clear();
5976 }
5977 }
5978
5979 Changed |= promoteAllocas();
5980
5981 Worklist = PostPromotionWorklist;
5982 PostPromotionWorklist.clear();
5983 } while (!Worklist.empty());
5984
5985 assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
5986 assert((!CFGChanged || !PreserveCFG) &&
5987 "Should not have modified the CFG when told to preserve it.");
5988
5989 if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
5990 for (auto &BB : F) {
5991 RemoveRedundantDbgInstrs(&BB);
5992 }
5993 }
5994
5995 return {Changed, CFGChanged};
5996}
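// Rough effect on a trivial case, shown as IR for orientation (illustrative
// only; the exact output depends on the input and on debug-info updates):
//
//   define i32 @f(i32 %x) {           define i32 @f(i32 %x) {
//     %a = alloca i32           ==>     ret i32 %x
//     store i32 %x, ptr %a            }
//     %v = load i32, ptr %a
//     ret i32 %v
//   }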
5997
5998PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
5999 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
6000 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
6001 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6002 auto [Changed, CFGChanged] =
6003 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6004 if (!Changed)
6005 return PreservedAnalyses::all();
6006 PreservedAnalyses PA;
6007 if (!CFGChanged)
6008 PA.preserveSet<CFGAnalyses>();
6009 PA.preserve<DominatorTreeAnalysis>();
6010 return PA;
6011}
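// Typical new-pass-manager usage, sketched as comments (FPM is an
// illustrative FunctionPassManager, not something defined in this file):
//   FPM.addPass(SROAPass(SROAOptions::PreserveCFG)); // keep the CFG intact
//   FPM.addPass(SROAPass(SROAOptions::ModifyCFG));   // allow CFG rewrites
// In a textual pipeline description this appears as "sroa<preserve-cfg>" or
// "sroa<modify-cfg>", matching printPipeline() below.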
6012
6013void SROAPass::printPipeline(
6014 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6015 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
6016 OS, MapClassName2PassName);
6017 OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
6018 : "<modify-cfg>");
6019}
6020
6021SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
6022
6023namespace {
6024
6025/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
6026class SROALegacyPass : public FunctionPass {
6027 SROAOptions PreserveCFG;
6028
6029public:
6030 static char ID;
6031
6032 SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
6033 : FunctionPass(ID), PreserveCFG(PreserveCFG) {
6034 initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
6035 }
6036
6037 bool runOnFunction(Function &F) override {
6038 if (skipFunction(F))
6039 return false;
6040
6041 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
6042 AssumptionCache &AC =
6043 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
6044 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6045 auto [Changed, _] =
6046 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6047 return Changed;
6048 }
6049
6050 void getAnalysisUsage(AnalysisUsage &AU) const override {
6051 AU.addRequired<AssumptionCacheTracker>();
6052 AU.addRequired<DominatorTreeWrapperPass>();
6053 AU.addPreserved<GlobalsAAWrapperPass>();
6054 AU.addPreserved<DominatorTreeWrapperPass>();
6055 }
6056
6057 StringRef getPassName() const override { return "SROA"; }
6058};
6059
6060} // end anonymous namespace
6061
6062char SROALegacyPass::ID = 0;
6063
6064FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
6065 return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
6066 : SROAOptions::ModifyCFG);
6067}
6068
6069INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
6070 "Scalar Replacement Of Aggregates", false, false)
6071INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6072INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6073INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
6074 false, false)
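// Legacy pass-manager clients reach the wrapper above through
// createSROAPass(); a hedged usage sketch (LegacyPM is illustrative):
//   legacy::FunctionPassManager LegacyPM(&M);
//   LegacyPM.add(createSROAPass(/*PreserveCFG=*/true));
//   LegacyPM.run(F);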