// SROA.cpp — LLVM 23.0.0git.
// (The two lines above in the original capture were doxygen page-header text,
// "LLVM 23.0.0git / SROA.cpp / Go to the documentation of this file.", not
// part of the source proper.)
1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
19/// Because this also performs alloca promotion, it can be thought of as also
20/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
22///
23//===----------------------------------------------------------------------===//
24
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
44#include "llvm/Analysis/Loads.h"
47#include "llvm/Config/llvm-config.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/LLVMContext.h"
67#include "llvm/IR/Metadata.h"
68#include "llvm/IR/Module.h"
69#include "llvm/IR/Operator.h"
70#include "llvm/IR/PassManager.h"
71#include "llvm/IR/Type.h"
72#include "llvm/IR/Use.h"
73#include "llvm/IR/User.h"
74#include "llvm/IR/Value.h"
75#include "llvm/IR/ValueHandle.h"
77#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
89#include <algorithm>
90#include <cassert>
91#include <cstddef>
92#include <cstdint>
93#include <cstring>
94#include <iterator>
95#include <queue>
96#include <string>
97#include <tuple>
98#include <utility>
99#include <variant>
100#include <vector>
101
102using namespace llvm;
103
104#define DEBUG_TYPE "sroa"
105
106STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
107STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
108STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
109STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
110STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
111STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
112STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
113STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
114STATISTIC(NumLoadsPredicated,
115 "Number of loads rewritten into predicated loads to allow promotion");
117 NumStoresPredicated,
118 "Number of stores rewritten into predicated loads to allow promotion");
119STATISTIC(NumDeleted, "Number of instructions deleted");
120STATISTIC(NumVectorized, "Number of vectorized aggregates");
121
122namespace llvm {
123/// Disable running mem2reg during SROA in order to test or debug SROA.
124static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
125 cl::Hidden);
127} // namespace llvm
128
129namespace {
130
131class AllocaSliceRewriter;
132class AllocaSlices;
133class Partition;
134
135class SelectHandSpeculativity {
136 unsigned char Storage = 0; // None are speculatable by default.
137 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
138 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
139public:
140 SelectHandSpeculativity() = default;
141 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
142 bool isSpeculatable(bool isTrueVal) const;
143 bool areAllSpeculatable() const;
144 bool areAnySpeculatable() const;
145 bool areNoneSpeculatable() const;
146 // For interop as int half of PointerIntPair.
147 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
148 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
149};
150static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
151
152using PossiblySpeculatableLoad =
154using UnspeculatableStore = StoreInst *;
155using RewriteableMemOp =
156 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
157using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
158
/// An optimization pass providing Scalar Replacement of Aggregates.
///
/// This pass takes allocations which can be completely analyzed (that is, they
/// don't escape) and tries to turn them into scalar SSA values. There are
/// a few steps to this process.
///
/// 1) It takes allocations of aggregates and analyzes the ways in which they
///    are used to try to split them into smaller allocations, ideally of
///    a single scalar data type. It will split up memcpy and memset accesses
///    as necessary and try to isolate individual scalar accesses.
/// 2) It will transform accesses into forms which are suitable for SSA value
///    promotion. This can be replacing a memset with a scalar store of an
///    integer value, or it can involve speculating operations on a PHI or
///    select to be a PHI or select of the results.
/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
///    onto insert and extract operations on a vector value, and convert them to
///    this form. By doing so, it will enable promotion of vector aggregates to
///    SSA vector values.
class SROA {
  LLVMContext *const C;
  DomTreeUpdater *const DTU;
  AssumptionCache *const AC;
  // True when the pass must not modify the CFG (set from SROAOptions in the
  // constructor below).
  const bool PreserveCFG;

  /// Worklist of alloca instructions to simplify.
  ///
  /// Each alloca in the function is added to this. Each new alloca formed gets
  /// added to it as well to recursively simplify unless that alloca can be
  /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if not
  /// already present to ensure it is re-visited.
  SmallSetVector<AllocaInst *, 16> Worklist;

  /// A collection of instructions to delete.
  /// We try to batch deletions to simplify code and make things a bit more
  /// efficient. We also make sure there are no dangling pointers.
  SmallVector<WeakVH, 8> DeadInsts;

  /// Post-promotion worklist.
  ///
  /// Sometimes we discover an alloca which has a high probability of becoming
  /// viable for SROA after a round of promotion takes place. In those cases,
  /// the alloca is enqueued here for re-processing.
  ///
  /// Note that we have to be very careful to clear allocas out of this list in
  /// the event they are deleted.
  SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;

  /// A collection of alloca instructions we can directly promote.
  SetVector<AllocaInst *, SmallVector<AllocaInst *>,
            SmallPtrSet<AllocaInst *, 16>, 16>
      PromotableAllocas;

  /// A worklist of PHIs to speculate prior to promoting allocas.
  ///
  /// All of these PHIs have been checked for the safety of speculation and by
  /// being speculated will allow promoting allocas currently in the promotable
  /// queue.
  SmallSetVector<PHINode *, 8> SpeculatablePHIs;

  /// A worklist of select instructions to rewrite prior to promoting
  /// allocas.
  SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;

  /// Select instructions that use an alloca and are subsequently loaded can be
  /// rewritten to load both input pointers and then select between the result,
  /// allowing the load of the alloca to be promoted.
  /// From this:
  ///   %P2 = select i1 %cond, ptr %Alloca, ptr %Other
  ///   %V = load <type>, ptr %P2
  /// to:
  ///   %V1 = load <type>, ptr %Alloca      -> will be mem2reg'd
  ///   %V2 = load <type>, ptr %Other
  ///   %V = select i1 %cond, <type> %V1, <type> %V2
  ///
  /// We can do this to a select if its only uses are loads
  /// and if either the operand to the select can be loaded unconditionally,
  /// or if we are allowed to perform CFG modifications.
  /// If found an intervening bitcast with a single use of the load,
  /// allow the promotion.
  static std::optional<RewriteableMemOps>
  isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);

public:
  SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
       SROAOptions PreserveCFG_)
      : C(C), DTU(DTU), AC(AC),
        PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}

  /// Main run method used by both the SROAPass and by the legacy pass.
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);

private:
  friend class AllocaSliceRewriter;

  // NOTE(review): the helpers below are defined later in the file (not visible
  // in this chunk); the one-line summaries are inferred from their names and
  // signatures — confirm against the definitions.
  bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
  // Rewrites the uses covered by partition \p P; returns the (possibly new)
  // alloca backing the partition and its offset within \p AI.
  std::pair<AllocaInst *, uint64_t>
  rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
  bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
  bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
  // Analyzes and rewrites a single alloca; returns {Changed, CFGChanged}.
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
  void clobberUse(Use &U);
  // Deletes everything batched in DeadInsts, recording any deleted allocas in
  // \p DeletedAllocas so the worklists can be purged.
  bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
  bool promoteAllocas();
};
264
265} // end anonymous namespace
266
267/// Calculate the fragment of a variable to use when slicing a store
268/// based on the slice dimensions, existing fragment, and base storage
269/// fragment.
270/// Results:
271/// UseFrag - Use Target as the new fragment.
272/// UseNoFrag - The new slice already covers the whole variable.
273/// Skip - The new alloca slice doesn't include this variable.
274/// FIXME: Can we use calculateFragmentIntersect instead?
275namespace {
276enum FragCalcResult { UseFrag, UseNoFrag, Skip };
277}
278static FragCalcResult
280 uint64_t NewStorageSliceOffsetInBits,
281 uint64_t NewStorageSliceSizeInBits,
282 std::optional<DIExpression::FragmentInfo> StorageFragment,
283 std::optional<DIExpression::FragmentInfo> CurrentFragment,
285 // If the base storage describes part of the variable apply the offset and
286 // the size constraint.
287 if (StorageFragment) {
288 Target.SizeInBits =
289 std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
290 Target.OffsetInBits =
291 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
292 } else {
293 Target.SizeInBits = NewStorageSliceSizeInBits;
294 Target.OffsetInBits = NewStorageSliceOffsetInBits;
295 }
296
297 // If this slice extracts the entirety of an independent variable from a
298 // larger alloca, do not produce a fragment expression, as the variable is
299 // not fragmented.
300 if (!CurrentFragment) {
301 if (auto Size = Variable->getSizeInBits()) {
302 // Treat the current fragment as covering the whole variable.
303 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
304 if (Target == CurrentFragment)
305 return UseNoFrag;
306 }
307 }
308
309 // No additional work to do if there isn't a fragment already, or there is
310 // but it already exactly describes the new assignment.
311 if (!CurrentFragment || *CurrentFragment == Target)
312 return UseFrag;
313
314 // Reject the target fragment if it doesn't fit wholly within the current
315 // fragment. TODO: We could instead chop up the target to fit in the case of
316 // a partial overlap.
317 if (Target.startInBits() < CurrentFragment->startInBits() ||
318 Target.endInBits() > CurrentFragment->endInBits())
319 return Skip;
320
321 // Target fits within the current fragment, return it.
322 return UseFrag;
323}
324
326 return DebugVariable(DVR->getVariable(), std::nullopt,
327 DVR->getDebugLoc().getInlinedAt());
328}
329
/// Find linked dbg.assign and generate a new one with the correct
/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
/// value component is copied from the old dbg.assign to the new.
/// \param OldAlloca Alloca for the variable before splitting.
/// \param IsSplit True if the store (not necessarily alloca)
///                is being split.
/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
/// \param SliceSizeInBits New number of bits being written to.
/// \param OldInst Instruction that is being split.
/// \param Inst New instruction performing this part of the
///             split store.
/// \param Dest Store destination.
/// \param Value Stored value.
/// \param DL Datalayout.
static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
                             uint64_t OldAllocaOffsetInBits,
                             uint64_t SliceSizeInBits, Instruction *OldInst,
                             Instruction *Inst, Value *Dest, Value *Value,
                             const DataLayout &DL) {
  // If we want allocas to be migrated using this helper then we need to ensure
  // that the BaseFragments map code still works. A simple solution would be
  // to choose to always clone alloca dbg_assigns (rather than sometimes
  // "stealing" them).
  assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");

  auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
  // Nothing to do if OldInst has no linked dbg.assign intrinsics.
  if (DVRAssignMarkerRange.empty())
    return;

  LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
  LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
  LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
  LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
                    << "\n");
  LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
  LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
  LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
  LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
  if (Value)
    LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");

  /// Map of aggregate variables to their fragment associated with OldAlloca.
  // NOTE(review): the declaration line of this map is missing from this copy
  // (expected a map keyed by DebugVariable, given the getAggregateVariable
  // lookups below) — confirm against upstream SROA.cpp.
      BaseFragments;
  for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
    BaseFragments[getAggregateVariable(DVR)] =
        DVR->getExpression()->getFragmentInfo();

  // The new inst needs a DIAssignID unique metadata tag (if OldInst has
  // one). It shouldn't already have one: assert this assumption.
  assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
  DIAssignID *NewID = nullptr;
  auto &Ctx = Inst->getContext();
  DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
  assert(OldAlloca->isStaticAlloca());

  auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
    LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
                      << "\n");
    auto *Expr = DbgAssign->getExpression();
    // Set when the value component can no longer describe the new fragment
    // and must be dropped (kill location).
    bool SetKillLocation = false;

    if (IsSplit) {
      std::optional<DIExpression::FragmentInfo> BaseFragment;
      {
        auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
        if (R == BaseFragments.end())
          return;
        BaseFragment = R->second;
      }
      std::optional<DIExpression::FragmentInfo> CurrentFragment =
          Expr->getFragmentInfo();
      DIExpression::FragmentInfo NewFragment;
      FragCalcResult Result = calculateFragment(
          DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
          BaseFragment, CurrentFragment, NewFragment);

      if (Result == Skip)
        return;
      if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
        if (CurrentFragment) {
          // Rewrite NewFragment to be relative to the existing one (this is
          // what createFragmentExpression wants). CalculateFragment has
          // already resolved the size for us. FIXME: Should it return the
          // relative fragment too?
          NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
        }
        // Add the new fragment info to the existing expression if possible.
        // NOTE(review): the opening line of this `if` (likely a call to
        // DIExpression::createFragmentExpression) is missing from this copy —
        // confirm against upstream SROA.cpp.
            Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
          Expr = *E;
        } else {
          // Otherwise, add the new fragment info to an empty expression and
          // discard the value component of this dbg.assign as the value cannot
          // be computed with the new fragment.
          // NOTE(review): the assignment line here is missing from this copy —
          // confirm against upstream SROA.cpp.
              DIExpression::get(Expr->getContext(), {}),
              NewFragment.OffsetInBits, NewFragment.SizeInBits);
          SetKillLocation = true;
        }
      }
    }

    // If we haven't created a DIAssignID ID do that now and attach it to Inst.
    if (!NewID) {
      NewID = DIAssignID::getDistinct(Ctx);
      Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
    }

    DbgVariableRecord *NewAssign;
    if (IsSplit) {
      ::Value *NewValue = Value ? Value : DbgAssign->getValue();
      // NOTE(review): the line assigning NewAssign (wrapping the
      // DIB.insertDbgAssign result) is missing from this copy — confirm
      // against upstream SROA.cpp.
          DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
                              Dest, DIExpression::get(Expr->getContext(), {}),
                              DbgAssign->getDebugLoc())));
    } else {
      // The store is not split, simply steal the existing dbg_assign.
      NewAssign = DbgAssign;
      NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
      NewAssign->setAddress(Dest);
      if (Value)
        NewAssign->replaceVariableLocationOp(0u, Value);
      assert(Expr == NewAssign->getExpression());
    }

    // If we've updated the value but the original dbg.assign has an arglist
    // then kill it now - we can't use the requested new value.
    // We can't replace the DIArgList with the new value as it'd leave
    // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
    // an arglist). And we can't keep the DIArgList in case the linked store
    // is being split - in which case the DIArgList + expression may no longer
    // be computing the correct value.
    // This should be a very rare situation as it requires the value being
    // stored to differ from the dbg.assign (i.e., the value has been
    // represented differently in the debug intrinsic for some reason).
    SetKillLocation |=
        Value && (DbgAssign->hasArgList() ||
                  !DbgAssign->getExpression()->isSingleLocationExpression());
    if (SetKillLocation)
      NewAssign->setKillLocation();

    // We could use more precision here at the cost of some additional (code)
    // complexity - if the original dbg.assign was adjacent to its store, we
    // could position this new dbg.assign adjacent to its store rather than the
    // old dbg.assign. That would result in interleaved dbg.assigns rather than
    // what we get now:
    //   split store !1
    //   split store !2
    //   dbg.assign !1
    //   dbg.assign !2
    // This (current behaviour) results in debug assignments being
    // noted as slightly offset (in code) from the store. In practice this
    // should have little effect on the debugging experience due to the fact
    // that all the split stores should get the same line number.
    if (NewAssign != DbgAssign) {
      NewAssign->moveBefore(DbgAssign->getIterator());
      NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
    }
    LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
  };

  for_each(DVRAssignMarkerRange, MigrateDbgAssign);
}
495
496namespace {
497
498/// A custom IRBuilder inserter which prefixes all names, but only in
499/// Assert builds.
500class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
501 std::string Prefix;
502
503 Twine getNameWithPrefix(const Twine &Name) const {
504 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
505 }
506
507public:
508 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
509
510 void InsertHelper(Instruction *I, const Twine &Name,
511 BasicBlock::iterator InsertPt) const override {
512 IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name),
513 InsertPt);
514 }
515};
516
517/// Provide a type for IRBuilder that drops names in release builds.
519
520/// A used slice of an alloca.
521///
522/// This structure represents a slice of an alloca used by some instruction. It
523/// stores both the begin and end offsets of this use, a pointer to the use
524/// itself, and a flag indicating whether we can classify the use as splittable
525/// or not when forming partitions of the alloca.
526class Slice {
527 /// The beginning offset of the range.
528 uint64_t BeginOffset = 0;
529
530 /// The ending offset, not included in the range.
531 uint64_t EndOffset = 0;
532
533 /// Storage for both the use of this slice and whether it can be
534 /// split.
535 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
536
537public:
538 Slice() = default;
539
540 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable,
541 Value *ProtectedFieldDisc)
542 : BeginOffset(BeginOffset), EndOffset(EndOffset),
543 UseAndIsSplittable(U, IsSplittable),
544 ProtectedFieldDisc(ProtectedFieldDisc) {}
545
546 uint64_t beginOffset() const { return BeginOffset; }
547 uint64_t endOffset() const { return EndOffset; }
548
549 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
550 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
551
552 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
553
554 bool isDead() const { return getUse() == nullptr; }
555 void kill() { UseAndIsSplittable.setPointer(nullptr); }
556
557 // When this access is via an llvm.protected.field.ptr intrinsic, contains
558 // the second argument to the intrinsic, the discriminator.
559 Value *ProtectedFieldDisc;
560
561 /// Support for ordering ranges.
562 ///
563 /// This provides an ordering over ranges such that start offsets are
564 /// always increasing, and within equal start offsets, the end offsets are
565 /// decreasing. Thus the spanning range comes first in a cluster with the
566 /// same start position.
567 bool operator<(const Slice &RHS) const {
568 if (beginOffset() < RHS.beginOffset())
569 return true;
570 if (beginOffset() > RHS.beginOffset())
571 return false;
572 if (isSplittable() != RHS.isSplittable())
573 return !isSplittable();
574 if (endOffset() > RHS.endOffset())
575 return true;
576 return false;
577 }
578
579 /// Support comparison with a single offset to allow binary searches.
580 [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
581 return LHS.beginOffset() < RHSOffset;
582 }
583 [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
584 return LHSOffset < RHS.beginOffset();
585 }
586
587 bool operator==(const Slice &RHS) const {
588 return isSplittable() == RHS.isSplittable() &&
589 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
590 }
591 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
592};
593
594/// Representation of the alloca slices.
595///
596/// This class represents the slices of an alloca which are formed by its
597/// various uses. If a pointer escapes, we can't fully build a representation
598/// for the slices used and we reflect that in this structure. The uses are
599/// stored, sorted by increasing beginning offset and with unsplittable slices
600/// starting at a particular offset before splittable slices.
601class AllocaSlices {
602public:
603 /// Construct the slices of a particular alloca.
604 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
605
606 /// Test whether a pointer to the allocation escapes our analysis.
607 ///
608 /// If this is true, the slices are never fully built and should be
609 /// ignored.
610 bool isEscaped() const { return PointerEscapingInstr; }
611 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
612
613 /// Support for iterating over the slices.
614 /// @{
615 using iterator = SmallVectorImpl<Slice>::iterator;
616 using range = iterator_range<iterator>;
617
618 iterator begin() { return Slices.begin(); }
619 iterator end() { return Slices.end(); }
620
621 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
622 using const_range = iterator_range<const_iterator>;
623
624 const_iterator begin() const { return Slices.begin(); }
625 const_iterator end() const { return Slices.end(); }
626 /// @}
627
628 /// Erase a range of slices.
629 void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
630
631 /// Insert new slices for this alloca.
632 ///
633 /// This moves the slices into the alloca's slices collection, and re-sorts
634 /// everything so that the usual ordering properties of the alloca's slices
635 /// hold.
636 void insert(ArrayRef<Slice> NewSlices) {
637 int OldSize = Slices.size();
638 Slices.append(NewSlices.begin(), NewSlices.end());
639 auto SliceI = Slices.begin() + OldSize;
640 std::stable_sort(SliceI, Slices.end());
641 std::inplace_merge(Slices.begin(), SliceI, Slices.end());
642 }
643
644 // Forward declare the iterator and range accessor for walking the
645 // partitions.
646 class partition_iterator;
648
649 /// Access the dead users for this alloca.
650 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
651
652 /// Access the users for this alloca that are llvm.protected.field.ptr
653 /// intrinsics.
654 ArrayRef<IntrinsicInst *> getPFPUsers() const { return PFPUsers; }
655
656 /// Access Uses that should be dropped if the alloca is promotable.
657 ArrayRef<Use *> getDeadUsesIfPromotable() const {
658 return DeadUseIfPromotable;
659 }
660
661 /// Access the dead operands referring to this alloca.
662 ///
663 /// These are operands which have cannot actually be used to refer to the
664 /// alloca as they are outside its range and the user doesn't correct for
665 /// that. These mostly consist of PHI node inputs and the like which we just
666 /// need to replace with undef.
667 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
668
669#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
670 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
671 void printSlice(raw_ostream &OS, const_iterator I,
672 StringRef Indent = " ") const;
673 void printUse(raw_ostream &OS, const_iterator I,
674 StringRef Indent = " ") const;
675 void print(raw_ostream &OS) const;
676 void dump(const_iterator I) const;
677 void dump() const;
678#endif
679
680private:
681 template <typename DerivedT, typename RetT = void> class BuilderBase;
682 class SliceBuilder;
683
684 friend class AllocaSlices::SliceBuilder;
685
686#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
687 /// Handle to alloca instruction to simplify method interfaces.
688 AllocaInst &AI;
689#endif
690
691 /// The instruction responsible for this alloca not having a known set
692 /// of slices.
693 ///
694 /// When an instruction (potentially) escapes the pointer to the alloca, we
695 /// store a pointer to that here and abort trying to form slices of the
696 /// alloca. This will be null if the alloca slices are analyzed successfully.
697 Instruction *PointerEscapingInstr;
698 Instruction *PointerEscapingInstrReadOnly;
699
700 /// The slices of the alloca.
701 ///
702 /// We store a vector of the slices formed by uses of the alloca here. This
703 /// vector is sorted by increasing begin offset, and then the unsplittable
704 /// slices before the splittable ones. See the Slice inner class for more
705 /// details.
707
708 /// Instructions which will become dead if we rewrite the alloca.
709 ///
710 /// Note that these are not separated by slice. This is because we expect an
711 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
712 /// all these instructions can simply be removed and replaced with poison as
713 /// they come from outside of the allocated space.
714 SmallVector<Instruction *, 8> DeadUsers;
715
716 /// Users that are llvm.protected.field.ptr intrinsics. These will be RAUW'd
717 /// to their first argument if we rewrite the alloca.
719
720 /// Uses which will become dead if can promote the alloca.
721 SmallVector<Use *, 8> DeadUseIfPromotable;
722
723 /// Operands which will become dead if we rewrite the alloca.
724 ///
725 /// These are operands that in their particular use can be replaced with
726 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
727 /// to PHI nodes and the like. They aren't entirely dead (there might be
728 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
729 /// want to swap this particular input for poison to simplify the use lists of
730 /// the alloca.
731 SmallVector<Use *, 8> DeadOperands;
732};
733
/// A partition of the slices.
///
/// An ephemeral representation for a range of slices which can be viewed as
/// a partition of the alloca. This range represents a span of the alloca's
/// memory which cannot be split, and provides access to all of the slices
/// overlapping some part of the partition.
///
/// Objects of this type are produced by traversing the alloca's slices, but
/// are only ephemeral and not persistent.
class Partition {
private:
  // The iterator below mutates this object's private state in place while
  // walking the slices, hence the friendship.
  friend class AllocaSlices;
  friend class AllocaSlices::partition_iterator;

  using iterator = AllocaSlices::iterator;

  /// The beginning and ending offsets of the alloca for this
  /// partition.
  uint64_t BeginOffset = 0, EndOffset = 0;

  /// The start and end iterators of this partition.
  /// [SI, SJ) is the half-open range of slices that begin within the
  /// partition.
  iterator SI, SJ;

  /// A collection of split slice tails overlapping the partition.
  SmallVector<Slice *, 4> SplitTails;

  /// Raw constructor builds an empty partition starting and ending at
  /// the given iterator.
  Partition(iterator SI) : SI(SI), SJ(SI) {}

public:
  /// The start offset of this partition.
  ///
  /// All of the contained slices start at or after this offset.
  uint64_t beginOffset() const { return BeginOffset; }

  /// The end offset of this partition.
  ///
  /// All of the contained slices end at or before this offset.
  uint64_t endOffset() const { return EndOffset; }

  /// The size of the partition.
  ///
  /// Note that this can never be zero.
  uint64_t size() const {
    assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
    return EndOffset - BeginOffset;
  }

  /// Test whether this partition contains no slices, and merely spans
  /// a region occupied by split slices.
  bool empty() const { return SI == SJ; }

  /// \name Iterate slices that start within the partition.
  /// These may be splittable or unsplittable. They have a begin offset >= the
  /// partition begin offset.
  /// @{
  // FIXME: We should probably define a "concat_iterator" helper and use that
  // to stitch together pointee_iterators over the split tails and the
  // contiguous iterators of the partition. That would give a much nicer
  // interface here. We could then additionally expose filtered iterators for
  // split, unsplit, and unsplittable splices based on the usage patterns.
  iterator begin() const { return SI; }
  iterator end() const { return SJ; }
  /// @}

  /// Get the sequence of split slice tails.
  ///
  /// These tails are of slices which start before this partition but are
  /// split and overlap into the partition. We accumulate these while forming
  /// partitions.
  ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
};
807
808} // end anonymous namespace
809
810/// An iterator over partitions of the alloca's slices.
811///
812/// This iterator implements the core algorithm for partitioning the alloca's
813/// slices. It is a forward iterator as we don't support backtracking for
814/// efficiency reasons, and re-use a single storage area to maintain the
815/// current set of split slices.
816///
817/// It is templated on the slice iterator type to use so that it can operate
818/// with either const or non-const slice iterators.
820 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
821 Partition> {
822 friend class AllocaSlices;
823
824 /// Most of the state for walking the partitions is held in a class
825 /// with a nice interface for examining them.
826 Partition P;
827
828 /// We need to keep the end of the slices to know when to stop.
829 AllocaSlices::iterator SE;
830
831 /// We also need to keep track of the maximum split end offset seen.
832 /// FIXME: Do we really?
833 uint64_t MaxSplitSliceEndOffset = 0;
834
  /// Sets the partition to be empty at the given iterator, and sets the
  /// end iterator. Eagerly forms the first partition unless the range is
  /// already empty.
  partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
      : P(SI), SE(SE) {
    // If not already at the end, advance our state to form the initial
    // partition.
    if (SI != SE)
      advance();
  }
844
  /// Advance the iterator to the next partition.
  ///
  /// Requires that the iterator not be at the end of the slices.
  void advance() {
    assert((P.SI != SE || !P.SplitTails.empty()) &&
           "Cannot advance past the end of the slices!");

    // Clear out any split uses which have ended.
    if (!P.SplitTails.empty()) {
      if (P.EndOffset >= MaxSplitSliceEndOffset) {
        // If we've finished all splits, this is easy.
        P.SplitTails.clear();
        MaxSplitSliceEndOffset = 0;
      } else {
        // Remove the uses which have ended in the prior partition. This
        // cannot change the max split slice end because we just checked that
        // the prior partition ended prior to that max.
        llvm::erase_if(P.SplitTails,
                       [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
        assert(llvm::any_of(P.SplitTails,
                            [&](Slice *S) {
                              return S->endOffset() == MaxSplitSliceEndOffset;
                            }) &&
               "Could not find the current max split slice offset!");
        assert(llvm::all_of(P.SplitTails,
                            [&](Slice *S) {
                              return S->endOffset() <= MaxSplitSliceEndOffset;
                            }) &&
               "Max split slice end offset is not actually the max!");
      }
    }

    // If P.SI is already at the end, then we've cleared the split tail and
    // now have an end iterator.
    if (P.SI == SE) {
      assert(P.SplitTails.empty() && "Failed to clear the split slices!");
      return;
    }

    // If we had a non-empty partition previously, set up the state for
    // subsequent partitions.
    if (P.SI != P.SJ) {
      // Accumulate all the splittable slices which started in the old
      // partition into the split list.
      for (Slice &S : P)
        if (S.isSplittable() && S.endOffset() > P.EndOffset) {
          P.SplitTails.push_back(&S);
          MaxSplitSliceEndOffset =
              std::max(S.endOffset(), MaxSplitSliceEndOffset);
        }

      // Start from the end of the previous partition.
      P.SI = P.SJ;

      // If P.SI is now at the end, we at most have a tail of split slices.
      if (P.SI == SE) {
        P.BeginOffset = P.EndOffset;
        P.EndOffset = MaxSplitSliceEndOffset;
        return;
      }

      // If we have split slices and the next slice is after a gap and is
      // not splittable, immediately form an empty partition for the split
      // slices up until the next slice begins.
      if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
          !P.SI->isSplittable()) {
        P.BeginOffset = P.EndOffset;
        P.EndOffset = P.SI->beginOffset();
        return;
      }
    }

    // OK, we need to consume new slices. Set the end offset based on the
    // current slice, and step SJ past it. The beginning offset of the
    // partition is the beginning offset of the next slice unless we have
    // pre-existing split slices that are continuing, in which case we begin
    // at the prior end offset.
    P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
    P.EndOffset = P.SI->endOffset();
    ++P.SJ;

    // There are two strategies to form a partition based on whether the
    // partition starts with an unsplittable slice or a splittable slice.
    if (!P.SI->isSplittable()) {
      // When we're forming an unsplittable region, it must always start at
      // the first slice and will extend through its end.
      assert(P.BeginOffset == P.SI->beginOffset());

      // Form a partition including all of the overlapping slices with this
      // unsplittable slice.
      while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
        if (!P.SJ->isSplittable())
          P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
        ++P.SJ;
      }

      // We have a partition across a set of overlapping unsplittable
      // slices.
      return;
    }

    // If we're starting with a splittable slice, then we need to form
    // a synthetic partition spanning it and any other overlapping splittable
    // slices.
    assert(P.SI->isSplittable() && "Forming a splittable partition!");

    // Collect all of the overlapping splittable slices.
    while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
           P.SJ->isSplittable()) {
      P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
      ++P.SJ;
    }

    // Back up P.EndOffset if we ended the span early when encountering an
    // unsplittable slice. This synthesizes the early end offset of
    // a partition spanning only splittable slices.
    if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
      assert(!P.SJ->isSplittable());
      P.EndOffset = P.SJ->beginOffset();
    }
  }
966
967public:
968 bool operator==(const partition_iterator &RHS) const {
969 assert(SE == RHS.SE &&
970 "End iterators don't match between compared partition iterators!");
971
972 // The observed positions of partitions is marked by the P.SI iterator and
973 // the emptiness of the split slices. The latter is only relevant when
974 // P.SI == SE, as the end iterator will additionally have an empty split
975 // slices list, but the prior may have the same P.SI and a tail of split
976 // slices.
977 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
978 assert(P.SJ == RHS.P.SJ &&
979 "Same set of slices formed two different sized partitions!");
980 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
981 "Same slice position with differently sized non-empty split "
982 "slice tails!");
983 return true;
984 }
985 return false;
986 }
987
  /// Pre-increment: advance to the next partition.
  partition_iterator &operator++() {
    advance();
    return *this;
  }
992
  /// Dereference yields the iterator-owned Partition, whose contents are
  /// updated in place by each increment.
  Partition &operator*() { return P; }
994};
995
996/// A forward range over the partitions of the alloca's slices.
997///
998/// This accesses an iterator range over the partitions of the alloca's
999/// slices. It computes these partitions on the fly based on the overlapping
1000/// offsets of the slices and the ability to split them. It will visit "empty"
1001/// partitions to cover regions of the alloca only accessed via split
1002/// slices.
1003iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
1004 return make_range(partition_iterator(begin(), end()),
1005 partition_iterator(end(), end()));
1006}
1007
1009 // If the condition being selected on is a constant or the same value is
1010 // being selected between, fold the select. Yes this does (rarely) happen
1011 // early on.
1012 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
1013 return SI.getOperand(1 + CI->isZero());
1014 if (SI.getOperand(1) == SI.getOperand(2))
1015 return SI.getOperand(1);
1016
1017 return nullptr;
1018}
1019
1020/// A helper that folds a PHI node or a select.
1022 if (PHINode *PN = dyn_cast<PHINode>(&I)) {
1023 // If PN merges together the same value, return that value.
1024 return PN->hasConstantValue();
1025 }
1027}
1028
1029/// Builder for the alloca slices.
1030///
1031/// This class builds a set of alloca slices by recursively visiting the uses
1032/// of an alloca and making a slice for each load and store at each offset.
1033class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1034 friend class PtrUseVisitor<SliceBuilder>;
1035 friend class InstVisitor<SliceBuilder>;
1036
1037 using Base = PtrUseVisitor<SliceBuilder>;
1038
1039 const uint64_t AllocSize;
1040 AllocaSlices &AS;
1041
1042 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1044
1045 /// Set to de-duplicate dead instructions found in the use walk.
1046 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1047
1048 // When this access is via an llvm.protected.field.ptr intrinsic, contains
1049 // the second argument to the intrinsic, the discriminator.
1050 Value *ProtectedFieldDisc = nullptr;
1051
1052public:
1053 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1055 AllocSize(AI.getAllocationSize(DL)->getFixedValue()), AS(AS) {}
1056
1057private:
1058 void markAsDead(Instruction &I) {
1059 if (VisitedDeadInsts.insert(&I).second)
1060 AS.DeadUsers.push_back(&I);
1061 }
1062
  /// Record a slice of the alloca covering [Offset, Offset + Size) for the
  /// currently visited use (U). Zero-sized or out-of-bounds uses are marked
  /// dead instead, and uses running off the end are clamped to the alloca
  /// size.
  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
                 bool IsSplittable = false) {
    // Completely skip uses which have a zero size or start either before or
    // past the end of the allocation.
    if (Size == 0 || Offset.uge(AllocSize)) {
      LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
                        << Offset
                        << " which has zero size or starts outside of the "
                        << AllocSize << " byte alloca:\n"
                        << " alloca: " << AS.AI << "\n"
                        << " use: " << I << "\n");
      return markAsDead(I);
    }

    uint64_t BeginOffset = Offset.getZExtValue();
    uint64_t EndOffset = BeginOffset + Size;

    // Clamp the end offset to the end of the allocation. Note that this is
    // formulated to handle even the case where "BeginOffset + Size" overflows.
    // This may appear superficially to be something we could ignore entirely,
    // but that is not so! There may be widened loads or PHI-node uses where
    // some instructions are dead but not others. We can't completely ignore
    // them, and so have to record at least the information here.
    assert(AllocSize >= BeginOffset); // Established above.
    if (Size > AllocSize - BeginOffset) {
      LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
                        << Offset << " to remain within the " << AllocSize
                        << " byte alloca:\n"
                        << " alloca: " << AS.AI << "\n"
                        << " use: " << I << "\n");
      EndOffset = AllocSize;
    }

    AS.Slices.push_back(
        Slice(BeginOffset, EndOffset, U, IsSplittable, ProtectedFieldDisc));
  }
1099
1100 void visitBitCastInst(BitCastInst &BC) {
1101 if (BC.use_empty())
1102 return markAsDead(BC);
1103
1104 return Base::visitBitCastInst(BC);
1105 }
1106
1107 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1108 if (ASC.use_empty())
1109 return markAsDead(ASC);
1110
1111 return Base::visitAddrSpaceCastInst(ASC);
1112 }
1113
1114 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1115 if (GEPI.use_empty())
1116 return markAsDead(GEPI);
1117
1118 return Base::visitGetElementPtrInst(GEPI);
1119 }
1120
1121 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1122 uint64_t Size, bool IsVolatile) {
1123 // We allow splitting of non-volatile loads and stores where the type is an
1124 // integer type. These may be used to implement 'memcpy' or other "transfer
1125 // of bits" patterns.
1126 bool IsSplittable =
1127 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1128
1129 insertUse(I, Offset, Size, IsSplittable);
1130 }
1131
  void visitLoadInst(LoadInst &LI) {
    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
           "All simple FCA loads should have been pre-split");

    // If there is a load with an unknown offset, we can still perform store
    // to load forwarding for other known-offset loads.
    if (!IsOffsetKnown)
      return PI.setEscapedReadOnly(&LI);

    TypeSize Size = DL.getTypeStoreSize(LI.getType());
    if (Size.isScalable()) {
      // A scalable load can only be given a fixed size when this function
      // has a known constant vscale; otherwise give up on the alloca.
      unsigned VScale = LI.getFunction()->getVScaleValue();
      if (!VScale)
        return PI.setAborted(&LI);

      Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
    }

    return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
                             LI.isVolatile());
  }
1153
  void visitStoreInst(StoreInst &SI) {
    Value *ValOp = SI.getValueOperand();
    // Storing the pointer we are tracking makes it escape.
    if (ValOp == *U)
      return PI.setEscapedAndAborted(&SI);
    if (!IsOffsetKnown)
      return PI.setAborted(&SI);

    TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
    if (StoreSize.isScalable()) {
      // A scalable store can only be given a fixed size when this function
      // has a known constant vscale; otherwise give up on the alloca.
      unsigned VScale = SI.getFunction()->getVScaleValue();
      if (!VScale)
        return PI.setAborted(&SI);

      StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
    }

    uint64_t Size = StoreSize.getFixedValue();

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse. We also try to handle cases which might run the
    // risk of overflow.
    // FIXME: We should instead consider the pointer to have escaped if this
    // function is being instrumented for addressing bugs or race conditions.
    if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
      LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
                        << Offset << " which extends past the end of the "
                        << AllocSize << " byte alloca:\n"
                        << " alloca: " << AS.AI << "\n"
                        << " use: " << SI << "\n");
      return markAsDead(SI);
    }

    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
           "All simple FCA stores should have been pre-split");
    handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
  }
1192
  void visitMemSetInst(MemSetInst &II) {
    assert(II.getRawDest() == *U && "Pointer use is not the destination?");
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    if ((Length && Length->getValue() == 0) ||
        (IsOffsetKnown && Offset.uge(AllocSize)))
      // Zero-length mem transfer intrinsics, and ones starting at or past
      // the end of the alloca, can be ignored entirely.
      return markAsDead(II);

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    // A variable-length memset is assumed to cover the rest of the alloca
    // and is only splittable when the length is a known constant.
    insertUse(II, Offset,
              Length ? Length->getLimitedValue()
                     : AllocSize - Offset.getLimitedValue(),
              (bool)Length);
  }
1209
  void visitMemTransferInst(MemTransferInst &II) {
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    if (Length && Length->getValue() == 0)
      // Zero-length mem transfer intrinsics can be ignored entirely.
      return markAsDead(II);

    // Because we can visit these intrinsics twice (the alloca may be both
    // source and destination), also check to see if the first visit marked
    // this instruction as dead. If so, skip it.
    if (VisitedDeadInsts.count(&II))
      return;

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    // This side of the transfer is completely out-of-bounds, and so we can
    // nuke the entire transfer. However, we also need to nuke the other side
    // if already added to our partitions.
    // FIXME: Yet another place we really should bypass this when
    // instrumenting for ASan.
    if (Offset.uge(AllocSize)) {
      SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
          MemTransferSliceMap.find(&II);
      if (MTPI != MemTransferSliceMap.end())
        AS.Slices[MTPI->second].kill();
      return markAsDead(II);
    }

    uint64_t RawOffset = Offset.getLimitedValue();
    // A variable-length transfer is assumed to cover the rest of the alloca.
    uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;

    // Check for the special case where the same exact value is used for both
    // source and dest.
    if (*U == II.getRawDest() && *U == II.getRawSource()) {
      // For non-volatile transfers this is a no-op.
      if (!II.isVolatile())
        return markAsDead(II);

      return insertUse(II, Offset, Size, /*IsSplittable=*/false);
    }

    // If we have seen both source and destination for a mem transfer, then
    // they both point to the same alloca.
    bool Inserted;
    SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
    std::tie(MTPI, Inserted) =
        MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
    unsigned PrevIdx = MTPI->second;
    if (!Inserted) {
      Slice &PrevP = AS.Slices[PrevIdx];

      // Check if the begin offsets match and this is a non-volatile transfer.
      // In that case, we can completely elide the transfer.
      if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
        PrevP.kill();
        return markAsDead(II);
      }

      // Otherwise we have an offset transfer within the same alloca. We can't
      // split those.
      PrevP.makeUnsplittable();
    }

    // Insert the use now that we've fixed up the splittable nature.
    insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);

    // Check that we ended up with a valid index in the map.
    assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
           "Map index doesn't point back to a slice with this user.");
  }
1279
  // Disable SRoA for any intrinsics except for lifetime invariants and
  // llvm.protected.field.ptr, which are handled specially below.
  // FIXME: What about debug intrinsics? This matches old behavior, but
  // doesn't make sense.
  void visitIntrinsicInst(IntrinsicInst &II) {
    // Droppable uses do not block promotion; record them so they can be
    // dropped if the alloca is promoted.
    if (II.isDroppable()) {
      AS.DeadUseIfPromotable.push_back(U);
      return;
    }

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    // Lifetime markers are recorded as splittable uses sized to the whole
    // allocation.
    if (II.isLifetimeStartOrEnd()) {
      insertUse(II, Offset, AllocSize, true);
      return;
    }

    if (II.getIntrinsicID() == Intrinsic::protected_field_ptr) {
      // We only handle loads and stores as users of llvm.protected.field.ptr.
      // Other uses may add items to the worklist, which will cause
      // ProtectedFieldDisc to be tracked incorrectly.
      AS.PFPUsers.push_back(&II);
      // Track the discriminator (second argument) while visiting the
      // intrinsic's direct load/store users so their slices record it.
      ProtectedFieldDisc = II.getArgOperand(1);
      for (Use &U : II.uses()) {
        this->U = &U;
        if (auto *LI = dyn_cast<LoadInst>(U.getUser()))
          visitLoadInst(*LI);
        else if (auto *SI = dyn_cast<StoreInst>(U.getUser()))
          visitStoreInst(*SI);
        else
          PI.setAborted(&II);
        if (PI.isAborted())
          break;
      }
      ProtectedFieldDisc = nullptr;
      return;
    }

    Base::visitIntrinsicInst(II);
  }
1320
  /// Search \p Root's transitive users for a use that makes PHI/select
  /// slicing unsafe, returning the offending instruction or null if all uses
  /// are safe. \p Size is set to the maximum loaded or stored size seen.
  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
    // We consider any PHI or select that results in a direct load or store of
    // the same offset to be a viable use for slicing purposes. These uses
    // are considered unsplittable and the size is the maximum loaded or stored
    // size.
    SmallPtrSet<Instruction *, 4> Visited;
    Visited.insert(Root);
    Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
    const DataLayout &DL = Root->getDataLayout();
    // If there are no loads or stores, the access is dead. We mark that as
    // a size zero access.
    Size = 0;
    do {
      Instruction *I, *UsedI;
      std::tie(UsedI, I) = Uses.pop_back_val();

      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
        // Scalable loads cannot be given a fixed size here; abort.
        TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
        if (LoadSize.isScalable()) {
          PI.setAborted(LI);
          return nullptr;
        }
        Size = std::max(Size, LoadSize.getFixedValue());
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
        Value *Op = SI->getOperand(0);
        // Storing the pointer itself (not through it) is unsafe.
        if (Op == UsedI)
          return SI;
        TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
        if (StoreSize.isScalable()) {
          PI.setAborted(SI);
          return nullptr;
        }
        Size = std::max(Size, StoreSize.getFixedValue());
        continue;
      }

      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
        // Only all-zero GEPs preserve the offset and are safe to walk
        // through.
        if (!GEP->hasAllZeroIndices())
          return GEP;
      } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
        return I;
      }

      for (User *U : I->users())
        if (Visited.insert(cast<Instruction>(U)).second)
          Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
    } while (!Uses.empty());

    return nullptr;
  }
1375
  /// Common handling for PHI nodes and selects that use the alloca pointer.
  void visitPHINodeOrSelectInst(Instruction &I) {
    if (I.use_empty())
      return markAsDead(I);

    // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
    // instructions in this BB, which may be required during rewriting. Bail out
    // on these cases.
    if (isa<PHINode>(I) && !I.getParent()->hasInsertionPt())
      return PI.setAborted(&I);

    // TODO: We could use simplifyInstruction here to fold PHINodes and
    // SelectInsts. However, doing so requires to change the current
    // dead-operand-tracking mechanism. For instance, suppose neither loading
    // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
    // trap either. However, if we simply replace %U with undef using the
    // current dead-operand-tracking mechanism, "load (select undef, undef,
    // %other)" may trap because the select may return the first operand
    // "undef".
    if (Value *Result = foldPHINodeOrSelectInst(I)) {
      if (Result == *U)
        // If the result of the constant fold will be the pointer, recurse
        // through the PHI/select as if we had RAUW'ed it.
        enqueueUsers(I);
      else
        // Otherwise the operand to the PHI/select is dead, and we can replace
        // it with poison.
        AS.DeadOperands.push_back(U);

      return;
    }

    if (!IsOffsetKnown)
      return PI.setAborted(&I);

    // See if we already have computed info on this node.
    uint64_t &Size = PHIOrSelectSizes[&I];
    if (!Size) {
      // This is a new PHI/Select, check for an unsafe use of it.
      if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
        return PI.setAborted(UnsafeI);
    }

    // For PHI and select operands outside the alloca, we can't nuke the entire
    // phi or select -- the other side might still be relevant, so we special
    // case them here and use a separate structure to track the operands
    // themselves which should be replaced with poison.
    // FIXME: This should instead be escaped in the event we're instrumenting
    // for address sanitization.
    if (Offset.uge(AllocSize)) {
      AS.DeadOperands.push_back(U);
      return;
    }

    insertUse(I, Offset, Size);
  }
1432
  // PHIs and selects share the common handling above.
  void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }

  void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }

  /// Disable SROA entirely if there are unhandled users of the alloca.
  void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1439
1440 void visitCallBase(CallBase &CB) {
1441 // If the call operand is read-only and only does a read-only or address
1442 // capture, then we mark it as EscapedReadOnly.
1443 if (CB.isDataOperand(U) &&
1444 !capturesFullProvenance(CB.getCaptureInfo(U->getOperandNo())) &&
1445 CB.onlyReadsMemory(U->getOperandNo())) {
1446 PI.setEscapedReadOnly(&CB);
1447 return;
1448 }
1449
1450 Base::visitCallBase(CB);
1451 }
1452};
1453
AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
    :
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      AI(AI),
#endif
      PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
  // Walk all uses of the alloca, populating the slice list.
  SliceBuilder PB(DL, AI, *this);
  SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
  if (PtrI.isEscaped() || PtrI.isAborted()) {
    // FIXME: We should sink the escape vs. abort info into the caller nicely,
    // possibly by just storing the PtrInfo in the AllocaSlices.
    PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
                                                  : PtrI.getAbortingInst();
    assert(PointerEscapingInstr && "Did not track a bad instruction");
    return;
  }
  PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();

  // Drop the slices marked dead during the use walk.
  llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });

  // Sort the uses. This arranges for the offsets to be in ascending order,
  // and the sizes to be in descending order.
  llvm::stable_sort(Slices);
}
1478
1479#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1480
/// Print a single slice followed by the instruction that uses it.
void AllocaSlices::print(raw_ostream &OS, const_iterator I,
                         StringRef Indent) const {
  printSlice(OS, I, Indent);
  OS << "\n";
  printUse(OS, I, Indent);
}
1487
1488void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1489 StringRef Indent) const {
1490 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1491 << " slice #" << (I - begin())
1492 << (I->isSplittable() ? " (splittable)" : "");
1493}
1494
/// Print the user instruction responsible for this slice.
void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
                            StringRef Indent) const {
  OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
}
1499
void AllocaSlices::print(raw_ostream &OS) const {
  // If the use walk failed, report the escaping instruction instead of
  // slices.
  if (PointerEscapingInstr) {
    OS << "Can't analyze slices for alloca: " << AI << "\n"
       << " A pointer to this alloca escaped by:\n"
       << " " << *PointerEscapingInstr << "\n";
    return;
  }

  if (PointerEscapingInstrReadOnly)
    OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";

  OS << "Slices of alloca: " << AI << "\n";
  for (const_iterator I = begin(), E = end(); I != E; ++I)
    print(OS, I);
}
1515
/// Print a single slice to the debug stream.
LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
  print(dbgs(), I);
}
/// Print all slices (or the escape diagnosis) to the debug stream.
LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1520
1521#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1522
1523/// Walk the range of a partitioning looking for a common type to cover this
1524/// sequence of slices.
1525static std::pair<Type *, IntegerType *>
1526findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1527 uint64_t EndOffset) {
1528 Type *Ty = nullptr;
1529 bool TyIsCommon = true;
1530 IntegerType *ITy = nullptr;
1531
1532 // Note that we need to look at *every* alloca slice's Use to ensure we
1533 // always get consistent results regardless of the order of slices.
1534 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1535 Use *U = I->getUse();
1536 if (isa<IntrinsicInst>(*U->getUser()))
1537 continue;
1538 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1539 continue;
1540
1541 Type *UserTy = nullptr;
1542 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1543 UserTy = LI->getType();
1544 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1545 UserTy = SI->getValueOperand()->getType();
1546 }
1547
1548 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1549 // If the type is larger than the partition, skip it. We only encounter
1550 // this for split integer operations where we want to use the type of the
1551 // entity causing the split. Also skip if the type is not a byte width
1552 // multiple.
1553 if (UserITy->getBitWidth() % 8 != 0 ||
1554 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1555 continue;
1556
1557 // Track the largest bitwidth integer type used in this way in case there
1558 // is no common type.
1559 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1560 ITy = UserITy;
1561 }
1562
1563 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1564 // depend on types skipped above.
1565 if (!UserTy || (Ty && Ty != UserTy))
1566 TyIsCommon = false; // Give up on anything but an iN type.
1567 else
1568 Ty = UserTy;
1569 }
1570
1571 return {TyIsCommon ? Ty : nullptr, ITy};
1572}
1573
1574/// PHI instructions that use an alloca and are subsequently loaded can be
1575/// rewritten to load both input pointers in the pred blocks and then PHI the
1576/// results, allowing the load of the alloca to be promoted.
1577/// From this:
1578/// %P2 = phi [i32* %Alloca, i32* %Other]
1579/// %V = load i32* %P2
1580/// to:
1581/// %V1 = load i32* %Alloca -> will be mem2reg'd
1582/// ...
1583/// %V2 = load i32* %Other
1584/// ...
1585/// %V = phi [i32 %V1, i32 %V2]
1586///
1587/// We can do this to a select if its only uses are loads and if the operands
1588/// to the select can be loaded unconditionally.
1589///
1590/// FIXME: This should be hoisted into a generic utility, likely in
1591/// Transforms/Util/Local.h
1593 const DataLayout &DL = PN.getDataLayout();
1594
1595 // For now, we can only do this promotion if the load is in the same block
1596 // as the PHI, and if there are no stores between the phi and load.
1597 // TODO: Allow recursive phi users.
1598 // TODO: Allow stores.
1599 BasicBlock *BB = PN.getParent();
1600 Align MaxAlign;
1601 uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1602 Type *LoadType = nullptr;
1603 for (User *U : PN.users()) {
1605 if (!LI || !LI->isSimple())
1606 return false;
1607
1608 // For now we only allow loads in the same block as the PHI. This is
1609 // a common case that happens when instcombine merges two loads through
1610 // a PHI.
1611 if (LI->getParent() != BB)
1612 return false;
1613
1614 if (LoadType) {
1615 if (LoadType != LI->getType())
1616 return false;
1617 } else {
1618 LoadType = LI->getType();
1619 }
1620
1621 // Ensure that there are no instructions between the PHI and the load that
1622 // could store.
1623 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1624 if (BBI->mayWriteToMemory())
1625 return false;
1626
1627 MaxAlign = std::max(MaxAlign, LI->getAlign());
1628 }
1629
1630 if (!LoadType)
1631 return false;
1632
1633 APInt LoadSize =
1634 APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());
1635
1636 // We can only transform this if it is safe to push the loads into the
1637 // predecessor blocks. The only thing to watch out for is that we can't put
1638 // a possibly trapping load in the predecessor if it is a critical edge.
1639 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1641 Value *InVal = PN.getIncomingValue(Idx);
1642
1643 // If the value is produced by the terminator of the predecessor (an
1644 // invoke) or it has side-effects, there is no valid place to put a load
1645 // in the predecessor.
1646 if (TI == InVal || TI->mayHaveSideEffects())
1647 return false;
1648
1649 // If the predecessor has a single successor, then the edge isn't
1650 // critical.
1651 if (TI->getNumSuccessors() == 1)
1652 continue;
1653
1654 // If this pointer is always safe to load, or if we can prove that there
1655 // is already a load in the block, then we can move the load to the pred
1656 // block.
1657 if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1658 continue;
1659
1660 return false;
1661 }
1662
1663 return true;
1664}
1665
/// Rewrite a load-only PHI over pointers into per-predecessor loads feeding a
/// new value PHI; assumes every user of \p PN is a load (enforced by the
/// casts below).
static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
  LLVM_DEBUG(dbgs() << " original: " << PN << "\n");

  LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
  Type *LoadTy = SomeLoad->getType();
  IRB.SetInsertPoint(&PN);
  PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
                                 PN.getName() + ".sroa.speculated");

  // Get the AA tags and alignment to use from one of the loads. It does not
  // matter which one we get and if any differ.
  AAMDNodes AATags = SomeLoad->getAAMetadata();
  Align Alignment = SomeLoad->getAlign();

  // Rewrite all loads of the PN to use the new PHI.
  while (!PN.use_empty()) {
    LoadInst *LI = cast<LoadInst>(PN.user_back());
    LI->replaceAllUsesWith(NewPN);
    LI->eraseFromParent();
  }

  // Inject loads into all of the pred blocks.
  DenseMap<BasicBlock *, Value *> InjectedLoads;
  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
    BasicBlock *Pred = PN.getIncomingBlock(Idx);
    Value *InVal = PN.getIncomingValue(Idx);

    // A PHI node is allowed to have multiple (duplicated) entries for the same
    // basic block, as long as the value is the same. So if we already injected
    // a load in the predecessor, then we should reuse the same load for all
    // duplicated entries.
    if (Value *V = InjectedLoads.lookup(Pred)) {
      NewPN->addIncoming(V, Pred);
      continue;
    }

    Instruction *TI = Pred->getTerminator();
    IRB.SetInsertPoint(TI);

    LoadInst *Load = IRB.CreateAlignedLoad(
        LoadTy, InVal, Alignment,
        (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
    ++NumLoadsSpeculated;
    if (AATags)
      Load->setAAMetadata(AATags);
    NewPN->addIncoming(Load, Pred);
    InjectedLoads[Pred] = Load;
  }

  LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
  PN.eraseFromParent();
}
1718
/// Record the chosen hand (true or false arm) of the select as speculatable.
/// Returns *this so calls can be chained.
SelectHandSpeculativity &
SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
  if (isTrueVal)
  else
  return *this;
}
1727
1728bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1729 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
1730 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
1731}
1732
1733bool SelectHandSpeculativity::areAllSpeculatable() const {
1734 return isSpeculatable(/*isTrueVal=*/true) &&
1735 isSpeculatable(/*isTrueVal=*/false);
1736}
1737
1738bool SelectHandSpeculativity::areAnySpeculatable() const {
1739 return isSpeculatable(/*isTrueVal=*/true) ||
1740 isSpeculatable(/*isTrueVal=*/false);
1741}
1742bool SelectHandSpeculativity::areNoneSpeculatable() const {
1743 return !areAnySpeculatable();
1744}
1745
// Determine which hand(s) of the select \p SI a load like \p LI may be
// speculatively executed against. Each hand judged safe is recorded in the
// returned mask. With \p PreserveCFG set, scanning stops at the first unsafe
// hand, since the CFG-splitting fallback is unavailable in that mode.
static SelectHandSpeculativity
  assert(LI.isSimple() && "Only for simple loads");
  SelectHandSpeculativity Spec;

  const DataLayout &DL = SI.getDataLayout();
  // Probe both hands of the select in turn.
  for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
                                    &LI))
      Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
    else if (PreserveCFG)
      return Spec;

  return Spec;
}
1761
/// Scan the users of select \p SI and classify each memory operation into the
/// returned work list, or return std::nullopt when some user makes the select
/// unrewriteable (volatile ops, or operations that would require CFG changes
/// while \p PreserveCFG is set).
std::optional<RewriteableMemOps>
SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
  RewriteableMemOps Ops;

  for (User *U : SI.users()) {
    // Look through a single-use bitcast of the select to its real user.
    if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
      U = *BC->user_begin();

    if (auto *Store = dyn_cast<StoreInst>(U)) {
      // Note that atomic stores can be transformed; atomic semantics do not
      // have any meaning for a local alloca. Stores are not speculatable,
      // however, so if we can't turn it into a predicated store, we are done.
      if (Store->isVolatile() || PreserveCFG)
        return {}; // Give up on this `select`.
      Ops.emplace_back(Store);
      continue;
    }

    auto *LI = dyn_cast<LoadInst>(U);

    // Note that atomic loads can be transformed;
    // atomic semantics do not have any meaning for a local alloca.
    if (!LI || LI->isVolatile())
      return {}; // Give up on this `select`.

    PossiblySpeculatableLoad Load(LI);
    if (!LI->isSimple()) {
      // If the `load` is not simple, we can't speculatively execute it,
      // but we could handle this via a CFG modification. But can we?
      if (PreserveCFG)
        return {}; // Give up on this `select`.
      Ops.emplace_back(Load);
      continue;
    }

    // Tag the load with the set of hands that may be speculated; when the
    // CFG must stay intact, every hand has to be speculatable.
    SelectHandSpeculativity Spec =
    if (PreserveCFG && !Spec.areAllSpeculatable())
      return {}; // Give up on this `select`.

    Load.setInt(Spec);
    Ops.emplace_back(Load);
  }

  return Ops;
}
1808
                                     IRBuilderTy &IRB) {
  LLVM_DEBUG(dbgs() << "    original load: " << SI << "\n");

  // The two pointers the select chooses between; each gets its own load.
  Value *TV = SI.getTrueValue();
  Value *FV = SI.getFalseValue();
  // Replace the given load of the select with a select of two loads.

  assert(LI.isSimple() && "We only speculate simple loads");

  IRB.SetInsertPoint(&LI);

  // NOTE(review): both pointers are loaded unconditionally here; callers are
  // expected to have already established that both hands are safe to load.
  LoadInst *TL =
      IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
                            LI.getName() + ".sroa.speculate.load.true");
  LoadInst *FL =
      IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
                            LI.getName() + ".sroa.speculate.load.false");
  NumLoadsSpeculated += 2;

  // Transfer alignment and AA info if present.
  // (CreateAlignedLoad already applied the alignment; this re-set is
  // harmless.)
  TL->setAlignment(LI.getAlign());
  FL->setAlignment(LI.getAlign());

  AAMDNodes Tags = LI.getAAMetadata();
  if (Tags) {
    TL->setAAMetadata(Tags);
    FL->setAAMetadata(Tags);
  }

  // Pick between the two loads with the original select's condition.
  Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
                              LI.getName() + ".sroa.speculated",
                              ProfcheckDisableMetadataFixes ? nullptr : &SI);

  LLVM_DEBUG(dbgs() << "          speculated to: " << *V << "\n");
  LI.replaceAllUsesWith(V);
}
1846
// Rewrite a load or store of a select into conditional memory operations.
//
// The block containing \p I is split on the select's condition; clones of the
// memory op placed in the new block(s) address the select's individual hands,
// and for loads a PHI in the continuation block merges the results. A hand
// recorded as speculatable in \p Spec keeps its memory op on the
// unconditional path instead of a guarded block.
template <typename T>
                                 SelectHandSpeculativity Spec,
                                 DomTreeUpdater &DTU) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
  LLVM_DEBUG(dbgs() << "    original mem op: " << I << "\n");
  BasicBlock *Head = I.getParent();
  Instruction *ThenTerm = nullptr;
  Instruction *ElseTerm = nullptr;
  // With no speculatable hand we need both a "then" and an "else" block;
  // otherwise a single conditional block suffices.
  if (Spec.areNoneSpeculatable())
    SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
                                  SI.getMetadata(LLVMContext::MD_prof), &DTU);
  else {
    SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
                              SI.getMetadata(LLVMContext::MD_prof), &DTU,
                              /*LI=*/nullptr, /*ThenBlock=*/nullptr);
    // If the true hand is speculatable, the guarded block must serve the
    // false hand, so flip the branch successors.
    if (Spec.isSpeculatable(/*isTrueVal=*/true))
      cast<CondBrInst>(Head->getTerminator())->swapSuccessors();
  }
  auto *HeadBI = cast<CondBrInst>(Head->getTerminator());
  Spec = {}; // Do not use `Spec` beyond this point.
  BasicBlock *Tail = I.getParent();
  Tail->setName(Head->getName() + ".cont");
  // Only initialized and used when \p I is a load (see the guards below).
  PHINode *PN;
  if (isa<LoadInst>(I))
    PN = PHINode::Create(I.getType(), 2, "", I.getIterator());
  for (BasicBlock *SuccBB : successors(Head)) {
    bool IsThen = SuccBB == HeadBI->getSuccessor(0);
    int SuccIdx = IsThen ? 0 : 1;
    // A successor equal to Tail means that hand was speculatable: its clone
    // stays in Head, on the unconditional path.
    auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
    auto &CondMemOp = cast<T>(*I.clone());
    if (NewMemOpBB != Head) {
      NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
      if (isa<LoadInst>(I))
        ++NumLoadsPredicated;
      else
        ++NumStoresPredicated;
    } else {
      // The speculated clone loses metadata/attributes that only held under
      // the original control dependence.
      CondMemOp.dropUBImplyingAttrsAndMetadata();
      ++NumLoadsSpeculated;
    }
    CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
    // Select operand 1 is the true value, operand 2 the false value.
    Value *Ptr = SI.getOperand(1 + SuccIdx);
    CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
    if (isa<LoadInst>(I)) {
      CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
      PN->addIncoming(&CondMemOp, NewMemOpBB);
    } else
      LLVM_DEBUG(dbgs() << "          to: " << CondMemOp << "\n");
  }
  if (isa<LoadInst>(I)) {
    PN->takeName(&I);
    LLVM_DEBUG(dbgs() << "          to: " << *PN << "\n");
    I.replaceAllUsesWith(PN);
  }
}
1903
                                 SelectHandSpeculativity Spec,
                                 DomTreeUpdater &DTU) {
  // Dispatch to the typed rewrite helper; only loads and stores reach here.
  if (auto *LI = dyn_cast<LoadInst>(&I))
    rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
  else if (auto *SI = dyn_cast<StoreInst>(&I))
    rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
  else
    llvm_unreachable_internal("Only for load and store.");
}
1914
                                    const RewriteableMemOps &Ops,
                                    IRBuilderTy &IRB, DomTreeUpdater *DTU) {
  bool CFGChanged = false;
  LLVM_DEBUG(dbgs() << "    original select: " << SI << "\n");

  for (const RewriteableMemOp &Op : Ops) {
    SelectHandSpeculativity Spec;
    Instruction *I;
    // Unspeculatable stores carry no speculativity bits; loads do.
    if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
      I = *US;
    } else {
      auto PSL = std::get<PossiblySpeculatableLoad>(Op);
      I = PSL.getPointer();
      Spec = PSL.getInt();
    }
    if (Spec.areAllSpeculatable()) {
    } else {
      // Predicated rewriting splits blocks, so the CFG is modified.
      assert(DTU && "Should not get here when not allowed to modify the CFG!");
      rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
      CFGChanged = true;
    }
    I->eraseFromParent();
  }

  // Any remaining users must be bitcasts (the cast<> asserts this); remove
  // them before deleting the select itself.
  for (User *U : make_early_inc_range(SI.users()))
    cast<BitCastInst>(U)->eraseFromParent();
  SI.eraseFromParent();
  return CFGChanged;
}
1946
/// Compute an adjusted pointer from Ptr by Offset bytes where the
/// resulting pointer has PointerTy.
static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
                             const Twine &NamePrefix) {
  // Only materialize a GEP when there is a real byte offset to apply.
  if (Offset != 0)
    Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
                                   NamePrefix + "sroa_idx");
  // Reconcile the pointer type (and address space) with the requested one.
  return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
                                                 NamePrefix + "sroa_cast");
}
1958
1959/// Compute the adjusted alignment for a load or store from an offset.
1963
/// Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
/// ensure that we only try to convert viable values. The strategy is that we
/// will peel off single element struct and array wrappings to get to an
/// underlying value, and convert that value.
///
/// \p VScale, when non-zero, is the known vscale; it is required for (and
/// enables) conversions between fixed and scalable vector types, whose sizes
/// can then be compared concretely.
static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
                            unsigned VScale = 0) {
  // Identical types trivially convert.
  if (OldTy == NewTy)
    return true;

  // For integer types, we can't handle any bit-width differences. This would
  // break both vector conversions with extension and introduce endianness
  // issues when in conjunction with loads and stores.
  if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
           cast<IntegerType>(NewTy)->getBitWidth() &&
           "We can't have the same bitwidth for different int types");
    return false;
  }

  TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
  TypeSize OldSize = DL.getTypeSizeInBits(OldTy);

  if ((isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) ||
      (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy))) {
    // Conversion is only possible when the size of scalable vectors is known.
    if (!VScale)
      return false;

    // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
    // a single domain (either fixed or scalable). Any additional conversion
    // between fixed and scalable types is handled through integer types.
    auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
    auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;

    if (isa<ScalableVectorType>(NewTy)) {
        return false;

      NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
    } else {
        return false;

      OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
    }
  }

  if (NewSize != OldSize)
    return false;
  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
    return false;

  // We can convert pointers to integers and vice-versa. Same for vectors
  // of pointers and integers.
  OldTy = OldTy->getScalarType();
  NewTy = NewTy->getScalarType();
  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
    if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
      unsigned OldAS = OldTy->getPointerAddressSpace();
      unsigned NewAS = NewTy->getPointerAddressSpace();
      // Convert pointers if they are pointers from the same address space or
      // different integral (not non-integral) address spaces with the same
      // pointer size.
      return OldAS == NewAS ||
             (!DL.isNonIntegralAddressSpace(OldAS) &&
              !DL.isNonIntegralAddressSpace(NewAS) &&
              DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
    }

    // We can convert integers to integral pointers, but not to non-integral
    // pointers.
    if (OldTy->isIntegerTy())
      return !DL.isNonIntegralPointerType(NewTy);

    // We can convert integral pointers to integers, but non-integral pointers
    // need to remain pointers.
    if (!DL.isNonIntegralPointerType(OldTy))
      return NewTy->isIntegerTy();

    return false;
  }

  // No bitcasting to or from target extension types.
  if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
    return false;

  return true;
}
2053
2054/// Test whether the given slice use can be promoted to a vector.
2055///
2056/// This function is called to test each entry in a partition which is slated
2057/// for a single slice.
2058static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2059 VectorType *Ty,
2060 uint64_t ElementSize,
2061 const DataLayout &DL,
2062 unsigned VScale) {
2063 // First validate the slice offsets.
2064 uint64_t BeginOffset =
2065 std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
2066 uint64_t BeginIndex = BeginOffset / ElementSize;
2067 if (BeginIndex * ElementSize != BeginOffset ||
2068 BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
2069 return false;
2070 uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
2071 uint64_t EndIndex = EndOffset / ElementSize;
2072 if (EndIndex * ElementSize != EndOffset ||
2073 EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
2074 return false;
2075
2076 assert(EndIndex > BeginIndex && "Empty vector!");
2077 uint64_t NumElements = EndIndex - BeginIndex;
2078 Type *SliceTy = (NumElements == 1)
2079 ? Ty->getElementType()
2080 : FixedVectorType::get(Ty->getElementType(), NumElements);
2081
2082 Type *SplitIntTy =
2083 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
2084
2085 Use *U = S.getUse();
2086
2087 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2088 if (MI->isVolatile())
2089 return false;
2090 if (!S.isSplittable())
2091 return false; // Skip any unsplittable intrinsics.
2092 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2093 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2094 return false;
2095 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2096 if (LI->isVolatile())
2097 return false;
2098 Type *LTy = LI->getType();
2099 // Disable vector promotion when there are loads or stores of an FCA.
2100 if (LTy->isStructTy())
2101 return false;
2102 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2103 assert(LTy->isIntegerTy());
2104 LTy = SplitIntTy;
2105 }
2106 if (!canConvertValue(DL, SliceTy, LTy, VScale))
2107 return false;
2108 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2109 if (SI->isVolatile())
2110 return false;
2111 Type *STy = SI->getValueOperand()->getType();
2112 // Disable vector promotion when there are loads or stores of an FCA.
2113 if (STy->isStructTy())
2114 return false;
2115 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2116 assert(STy->isIntegerTy());
2117 STy = SplitIntTy;
2118 }
2119 if (!canConvertValue(DL, STy, SliceTy, VScale))
2120 return false;
2121 } else {
2122 return false;
2123 }
2124
2125 return true;
2126}
2127
/// Test whether any vector type in \p CandidateTys is viable for promotion.
///
/// This implements the necessary checking for \c isVectorPromotionViable over
/// all slices of the alloca for the given VectorType.
static VectorType *
                             SmallVectorImpl<VectorType *> &CandidateTys,
                             bool HaveCommonEltTy, Type *CommonEltTy,
                             bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
                             VectorType *CommonVecPtrTy, unsigned VScale) {
  // If we didn't find a vector type, nothing to do here.
  if (CandidateTys.empty())
    return nullptr;

  // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
  // then we should choose it, not some other alternative.
  // But, we can't perform a no-op pointer address space change via bitcast,
  // so if we didn't have a common pointer element type, bail.
  if (HaveVecPtrTy && !HaveCommonVecPtrTy)
    return nullptr;

  // Try to pick the "best" element type out of the choices.
  if (!HaveCommonEltTy && HaveVecPtrTy) {
    // If there was a pointer element type, there's really only one choice.
    CandidateTys.clear();
    CandidateTys.push_back(CommonVecPtrTy);
  } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
    // Integer-ify vector types.
    for (VectorType *&VTy : CandidateTys) {
      if (!VTy->getElementType()->isIntegerTy())
        VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
            VTy->getContext(), VTy->getScalarSizeInBits())));
    }

    // Rank the remaining candidate vector types. This is easy because we know
    // they're all integer vectors. We sort by ascending number of elements.
    auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
      (void)DL;
      assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
                 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
             "Cannot have vector types of different sizes!");
      assert(RHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      assert(LHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      return cast<FixedVectorType>(RHSTy)->getNumElements() <
             cast<FixedVectorType>(LHSTy)->getNumElements();
    };
    auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
      (void)DL;
      assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
                 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
             "Cannot have vector types of different sizes!");
      assert(RHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      assert(LHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      return cast<FixedVectorType>(RHSTy)->getNumElements() ==
             cast<FixedVectorType>(LHSTy)->getNumElements();
    };
    // Sort then deduplicate equal-element-count types.
    llvm::sort(CandidateTys, RankVectorTypesComp);
    CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq),
                       CandidateTys.end());
  } else {
// The only way to have the same element type in every vector type is to
// have the same vector type. Check that and remove all but one.
#ifndef NDEBUG
    for (VectorType *VTy : CandidateTys) {
      assert(VTy->getElementType() == CommonEltTy &&
             "Unaccounted for element type!");
      assert(VTy == CandidateTys[0] &&
             "Different vector types with the same element type!");
    }
#endif
    CandidateTys.resize(1);
  }

  // FIXME: hack. Do we have a named constant for this?
  // SDAG SDNode can't have more than 65535 operands.
  llvm::erase_if(CandidateTys, [](VectorType *VTy) {
    return cast<FixedVectorType>(VTy)->getNumElements() >
           std::numeric_limits<unsigned short>::max();
  });

  // Find a vector type viable for promotion by iterating over all slices.
  auto *VTy = llvm::find_if(CandidateTys, [&](VectorType *VTy) -> bool {
    uint64_t ElementSize =
        DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();

    // While the definition of LLVM vectors is bitpacked, we don't support sizes
    // that aren't byte sized.
    if (ElementSize % 8)
      return false;
    assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
           "vector size not a multiple of element size?");
    ElementSize /= 8;

    // Every slice and every split-slice tail must be compatible.
    for (const Slice &S : P)
      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
        return false;

    for (const Slice *S : P.splitSliceTails())
      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
        return false;

    return true;
  });
  return VTy != CandidateTys.end() ? *VTy : nullptr;
}
2237
    SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
    function_ref<void(Type *)> CheckCandidateType, Partition &P,
    const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
    bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
    bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
  [[maybe_unused]] VectorType *OriginalElt =
      CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
  // Consider additional vector types where the element type size is a
  // multiple of load/store element size.
  for (Type *Ty : OtherTys) {
      continue;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
    // Make a copy of CandidateTys and iterate through it, because we
    // might append to CandidateTys in the loop.
    for (VectorType *const VTy : CandidateTysCopy) {
      // The elements in the copy should remain invariant throughout the loop
      assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
      unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
      unsigned ElementSize =
          DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
      // Derive a same-total-size vector of Ty elements when it tiles evenly
      // and is not just the candidate (or its element) again.
      if (TypeSize != VectorSize && TypeSize != ElementSize &&
          VectorSize % TypeSize == 0) {
        VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
        CheckCandidateType(NewVTy);
      }
    }
  }

      P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
      HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
}
2272
/// Test whether the given alloca partitioning and range of slices can be
/// promoted to a vector.
///
/// This is a quick test to check whether we can rewrite a particular alloca
/// partition (and its newly formed alloca) into a vector alloca with only
/// whole-vector loads and stores such that it could be promoted to a vector
/// SSA value. We only can ensure this for a limited set of operations, and we
/// don't want to do the rewrites unless we are confident that the result will
/// be promotable, so we have an early test here.
                                           unsigned VScale) {
  // Collect the candidate types for vector-based promotion. Also track whether
  // we have different element types.
  SmallVector<VectorType *, 4> CandidateTys;
  SetVector<Type *> LoadStoreTys;
  SetVector<Type *> DeferredTys;
  Type *CommonEltTy = nullptr;
  VectorType *CommonVecPtrTy = nullptr;
  bool HaveVecPtrTy = false;
  bool HaveCommonEltTy = true;
  bool HaveCommonVecPtrTy = true;
  // Record Ty as a candidate when it is a fixed vector whose total size
  // matches candidates seen so far; track element/pointer commonality.
  auto CheckCandidateType = [&](Type *Ty) {
    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
      // Return if bitcast to vectors is different for total size in bits.
      if (!CandidateTys.empty()) {
        VectorType *V = CandidateTys[0];
        if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
            DL.getTypeSizeInBits(V).getFixedValue()) {
          CandidateTys.clear();
          return;
        }
      }
      CandidateTys.push_back(VTy);
      Type *EltTy = VTy->getElementType();

      if (!CommonEltTy)
        CommonEltTy = EltTy;
      else if (CommonEltTy != EltTy)
        HaveCommonEltTy = false;

      if (EltTy->isPointerTy()) {
        HaveVecPtrTy = true;
        if (!CommonVecPtrTy)
          CommonVecPtrTy = VTy;
        else if (CommonVecPtrTy != VTy)
          HaveCommonVecPtrTy = false;
      }
    }
  };

  // Put load and store types into a set for de-duplication.
  for (const Slice &S : P) {
    Type *Ty;
    if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
      Ty = LI->getType();
    else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
      Ty = SI->getValueOperand()->getType();
    else
      continue;

    auto CandTy = Ty->getScalarType();
    // Pointer-typed accesses that don't span the whole partition are
    // deferred and retried last (see below).
    if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
                                  S.endOffset() != P.endOffset())) {
      DeferredTys.insert(Ty);
      continue;
    }

    LoadStoreTys.insert(Ty);
    // Consider any loads or stores that are the exact size of the slice.
    if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
      CheckCandidateType(Ty);
  }

  // First try the types seen by whole-partition loads/stores; only fall back
  // to the deferred pointer types if nothing viable was found.
  SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
      LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
      CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
      HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
    return VTy;

  CandidateTys.clear();
      DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
      HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
      CommonVecPtrTy, VScale);
}
2359
/// Test whether a slice of an alloca is valid for integer widening.
///
/// This implements the necessary checking for the \c isIntegerWideningViable
/// test below on a single slice of the alloca.
///
/// \p WholeAllocaOp is set to true when this slice is a non-vector load or
/// store covering the entire alloca; the caller requires at least one such
/// covering operation for widening to pay off.
static bool isIntegerWideningViableForSlice(const Slice &S,
                                            uint64_t AllocBeginOffset,
                                            Type *AllocaTy,
                                            const DataLayout &DL,
                                            bool &WholeAllocaOp) {
  uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();

  // Slice bounds relative to the start of the alloca.
  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;

  Use *U = S.getUse();

  // Lifetime intrinsics operate over the whole alloca whose sizes are usually
  // larger than other load/store slices (RelEnd > Size). But lifetime are
  // always promotable and should not impact other slices' promotability of the
  // partition.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
    if (II->isLifetimeStartOrEnd() || II->isDroppable())
      return true;
  }

  // We can't reasonably handle cases where the load or store extends past
  // the end of the alloca's type and into its padding.
  if (RelEnd > Size)
    return false;

  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
    if (LI->isVolatile())
      return false;
    // We can't handle loads that extend past the allocated memory.
    TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
    if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
      return false;
    // So far, AllocaSliceRewriter does not support widening split slice tails
    // in rewriteIntegerLoad.
    if (S.beginOffset() < AllocBeginOffset)
      return false;
    // Note that we don't count vector loads or stores as whole-alloca
    // operations which enable integer widening because we would prefer to use
    // vector widening instead.
    if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
      WholeAllocaOp = true;
    if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
        return false;
    } else if (RelBegin != 0 || RelEnd != Size ||
               !canConvertValue(DL, AllocaTy, LI->getType())) {
      // Non-integer loads need to be convertible from the alloca type so that
      // they are promotable.
      return false;
    }
  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
    Type *ValueTy = SI->getValueOperand()->getType();
    if (SI->isVolatile())
      return false;
    // We can't handle stores that extend past the allocated memory.
    TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
    if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
      return false;
    // So far, AllocaSliceRewriter does not support widening split slice tails
    // in rewriteIntegerStore.
    if (S.beginOffset() < AllocBeginOffset)
      return false;
    // Note that we don't count vector loads or stores as whole-alloca
    // operations which enable integer widening because we would prefer to use
    // vector widening instead.
    if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
      WholeAllocaOp = true;
    if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
        return false;
    } else if (RelBegin != 0 || RelEnd != Size ||
               !canConvertValue(DL, ValueTy, AllocaTy)) {
      // Non-integer stores need to be convertible to the alloca type so that
      // they are promotable.
      return false;
    }
  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
    if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
      return false;
    if (!S.isSplittable())
      return false; // Skip any unsplittable intrinsics.
  } else {
    return false;
  }

  return true;
}
2452
2453/// Test whether the given alloca partition's integer operations can be
2454/// widened to promotable ones.
2455///
2456/// This is a quick test to check whether we can rewrite the integer loads and
2457/// stores to a particular alloca into wider loads and stores and be able to
2458/// promote the resulting alloca.
2459static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2460 const DataLayout &DL) {
2461 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
2462 // Don't create integer types larger than the maximum bitwidth.
2463 if (SizeInBits > IntegerType::MAX_INT_BITS)
2464 return false;
2465
2466 // Don't try to handle allocas with bit-padding.
2467 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
2468 return false;
2469
2470 // We need to ensure that an integer type with the appropriate bitwidth can
2471 // be converted to the alloca type, whatever that is. We don't want to force
2472 // the alloca itself to have an integer type if there is a more suitable one.
2473 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2474 if (!canConvertValue(DL, AllocaTy, IntTy) ||
2475 !canConvertValue(DL, IntTy, AllocaTy))
2476 return false;
2477
2478 // While examining uses, we ensure that the alloca has a covering load or
2479 // store. We don't want to widen the integer operations only to fail to
2480 // promote due to some other unsplittable entry (which we may make splittable
2481 // later). However, if there are only splittable uses, go ahead and assume
2482 // that we cover the alloca.
2483 // FIXME: We shouldn't consider split slices that happen to start in the
2484 // partition here...
2485 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2486
2487 for (const Slice &S : P)
2488 if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2489 WholeAllocaOp))
2490 return false;
2491
2492 for (const Slice *S : P.splitSliceTails())
2493 if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2494 WholeAllocaOp))
2495 return false;
2496
2497 return WholeAllocaOp;
2498}
2499
/// Extract an integer of type \p Ty from \p V at byte \p Offset, accounting
/// for the target's endianness, shifting and truncating as needed.
static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
                             const Twine &Name) {
  LLVM_DEBUG(dbgs() << "       start: " << *V << "\n");
  IntegerType *IntTy = cast<IntegerType>(V->getType());
  assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
             DL.getTypeStoreSize(IntTy).getFixedValue() &&
         "Element extends past full value");
  // The byte offset becomes a shift from the opposite end on big-endian
  // targets.
  uint64_t ShAmt = 8 * Offset;
  if (DL.isBigEndian())
    ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
                 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
  if (ShAmt) {
    V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
    LLVM_DEBUG(dbgs() << "     shifted: " << *V << "\n");
  }
  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
         "Cannot extract to a larger integer!");
  // Narrow to the requested width when it differs from the source.
  if (Ty != IntTy) {
    V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
    LLVM_DEBUG(dbgs() << "     trunced: " << *V << "\n");
  }
  return V;
}
2524
2525static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2526 Value *V, uint64_t Offset, const Twine &Name) {
2527 IntegerType *IntTy = cast<IntegerType>(Old->getType());
2528 IntegerType *Ty = cast<IntegerType>(V->getType());
2529 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2530 "Cannot insert a larger integer!");
2531 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2532 if (Ty != IntTy) {
2533 V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2534 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2535 }
2536 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2537 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2538 "Element store outside of alloca store");
2539 uint64_t ShAmt = 8 * Offset;
2540 if (DL.isBigEndian())
2541 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2542 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2543 if (ShAmt) {
2544 V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2545 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2546 }
2547
2548 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2549 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2550 Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2551 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2552 V = IRB.CreateOr(Old, V, Name + ".insert");
2553 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2554 }
2555 return V;
2556}
2557
2558static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2559 unsigned EndIndex, const Twine &Name) {
2560 auto *VecTy = cast<FixedVectorType>(V->getType());
2561 unsigned NumElements = EndIndex - BeginIndex;
2562 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2563
2564 if (NumElements == VecTy->getNumElements())
2565 return V;
2566
2567 if (NumElements == 1) {
2568 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2569 Name + ".extract");
2570 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2571 return V;
2572 }
2573
2574 auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2575 V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2576 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2577 return V;
2578}
2579
2580static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2581 unsigned BeginIndex, const Twine &Name) {
2582 VectorType *VecTy = cast<VectorType>(Old->getType());
2583 assert(VecTy && "Can only insert a vector into a vector");
2584
2585 VectorType *Ty = dyn_cast<VectorType>(V->getType());
2586 if (!Ty) {
2587 // Single element to insert.
2588 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2589 Name + ".insert");
2590 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2591 return V;
2592 }
2593
2594 unsigned NumSubElements = cast<FixedVectorType>(Ty)->getNumElements();
2595 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
2596
2597 assert(NumSubElements <= NumElements && "Too many elements!");
2598 if (NumSubElements == NumElements) {
2599 assert(V->getType() == VecTy && "Vector type mismatch");
2600 return V;
2601 }
2602 unsigned EndIndex = BeginIndex + NumSubElements;
2603
2604 // When inserting a smaller vector into the larger to store, we first
2605 // use a shuffle vector to widen it with undef elements, and then
2606 // a second shuffle vector to select between the loaded vector and the
2607 // incoming vector.
2609 Mask.reserve(NumElements);
2610 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2611 if (Idx >= BeginIndex && Idx < EndIndex)
2612 Mask.push_back(Idx - BeginIndex);
2613 else
2614 Mask.push_back(-1);
2615 V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2616 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2617
2618 Mask.clear();
2619 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2620 if (Idx >= BeginIndex && Idx < EndIndex)
2621 Mask.push_back(Idx);
2622 else
2623 Mask.push_back(Idx + NumElements);
2624 V = IRB.CreateShuffleVector(V, Old, Mask, Name + "blend");
2625 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2626 return V;
2627}
2628
2629/// This function takes two vector values and combines them into a single vector
2630/// by concatenating their elements. The function handles:
2631///
2632/// 1. Element type mismatch: If either vector's element type differs from
2633/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while
2634/// preserving the total bit width (adjusting the number of elements
2635/// accordingly).
2636///
2637/// 2. Size mismatch: After transforming the vectors to have the desired element
2638/// type, if the two vectors have different numbers of elements, the smaller
2639/// vector is extended with poison values to match the size of the larger
2640/// vector before concatenation.
2641///
2642/// 3. Concatenation: The vectors are merged using a shuffle operation that
2643/// places all elements of V0 first, followed by all elements of V1.
2644///
2645/// \param V0 The first vector to merge (must be a vector type)
2646/// \param V1 The second vector to merge (must be a vector type)
2647/// \param DL The data layout for size calculations
2648/// \param NewAIEltTy The desired element type for the result vector
2649/// \param Builder IRBuilder for creating new instructions
2650/// \return A new vector containing all elements from V0 followed by all
2651/// elements from V1
2653 Type *NewAIEltTy, IRBuilder<> &Builder) {
2654 // V0 and V1 are vectors
2655 // Create a new vector type with combined elements
2656 // Use ShuffleVector to concatenate the vectors
2657 auto *VecType0 = cast<FixedVectorType>(V0->getType());
2658 auto *VecType1 = cast<FixedVectorType>(V1->getType());
2659
2660 // If V0/V1 element types are different from NewAllocaElementType,
2661 // we need to introduce bitcasts before merging them
2662 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2663 const char *DebugName) {
2664 Type *EltType = VecType->getElementType();
2665 if (EltType != NewAIEltTy) {
2666 // Calculate new number of elements to maintain same bit width
2667 unsigned TotalBits =
2668 VecType->getNumElements() * DL.getTypeSizeInBits(EltType);
2669 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
2670
2671 auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts);
2672 V = Builder.CreateBitCast(V, NewVecType);
2673 VecType = NewVecType;
2674 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2675 }
2676 };
2677
2678 BitcastIfNeeded(V0, VecType0, "V0");
2679 BitcastIfNeeded(V1, VecType1, "V1");
2680
2681 unsigned NumElts0 = VecType0->getNumElements();
2682 unsigned NumElts1 = VecType1->getNumElements();
2683
2684 SmallVector<int, 16> ShuffleMask;
2685
2686 if (NumElts0 == NumElts1) {
2687 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2688 ShuffleMask.push_back(i);
2689 } else {
2690 // If two vectors have different sizes, we need to extend
2691 // the smaller vector to the size of the larger vector.
2692 unsigned SmallSize = std::min(NumElts0, NumElts1);
2693 unsigned LargeSize = std::max(NumElts0, NumElts1);
2694 bool IsV0Smaller = NumElts0 < NumElts1;
2695 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2696 SmallVector<int, 16> ExtendMask;
2697 for (unsigned i = 0; i < SmallSize; ++i)
2698 ExtendMask.push_back(i);
2699 for (unsigned i = SmallSize; i < LargeSize; ++i)
2700 ExtendMask.push_back(PoisonMaskElem);
2701 ExtendedVec = Builder.CreateShuffleVector(
2702 ExtendedVec, PoisonValue::get(ExtendedVec->getType()), ExtendMask);
2703 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2704 for (unsigned i = 0; i < NumElts0; ++i)
2705 ShuffleMask.push_back(i);
2706 for (unsigned i = 0; i < NumElts1; ++i)
2707 ShuffleMask.push_back(LargeSize + i);
2708 }
2709
2710 return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2711}
2712
2713namespace {
2714
2715/// Visitor to rewrite instructions using p particular slice of an alloca
2716/// to use a new alloca.
2717///
2718/// Also implements the rewriting to vector-based accesses when the partition
2719/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2720/// lives here.
2721class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2722 // Befriend the base class so it can delegate to private visit methods.
2723 friend class InstVisitor<AllocaSliceRewriter, bool>;
2724
2725 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2726
2727 const DataLayout &DL;
2728 AllocaSlices &AS;
2729 SROA &Pass;
2730 AllocaInst &OldAI, &NewAI;
2731 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2732 Type *NewAllocaTy;
2733
2734 // This is a convenience and flag variable that will be null unless the new
2735 // alloca's integer operations should be widened to this integer type due to
2736 // passing isIntegerWideningViable above. If it is non-null, the desired
2737 // integer type will be stored here for easy access during rewriting.
2738 IntegerType *IntTy;
2739
2740 // If we are rewriting an alloca partition which can be written as pure
2741 // vector operations, we stash extra information here. When VecTy is
2742 // non-null, we have some strict guarantees about the rewritten alloca:
2743 // - The new alloca is exactly the size of the vector type here.
2744 // - The accesses all either map to the entire vector or to a single
2745 // element.
2746 // - The set of accessing instructions is only one of those handled above
2747 // in isVectorPromotionViable. Generally these are the same access kinds
2748 // which are promotable via mem2reg.
2749 VectorType *VecTy;
2750 Type *ElementTy;
2751 uint64_t ElementSize;
2752
2753 // The original offset of the slice currently being rewritten relative to
2754 // the original alloca.
2755 uint64_t BeginOffset = 0;
2756 uint64_t EndOffset = 0;
2757
2758 // The new offsets of the slice currently being rewritten relative to the
2759 // original alloca.
2760 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2761
2762 uint64_t SliceSize = 0;
2763 bool IsSplittable = false;
2764 bool IsSplit = false;
2765 Use *OldUse = nullptr;
2766 Instruction *OldPtr = nullptr;
2767
2768 // Track post-rewrite users which are PHI nodes and Selects.
2769 SmallSetVector<PHINode *, 8> &PHIUsers;
2770 SmallSetVector<SelectInst *, 8> &SelectUsers;
2771
2772 // Utility IR builder, whose name prefix is setup for each visited use, and
2773 // the insertion point is set to point to the user.
2774 IRBuilderTy IRB;
2775
2776 // Return the new alloca, addrspacecasted if required to avoid changing the
2777 // addrspace of a volatile access.
2778 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2779 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2780 return &NewAI;
2781
2782 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2783 return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
2784 }
2785
2786public:
  /// Construct a rewriter for one partition of \p OldAI backed by \p NewAI.
  ///
  /// \param IsIntegerPromotable True when the partition passed
  ///        isIntegerWideningViable; IntTy is then an integer type spanning
  ///        the whole new alloca.
  /// \param PromotableVecTy Non-null iff the partition passed
  ///        isVectorPromotionViable; accesses are rewritten as vector ops.
  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
                      AllocaInst &OldAI, AllocaInst &NewAI, Type *NewAllocaTy,
                      uint64_t NewAllocaBeginOffset,
                      uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
                      VectorType *PromotableVecTy,
                      SmallSetVector<PHINode *, 8> &PHIUsers,
                      SmallSetVector<SelectInst *, 8> &SelectUsers)
      : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
        NewAllocaBeginOffset(NewAllocaBeginOffset),
        NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAllocaTy),
        IntTy(IsIntegerPromotable
                  ? Type::getIntNTy(
                        NewAI.getContext(),
                        DL.getTypeSizeInBits(NewAllocaTy).getFixedValue())
                  : nullptr),
        VecTy(PromotableVecTy),
        ElementTy(VecTy ? VecTy->getElementType() : nullptr),
        ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
                          : 0),
        PHIUsers(PHIUsers), SelectUsers(SelectUsers),
        IRB(NewAI.getContext(), ConstantFolder()) {
    if (VecTy) {
      // ElementSize above divided by 8; the element size must be whole bytes.
      assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
             "Only multiple-of-8 sized vector elements are viable");
      ++NumVectorized;
    }
    // Integer widening and vector promotion are mutually exclusive modes.
    assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
  }
2815
  /// Rewrite the single use recorded by slice \p I against the new alloca.
  ///
  /// Sets up the per-slice offset state (BeginOffset/EndOffset, their
  /// intersection with the new alloca, and SliceSize), points the builder at
  /// the user instruction, and dispatches through InstVisitor.
  ///
  /// \returns true if the rewritten access remains promotable by mem2reg.
  bool visit(AllocaSlices::const_iterator I) {
    bool CanSROA = true;
    BeginOffset = I->beginOffset();
    EndOffset = I->endOffset();
    IsSplittable = I->isSplittable();
    // A slice is "split" when it extends beyond this partition's bounds.
    IsSplit =
        BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
    LLVM_DEBUG(dbgs() << "  rewriting " << (IsSplit ? "split " : ""));
    LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
    LLVM_DEBUG(dbgs() << "\n");

    // Compute the intersecting offset range.
    assert(BeginOffset < NewAllocaEndOffset);
    assert(EndOffset > NewAllocaBeginOffset);
    NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
    NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);

    SliceSize = NewEndOffset - NewBeginOffset;
    LLVM_DEBUG(dbgs() << "   Begin:(" << BeginOffset << ", " << EndOffset
                      << ") NewBegin:(" << NewBeginOffset << ", "
                      << NewEndOffset << ") NewAllocaBegin:("
                      << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
                      << ")\n");
    assert(IsSplit || NewBeginOffset == BeginOffset);
    OldUse = I->getUse();
    OldPtr = cast<Instruction>(OldUse->get());

    // Aim the builder at the user and give new instructions names that
    // encode the alloca and slice offset for readable IR.
    Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
    IRB.SetInsertPoint(OldUserI);
    IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
    IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
                                    Twine(BeginOffset) + ".");

    CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
    // Promoted (vector/integer) partitions only admit rewritable accesses.
    if (VecTy || IntTy)
      assert(CanSROA);
    return CanSROA;
  }
2854
  /// Attempts to rewrite a partition using tree-structured merge optimization.
  ///
  /// This function analyzes a partition to determine if it can be optimized
  /// using a tree-structured merge pattern, where multiple non-overlapping
  /// stores completely fill an alloca. And there is no load from the alloca in
  /// the middle of the stores. Such patterns can be optimized by eliminating
  /// the intermediate stores and directly constructing the final vector by
  /// using shufflevectors.
  ///
  /// Example transformation:
  /// Before: (stores do not have to be in order)
  ///   %alloca = alloca <8 x float>
  ///   store <2 x float> %val0, ptr %alloca                  ; offset 0-1
  ///   store <2 x float> %val2, ptr %alloca+16               ; offset 4-5
  ///   store <2 x float> %val1, ptr %alloca+8                ; offset 2-3
  ///   store <2 x float> %val3, ptr %alloca+24               ; offset 6-7
  ///
  /// After:
  ///   %alloca = alloca <8 x float>
  ///   %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
  ///   i32 3>
  ///   %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
  ///   i32 3>
  ///   %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
  ///   i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ///   store %shuffle2, ptr %alloca
  ///
  /// The optimization looks for partitions that:
  /// 1. Have no overlapping split slice tails
  /// 2. Contain non-overlapping stores that cover the entire alloca
  /// 3. Have exactly one load that reads the complete alloca structure and not
  ///    in the middle of the stores (TODO: maybe we can relax the constraint
  ///    about reading the entire alloca structure)
  ///
  /// \param P The partition to analyze and potentially rewrite
  /// \return An optional vector of values that were deleted during the rewrite
  ///         process, or std::nullopt if the partition cannot be optimized
  ///         using tree-structured merge
  std::optional<SmallVector<Value *, 4>>
  rewriteTreeStructuredMerge(Partition &P) {
    // No tail slices that overlap with the partition
    if (P.splitSliceTails().size() > 0)
      return std::nullopt;

    SmallVector<Value *, 4> DeletedValues;
    LoadInst *TheLoad = nullptr;

    // Structure to hold store information
    struct StoreInfo {
      StoreInst *Store;      // The store instruction itself.
      uint64_t BeginOffset;  // Byte offset of the first byte stored.
      uint64_t EndOffset;    // Byte offset one past the last byte stored.
      Value *StoredValue;    // The value operand of the store.
      StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
          : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
    };

    SmallVector<StoreInfo, 4> StoreInfos;

    // If the new alloca is a fixed vector type, we use its element type as the
    // allocated element type, otherwise we use i8 as the allocated element
    Type *AllocatedEltTy =
        isa<FixedVectorType>(NewAllocaTy)
            ? cast<FixedVectorType>(NewAllocaTy)->getElementType()
            : Type::getInt8Ty(NewAI.getContext());
    unsigned AllocatedEltTySize = DL.getTypeSizeInBits(AllocatedEltTy);

    // Helper to check if a type is
    // 1. A fixed vector type
    // 2. The element type is not a pointer
    // 3. The element type size is byte-aligned
    // We only handle the cases that the ld/st meet these conditions
    auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
      auto *FixedVecTy = dyn_cast<FixedVectorType>(Ty);
      return FixedVecTy &&
             DL.getTypeSizeInBits(FixedVecTy->getElementType()) % 8 == 0 &&
             !FixedVecTy->getElementType()->isPointerTy();
    };

    // Single pass over the partition's slices: classify each user as the one
    // full-width load, a candidate store, or a disqualifying instruction.
    for (Slice &S : P) {
      auto *User = cast<Instruction>(S.getUse()->getUser());
      if (auto *LI = dyn_cast<LoadInst>(User)) {
        // Do not handle the case if
        // 1. There is more than one load
        // 2. The load is volatile
        // 3. The load does not read the entire alloca structure
        // 4. The load does not meet the conditions in the helper function
        if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) ||
            S.beginOffset() != NewAllocaBeginOffset ||
            S.endOffset() != NewAllocaEndOffset || LI->isVolatile())
          return std::nullopt;
        TheLoad = LI;
      } else if (auto *SI = dyn_cast<StoreInst>(User)) {
        // Do not handle the case if
        // 1. The store does not meet the conditions in the helper function
        // 2. The store is volatile
        // 3. The total store size is not a multiple of the allocated element
        //    type size
        if (!IsTypeValidForTreeStructuredMerge(
                SI->getValueOperand()->getType()) ||
            SI->isVolatile())
          return std::nullopt;
        auto *VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
        unsigned NumElts = VecTy->getNumElements();
        unsigned EltSize = DL.getTypeSizeInBits(VecTy->getElementType());
        if (NumElts * EltSize % AllocatedEltTySize != 0)
          return std::nullopt;
        StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
                                SI->getValueOperand());
      } else {
        // If we have instructions other than load and store, we cannot do the
        // tree structured merge
        return std::nullopt;
      }
    }
    // If we do not have any load, we cannot do the tree structured merge
    if (!TheLoad)
      return std::nullopt;

    // If we do not have multiple stores, we cannot do the tree structured merge
    if (StoreInfos.size() < 2)
      return std::nullopt;

    // Stores should not overlap and should cover the whole alloca
    // Sort by begin offset
    llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
      return A.BeginOffset < B.BeginOffset;
    });

    // Check for overlaps and coverage
    uint64_t ExpectedStart = NewAllocaBeginOffset;
    for (auto &StoreInfo : StoreInfos) {
      uint64_t BeginOff = StoreInfo.BeginOffset;
      uint64_t EndOff = StoreInfo.EndOffset;

      // Check for gap or overlap
      if (BeginOff != ExpectedStart)
        return std::nullopt;

      ExpectedStart = EndOff;
    }
    // Check that stores cover the entire alloca
    if (ExpectedStart != NewAllocaEndOffset)
      return std::nullopt;

    // Stores should be in the same basic block
    // The load should not be in the middle of the stores
    // Note:
    // If the load is in a different basic block with the stores, we can still
    // do the tree structured merge. This is because we do not have the
    // store->load forwarding here. The merged vector will be stored back to
    // NewAI and the new load will load from NewAI. The forwarding will be
    // handled later when we try to promote NewAI.
    BasicBlock *LoadBB = TheLoad->getParent();
    BasicBlock *StoreBB = StoreInfos[0].Store->getParent();

    for (auto &StoreInfo : StoreInfos) {
      if (StoreInfo.Store->getParent() != StoreBB)
        return std::nullopt;
      if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(TheLoad))
        return std::nullopt;
    }

    // If we reach here, the partition can be merged with a tree structured
    // merge
    LLVM_DEBUG({
      dbgs() << "Tree structured merge rewrite:\n  Load: " << *TheLoad
             << "\n  Ordered stores:\n";
      for (auto [i, Info] : enumerate(StoreInfos))
        dbgs() << "    [" << i << "] Range[" << Info.BeginOffset << ", "
               << Info.EndOffset << ") \tStore: " << *Info.Store
               << "\tValue: " << *Info.StoredValue << "\n";
    });

    // Instead of having these stores, we merge all the stored values into a
    // vector and store the merged value into the alloca
    std::queue<Value *> VecElements;
    // StoreInfos is sorted by offset, not by block order. Anchoring to
    // StoreInfos.back().Store (last by offset) can place shuffles before
    // operands that appear later in the block (invalid SSA). Insert before
    // TheLoad when it shares the store block (after all stores, before any
    // later IR in that block). Otherwise insert before the store block's
    // terminator so the merge runs after every store and any trailing
    // instructions in that block.
    IRBuilder<> Builder(LoadBB == StoreBB ? TheLoad : StoreBB->getTerminator());
    for (const auto &Info : StoreInfos) {
      DeletedValues.push_back(Info.Store);
      VecElements.push(Info.StoredValue);
    }

    // Pairwise-merge queue rounds: each round halves the element count,
    // carrying an odd leftover forward, until one merged vector remains.
    LLVM_DEBUG(dbgs() << "  Rewrite stores into shufflevectors:\n");
    while (VecElements.size() > 1) {
      const auto NumElts = VecElements.size();
      for ([[maybe_unused]] const auto _ : llvm::seq(NumElts / 2)) {
        Value *V0 = VecElements.front();
        VecElements.pop();
        Value *V1 = VecElements.front();
        VecElements.pop();
        Value *Merged = mergeTwoVectors(V0, V1, DL, AllocatedEltTy, Builder);
        LLVM_DEBUG(dbgs() << "    shufflevector: " << *Merged << "\n");
        VecElements.push(Merged);
      }
      if (NumElts % 2 == 1) {
        Value *V = VecElements.front();
        VecElements.pop();
        VecElements.push(V);
      }
    }

    // Store the merged value into the alloca
    Value *MergedValue = VecElements.front();
    Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());

    // Replace the original load with a fresh load of the merged value from
    // NewAI; mem2reg will forward the store to it during promotion.
    IRBuilder<> LoadBuilder(TheLoad);
    TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
        TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
        TheLoad->getName() + ".sroa.new.load"));
    DeletedValues.push_back(TheLoad);

    return DeletedValues;
  }
3076
3077private:
3078 // Make sure the other visit overloads are visible.
3079 using Base::visit;
3080
  // Every instruction which can end up as a user must have a rewrite rule.
  // Reaching this fallback means the slice-building phase admitted a user
  // that the rewriter has no handler for, which is a bug in the pass.
  bool visitInstruction(Instruction &I) {
    LLVM_DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
    llvm_unreachable("No rewrite rule for this instruction!");
  }
3086
3087 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3088 // Note that the offset computation can use BeginOffset or NewBeginOffset
3089 // interchangeably for unsplit slices.
3090 assert(IsSplit || BeginOffset == NewBeginOffset);
3091 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3092
3093 StringRef OldName = OldPtr->getName();
3094 // Skip through the last '.sroa.' component of the name.
3095 size_t LastSROAPrefix = OldName.rfind(".sroa.");
3096 if (LastSROAPrefix != StringRef::npos) {
3097 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
3098 // Look for an SROA slice index.
3099 size_t IndexEnd = OldName.find_first_not_of("0123456789");
3100 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3101 // Strip the index and look for the offset.
3102 OldName = OldName.substr(IndexEnd + 1);
3103 size_t OffsetEnd = OldName.find_first_not_of("0123456789");
3104 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3105 // Strip the offset.
3106 OldName = OldName.substr(OffsetEnd + 1);
3107 }
3108 }
3109 // Strip any SROA suffixes as well.
3110 OldName = OldName.substr(0, OldName.find(".sroa_"));
3111
3112 return getAdjustedPtr(IRB, DL, &NewAI,
3113 APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
3114 PointerTy, Twine(OldName) + ".");
3115 }
3116
  /// Compute suitable alignment to access this slice of the *new* alloca.
  ///
  /// This is the new alloca's own alignment reduced according to the slice's
  /// byte offset within it (via commonAlignment).
  Align getSliceAlign() {
    return commonAlignment(NewAI.getAlign(),
                           NewBeginOffset - NewAllocaBeginOffset);
  }
3126
3127 unsigned getIndex(uint64_t Offset) {
3128 assert(VecTy && "Can only call getIndex when rewriting a vector");
3129 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3130 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3131 uint32_t Index = RelOffset / ElementSize;
3132 assert(Index * ElementSize == RelOffset);
3133 return Index;
3134 }
3135
3136 void deleteIfTriviallyDead(Value *V) {
3139 Pass.DeadInsts.push_back(I);
3140 }
3141
  /// Rewrite a load over a vector-promoted partition: load the entire new
  /// alloca vector and extract the accessed element range from it.
  Value *rewriteVectorizedLoadInst(LoadInst &LI) {
    unsigned BeginIndex = getIndex(NewBeginOffset);
    unsigned EndIndex = getIndex(NewEndOffset);
    assert(EndIndex > BeginIndex && "Empty vector!");

    LoadInst *Load =
        IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");

    // Preserve loop-parallelism metadata on the widened load.
    Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
                            LLVMContext::MD_access_group});
    return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
  }
3154
  /// Rewrite a load over an integer-widened partition: load the whole
  /// widened integer, extract the addressed sub-range, and zero-extend when
  /// the original load read past the end of the slice.
  Value *rewriteIntegerLoad(LoadInst &LI) {
    assert(IntTy && "We cannot insert an integer to the alloca");
    assert(!LI.isVolatile());
    Value *V =
        IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
    V = IRB.CreateBitPreservingCastChain(DL, V, IntTy);
    assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
    // Only extract when the slice does not cover the full widened integer.
    if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
      IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
      V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
    }
    // It is possible that the extracted type is not the load type. This
    // happens if there is a load past the end of the alloca, and as
    // a consequence the slice is narrower but still a candidate for integer
    // lowering. To handle this case, we just zero extend the extracted
    // integer.
    assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
           "Can only handle an extract for an overly wide load");
    if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
      V = IRB.CreateZExt(V, LI.getType());
    return V;
  }
3178
  /// Rewrite a load of the original alloca to load from the new,
  /// partition-sized alloca instead.
  ///
  /// Dispatches to the vector / widened-integer rewrites when the partition
  /// was promoted to one of those forms; otherwise emits a direct load of the
  /// new alloca (when the slice covers it exactly and the types are
  /// convertible) or a load through an adjusted pointer into it. Split
  /// integer loads are reassembled with insertInteger so other split pieces
  /// can contribute their bytes.
  ///
  /// \returns true if the resulting access remains promotable by mem2reg.
  bool visitLoadInst(LoadInst &LI) {
    LLVM_DEBUG(dbgs() << "    original: " << LI << "\n");
    Value *OldOp = LI.getOperand(0);
    assert(OldOp == OldPtr);

    AAMDNodes AATags = LI.getAAMetadata();

    unsigned AS = LI.getPointerAddressSpace();

    // Split slices are always rewritten as integer loads of the slice width.
    Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
                             : LI.getType();
    bool IsPtrAdjusted = false;
    Value *V;
    if (VecTy) {
      V = rewriteVectorizedLoadInst(LI);
    } else if (IntTy && LI.getType()->isIntegerTy()) {
      V = rewriteIntegerLoad(LI);
    } else if (NewBeginOffset == NewAllocaBeginOffset &&
               NewEndOffset == NewAllocaEndOffset &&
               (canConvertValue(DL, NewAllocaTy, TargetTy) ||
                (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
                 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
                 !LI.isVolatile()))) {
      // The slice covers the whole new alloca: load it directly.
      Value *NewPtr =
          getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
      LoadInst *NewLI = IRB.CreateAlignedLoad(
          NewAllocaTy, NewPtr, NewAI.getAlign(), LI.isVolatile(), LI.getName());
      if (LI.isVolatile())
        NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
      if (NewLI->isAtomic())
        NewLI->setAlignment(LI.getAlign());

      // Copy any metadata that is valid for the new load. This may require
      // conversion to a different kind of metadata, e.g. !nonnull might change
      // to !range or vice versa.
      copyMetadataForLoad(*NewLI, LI);

      // Do this after copyMetadataForLoad() to preserve the TBAA shift.
      if (AATags)
        NewLI->setAAMetadata(AATags.adjustForAccess(
            NewBeginOffset - BeginOffset, NewLI->getType(), DL));

      // Try to preserve nonnull metadata
      V = NewLI;

      // If this is an integer load past the end of the slice (which means the
      // bytes outside the slice are undef or this load is dead) just forcibly
      // fix the integer size with correct handling of endianness.
      if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
        if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
          if (AITy->getBitWidth() < TITy->getBitWidth()) {
            V = IRB.CreateZExt(V, TITy, "load.ext");
            if (DL.isBigEndian())
              V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
                                "endian_shift");
          }
    } else {
      // General case: load through a pointer adjusted to the slice offset.
      Type *LTy = IRB.getPtrTy(AS);
      LoadInst *NewLI =
          IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
                                getSliceAlign(), LI.isVolatile(), LI.getName());

      if (AATags)
        NewLI->setAAMetadata(AATags.adjustForAccess(
            NewBeginOffset - BeginOffset, NewLI->getType(), DL));

      if (LI.isVolatile())
        NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
      NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
                               LLVMContext::MD_access_group});

      V = NewLI;
      IsPtrAdjusted = true;
    }
    V = IRB.CreateBitPreservingCastChain(DL, V, TargetTy);

    if (IsSplit) {
      assert(!LI.isVolatile());
      assert(LI.getType()->isIntegerTy() &&
             "Only integer type loads and stores are split");
      assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
             "Split load isn't smaller than original load");
      assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
             "Non-byte-multiple bit width");
      // Move the insertion point just past the load so that we can refer to it.
      BasicBlock::iterator LIIt = std::next(LI.getIterator());
      // Ensure the insertion point comes before any debug-info immediately
      // after the load, so that variable values referring to the load are
      // dominated by it.
      LIIt.setHeadBit(true);
      IRB.SetInsertPoint(LI.getParent(), LIIt);
      // Create a placeholder value with the same type as LI to use as the
      // basis for the new value. This allows us to replace the uses of LI with
      // the computed value, and then replace the placeholder with LI, leaving
      // LI only used for this computation.
      Value *Placeholder =
          new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
                       false, Align(1));
      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
                        "insert");
      LI.replaceAllUsesWith(V);
      Placeholder->replaceAllUsesWith(&LI);
      Placeholder->deleteValue();
    } else {
      LI.replaceAllUsesWith(V);
    }

    Pass.DeadInsts.push_back(&LI);
    deleteIfTriviallyDead(OldOp);
    LLVM_DEBUG(dbgs() << "          to: " << *V << "\n");
    // Volatile or pointer-adjusted loads keep the alloca escaped from
    // mem2reg's perspective.
    return !LI.isVolatile() && !IsPtrAdjusted;
  }
3291
  /// Rewrite a store into a vector-promoted partition.
  ///
  /// If the stored value is narrower than the promoted vector, the current
  /// vector is loaded and the value is blended in via insertVector before the
  /// full vector is stored back.
  ///
  /// \returns true — vectorized stores are always promotable.
  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
                                  AAMDNodes AATags) {
    // Capture V for the purpose of debug-info accounting once it's converted
    // to a vector store.
    Value *OrigV = V;
    if (V->getType() != VecTy) {
      unsigned BeginIndex = getIndex(NewBeginOffset);
      unsigned EndIndex = getIndex(NewEndOffset);
      assert(EndIndex > BeginIndex && "Empty vector!");
      unsigned NumElements = EndIndex - BeginIndex;
      assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
             "Too many elements!");
      Type *SliceTy = (NumElements == 1)
                          ? ElementTy
                          : FixedVectorType::get(ElementTy, NumElements);
      if (V->getType() != SliceTy)
        V = IRB.CreateBitPreservingCastChain(DL, V, SliceTy);

      // Mix in the existing elements.
      Value *Old =
          IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
      V = insertVector(IRB, Old, V, BeginIndex, "vec");
    }
    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
    Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
    if (AATags)
      Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
                                                  V->getType(), DL));
    Pass.DeadInsts.push_back(&SI);

    // NOTE: Careful to use OrigV rather than V.
    migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
                     Store, Store->getPointerOperand(), OrigV, DL);
    LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
    return true;
  }
3329
  /// Rewrite a store into an alloca that is covered by a single wide integer.
  ///
  /// When the stored value is narrower than the alloca-wide integer, the
  /// existing integer is loaded, the stored bits are inserted at the correct
  /// byte offset, and the whole integer is stored back (read-modify-write).
  /// Returns true: the rewritten store is promotable.
  bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
    assert(IntTy && "We cannot extract an integer from the alloca");
    assert(!SI.isVolatile());
    if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
        IntTy->getBitWidth()) {
      // Partial store: merge the stored bits into the current contents.
      Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
                                         "oldload");
      Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
      V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
    }
    V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
    // Preserve loop-parallelism metadata on the rewritten store.
    Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
    if (AATags)
      Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
                                                  V->getType(), DL));

    migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
                     Store, Store->getPointerOperand(),
                     Store->getValueOperand(), DL);

    Pass.DeadInsts.push_back(&SI);
    LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
    return true;
  }
3358
  /// Rewrite a store whose pointer operand is the old alloca pointer.
  ///
  /// Dispatches to the vectorized / integer-widened rewrites when the new
  /// alloca was promoted to a vector or wide integer; otherwise emits a plain
  /// store either directly to the new alloca (whole-alloca case) or through
  /// an adjusted slice pointer. Returns true iff the result is promotable.
  bool visitStoreInst(StoreInst &SI) {
    LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
    Value *OldOp = SI.getOperand(1);
    assert(OldOp == OldPtr);

    AAMDNodes AATags = SI.getAAMetadata();
    Value *V = SI.getValueOperand();

    // Strip all inbounds GEPs and pointer casts to try to dig out any root
    // alloca that should be re-examined after promoting this alloca.
    if (V->getType()->isPointerTy())
      if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
        Pass.PostPromotionWorklist.insert(AI);

    TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
    if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
      // The slice covers only part of the stored integer: extract just the
      // bytes that land in this slice.
      assert(!SI.isVolatile());
      assert(V->getType()->isIntegerTy() &&
             "Only integer type loads and stores are split");
      assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
             "Non-byte-multiple bit width");
      IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
                         "extract");
    }

    if (VecTy)
      return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
    if (IntTy && V->getType()->isIntegerTy())
      return rewriteIntegerStore(V, SI, AATags);

    StoreInst *NewSI;
    if (NewBeginOffset == NewAllocaBeginOffset &&
        NewEndOffset == NewAllocaEndOffset &&
        canConvertValue(DL, V->getType(), NewAllocaTy)) {
      // The store exactly covers the new alloca: store straight to it.
      V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
      Value *NewPtr =
          getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());

      NewSI =
          IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
    } else {
      unsigned AS = SI.getPointerAddressSpace();
      Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
      NewSI =
          IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
    }
    NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
    if (AATags)
      NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
                                                  V->getType(), DL));
    if (SI.isVolatile())
      NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
    if (NewSI->isAtomic())
      NewSI->setAlignment(SI.getAlign());

    migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
                     NewSI, NewSI->getPointerOperand(),
                     NewSI->getValueOperand(), DL);

    Pass.DeadInsts.push_back(&SI);
    deleteIfTriviallyDead(OldOp);

    LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
    // Promotable only when we stored the alloca-wide type directly to the new
    // alloca and the store is not volatile.
    return NewSI->getPointerOperand() == &NewAI &&
           NewSI->getValueOperand()->getType() == NewAllocaTy &&
           !SI.isVolatile();
  }
3428
  /// Compute an integer value from splatting an i8 across the given
  /// number of bytes.
  ///
  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
  /// call this routine.
  /// FIXME: Heed the advice above.
  ///
  /// \param V The i8 value to splat.
  /// \param Size The number of bytes in the output (assuming i8 is one byte)
  Value *getIntegerSplat(Value *V, unsigned Size) {
    assert(Size > 0 && "Expected a positive number of bytes.");
    IntegerType *VTy = cast<IntegerType>(V->getType());
    assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
    if (Size == 1)
      return V;

    Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
    // Replicate the byte into every byte of the wide integer by multiplying
    // the zero-extended byte by 0x0101...01, which is computed here as
    // (all-ones of the wide type) udiv (0xFF zero-extended to the wide type).
    V = IRB.CreateMul(
        IRB.CreateZExt(V, SplatIntTy, "zext"),
        IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
                       IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
                                      SplatIntTy)),
        "isplat");
    return V;
  }
3454
3455 /// Compute a vector splat for a given element value.
3456 Value *getVectorSplat(Value *V, unsigned NumElements) {
3457 V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
3458 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3459 return V;
3460 }
3461
  /// Rewrite a memset that writes into this slice of the alloca.
  ///
  /// Variable-length memsets merely get their destination pointer adjusted.
  /// Constant-length memsets are either narrowed to a smaller memset or, when
  /// the slice maps cleanly onto the alloca type, turned into a splatted
  /// scalar/vector store. Returns true iff the result is promotable.
  bool visitMemSetInst(MemSetInst &II) {
    LLVM_DEBUG(dbgs() << " original: " << II << "\n");
    assert(II.getRawDest() == OldPtr);

    AAMDNodes AATags = II.getAAMetadata();

    // If the memset has a variable size, it cannot be split, just adjust the
    // pointer to the new alloca.
    if (!isa<ConstantInt>(II.getLength())) {
      assert(!IsSplit);
      assert(NewBeginOffset == BeginOffset);
      II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
      II.setDestAlignment(getSliceAlign());
      // In theory we should call migrateDebugInfo here. However, we do not
      // emit dbg.assign intrinsics for mem intrinsics storing through non-
      // constant geps, or storing a variable number of bytes.
      // NOTE(review): the opening line(s) of the assert below are not visible
      // in this view of the file — confirm against the full source.
             "AT: Unexpected link to non-const GEP");
      deleteIfTriviallyDead(OldPtr);
      return false;
    }

    // Record this instruction for deletion.
    Pass.DeadInsts.push_back(&II);

    Type *ScalarTy = NewAllocaTy->getScalarType();

    // Decide whether the memset can be rewritten as a store of the new alloca
    // type rather than being re-emitted as a (narrowed) memset.
    const bool CanContinue = [&]() {
      if (VecTy || IntTy)
        return true;
      if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
        return false;
      // Length must be in range for FixedVectorType.
      auto *C = cast<ConstantInt>(II.getLength());
      const uint64_t Len = C->getLimitedValue();
      if (Len > std::numeric_limits<unsigned>::max())
        return false;
      auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
      auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
      return canConvertValue(DL, SrcTy, NewAllocaTy) &&
             DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
    }();

    // If this doesn't map cleanly onto the alloca type, and that type isn't
    // a single value type, just emit a memset.
    if (!CanContinue) {
      Type *SizeTy = II.getLength()->getType();
      unsigned Sz = NewEndOffset - NewBeginOffset;
      Constant *Size = ConstantInt::get(SizeTy, Sz);
      MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
          getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
          MaybeAlign(getSliceAlign()), II.isVolatile()));
      if (AATags)
        New->setAAMetadata(
            AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz));

      migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
                       New, New->getRawDest(), nullptr, DL);

      LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
      return false;
    }

    // If we can represent this as a simple value, we have to build the actual
    // value to store, which requires expanding the byte present in memset to
    // a sensible representation for the alloca type. This is essentially
    // splatting the byte to a sufficiently wide integer, splatting it across
    // any desired vector width, and bitcasting to the final type.
    Value *V;

    if (VecTy) {
      // If this is a memset of a vectorized alloca, insert it.
      assert(ElementTy == ScalarTy);

      unsigned BeginIndex = getIndex(NewBeginOffset);
      unsigned EndIndex = getIndex(NewEndOffset);
      assert(EndIndex > BeginIndex && "Empty vector!");
      unsigned NumElements = EndIndex - BeginIndex;
      assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
             "Too many elements!");

      Value *Splat = getIntegerSplat(
          II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
      Splat = IRB.CreateBitPreservingCastChain(DL, Splat, ElementTy);
      if (NumElements > 1)
        Splat = getVectorSplat(Splat, NumElements);

      // Read-modify-write: blend the splat over the touched elements only.
      Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
                                         "oldload");
      V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
    } else if (IntTy) {
      // If this is a memset on an alloca where we can widen stores, insert the
      // set integer.
      assert(!II.isVolatile());

      uint64_t Size = NewEndOffset - NewBeginOffset;
      V = getIntegerSplat(II.getValue(), Size);

      if (IntTy && (NewBeginOffset != NewAllocaBeginOffset ||
                    NewEndOffset != NewAllocaEndOffset)) {
        // Partial overwrite: merge the splatted bytes into the existing
        // alloca-wide integer.
        Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI,
                                           NewAI.getAlign(), "oldload");
        Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
        uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
        V = insertInteger(DL, IRB, Old, V, Offset, "insert");
      } else {
        assert(V->getType() == IntTy &&
               "Wrong type for an alloca wide integer!");
      }
      V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
    } else {
      // Established these invariants above.
      assert(NewBeginOffset == NewAllocaBeginOffset);
      assert(NewEndOffset == NewAllocaEndOffset);

      V = getIntegerSplat(II.getValue(),
                          DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
      if (VectorType *AllocaVecTy = dyn_cast<VectorType>(NewAllocaTy))
        V = getVectorSplat(
            V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());

      V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
    }

    Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
    StoreInst *New =
        IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
    New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
                           LLVMContext::MD_access_group});
    if (AATags)
      New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
                                                V->getType(), DL));

    migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
                     New, New->getPointerOperand(), V, DL);

    LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
    return !II.isVolatile();
  }
3601
  /// Rewrite a memcpy/memmove that touches this slice of the alloca.
  ///
  /// Unsplittable transfers only have their pointer operand adjusted in
  /// place. Splittable transfers are either re-emitted as a narrowed memcpy
  /// or converted into a load/store pair through the new alloca. Returns
  /// true iff the rewritten access is promotable.
  bool visitMemTransferInst(MemTransferInst &II) {
    // Rewriting of memory transfer instructions can be a bit tricky. We break
    // them into two categories: split intrinsics and unsplit intrinsics.

    LLVM_DEBUG(dbgs() << " original: " << II << "\n");

    AAMDNodes AATags = II.getAAMetadata();

    bool IsDest = &II.getRawDestUse() == OldUse;
    assert((IsDest && II.getRawDest() == OldPtr) ||
           (!IsDest && II.getRawSource() == OldPtr));

    Align SliceAlign = getSliceAlign();
    // For unsplit intrinsics, we simply modify the source and destination
    // pointers in place. This isn't just an optimization, it is a matter of
    // correctness. With unsplit intrinsics we may be dealing with transfers
    // within a single alloca before SROA ran, or with transfers that have
    // a variable length. We may also be dealing with memmove instead of
    // memcpy, and so simply updating the pointers is the necessary for us to
    // update both source and dest of a single call.
    if (!IsSplittable) {
      Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
      if (IsDest) {
        // Update the address component of linked dbg.assigns.
        for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) {
          if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
              DbgAssign->getAddress() == II.getDest())
            DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
        }
        II.setDest(AdjustedPtr);
        II.setDestAlignment(SliceAlign);
      } else {
        II.setSource(AdjustedPtr);
        II.setSourceAlignment(SliceAlign);
      }

      LLVM_DEBUG(dbgs() << " to: " << II << "\n");
      deleteIfTriviallyDead(OldPtr);
      return false;
    }
    // For split transfer intrinsics we have an incredibly useful assurance:
    // the source and destination do not reside within the same alloca, and at
    // least one of them does not escape. This means that we can replace
    // memmove with memcpy, and we don't need to worry about all manner of
    // downsides to splitting and transforming the operations.

    // If this doesn't map cleanly onto the alloca type, and that type isn't
    // a single value type, just emit a memcpy.
    bool EmitMemCpy =
        !VecTy && !IntTy &&
        (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
         SliceSize != DL.getTypeStoreSize(NewAllocaTy).getFixedValue() ||
         !DL.typeSizeEqualsStoreSize(NewAllocaTy) ||
         !NewAllocaTy->isSingleValueType());

    // If we're just going to emit a memcpy, the alloca hasn't changed, and the
    // size hasn't been shrunk based on analysis of the viable range, this is
    // a no-op.
    if (EmitMemCpy && &OldAI == &NewAI) {
      // Ensure the start lines up.
      assert(NewBeginOffset == BeginOffset);

      // Rewrite the size as needed.
      if (NewEndOffset != EndOffset)
        II.setLength(NewEndOffset - NewBeginOffset);
      return false;
    }
    // Record this instruction for deletion.
    Pass.DeadInsts.push_back(&II);

    // Strip all inbounds GEPs and pointer casts to try to dig out any root
    // alloca that should be re-examined after rewriting this instruction.
    Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
    if (AllocaInst *AI =
            // NOTE(review): the dyn_cast expression initializing AI is not
            // visible in this view of the file — confirm against full source.
      assert(AI != &OldAI && AI != &NewAI &&
             "Splittable transfers cannot reach the same alloca on both ends.");
      Pass.Worklist.insert(AI);
    }

    Type *OtherPtrTy = OtherPtr->getType();
    unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();

    // Compute the relative offset for the other pointer within the transfer.
    unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
    APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
    Align OtherAlign =
        (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
    OtherAlign =
        commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());

    if (EmitMemCpy) {
      // Compute the other pointer, folding as much as possible to produce
      // a single, simple GEP in most cases.
      OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
                                OtherPtr->getName() + ".");

      Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
      Type *SizeTy = II.getLength()->getType();
      Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);

      Value *DestPtr, *SrcPtr;
      MaybeAlign DestAlign, SrcAlign;
      // Note: IsDest is true iff we're copying into the new alloca slice
      if (IsDest) {
        DestPtr = OurPtr;
        DestAlign = SliceAlign;
        SrcPtr = OtherPtr;
        SrcAlign = OtherAlign;
      } else {
        DestPtr = OtherPtr;
        DestAlign = OtherAlign;
        SrcPtr = OurPtr;
        SrcAlign = SliceAlign;
      }
      CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
                                       Size, II.isVolatile());
      if (AATags)
        New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));

      APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
      if (IsDest) {
        migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
                         &II, New, DestPtr, nullptr, DL);
      } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
                     // NOTE(review): the stripped-pointer expression here is
                     // not visible in this view of the file.
                     DL, Offset, /*AllowNonInbounds*/ true))) {
        migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
                         SliceSize * 8, &II, New, DestPtr, nullptr, DL);
      }
      LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
      return false;
    }

    bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
                         NewEndOffset == NewAllocaEndOffset;
    uint64_t Size = NewEndOffset - NewBeginOffset;
    unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
    unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
    unsigned NumElements = EndIndex - BeginIndex;
    IntegerType *SubIntTy =
        IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;

    // Reset the other pointer type to match the register type we're going to
    // use, but using the address space of the original other pointer.
    Type *OtherTy;
    if (VecTy && !IsWholeAlloca) {
      if (NumElements == 1)
        OtherTy = VecTy->getElementType();
      else
        OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
    } else if (IntTy && !IsWholeAlloca) {
      OtherTy = SubIntTy;
    } else {
      OtherTy = NewAllocaTy;
    }

    Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
                                   OtherPtr->getName() + ".");
    MaybeAlign SrcAlign = OtherAlign;
    MaybeAlign DstAlign = SliceAlign;
    if (!IsDest)
      std::swap(SrcAlign, DstAlign);

    Value *SrcPtr;
    Value *DstPtr;

    if (IsDest) {
      DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
      SrcPtr = AdjPtr;
    } else {
      DstPtr = AdjPtr;
      SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
    }

    // Produce the value to store: either an extracted sub-value of the new
    // alloca (when copying out of a partially-covered vector/integer) or a
    // plain load from the source pointer.
    Value *Src;
    if (VecTy && !IsWholeAlloca && !IsDest) {
      Src =
          IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
      Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
    } else if (IntTy && !IsWholeAlloca && !IsDest) {
      Src =
          IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
      Src = IRB.CreateBitPreservingCastChain(DL, Src, IntTy);
      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
      Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
    } else {
      LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
                                             II.isVolatile(), "copyload");
      Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
                              LLVMContext::MD_access_group});
      if (AATags)
        Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
                                                   Load->getType(), DL));
      Src = Load;
    }

    // When copying into a partially-covered vector/integer alloca, blend the
    // incoming value into the existing contents first.
    if (VecTy && !IsWholeAlloca && IsDest) {
      Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
                                         "oldload");
      Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
    } else if (IntTy && !IsWholeAlloca && IsDest) {
      Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
                                         "oldload");
      Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
      Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
      Src = IRB.CreateBitPreservingCastChain(DL, Src, NewAllocaTy);
    }

    StoreInst *Store = cast<StoreInst>(
        IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
    Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
    if (AATags)
      Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
                                                  Src->getType(), DL));

    APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
    if (IsDest) {

      migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
                       Store, DstPtr, Src, DL);
    } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
                   // NOTE(review): the stripped-pointer expression here is
                   // not visible in this view of the file.
                   DL, Offset, /*AllowNonInbounds*/ true))) {
      migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
                       &II, Store, DstPtr, Src, DL);
    }

    LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
    return !II.isVolatile();
  }
3835
  /// Rewrite lifetime markers and droppable intrinsics (assumes) that refer
  /// to the old alloca pointer.
  ///
  /// Droppable uses are simply dropped; lifetime markers are re-emitted
  /// against the new alloca slice pointer. Returns true: these intrinsics
  /// never block promotion.
  bool visitIntrinsicInst(IntrinsicInst &II) {
    assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
           "Unexpected intrinsic!");
    LLVM_DEBUG(dbgs() << " original: " << II << "\n");

    // Record this instruction for deletion.
    Pass.DeadInsts.push_back(&II);

    if (II.isDroppable()) {
      assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
      // TODO For now we forget assumed information, this can be improved.
      OldPtr->dropDroppableUsesIn(II);
      return true;
    }

    assert(II.getArgOperand(0) == OldPtr);
    // Re-emit the lifetime marker on the adjusted slice pointer.
    Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
    Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
    Value *New;
    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
      New = IRB.CreateLifetimeStart(Ptr);
    else
      New = IRB.CreateLifetimeEnd(Ptr);

    (void)New;
    LLVM_DEBUG(dbgs() << " to: " << *New << "\n");

    return true;
  }
3865
  /// Clamp the alignment of every load and store reachable through Root so
  /// it does not exceed what the rewritten alloca slice can guarantee.
  void fixLoadStoreAlign(Instruction &Root) {
    // This algorithm implements the same visitor loop as
    // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
    // or store found.
    SmallPtrSet<Instruction *, 4> Visited;
    SmallVector<Instruction *, 4> Uses;
    Visited.insert(&Root);
    Uses.push_back(&Root);
    do {
      Instruction *I = Uses.pop_back_val();

      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
        // Only ever lower the alignment, never raise it.
        LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
        SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
        continue;
      }

      // Otherwise follow users transitively; the Visited set prevents
      // revisiting through PHI cycles.
      for (User *U : I->users())
        if (Visited.insert(cast<Instruction>(U)).second)
          Uses.push_back(cast<Instruction>(U));
    } while (!Uses.empty());
  }
3894
  /// Rewrite a PHI node that uses the old alloca pointer.
  ///
  /// Swaps the new slice pointer in for every PHI operand that referenced
  /// the old pointer and queues the PHI for later speculation analysis.
  bool visitPHINode(PHINode &PN) {
    LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
    assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
    assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");

    // We would like to compute a new pointer in only one place, but have it be
    // as local as possible to the PHI. To do that, we re-use the location of
    // the old pointer, which necessarily must be in the right position to
    // dominate the PHI.
    IRBuilderBase::InsertPointGuard Guard(IRB);
    if (isa<PHINode>(OldPtr))
      IRB.SetInsertPoint(OldPtr->getParent(),
                         OldPtr->getParent()->getFirstInsertionPt());
    else
      IRB.SetInsertPoint(OldPtr);
    IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());

    Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
    // Replace the operands which were using the old pointer.
    std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);

    LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
    deleteIfTriviallyDead(OldPtr);

    // Fix the alignment of any loads or stores using this PHI node.
    fixLoadStoreAlign(PN);

    // PHIs can't be promoted on their own, but often can be speculated. We
    // check the speculation outside of the rewriter so that we see the
    // fully-rewritten alloca.
    PHIUsers.insert(&PN);
    return true;
  }
3928
3929 bool visitSelectInst(SelectInst &SI) {
3930 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3931 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3932 "Pointer isn't an operand!");
3933 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3934 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3935
3936 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3937 // Replace the operands which were using the old pointer.
3938 if (SI.getOperand(1) == OldPtr)
3939 SI.setOperand(1, NewPtr);
3940 if (SI.getOperand(2) == OldPtr)
3941 SI.setOperand(2, NewPtr);
3942
3943 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3944 deleteIfTriviallyDead(OldPtr);
3945
3946 // Fix the alignment of any loads or stores using this select.
3947 fixLoadStoreAlign(SI);
3948
3949 // Selects can't be promoted on their own, but often can be speculated. We
3950 // check the speculation outside of the rewriter so that we see the
3951 // fully-rewritten alloca.
3952 SelectUsers.insert(&SI);
3953 return true;
3954 }
3955};
3956
3957/// Visitor to rewrite aggregate loads and stores as scalar.
3958///
3959/// This pass aggressively rewrites all aggregate loads and stores on
3960/// a particular pointer (or any pointer derived from it which we can identify)
3961/// with scalar loads and stores.
3962class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3963 // Befriend the base class so it can delegate to private visit methods.
3964 friend class InstVisitor<AggLoadStoreRewriter, bool>;
3965
3966 /// Queue of pointer uses to analyze and potentially rewrite.
3968
3969 /// Set to prevent us from cycling with phi nodes and loops.
3970 SmallPtrSet<User *, 8> Visited;
3971
3972 /// The current pointer use being rewritten. This is used to dig up the used
3973 /// value (as opposed to the user).
3974 Use *U = nullptr;
3975
3976 /// Used to calculate offsets, and hence alignment, of subobjects.
3977 const DataLayout &DL;
3978
3979 IRBuilderTy &IRB;
3980
3981public:
  /// Construct a rewriter that shares the caller's data layout and IR builder.
  AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
      : DL(DL), IRB(IRB) {}
3984
3985 /// Rewrite loads and stores through a pointer and all pointers derived from
3986 /// it.
  bool rewrite(Instruction &I) {
    LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
    enqueueUsers(I);
    bool Changed = false;
    // Worklist traversal: visiting one use may enqueue further uses of
    // derived pointers.
    while (!Queue.empty()) {
      U = Queue.pop_back_val();
      Changed |= visit(cast<Instruction>(U->getUser()));
    }
    return Changed;
  }
3997
3998private:
3999 /// Enqueue all the users of the given instruction for further processing.
4000 /// This uses a set to de-duplicate users.
  void enqueueUsers(Instruction &I) {
    // Only enqueue each user once; Visited also breaks cycles through PHIs.
    for (Use &U : I.uses())
      if (Visited.insert(U.getUser()).second)
        Queue.push_back(&U);
  }
4006
  // Conservative default is to not rewrite anything: any instruction kind
  // without a dedicated visitor below is left untouched.
  bool visitInstruction(Instruction &I) { return false; }
4009
4010 /// Generic recursive split emission class.
  /// Generic recursive split emission class.
  template <typename Derived> class OpSplitter {
  protected:
    /// The builder used to form new instructions.
    IRBuilderTy &IRB;

    /// The indices which to be used with insert- or extractvalue to select the
    /// appropriate value within the aggregate.
    SmallVector<unsigned, 4> Indices;

    /// The indices to a GEP instruction which will move Ptr to the correct slot
    /// within the aggregate.
    SmallVector<Value *, 4> GEPIndices;

    /// The base pointer of the original op, used as a base for GEPing the
    /// split operations.
    Value *Ptr;

    /// The base pointee type being GEPed into.
    Type *BaseTy;

    /// Known alignment of the base pointer.
    Align BaseAlign;

    /// To calculate offset of each component so we can correctly deduce
    /// alignments.
    const DataLayout &DL;

    /// Initialize the splitter with an insertion point, Ptr and start with a
    /// single zero GEP index.
    OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
               Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
        : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
          BaseAlign(BaseAlign), DL(DL) {
      IRB.SetInsertPoint(InsertionPoint);
    }

  public:
    /// Generic recursive split emission routine.
    ///
    /// This method recursively splits an aggregate op (load or store) into
    /// scalar or vector ops. It splits recursively until it hits a single value
    /// and emits that single value operation via the template argument.
    ///
    /// The logic of this routine relies on GEPs and insertvalue and
    /// extractvalue all operating with the same fundamental index list, merely
    /// formatted differently (GEPs need actual values).
    ///
    /// \param Ty The type being split recursively into smaller ops.
    /// \param Agg The aggregate value being built up or stored, depending on
    /// whether this is splitting a load or a store respectively.
    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
      if (Ty->isSingleValueType()) {
        // Leaf case: hand this scalar/vector component to the Derived emitter
        // with the alignment implied by its byte offset from the base.
        unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
        return static_cast<Derived *>(this)->emitFunc(
            Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
      }

      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        // Recurse into each array element, pushing/popping its index around
        // the recursive call so both index lists stay in sync.
        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Idx);
          GEPIndices.push_back(IRB.getInt32(Idx));
          emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      if (StructType *STy = dyn_cast<StructType>(Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        // Same recursion for struct fields, which may have differing types.
        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Idx);
          GEPIndices.push_back(IRB.getInt32(Idx));
          emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      llvm_unreachable("Only arrays and structs are aggregate loadable types");
    }
  };
4101
  /// Splitter that turns one aggregate load into a series of scalar loads
  /// plus insertvalues, and re-emits any fake uses per split component.
  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
    AAMDNodes AATags;
    // A vector to hold the split components that we want to emit
    // separate fake uses for.
    SmallVector<Value *, 4> Components;
    // A vector to hold all the fake uses of the struct that we are splitting.
    // Usually there should only be one, but we are handling the general case.
    // NOTE(review): the declaration of the FakeUses member is not visible in
    // this view of the file — confirm against the full source.

    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
                   AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
                   IRBuilderTy &IRB)
        : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
                                     IRB),
          AATags(AATags) {}

    /// Emit a leaf load of a single value. This is called at the leaves of the
    /// recursive emission to actually load values.
    void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
      // NOTE(review): a line (presumably an assert) between the signature and
      // this comment is not visible in this view of the file.
      // Load the single value and insert it using the indices.
      Value *GEP =
          IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
      LoadInst *Load =
          IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");

      APInt Offset(
          DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
      if (AATags &&
          GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
        Load->setAAMetadata(
            AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL));
      // Record the load so we can generate a fake use for this aggregate
      // component.
      Components.push_back(Load);

      Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
      LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
    }

    // Stash the fake uses that use the value generated by this instruction.
    void recordFakeUses(LoadInst &LI) {
      for (Use &U : LI.uses())
        if (auto *II = dyn_cast<IntrinsicInst>(U.getUser()))
          if (II->getIntrinsicID() == Intrinsic::fake_use)
            FakeUses.push_back(II);
    }

    // Replace all fake uses of the aggregate with a series of fake uses, one
    // for each split component.
    void emitFakeUses() {
      for (Instruction *I : FakeUses) {
        IRB.SetInsertPoint(I);
        for (auto *V : Components)
          IRB.CreateIntrinsic(Intrinsic::fake_use, {V});
        I->eraseFromParent();
      }
    }
  };
4161
/// Rewrite an aggregate-typed load as a sequence of scalar leaf loads.
///
/// Simple (non-volatile, non-atomic) loads whose type is not a single-value
/// type are decomposed with LoadOpSplitter; the original load is replaced by
/// the reassembled aggregate value and erased.
/// \returns true if the load was split and erased, false if left untouched.
4162 bool visitLoadInst(LoadInst &LI) {
4163 assert(LI.getPointerOperand() == *U);
4164 if (!LI.isSimple() || LI.getType()->isSingleValueType())
4165 return false;
4166
4167 // We have an aggregate being loaded, split it apart.
4168 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4169 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4170 getAdjustedAlignment(&LI, 0), DL, IRB);
// Capture fake uses of LI first so they can be re-emitted per component
// after splitting.
4171 Splitter.recordFakeUses(LI);
// V accumulates the reassembled aggregate as emitSplitOps inserts each
// leaf load into it.
4173 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
4174 Splitter.emitFakeUses();
4175 Visited.erase(&LI);
4176 LI.replaceAllUsesWith(V);
4177 LI.eraseFromParent();
4178 return true;
4179 }
4180
/// Splits an aggregate-typed store into per-leaf scalar stores.
///
/// Derived OpSplitter whose emitFunc extracts each single-value leaf from the
/// stored aggregate and writes it through a GEP to the matching offset,
/// propagating AA metadata and debug-assignment info per fragment.
4181 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4182 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4183 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4184 const DataLayout &DL, IRBuilderTy &IRB)
4185 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4186 DL, IRB),
4187 AATags(AATags), AggStore(AggStore) {}
// AA metadata of the original aggregate store, offset-adjusted onto each
// leaf store.
4188 AAMDNodes AATags;
// The original aggregate store; passed to migrateDebugInfo so dbg.assign
// records get transferred onto the split stores.
4189 StoreInst *AggStore;
4190 /// Emit a leaf store of a single value. This is called at the leaves of the
4191 /// recursive emission to actually produce stores.
4192 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4194 // Extract the single value and store it using the indices.
4195 //
4196 // The gep and extractvalue values are factored out of the CreateStore
4197 // call to make the output independent of the argument evaluation order.
4198 Value *ExtractValue =
4199 IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
4200 Value *InBoundsGEP =
4201 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4202 StoreInst *Store =
4203 IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
4204
// Compute this leaf's constant byte offset; used both for AA metadata
// adjustment and for the debug-info fragment below.
4205 APInt Offset(
4206 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4207 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
4208 if (AATags) {
4209 Store->setAAMetadata(AATags.adjustForAccess(
4210 Offset.getZExtValue(), ExtractValue->getType(), DL));
4211 }
4212
4213 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4214 // If we cannot (because there's an intervening non-const or unbounded
4215 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4216 // this instruction.
// NOTE(review): 'Base' appears to be the store pointer with in-bounds
// offsets stripped; its computation is not shown in this excerpt — confirm.
4218 if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
4219 uint64_t SizeInBits =
4220 DL.getTypeSizeInBits(Store->getValueOperand()->getType());
// Offset is in bytes; migrateDebugInfo takes a bit offset, hence * 8.
4221 migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
4222 SizeInBits, AggStore, Store,
4223 Store->getPointerOperand(), Store->getValueOperand(),
4224 DL);
4225 } else {
4227 "AT: unexpected debug.assign linked to store through "
4228 "unbounded GEP");
4229 }
4230 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4231 }
4232 };
4233
/// Rewrite an aggregate-typed store as a sequence of scalar leaf stores.
///
/// Simple stores through this pointer whose value operand is not a
/// single-value type are decomposed with StoreOpSplitter; the original store
/// is then erased.
/// \returns true if the store was split and erased, false if left untouched.
4234 bool visitStoreInst(StoreInst &SI) {
4235 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4236 return false;
4237 Value *V = SI.getValueOperand();
4238 if (V->getType()->isSingleValueType())
4239 return false;
4240
4241 // We have an aggregate being stored, split it apart.
4242 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
// Pass &SI as the AggStore so per-fragment debug info can be migrated
// from the original aggregate store.
4243 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4244 getAdjustedAlignment(&SI, 0), DL, IRB);
4245 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
4246 Visited.erase(&SI);
4247 // The stores replacing SI each have markers describing fragments of the
4248 // assignment so delete the assignment markers linked to SI.
4250 SI.eraseFromParent();
4251 return true;
4252 }
4253
// Bitcasts don't access memory themselves; just walk through to their users.
// Returns false: no rewrite is performed at the cast itself.
4254 bool visitBitCastInst(BitCastInst &BC) {
4255 enqueueUsers(BC);
4256 return false;
4257 }
4258
// Address-space casts likewise don't access memory; continue to their users.
// Returns false: no rewrite is performed at the cast itself.
4259 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4260 enqueueUsers(ASC);
4261 return false;
4262 }
4263
4264 // Unfold gep (select cond, ptr1, ptr2), idx
4265 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4266 // and gep ptr, (select cond, idx1, idx2)
4267 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
4268 // We also allow for i1 zext indices, which are equivalent to selects.
// Fires only when exactly one operand is such a select/zext and every other
// index is a constant; returns true if the GEP was rewritten and erased.
4269 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4270 // Check whether the GEP has exactly one select operand and all indices
4271 // will become constant after the transform.
// 'Sel' tracks the single select (or i1 zext) found so far.
4273 for (Value *Op : GEPI.indices()) {
4274 if (auto *SI = dyn_cast<SelectInst>(Op)) {
4275 if (Sel)
4276 return false;
4277
4278 Sel = SI;
// Both arms must be constant so the unfolded GEPs get constant indices.
4279 if (!isa<ConstantInt>(SI->getTrueValue()) ||
4280 !isa<ConstantInt>(SI->getFalseValue()))
4281 return false;
4282 continue;
4283 }
// A zext of an i1 index is equivalent to select cond, 1, 0.
4284 if (auto *ZI = dyn_cast<ZExtInst>(Op)) {
4285 if (Sel)
4286 return false;
4287 Sel = ZI;
4288 if (!ZI->getSrcTy()->isIntegerTy(1))
4289 return false;
4290 continue;
4291 }
4292
4293 if (!isa<ConstantInt>(Op))
4294 return false;
4295 }
4296
4297 if (!Sel)
4298 return false;
4299
4300 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4301 dbgs() << " original: " << *Sel << "\n";
4302 dbgs() << " " << GEPI << "\n";);
4303
// Clone the GEP's operand list with the select replaced by one of its arms.
4304 auto GetNewOps = [&](Value *SelOp) {
4305 SmallVector<Value *> NewOps;
4306 for (Value *Op : GEPI.operands())
4307 if (Op == Sel)
4308 NewOps.push_back(SelOp);
4309 else
4310 NewOps.push_back(Op);
4311 return NewOps;
4312 };
4313
4314 Value *Cond, *True, *False;
// When MDFrom is set, the new select inherits metadata (e.g. profile data)
// from the original select; otherwise it is created with unknown profile.
4315 Instruction *MDFrom = nullptr;
4316 if (auto *SI = dyn_cast<SelectInst>(Sel)) {
4317 Cond = SI->getCondition();
4318 True = SI->getTrueValue();
4319 False = SI->getFalseValue();
4321 MDFrom = SI;
4322 } else {
// zext-of-i1 case: model it as select cond, 1, 0.
4323 Cond = Sel->getOperand(0);
4324 True = ConstantInt::get(Sel->getType(), 1);
4325 False = ConstantInt::get(Sel->getType(), 0);
4326 }
4327 SmallVector<Value *> TrueOps = GetNewOps(True);
4328 SmallVector<Value *> FalseOps = GetNewOps(False);
4329
4330 IRB.SetInsertPoint(&GEPI);
// Preserve the original GEP's no-wrap flags on both unfolded GEPs.
4331 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4332
4333 Type *Ty = GEPI.getSourceElementType();
4334 Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
4335 True->getName() + ".sroa.gep", NW);
4336
4337 Value *NFalse =
4338 IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
4339 False->getName() + ".sroa.gep", NW);
4340
4341 Value *NSel = MDFrom
4342 ? IRB.CreateSelect(Cond, NTrue, NFalse,
4343 Sel->getName() + ".sroa.sel", MDFrom)
4344 : IRB.CreateSelectWithUnknownProfile(
4345 Cond, NTrue, NFalse, DEBUG_TYPE,
4346 Sel->getName() + ".sroa.sel");
// Swap the GEP for the new select in the visit set and queue the select's
// users so they get rewritten as well.
4347 Visited.erase(&GEPI);
4348 GEPI.replaceAllUsesWith(NSel);
4349 GEPI.eraseFromParent();
4350 Instruction *NSelI = cast<Instruction>(NSel);
4351 Visited.insert(NSelI);
4352 enqueueUsers(*NSelI);
4353
4354 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4355 dbgs() << " " << *NFalse << "\n";
4356 dbgs() << " " << *NSel << "\n";);
4357
4358 return true;
4359 }
4360
4361 // Unfold gep (phi ptr1, ptr2), idx
4362 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4363 // and gep ptr, (phi idx1, idx2)
4364 // => phi ((gep ptr, idx1), (gep ptr, idx2))
// Fires only when exactly one operand (pointer or index) is a phi and every
// other index is constant; returns true if the GEP was rewritten and erased.
4365 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4366 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4367 // operand (looking through the phi if it is the phi we want to unfold) is
4368 // an instruction besides a static alloca.
4369 PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
4370 auto IsInvalidPointerOperand = [](Value *V) {
// Non-instructions (arguments, constants, globals) are acceptable; among
// instructions, only static allocas are allowed.
4371 if (!isa<Instruction>(V))
4372 return false;
4373 if (auto *AI = dyn_cast<AllocaInst>(V))
4374 return !AI->isStaticAlloca();
4375 return true;
4376 };
4377 if (Phi) {
4378 if (any_of(Phi->operands(), IsInvalidPointerOperand))
4379 return false;
4380 } else {
4381 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4382 return false;
4383 }
4384 // Check whether the GEP has exactly one phi operand (including the pointer
4385 // operand) and all indices will become constant after the transform.
4386 for (Value *Op : GEPI.indices()) {
4387 if (auto *SI = dyn_cast<PHINode>(Op)) {
4388 if (Phi)
4389 return false;
4390
4391 Phi = SI;
// All incoming index values must be constants so the per-predecessor
// GEPs end up with constant indices.
4392 if (!all_of(Phi->incoming_values(),
4393 [](Value *V) { return isa<ConstantInt>(V); }))
4394 return false;
4395 continue;
4396 }
4397
4398 if (!isa<ConstantInt>(Op))
4399 return false;
4400 }
4401
4402 if (!Phi)
4403 return false;
4404
4405 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4406 dbgs() << " original: " << *Phi << "\n";
4407 dbgs() << " " << GEPI << "\n";);
4408
// Clone the GEP operand list with the phi replaced by one incoming value.
4409 auto GetNewOps = [&](Value *PhiOp) {
4410 SmallVector<Value *> NewOps;
4411 for (Value *Op : GEPI.operands())
4412 if (Op == Phi)
4413 NewOps.push_back(PhiOp);
4414 else
4415 NewOps.push_back(Op);
4416 return NewOps;
4417 };
4418
// The replacement phi goes where the original phi is.
4419 IRB.SetInsertPoint(Phi);
4420 PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
4421 Phi->getName() + ".sroa.phi");
4422
4423 Type *SourceTy = GEPI.getSourceElementType();
4424 // We only handle arguments, constants, and static allocas here, so we can
4425 // insert GEPs at the end of the entry block.
4426 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4427 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4428 Value *Op = Phi->getIncomingValue(I);
4429 BasicBlock *BB = Phi->getIncomingBlock(I);
4430 Value *NewGEP;
// If this predecessor block was already given a GEP (phis may list the
// same predecessor more than once), reuse it rather than emit another.
4431 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4432 NewGEP = NewPhi->getIncomingValue(NI);
4433 } else {
4434 SmallVector<Value *> NewOps = GetNewOps(Op);
4435 NewGEP =
4436 IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
4437 Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags());
4438 }
4439 NewPhi->addIncoming(NewGEP, BB);
4440 }
4441
// Swap the GEP for the new phi in the visit set and queue the phi's users
// so they get rewritten as well.
4442 Visited.erase(&GEPI);
4443 GEPI.replaceAllUsesWith(NewPhi);
4444 GEPI.eraseFromParent();
4445 Visited.insert(NewPhi);
4446 enqueueUsers(*NewPhi);
4447
4448 LLVM_DEBUG(dbgs() << " to: ";
4449 for (Value *In
4450 : NewPhi->incoming_values()) dbgs()
4451 << "\n " << *In;
4452 dbgs() << "\n " << *NewPhi << '\n');
4453
4454 return true;
4455 }
4456
// Try the select-unfold first, then the phi-unfold; each returns true when it
// rewrote (and erased) the GEP. If neither fires, keep walking the GEP's
// users without changing anything here.
4457 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4458 if (unfoldGEPSelect(GEPI))
4459 return true;
4460
4461 if (unfoldGEPPhi(GEPI))
4462 return true;
4463
4464 enqueueUsers(GEPI);
4465 return false;
4466 }
4467
// Phis don't access memory themselves; continue the walk through their users.
// Returns false: no rewrite is performed at the phi itself.
4468 bool visitPHINode(PHINode &PN) {
4469 enqueueUsers(PN);
4470 return false;
4471 }
4472
// Selects don't access memory themselves; continue the walk through their
// users. Returns false: no rewrite is performed at the select itself.
4473 bool visitSelectInst(SelectInst &SI) {
4474 enqueueUsers(SI);
4475 return false;
4476 }
4477};
4478
4479} // end anonymous namespace
4480
4481 /// Strip aggregate type wrapping.
4482 ///
4483 /// This removes no-op aggregate types wrapping an underlying type. It will
4484 /// strip as many layers of types as it can without changing either the type
4485 /// size or the allocated size.
4487 if (Ty->isSingleValueType())
4488 return Ty;
4489
4490 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4491 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4492
// Candidate inner type: the element type of an array, or the struct element
// containing offset zero. Anything else cannot be unwrapped.
4493 Type *InnerTy;
4494 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
4495 InnerTy = ArrTy->getElementType();
4496 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
4497 const StructLayout *SL = DL.getStructLayout(STy);
4498 unsigned Index = SL->getElementContainingOffset(0);
4499 InnerTy = STy->getElementType(Index);
4500 } else {
4501 return Ty;
4502 }
4503
// Only strip when the inner type covers the wrapper exactly in both
// allocated size and bit size; otherwise the wrapper isn't a no-op.
4504 if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
4505 TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
4506 return Ty;
4507
// Recurse to peel multiple layers of wrapping.
4508 return stripAggregateTypeWrapping(DL, InnerTy);
4509 }
4510
4511 /// Try to find a partition of the aggregate type passed in for a given
4512 /// offset and size.
4513 ///
4514 /// This recurses through the aggregate type and tries to compute a subtype
4515 /// based on the offset and size. When the offset and size span a sub-section
4516 /// of an array, it will even compute a new array type for that sub-section,
4517 /// and the same for structs.
4518 ///
4519 /// Note that this routine is very strict and tries to find a partition of the
4520 /// type which produces the *exact* right offset and size. It is not forgiving
4521 /// when the size or offset cause either end of type-based partition to be off.
4522 /// Also, this is a best-effort routine. It is reasonable to give up and not
4523 /// return a type if necessary.
4525 uint64_t Size) {
// Exact match: the whole (unwrapped) type is the partition.
4526 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4527 return stripAggregateTypeWrapping(DL, Ty);
// The requested range must lie entirely within the type.
4528 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4529 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4530 return nullptr;
4531
4532 if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
4533 Type *ElementTy;
4534 uint64_t TyNumElements;
4535 if (auto *AT = dyn_cast<ArrayType>(Ty)) {
4536 ElementTy = AT->getElementType();
4537 TyNumElements = AT->getNumElements();
4538 } else {
4539 // FIXME: This isn't right for vectors with non-byte-sized or
4540 // non-power-of-two sized elements.
4541 auto *VT = cast<FixedVectorType>(Ty);
4542 ElementTy = VT->getElementType();
4543 TyNumElements = VT->getNumElements();
4544 }
4545 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4546 uint64_t NumSkippedElements = Offset / ElementSize;
4547 if (NumSkippedElements >= TyNumElements)
4548 return nullptr;
// Rebase the offset relative to the first element we keep.
4549 Offset -= NumSkippedElements * ElementSize;
4550
4551 // First check if we need to recurse.
4552 if (Offset > 0 || Size < ElementSize) {
4553 // Bail if the partition ends in a different array element.
4554 if ((Offset + Size) > ElementSize)
4555 return nullptr;
4556 // Recurse through the element type trying to peel off offset bytes.
4557 return getTypePartition(DL, ElementTy, Offset, Size);
4558 }
4559 assert(Offset == 0);
4560
4561 if (Size == ElementSize)
4562 return stripAggregateTypeWrapping(DL, ElementTy);
4563 assert(Size > ElementSize);
// Multiple whole elements: form a smaller array type, but only when the
// size is an exact multiple of the element size.
4564 uint64_t NumElements = Size / ElementSize;
4565 if (NumElements * ElementSize != Size)
4566 return nullptr;
4567 return ArrayType::get(ElementTy, NumElements);
4568 }
4569
// Past the array/vector case only structs can be partitioned further.
4571 if (!STy)
4572 return nullptr;
4573
4574 const StructLayout *SL = DL.getStructLayout(STy);
4575
// Scalable-sized structs have no fixed byte offsets to partition against.
4576 if (SL->getSizeInBits().isScalable())
4577 return nullptr;
4578
4579 if (Offset >= SL->getSizeInBytes())
4580 return nullptr;
4581 uint64_t EndOffset = Offset + Size;
4582 if (EndOffset > SL->getSizeInBytes())
4583 return nullptr;
4584
4585 unsigned Index = SL->getElementContainingOffset(Offset);
// Rebase the offset relative to the containing struct element.
4586 Offset -= SL->getElementOffset(Index);
4587
4588 Type *ElementTy = STy->getElementType(Index);
4589 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4590 if (Offset >= ElementSize)
4591 return nullptr; // The offset points into alignment padding.
4592
4593 // See if any partition must be contained by the element.
4594 if (Offset > 0 || Size < ElementSize) {
4595 if ((Offset + Size) > ElementSize)
4596 return nullptr;
4597 return getTypePartition(DL, ElementTy, Offset, Size);
4598 }
4599 assert(Offset == 0);
4600
4601 if (Size == ElementSize)
4602 return stripAggregateTypeWrapping(DL, ElementTy);
4603
// The partition spans multiple struct elements: take elements from Index up
// to the element ending exactly at EndOffset (or the end of the struct).
4604 StructType::element_iterator EI = STy->element_begin() + Index,
4605 EE = STy->element_end();
4606 if (EndOffset < SL->getSizeInBytes()) {
4607 unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
4608 if (Index == EndIndex)
4609 return nullptr; // Within a single element and its padding.
4610
4611 // Don't try to form "natural" types if the elements don't line up with the
4612 // expected size.
4613 // FIXME: We could potentially recurse down through the last element in the
4614 // sub-struct to find a natural end point.
4615 if (SL->getElementOffset(EndIndex) != EndOffset)
4616 return nullptr;
4617
4618 assert(Index < EndIndex);
4619 EE = STy->element_begin() + EndIndex;
4620 }
4621
4622 // Try to build up a sub-structure.
4623 StructType *SubTy =
4624 StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
4625 const StructLayout *SubSL = DL.getStructLayout(SubTy);
// The sub-struct's own padding may differ from the original layout, so
// re-verify that the sizes match exactly.
4626 if (Size != SubSL->getSizeInBytes())
4627 return nullptr; // The sub-struct doesn't have quite the size needed.
4628
4629 return SubTy;
4630 }
4631
4632/// Pre-split loads and stores to simplify rewriting.
4633///
4634/// We want to break up the splittable load+store pairs as much as
4635/// possible. This is important to do as a preprocessing step, as once we
4636/// start rewriting the accesses to partitions of the alloca we lose the
4637/// necessary information to correctly split apart paired loads and stores
4638/// which both point into this alloca. The case to consider is something like
4639/// the following:
4640///
4641/// %a = alloca [12 x i8]
4642/// %gep1 = getelementptr i8, ptr %a, i32 0
4643/// %gep2 = getelementptr i8, ptr %a, i32 4
4644/// %gep3 = getelementptr i8, ptr %a, i32 8
4645/// store float 0.0, ptr %gep1
4646/// store float 1.0, ptr %gep2
4647/// %v = load i64, ptr %gep1
4648/// store i64 %v, ptr %gep2
4649/// %f1 = load float, ptr %gep2
4650/// %f2 = load float, ptr %gep3
4651///
4652/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4653/// promote everything so we recover the 2 SSA values that should have been
4654/// there all along.
4655///
4656/// \returns true if any changes are made.
4657bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4658 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4659
4660 // Track the loads and stores which are candidates for pre-splitting here, in
4661 // the order they first appear during the partition scan. These give stable
4662 // iteration order and a basis for tracking which loads and stores we
4663 // actually split.
4666
4667 // We need to accumulate the splits required of each load or store where we
4668 // can find them via a direct lookup. This is important to cross-check loads
4669 // and stores against each other. We also track the slice so that we can kill
4670 // all the slices that end up split.
4671 struct SplitOffsets {
4672 Slice *S;
4673 std::vector<uint64_t> Splits;
4674 };
4675 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4676
4677 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4678 // This is important as we also cannot pre-split stores of those loads!
4679 // FIXME: This is all pretty gross. It means that we can be more aggressive
4680 // in pre-splitting when the load feeding the store happens to come from
4681 // a separate alloca. Put another way, the effectiveness of SROA would be
4682 // decreased by a frontend which just concatenated all of its local allocas
4683 // into one big flat alloca. But defeating such patterns is exactly the job
4684 // SROA is tasked with! Sadly, to not have this discrepancy we would have
4685 // change store pre-splitting to actually force pre-splitting of the load
4686 // that feeds it *and all stores*. That makes pre-splitting much harder, but
4687 // maybe it would make it more principled?
4688 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4689
4690 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4691 for (auto &P : AS.partitions()) {
4692 for (Slice &S : P) {
4693 Instruction *I = cast<Instruction>(S.getUse()->getUser());
4694 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4695 // If this is a load we have to track that it can't participate in any
4696 // pre-splitting. If this is a store of a load we have to track that
4697 // that load also can't participate in any pre-splitting.
4698 if (auto *LI = dyn_cast<LoadInst>(I))
4699 UnsplittableLoads.insert(LI);
4700 else if (auto *SI = dyn_cast<StoreInst>(I))
4701 if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
4702 UnsplittableLoads.insert(LI);
4703 continue;
4704 }
4705 assert(P.endOffset() > S.beginOffset() &&
4706 "Empty or backwards partition!");
4707
4708 // Determine if this is a pre-splittable slice.
4709 if (auto *LI = dyn_cast<LoadInst>(I)) {
4710 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4711
4712 // The load must be used exclusively to store into other pointers for
4713 // us to be able to arbitrarily pre-split it. The stores must also be
4714 // simple to avoid changing semantics.
4715 auto IsLoadSimplyStored = [](LoadInst *LI) {
4716 for (User *LU : LI->users()) {
4717 auto *SI = dyn_cast<StoreInst>(LU);
4718 if (!SI || !SI->isSimple())
4719 return false;
4720 }
4721 return true;
4722 };
4723 if (!IsLoadSimplyStored(LI)) {
4724 UnsplittableLoads.insert(LI);
4725 continue;
4726 }
4727
4728 Loads.push_back(LI);
4729 } else if (auto *SI = dyn_cast<StoreInst>(I)) {
4730 if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
4731 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4732 continue;
4733 auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
4734 if (!StoredLoad || !StoredLoad->isSimple())
4735 continue;
4736 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4737
4738 Stores.push_back(SI);
4739 } else {
4740 // Other uses cannot be pre-split.
4741 continue;
4742 }
4743
4744 // Record the initial split.
4745 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4746 auto &Offsets = SplitOffsetsMap[I];
4747 assert(Offsets.Splits.empty() &&
4748 "Should not have splits the first time we see an instruction!");
4749 Offsets.S = &S;
4750 Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
4751 }
4752
4753 // Now scan the already split slices, and add a split for any of them which
4754 // we're going to pre-split.
4755 for (Slice *S : P.splitSliceTails()) {
4756 auto SplitOffsetsMapI =
4757 SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
4758 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4759 continue;
4760 auto &Offsets = SplitOffsetsMapI->second;
4761
4762 assert(Offsets.S == S && "Found a mismatched slice!");
4763 assert(!Offsets.Splits.empty() &&
4764 "Cannot have an empty set of splits on the second partition!");
4765 assert(Offsets.Splits.back() ==
4766 P.beginOffset() - Offsets.S->beginOffset() &&
4767 "Previous split does not end where this one begins!");
4768
4769 // Record each split. The last partition's end isn't needed as the size
4770 // of the slice dictates that.
4771 if (S->endOffset() > P.endOffset())
4772 Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
4773 }
4774 }
4775
4776 // We may have split loads where some of their stores are split stores. For
4777 // such loads and stores, we can only pre-split them if their splits exactly
4778 // match relative to their starting offset. We have to verify this prior to
4779 // any rewriting.
4780 llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4781 // Lookup the load we are storing in our map of split
4782 // offsets.
4783 auto *LI = cast<LoadInst>(SI->getValueOperand());
4784 // If it was completely unsplittable, then we're done,
4785 // and this store can't be pre-split.
4786 if (UnsplittableLoads.count(LI))
4787 return true;
4788
4789 auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4790 if (LoadOffsetsI == SplitOffsetsMap.end())
4791 return false; // Unrelated loads are definitely safe.
4792 auto &LoadOffsets = LoadOffsetsI->second;
4793
4794 // Now lookup the store's offsets.
4795 auto &StoreOffsets = SplitOffsetsMap[SI];
4796
4797 // If the relative offsets of each split in the load and
4798 // store match exactly, then we can split them and we
4799 // don't need to remove them here.
4800 if (LoadOffsets.Splits == StoreOffsets.Splits)
4801 return false;
4802
4803 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4804 << " " << *LI << "\n"
4805 << " " << *SI << "\n");
4806
4807 // We've found a store and load that we need to split
4808 // with mismatched relative splits. Just give up on them
4809 // and remove both instructions from our list of
4810 // candidates.
4811 UnsplittableLoads.insert(LI);
4812 return true;
4813 });
4814 // Now we have to go *back* through all the stores, because a later store may
4815 // have caused an earlier store's load to become unsplittable and if it is
4816 // unsplittable for the later store, then we can't rely on it being split in
4817 // the earlier store either.
4818 llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4819 auto *LI = cast<LoadInst>(SI->getValueOperand());
4820 return UnsplittableLoads.count(LI);
4821 });
4822 // Once we've established all the loads that can't be split for some reason,
4823 // filter any that made it into our list out.
4824 llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
4825 return UnsplittableLoads.count(LI);
4826 });
4827
4828 // If no loads or stores are left, there is no pre-splitting to be done for
4829 // this alloca.
4830 if (Loads.empty() && Stores.empty())
4831 return false;
4832
4833 // From here on, we can't fail and will be building new accesses, so rig up
4834 // an IR builder.
4835 IRBuilderTy IRB(&AI);
4836
4837 // Collect the new slices which we will merge into the alloca slices.
4838 SmallVector<Slice, 4> NewSlices;
4839
4840 // Track any allocas we end up splitting loads and stores for so we iterate
4841 // on them.
4842 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4843
4844 // At this point, we have collected all of the loads and stores we can
4845 // pre-split, and the specific splits needed for them. We actually do the
4846 // splitting in a specific order in order to handle when one of the loads in
4847 // the value operand to one of the stores.
4848 //
4849 // First, we rewrite all of the split loads, and just accumulate each split
4850 // load in a parallel structure. We also build the slices for them and append
4851 // them to the alloca slices.
4852 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4853 std::vector<LoadInst *> SplitLoads;
4854 const DataLayout &DL = AI.getDataLayout();
4855 for (LoadInst *LI : Loads) {
4856 SplitLoads.clear();
4857
4858 auto &Offsets = SplitOffsetsMap[LI];
4859 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4860 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4861 "Load must have type size equal to store size");
4862 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4863 "Load must be >= slice size");
4864
4865 uint64_t BaseOffset = Offsets.S->beginOffset();
4866 assert(BaseOffset + SliceSize > BaseOffset &&
4867 "Cannot represent alloca access size using 64-bit integers!");
4868
4870 IRB.SetInsertPoint(LI);
4871
4872 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4873
4874 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4875 int Idx = 0, Size = Offsets.Splits.size();
4876 for (;;) {
4877 auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
4878 auto AS = LI->getPointerAddressSpace();
4879 auto *PartPtrTy = LI->getPointerOperandType();
4880 LoadInst *PLoad = IRB.CreateAlignedLoad(
4881 PartTy,
4882 getAdjustedPtr(IRB, DL, BasePtr,
4883 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4884 PartPtrTy, BasePtr->getName() + "."),
4885 getAdjustedAlignment(LI, PartOffset),
4886 /*IsVolatile*/ false, LI->getName());
4887 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4888 LLVMContext::MD_access_group});
4889
4890 // Append this load onto the list of split loads so we can find it later
4891 // to rewrite the stores.
4892 SplitLoads.push_back(PLoad);
4893
4894 // Now build a new slice for the alloca.
4895 NewSlices.push_back(
4896 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4897 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4898 /*IsSplittable*/ false, nullptr));
4899 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4900 << ", " << NewSlices.back().endOffset()
4901 << "): " << *PLoad << "\n");
4902
4903 // See if we've handled all the splits.
4904 if (Idx >= Size)
4905 break;
4906
4907 // Setup the next partition.
4908 PartOffset = Offsets.Splits[Idx];
4909 ++Idx;
4910 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4911 }
4912
4913 // Now that we have the split loads, do the slow walk over all uses of the
4914 // load and rewrite them as split stores, or save the split loads to use
4915 // below if the store is going to be split there anyways.
4916 bool DeferredStores = false;
4917 for (User *LU : LI->users()) {
4918 StoreInst *SI = cast<StoreInst>(LU);
4919 if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4920 DeferredStores = true;
4921 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4922 << "\n");
4923 continue;
4924 }
4925
4926 Value *StoreBasePtr = SI->getPointerOperand();
4927 IRB.SetInsertPoint(SI);
4928 AAMDNodes AATags = SI->getAAMetadata();
4929
4930 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4931
4932 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4933 LoadInst *PLoad = SplitLoads[Idx];
4934 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4935 auto *PartPtrTy = SI->getPointerOperandType();
4936
4937 auto AS = SI->getPointerAddressSpace();
4938 StoreInst *PStore = IRB.CreateAlignedStore(
4939 PLoad,
4940 getAdjustedPtr(IRB, DL, StoreBasePtr,
4941 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4942 PartPtrTy, StoreBasePtr->getName() + "."),
4943 getAdjustedAlignment(SI, PartOffset),
4944 /*IsVolatile*/ false);
4945 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4946 LLVMContext::MD_access_group,
4947 LLVMContext::MD_DIAssignID});
4948
4949 if (AATags)
4950 PStore->setAAMetadata(
4951 AATags.adjustForAccess(PartOffset, PLoad->getType(), DL));
4952 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4953 }
4954
4955 // We want to immediately iterate on any allocas impacted by splitting
4956 // this store, and we have to track any promotable alloca (indicated by
4957 // a direct store) as needing to be resplit because it is no longer
4958 // promotable.
4959 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
4960 ResplitPromotableAllocas.insert(OtherAI);
4961 Worklist.insert(OtherAI);
4962 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4963 StoreBasePtr->stripInBoundsOffsets())) {
4964 Worklist.insert(OtherAI);
4965 }
4966
4967 // Mark the original store as dead.
4968 DeadInsts.push_back(SI);
4969 }
4970
4971 // Save the split loads if there are deferred stores among the users.
4972 if (DeferredStores)
4973 SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
4974
4975 // Mark the original load as dead and kill the original slice.
4976 DeadInsts.push_back(LI);
4977 Offsets.S->kill();
4978 }
4979
4980 // Second, we rewrite all of the split stores. At this point, we know that
4981 // all loads from this alloca have been split already. For stores of such
4982 // loads, we can simply look up the pre-existing split loads. For stores of
4983 // other loads, we split those loads first and then write split stores of
4984 // them.
4985 for (StoreInst *SI : Stores) {
4986 auto *LI = cast<LoadInst>(SI->getValueOperand());
4987 IntegerType *Ty = cast<IntegerType>(LI->getType());
4988 assert(Ty->getBitWidth() % 8 == 0);
4989 uint64_t StoreSize = Ty->getBitWidth() / 8;
4990 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4991
4992 auto &Offsets = SplitOffsetsMap[SI];
4993 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4994 "Slice size should always match load size exactly!");
4995 uint64_t BaseOffset = Offsets.S->beginOffset();
4996 assert(BaseOffset + StoreSize > BaseOffset &&
4997 "Cannot represent alloca access size using 64-bit integers!");
4998
4999 Value *LoadBasePtr = LI->getPointerOperand();
5000 Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
5001
5002 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
5003
5004 // Check whether we have an already split load.
5005 auto SplitLoadsMapI = SplitLoadsMap.find(LI);
5006 std::vector<LoadInst *> *SplitLoads = nullptr;
5007 if (SplitLoadsMapI != SplitLoadsMap.end()) {
5008 SplitLoads = &SplitLoadsMapI->second;
5009 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
5010 "Too few split loads for the number of splits in the store!");
5011 } else {
5012 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
5013 }
5014
5015 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
5016 int Idx = 0, Size = Offsets.Splits.size();
5017 for (;;) {
5018 auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
5019 auto *LoadPartPtrTy = LI->getPointerOperandType();
5020 auto *StorePartPtrTy = SI->getPointerOperandType();
5021
5022 // Either lookup a split load or create one.
5023 LoadInst *PLoad;
5024 if (SplitLoads) {
5025 PLoad = (*SplitLoads)[Idx];
5026 } else {
5027 IRB.SetInsertPoint(LI);
5028 auto AS = LI->getPointerAddressSpace();
5029 PLoad = IRB.CreateAlignedLoad(
5030 PartTy,
5031 getAdjustedPtr(IRB, DL, LoadBasePtr,
5032 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5033 LoadPartPtrTy, LoadBasePtr->getName() + "."),
5034 getAdjustedAlignment(LI, PartOffset),
5035 /*IsVolatile*/ false, LI->getName());
5036 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
5037 LLVMContext::MD_access_group});
5038 }
5039
5040 // And store this partition.
5041 IRB.SetInsertPoint(SI);
5042 auto AS = SI->getPointerAddressSpace();
5043 StoreInst *PStore = IRB.CreateAlignedStore(
5044 PLoad,
5045 getAdjustedPtr(IRB, DL, StoreBasePtr,
5046 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5047 StorePartPtrTy, StoreBasePtr->getName() + "."),
5048 getAdjustedAlignment(SI, PartOffset),
5049 /*IsVolatile*/ false);
5050 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5051 LLVMContext::MD_access_group});
5052
5053 // Now build a new slice for the alloca.
5054 // ProtectedFieldDisc==nullptr is a lie, but it doesn't matter because we
5055 // already determined that all accesses are consistent.
5056 NewSlices.push_back(
5057 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5058 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
5059 /*IsSplittable*/ false, nullptr));
5060 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5061 << ", " << NewSlices.back().endOffset()
5062 << "): " << *PStore << "\n");
5063 if (!SplitLoads) {
5064 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5065 }
5066
5067 // See if we've finished all the splits.
5068 if (Idx >= Size)
5069 break;
5070
5071 // Setup the next partition.
5072 PartOffset = Offsets.Splits[Idx];
5073 ++Idx;
5074 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
5075 }
5076
5077 // We want to immediately iterate on any allocas impacted by splitting
5078 // this load, which is only relevant if it isn't a load of this alloca and
5079 // thus we didn't already split the loads above. We also have to keep track
5080 // of any promotable allocas we split loads on as they can no longer be
5081 // promoted.
5082 if (!SplitLoads) {
5083 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
5084 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5085 ResplitPromotableAllocas.insert(OtherAI);
5086 Worklist.insert(OtherAI);
5087 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5088 LoadBasePtr->stripInBoundsOffsets())) {
5089 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5090 Worklist.insert(OtherAI);
5091 }
5092 }
5093
5094 // Mark the original store as dead now that we've split it up and kill its
5095 // slice. Note that we leave the original load in place unless this store
5096 // was its only use. It may in turn be split up if it is an alloca load
5097 // for some other alloca, but it may be a normal load. This may introduce
5098 // redundant loads, but where those can be merged the rest of the optimizer
5099 // should handle the merging, and this uncovers SSA splits which is more
5100 // important. In practice, the original loads will almost always be fully
5101 // split and removed eventually, and the splits will be merged by any
5102 // trivial CSE, including instcombine.
5103 if (LI->hasOneUse()) {
5104 assert(*LI->user_begin() == SI && "Single use isn't this store!");
5105 DeadInsts.push_back(LI);
5106 }
5107 DeadInsts.push_back(SI);
5108 Offsets.S->kill();
5109 }
5110
5111 // Remove the killed slices that have been pre-split.
5112 llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
5113
5114 // Insert our new slices. This will sort and merge them into the sorted
5115 // sequence.
5116 AS.insert(NewSlices);
5117
5118 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5119#ifndef NDEBUG
5120 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5121 LLVM_DEBUG(AS.print(dbgs(), I, " "));
5122#endif
5123
5124 // Finally, don't try to promote any allocas that now require re-splitting.
5125 // They have already been added to the worklist above.
5126 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5127
5128 return true;
5129}
5130
5131/// Select a partition type for an alloca partition.
5132///
5133/// Try to compute a friendly type for this partition of the alloca. This
5134/// won't always succeed, in which case we fall back to a legal integer type
5135/// or an i8 array of an appropriate size.
5136///
5137/// \returns A tuple with the following elements:
5138/// - PartitionType: The computed type for this partition.
5139/// - IsIntegerWideningViable: True if integer widening promotion is used.
5140/// - VectorType: The vector type if vector promotion is used, otherwise
5141/// nullptr.
5142static std::tuple<Type *, bool, VectorType *>
5144 LLVMContext &C) {
5145 // First check if the partition is viable for vector promotion.
5146 //
5147 // We prefer vector promotion over integer widening promotion when:
5148 // - The vector element type is a floating-point type.
5149 // - All the loads/stores to the alloca are vector loads/stores to the
5150 // entire alloca or load/store a single element of the vector.
5151 //
5152 // Otherwise when there is an integer vector with mixed type loads/stores we
5153 // prefer integer widening promotion because it's more likely the user is
5154 // doing bitwise arithmetic and we generate better code.
5155 VectorType *VecTy =
5157 // If the vector element type is a floating-point type, we prefer vector
5158 // promotion. If the vector has one element, let the below code select
5159 // whether we promote with the vector or scalar.
5160 if (VecTy && VecTy->getElementType()->isFloatingPointTy() &&
5161 VecTy->getElementCount().getFixedValue() > 1)
5162 return {VecTy, false, VecTy};
5163
5164 // Check if there is a common type that all slices of the partition use that
5165 // spans the partition.
5166 auto [CommonUseTy, LargestIntTy] =
5167 findCommonType(P.begin(), P.end(), P.endOffset());
5168 if (CommonUseTy) {
5169 TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
  // A scalable alloc size can't be compared against the partition's fixed
  // byte size, so only fixed-size common types are considered here.
5170 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
5171 // We prefer vector promotion here because if vector promotion is viable
5172 // and there is a common type used, then it implies the second listed
5173 // condition for preferring vector promotion is true.
5174 if (VecTy)
5175 return {VecTy, false, VecTy};
5176 return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL),
5177 nullptr};
5178 }
5179 }
5180
5181 // Can we find an appropriate subtype in the original allocated
5182 // type?
5183 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5184 P.beginOffset(), P.size())) {
5185 // If the partition is an integer array that can be spanned by a legal
5186 // integer type, prefer to represent it as a legal integer type because
5187 // it's more likely to be promotable.
5188 if (TypePartitionTy->isArrayTy() &&
5189 TypePartitionTy->getArrayElementType()->isIntegerTy() &&
5190 DL.isLegalInteger(P.size() * 8))
5191 TypePartitionTy = Type::getIntNTy(C, P.size() * 8);
5192 // There was no common type used, so we prefer integer widening promotion.
5193 if (isIntegerWideningViable(P, TypePartitionTy, DL))
5194 return {TypePartitionTy, true, nullptr};
5195 if (VecTy)
5196 return {VecTy, false, VecTy};
5197 // If we couldn't promote with TypePartitionTy, try with the largest
5198 // integer type used.
5199 if (LargestIntTy &&
5200 DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() &&
5201 isIntegerWideningViable(P, LargestIntTy, DL))
5202 return {LargestIntTy, true, nullptr};
5203
5204 // Fallback to TypePartitionTy and we probably won't promote.
5205 return {TypePartitionTy, false, nullptr};
5206 }
5207
  // No subtype of the allocated type covers the partition; fall back to
  // progressively less informative integer choices.
5208 // Select the largest integer type used if it spans the partition.
5209 if (LargestIntTy &&
5210 DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size())
5211 return {LargestIntTy, false, nullptr};
5212
5213 // Select a legal integer type if it spans the partition.
5214 if (DL.isLegalInteger(P.size() * 8))
5215 return {Type::getIntNTy(C, P.size() * 8), false, nullptr};
5216
5217 // Fallback to an i8 array.
5218 return {ArrayType::get(Type::getInt8Ty(C), P.size()), false, nullptr};
5219}
5220
5221/// Rewrite an alloca partition's users.
5222///
5223/// This routine drives both of the rewriting goals of the SROA pass. It tries
5224/// to rewrite uses of an alloca partition to be conducive for SSA value
5225/// promotion. If the partition needs a new, more refined alloca, this will
5226/// build that new alloca, preserving as much type information as possible, and
5227/// rewrite the uses of the old alloca to point at the new one and have the
5228/// appropriate new offsets. It also evaluates how successful the rewrite was
5229/// at enabling promotion and if it was successful queues the alloca to be
5230/// promoted.
///
/// \returns the alloca backing the partition (either \p AI re-used, or a
/// newly created one) together with the bit-width of the selected partition
/// type, or {nullptr, 0} when nothing changed and promotion failed.
5231std::pair<AllocaInst *, uint64_t>
5232SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) {
5233 const DataLayout &DL = AI.getDataLayout();
5234 // Select the type for the new alloca that spans the partition.
5235 auto [PartitionTy, IsIntegerWideningViable, VecTy] =
5236 selectPartitionType(P, DL, AI, *C);
5237
5238 // Check for the case where we're going to rewrite to a new alloca of the
5239 // exact same type as the original, and with the same access offsets. In that
5240 // case, re-use the existing alloca, but still run through the rewriter to
5241 // perform phi and select speculation.
5242 // P.beginOffset() can be non-zero even with the same type in a case with
5243 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5244 AllocaInst *NewAI;
5245 if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == 0) {
5246 NewAI = &AI;
5247 // FIXME: We should be able to bail at this point with "nothing changed".
5248 // FIXME: We might want to defer PHI speculation until after here.
5249 // FIXME: return nullptr;
5250 } else {
5251 // Make sure the alignment is compatible with P.beginOffset().
5252 const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
5253 // If we will get at least this much alignment from the type alone, leave
5254 // the alloca's alignment unconstrained.
5255 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(PartitionTy);
5256 NewAI = new AllocaInst(
5257 PartitionTy, AI.getAddressSpace(), nullptr,
5258 IsUnconstrained ? DL.getPrefTypeAlign(PartitionTy) : Alignment,
5259 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5260 AI.getIterator());
5261 // Copy the old AI debug location over to the new one.
5262 NewAI->setDebugLoc(AI.getDebugLoc());
5263 ++NumNewAllocas;
5264 }
5265
5266 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5267 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5268
5269 // Track the high watermark on the worklist as it is only relevant for
5270 // promoted allocas. We will reset it to this point if the alloca is not in
5271 // fact scheduled for promotion.
5272 unsigned PPWOldSize = PostPromotionWorklist.size();
5273 unsigned NumUses = 0;
5274 SmallSetVector<PHINode *, 8> PHIUsers;
5275 SmallSetVector<SelectInst *, 8> SelectUsers;
5276
5277 AllocaSliceRewriter Rewriter(
5278 DL, AS, *this, AI, *NewAI, PartitionTy, P.beginOffset(), P.endOffset(),
5279 IsIntegerWideningViable, VecTy, PHIUsers, SelectUsers);
5280 bool Promotable = true;
5281 // Check whether we can have tree-structured merge.
5282 if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
  // The merge rewrite consumed the whole partition; just queue the values it
  // made dead.
5283 NumUses += DeletedValues->size() + 1;
5284 for (Value *V : *DeletedValues)
5285 DeadInsts.push_back(V);
5286 } else {
  // Otherwise rewrite every slice individually; promotion remains viable
  // only if every visit succeeds.
5287 for (Slice *S : P.splitSliceTails()) {
5288 Promotable &= Rewriter.visit(S);
5289 ++NumUses;
5290 }
5291 for (Slice &S : P) {
5292 Promotable &= Rewriter.visit(&S);
5293 ++NumUses;
5294 }
5295 }
5296
5297 NumAllocaPartitionUses += NumUses;
5298 MaxUsesPerAllocaPartition.updateMax(NumUses);
5299
5300 // Now that we've processed all the slices in the new partition, check if any
5301 // PHIs or Selects would block promotion.
5302 for (PHINode *PHI : PHIUsers)
5303 if (!isSafePHIToSpeculate(*PHI)) {
5304 Promotable = false;
5305 PHIUsers.clear();
5306 SelectUsers.clear();
5307 break;
5308 }
5309
5311 NewSelectsToRewrite;
5312 NewSelectsToRewrite.reserve(SelectUsers.size());
5313 for (SelectInst *Sel : SelectUsers) {
5314 std::optional<RewriteableMemOps> Ops =
5315 isSafeSelectToSpeculate(*Sel, PreserveCFG);
5316 if (!Ops) {
5317 Promotable = false;
5318 PHIUsers.clear();
5319 SelectUsers.clear();
5320 NewSelectsToRewrite.clear();
5321 break;
5322 }
5323 NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
5324 }
5325
5326 if (Promotable) {
5327 for (Use *U : AS.getDeadUsesIfPromotable()) {
5328 auto *OldInst = dyn_cast<Instruction>(U->get());
5329 Value::dropDroppableUse(*U);
5330 if (OldInst)
5331 if (isInstructionTriviallyDead(OldInst))
5332 DeadInsts.push_back(OldInst);
5333 }
5334 if (PHIUsers.empty() && SelectUsers.empty()) {
5335 // Promote the alloca.
5336 PromotableAllocas.insert(NewAI);
5337 } else {
5338 // If we have either PHIs or Selects to speculate, add them to those
5339 // worklists and re-queue the new alloca so that we promote it on the
5340 // next iteration.
5341 SpeculatablePHIs.insert_range(PHIUsers);
5342 SelectsToRewrite.reserve(SelectsToRewrite.size() +
5343 NewSelectsToRewrite.size());
5344 for (auto &&KV : llvm::make_range(
5345 std::make_move_iterator(NewSelectsToRewrite.begin()),
5346 std::make_move_iterator(NewSelectsToRewrite.end())))
5347 SelectsToRewrite.insert(std::move(KV));
5348 Worklist.insert(NewAI);
5349 }
5350 } else {
5351 // Drop any post-promotion work items if promotion didn't happen.
5352 while (PostPromotionWorklist.size() > PPWOldSize)
5353 PostPromotionWorklist.pop_back();
5354
5355 // We couldn't promote and we didn't create a new partition, nothing
5356 // happened.
5357 if (NewAI == &AI)
5358 return {nullptr, 0};
5359
5360 // If we can't promote the alloca, iterate on it to check for new
5361 // refinements exposed by splitting the current alloca. Don't iterate on an
5362 // alloca which didn't actually change and didn't get promoted.
5363 Worklist.insert(NewAI);
5364 }
5365
5366 return {NewAI, DL.getTypeSizeInBits(PartitionTy).getFixedValue()};
5367}
5368
5369// There isn't a shared interface to get the "address" parts out of a
5370// dbg.declare and dbg.assign, so provide some wrappers.
// NOTE(review): the wrapper signatures and their guarding checks are elided in
// this listing; presumably the first return in each wrapper handles dbg_assign
// records (which track an address component separately) and the second handles
// all other record kinds — confirm against the upstream source.
 return DVR->isKillAddress();
 return DVR->isKillLocation();
}
5376
 return DVR->getAddressExpression();
 return DVR->getExpression();
}
5382
5383/// Create or replace an existing fragment in a DIExpression with \p Frag.
5384/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5385/// operation, add \p BitExtractOffset to the offset part.
5386///
5387/// Returns the new expression, or nullptr if this fails (see details below).
5388///
5389/// This function is similar to DIExpression::createFragmentExpression except
5390/// for 3 important distinctions:
5391/// 1. The new fragment isn't relative to an existing fragment.
5392/// 2. It assumes the computed location is a memory location. This means we
5393/// don't need to perform checks that creating the fragment preserves the
5394/// expression semantics.
5395/// 3. Existing extract_bits are modified independently of fragment changes
5396/// using \p BitExtractOffset. A change to the fragment offset or size
5397/// may affect a bit extract. But a bit extract offset can change
5398/// independently of the fragment dimensions.
5399///
5400/// Returns the new expression, or nullptr if one couldn't be created.
5401/// Ideally this is only used to signal that a bit-extract has become
5402/// zero-sized (and thus the new debug record has no size and can be
5403/// dropped), however, it fails for other reasons too - see the FIXME below.
5404///
5405/// FIXME: To keep the change that introduces this function NFC it bails
5406/// in some situations unnecessarily, e.g. when fragment and bit extract
5407/// sizes differ.
5410 int64_t BitExtractOffset) {
5412 bool HasFragment = false;
5413 bool HasBitExtract = false;
5414
  // Copy the expression op-by-op, rewriting any extract_bits op in place and
  // noting whether a fragment or extract was seen.
5415 for (auto &Op : Expr->expr_ops()) {
5416 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5417 HasFragment = true;
5418 continue;
5419 }
5420 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5422 HasBitExtract = true;
5423 int64_t ExtractOffsetInBits = Op.getArg(0);
5424 int64_t ExtractSizeInBits = Op.getArg(1);
5425
5426 // DIExpression::createFragmentExpression doesn't know how to handle
5427 // a fragment that is smaller than the extract. Copy the behaviour
5428 // (bail) to avoid non-NFC changes.
5429 // FIXME: Don't do this.
5430 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5431 return nullptr;
5432
  // Callers only ever shrink the extract offset (the adjustment is computed
  // as min(0, ...) in splitAlloca), hence the non-positive invariant.
5433 assert(BitExtractOffset <= 0);
5434 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5435
5436 // DIExpression::createFragmentExpression doesn't know what to do
5437 // if the new extract starts "outside" the existing one. Copy the
5438 // behaviour (bail) to avoid non-NFC changes.
5439 // FIXME: Don't do this.
5440 if (AdjustedOffset < 0)
5441 return nullptr;
5442
5443 Ops.push_back(Op.getOp());
5444 Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
5445 Ops.push_back(ExtractSizeInBits);
5446 continue;
5447 }
5448 Op.appendToVector(Ops);
5449 }
5450
5451 // Unsupported by createFragmentExpression, so don't support it here yet to
5452 // preserve NFC-ness.
5453 if (HasFragment && HasBitExtract)
5454 return nullptr;
5455
5456 if (!HasBitExtract) {
5458 Ops.push_back(Frag.OffsetInBits);
5459 Ops.push_back(Frag.SizeInBits);
5460 }
5461 return DIExpression::get(Expr->getContext(), Ops);
5462}
5463
5464/// Insert a new DbgRecord.
5465/// \p Orig Original to copy record type, debug loc and variable from, and
5466/// additionally value and value expression for dbg_assign records.
5467/// \p NewAddr Location's new base address.
5468/// \p NewAddrExpr New expression to apply to address.
5469/// \p BeforeInst Insert position.
5470/// \p NewFragment New fragment (absolute, non-relative).
5471/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5472static void
5474 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5475 std::optional<DIExpression::FragmentInfo> NewFragment,
5476 int64_t BitExtractAdjustment) {
  // DIB is deliberately unused (note the cast below); presumably retained for
  // signature compatibility with callers — confirm against upstream.
5477 (void)DIB;
5478
5479 // A dbg_assign puts fragment info in the value expression only. The address
5480 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5481 // new fragment info into NewAddrExpr (as it only has one expression).
5482 DIExpression *NewFragmentExpr =
5483 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5484 if (NewFragment)
5485 NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5486 BitExtractAdjustment);
  // createOrReplaceFragment returning nullptr means the record can't be
  // rebuilt (e.g. a zero-sized bit extract); drop it entirely.
5487 if (!NewFragmentExpr)
5488 return;
5489
5490 if (Orig->isDbgDeclare()) {
5492 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5493 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5494 BeforeInst->getIterator());
5495 return;
5496 }
5497
5498 if (Orig->isDbgValue()) {
5500 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5501 // Drop debug information if the expression doesn't start with a
5502 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5503 // describes the address of alloca rather than the value inside the alloca.
5504 if (!NewFragmentExpr->startsWithDeref())
5505 DVR->setKillAddress();
5506 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5507 BeforeInst->getIterator());
5508 return;
5509 }
5510
  // Remaining case: a dbg_assign record linked to the store at NewAddr.
5511 // Apply a DIAssignID to the store if it doesn't already have it.
5512 if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5513 NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5515 }
5516
5518 NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5519 NewAddrExpr, Orig->getDebugLoc());
5520 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5521 (void)NewAssign;
5522}
5523
5524/// Walks the slices of an alloca and form partitions based on them,
5525/// rewriting each of their uses.
5526bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5527 if (AS.begin() == AS.end())
5528 return false;
5529
5530 unsigned NumPartitions = 0;
5531 bool Changed = false;
5532 const DataLayout &DL = AI.getModule()->getDataLayout();
5533
5534 // First try to pre-split loads and stores.
5535 Changed |= presplitLoadsAndStores(AI, AS);
5536
5537 // Now that we have identified any pre-splitting opportunities,
5538 // mark loads and stores unsplittable except for the following case.
5539 // We leave a slice splittable if all other slices are disjoint or fully
5540 // included in the slice, such as whole-alloca loads and stores.
5541 // If we fail to split these during pre-splitting, we want to force them
5542 // to be rewritten into a partition.
5543 bool IsSorted = true;
5544
  // NOTE(review): the unchecked dereference assumes a statically-known,
  // non-scalable allocation size here — confirm callers guarantee this.
5545 uint64_t AllocaSize = AI.getAllocationSize(DL)->getFixedValue();
5546 const uint64_t MaxBitVectorSize = 1024;
5547 if (AllocaSize <= MaxBitVectorSize) {
5548 // If a byte boundary is included in any load or store, a slice starting or
5549 // ending at the boundary is not splittable.
5550 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5551 for (Slice &S : AS)
5552 for (unsigned O = S.beginOffset() + 1;
5553 O < S.endOffset() && O < AllocaSize; O++)
5554 SplittableOffset.reset(O);
5555
5556 for (Slice &S : AS) {
5557 if (!S.isSplittable())
5558 continue;
5559
5560 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5561 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5562 continue;
5563
5564 if (isa<LoadInst>(S.getUse()->getUser()) ||
5565 isa<StoreInst>(S.getUse()->getUser())) {
5566 S.makeUnsplittable();
5567 IsSorted = false;
5568 }
5569 }
5570 } else {
5571 // We only allow whole-alloca splittable loads and stores
5572 // for a large alloca to avoid creating too large BitVector.
5573 for (Slice &S : AS) {
5574 if (!S.isSplittable())
5575 continue;
5576
5577 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5578 continue;
5579
5580 if (isa<LoadInst>(S.getUse()->getUser()) ||
5581 isa<StoreInst>(S.getUse()->getUser())) {
5582 S.makeUnsplittable();
5583 IsSorted = false;
5584 }
5585 }
5586 }
5587
5588 if (!IsSorted)
5590
5591 /// Describes the allocas introduced by rewritePartition in order to migrate
5592 /// the debug info.
5593 struct Fragment {
5594 AllocaInst *Alloca;
5595 uint64_t Offset;
5596 uint64_t Size;
5597 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5598 : Alloca(AI), Offset(O), Size(S) {}
5599 };
5600 SmallVector<Fragment, 4> Fragments;
5601
5602 // Rewrite each partition.
5603 for (auto &P : AS.partitions()) {
5604 auto [NewAI, ActiveBits] = rewritePartition(AI, AS, P);
5605 if (NewAI) {
5606 Changed = true;
5607 if (NewAI != &AI) {
5608 uint64_t SizeOfByte = 8;
5609 // Don't include any padding.
5610 uint64_t Size = std::min(ActiveBits, P.size() * SizeOfByte);
5611 Fragments.push_back(
5612 Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5613 }
5614 }
5615 ++NumPartitions;
5616 }
5617
5618 NumAllocaPartitions += NumPartitions;
5619 MaxPartitionsPerAlloca.updateMax(NumPartitions);
5620
5621 // Migrate debug information from the old alloca to the new alloca(s)
5622 // and the individual partitions.
5623 auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
5624 // Can't overlap with undef memory.
5625 if (isKillAddress(DbgVariable))
5626 return;
5627
5628 const Value *DbgPtr = DbgVariable->getAddress();
5630 DbgVariable->getFragmentOrEntireVariable();
5631 // Get the address expression constant offset if one exists and the ops
5632 // that come after it.
5633 int64_t CurrentExprOffsetInBytes = 0;
5634 SmallVector<uint64_t> PostOffsetOps;
5635 if (!getAddressExpression(DbgVariable)
5636 ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5637 return; // Couldn't interpret this DIExpression - drop the var.
5638
5639 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5640 int64_t ExtractOffsetInBits = 0;
5641 for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5642 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5644 ExtractOffsetInBits = Op.getArg(0);
5645 break;
5646 }
5647 }
5648
5649 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
  // Try to migrate this debug record onto each new partition alloca.
5650 for (auto Fragment : Fragments) {
5651 int64_t OffsetFromLocationInBits;
5652 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5653 // Find the variable fragment that the new alloca slice covers.
5654 // Drop debug info for this variable fragment if we can't compute an
5655 // intersect between it and the alloca slice.
5657 DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
5658 CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
5659 NewDbgFragment, OffsetFromLocationInBits))
5660 continue; // Do not migrate this fragment to this slice.
5661
5662 // Zero sized fragment indicates there's no intersect between the variable
5663 // fragment and the alloca slice. Skip this slice for this variable
5664 // fragment.
5665 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5666 continue; // Do not migrate this fragment to this slice.
5667
5668 // No fragment indicates DbgVariable's variable or fragment exactly
5669 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5670 if (!NewDbgFragment)
5671 NewDbgFragment = DbgVariable->getFragment();
5672
5673 // Reduce the new expression offset by the bit-extract offset since
5674 // we'll be keeping that.
5675 int64_t OffestFromNewAllocaInBits =
5676 OffsetFromLocationInBits - ExtractOffsetInBits;
5677 // We need to adjust an existing bit extract if the offset expression
5678 // can't eat the slack (i.e., if the new offset would be negative).
5679 int64_t BitExtractOffset =
5680 std::min<int64_t>(0, OffestFromNewAllocaInBits);
5681 // The magnitude of a negative value indicates the number of bits into
5682 // the existing variable fragment that the memory region begins. The new
5683 // variable fragment already excludes those bits - the new DbgPtr offset
5684 // only needs to be applied if it's positive.
5685 OffestFromNewAllocaInBits =
5686 std::max(int64_t(0), OffestFromNewAllocaInBits);
5687
5688 // Rebuild the expression:
5689 // {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5690 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5691 // address expression but the value expression instead.
5692 DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
5693 if (OffestFromNewAllocaInBits > 0) {
5694 int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8;
5695 NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
5696 }
5697
5698 // Remove any existing intrinsics on the new alloca describing
5699 // the variable fragment.
5700 auto RemoveOne = [DbgVariable](auto *OldDII) {
5701 auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5702 return LHS->getVariable() == RHS->getVariable() &&
5703 LHS->getDebugLoc()->getInlinedAt() ==
5704 RHS->getDebugLoc()->getInlinedAt();
5705 };
5706 if (SameVariableFragment(OldDII, DbgVariable))
5707 OldDII->eraseFromParent();
5708 };
5709 for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
5710 for_each(findDVRValues(Fragment.Alloca), RemoveOne);
5711 insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
5712 NewDbgFragment, BitExtractOffset);
5713 }
5714 };
5715
5716 // Migrate debug information from the old alloca to the new alloca(s)
5717 // and the individual partitions.
5718 for_each(findDVRDeclares(&AI), MigrateOne);
5719 for_each(findDVRValues(&AI), MigrateOne);
5720 for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
5721
5722 return Changed;
5723}
5724
5725/// Clobber a use with poison, deleting the used value if it becomes dead.
5726void SROA::clobberUse(Use &U) {
5727 Value *OldV = U;
5728 // Replace the use with an poison value.
5729 U = PoisonValue::get(OldV->getType());
5730
5731 // Check for this making an instruction dead. We have to garbage collect
5732 // all the dead instructions to ensure the uses of any alloca end up being
5733 // minimal.
5734 if (Instruction *OldI = dyn_cast<Instruction>(OldV))
5735 if (isInstructionTriviallyDead(OldI)) {
5736 DeadInsts.push_back(OldI);
5737 }
5738}
5739
5740/// A basic LoadAndStorePromoter that does not remove store nodes.
5742public:
5744 Type *ZeroType)
5745 : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
  // Keep stores and the alloca itself in place; only the other rewritten
  // instructions (i.e. the loads) become deletable.
5746 bool shouldDelete(Instruction *I) const override {
5747 return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
5748 }
5749
  // NOTE(review): this override's declaration line is elided in this listing;
  // presumably it supplies the value used when no store dominates a load, in
  // which case an undef of ZeroType is produced.
5751 return UndefValue::get(ZeroType);
5752 }
5753
5754private:
  // Type of the promoted value; used to materialize the undef above.
5755 Type *ZeroType;
5756};
5757
/// Forward stored values directly to loads for runs of slices that all cover
/// the exact same [begin, end) range with the same simple type, using
/// SSAUpdater-based promotion without removing the stores themselves.
5758bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
5759 // Look through each "partition", looking for slices with the same start/end
5760 // that do not overlap with any before them. The slices are sorted by
5761 // increasing beginOffset. We don't use AS.partitions(), as it will use a more
5762 // sophisticated algorithm that takes splittable slices into account.
5763 LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
5764 bool AllSameAndValid = true;
5765 Type *PartitionType = nullptr;
5767 uint64_t BeginOffset = 0;
5768 uint64_t EndOffset = 0;
5769
  // Promote the accumulated run of instructions (if it stayed uniform and
  // valid), then reset the accumulation state for the next range.
5770 auto Flush = [&]() {
5771 if (AllSameAndValid && !Insts.empty()) {
5772 LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
5773 << EndOffset << ")\n");
5775 SSAUpdater SSA(&NewPHIs);
5776 Insts.push_back(&AI);
5777 BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
5778 Promoter.run(Insts);
5779 }
5780 AllSameAndValid = true;
5781 PartitionType = nullptr;
5782 Insts.clear();
5783 };
5784
5785 for (Slice &S : AS) {
5786 auto *User = cast<Instruction>(S.getUse()->getUser());
5787 if (isAssumeLikeIntrinsic(User)) {
5788 LLVM_DEBUG({
5789 dbgs() << "Ignoring slice: ";
5790 AS.print(dbgs(), &S);
5791 });
5792 continue;
5793 }
  // A slice starting at or past the current range's end opens a new range;
  // anything else must match the range exactly to keep the run valid.
5794 if (S.beginOffset() >= EndOffset) {
5795 Flush();
5796 BeginOffset = S.beginOffset();
5797 EndOffset = S.endOffset();
5798 } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
5799 if (AllSameAndValid) {
5800 LLVM_DEBUG({
5801 dbgs() << "Slice does not match range [" << BeginOffset << ", "
5802 << EndOffset << ")";
5803 AS.print(dbgs(), &S);
5804 });
5805 AllSameAndValid = false;
5806 }
5807 EndOffset = std::max(EndOffset, S.endOffset());
5808 continue;
5809 }
5810
5811 if (auto *LI = dyn_cast<LoadInst>(User)) {
5812 Type *UserTy = LI->getType();
5813 // LoadAndStorePromoter requires all the types to be the same.
5814 if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
5815 AllSameAndValid = false;
5816 PartitionType = UserTy;
5817 Insts.push_back(User);
5818 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
5819 Type *UserTy = SI->getValueOperand()->getType();
5820 if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
5821 AllSameAndValid = false;
5822 PartitionType = UserTy;
5823 Insts.push_back(User);
5824 } else {
5825 AllSameAndValid = false;
5826 }
5827 }
5828
5829 Flush();
  // Note: reports a change unconditionally, even if no run was promoted.
5830 return true;
5831}
5832
5833/// Analyze an alloca for SROA.
5834///
5835/// This analyzes the alloca to ensure we can reason about it, builds
5836/// the slices of the alloca, and then hands it off to be split and
5837/// rewritten as needed.
/// Returns a {Changed, CFGChanged} pair. CFGChanged can only become true via
/// the select-rewriting loop at the bottom (which is handed a null DTU when
/// PreserveCFG is set).
5838std::pair<bool /*Changed*/, bool /*CFGChanged*/>
5839SROA::runOnAlloca(AllocaInst &AI) {
5840  bool Changed = false;
5841  bool CFGChanged = false;
5842
5843  LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5844  ++NumAllocasAnalyzed;
5845
5846  // Special case dead allocas, as they're trivial.
5847  if (AI.use_empty()) {
5848    AI.eraseFromParent();
5849    Changed = true;
5850    return {Changed, CFGChanged};
5851  }
5852  const DataLayout &DL = AI.getDataLayout();
5853
5854  // Skip alloca forms that this analysis can't handle.
// Dynamic (array) allocations, unknown, scalable, and zero sizes are all
// rejected here; scalable allocas are handled separately in runSROA.
5855  std::optional<TypeSize> Size = AI.getAllocationSize(DL);
5856  if (AI.isArrayAllocation() || !Size || Size->isScalable() || Size->isZero())
5857    return {Changed, CFGChanged};
5858
5859  // First, split any FCA loads and stores touching this alloca to promote
5860  // better splitting and promotion opportunities.
5861  IRBuilderTy IRB(&AI);
5862  AggLoadStoreRewriter AggRewriter(DL, IRB);
5863  Changed |= AggRewriter.rewrite(AI);
5864
5865  // Build the slices using a recursive instruction-visiting builder.
5866  AllocaSlices AS(DL, AI);
5867  LLVM_DEBUG(AS.print(dbgs()));
5868  if (AS.isEscaped())
5869    return {Changed, CFGChanged};
5870
// If the pointer escapes only via read-only captures, we cannot split the
// alloca, but we can still forward stored values to the in-function loads.
5871  if (AS.isEscapedReadOnly()) {
5872    Changed |= propagateStoredValuesToLoads(AI, AS);
5873    return {Changed, CFGChanged};
5874  }
5875
5876  for (auto &P : AS.partitions()) {
5877    // For now, we can't split if a field is accessed both via protected field
5878    // and not, because that would mean that we would need to introduce sign and
5879    // auth operations to convert between the protected and non-protected uses,
5880    // and this pass doesn't know how to do that. Also, this case is unlikely to
5881    // occur in normal code.
5882    std::optional<Value *> ProtectedFieldDisc;
// Lifetime markers are exempt from the discriminator-consistency check; the
// first non-exempt slice seeds the expected discriminator for the partition.
5883    auto SliceHasMismatch = [&](Slice &S) {
5884      if (auto *II = dyn_cast<IntrinsicInst>(S.getUse()->getUser()))
5885        if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
5886            II->getIntrinsicID() == Intrinsic::lifetime_end)
5887          return false;
5888      if (!ProtectedFieldDisc)
5889        ProtectedFieldDisc = S.ProtectedFieldDisc;
5890      return *ProtectedFieldDisc != S.ProtectedFieldDisc;
5891    };
5892    for (Slice &S : P)
5893      if (SliceHasMismatch(S))
5894        return {Changed, CFGChanged};
5895    for (Slice *S : P.splitSliceTails())
5896      if (SliceHasMismatch(*S))
5897        return {Changed, CFGChanged};
5898  }
5899
5900  // Delete all the dead users of this alloca before splitting and rewriting it.
// Actual erasure is deferred: instructions are only queued on DeadInsts here
// and removed later by deleteDeadInstructions().
5901  for (Instruction *DeadUser : AS.getDeadUsers()) {
5902    // Free up everything used by this instruction.
5903    for (Use &DeadOp : DeadUser->operands())
5904      clobberUse(DeadOp);
5905
5906    // Now replace the uses of this instruction.
5907    DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
5908
5909    // And mark it for deletion.
5910    DeadInsts.push_back(DeadUser);
5911    Changed = true;
5912  }
5913  for (Use *DeadOp : AS.getDeadOperands()) {
5914    clobberUse(*DeadOp);
5915    Changed = true;
5916  }
// Protected-field intrinsic users are bypassed (all uses forwarded to their
// pointer operand) and queued for deletion.
5917  for (IntrinsicInst *PFPUser : AS.getPFPUsers()) {
5918    PFPUser->replaceAllUsesWith(PFPUser->getArgOperand(0));
5919
5920    DeadInsts.push_back(PFPUser);
5921    Changed = true;
5922  }
5923
5924  // No slices to split. Leave the dead alloca for a later pass to clean up.
5925  if (AS.begin() == AS.end())
5926    return {Changed, CFGChanged};
5927
5928  Changed |= splitAlloca(AI, AS);
5929
// Drain the speculation queues populated during rewriting: loads over PHIs
// first, then loads/stores over selects (only the latter can touch the CFG).
5930  LLVM_DEBUG(dbgs() << "  Speculating PHIs\n");
5931  while (!SpeculatablePHIs.empty())
5932    speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
5933
5934  LLVM_DEBUG(dbgs() << "  Rewriting Selects\n");
5935  auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5936  while (!RemainingSelectsToRewrite.empty()) {
5937    const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5938    CFGChanged |=
5939        rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
5940  }
5941
5942  return {Changed, CFGChanged};
5943}
5944
5945/// Delete the dead instructions accumulated in this run.
5946///
5947/// Recursively deletes the dead instructions we've accumulated. This is done
5948/// at the very end to maximize locality of the recursive delete and to
5949/// minimize the problems of invalidated instruction pointers as such pointers
5950/// are used heavily in the intermediate stages of the algorithm.
5951///
5952/// We also record the alloca instructions deleted here so that they aren't
5953/// subsequently handed to mem2reg to promote.
5954bool SROA::deleteDeadInstructions(
5955 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
5956 bool Changed = false;
5957 while (!DeadInsts.empty()) {
5958 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
5959 if (!I)
5960 continue;
5961 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
5962
5963 // If the instruction is an alloca, find the possible dbg.declare connected
5964 // to it, and remove it too. We must do this before calling RAUW or we will
5965 // not be able to find it.
5966 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5967 DeletedAllocas.insert(AI);
5968 for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
5969 OldDII->eraseFromParent();
5970 }
5971
5973 I->replaceAllUsesWith(UndefValue::get(I->getType()));
5974
5975 for (Use &Operand : I->operands())
5976 if (Instruction *U = dyn_cast<Instruction>(Operand)) {
5977 // Zero out the operand and see if it becomes trivially dead.
5978 Operand = nullptr;
5980 DeadInsts.push_back(U);
5981 }
5982
5983 ++NumDeleted;
5984 I->eraseFromParent();
5985 Changed = true;
5986 }
5987 return Changed;
5988}
5989/// Promote the allocas, using the best available technique.
5990///
5991/// This attempts to promote whatever allocas have been identified as viable in
5992/// the PromotableAllocas list. If that list is empty, there is nothing to do.
5993/// This function returns whether any promotion occurred.
5994bool SROA::promoteAllocas() {
5995 if (PromotableAllocas.empty())
5996 return false;
5997
5998 if (SROASkipMem2Reg) {
5999 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
6000 } else {
6001 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
6002 NumPromoted += PromotableAllocas.size();
6003 PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
6004 }
6005
6006 PromotableAllocas.clear();
6007 return true;
6008}
6009
/// Top-level driver: seed the worklist from entry-block allocas, then iterate
/// runOnAlloca + mem2reg promotion to a fixed point. Returns the accumulated
/// {Changed, CFGChanged} pair.
6010std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
6011  LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
6012
6013  const DataLayout &DL = F.getDataLayout();
6014  BasicBlock &EntryBB = F.getEntryBlock();
6015  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
6016       I != E; ++I) {
6017    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
// Scalable-size allocas are rejected by runOnAlloca's slicing (it skips
// scalable sizes), but an already-promotable one can still be promoted
// whole by mem2reg, so route it straight to PromotableAllocas.
6018      std::optional<TypeSize> Size = AI->getAllocationSize(DL);
6019      if (Size && Size->isScalable() && isAllocaPromotable(AI))
6020        PromotableAllocas.insert(AI);
6021      else
6022        Worklist.insert(AI);
6023    }
6024  }
6025
6026  bool Changed = false;
6027  bool CFGChanged = false;
6028  // A set of deleted alloca instruction pointers which should be removed from
6029  // the list of promotable allocas.
6030  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
6031
// Fixed-point loop: splitting an alloca can queue follow-up allocas on
// PostPromotionWorklist, which becomes the next round's Worklist.
6032  do {
6033    while (!Worklist.empty()) {
6034      auto [IterationChanged, IterationCFGChanged] =
6035          runOnAlloca(*Worklist.pop_back_val());
6036      Changed |= IterationChanged;
6037      CFGChanged |= IterationCFGChanged;
6038
6039      Changed |= deleteDeadInstructions(DeletedAllocas);
6040
6041      // Remove the deleted allocas from various lists so that we don't try to
6042      // continue processing them.
6043      if (!DeletedAllocas.empty()) {
6044        Worklist.set_subtract(DeletedAllocas);
6045        PostPromotionWorklist.set_subtract(DeletedAllocas);
6046        PromotableAllocas.set_subtract(DeletedAllocas);
6047        DeletedAllocas.clear();
6048      }
6049    }
6050
6051    Changed |= promoteAllocas();
6052
6053    Worklist = PostPromotionWorklist;
6054    PostPromotionWorklist.clear();
6055  } while (!Worklist.empty());
6056
6057  assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
6058  assert((!CFGChanged || !PreserveCFG) &&
6059         "Should not have modified the CFG when told to preserve it.");
6060
6061  if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
// NOTE(review): the loop body line (original 6063) was dropped by the
// extraction — as listed this per-block loop is empty; presumably it
// performed assignment-tracking debug-info cleanup. Confirm upstream.
6062    for (auto &BB : F) {
6064    }
6065  }
6066
6067  return {Changed, CFGChanged};
6068}
6069
// NOTE(review): the extraction dropped this function's opening lines
// (originals 6070-6072). Per the cross-reference index below, this is
// `PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM)`;
// the dropped lines presumably fetched DT and AC from the analysis manager.
// Confirm against upstream.
6073  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6074  auto [Changed, CFGChanged] =
6075      SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6076  if (!Changed)
6077    return PreservedAnalyses::all();
// NOTE(review): further dropped lines here (originals 6078, 6080-6081) —
// presumably the construction of `PA` and the analyses preserved when the
// CFG did not change; as listed, `if (!CFGChanged)` guards `return PA;`
// directly, which cannot be the intended structure. Confirm upstream.
6079  if (!CFGChanged)
6082  return PA;
6083}
6084
// Print this pass's pipeline representation: the base mixin prints the pass
// name, then the CFG-preservation mode is appended as <preserve-cfg> or
// <modify-cfg>. NOTE(review): the first signature line (original 6085,
// presumably `void SROAPass::printPipeline(`) was dropped by the extraction.
6086    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6087  static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
6088      OS, MapClassName2PassName);
6089  OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
6090                                                 : "<modify-cfg>");
6091}
6092
// Construct the new-PM pass wrapper, recording whether the CFG must be
// preserved for the duration of the run.
6093SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
6094
6095namespace {
6096
6097/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
6098class SROALegacyPass : public FunctionPass {
// NOTE(review): a member declaration line (original 6099) was dropped by the
// extraction — `PreserveCFG` is used in runOnFunction below without a visible
// declaration, presumably `SROAOptions PreserveCFG;`. Confirm upstream.
6100
6101public:
6102  static char ID;
6103
// NOTE(review): the constructor (originals 6104-6106) was dropped by the
// extraction; only its closing brace (6107) remains. Per the index entries,
// it presumably initialized FunctionPass(ID)/PreserveCFG and called the
// pass-registry initializer. Confirm upstream.
6107  }
6108
// Run the shared SROA implementation over F with a lazily-updated dominator
// tree; the legacy PM has no notion of "CFG changed", so only Changed is
// returned.
6109  bool runOnFunction(Function &F) override {
6110    if (skipFunction(F))
6111      return false;
6112
6113    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
6114    AssumptionCache &AC =
6115        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
6116    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6117    auto [Changed, _] =
6118        SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6119    return Changed;
6120  }
6121
// Declare required analyses and the ones this pass keeps valid.
6122  void getAnalysisUsage(AnalysisUsage &AU) const override {
6123    AU.addRequired<AssumptionCacheTracker>();
6124    AU.addRequired<DominatorTreeWrapperPass>();
6125    AU.addPreserved<GlobalsAAWrapperPass>();
6126    AU.addPreserved<DominatorTreeWrapperPass>();
6127  }
6128
6129  StringRef getPassName() const override { return "SROA"; }
6130};
6131
6132} // end anonymous namespace
6133
// Out-of-line definition of the legacy pass's unique identifier.
6134char SROALegacyPass::ID = 0;
6135
// NOTE(review): the extraction dropped lines 6136-6139 (presumably a factory
// function returning a new SROALegacyPass) before this blank line 6140.
6140
// Pass registration. NOTE(review): the INITIALIZE_PASS_DEPENDENCY lines
// (originals 6143-6144) and the final arguments of INITIALIZE_PASS_END
// (original 6146 — the END invocation below is cut off mid-argument-list)
// were dropped by the extraction; confirm upstream.
6141INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
6142                      "Scalar Replacement Of Aggregates", false, false)
6145INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
static bool runOnFunction(Function &F, bool PostInlining)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This header defines various interfaces for pass management in LLVM.
This defines the Use class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
print mir2vec MIR2Vec Vocabulary Printer Pass
Definition MIR2Vec.cpp:598
This file implements a map that provides insertion order iteration.
static std::optional< AllocFnsTy > getAllocationSize(const CallBase *CB, const TargetLibraryInfo *TLI)
static std::optional< uint64_t > getSizeInBytes(std::optional< uint64_t > SizeInBits)
Memory SSA
Definition MemorySSA.cpp:72
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define P(N)
if(PassOpts->AAPipeline)
PassBuilder PB(Machine, PassOpts->PTO, std::nullopt, &PIC)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PointerIntPair class.
This file provides a collection of visitors which walk the (instruction) uses of a pointer.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static unsigned getNumElements(Type *Ty)
bool isDead(const MachineInstr &MI, const MachineRegisterInfo &MRI)
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, uint64_t OldAllocaOffsetInBits, uint64_t SliceSizeInBits, Instruction *OldInst, Instruction *Inst, Value *Dest, Value *Value, const DataLayout &DL)
Find linked dbg.assign and generate a new one with the correct FragmentInfo.
Definition SROA.cpp:344
static std::tuple< Type *, bool, VectorType * > selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI, LLVMContext &C)
Select a partition type for an alloca partition.
Definition SROA.cpp:5143
static VectorType * isVectorPromotionViable(Partition &P, const DataLayout &DL, unsigned VScale)
Test whether the given alloca partitioning and range of slices can be promoted to a vector.
Definition SROA.cpp:2282
static Align getAdjustedAlignment(Instruction *I, uint64_t Offset)
Compute the adjusted alignment for a load or store from an offset.
Definition SROA.cpp:1960
static VectorType * checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, SmallVectorImpl< VectorType * > &CandidateTys, bool HaveCommonEltTy, Type *CommonEltTy, bool HaveVecPtrTy, bool HaveCommonVecPtrTy, VectorType *CommonVecPtrTy, unsigned VScale)
Test whether any vector type in CandidateTys is viable for promotion.
Definition SROA.cpp:2133
static std::pair< Type *, IntegerType * > findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, uint64_t EndOffset)
Walk the range of a partitioning looking for a common type to cover this sequence of slices.
Definition SROA.cpp:1526
static Type * stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty)
Strip aggregate type wrapping.
Definition SROA.cpp:4486
static FragCalcResult calculateFragment(DILocalVariable *Variable, uint64_t NewStorageSliceOffsetInBits, uint64_t NewStorageSliceSizeInBits, std::optional< DIExpression::FragmentInfo > StorageFragment, std::optional< DIExpression::FragmentInfo > CurrentFragment, DIExpression::FragmentInfo &Target)
Definition SROA.cpp:279
static DIExpression * createOrReplaceFragment(const DIExpression *Expr, DIExpression::FragmentInfo Frag, int64_t BitExtractOffset)
Create or replace an existing fragment in a DIExpression with Frag.
Definition SROA.cpp:5408
static Value * insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, Value *V, uint64_t Offset, const Twine &Name)
Definition SROA.cpp:2525
static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, VectorType *Ty, uint64_t ElementSize, const DataLayout &DL, unsigned VScale)
Test whether the given slice use can be promoted to a vector.
Definition SROA.cpp:2058
static Value * getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *PointerTy, const Twine &NamePrefix)
Compute an adjusted pointer from Ptr by Offset bytes where the resulting pointer has PointerTy.
Definition SROA.cpp:1949
static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t AllocBeginOffset, Type *AllocaTy, const DataLayout &DL, bool &WholeAllocaOp)
Test whether a slice of an alloca is valid for integer widening.
Definition SROA.cpp:2364
static Value * extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, unsigned EndIndex, const Twine &Name)
Definition SROA.cpp:2558
static Value * foldPHINodeOrSelectInst(Instruction &I)
A helper that folds a PHI node or a select.
Definition SROA.cpp:1021
static bool rewriteSelectInstMemOps(SelectInst &SI, const RewriteableMemOps &Ops, IRBuilderTy &IRB, DomTreeUpdater *DTU)
Definition SROA.cpp:1915
static void rewriteMemOpOfSelect(SelectInst &SI, T &I, SelectHandSpeculativity Spec, DomTreeUpdater &DTU)
Definition SROA.cpp:1848
static Value * foldSelectInst(SelectInst &SI)
Definition SROA.cpp:1008
bool isKillAddress(const DbgVariableRecord *DVR)
Definition SROA.cpp:5371
static Value * insertVector(IRBuilderTy &IRB, Value *Old, Value *V, unsigned BeginIndex, const Twine &Name)
Definition SROA.cpp:2580
static bool isIntegerWideningViable(Partition &P, Type *AllocaTy, const DataLayout &DL)
Test whether the given alloca partition's integer operations can be widened to promotable ones.
Definition SROA.cpp:2459
static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN)
Definition SROA.cpp:1666
static VectorType * createAndCheckVectorTypesForPromotion(SetVector< Type * > &OtherTys, ArrayRef< VectorType * > CandidateTysCopy, function_ref< void(Type *)> CheckCandidateType, Partition &P, const DataLayout &DL, SmallVectorImpl< VectorType * > &CandidateTys, bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy, bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale)
Definition SROA.cpp:2238
static DebugVariable getAggregateVariable(DbgVariableRecord *DVR)
Definition SROA.cpp:325
static bool isSafePHIToSpeculate(PHINode &PN)
PHI instructions that use an alloca and are subsequently loaded can be rewritten to load both input p...
Definition SROA.cpp:1592
static Value * extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, IntegerType *Ty, uint64_t Offset, const Twine &Name)
Definition SROA.cpp:2500
static void insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr, DIExpression *NewAddrExpr, Instruction *BeforeInst, std::optional< DIExpression::FragmentInfo > NewFragment, int64_t BitExtractAdjustment)
Insert a new DbgRecord.
Definition SROA.cpp:5473
static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI, IRBuilderTy &IRB)
Definition SROA.cpp:1809
static Value * mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL, Type *NewAIEltTy, IRBuilder<> &Builder)
This function takes two vector values and combines them into a single vector by concatenating their e...
Definition SROA.cpp:2652
const DIExpression * getAddressExpression(const DbgVariableRecord *DVR)
Definition SROA.cpp:5377
static Type * getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, uint64_t Size)
Try to find a partition of the aggregate type passed in for a given offset and size.
Definition SROA.cpp:4524
static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy, unsigned VScale=0)
Test whether we can convert a value from the old to the new type.
Definition SROA.cpp:1970
static SelectHandSpeculativity isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG)
Definition SROA.cpp:1747
This file provides the interface for LLVM's Scalar Replacement of Aggregates pass.
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Virtual Register Rewriter
Value * RHS
Value * LHS
Builder for the alloca slices.
Definition SROA.cpp:1033
SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
Definition SROA.cpp:1053
An iterator over partitions of the alloca's slices.
Definition SROA.cpp:821
bool operator==(const partition_iterator &RHS) const
Definition SROA.cpp:968
partition_iterator & operator++()
Definition SROA.cpp:988
bool shouldDelete(Instruction *I) const override
Return false if a sub-class wants to keep one of the loads/stores after the SSA construction.
Definition SROA.cpp:5746
BasicLoadAndStorePromoter(ArrayRef< const Instruction * > Insts, SSAUpdater &S, Type *ZeroType)
Definition SROA.cpp:5743
Value * getValueToUseForAlloca(Instruction *I) const override
Return the value to use for the point in the code that the alloca is positioned.
Definition SROA.cpp:5750
Class for arbitrary precision integers.
Definition APInt.h:78
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:131
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
iterator begin() const
Definition ArrayRef.h:130
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
LLVM_ABI CaptureInfo getCaptureInfo(unsigned OpNo) const
Return which pointer components this operand may capture.
bool onlyReadsMemory(unsigned OpNo) const
bool isDataOperand(const Use *U) const
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static DIAssignID * getDistinct(LLVMContext &Context)
LLVM_ABI DbgInstPtr insertDbgAssign(Instruction *LinkedInstr, Value *Val, DILocalVariable *SrcVar, DIExpression *ValExpr, Value *Addr, DIExpression *AddrExpr, const DILocation *DL)
Insert a new llvm.dbg.assign intrinsic call.
DWARF expression.
iterator_range< expr_op_iterator > expr_ops() const
DbgVariableFragmentInfo FragmentInfo
LLVM_ABI bool startsWithDeref() const
Return whether the first element a DW_OP_deref.
static LLVM_ABI bool calculateFragmentIntersect(const DataLayout &DL, const Value *SliceStart, uint64_t SliceOffsetInBits, uint64_t SliceSizeInBits, const Value *DbgPtr, int64_t DbgPtrOffsetInBits, int64_t DbgExtractOffsetInBits, DIExpression::FragmentInfo VarFrag, std::optional< DIExpression::FragmentInfo > &Result, int64_t &OffsetFromLocationInBits)
Computes a fragment, bit-extract operation if needed, and new constant offset to describe a part of a...
static LLVM_ABI std::optional< DIExpression * > createFragmentExpression(const DIExpression *Expr, unsigned OffsetInBits, unsigned SizeInBits)
Create a DIExpression to describe one part of an aggregate variable that is fragmented across multipl...
static LLVM_ABI DIExpression * prepend(const DIExpression *Expr, uint8_t Flags, int64_t Offset=0)
Prepend DIExpr with a deref and offset operation and optionally turn it into a stack value or/and an ...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI void moveBefore(DbgRecord *MoveBefore)
DebugLoc getDebugLoc() const
void setDebugLoc(DebugLoc Loc)
Record of a variable value-assignment, aka a non instruction representation of the dbg....
LLVM_ABI void setKillAddress()
Kill the address component.
LLVM_ABI bool isKillLocation() const
LLVM_ABI bool isKillAddress() const
Check whether this kills the address component.
LLVM_ABI void replaceVariableLocationOp(Value *OldValue, Value *NewValue, bool AllowEmpty=false)
Value * getValue(unsigned OpIdx=0) const
static LLVM_ABI DbgVariableRecord * createLinkedDVRAssign(Instruction *LinkedInstr, Value *Val, DILocalVariable *Variable, DIExpression *Expression, Value *Address, DIExpression *AddressExpression, const DILocation *DI)
LLVM_ABI void setAssignId(DIAssignID *New)
DIExpression * getExpression() const
static LLVM_ABI DbgVariableRecord * createDVRDeclare(Value *Address, DILocalVariable *DV, DIExpression *Expr, const DILocation *DI)
static LLVM_ABI DbgVariableRecord * createDbgVariableRecord(Value *Location, DILocalVariable *DV, DIExpression *Expr, const DILocation *DI)
DILocalVariable * getVariable() const
DIExpression * getAddressExpression() const
LLVM_ABI DILocation * getInlinedAt() const
Definition DebugLoc.cpp:67
Identifies a unique instance of a variable.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:316
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Class to represent fixed width SIMD vectors.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
unsigned getVScaleValue() const
Return the value for vscale based on the vscale_range attribute or 0 when unknown.
const BasicBlock & getEntryBlock() const
Definition Function.h:809
LLVM_ABI bool accumulateConstantOffset(const DataLayout &DL, APInt &Offset, function_ref< bool(Value &, APInt &)> ExternalAnalysis=nullptr) const
Accumulate the constant address offset of this GEP if possible.
Definition Operator.cpp:125
iterator_range< op_iterator > indices()
Type * getSourceElementType() const
LLVM_ABI GEPNoWrapFlags getNoWrapFlags() const
Get the nowrap flags for the GEP instruction.
This provides the default implementation of the IRBuilder 'InsertHelper' method that is called whenev...
Definition IRBuilder.h:61
virtual void InsertHelper(Instruction *I, const Twine &Name, BasicBlock::iterator InsertPt) const
Definition IRBuilder.h:65
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
Base class for instruction visitors.
Definition InstVisitor.h:78
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI void setAAMetadata(const AAMDNodes &N)
Sets the AA metadata on this instruction from the AAMDNodes structure.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Class to represent integer types.
@ MAX_INT_BITS
Maximum number of bits that can be specified.
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LoadAndStorePromoter(ArrayRef< const Instruction * > Insts, SSAUpdater &S, StringRef Name=StringRef())
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Value * getPointerOperand()
bool isVolatile() const
Return true if this is a load from a volatile memory location.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this load instruction.
Type * getPointerOperandType() const
static unsigned getPointerOperandIndex()
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this load instruction.
bool isSimple() const
Align getAlign() const
Return the alignment of the access that is being performed.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
LLVMContext & getContext() const
Definition Metadata.h:1244
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
This is the common base class for memset/memcpy/memmove.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
PointerIntPair - This class implements a pair of a pointer and small integer.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
PtrUseVisitor(const DataLayout &DL)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Run the pass over the function.
Definition SROA.cpp:6070
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
Definition SROA.cpp:6085
SROAPass(SROAOptions PreserveCFG)
If PreserveCFG is set, then the pass is not allowed to modify CFG in any way, even if it would update...
Definition SROA.cpp:6093
Helper class for SSA formation on a set of values defined in multiple blocks.
Definition SSAUpdater.h:39
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
Value * getValueOperand()
static unsigned getPointerOperandIndex()
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static constexpr size_t npos
Definition StringRef.h:57
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition StringRef.h:591
size_t rfind(char C, size_t From=npos) const
Search for the last character C in the string.
Definition StringRef.h:365
size_t find(char C, size_t From=0) const
Search for the first character C in the string.
Definition StringRef.h:290
LLVM_ABI size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C, or npos if not found.
Used to lazily calculate structure layout information for a target machine, based on the DataLayout s...
Definition DataLayout.h:736
TypeSize getSizeInBytes() const
Definition DataLayout.h:745
LLVM_ABI unsigned getElementContainingOffset(uint64_t FixedOffset) const
Given a valid byte offset into the structure, returns the structure index that contains it.
TypeSize getElementOffset(unsigned Idx) const
Definition DataLayout.h:767
TypeSize getSizeInBits() const
Definition DataLayout.h:747
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:483
element_iterator element_end() const
element_iterator element_begin() const
bool isPacked() const
Type * getElementType(unsigned N) const
Type::subtype_iterator element_iterator
Target - Wrapper for Target specific information.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getIntegerBitWidth() const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:313
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
bool isTargetExtTy() const
Return true if this is a target extension type.
Definition Type.h:205
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:287
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
op_iterator op_begin()
Definition User.h:259
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
op_iterator op_end()
Definition User.h:261
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
LLVM_ABI const Value * stripInBoundsOffsets(function_ref< void(const Value *)> Func=[](const Value *) {}) const
Strip off pointer casts and inbounds GEPs.
Definition Value.cpp:820
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void dropDroppableUsesIn(User &Usr)
Remove every use of this value in User that can safely be removed.
Definition Value.cpp:214
LLVM_ABI const Value * stripAndAccumulateConstantOffsets(const DataLayout &DL, APInt &Offset, bool AllowNonInbounds, bool AllowInvariantGroup=false, function_ref< bool(Value &Value, APInt &Offset)> ExternalAnalysis=nullptr, bool LookThroughIntToPtr=false) const
Accumulate the constant offset this value has compared to a base pointer.
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static VectorType * getWithSizeAndScalar(VectorType *SizeTy, Type *EltTy)
This static method attempts to construct a VectorType with the same size-in-bits as SizeTy but with a...
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CRTP base class which implements the entire standard iterator facade in terms of a minimal subset of ...
Definition iterator.h:80
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
Offsets
Offsets in bytes from the start of the input buffer.
SmallVector< DbgVariableRecord * > getDVRAssignmentMarkers(const Instruction *Inst)
Return a range of dbg_assign records for which Inst performs the assignment they encode.
Definition DebugInfo.h:203
LLVM_ABI void deleteAssignmentMarkers(const Instruction *Inst)
Delete the llvm.dbg.assign intrinsics linked to Inst.
initializer< Ty > init(const Ty &Val)
@ DW_OP_LLVM_extract_bits_zext
Only used in LLVM metadata.
Definition Dwarf.h:151
@ DW_OP_LLVM_fragment
Only used in LLVM metadata.
Definition Dwarf.h:144
@ DW_OP_LLVM_extract_bits_sext
Only used in LLVM metadata.
Definition Dwarf.h:150
@ User
could "use" a pointer
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
bool empty() const
Definition BasicBlock.h:101
Context & getContext() const
Definition BasicBlock.h:99
iterator end() const
Definition BasicBlock.h:89
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
static cl::opt< bool > SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false), cl::Hidden)
Disable running mem2reg during SROA in order to test or debug SROA.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:532
@ Length
Definition DWP.cpp:532
bool operator<(int64_t V1, const APSInt &V2)
Definition APSInt.h:360
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2116
LLVM_ABI bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
cl::opt< bool > ProfcheckDisableMetadataFixes
Definition LoopInfo.cpp:60
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI void PromoteMemToReg(ArrayRef< AllocaInst * > Allocas, DominatorTree &DT, AssumptionCache *AC=nullptr)
Promote the specified list of alloca instructions into scalar registers, inserting PHI nodes as appro...
LLVM_ABI bool isAssumeLikeIntrinsic(const Instruction *I)
Return true if it is an intrinsic that cannot be speculated but also cannot trap.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2131
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI std::optional< RegOrConstant > getVectorSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI)
Definition Utils.cpp:1495
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
void * PointerTy
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2134
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI bool isAllocaPromotable(const AllocaInst *AI)
Return true if this alloca is legal for promotion.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2200
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
bool capturesFullProvenance(CaptureComponents CC)
Definition ModRef.h:396
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:446
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void initializeSROALegacyPassPass(PassRegistry &)
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
LLVM_ABI TinyPtrVector< DbgVariableRecord * > findDVRValues(Value *V)
As above, for DVRValues.
Definition DebugInfo.cpp:82
LLVM_ABI void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2192
LLVM_ABI TinyPtrVector< DbgVariableRecord * > findDVRDeclares(Value *V)
Finds dbg.declare records declaring local variables as living in the memory that 'V' points to.
Definition DebugInfo.cpp:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI Instruction * SplitBlockAndInsertIfThen(Value *Cond, BasicBlock::iterator SplitBefore, bool Unreachable, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, BasicBlock *ThenBlock=nullptr)
Split the containing block at the specified instruction - everything before SplitBefore stays in the ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI FunctionPass * createSROAPass(bool PreserveCFG=true)
Definition SROA.cpp:6136
SROAOptions
Definition SROA.h:24
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define NDEBUG
Definition regutils.h:48
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
AAMDNodes shift(size_t Offset) const
Create a new AAMDNode that describes this AAMDNode after applying a constant offset to the start of t...
Definition Metadata.h:822
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Describes an element of a Bitfield.
Definition Bitfields.h:176
static Bitfield::Type get(StorageType Packed)
Unpacks the field from the Packed value.
Definition Bitfields.h:207
static void set(StorageType &Packed, typename Bitfield::Type Value)
Sets the typed value in the provided Packed value.
Definition Bitfields.h:223
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:70