75#define LDIST_NAME "loop-distribute"
76#define DEBUG_TYPE LDIST_NAME
81 "llvm.loop.distribute.followup_all";
83 "llvm.loop.distribute.followup_coincident";
85 "llvm.loop.distribute.followup_sequential";
87 "llvm.loop.distribute.followup_fallback";
92 cl::desc(
"Turn on DominatorTree and LoopInfo verification "
93 "after Loop Distribution"),
97 "loop-distribute-non-if-convertible",
cl::Hidden,
98 cl::desc(
"Whether to distribute into a loop that may not be "
99 "if-convertible by the loop vectorizer"),
104 cl::desc(
"The maximum number of SCEV checks allowed for Loop "
108 "loop-distribute-scev-check-threshold-with-pragma",
cl::init(128),
111 "The maximum number of SCEV checks allowed for Loop "
112 "Distribution for loop marked with #pragma loop distribute(enable)"));
116 cl::desc(
"Enable the new, experimental LoopDistribution Pass"),
119STATISTIC(NumLoopsDistributed,
"Number of loops distributed");
130 : DepCycle(DepCycle), OrigLoop(L) {
135 bool hasDepCycle()
const {
return DepCycle; }
145 bool empty()
const {
return Set.empty(); }
149 void moveTo(InstPartition &
Other) {
150 Other.Set.insert(Set.begin(), Set.end());
152 Other.DepCycle |= DepCycle;
157 void populateUsedSet() {
161 for (
auto *
B : OrigLoop->getBlocks())
162 Set.insert(
B->getTerminator());
167 while (!Worklist.empty()) {
170 for (
Value *V :
I->operand_values()) {
171 auto *
I = dyn_cast<Instruction>(V);
172 if (
I && OrigLoop->contains(
I->getParent()) && Set.insert(
I).second)
173 Worklist.push_back(
I);
187 LI, DT, ClonedLoopBlocks);
193 const Loop *getClonedLoop()
const {
return ClonedLoop; }
198 Loop *getDistributedLoop()
const {
199 return ClonedLoop ? ClonedLoop : OrigLoop;
207 void remapInstructions() {
213 void removeUnusedInsts() {
216 for (
auto *Block : OrigLoop->getBlocks())
217 for (
auto &Inst : *Block)
218 if (!Set.count(&Inst)) {
221 NewInst = cast<Instruction>(VMap[NewInst]);
223 assert(!isa<BranchInst>(NewInst) &&
224 "Branches are marked used early on");
225 Unused.push_back(NewInst);
230 for (
auto *Inst :
reverse(Unused)) {
231 if (!Inst->use_empty())
233 Inst->eraseFromParent();
239 dbgs() <<
" (cycle)\n";
242 dbgs() <<
" " <<
I->getParent()->getName() <<
":" << *
I <<
"\n";
245 void printBlocks()
const {
246 for (
auto *BB : getDistributedLoop()->getBlocks())
262 Loop *ClonedLoop =
nullptr;
276class InstPartitionContainer {
281 :
L(
L), LI(LI), DT(DT) {}
284 unsigned getSize()
const {
return PartitionContainer.size(); }
290 if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
291 PartitionContainer.emplace_back(Inst, L,
true);
293 PartitionContainer.back().add(Inst);
301 void addToNewNonCyclicPartition(
Instruction *Inst) {
302 PartitionContainer.emplace_back(Inst, L);
310 void mergeAdjacentNonCyclic() {
311 mergeAdjacentPartitionsIf(
312 [](
const InstPartition *
P) {
return !
P->hasDepCycle(); });
317 void mergeNonIfConvertible() {
318 mergeAdjacentPartitionsIf([&](
const InstPartition *Partition) {
319 if (Partition->hasDepCycle())
323 bool seenStore =
false;
325 for (
auto *Inst : *Partition)
326 if (isa<StoreInst>(Inst)) {
336 void mergeBeforePopulating() {
337 mergeAdjacentNonCyclic();
339 mergeNonIfConvertible();
349 bool mergeToAvoidDuplicatedLoads() {
353 LoadToPartitionT LoadToPartition;
354 ToBeMergedT ToBeMerged;
359 for (PartitionContainerT::iterator
I = PartitionContainer.begin(),
360 E = PartitionContainer.end();
367 if (isa<LoadInst>(Inst)) {
369 LoadToPartitionT::iterator LoadToPart;
371 std::tie(LoadToPart, NewElt) =
372 LoadToPartition.insert(std::make_pair(Inst, PartI));
375 <<
"Merging partitions due to this load in multiple "
376 <<
"partitions: " << PartI <<
", " << LoadToPart->second
383 ToBeMerged.unionSets(PartI, &*PartJ);
384 }
while (&*PartJ != LoadToPart->second);
388 if (ToBeMerged.empty())
393 for (ToBeMergedT::iterator
I = ToBeMerged.begin(),
E = ToBeMerged.end();
398 auto PartI =
I->getData();
399 for (
auto *PartJ :
make_range(std::next(ToBeMerged.member_begin(
I)),
400 ToBeMerged.member_end())) {
401 PartJ->moveTo(*PartI);
406 PartitionContainer.remove_if(
407 [](
const InstPartition &
P) {
return P.empty(); });
414 void setupPartitionIdOnInstructions() {
416 for (
const auto &Partition : PartitionContainer) {
421 std::tie(Iter, NewElt) =
422 InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
432 void populateUsedSet() {
433 for (
auto &
P : PartitionContainer)
444 assert(Pred &&
"Preheader does not have a single predecessor");
446 assert(ExitBlock &&
"No single exit block");
449 assert(!PartitionContainer.empty() &&
"at least two partitions expected");
453 "preheader not empty");
456 MDNode *OrigLoopID =
L->getLoopID();
464 NewLoop = Part.cloneLoopWithPreheader(TopPH, Pred,
Index, LI, DT);
466 Part.getVMap()[ExitBlock] = TopPH;
467 Part.remapInstructions();
468 setNewLoopID(OrigLoopID, &Part);
475 setNewLoopID(OrigLoopID, &PartitionContainer.back());
480 for (
auto Curr = PartitionContainer.cbegin(),
481 Next = std::next(PartitionContainer.cbegin()),
482 E = PartitionContainer.cend();
483 Next !=
E; ++Curr, ++Next)
485 Next->getDistributedLoop()->getLoopPreheader(),
486 Curr->getDistributedLoop()->getExitingBlock());
490 void removeUnusedInsts() {
491 for (
auto &Partition : PartitionContainer)
492 Partition.removeUnusedInsts();
505 unsigned N = RtPtrCheck->
Pointers.size();
507 for (
unsigned I = 0;
I <
N; ++
I) {
512 int &Partition = PtrToPartitions[
I];
518 int ThisPartition = this->InstToPartitionId[Inst];
520 Partition = ThisPartition;
522 else if (Partition == -1)
524 else if (Partition != (
int)ThisPartition)
527 assert(Partition != -2 &&
"Pointer not belonging to any partition");
530 return PtrToPartitions;
535 for (
const auto &
P : PartitionContainer) {
536 OS <<
"Partition " <<
Index++ <<
" (" << &
P <<
"):\n";
545 const InstPartitionContainer &Partitions) {
546 Partitions.print(
OS);
551 void printBlocks()
const {
553 for (
const auto &
P : PartitionContainer) {
554 dbgs() <<
"\nPartition " <<
Index++ <<
" (" << &
P <<
"):\n";
560 using PartitionContainerT = std::list<InstPartition>;
563 PartitionContainerT PartitionContainer;
567 InstToPartitionIdT InstToPartitionId;
575 template <
class UnaryPredicate>
576 void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
577 InstPartition *PrevMatch =
nullptr;
578 for (
auto I = PartitionContainer.begin();
I != PartitionContainer.end();) {
580 if (PrevMatch ==
nullptr && DoesMatch) {
583 }
else if (PrevMatch !=
nullptr && DoesMatch) {
584 I->moveTo(*PrevMatch);
585 I = PartitionContainer.erase(
I);
594 void setNewLoopID(
MDNode *OrigLoopID, InstPartition *Part) {
601 Loop *NewLoop = Part->getDistributedLoop();
613class MemoryInstructionDependences {
619 unsigned NumUnsafeDependencesStartOrEnd = 0;
629 MemoryInstructionDependences(
635 for (
const auto &Dep : Dependences)
636 if (Dep.isPossiblyBackward()) {
640 ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
641 --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
648 AccessesType Accesses;
652class LoopDistributeForLoop {
657 :
L(
L),
F(
F), LI(LI), DT(DT), SE(SE), LAIs(LAIs), ORE(ORE) {
663 assert(
L->isInnermost() &&
"Only process inner loops.");
666 <<
L->getHeader()->getParent()->getName()
667 <<
"\" checking " << *L <<
"\n");
670 if (!
L->getExitBlock())
671 return fail(
"MultipleExitBlocks",
"multiple exit blocks");
672 if (!
L->isLoopSimplifyForm())
673 return fail(
"NotLoopSimplifyForm",
674 "loop is not in loop-simplify form");
675 if (!
L->isRotatedForm())
676 return fail(
"NotBottomTested",
"loop is not bottom tested");
680 LAI = &LAIs.getInfo(*L);
685 return fail(
"MemOpsCanBeVectorized",
686 "memory operations are safe for vectorization");
689 if (!Dependences || Dependences->empty())
690 return fail(
"NoUnsafeDeps",
"no unsafe dependences to isolate");
692 InstPartitionContainer Partitions(L, LI, DT);
717 int NumUnsafeDependencesActive = 0;
718 for (
const auto &InstDep : MID) {
722 if (NumUnsafeDependencesActive ||
723 InstDep.NumUnsafeDependencesStartOrEnd > 0)
724 Partitions.addToCyclicPartition(
I);
726 Partitions.addToNewNonCyclicPartition(
I);
727 NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
728 assert(NumUnsafeDependencesActive >= 0 &&
729 "Negative number of dependences active");
738 for (
auto *Inst : DefsUsedOutside)
739 Partitions.addToNewNonCyclicPartition(Inst);
742 if (Partitions.getSize() < 2)
743 return fail(
"CantIsolateUnsafeDeps",
744 "cannot isolate unsafe dependencies");
748 Partitions.mergeBeforePopulating();
750 if (Partitions.getSize() < 2)
751 return fail(
"CantIsolateUnsafeDeps",
752 "cannot isolate unsafe dependencies");
755 Partitions.populateUsedSet();
760 if (Partitions.mergeToAvoidDuplicatedLoads()) {
761 LLVM_DEBUG(
dbgs() <<
"\nPartitions merged to ensure unique loads:\n"
763 if (Partitions.getSize() < 2)
764 return fail(
"CantIsolateUnsafeDeps",
765 "cannot isolate unsafe dependencies");
772 return fail(
"RuntimeCheckWithConvergent",
773 "may not insert runtime check with convergent operation");
779 return fail(
"TooManySCEVRuntimeChecks",
780 "too many SCEV run-time checks needed.\n");
783 return fail(
"HeuristicDisabled",
"distribution heuristic disabled");
788 Partitions.setupPartitionIdOnInstructions();
791 auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
793 const auto &AllChecks = RtPtrChecking->
getChecks();
794 auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
798 return fail(
"RuntimeCheckWithConvergent",
799 "may not insert runtime check with convergent operation");
811 MDNode *OrigLoopID =
L->getLoopID();
816 LVer.versionLoop(DefsUsedOutside);
817 LVer.annotateLoopWithNoAlias();
825 "llvm.loop.distribute.",
true);
826 LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
831 Partitions.cloneLoops();
835 Partitions.removeUnusedInsts();
841 assert(DT->
verify(DominatorTree::VerificationLevel::Fast));
844 ++NumLoopsDistributed;
849 <<
"distributed loop";
857 bool Forced = isForced().value_or(
false);
864 L->getStartLoc(),
L->getHeader())
865 <<
"loop not distributed: use -Rpass-analysis=loop-distribute for "
874 RemarkName,
L->getStartLoc(),
L->getHeader())
875 <<
"loop not distributed: " << Message);
881 *F,
L->getStartLoc(),
"loop not distributed: failed "
882 "explicitly specified loop distribution"));
892 const std::optional<bool> &isForced()
const {
return IsForced; }
906 copy_if(AllChecks, std::back_inserter(Checks),
908 for (
unsigned PtrIdx1 :
Check.first->Members)
909 for (
unsigned PtrIdx2 :
Check.second->Members)
925 PtrToPartition, PtrIdx1, PtrIdx2))
936 std::optional<const MDOperand *>
Value =
942 assert(Op && mdconst::hasa<ConstantInt>(*Op) &&
"invalid metadata");
943 IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
963 std::optional<bool> IsForced;
977 for (
Loop *TopLevelLoop : *LI)
980 if (L->isInnermost())
984 bool Changed =
false;
985 for (
Loop *L : Worklist) {
986 LoopDistributeForLoop LDL(L, &
F, LI, DT, SE, LAIs, ORE);
991 Changed |= LDL.processLoop();
1006 bool Changed =
runImpl(
F, &LI, &DT, &SE, &ORE, LAIs);
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
std::optional< std::vector< StOtherPiece > > Other
Generic implementation of equivalence classes through the use Tarjan's efficient union-find algorithm...
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
This header provides classes for managing per-loop analyses.
static const char *const LLVMLoopDistributeFollowupCoincident
static cl::opt< bool > DistributeNonIfConvertible("loop-distribute-non-if-convertible", cl::Hidden, cl::desc("Whether to distribute into a loop that may not be " "if-convertible by the loop vectorizer"), cl::init(false))
static cl::opt< bool > EnableLoopDistribute("enable-loop-distribute", cl::Hidden, cl::desc("Enable the new, experimental LoopDistribution Pass"), cl::init(false))
static cl::opt< unsigned > DistributeSCEVCheckThreshold("loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed for Loop " "Distribution"))
static const char *const LLVMLoopDistributeFollowupSequential
static cl::opt< unsigned > PragmaDistributeSCEVCheckThreshold("loop-distribute-scev-check-threshold-with-pragma", cl::init(128), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed for Loop " "Distribution for loop marked with #pragma loop distribute(enable)"))
static const char *const LLVMLoopDistributeFollowupAll
static const char *const LLVMLoopDistributeFollowupFallback
static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, LoopAccessInfoManager &LAIs)
Shared implementation between new and old PMs.
static cl::opt< bool > LDistVerify("loop-distribute-verify", cl::Hidden, cl::desc("Turn on DominatorTree and LoopInfo verification " "after Loop Distribution"), cl::init(false))
This header defines various interfaces for pass management in LLVM.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static unsigned getSize(unsigned Kind)
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
DenseMapIterator< KeyT, ValueT, KeyInfoT, BucketT > iterator
Dependence - This class represents a dependence between two memory memory references in a function.
Diagnostic information for optimization failures.
Analysis pass which computes a DominatorTree.
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
EquivalenceClasses - This represents a collection of equivalence classes and supports three efficient...
const BasicBlock * getParent() const
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const MemoryDepChecker & getDepChecker() const
the Memory Dependence Checker which can determine the loop-independent and loop-carried dependences b...
const RuntimePointerChecking * getRuntimePointerChecking() const
bool canVectorizeMemory() const
Return true we can analyze the memory accesses in the loop and there are no memory dependence cycles.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
SmallVector< Instruction *, 4 > getInstructionsForAccess(Value *Ptr, bool isWrite) const
Return the list of instructions that use Ptr to read or write memory.
bool hasConvergentOp() const
Return true if there is a convergent operation in the loop.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
void verify(const DominatorTreeBase< BlockT, false > &DomTree) const
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
Represents a single loop in the control flow graph.
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
Tracking metadata reference owned by Metadata.
Checks memory dependences among accesses to the same underlying object to determine whether there vec...
const SmallVectorImpl< Instruction * > & getMemoryInstructions() const
The vector of memory access instructions.
const SmallVectorImpl< Dependence > * getDependences() const
Returns the memory dependences.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
const SCEVPredicate & getPredicate() const
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserve()
Mark an analysis as preserved.
Holds information about the memory runtime legality checks to verify that a group of pointers do not ...
void printChecks(raw_ostream &OS, const SmallVectorImpl< RuntimePointerCheck > &Checks, unsigned Depth=0) const
Print Checks.
bool needsChecking(const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const
Decide if we need to add a check between two groups of pointers, according to needsChecking.
static bool arePointersInSamePartition(const SmallVectorImpl< int > &PtrToPartition, unsigned PtrIdx1, unsigned PtrIdx2)
Check if pointers are in the same partition.
SmallVector< PointerInfo, 2 > Pointers
Information about the pointers that may require checking.
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual unsigned getComplexity() const
Returns the estimated complexity of this predicate.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
SmallPtrSetIterator< PtrType > const_iterator
SmallPtrSetIterator< PtrType > iterator
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
LLVM Value Representation.
This class implements an extremely fast bulk output stream that can only output to a stream.
friend const_iterator begin(StringRef path, Style style)
Get begin iterator over path.
friend const_iterator end(StringRef path)
Get end iterator over path.
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
Predicate
Predicate - These are "(BI << 5) | BO" for various predicates.
initializer< Ty > init(const Ty &Val)
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
const_iterator end(StringRef path)
Get end iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
std::pair< const RuntimeCheckingPtrGroup *, const RuntimeCheckingPtrGroup * > RuntimePointerCheck
A memcheck which made up of a pair of grouped pointers.
std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
auto reverse(ContainerTy &&C)
Loop * cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, Loop *OrigLoop, ValueToValueMapTy &VMap, const Twine &NameSuffix, LoopInfo *LI, DominatorTree *DT, SmallVectorImpl< BasicBlock * > &Blocks)
Clones a loop OrigLoop.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool hasDisableAllTransformsHint(const Loop *L)
Look for the loop attribute that disables all transformation heuristic.
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
BasicBlock * SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
iterator_range< df_iterator< T > > depth_first(const T &G)
Dependece between memory access instructions.