LLVM  13.0.0git
LoopFuse.cpp
Go to the documentation of this file.
1 //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements the loop fusion pass.
11 /// The implementation is largely based on the following document:
12 ///
13 /// Code Transformations to Augment the Scope of Loop Fusion in a
14 /// Production Compiler
15 /// Christopher Mark Barton
16 /// MSc Thesis
17 /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
18 ///
19 /// The general approach taken is to collect sets of control flow equivalent
20 /// loops and test whether they can be fused. The necessary conditions for
21 /// fusion are:
22 /// 1. The loops must be adjacent (there cannot be any statements between
23 /// the two loops).
24 /// 2. The loops must be conforming (they must execute the same number of
25 /// iterations).
26 /// 3. The loops must be control flow equivalent (if one loop executes, the
27 /// other is guaranteed to execute).
28 /// 4. There cannot be any negative distance dependencies between the loops.
29 /// If all of these conditions are satisfied, it is safe to fuse the loops.
30 ///
31 /// This implementation creates FusionCandidates that represent the loop and the
32 /// necessary information needed by fusion. It then operates on the fusion
33 /// candidates, first confirming that the candidate is eligible for fusion. The
34 /// candidates are then collected into control flow equivalent sets, sorted in
35 /// dominance order. Each set of control flow equivalent candidates is then
36 /// traversed, attempting to fuse pairs of candidates in the set. If all
37 /// requirements for fusion are met, the two candidates are fused, creating a
38 /// new (fused) candidate which is then added back into the set to consider for
39 /// additional fusion.
40 ///
41 /// This implementation currently does not make any modifications to remove
42 /// conditions for fusion. Code transformations to make loops conform to each of
43 /// the conditions for fusion are discussed in more detail in the document
44 /// above. These can be added to the current implementation in the future.
45 //===----------------------------------------------------------------------===//
46 
48 #include "llvm/ADT/Statistic.h"
52 #include "llvm/Analysis/LoopInfo.h"
58 #include "llvm/IR/Function.h"
59 #include "llvm/IR/Verifier.h"
60 #include "llvm/InitializePasses.h"
61 #include "llvm/Pass.h"
63 #include "llvm/Support/Debug.h"
65 #include "llvm/Transforms/Scalar.h"
66 #include "llvm/Transforms/Utils.h"
70 
71 using namespace llvm;
72 
73 #define DEBUG_TYPE "loop-fusion"
74 
75 STATISTIC(FuseCounter, "Loops fused");
76 STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
77 STATISTIC(InvalidPreheader, "Loop has invalid preheader");
78 STATISTIC(InvalidHeader, "Loop has invalid header");
79 STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");
80 STATISTIC(InvalidExitBlock, "Loop has invalid exit block");
81 STATISTIC(InvalidLatch, "Loop has invalid latch");
82 STATISTIC(InvalidLoop, "Loop is invalid");
83 STATISTIC(AddressTakenBB, "Basic block has address taken");
84 STATISTIC(MayThrowException, "Loop may throw an exception");
85 STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
86 STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
87 STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
88 STATISTIC(UnknownTripCount, "Loop has unknown trip count");
89 STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
90 STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
91 STATISTIC(NonAdjacent, "Loops are not adjacent");
92 STATISTIC(
93  NonEmptyPreheader,
94  "Loop has a non-empty preheader with instructions that cannot be moved");
95 STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
96 STATISTIC(NonIdenticalGuards, "Candidates have different guards");
97 STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "
98  "instructions that cannot be moved");
99 STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
100  "instructions that cannot be moved");
101 STATISTIC(NotRotated, "Candidate is not rotated");
102 STATISTIC(OnlySecondCandidateIsGuarded,
103  "The second candidate is guarded while the first one is not");
104 
109 };
110 
112  "loop-fusion-dependence-analysis",
113  cl::desc("Which dependence analysis should loop fusion use?"),
115  "Use the scalar evolution interface"),
117  "Use the dependence analysis interface"),
119  "Use all available analyses")),
121 
123  "loop-fusion-peel-max-count", cl::init(0), cl::Hidden,
124  cl::desc("Max number of iterations to be peeled from a loop, such that "
125  "fusion can take place"));
126 
127 #ifndef NDEBUG
128 static cl::opt<bool>
129  VerboseFusionDebugging("loop-fusion-verbose-debug",
130  cl::desc("Enable verbose debugging for Loop Fusion"),
132 #endif
133 
134 namespace {
135 /// This class is used to represent a candidate for loop fusion. When it is
136 /// constructed, it checks the conditions for loop fusion to ensure that it
137 /// represents a valid candidate. It caches several parts of a loop that are
138 /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
139 /// of continually querying the underlying Loop to retrieve these values. It is
140 /// assumed these will not change throughout loop fusion.
141 ///
142 /// The invalidate method should be used to indicate that the FusionCandidate is
143 /// no longer a valid candidate for fusion. Similarly, the isValid() method can
144 /// be used to ensure that the FusionCandidate is still valid for fusion.
145 struct FusionCandidate {
146  /// Cache of parts of the loop used throughout loop fusion. These should not
147  /// need to change throughout the analysis and transformation.
148  /// These parts are cached to avoid repeatedly looking up in the Loop class.
149 
150  /// Preheader of the loop this candidate represents
151  BasicBlock *Preheader;
152  /// Header of the loop this candidate represents
153  BasicBlock *Header;
154  /// Blocks in the loop that exit the loop
155  BasicBlock *ExitingBlock;
156  /// The successor block of this loop (where the exiting blocks go to)
157  BasicBlock *ExitBlock;
158  /// Latch of the loop
159  BasicBlock *Latch;
160  /// The loop that this fusion candidate represents
161  Loop *L;
162  /// Vector of instructions in this loop that read from memory
164  /// Vector of instructions in this loop that write to memory
166  /// Are all of the members of this fusion candidate still valid
167  bool Valid;
168  /// Guard branch of the loop, if it exists
169  BranchInst *GuardBranch;
170  /// Peeling Paramaters of the Loop.
172  /// Can you Peel this Loop?
173  bool AbleToPeel;
174  /// Has this loop been Peeled
175  bool Peeled;
176 
177  /// Dominator and PostDominator trees are needed for the
178  /// FusionCandidateCompare function, required by FusionCandidateSet to
179  /// determine where the FusionCandidate should be inserted into the set. These
180  /// are used to establish ordering of the FusionCandidates based on dominance.
181  const DominatorTree *DT;
182  const PostDominatorTree *PDT;
183 
185 
186  FusionCandidate(Loop *L, const DominatorTree *DT,
189  : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
190  ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
191  Latch(L->getLoopLatch()), L(L), Valid(true),
192  GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)),
193  Peeled(false), DT(DT), PDT(PDT), ORE(ORE) {
194 
195  // Walk over all blocks in the loop and check for conditions that may
196  // prevent fusion. For each block, walk over all instructions and collect
197  // the memory reads and writes If any instructions that prevent fusion are
198  // found, invalidate this object and return.
199  for (BasicBlock *BB : L->blocks()) {
200  if (BB->hasAddressTaken()) {
201  invalidate();
202  reportInvalidCandidate(AddressTakenBB);
203  return;
204  }
205 
206  for (Instruction &I : *BB) {
207  if (I.mayThrow()) {
208  invalidate();
209  reportInvalidCandidate(MayThrowException);
210  return;
211  }
212  if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
213  if (SI->isVolatile()) {
214  invalidate();
215  reportInvalidCandidate(ContainsVolatileAccess);
216  return;
217  }
218  }
219  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
220  if (LI->isVolatile()) {
221  invalidate();
222  reportInvalidCandidate(ContainsVolatileAccess);
223  return;
224  }
225  }
226  if (I.mayWriteToMemory())
227  MemWrites.push_back(&I);
228  if (I.mayReadFromMemory())
229  MemReads.push_back(&I);
230  }
231  }
232  }
233 
234  /// Check if all members of the class are valid.
235  bool isValid() const {
236  return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&
237  !L->isInvalid() && Valid;
238  }
239 
240  /// Verify that all members are in sync with the Loop object.
241  void verify() const {
242  assert(isValid() && "Candidate is not valid!!");
243  assert(!L->isInvalid() && "Loop is invalid!");
244  assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");
245  assert(Header == L->getHeader() && "Header is out of sync");
246  assert(ExitingBlock == L->getExitingBlock() &&
247  "Exiting Blocks is out of sync");
248  assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");
249  assert(Latch == L->getLoopLatch() && "Latch is out of sync");
250  }
251 
252  /// Get the entry block for this fusion candidate.
253  ///
254  /// If this fusion candidate represents a guarded loop, the entry block is the
255  /// loop guard block. If it represents an unguarded loop, the entry block is
256  /// the preheader of the loop.
257  BasicBlock *getEntryBlock() const {
258  if (GuardBranch)
259  return GuardBranch->getParent();
260  else
261  return Preheader;
262  }
263 
264  /// After Peeling the loop is modified quite a bit, hence all of the Blocks
265  /// need to be updated accordingly.
266  void updateAfterPeeling() {
267  Preheader = L->getLoopPreheader();
268  Header = L->getHeader();
269  ExitingBlock = L->getExitingBlock();
270  ExitBlock = L->getExitBlock();
271  Latch = L->getLoopLatch();
272  verify();
273  }
274 
275  /// Given a guarded loop, get the successor of the guard that is not in the
276  /// loop.
277  ///
278  /// This method returns the successor of the loop guard that is not located
279  /// within the loop (i.e., the successor of the guard that is not the
280  /// preheader).
281  /// This method is only valid for guarded loops.
282  BasicBlock *getNonLoopBlock() const {
283  assert(GuardBranch && "Only valid on guarded loops.");
284  assert(GuardBranch->isConditional() &&
285  "Expecting guard to be a conditional branch.");
286  if (Peeled)
287  return GuardBranch->getSuccessor(1);
288  return (GuardBranch->getSuccessor(0) == Preheader)
289  ? GuardBranch->getSuccessor(1)
290  : GuardBranch->getSuccessor(0);
291  }
292 
293 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
294  LLVM_DUMP_METHOD void dump() const {
295  dbgs() << "\tGuardBranch: ";
296  if (GuardBranch)
297  dbgs() << *GuardBranch;
298  else
299  dbgs() << "nullptr";
300  dbgs() << "\n"
301  << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
302  << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
303  << "\n"
304  << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
305  << "\tExitingBB: "
306  << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
307  << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
308  << "\n"
309  << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
310  << "\tEntryBlock: "
311  << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
312  << "\n";
313  }
314 #endif
315 
316  /// Determine if a fusion candidate (representing a loop) is eligible for
317  /// fusion. Note that this only checks whether a single loop can be fused - it
318  /// does not check whether it is *legal* to fuse two loops together.
319  bool isEligibleForFusion(ScalarEvolution &SE) const {
320  if (!isValid()) {
321  LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
322  if (!Preheader)
323  ++InvalidPreheader;
324  if (!Header)
325  ++InvalidHeader;
326  if (!ExitingBlock)
327  ++InvalidExitingBlock;
328  if (!ExitBlock)
329  ++InvalidExitBlock;
330  if (!Latch)
331  ++InvalidLatch;
332  if (L->isInvalid())
333  ++InvalidLoop;
334 
335  return false;
336  }
337 
338  // Require ScalarEvolution to be able to determine a trip count.
340  LLVM_DEBUG(dbgs() << "Loop " << L->getName()
341  << " trip count not computable!\n");
342  return reportInvalidCandidate(UnknownTripCount);
343  }
344 
345  if (!L->isLoopSimplifyForm()) {
346  LLVM_DEBUG(dbgs() << "Loop " << L->getName()
347  << " is not in simplified form!\n");
348  return reportInvalidCandidate(NotSimplifiedForm);
349  }
350 
351  if (!L->isRotatedForm()) {
352  LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n");
353  return reportInvalidCandidate(NotRotated);
354  }
355 
356  return true;
357  }
358 
359 private:
360  // This is only used internally for now, to clear the MemWrites and MemReads
361  // list and setting Valid to false. I can't envision other uses of this right
362  // now, since once FusionCandidates are put into the FusionCandidateSet they
363  // are immutable. Thus, any time we need to change/update a FusionCandidate,
364  // we must create a new one and insert it into the FusionCandidateSet to
365  // ensure the FusionCandidateSet remains ordered correctly.
366  void invalidate() {
367  MemWrites.clear();
368  MemReads.clear();
369  Valid = false;
370  }
371 
372  bool reportInvalidCandidate(llvm::Statistic &Stat) const {
373  using namespace ore;
374  assert(L && Preheader && "Fusion candidate not initialized properly!");
375 #if LLVM_ENABLE_STATS
376  ++Stat;
378  L->getStartLoc(), Preheader)
379  << "[" << Preheader->getParent()->getName() << "]: "
380  << "Loop is not a candidate for fusion: " << Stat.getDesc());
381 #endif
382  return false;
383  }
384 };
385 
386 struct FusionCandidateCompare {
387  /// Comparison functor to sort two Control Flow Equivalent fusion candidates
388  /// into dominance order.
389  /// If LHS dominates RHS and RHS post-dominates LHS, return true;
390  /// IF RHS dominates LHS and LHS post-dominates RHS, return false;
391  bool operator()(const FusionCandidate &LHS,
392  const FusionCandidate &RHS) const {
393  const DominatorTree *DT = LHS.DT;
394 
395  BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
396  BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
397 
398  // Do not save PDT to local variable as it is only used in asserts and thus
399  // will trigger an unused variable warning if building without asserts.
400  assert(DT && LHS.PDT && "Expecting valid dominator tree");
401 
402  // Do this compare first so if LHS == RHS, function returns false.
403  if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
404  // RHS dominates LHS
405  // Verify LHS post-dominates RHS
406  assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
407  return false;
408  }
409 
410  if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
411  // Verify RHS Postdominates LHS
412  assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
413  return true;
414  }
415 
416  // If LHS does not dominate RHS and RHS does not dominate LHS then there is
417  // no dominance relationship between the two FusionCandidates. Thus, they
418  // should not be in the same set together.
420  "No dominance relationship between these fusion candidates!");
421  }
422 };
423 
424 using LoopVector = SmallVector<Loop *, 4>;
425 
426 // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
427 // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
428 // dominates FC1 and FC1 post-dominates FC0.
429 // std::set was chosen because we want a sorted data structure with stable
430 // iterators. A subsequent patch to loop fusion will enable fusing non-ajdacent
431 // loops by moving intervening code around. When this intervening code contains
432 // loops, those loops will be moved also. The corresponding FusionCandidates
433 // will also need to be moved accordingly. As this is done, having stable
434 // iterators will simplify the logic. Similarly, having an efficient insert that
435 // keeps the FusionCandidateSet sorted will also simplify the implementation.
436 using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
437 using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
438 
439 #if !defined(NDEBUG)
441  const FusionCandidate &FC) {
442  if (FC.isValid())
443  OS << FC.Preheader->getName();
444  else
445  OS << "<Invalid>";
446 
447  return OS;
448 }
449 
451  const FusionCandidateSet &CandSet) {
452  for (const FusionCandidate &FC : CandSet)
453  OS << FC << '\n';
454 
455  return OS;
456 }
457 
458 static void
459 printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
460  dbgs() << "Fusion Candidates: \n";
461  for (const auto &CandidateSet : FusionCandidates) {
462  dbgs() << "*** Fusion Candidate Set ***\n";
463  dbgs() << CandidateSet;
464  dbgs() << "****************************\n";
465  }
466 }
467 #endif
468 
469 /// Collect all loops in function at the same nest level, starting at the
470 /// outermost level.
471 ///
472 /// This data structure collects all loops at the same nest level for a
473 /// given function (specified by the LoopInfo object). It starts at the
474 /// outermost level.
475 struct LoopDepthTree {
476  using LoopsOnLevelTy = SmallVector<LoopVector, 4>;
477  using iterator = LoopsOnLevelTy::iterator;
478  using const_iterator = LoopsOnLevelTy::const_iterator;
479 
480  LoopDepthTree(LoopInfo &LI) : Depth(1) {
481  if (!LI.empty())
482  LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend()));
483  }
484 
485  /// Test whether a given loop has been removed from the function, and thus is
486  /// no longer valid.
487  bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); }
488 
489  /// Record that a given loop has been removed from the function and is no
490  /// longer valid.
491  void removeLoop(const Loop *L) { RemovedLoops.insert(L); }
492 
493  /// Descend the tree to the next (inner) nesting level
494  void descend() {
495  LoopsOnLevelTy LoopsOnNextLevel;
496 
497  for (const LoopVector &LV : *this)
498  for (Loop *L : LV)
499  if (!isRemovedLoop(L) && L->begin() != L->end())
500  LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end()));
501 
502  LoopsOnLevel = LoopsOnNextLevel;
503  RemovedLoops.clear();
504  Depth++;
505  }
506 
507  bool empty() const { return size() == 0; }
508  size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }
509  unsigned getDepth() const { return Depth; }
510 
511  iterator begin() { return LoopsOnLevel.begin(); }
512  iterator end() { return LoopsOnLevel.end(); }
513  const_iterator begin() const { return LoopsOnLevel.begin(); }
514  const_iterator end() const { return LoopsOnLevel.end(); }
515 
516 private:
517  /// Set of loops that have been removed from the function and are no longer
518  /// valid.
519  SmallPtrSet<const Loop *, 8> RemovedLoops;
520 
521  /// Depth of the current level, starting at 1 (outermost loops).
522  unsigned Depth;
523 
524  /// Vector of loops at the current depth level that have the same parent loop
525  LoopsOnLevelTy LoopsOnLevel;
526 };
527 
528 #ifndef NDEBUG
529 static void printLoopVector(const LoopVector &LV) {
530  dbgs() << "****************************\n";
531  for (auto L : LV)
532  printLoop(*L, dbgs());
533  dbgs() << "****************************\n";
534 }
535 #endif
536 
537 struct LoopFuser {
538 private:
539  // Sets of control flow equivalent fusion candidates for a given nest level.
540  FusionCandidateCollection FusionCandidates;
541 
542  LoopDepthTree LDT;
543  DomTreeUpdater DTU;
544 
545  LoopInfo &LI;
546  DominatorTree &DT;
547  DependenceInfo &DI;
548  ScalarEvolution &SE;
549  PostDominatorTree &PDT;
551  AssumptionCache &AC;
552 
553  const TargetTransformInfo &TTI;
554 
555 public:
556  LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
560  : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
561  DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
562 
563  /// This is the main entry point for loop fusion. It will traverse the
564  /// specified function and collect candidate loops to fuse, starting at the
565  /// outermost nesting level and working inwards.
566  bool fuseLoops(Function &F) {
567 #ifndef NDEBUG
569  LI.print(dbgs());
570  }
571 #endif
572 
573  LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()
574  << "\n");
575  bool Changed = false;
576 
577  while (!LDT.empty()) {
578  LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "
579  << LDT.getDepth() << "\n";);
580 
581  for (const LoopVector &LV : LDT) {
582  assert(LV.size() > 0 && "Empty loop set was build!");
583 
584  // Skip singleton loop sets as they do not offer fusion opportunities on
585  // this level.
586  if (LV.size() == 1)
587  continue;
588 #ifndef NDEBUG
590  LLVM_DEBUG({
591  dbgs() << " Visit loop set (#" << LV.size() << "):\n";
592  printLoopVector(LV);
593  });
594  }
595 #endif
596 
597  collectFusionCandidates(LV);
598  Changed |= fuseCandidates();
599  }
600 
601  // Finished analyzing candidates at this level.
602  // Descend to the next level and clear all of the candidates currently
603  // collected. Note that it will not be possible to fuse any of the
604  // existing candidates with new candidates because the new candidates will
605  // be at a different nest level and thus not be control flow equivalent
606  // with all of the candidates collected so far.
607  LLVM_DEBUG(dbgs() << "Descend one level!\n");
608  LDT.descend();
609  FusionCandidates.clear();
610  }
611 
612  if (Changed)
613  LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););
614 
615 #ifndef NDEBUG
616  assert(DT.verify());
617  assert(PDT.verify());
618  LI.verify(DT);
619  SE.verify();
620 #endif
621 
622  LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
623  return Changed;
624  }
625 
626 private:
627  /// Determine if two fusion candidates are control flow equivalent.
628  ///
629  /// Two fusion candidates are control flow equivalent if when one executes,
630  /// the other is guaranteed to execute. This is determined using dominators
631  /// and post-dominators: if A dominates B and B post-dominates A then A and B
632  /// are control-flow equivalent.
633  bool isControlFlowEquivalent(const FusionCandidate &FC0,
634  const FusionCandidate &FC1) const {
635  assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
636 
637  return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(),
638  DT, PDT);
639  }
640 
641  /// Iterate over all loops in the given loop set and identify the loops that
642  /// are eligible for fusion. Place all eligible fusion candidates into Control
643  /// Flow Equivalent sets, sorted by dominance.
644  void collectFusionCandidates(const LoopVector &LV) {
645  for (Loop *L : LV) {
648  FusionCandidate CurrCand(L, &DT, &PDT, ORE, PP);
649  if (!CurrCand.isEligibleForFusion(SE))
650  continue;
651 
652  // Go through each list in FusionCandidates and determine if L is control
653  // flow equivalent with the first loop in that list. If it is, append LV.
654  // If not, go to the next list.
655  // If no suitable list is found, start another list and add it to
656  // FusionCandidates.
657  bool FoundSet = false;
658 
659  for (auto &CurrCandSet : FusionCandidates) {
660  if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) {
661  CurrCandSet.insert(CurrCand);
662  FoundSet = true;
663 #ifndef NDEBUG
665  LLVM_DEBUG(dbgs() << "Adding " << CurrCand
666  << " to existing candidate set\n");
667 #endif
668  break;
669  }
670  }
671  if (!FoundSet) {
672  // No set was found. Create a new set and add to FusionCandidates
673 #ifndef NDEBUG
675  LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");
676 #endif
677  FusionCandidateSet NewCandSet;
678  NewCandSet.insert(CurrCand);
679  FusionCandidates.push_back(NewCandSet);
680  }
681  NumFusionCandidates++;
682  }
683  }
684 
685  /// Determine if it is beneficial to fuse two loops.
686  ///
687  /// For now, this method simply returns true because we want to fuse as much
688  /// as possible (primarily to test the pass). This method will evolve, over
689  /// time, to add heuristics for profitability of fusion.
690  bool isBeneficialFusion(const FusionCandidate &FC0,
691  const FusionCandidate &FC1) {
692  return true;
693  }
694 
695  /// Determine if two fusion candidates have the same trip count (i.e., they
696  /// execute the same number of iterations).
697  ///
698  /// This function will return a pair of values. The first is a boolean,
699  /// stating whether or not the two candidates are known at compile time to
700  /// have the same TripCount. The second is the difference in the two
701  /// TripCounts. This information can be used later to determine whether or not
702  /// peeling can be performed on either one of the candiates.
703  std::pair<bool, Optional<unsigned>>
704  haveIdenticalTripCounts(const FusionCandidate &FC0,
705  const FusionCandidate &FC1) const {
706 
707  const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
708  if (isa<SCEVCouldNotCompute>(TripCount0)) {
709  UncomputableTripCount++;
710  LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
711  return {false, None};
712  }
713 
714  const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
715  if (isa<SCEVCouldNotCompute>(TripCount1)) {
716  UncomputableTripCount++;
717  LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
718  return {false, None};
719  }
720 
721  LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
722  << *TripCount1 << " are "
723  << (TripCount0 == TripCount1 ? "identical" : "different")
724  << "\n");
725 
726  if (TripCount0 == TripCount1)
727  return {true, 0};
728 
729  LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, "
730  "determining the difference between trip counts\n");
731 
732  // Currently only considering loops with a single exit point
733  // and a non-constant trip count.
734  const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L);
735  const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L);
736 
737  // If any of the tripcounts are zero that means that loop(s) do not have
738  // a single exit or a constant tripcount.
739  if (TC0 == 0 || TC1 == 0) {
740  LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not "
741  "have a constant number of iterations. Peeling "
742  "is not benefical\n");
743  return {false, None};
744  }
745 
746  Optional<unsigned> Difference = None;
747  int Diff = TC0 - TC1;
748 
749  if (Diff > 0)
750  Difference = Diff;
751  else {
752  LLVM_DEBUG(
753  dbgs() << "Difference is less than 0. FC1 (second loop) has more "
754  "iterations than the first one. Currently not supported\n");
755  }
756 
757  LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference
758  << "\n");
759 
760  return {false, Difference};
761  }
762 
763  void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1,
764  unsigned PeelCount) {
765  assert(FC0.AbleToPeel && "Should be able to peel loop");
766 
767  LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount
768  << " iterations of the first loop. \n");
769 
770  FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, &DT, &AC, true);
771  if (FC0.Peeled) {
772  LLVM_DEBUG(dbgs() << "Done Peeling\n");
773 
774 #ifndef NDEBUG
775  auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1);
776 
777  assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 &&
778  "Loops should have identical trip counts after peeling");
779 #endif
780 
781  FC0.PP.PeelCount += PeelCount;
782 
783  // Peeling does not update the PDT
784  PDT.recalculate(*FC0.Preheader->getParent());
785 
786  FC0.updateAfterPeeling();
787 
788  // In this case the iterations of the loop are constant, so the first
789  // loop will execute completely (will not jump from one of
790  // the peeled blocks to the second loop). Here we are updating the
791  // branch conditions of each of the peeled blocks, such that it will
792  // branch to its successor which is not the preheader of the second loop
793  // in the case of unguarded loops, or the succesors of the exit block of
794  // the first loop otherwise. Doing this update will ensure that the entry
795  // block of the first loop dominates the entry block of the second loop.
796  BasicBlock *BB =
797  FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader;
798  if (BB) {
801  for (BasicBlock *Pred : predecessors(BB)) {
802  if (Pred != FC0.ExitBlock) {
803  WorkList.emplace_back(Pred->getTerminator());
804  TreeUpdates.emplace_back(
806  }
807  }
808  // Cannot modify the predecessors inside the above loop as it will cause
809  // the iterators to be nullptrs, causing memory errors.
810  for (Instruction *CurrentBranch: WorkList) {
811  BasicBlock *Succ = CurrentBranch->getSuccessor(0);
812  if (Succ == BB)
813  Succ = CurrentBranch->getSuccessor(1);
814  ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ));
815  }
816 
817  DTU.applyUpdates(TreeUpdates);
818  DTU.flush();
819  }
820  LLVM_DEBUG(
821  dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount
822  << " iterations from the first loop.\n"
823  "Both Loops have the same number of iterations now.\n");
824  }
825  }
826 
827  /// Walk each set of control flow equivalent fusion candidates and attempt to
828  /// fuse them. This does a single linear traversal of all candidates in the
829  /// set. The conditions for legal fusion are checked at this point. If a pair
830  /// of fusion candidates passes all legality checks, they are fused together
831  /// and a new fusion candidate is created and added to the FusionCandidateSet.
832  /// The original fusion candidates are then removed, as they are no longer
833  /// valid.
834  bool fuseCandidates() {
835  bool Fused = false;
836  LLVM_DEBUG(printFusionCandidates(FusionCandidates));
837  for (auto &CandidateSet : FusionCandidates) {
838  if (CandidateSet.size() < 2)
839  continue;
840 
841  LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
842  << CandidateSet << "\n");
843 
844  for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {
845  assert(!LDT.isRemovedLoop(FC0->L) &&
846  "Should not have removed loops in CandidateSet!");
847  auto FC1 = FC0;
848  for (++FC1; FC1 != CandidateSet.end(); ++FC1) {
849  assert(!LDT.isRemovedLoop(FC1->L) &&
850  "Should not have removed loops in CandidateSet!");
851 
852  LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();
853  dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");
854 
855  FC0->verify();
856  FC1->verify();
857 
858  // Check if the candidates have identical tripcounts (first value of
859  // pair), and if not check the difference in the tripcounts between
860  // the loops (second value of pair). The difference is not equal to
861  // None iff the loops iterate a constant number of times, and have a
862  // single exit.
863  std::pair<bool, Optional<unsigned>> IdenticalTripCountRes =
864  haveIdenticalTripCounts(*FC0, *FC1);
865  bool SameTripCount = IdenticalTripCountRes.first;
866  Optional<unsigned> TCDifference = IdenticalTripCountRes.second;
867 
868  // Here we are checking that FC0 (the first loop) can be peeled, and
869  // both loops have different tripcounts.
870  if (FC0->AbleToPeel && !SameTripCount && TCDifference) {
871  if (*TCDifference > FusionPeelMaxCount) {
872  LLVM_DEBUG(dbgs()
873  << "Difference in loop trip counts: " << *TCDifference
874  << " is greater than maximum peel count specificed: "
875  << FusionPeelMaxCount << "\n");
876  } else {
877  // Dependent on peeling being performed on the first loop, and
878  // assuming all other conditions for fusion return true.
879  SameTripCount = true;
880  }
881  }
882 
883  if (!SameTripCount) {
884  LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
885  "counts. Not fusing.\n");
886  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
887  NonEqualTripCount);
888  continue;
889  }
890 
891  if (!isAdjacent(*FC0, *FC1)) {
892  LLVM_DEBUG(dbgs()
893  << "Fusion candidates are not adjacent. Not fusing.\n");
894  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
895  continue;
896  }
897 
898  if (!FC0->GuardBranch && FC1->GuardBranch) {
899  LLVM_DEBUG(dbgs() << "The second candidate is guarded while the "
900  "first one is not. Not fusing.\n");
901  reportLoopFusion<OptimizationRemarkMissed>(
902  *FC0, *FC1, OnlySecondCandidateIsGuarded);
903  continue;
904  }
905 
906  // Ensure that FC0 and FC1 have identical guards.
907  // If one (or both) are not guarded, this check is not necessary.
908  if (FC0->GuardBranch && FC1->GuardBranch &&
909  !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) {
910  LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
911  "guards. Not Fusing.\n");
912  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
913  NonIdenticalGuards);
914  continue;
915  }
916 
917  if (!isSafeToMoveBefore(*FC1->Preheader,
918  *FC0->Preheader->getTerminator(), DT, &PDT,
919  &DI)) {
920  LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
921  "instructions in preheader. Not fusing.\n");
922  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
923  NonEmptyPreheader);
924  continue;
925  }
926 
927  if (FC0->GuardBranch) {
928  assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
929 
930  if (!isSafeToMoveBefore(*FC0->ExitBlock,
931  *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT,
932  &PDT, &DI)) {
933  LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
934  "instructions in exit block. Not fusing.\n");
935  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
936  NonEmptyExitBlock);
937  continue;
938  }
939 
940  if (!isSafeToMoveBefore(
941  *FC1->GuardBranch->getParent(),
942  *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT,
943  &DI)) {
944  LLVM_DEBUG(dbgs()
945  << "Fusion candidate contains unsafe "
946  "instructions in guard block. Not fusing.\n");
947  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
948  NonEmptyGuardBlock);
949  continue;
950  }
951  }
952 
953  // Check the dependencies across the loops and do not fuse if it would
954  // violate them.
955  if (!dependencesAllowFusion(*FC0, *FC1)) {
956  LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
957  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
958  InvalidDependencies);
959  continue;
960  }
961 
962  bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
963  LLVM_DEBUG(dbgs()
964  << "\tFusion appears to be "
965  << (BeneficialToFuse ? "" : "un") << "profitable!\n");
966  if (!BeneficialToFuse) {
967  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
968  FusionNotBeneficial);
969  continue;
970  }
971  // All analysis has completed and has determined that fusion is legal
972  // and profitable. At this point, start transforming the code and
973  // perform fusion.
974 
975  LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
976  << *FC1 << "\n");
977 
978  FusionCandidate FC0Copy = *FC0;
979  // Peel the loop after determining that fusion is legal. The Loops
980  // will still be safe to fuse after the peeling is performed.
981  bool Peel = TCDifference && *TCDifference > 0;
982  if (Peel)
983  peelFusionCandidate(FC0Copy, *FC1, *TCDifference);
984 
985  // Report fusion to the Optimization Remarks.
986  // Note this needs to be done *before* performFusion because
987  // performFusion will change the original loops, making it not
988  // possible to identify them after fusion is complete.
989  reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1,
990  FuseCounter);
991 
992  FusionCandidate FusedCand(
993  performFusion((Peel ? FC0Copy : *FC0), *FC1), &DT, &PDT, ORE,
994  FC0Copy.PP);
995  FusedCand.verify();
996  assert(FusedCand.isEligibleForFusion(SE) &&
997  "Fused candidate should be eligible for fusion!");
998 
999  // Notify the loop-depth-tree that these loops are not valid objects
1000  LDT.removeLoop(FC1->L);
1001 
1002  CandidateSet.erase(FC0);
1003  CandidateSet.erase(FC1);
1004 
1005  auto InsertPos = CandidateSet.insert(FusedCand);
1006 
1007  assert(InsertPos.second &&
1008  "Unable to insert TargetCandidate in CandidateSet!");
1009 
1010  // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations
1011  // of the FC1 loop will attempt to fuse the new (fused) loop with the
1012  // remaining candidates in the current candidate set.
1013  FC0 = FC1 = InsertPos.first;
1014 
1015  LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
1016  << "\n");
1017 
1018  Fused = true;
1019  }
1020  }
1021  }
1022  return Fused;
1023  }
1024 
1025  /// Rewrite all additive recurrences in a SCEV to use a new loop.
1026  class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
1027  public:
1028  AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,
1029  bool UseMax = true)
1030  : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL),
1031  NewL(NewL) {}
1032 
1033  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
1034  const Loop *ExprL = Expr->getLoop();
1036  if (ExprL == &OldL) {
1037  Operands.append(Expr->op_begin(), Expr->op_end());
1038  return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());
1039  }
1040 
1041  if (OldL.contains(ExprL)) {
1042  bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE));
1043  if (!UseMax || !Pos || !Expr->isAffine()) {
1044  Valid = false;
1045  return Expr;
1046  }
1047  return visit(Expr->getStart());
1048  }
1049 
1050  for (const SCEV *Op : Expr->operands())
1051  Operands.push_back(visit(Op));
1052  return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags());
1053  }
1054 
1055  bool wasValidSCEV() const { return Valid; }
1056 
1057  private:
1058  bool Valid, UseMax;
1059  const Loop &OldL, &NewL;
1060  };
1061 
1062  /// Return false if the access functions of \p I0 and \p I1 could cause
1063  /// a negative dependence.
1064  bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,
1065  Instruction &I1, bool EqualIsInvalid) {
1066  Value *Ptr0 = getLoadStorePointerOperand(&I0);
1067  Value *Ptr1 = getLoadStorePointerOperand(&I1);
1068  if (!Ptr0 || !Ptr1)
1069  return false;
1070 
1071  const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0);
1072  const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1);
1073 #ifndef NDEBUG
1075  LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs "
1076  << *SCEVPtr1 << "\n");
1077 #endif
1078  AddRecLoopReplacer Rewriter(SE, L0, L1);
1079  SCEVPtr0 = Rewriter.visit(SCEVPtr0);
1080 #ifndef NDEBUG
1082  LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0
1083  << " [Valid: " << Rewriter.wasValidSCEV() << "]\n");
1084 #endif
1085  if (!Rewriter.wasValidSCEV())
1086  return false;
1087 
1088  // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by
1089  // L0) and the other is not. We could check if it is monotone and test
1090  // the beginning and end value instead.
1091 
1092  BasicBlock *L0Header = L0.getHeader();
1093  auto HasNonLinearDominanceRelation = [&](const SCEV *S) {
1094  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
1095  if (!AddRec)
1096  return false;
1097  return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) &&
1098  !DT.dominates(AddRec->getLoop()->getHeader(), L0Header);
1099  };
1100  if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation))
1101  return false;
1102 
1103  ICmpInst::Predicate Pred =
1104  EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;
1105  bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1);
1106 #ifndef NDEBUG
1108  LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0
1109  << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1
1110  << "\n");
1111 #endif
1112  return IsAlwaysGE;
1113  }
1114 
1115  /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
1116  /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
1117  /// specified by @p DepChoice are used to determine this.
1118  bool dependencesAllowFusion(const FusionCandidate &FC0,
1119  const FusionCandidate &FC1, Instruction &I0,
1120  Instruction &I1, bool AnyDep,
1121  FusionDependenceAnalysisChoice DepChoice) {
1122 #ifndef NDEBUG
1123  if (VerboseFusionDebugging) {
1124  LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "
1125  << DepChoice << "\n");
1126  }
1127 #endif
1128  switch (DepChoice) {
1130  return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep);
1132  auto DepResult = DI.depends(&I0, &I1, true);
1133  if (!DepResult)
1134  return true;
1135 #ifndef NDEBUG
1136  if (VerboseFusionDebugging) {
1137  LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());
1138  dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "
1139  << (DepResult->isOrdered() ? "true" : "false")
1140  << "]\n");
1141  LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()
1142  << "\n");
1143  }
1144 #endif
1145 
1146  if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
1147  LLVM_DEBUG(
1148  dbgs() << "TODO: Implement pred/succ dependence handling!\n");
1149 
1150  // TODO: Can we actually use the dependence info analysis here?
1151  return false;
1152  }
1153 
1155  return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
1157  dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
1159  }
1160 
1161  llvm_unreachable("Unknown fusion dependence analysis choice!");
1162  }
1163 
1164  /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
1165  bool dependencesAllowFusion(const FusionCandidate &FC0,
1166  const FusionCandidate &FC1) {
1167  LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
1168  << "\n");
1169  assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
1170  assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
1171 
1172  for (Instruction *WriteL0 : FC0.MemWrites) {
1173  for (Instruction *WriteL1 : FC1.MemWrites)
1174  if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
1175  /* AnyDep */ false,
1177  InvalidDependencies++;
1178  return false;
1179  }
1180  for (Instruction *ReadL1 : FC1.MemReads)
1181  if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1,
1182  /* AnyDep */ false,
1184  InvalidDependencies++;
1185  return false;
1186  }
1187  }
1188 
1189  for (Instruction *WriteL1 : FC1.MemWrites) {
1190  for (Instruction *WriteL0 : FC0.MemWrites)
1191  if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
1192  /* AnyDep */ false,
1194  InvalidDependencies++;
1195  return false;
1196  }
1197  for (Instruction *ReadL0 : FC0.MemReads)
1198  if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1,
1199  /* AnyDep */ false,
1201  InvalidDependencies++;
1202  return false;
1203  }
1204  }
1205 
1206  // Walk through all uses in FC1. For each use, find the reaching def. If the
1207  // def is located in FC0 then it is is not safe to fuse.
1208  for (BasicBlock *BB : FC1.L->blocks())
1209  for (Instruction &I : *BB)
1210  for (auto &Op : I.operands())
1211  if (Instruction *Def = dyn_cast<Instruction>(Op))
1212  if (FC0.L->contains(Def->getParent())) {
1213  InvalidDependencies++;
1214  return false;
1215  }
1216 
1217  return true;
1218  }
1219 
1220  /// Determine if two fusion candidates are adjacent in the CFG.
1221  ///
1222  /// This method will determine if there are additional basic blocks in the CFG
1223  /// between the exit of \p FC0 and the entry of \p FC1.
1224  /// If the two candidates are guarded loops, then it checks whether the
1225  /// non-loop successor of the \p FC0 guard branch is the entry block of \p
1226  /// FC1. If not, then the loops are not adjacent. If the two candidates are
1227  /// not guarded loops, then it checks whether the exit block of \p FC0 is the
1228  /// preheader of \p FC1.
1229  bool isAdjacent(const FusionCandidate &FC0,
1230  const FusionCandidate &FC1) const {
1231  // If the successor of the guard branch is FC1, then the loops are adjacent
1232  if (FC0.GuardBranch)
1233  return FC0.getNonLoopBlock() == FC1.getEntryBlock();
1234  else
1235  return FC0.ExitBlock == FC1.getEntryBlock();
1236  }
1237 
1238  /// Determine if two fusion candidates have identical guards
1239  ///
1240  /// This method will determine if two fusion candidates have the same guards.
1241  /// The guards are considered the same if:
1242  /// 1. The instructions to compute the condition used in the compare are
1243  /// identical.
1244  /// 2. The successors of the guard have the same flow into/around the loop.
1245  /// If the compare instructions are identical, then the first successor of the
1246  /// guard must go to the same place (either the preheader of the loop or the
1247  /// NonLoopBlock). In other words, the the first successor of both loops must
1248  /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
1249  /// the NonLoopBlock). The same must be true for the second successor.
1250  bool haveIdenticalGuards(const FusionCandidate &FC0,
1251  const FusionCandidate &FC1) const {
1252  assert(FC0.GuardBranch && FC1.GuardBranch &&
1253  "Expecting FC0 and FC1 to be guarded loops.");
1254 
1255  if (auto FC0CmpInst =
1256  dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
1257  if (auto FC1CmpInst =
1258  dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
1259  if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
1260  return false;
1261 
1262  // The compare instructions are identical.
1263  // Now make sure the successor of the guards have the same flow into/around
1264  // the loop
1265  if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
1266  return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
1267  else
1268  return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
1269  }
1270 
1271  /// Modify the latch branch of FC to be unconditional since successors of the
1272  /// branch are the same.
1273  void simplifyLatchBranch(const FusionCandidate &FC) const {
1274  BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
1275  if (FCLatchBranch) {
1276  assert(FCLatchBranch->isConditional() &&
1277  FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
1278  "Expecting the two successors of FCLatchBranch to be the same");
1279  BranchInst *NewBranch =
1280  BranchInst::Create(FCLatchBranch->getSuccessor(0));
1281  ReplaceInstWithInst(FCLatchBranch, NewBranch);
1282  }
1283  }
1284 
1285  /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique
1286  /// successor, then merge FC0.Latch with its unique successor.
1287  void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {
1288  moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI);
1289  if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {
1290  MergeBlockIntoPredecessor(Succ, &DTU, &LI);
1291  DTU.flush();
1292  }
1293  }
1294 
1295  /// Fuse two fusion candidates, creating a new fused loop.
1296  ///
1297  /// This method contains the mechanics of fusing two loops, represented by \p
1298  /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
1299  /// postdominates \p FC0 (making them control flow equivalent). It also
1300  /// assumes that the other conditions for fusion have been met: adjacent,
1301  /// identical trip counts, and no negative distance dependencies exist that
1302  /// would prevent fusion. Thus, there is no checking for these conditions in
1303  /// this method.
1304  ///
1305  /// Fusion is performed by rewiring the CFG to update successor blocks of the
1306  /// components of tho loop. Specifically, the following changes are done:
1307  ///
1308  /// 1. The preheader of \p FC1 is removed as it is no longer necessary
1309  /// (because it is currently only a single statement block).
1310  /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1.
1311  /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0.
1312  /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0.
1313  ///
1314  /// All of these modifications are done with dominator tree updates, thus
1315  /// keeping the dominator (and post dominator) information up-to-date.
1316  ///
1317  /// This can be improved in the future by actually merging blocks during
1318  /// fusion. For example, the preheader of \p FC1 can be merged with the
1319  /// preheader of \p FC0. This would allow loops with more than a single
1320  /// statement in the preheader to be fused. Similarly, the latch blocks of the
1321  /// two loops could also be fused into a single block. This will require
1322  /// analysis to prove it is safe to move the contents of the block past
1323  /// existing code, which currently has not been implemented.
1324  Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) {
1325  assert(FC0.isValid() && FC1.isValid() &&
1326  "Expecting valid fusion candidates");
1327 
1328  LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
1329  dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
1330 
1331  // Move instructions from the preheader of FC1 to the end of the preheader
1332  // of FC0.
1333  moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI);
1334 
1335  // Fusing guarded loops is handled slightly differently than non-guarded
1336  // loops and has been broken out into a separate method instead of trying to
1337  // intersperse the logic within a single method.
1338  if (FC0.GuardBranch)
1339  return fuseGuardedLoops(FC0, FC1);
1340 
1341  assert(FC1.Preheader ==
1342  (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock));
1343  assert(FC1.Preheader->size() == 1 &&
1344  FC1.Preheader->getSingleSuccessor() == FC1.Header);
1345 
1346  // Remember the phi nodes originally in the header of FC0 in order to rewire
1347  // them later. However, this is only necessary if the new loop carried
1348  // values might not dominate the exiting branch. While we do not generally
1349  // test if this is the case but simply insert intermediate phi nodes, we
1350  // need to make sure these intermediate phi nodes have different
1351  // predecessors. To this end, we filter the special case where the exiting
1352  // block is the latch block of the first loop. Nothing needs to be done
1353  // anyway as all loop carried values dominate the latch and thereby also the
1354  // exiting branch.
1355  SmallVector<PHINode *, 8> OriginalFC0PHIs;
1356  if (FC0.ExitingBlock != FC0.Latch)
1357  for (PHINode &PHI : FC0.Header->phis())
1358  OriginalFC0PHIs.push_back(&PHI);
1359 
1360  // Replace incoming blocks for header PHIs first.
1361  FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
1362  FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
1363 
1364  // Then modify the control flow and update DT and PDT.
1366 
1367  // The old exiting block of the first loop (FC0) has to jump to the header
1368  // of the second as we need to execute the code in the second header block
1369  // regardless of the trip count. That is, if the trip count is 0, so the
1370  // back edge is never taken, we still have to execute both loop headers,
1371  // especially (but not only!) if the second is a do-while style loop.
1372  // However, doing so might invalidate the phi nodes of the first loop as
1373  // the new values do only need to dominate their latch and not the exiting
1374  // predicate. To remedy this potential problem we always introduce phi
1375  // nodes in the header of the second loop later that select the loop carried
1376  // value, if the second header was reached through an old latch of the
1377  // first, or undef otherwise. This is sound as exiting the first implies the
1378  // second will exit too, __without__ taking the back-edge. [Their
1379  // trip-counts are equal after all.
1380  // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go
1381  // to FC1.Header? I think this is basically what the three sequences are
1382  // trying to accomplish; however, doing this directly in the CFG may mean
1383  // the DT/PDT becomes invalid
1384  if (!FC0.Peeled) {
1385  FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
1386  FC1.Header);
1388  DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
1390  DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1391  } else {
1393  DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));
1394 
1395  // Remove the ExitBlock of the first Loop (also not needed)
1396  FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
1397  FC1.Header);
1399  DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
1400  FC0.ExitBlock->getTerminator()->eraseFromParent();
1402  DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1403  new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
1404  }
1405 
1406  // The pre-header of L1 is not necessary anymore.
1407  assert(pred_empty(FC1.Preheader));
1408  FC1.Preheader->getTerminator()->eraseFromParent();
1409  new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
1411  DominatorTree::Delete, FC1.Preheader, FC1.Header));
1412 
1413  // Moves the phi nodes from the second to the first loops header block.
1414  while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
1415  if (SE.isSCEVable(PHI->getType()))
1416  SE.forgetValue(PHI);
1417  if (PHI->hasNUsesOrMore(1))
1418  PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
1419  else
1420  PHI->eraseFromParent();
1421  }
1422 
1423  // Introduce new phi nodes in the second loop header to ensure
1424  // exiting the first and jumping to the header of the second does not break
1425  // the SSA property of the phis originally in the first loop. See also the
1426  // comment above.
1427  Instruction *L1HeaderIP = &FC1.Header->front();
1428  for (PHINode *LCPHI : OriginalFC0PHIs) {
1429  int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
1430  assert(L1LatchBBIdx >= 0 &&
1431  "Expected loop carried value to be rewired at this point!");
1432 
1433  Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
1434 
1435  PHINode *L1HeaderPHI = PHINode::Create(
1436  LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
1437  L1HeaderPHI->addIncoming(LCV, FC0.Latch);
1438  L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
1439  FC0.ExitingBlock);
1440 
1441  LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
1442  }
1443 
1444  // Replace latch terminator destinations.
1445  FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
1446  FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
1447 
1448  // Modify the latch branch of FC0 to be unconditional as both successors of
1449  // the branch are the same.
1450  simplifyLatchBranch(FC0);
1451 
1452  // If FC0.Latch and FC0.ExitingBlock are the same then we have already
1453  // performed the updates above.
1454  if (FC0.Latch != FC0.ExitingBlock)
1456  DominatorTree::Insert, FC0.Latch, FC1.Header));
1457 
1459  FC0.Latch, FC0.Header));
1461  FC1.Latch, FC0.Header));
1463  FC1.Latch, FC1.Header));
1464 
1465  // Update DT/PDT
1466  DTU.applyUpdates(TreeUpdates);
1467 
1468  LI.removeBlock(FC1.Preheader);
1469  DTU.deleteBB(FC1.Preheader);
1470  if (FC0.Peeled) {
1471  LI.removeBlock(FC0.ExitBlock);
1472  DTU.deleteBB(FC0.ExitBlock);
1473  }
1474 
1475  DTU.flush();
1476 
1477  // Is there a way to keep SE up-to-date so we don't need to forget the loops
1478  // and rebuild the information in subsequent passes of fusion?
1479  // Note: Need to forget the loops before merging the loop latches, as
1480  // mergeLatch may remove the only block in FC1.
1481  SE.forgetLoop(FC1.L);
1482  SE.forgetLoop(FC0.L);
1483 
1484  // Move instructions from FC0.Latch to FC1.Latch.
1485  // Note: mergeLatch requires an updated DT.
1486  mergeLatch(FC0, FC1);
1487 
1488  // Merge the loops.
1489  SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
1490  for (BasicBlock *BB : Blocks) {
1491  FC0.L->addBlockEntry(BB);
1492  FC1.L->removeBlockFromLoop(BB);
1493  if (LI.getLoopFor(BB) != FC1.L)
1494  continue;
1495  LI.changeLoopFor(BB, FC0.L);
1496  }
1497  while (!FC1.L->isInnermost()) {
1498  const auto &ChildLoopIt = FC1.L->begin();
1499  Loop *ChildLoop = *ChildLoopIt;
1500  FC1.L->removeChildLoop(ChildLoopIt);
1501  FC0.L->addChildLoop(ChildLoop);
1502  }
1503 
1504  // Delete the now empty loop L1.
1505  LI.erase(FC1.L);
1506 
1507 #ifndef NDEBUG
1508  assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
1510  assert(PDT.verify());
1511  LI.verify(DT);
1512  SE.verify();
1513 #endif
1514 
1515  LLVM_DEBUG(dbgs() << "Fusion done:\n");
1516 
1517  return FC0.L;
1518  }
1519 
1520  /// Report details on loop fusion opportunities.
1521  ///
1522  /// This template function can be used to report both successful and missed
1523  /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
1524  /// be one of:
1525  /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful
1526  /// given two valid fusion candidates.
1527  /// - OptimizationRemark to report successful fusion of two fusion
1528  /// candidates.
1529  /// The remarks will be printed using the form:
1530  /// <path/filename>:<line number>:<column number>: [<function name>]:
1531  /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
1532  template <typename RemarkKind>
1533  void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
1534  llvm::Statistic &Stat) {
1535  assert(FC0.Preheader && FC1.Preheader &&
1536  "Expecting valid fusion candidates");
1537  using namespace ore;
1538 #if LLVM_ENABLE_STATS
1539  ++Stat;
1540  ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
1541  FC0.Preheader)
1542  << "[" << FC0.Preheader->getParent()->getName()
1543  << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
1544  << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
1545  << ": " << Stat.getDesc());
1546 #endif
1547  }
1548 
1549  /// Fuse two guarded fusion candidates, creating a new fused loop.
1550  ///
1551  /// Fusing guarded loops is handled much the same way as fusing non-guarded
1552  /// loops. The rewiring of the CFG is slightly different though, because of
1553  /// the presence of the guards around the loops and the exit blocks after the
1554  /// loop body. As such, the new loop is rewired as follows:
1555  /// 1. Keep the guard branch from FC0 and use the non-loop block target
1556  /// from the FC1 guard branch.
1557  /// 2. Remove the exit block from FC0 (this exit block should be empty
1558  /// right now).
1559  /// 3. Remove the guard branch for FC1
1560  /// 4. Remove the preheader for FC1.
1561  /// The exit block successor for the latch of FC0 is updated to be the header
1562  /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
1563  /// be the header of FC0, thus creating the fused loop.
1564  Loop *fuseGuardedLoops(const FusionCandidate &FC0,
1565  const FusionCandidate &FC1) {
1566  assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
1567 
1568  BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
1569  BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
1570  BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
1571  BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
1572  BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor();
1573 
1574  // Move instructions from the exit block of FC0 to the beginning of the exit
1575  // block of FC1, in the case that the FC0 loop has not been peeled. In the
1576  // case that FC0 loop is peeled, then move the instructions of the successor
1577  // of the FC0 Exit block to the beginning of the exit block of FC1.
1579  (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock,
1580  DT, PDT, DI);
1581 
1582  // Move instructions from the guard block of FC1 to the end of the guard
1583  // block of FC0.
1584  moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI);
1585 
1586  assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
1587 
1589 
1590  ////////////////////////////////////////////////////////////////////////////
1591  // Update the Loop Guard
1592  ////////////////////////////////////////////////////////////////////////////
1593  // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
1594  // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
1595  // Thus, one path from the guard goes to the preheader for FC0 (and thus
1596  // executes the new fused loop) and the other path goes to the NonLoopBlock
1597  // for FC1 (where FC1 guard would have gone if FC1 was not executed).
1598  FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
1599  FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
1600 
1601  BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock;
1602  BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header);
1603 
1604  // The guard of FC1 is not necessary anymore.
1605  FC1.GuardBranch->eraseFromParent();
1606  new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
1607 
1609  DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
1611  DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
1613  DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
1615  DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
1616 
1617  if (FC0.Peeled) {
1618  // Remove the Block after the ExitBlock of FC0
1620  DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock));
1621  FC0ExitBlockSuccessor->getTerminator()->eraseFromParent();
1622  new UnreachableInst(FC0ExitBlockSuccessor->getContext(),
1623  FC0ExitBlockSuccessor);
1624  }
1625 
1626  assert(pred_empty(FC1GuardBlock) &&
1627  "Expecting guard block to have no predecessors");
1628  assert(succ_empty(FC1GuardBlock) &&
1629  "Expecting guard block to have no successors");
1630 
1631  // Remember the phi nodes originally in the header of FC0 in order to rewire
1632  // them later. However, this is only necessary if the new loop carried
1633  // values might not dominate the exiting branch. While we do not generally
1634  // test if this is the case but simply insert intermediate phi nodes, we
1635  // need to make sure these intermediate phi nodes have different
1636  // predecessors. To this end, we filter the special case where the exiting
1637  // block is the latch block of the first loop. Nothing needs to be done
1638  // anyway as all loop carried values dominate the latch and thereby also the
1639  // exiting branch.
1640  // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
1641  // (because the loops are rotated. Thus, nothing will ever be added to
1642  // OriginalFC0PHIs.
1643  SmallVector<PHINode *, 8> OriginalFC0PHIs;
1644  if (FC0.ExitingBlock != FC0.Latch)
1645  for (PHINode &PHI : FC0.Header->phis())
1646  OriginalFC0PHIs.push_back(&PHI);
1647 
1648  assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
1649 
1650  // Replace incoming blocks for header PHIs first.
1651  FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
1652  FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
1653 
1654  // The old exiting block of the first loop (FC0) has to jump to the header
1655  // of the second as we need to execute the code in the second header block
1656  // regardless of the trip count. That is, if the trip count is 0, so the
1657  // back edge is never taken, we still have to execute both loop headers,
1658  // especially (but not only!) if the second is a do-while style loop.
1659  // However, doing so might invalidate the phi nodes of the first loop as
1660  // the new values do only need to dominate their latch and not the exiting
1661  // predicate. To remedy this potential problem we always introduce phi
1662  // nodes in the header of the second loop later that select the loop carried
1663  // value, if the second header was reached through an old latch of the
1664  // first, or undef otherwise. This is sound as exiting the first implies the
1665  // second will exit too, __without__ taking the back-edge (their
1666  // trip-counts are equal after all).
1667  FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
1668  FC1.Header);
1669 
1671  DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
1673  DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1674 
1675  // Remove FC0 Exit Block
1676  // The exit block for FC0 is no longer needed since control will flow
1677  // directly to the header of FC1. Since it is an empty block, it can be
1678  // removed at this point.
1679  // TODO: In the future, we can handle non-empty exit blocks my merging any
1680  // instructions from FC0 exit block into FC1 exit block prior to removing
1681  // the block.
1682  assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty");
1683  FC0.ExitBlock->getTerminator()->eraseFromParent();
1684  new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
1685 
1686  // Remove FC1 Preheader
1687  // The pre-header of L1 is not necessary anymore.
1688  assert(pred_empty(FC1.Preheader));
1689  FC1.Preheader->getTerminator()->eraseFromParent();
1690  new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
1692  DominatorTree::Delete, FC1.Preheader, FC1.Header));
1693 
1694  // Moves the phi nodes from the second to the first loops header block.
1695  while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
1696  if (SE.isSCEVable(PHI->getType()))
1697  SE.forgetValue(PHI);
1698  if (PHI->hasNUsesOrMore(1))
1699  PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
1700  else
1701  PHI->eraseFromParent();
1702  }
1703 
1704  // Introduce new phi nodes in the second loop header to ensure
1705  // exiting the first and jumping to the header of the second does not break
1706  // the SSA property of the phis originally in the first loop. See also the
1707  // comment above.
1708  Instruction *L1HeaderIP = &FC1.Header->front();
1709  for (PHINode *LCPHI : OriginalFC0PHIs) {
1710  int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
1711  assert(L1LatchBBIdx >= 0 &&
1712  "Expected loop carried value to be rewired at this point!");
1713 
1714  Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
1715 
1716  PHINode *L1HeaderPHI = PHINode::Create(
1717  LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
1718  L1HeaderPHI->addIncoming(LCV, FC0.Latch);
1719  L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
1720  FC0.ExitingBlock);
1721 
1722  LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
1723  }
1724 
1725  // Update the latches
1726 
1727  // Replace latch terminator destinations.
1728  FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
1729  FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
1730 
1731  // Modify the latch branch of FC0 to be unconditional as both successors of
1732  // the branch are the same.
1733  simplifyLatchBranch(FC0);
1734 
1735  // If FC0.Latch and FC0.ExitingBlock are the same then we have already
1736  // performed the updates above.
1737  if (FC0.Latch != FC0.ExitingBlock)
1739  DominatorTree::Insert, FC0.Latch, FC1.Header));
1740 
1742  FC0.Latch, FC0.Header));
1744  FC1.Latch, FC0.Header));
1746  FC1.Latch, FC1.Header));
1747 
1748  // All done
1749  // Apply the updates to the Dominator Tree and cleanup.
1750 
1751  assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!");
1752  assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!");
1753 
1754  // Update DT/PDT
1755  DTU.applyUpdates(TreeUpdates);
1756 
1757  LI.removeBlock(FC1GuardBlock);
1758  LI.removeBlock(FC1.Preheader);
1759  LI.removeBlock(FC0.ExitBlock);
1760  if (FC0.Peeled) {
1761  LI.removeBlock(FC0ExitBlockSuccessor);
1762  DTU.deleteBB(FC0ExitBlockSuccessor);
1763  }
1764  DTU.deleteBB(FC1GuardBlock);
1765  DTU.deleteBB(FC1.Preheader);
1766  DTU.deleteBB(FC0.ExitBlock);
1767  DTU.flush();
1768 
1769  // Is there a way to keep SE up-to-date so we don't need to forget the loops
1770  // and rebuild the information in subsequent passes of fusion?
1771  // Note: Need to forget the loops before merging the loop latches, as
1772  // mergeLatch may remove the only block in FC1.
1773  SE.forgetLoop(FC1.L);
1774  SE.forgetLoop(FC0.L);
1775 
1776  // Move instructions from FC0.Latch to FC1.Latch.
1777  // Note: mergeLatch requires an updated DT.
1778  mergeLatch(FC0, FC1);
1779 
1780  // Merge the loops.
1781  SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
1782  for (BasicBlock *BB : Blocks) {
1783  FC0.L->addBlockEntry(BB);
1784  FC1.L->removeBlockFromLoop(BB);
1785  if (LI.getLoopFor(BB) != FC1.L)
1786  continue;
1787  LI.changeLoopFor(BB, FC0.L);
1788  }
1789  while (!FC1.L->isInnermost()) {
1790  const auto &ChildLoopIt = FC1.L->begin();
1791  Loop *ChildLoop = *ChildLoopIt;
1792  FC1.L->removeChildLoop(ChildLoopIt);
1793  FC0.L->addChildLoop(ChildLoop);
1794  }
1795 
1796  // Delete the now empty loop L1.
1797  LI.erase(FC1.L);
1798 
1799 #ifndef NDEBUG
1800  assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
1802  assert(PDT.verify());
1803  LI.verify(DT);
1804  SE.verify();
1805 #endif
1806 
1807  LLVM_DEBUG(dbgs() << "Fusion done:\n");
1808 
1809  return FC0.L;
1810  }
1811 };
1812 
1813 struct LoopFuseLegacy : public FunctionPass {
1814 
1815  static char ID;
1816 
1817  LoopFuseLegacy() : FunctionPass(ID) {
1819  }
1820 
1821  void getAnalysisUsage(AnalysisUsage &AU) const override {
1831 
1836  }
1837 
1838  bool runOnFunction(Function &F) override {
1839  if (skipFunction(F))
1840  return false;
1841  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1842  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1843  auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
1844  auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1845  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
1846  auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1847  auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1848  const TargetTransformInfo &TTI =
1849  getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1850  const DataLayout &DL = F.getParent()->getDataLayout();
1851 
1852  LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
1853  return LF.fuseLoops(F);
1854  }
1855 };
1856 } // namespace
1857 
1859  auto &LI = AM.getResult<LoopAnalysis>(F);
1860  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
1861  auto &DI = AM.getResult<DependenceAnalysis>(F);
1862  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
1863  auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
1865  auto &AC = AM.getResult<AssumptionAnalysis>(F);
1867  const DataLayout &DL = F.getParent()->getDataLayout();
1868 
1869  LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
1870  bool Changed = LF.fuseLoops(F);
1871  if (!Changed)
1872  return PreservedAnalyses::all();
1873 
1874  PreservedAnalyses PA;
1878  PA.preserve<LoopAnalysis>();
1879  return PA;
1880 }
1881 
1882 char LoopFuseLegacy::ID = 0;
1883 
1884 INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false,
1885  false)
1894 INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
1895 
1896 FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:155
AssumptionCache.h
llvm::ScalarEvolution::hasLoopInvariantBackedgeTakenCount
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
Definition: ScalarEvolution.cpp:12230
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:2319
llvm::predecessors
pred_range predecessors(BasicBlock *BB)
Definition: CFG.h:127
llvm::ScalarEvolutionAnalysis
Analysis pass that exposes the ScalarEvolution for a function.
Definition: ScalarEvolution.h:2074
LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:499
llvm
Definition: AllocatorList.h:23
llvm::sys::path::const_iterator::end
friend const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
llvm::tgtok::Def
@ Def
Definition: TGLexer.h:50
llvm::DomTreeUpdater::deleteBB
void deleteBB(BasicBlock *DelBB)
Delete DelBB.
Definition: DomTreeUpdater.cpp:177
llvm::LoopBase::getExitBlock
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:82
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::peelLoop
bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA)
Peel off the first PeelCount iterations of loop L.
Definition: LoopPeel.cpp:667
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:722
llvm::SCEVAddRecExpr::isAffine
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
Definition: ScalarEvolutionExpressions.h:378
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:785
Scalar.h
llvm::DomTreeUpdater::applyUpdates
void applyUpdates(ArrayRef< DominatorTree::UpdateType > Updates)
Submit updates to all available trees.
Definition: DomTreeUpdater.cpp:230
llvm::SCEVAddRecExpr::getStart
const SCEV * getStart() const
Definition: ScalarEvolutionExpressions.h:362
llvm::DependenceAnalysisWrapperPass
Legacy pass manager pass to access dependence information.
Definition: DependenceAnalysis.h:979
llvm::Function
Definition: Function.h:61
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
Pass.h
VerboseFusionDebugging
static cl::opt< bool > VerboseFusionDebugging("loop-fusion-verbose-debug", cl::desc("Enable verbose debugging for Loop Fusion"), cl::Hidden, cl::init(false), cl::ZeroOrMore)
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::LoopInfoBase::verify
void verify(const DominatorTreeBase< BlockT, false > &DomTree) const
Definition: LoopInfoImpl.h:689
Statistic.h
llvm::ScalarEvolution::getAddRecExpr
const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
Definition: ScalarEvolution.cpp:3346
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:167
llvm::Loop::getStartLoc
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:633
DomTreeUpdater.h
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:443
R600_InstFlag::FC
@ FC
Definition: R600Defines.h:32
llvm::LoopInfoBase::changeLoopFor
void changeLoopFor(BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
Definition: LoopInfo.h:1002
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::LoopInfoBase::print
void print(raw_ostream &OS) const
Definition: LoopInfoImpl.h:620
llvm::verifyFunction
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:5737
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:34
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:749
ScalarEvolution.h
llvm::LoopInfoBase::removeBlock
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
Definition: LoopInfo.h:1029
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:529
llvm::sys::path::end
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
llvm::LoopInfoWrapperPass
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:1258
llvm::DominatorTreeBase< BasicBlock, false >::Insert
static constexpr UpdateKind Insert
Definition: GenericDomTree.h:242
llvm::Optional< unsigned >
llvm::LoopBase::begin
iterator begin() const
Definition: LoopInfo.h:154
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:128
llvm::errs
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
Definition: raw_ostream.cpp:892
llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition: SparseBitVector.h:876
llvm::succ_empty
bool succ_empty(const Instruction *I)
Definition: CFG.h:256
llvm::CallingConv::Fast
@ Fast
Fast - This calling convention attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:42
llvm::isControlFlowEquivalent
bool isControlFlowEquivalent(const Instruction &I0, const Instruction &I1, const DominatorTree &DT, const PostDominatorTree &PDT)
Return true if I0 and I1 are control flow equivalent.
Definition: CodeMoverUtils.cpp:229
llvm::SCEVNAryExpr::operands
op_range operands() const
Definition: ScalarEvolutionExpressions.h:209
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
F
#define F(x, y, z)
Definition: MD5.cpp:56
loop
Analysis the ScalarEvolution expression for r is< loop > Outside the loop
Definition: README.txt:8
llvm::canPeel
bool canPeel(Loop *L)
Definition: LoopPeel.cpp:89
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:115
CommandLine.h
FusionPeelMaxCount
static cl::opt< unsigned > FusionPeelMaxCount("loop-fusion-peel-max-count", cl::init(0), cl::Hidden, cl::desc("Max number of iterations to be peeled from a loop, such that " "fusion can take place"))
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
Rewriter
Virtual Register Rewriter
Definition: VirtRegMap.cpp:229
PostDominators.h
llvm::PostDominatorTreeWrapperPass
Definition: PostDominators.h:73
llvm::DominatorTreeBase< BasicBlock, false >::UpdateType
cfg::Update< NodePtr > UpdateType
Definition: GenericDomTree.h:240
SI
@ SI
Definition: SIInstrInfo.cpp:7463
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::ReplaceInstWithInst
void ReplaceInstWithInst(BasicBlock::InstListType &BIL, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
Definition: BasicBlockUtils.cpp:476
llvm::LoopBase::end
iterator end() const
Definition: LoopInfo.h:155
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
false
Definition: StackSlotColoring.cpp:142
llvm::printLoop
void printLoop(Loop &L, raw_ostream &OS, const std::string &Banner="")
Function to print a loop's contents as LLVM's text IR assembly.
Definition: LoopInfo.cpp:979
llvm::ScalarEvolution::isKnownPredicate
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
Definition: ScalarEvolution.cpp:9626
llvm::Instruction
Definition: Instruction.h:45
llvm::isSafeToMoveBefore
bool isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint, DominatorTree &DT, const PostDominatorTree *PDT=nullptr, DependenceInfo *DI=nullptr)
Return true if I can be safely moved before InsertPoint.
Definition: CodeMoverUtils.cpp:310
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:281
llvm::SCEVExprContains
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
Definition: ScalarEvolutionExpressions.h:679
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:50
llvm::operator<<
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:230
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1770
llvm::DomTreeUpdater
Definition: DomTreeUpdater.h:28
FUSION_DEPENDENCE_ANALYSIS_SCEV
@ FUSION_DEPENDENCE_ANALYSIS_SCEV
Definition: LoopFuse.cpp:106
llvm::ScalarEvolutionWrapperPass
Definition: ScalarEvolution.h:2104
DEBUG_TYPE
#define DEBUG_TYPE
Definition: LoopFuse.cpp:73
llvm::TrackingStatistic::getName
const char * getName() const
Definition: Statistic.h:64
llvm::LoopBase::getExitingBlock
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:49
Utils.h
llvm::Loop::getName
StringRef getName() const
Definition: LoopInfo.h:863
llvm::None
const NoneType None
Definition: None.h:23
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
LoopInfo.h
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:78
llvm::cl::ZeroOrMore
@ ZeroOrMore
Definition: CommandLine.h:120
LoopFuse.h
llvm::ScalarEvolution::getSmallConstantTripCount
unsigned getSmallConstantTripCount(const Loop *L)
Returns the maximum trip count of the loop if it is a single-exit loop and we can compute a small max...
Definition: ScalarEvolution.cpp:6944
llvm::TrackingStatistic::getDesc
const char * getDesc() const
Definition: Statistic.h:65
llvm::Loop::isLoopSimplifyForm
bool isLoopSimplifyForm() const
Return true if the Loop is in the form that the LoopSimplify form transforms loops to,...
Definition: LoopInfo.cpp:478
llvm::cl::opt
Definition: CommandLine.h:1422
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:78
llvm::DependenceInfo
DependenceInfo - This class is the main dependence-analysis driver.
Definition: DependenceAnalysis.h:272
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:303
llvm::cl::values
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:699
llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:78
llvm::moveInstructionsToTheEnd
void moveInstructionsToTheEnd(BasicBlock &FromBB, BasicBlock &ToBB, DominatorTree &DT, const PostDominatorTree &PDT, DependenceInfo &DI)
Move instructions, in an order-preserving manner, from FromBB to the end of ToBB when proven safe.
Definition: CodeMoverUtils.cpp:415
llvm::ScalarEvolution::verify
void verify() const
Definition: ScalarEvolution.cpp:12674
llvm::createLoopFusePass
FunctionPass * createLoopFusePass()
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2375
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition: AssumptionCache.h:169
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::PreservedAnalyses::preserve
void preserve()
Mark an analysis as preserved.
Definition: PassManager.h:176
llvm::SCEVNAryExpr::op_end
op_iterator op_end() const
Definition: ScalarEvolutionExpressions.h:208
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2722
llvm::BranchInst::Create
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
Definition: Instructions.h:3063
llvm::LoopInfoBase::getLoopFor
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:964
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
llvm::LoopBase::getLoopPreheader
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:167
llvm::SCEVRewriteVisitor
This visitor recursively visits a SCEV expression and re-writes it.
Definition: ScalarEvolutionExpressions.h:706
llvm::ScalarEvolution::forgetValue
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
Definition: ScalarEvolution.cpp:7272
llvm::LoopBase::getLoopLatch
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:216
llvm::initializeLoopFuseLegacyPass
void initializeLoopFuseLegacyPass(PassRegistry &)
llvm::sys::path::const_iterator::begin
friend const_iterator begin(StringRef path, Style style)
Get begin iterator over path.
Definition: Path.cpp:224
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
llvm::User::replaceUsesOfWith
void replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
llvm::TTI
TargetTransformInfo TTI
Definition: TargetTransformInfo.h:162
llvm::LoopInfo::erase
void erase(Loop *L)
Update LoopInfo after removing the last backedge from a loop.
Definition: LoopInfo.cpp:878
llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1463
FUSION_DEPENDENCE_ANALYSIS_ALL
@ FUSION_DEPENDENCE_ANALYSIS_ALL
Definition: LoopFuse.cpp:108
llvm::DominatorTreeBase::recalculate
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
Definition: GenericDomTree.h:778
llvm::SCEVNAryExpr::getNoWrapFlags
NoWrapFlags getNoWrapFlags(NoWrapFlags Mask=NoWrapMask) const
Definition: ScalarEvolutionExpressions.h:215
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:200
llvm::LoopInfo
Definition: LoopInfo.h:1080
llvm::MergeBlockIntoPredecessor
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false)
Attempts to merge a block into its predecessor, if possible.
Definition: BasicBlockUtils.cpp:173
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:57
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:41
llvm::PostDominatorTree
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Definition: PostDominators.h:28
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
verify
ppc ctr loops verify
Definition: PPCCTRLoops.cpp:76
llvm::DependenceAnalysis
AnalysisPass to compute dependence information in a function.
Definition: DependenceAnalysis.h:957
clEnumValN
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:674
llvm::moveInstructionsToTheBeginning
void moveInstructionsToTheBeginning(BasicBlock &FromBB, BasicBlock &ToBB, DominatorTree &DT, const PostDominatorTree &PDT, DependenceInfo &DI)
Move instructions, in an order-preserving manner, from FromBB to the beginning of ToBB when proven sa...
Definition: CodeMoverUtils.cpp:400
S
add sub stmia L5 ldr r0 bl L_printf $stub Instead of a and a wouldn t it be better to do three moves *Return an aggregate type is even return S
Definition: README.txt:210
CodeMoverUtils.h
llvm::pred_empty
bool pred_empty(const BasicBlock *BB)
Definition: CFG.h:119
reportInvalidCandidate
static bool reportInvalidCandidate(const Instruction &I, llvm::Statistic &Stat)
Definition: CodeMoverUtils.cpp:267
llvm::TrackingStatistic
Definition: Statistic.h:49
llvm::OptimizationRemarkAnalysis
Diagnostic information for optimization analysis remarks.
Definition: DiagnosticInfo.h:770
llvm::DomTreeUpdater::flush
void flush()
Apply all pending updates to available trees and flush all BasicBlocks awaiting deletion.
Definition: DomTreeUpdater.cpp:72
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:299
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:148
llvm::ScalarEvolution::isKnownPositive
bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
Definition: ScalarEvolution.cpp:9547
llvm::BasicBlock::getContext
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:32
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
llvm::LoopFusePass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopFuse.cpp:1858
llvm::BasicBlock::replacePhiUsesWith
void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
Definition: BasicBlock.cpp:440
llvm::OptimizationRemarkEmitterWrapperPass
OptimizationRemarkEmitter legacy analysis pass.
Definition: OptimizationRemarkEmitter.h:138
llvm::LoopSimplifyID
char & LoopSimplifyID
Definition: LoopSimplify.cpp:801
Verifier.h
llvm::ScalarEvolution::forgetLoop
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
Definition: ScalarEvolution.cpp:7209
llvm::empty
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:254
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false) FunctionPass *llvm
Definition: LoopFuse.cpp:1884
llvm::SCEVAddRecExpr::getLoop
const Loop * getLoop() const
Definition: ScalarEvolutionExpressions.h:363
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:314
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:161
llvm::PHINode::Create
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
Definition: Instructions.h:2614
llvm::LoopBase::isInvalid
bool isInvalid() const
Return true if this loop is no longer valid.
Definition: LoopInfo.h:215
llvm::SCEVNAryExpr::op_begin
op_iterator op_begin() const
Definition: ScalarEvolutionExpressions.h:207
llvm::SCEVAddRecExpr
This node represents a polynomial recurrence on the trip count of the specified loop.
Definition: ScalarEvolutionExpressions.h:352
llvm::LoopInfoBase::rbegin
reverse_iterator rbegin() const
Definition: LoopInfo.h:941
Function.h
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::ScalarEvolution::isSCEVable
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Definition: ScalarEvolution.cpp:3759
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:585
llvm::CmpInst::ICMP_SGE
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:750
ScalarEvolutionExpressions.h
llvm::DominatorTreeAnalysis
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:252
llvm::DependenceInfo::depends
std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool PossiblyLoopIndependent)
depends - Tests for a dependence between the Src and Dst instructions.
Definition: DependenceAnalysis.cpp:3483
llvm::LoopInfoBase::rend
reverse_iterator rend() const
Definition: LoopInfo.h:942
isValid
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Definition: RustDemangle.cpp:88
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::DominatorTreeBase::verify
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
Definition: GenericDomTree.h:802
TargetTransformInfo.h
FusionDependenceAnalysisChoice
FusionDependenceAnalysisChoice
Definition: LoopFuse.cpp:105
llvm::PHINode
Definition: Instructions.h:2572
llvm::LoopInfoBase::empty
bool empty() const
Definition: LoopInfo.h:943
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:44
llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition: Instructions.h:5239
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
llvm::AnalysisUsage::addRequiredID
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:267
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::UnreachableInst
This function has undefined behavior.
Definition: Instructions.h:4652
llvm::PostDominatorTreeAnalysis
Analysis pass which computes a PostDominatorTree.
Definition: PostDominators.h:47
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
DependenceAnalysis.h
llvm::Loop::isRotatedForm
bool isRotatedForm() const
Return true if the loop is in rotated form.
Definition: LoopInfo.h:785
llvm::cl::desc
Definition: CommandLine.h:414
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3007
raw_ostream.h
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
Definition: ScalarEvolution.cpp:7041
FUSION_DEPENDENCE_ANALYSIS_DA
@ FUSION_DEPENDENCE_ANALYSIS_DA
Definition: LoopFuse.cpp:107
LoopPeel.h
BasicBlockUtils.h
true
basic Basic Alias true
Definition: BasicAliasAnalysis.cpp:1797
InitializePasses.h
llvm::OptimizationRemarkEmitterAnalysis
Definition: OptimizationRemarkEmitter.h:156
llvm::ScalarEvolution::getSCEVAtScope
const SCEV * getSCEVAtScope(const SCEV *S, const Loop *L)
Return a SCEV expression for the specified value at the specified scope in the program.
Definition: ScalarEvolution.cpp:8452
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
Debug.h
FusionDependenceAnalysis
static cl::opt< FusionDependenceAnalysisChoice > FusionDependenceAnalysis("loop-fusion-dependence-analysis", cl::desc("Which dependence analysis should loop fusion use?"), cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", "Use the scalar evolution interface"), clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", "Use the dependence analysis interface"), clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", "Use all available analyses")), cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore)
llvm::gatherPeelingPreferences
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, Optional< bool > UserAllowPeeling, Optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:624
llvm::BranchInst::isConditional
bool isConditional() const
Definition: Instructions.h:3086
llvm::BranchInst::getSuccessor
BasicBlock * getSuccessor(unsigned i) const
Definition: Instructions.h:3100
llvm::SCEVAddRecExpr::getStepRecurrence
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
Definition: ScalarEvolutionExpressions.h:369
llvm::LoopAnalysis
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:1233
llvm::DominatorTreeBase< BasicBlock, false >::Delete
static constexpr UpdateKind Delete
Definition: GenericDomTree.h:243
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:908
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38