LLVM  10.0.0svn
LoopUnrollAndJam.cpp
Go to the documentation of this file.
1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop unroll and jam as a routine, much like
10 // LoopUnroll.cpp implements loop unroll.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/ADT/SmallPtrSet.h"
15 #include "llvm/ADT/Statistic.h"
21 #include "llvm/Analysis/LoopPass.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/DataLayout.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/IntrinsicInst.h"
31 #include "llvm/IR/LLVMContext.h"
32 #include "llvm/Support/Debug.h"
40 using namespace llvm;
41 
42 #define DEBUG_TYPE "loop-unroll-and-jam"
43 
44 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
45 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
46 
48 
49 // Partition blocks in an outer/inner loop pair into blocks before and after
50 // the loop
51 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
52  BasicBlockSet &ForeBlocks,
53  BasicBlockSet &SubLoopBlocks,
54  BasicBlockSet &AftBlocks,
55  DominatorTree *DT) {
56  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
57  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
58 
59  for (BasicBlock *BB : L->blocks()) {
60  if (!SubLoop->contains(BB)) {
61  if (DT->dominates(SubLoopLatch, BB))
62  AftBlocks.insert(BB);
63  else
64  ForeBlocks.insert(BB);
65  }
66  }
67 
68  // Check that all blocks in ForeBlocks together dominate the subloop
69  // TODO: This might ideally be done better with a dominator/postdominators.
70  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
71  for (BasicBlock *BB : ForeBlocks) {
72  if (BB == SubLoopPreHeader)
73  continue;
74  Instruction *TI = BB->getTerminator();
75  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
76  if (!ForeBlocks.count(TI->getSuccessor(i)))
77  return false;
78  }
79 
80  return true;
81 }
82 
83 // Looks at the phi nodes in Header for values coming from Latch. For these
84 // instructions and all their operands calls Visit on them, keeping going for
85 // all the operands in AftBlocks. Returns false if Visit returns false,
86 // otherwise returns true. This is used to process the instructions in the
87 // Aft blocks that need to be moved before the subloop. It is used in two
88 // places. One to check that the required set of instructions can be moved
89 // before the loop. Then to collect the instructions to actually move in
90 // moveHeaderPhiOperandsToForeBlocks.
91 template <typename T>
92 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
93  BasicBlockSet &AftBlocks, T Visit) {
95  for (auto &Phi : Header->phis()) {
96  Value *V = Phi.getIncomingValueForBlock(Latch);
97  if (Instruction *I = dyn_cast<Instruction>(V))
98  Worklist.push_back(I);
99  }
100 
101  while (!Worklist.empty()) {
102  Instruction *I = Worklist.back();
103  Worklist.pop_back();
104  if (!Visit(I))
105  return false;
106 
107  if (AftBlocks.count(I->getParent()))
108  for (auto &U : I->operands())
109  if (Instruction *II = dyn_cast<Instruction>(U))
110  Worklist.push_back(II);
111  }
112 
113  return true;
114 }
115 
116 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
118  BasicBlock *Latch,
119  Instruction *InsertLoc,
120  BasicBlockSet &AftBlocks) {
121  // We need to ensure we move the instructions in the correct order,
122  // starting with the earliest required instruction and moving forward.
123  std::vector<Instruction *> Visited;
124  processHeaderPhiOperands(Header, Latch, AftBlocks,
125  [&Visited, &AftBlocks](Instruction *I) {
126  if (AftBlocks.count(I->getParent()))
127  Visited.push_back(I);
128  return true;
129  });
130 
131  // Move all instructions in program order to before the InsertLoc
132  BasicBlock *InsertLocBB = InsertLoc->getParent();
133  for (Instruction *I : reverse(Visited)) {
134  if (I->getParent() != InsertLocBB)
135  I->moveBefore(InsertLoc);
136  }
137 }
138 
139 /*
140  This method performs Unroll and Jam. For a simple loop like:
141  for (i = ..)
142  Fore(i)
143  for (j = ..)
144  SubLoop(i, j)
145  Aft(i)
146 
147  Instead of doing normal inner or outer unrolling, we do:
148  for (i = .., i+=2)
149  Fore(i)
150  Fore(i+1)
151  for (j = ..)
152  SubLoop(i, j)
153  SubLoop(i+1, j)
154  Aft(i)
155  Aft(i+1)
156 
157  So the outer loop is essetially unrolled and then the inner loops are fused
158  ("jammed") together into a single loop. This can increase speed when there
159  are loads in SubLoop that are invariant to i, as they become shared between
160  the now jammed inner loops.
161 
162  We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
163  Fore blocks are those before the inner loop, Aft are those after. Normal
164  Unroll code is used to copy each of these sets of blocks and the results are
165  combined together into the final form above.
166 
167  isSafeToUnrollAndJam should be used prior to calling this to make sure the
168  unrolling will be valid. Checking profitablility is also advisable.
169 
170  If EpilogueLoop is non-null, it receives the epilogue loop (if it was
171  necessary to create one and not fully unrolled).
172 */
174  Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
175  bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
176  AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
177 
178  // When we enter here we should have already checked that it is safe
179  BasicBlock *Header = L->getHeader();
180  assert(L->getSubLoops().size() == 1);
181  Loop *SubLoop = *L->begin();
182 
183  // Don't enter the unroll code if there is nothing to do.
184  if (TripCount == 0 && Count < 2) {
185  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
187  }
188 
189  assert(Count > 0);
190  assert(TripMultiple > 0);
191  assert(TripCount == 0 || TripCount % TripMultiple == 0);
192 
193  // Are we eliminating the loop control altogether?
194  bool CompletelyUnroll = (Count == TripCount);
195 
196  // We use the runtime remainder in cases where we don't know trip multiple
197  if (TripMultiple == 1 || TripMultiple % Count != 0) {
198  if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
199  /*UseEpilogRemainder*/ true,
200  UnrollRemainder, /*ForgetAllSCEV*/ false,
201  LI, SE, DT, AC, true, EpilogueLoop)) {
202  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
203  "generated when assuming runtime trip count\n");
205  }
206  }
207 
208  // Notify ScalarEvolution that the loop will be substantially changed,
209  // if not outright eliminated.
210  if (SE) {
211  SE->forgetLoop(L);
212  SE->forgetLoop(SubLoop);
213  }
214 
215  using namespace ore;
216  // Report the unrolling decision.
217  if (CompletelyUnroll) {
218  LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
219  << Header->getName() << " with trip count " << TripCount
220  << "!\n");
221  ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
222  L->getHeader())
223  << "completely unroll and jammed loop with "
224  << NV("UnrollCount", TripCount) << " iterations");
225  } else {
226  auto DiagBuilder = [&]() {
227  OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
228  L->getHeader());
229  return Diag << "unroll and jammed loop by a factor of "
230  << NV("UnrollCount", Count);
231  };
232 
233  LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
234  << " by " << Count);
235  if (TripMultiple != 1) {
236  LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
237  ORE->emit([&]() {
238  return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
239  << " trips per branch";
240  });
241  } else {
242  LLVM_DEBUG(dbgs() << " with run-time trip count");
243  ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
244  }
245  LLVM_DEBUG(dbgs() << "!\n");
246  }
247 
248  BasicBlock *Preheader = L->getLoopPreheader();
249  BasicBlock *LatchBlock = L->getLoopLatch();
250  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
251  assert(Preheader && LatchBlock && Header);
252  assert(BI && !BI->isUnconditional());
253  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
254  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
255  bool SubLoopContinueOnTrue = SubLoop->contains(
256  SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
257 
258  // Partition blocks in an outer/inner loop pair into blocks before and after
259  // the loop
260  BasicBlockSet SubLoopBlocks;
261  BasicBlockSet ForeBlocks;
262  BasicBlockSet AftBlocks;
263  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
264  DT);
265 
266  // We keep track of the entering/first and exiting/last block of each of
267  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
268  // blocks easier.
269  std::vector<BasicBlock *> ForeBlocksFirst;
270  std::vector<BasicBlock *> ForeBlocksLast;
271  std::vector<BasicBlock *> SubLoopBlocksFirst;
272  std::vector<BasicBlock *> SubLoopBlocksLast;
273  std::vector<BasicBlock *> AftBlocksFirst;
274  std::vector<BasicBlock *> AftBlocksLast;
275  ForeBlocksFirst.push_back(Header);
276  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
277  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
278  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
279  AftBlocksFirst.push_back(SubLoop->getExitBlock());
280  AftBlocksLast.push_back(L->getExitingBlock());
281  // Maps Blocks[0] -> Blocks[It]
282  ValueToValueMapTy LastValueMap;
283 
284  // Move any instructions from fore phi operands from AftBlocks into Fore.
286  Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
287  AftBlocks);
288 
289  // The current on-the-fly SSA update requires blocks to be processed in
290  // reverse postorder so that LastValueMap contains the correct value at each
291  // exit.
292  LoopBlocksDFS DFS(L);
293  DFS.perform(LI);
294  // Stash the DFS iterators before adding blocks to the loop.
295  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
296  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
297 
298  if (Header->getParent()->isDebugInfoForProfiling())
299  for (BasicBlock *BB : L->getBlocks())
300  for (Instruction &I : *BB)
301  if (!isa<DbgInfoIntrinsic>(&I))
302  if (const DILocation *DIL = I.getDebugLoc()) {
303  auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
304  if (NewDIL)
305  I.setDebugLoc(NewDIL.getValue());
306  else
307  LLVM_DEBUG(dbgs()
308  << "Failed to create new discriminator: "
309  << DIL->getFilename() << " Line: " << DIL->getLine());
310  }
311 
312  // Copy all blocks
313  for (unsigned It = 1; It != Count; ++It) {
314  std::vector<BasicBlock *> NewBlocks;
315  // Maps Blocks[It] -> Blocks[It-1]
316  DenseMap<Value *, Value *> PrevItValueMap;
317 
318  for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
319  ValueToValueMapTy VMap;
320  BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
321  Header->getParent()->getBasicBlockList().push_back(New);
322 
323  if (ForeBlocks.count(*BB)) {
324  L->addBasicBlockToLoop(New, *LI);
325 
326  if (*BB == ForeBlocksFirst[0])
327  ForeBlocksFirst.push_back(New);
328  if (*BB == ForeBlocksLast[0])
329  ForeBlocksLast.push_back(New);
330  } else if (SubLoopBlocks.count(*BB)) {
331  SubLoop->addBasicBlockToLoop(New, *LI);
332 
333  if (*BB == SubLoopBlocksFirst[0])
334  SubLoopBlocksFirst.push_back(New);
335  if (*BB == SubLoopBlocksLast[0])
336  SubLoopBlocksLast.push_back(New);
337  } else if (AftBlocks.count(*BB)) {
338  L->addBasicBlockToLoop(New, *LI);
339 
340  if (*BB == AftBlocksFirst[0])
341  AftBlocksFirst.push_back(New);
342  if (*BB == AftBlocksLast[0])
343  AftBlocksLast.push_back(New);
344  } else {
345  llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
346  }
347 
348  // Update our running maps of newest clones
349  PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
350  LastValueMap[*BB] = New;
351  for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
352  VI != VE; ++VI) {
353  PrevItValueMap[VI->second] =
354  const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
355  LastValueMap[VI->first] = VI->second;
356  }
357 
358  NewBlocks.push_back(New);
359 
360  // Update DomTree:
361  if (*BB == ForeBlocksFirst[0])
362  DT->addNewBlock(New, ForeBlocksLast[It - 1]);
363  else if (*BB == SubLoopBlocksFirst[0])
364  DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
365  else if (*BB == AftBlocksFirst[0])
366  DT->addNewBlock(New, AftBlocksLast[It - 1]);
367  else {
368  // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
369  // structure.
370  auto BBDomNode = DT->getNode(*BB);
371  auto BBIDom = BBDomNode->getIDom();
372  BasicBlock *OriginalBBIDom = BBIDom->getBlock();
373  assert(OriginalBBIDom);
374  assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
375  DT->addNewBlock(
376  New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
377  }
378  }
379 
380  // Remap all instructions in the most recent iteration
381  for (BasicBlock *NewBlock : NewBlocks) {
382  for (Instruction &I : *NewBlock) {
383  ::remapInstruction(&I, LastValueMap);
384  if (auto *II = dyn_cast<IntrinsicInst>(&I))
385  if (II->getIntrinsicID() == Intrinsic::assume)
386  AC->registerAssumption(II);
387  }
388  }
389 
390  // Alter the ForeBlocks phi's, pointing them at the latest version of the
391  // value from the previous iteration's phis
392  for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
393  Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
394  assert(OldValue && "should have incoming edge from Aft[It]");
395  Value *NewValue = OldValue;
396  if (Value *PrevValue = PrevItValueMap[OldValue])
397  NewValue = PrevValue;
398 
399  assert(Phi.getNumOperands() == 2);
400  Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
401  Phi.setIncomingValue(0, NewValue);
402  Phi.removeIncomingValue(1);
403  }
404  }
405 
406  // Now that all the basic blocks for the unrolled iterations are in place,
407  // finish up connecting the blocks and phi nodes. At this point LastValueMap
408  // is the last unrolled iterations values.
409 
410  // Update Phis in BB from OldBB to point to NewBB
411  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
412  BasicBlock *NewBB) {
413  for (PHINode &Phi : BB->phis()) {
414  int I = Phi.getBasicBlockIndex(OldBB);
415  Phi.setIncomingBlock(I, NewBB);
416  }
417  };
418  // Update Phis in BB from OldBB to point to NewBB and use the latest value
419  // from LastValueMap
420  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
421  BasicBlock *NewBB,
422  ValueToValueMapTy &LastValueMap) {
423  for (PHINode &Phi : BB->phis()) {
424  for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
425  if (Phi.getIncomingBlock(b) == OldBB) {
426  Value *OldValue = Phi.getIncomingValue(b);
427  if (Value *LastValue = LastValueMap[OldValue])
428  Phi.setIncomingValue(b, LastValue);
429  Phi.setIncomingBlock(b, NewBB);
430  break;
431  }
432  }
433  }
434  };
435  // Move all the phis from Src into Dest
436  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
437  Instruction *insertPoint = Dest->getFirstNonPHI();
438  while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
439  Phi->moveBefore(insertPoint);
440  };
441 
442  // Update the PHI values outside the loop to point to the last block
443  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
444  LastValueMap);
445 
446  // Update ForeBlocks successors and phi nodes
447  BranchInst *ForeTerm =
448  cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
449  BasicBlock *Dest = SubLoopBlocksFirst[0];
450  ForeTerm->setSuccessor(0, Dest);
451 
452  if (CompletelyUnroll) {
453  while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
454  Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
455  Phi->getParent()->getInstList().erase(Phi);
456  }
457  } else {
458  // Update the PHI values to point to the last aft block
459  updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
460  AftBlocksLast.back(), LastValueMap);
461  }
462 
463  for (unsigned It = 1; It != Count; It++) {
464  // Remap ForeBlock successors from previous iteration to this
465  BranchInst *ForeTerm =
466  cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
467  BasicBlock *Dest = ForeBlocksFirst[It];
468  ForeTerm->setSuccessor(0, Dest);
469  }
470 
471  // Subloop successors and phis
472  BranchInst *SubTerm =
473  cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
474  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
475  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
476  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
477  ForeBlocksLast.back());
478  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
479  SubLoopBlocksLast.back());
480 
481  for (unsigned It = 1; It != Count; It++) {
482  // Replace the conditional branch of the previous iteration subloop with an
483  // unconditional one to this one
484  BranchInst *SubTerm =
485  cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
486  BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
487  SubTerm->eraseFromParent();
488 
489  updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
490  ForeBlocksLast.back());
491  updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
492  SubLoopBlocksLast.back());
493  movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
494  }
495 
496  // Aft blocks successors and phis
497  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
498  if (CompletelyUnroll) {
499  BranchInst::Create(LoopExit, Term);
500  Term->eraseFromParent();
501  } else {
502  Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
503  }
504  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
505  SubLoopBlocksLast.back());
506 
507  for (unsigned It = 1; It != Count; It++) {
508  // Replace the conditional branch of the previous iteration subloop with an
509  // unconditional one to this one
510  BranchInst *AftTerm =
511  cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
512  BranchInst::Create(AftBlocksFirst[It], AftTerm);
513  AftTerm->eraseFromParent();
514 
515  updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
516  SubLoopBlocksLast.back());
517  movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
518  }
519 
521  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
522  // new ones required.
523  if (Count != 1) {
525  DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
526  SubLoopBlocksFirst[0]);
528  SubLoopBlocksLast[0], AftBlocksFirst[0]);
529 
531  ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
533  SubLoopBlocksLast.back(), AftBlocksFirst[0]);
534  DTU.applyUpdatesPermissive(DTUpdates);
535  }
536 
537  // Merge adjacent basic blocks, if possible.
538  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
539  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
540  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
541  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
542  while (!MergeBlocks.empty()) {
543  BasicBlock *BB = *MergeBlocks.begin();
545  if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
546  BasicBlock *Dest = Term->getSuccessor(0);
547  BasicBlock *Fold = Dest->getUniquePredecessor();
548  if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
549  // Don't remove BB and add Fold as they are the same BB
550  assert(Fold == BB);
551  (void)Fold;
552  MergeBlocks.erase(Dest);
553  } else
554  MergeBlocks.erase(BB);
555  } else
556  MergeBlocks.erase(BB);
557  }
558  // Apply updates to the DomTree.
559  DT = &DTU.getDomTree();
560 
561  // At this point, the code is well formed. We now do a quick sweep over the
562  // inserted code, doing constant propagation and dead code elimination as we
563  // go.
564  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
565  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
566 
567  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
568  ++NumUnrolledAndJammed;
569 
570 #ifndef NDEBUG
571  // We shouldn't have done anything to break loop simplify form or LCSSA.
572  Loop *OuterL = L->getParentLoop();
573  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
574  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
575  if (!CompletelyUnroll)
577  assert(SubLoop->isLoopSimplifyForm());
578  assert(DT->verify());
579 #endif
580 
581  // Update LoopInfo if the loop is completely removed.
582  if (CompletelyUnroll)
583  LI->erase(L);
584 
585  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
587 }
588 
589 static bool getLoadsAndStores(BasicBlockSet &Blocks,
590  SmallVector<Value *, 4> &MemInstr) {
591  // Scan the BBs and collect legal loads and stores.
592  // Returns false if non-simple loads/stores are found.
593  for (BasicBlock *BB : Blocks) {
594  for (Instruction &I : *BB) {
595  if (auto *Ld = dyn_cast<LoadInst>(&I)) {
596  if (!Ld->isSimple())
597  return false;
598  MemInstr.push_back(&I);
599  } else if (auto *St = dyn_cast<StoreInst>(&I)) {
600  if (!St->isSimple())
601  return false;
602  MemInstr.push_back(&I);
603  } else if (I.mayReadOrWriteMemory()) {
604  return false;
605  }
606  }
607  }
608  return true;
609 }
610 
613  unsigned LoopDepth, bool InnerLoop,
614  DependenceInfo &DI) {
615  // Use DA to check for dependencies between loads and stores that make unroll
616  // and jam invalid
617  for (Value *I : Earlier) {
618  for (Value *J : Later) {
619  Instruction *Src = cast<Instruction>(I);
620  Instruction *Dst = cast<Instruction>(J);
621  if (Src == Dst)
622  continue;
623  // Ignore Input dependencies.
624  if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
625  continue;
626 
627  // Track dependencies, and if we find them take a conservative approach
628  // by allowing only = or < (not >), altough some > would be safe
629  // (depending upon unroll width).
630  // For the inner loop, we need to disallow any (> <) dependencies
631  // FIXME: Allow > so long as distance is less than unroll width
632  if (auto D = DI.depends(Src, Dst, true)) {
633  assert(D->isOrdered() && "Expected an output, flow or anti dep.");
634 
635  if (D->isConfused()) {
636  LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
637  << " " << *Src << "\n"
638  << " " << *Dst << "\n");
639  return false;
640  }
641  if (!InnerLoop) {
642  if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
643  LLVM_DEBUG(dbgs() << " > dependency between:\n"
644  << " " << *Src << "\n"
645  << " " << *Dst << "\n");
646  return false;
647  }
648  } else {
649  assert(LoopDepth + 1 <= D->getLevels());
650  if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
651  D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
652  LLVM_DEBUG(dbgs() << " < > dependency between:\n"
653  << " " << *Src << "\n"
654  << " " << *Dst << "\n");
655  return false;
656  }
657  }
658  }
659  }
660  }
661  return true;
662 }
663 
664 static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
665  BasicBlockSet &SubLoopBlocks,
666  BasicBlockSet &AftBlocks, DependenceInfo &DI) {
667  // Get all loads/store pairs for each blocks
668  SmallVector<Value *, 4> ForeMemInstr;
669  SmallVector<Value *, 4> SubLoopMemInstr;
670  SmallVector<Value *, 4> AftMemInstr;
671  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
672  !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
673  !getLoadsAndStores(AftBlocks, AftMemInstr))
674  return false;
675 
676  // Check for dependencies between any blocks that may change order
677  unsigned LoopDepth = L->getLoopDepth();
678  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
679  DI) &&
680  checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
681  checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
682  DI) &&
683  checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
684  DI);
685 }
686 
688  DependenceInfo &DI) {
689  /* We currently handle outer loops like this:
690  |
691  ForeFirst <----\ }
692  Blocks | } ForeBlocks
693  ForeLast | }
694  | |
695  SubLoopFirst <\ | }
696  Blocks | | } SubLoopBlocks
697  SubLoopLast -/ | }
698  | |
699  AftFirst | }
700  Blocks | } AftBlocks
701  AftLast ------/ }
702  |
703 
704  There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
705  and AftBlocks, providing that there is one edge from Fores to SubLoops,
706  one edge from SubLoops to Afts and a single outer loop exit (from Afts).
707  In practice we currently limit Aft blocks to a single block, and limit
708  things further in the profitablility checks of the unroll and jam pass.
709 
710  Because of the way we rearrange basic blocks, we also require that
711  the Fore blocks on all unrolled iterations are safe to move before the
712  SubLoop blocks of all iterations. So we require that the phi node looping
713  operands of ForeHeader can be moved to at least the end of ForeEnd, so that
714  we can arrange cloned Fore Blocks before the subloop and match up Phi's
715  correctly.
716 
717  i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
718  It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
719 
720  There are then a number of checks along the lines of no calls, no
721  exceptions, inner loop IV is consistent, etc. Note that for loops requiring
722  runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
723  UnrollAndJamLoop if the trip count cannot be easily calculated.
724  */
725 
726  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
727  return false;
728  Loop *SubLoop = L->getSubLoops()[0];
729  if (!SubLoop->isLoopSimplifyForm())
730  return false;
731 
732  BasicBlock *Header = L->getHeader();
733  BasicBlock *Latch = L->getLoopLatch();
734  BasicBlock *Exit = L->getExitingBlock();
735  BasicBlock *SubLoopHeader = SubLoop->getHeader();
736  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
737  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
738 
739  if (Latch != Exit)
740  return false;
741  if (SubLoopLatch != SubLoopExit)
742  return false;
743 
744  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
745  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
746  return false;
747  }
748 
749  // Split blocks into Fore/SubLoop/Aft based on dominators
750  BasicBlockSet SubLoopBlocks;
751  BasicBlockSet ForeBlocks;
752  BasicBlockSet AftBlocks;
753  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
754  AftBlocks, &DT)) {
755  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
756  return false;
757  }
758 
759  // Aft blocks may need to move instructions to fore blocks, which becomes more
760  // difficult if there are multiple (potentially conditionally executed)
761  // blocks. For now we just exclude loops with multiple aft blocks.
762  if (AftBlocks.size() != 1) {
763  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
764  "multiple blocks after the loop\n");
765  return false;
766  }
767 
768  // Check inner loop backedge count is consistent on all iterations of the
769  // outer loop
770  if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
771  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
772  "not consistent on each iteration\n");
773  return false;
774  }
775 
776  // Check the loop safety info for exceptions.
778  LSI.computeLoopSafetyInfo(L);
779  if (LSI.anyBlockMayThrow()) {
780  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
781  return false;
782  }
783 
784  // We've ruled out the easy stuff and now need to check that there are no
785  // interdependencies which may prevent us from moving the:
786  // ForeBlocks before Subloop and AftBlocks.
787  // Subloop before AftBlocks.
788  // ForeBlock phi operands before the subloop
789 
790  // Make sure we can move all instructions we need to before the subloop
792  Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
793  if (SubLoop->contains(I->getParent()))
794  return false;
795  if (AftBlocks.count(I->getParent())) {
796  // If we hit a phi node in afts we know we are done (probably
797  // LCSSA)
798  if (isa<PHINode>(I))
799  return false;
800  // Can't move instructions with side effects or memory
801  // reads/writes
802  if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
803  return false;
804  }
805  // Keep going
806  return true;
807  })) {
808  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
809  "instructions after subloop to before it\n");
810  return false;
811  }
812 
813  // Check for memory dependencies which prohibit the unrolling we are doing.
814  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
815  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
816  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
817  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
818  return false;
819  }
820 
821  return true;
822 }
virtual void computeLoopSafetyInfo(const Loop *CurLoop)
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
Definition: MustExecute.cpp:49
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:67
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:641
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:209
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
DiagnosticInfoOptimizationBase::Argument NV
This class represents lattice values for constants.
Definition: AllocatorList.h:23
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr)
Attempts to merge a block into its predecessor, if possible.
BasicBlock * getSuccessor(unsigned Idx) const
Return the specified successor. This instruction must be a terminator.
static bool getLoadsAndStores(BasicBlockSet &Blocks, SmallVector< Value *, 4 > &MemInstr)
bool isRecursivelyLCSSAForm(DominatorTree &DT, const LoopInfo &LI) const
Return true if this Loop and all inner subloops are in LCSSA form.
Definition: LoopInfo.cpp:451
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:97
The main scalar evolution driver.
DominatorTree & getDomTree()
Flush DomTree updates and return DomTree.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:160
A cache of @llvm.assume calls within a function.
static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch, BasicBlockSet &AftBlocks, T Visit)
BasicBlock * getSuccessor(unsigned i) const
bool hasIterationCountInvariantInParent(Loop *L, ScalarEvolution &SE)
Check inner loop (L) backedge count is known to be invariant on all iterations of its outer loop...
Definition: LoopUtils.cpp:720
STATISTIC(NumFunctions, "Total number of functions")
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:144
DependenceInfo - This class is the main dependence-analysis driver.
static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, BasicBlock *Latch, Instruction *InsertLoc, BasicBlockSet &AftBlocks)
static bool checkDependencies(SmallVector< Value *, 4 > &Earlier, SmallVector< Value *, 4 > &Later, unsigned LoopDepth, bool InnerLoop, DependenceInfo &DI)
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:273
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:104
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:80
void erase(Loop *L)
Update LoopInfo after removing the last backedge from a loop.
Definition: LoopInfo.cpp:853
BlockT * getHeader() const
Definition: LoopInfo.h:105
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:261
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
Definition: LoopInfoImpl.h:235
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:253
bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, DependenceInfo &DI)
This header provides classes for managing per-loop analyses.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:429
Debug location.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:144
unsigned getNumSuccessors() const
Return the number of successors that this instruction has.
void remapInstruction(Instruction *I, ValueToValueMapTy &VMap)
Convert the instruction operands from referencing the current values into those specified by VMap...
Definition: LoopUnroll.cpp:67
The loop was fully unrolled into straight-line code.
SmallPtrSet< BasicBlock *, 4 > BasicBlockSet
bool isDebugInfoForProfiling() const
Returns true if we should emit debug info for profiling.
Definition: Metadata.cpp:1508
std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool PossiblyLoopIndependent)
depends - Tests for a dependence between the Src and Dst instructions.
LLVM Basic Block Representation.
Definition: BasicBlock.h:57
Conditional or Unconditional Branch instruction.
DomTreeNodeBase * getIDom() const
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, BasicBlockSet &ForeBlocks, BasicBlockSet &SubLoopBlocks, BasicBlockSet &AftBlocks, DominatorTree *DT)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
bool mayHaveSideEffects() const
Return true if the instruction may have side effects.
Definition: Instruction.h:582
Diagnostic information for applied optimization remarks.
const Instruction & back() const
Definition: BasicBlock.h:287
std::vector< BasicBlock * >::const_reverse_iterator RPOIterator
Definition: LoopIterator.h:101
op_range operands()
Definition: User.h:237
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:381
void applyUpdatesPermissive(ArrayRef< DominatorTree::UpdateType > Updates)
Submit updates to all available trees.
The loop was partially unrolled – we still have a loop, but with a smaller trip count.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:607
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
iterator end()
Definition: ValueMap.h:136
size_type size() const
Definition: SmallPtrSet.h:92
bool hasAddressTaken() const
Returns true if there are any uses of this basic block other than direct branches, switches, etc.
Definition: BasicBlock.h:396
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:115
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file. ...
bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, bool AllowExpensiveTripCount, bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA, Loop **ResultLoop=nullptr)
Insert code in the prolog/epilog code when unrolling a loop with a run-time trip-count.
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false...
Definition: SmallPtrSet.h:377
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:248
iterator begin() const
Definition: LoopInfo.h:147
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
#define DEBUG_TYPE
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
Store the result of a depth first search within basic blocks contained by a single loop...
Definition: LoopIterator.h:97
void push_back(pointer val)
Definition: ilist.h:311
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, DebugInfoFinder *DIFinder=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
static void DFS(BasicBlock *Root, SetVector< BasicBlock *> &Set)
LoopT * getParentLoop() const
Definition: LoopInfo.h:106
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:136
bool isLoopSimplifyForm() const
Return true if the Loop is in the form that the LoopSimplify form transforms loops to...
Definition: LoopInfo.cpp:460
void registerAssumption(CallInst *CI)
Add an @llvm.assume intrinsic to this function&#39;s cache.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
iterator begin() const
Definition: SmallPtrSet.h:396
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:509
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:154
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:106
#define I(x, y, z)
Definition: MD5.cpp:58
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:332
const BasicBlockListType & getBasicBlockList() const
Get the underlying elements of the Function...
Definition: Function.h:657
block_iterator block_end() const
Definition: LoopInfo.h:160
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:329
bool isUnconditional() const
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM Value Representation.
Definition: Value.h:73
The loop was not modified.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition: Instruction.cpp:86
virtual bool anyBlockMayThrow() const
Returns true iff any block of the loop for which this info is contains an instruction that may throw ...
Definition: MustExecute.cpp:45
#define LLVM_DEBUG(X)
Definition: Debug.h:122
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:161
block_iterator block_begin() const
Definition: LoopInfo.h:159
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:49
The optimization diagnostic interface.
iterator begin()
Definition: ValueMap.h:135
const BasicBlock * getParent() const
Definition: Instruction.h:66
LoopUnrollResult
Represents the result of a UnrollLoop invocation.
Definition: UnrollLoop.h:53
LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop=nullptr)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:542
void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC)
Perform some cleanup and simplifications on loops after unrolling.
Definition: LoopUnroll.cpp:199