LLVM  10.0.0svn
MachineBlockPlacement.cpp
Go to the documentation of this file.
1 //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements basic block placement transformations using the CFG
10 // structure and branch probability estimates.
11 //
12 // The pass strives to preserve the structure of the CFG (that is, retain
13 // a topological ordering of basic blocks) in the absence of a *strong* signal
14 // to the contrary from probabilities. However, within the CFG structure, it
15 // attempts to choose an ordering which favors placing more likely sequences of
16 // blocks adjacent to each other.
17 //
18 // The algorithm works from the inner-most loop within a function outward, and
19 // at each stage walks through the basic blocks, trying to coalesce them into
20 // sequential chains where allowed by the CFG (or demanded by heavy
21 // probabilities). Finally, it walks the blocks in topological order, and the
22 // first time it reaches a chain of basic blocks, it schedules them in the
23 // function in-order.
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #include "BranchFolding.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
50 #include "llvm/IR/DebugLoc.h"
51 #include "llvm/IR/Function.h"
52 #include "llvm/Pass.h"
53 #include "llvm/Support/Allocator.h"
56 #include "llvm/Support/CodeGen.h"
58 #include "llvm/Support/Compiler.h"
59 #include "llvm/Support/Debug.h"
62 #include <algorithm>
63 #include <cassert>
64 #include <cstdint>
65 #include <iterator>
66 #include <memory>
67 #include <string>
68 #include <tuple>
69 #include <utility>
70 #include <vector>
71 
72 using namespace llvm;
73 
74 #define DEBUG_TYPE "block-placement"
75 
76 STATISTIC(NumCondBranches, "Number of conditional branches");
77 STATISTIC(NumUncondBranches, "Number of unconditional branches");
78 STATISTIC(CondBranchTakenFreq,
79  "Potential frequency of taking conditional branches");
80 STATISTIC(UncondBranchTakenFreq,
81  "Potential frequency of taking unconditional branches");
82 
84  "align-all-blocks",
85  cl::desc("Force the alignment of all blocks in the function in log2 format "
86  "(e.g 4 means align on 16B boundaries)."),
87  cl::init(0), cl::Hidden);
88 
90  "align-all-nofallthru-blocks",
91  cl::desc("Force the alignment of all blocks that have no fall-through "
92  "predecessors (i.e. don't add nops that are executed). In log2 "
93  "format (e.g 4 means align on 16B boundaries)."),
94  cl::init(0), cl::Hidden);
95 
96 // FIXME: Find a good default for this flag and remove the flag.
98  "block-placement-exit-block-bias",
99  cl::desc("Block frequency percentage a loop exit block needs "
100  "over the original exit to be considered the new exit."),
101  cl::init(0), cl::Hidden);
102 
103 // Definition:
104 // - Outlining: placement of a basic block outside the chain or hot path.
105 
107  "loop-to-cold-block-ratio",
108  cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
109  "(frequency of block) is greater than this ratio"),
110  cl::init(5), cl::Hidden);
111 
113  "force-loop-cold-block",
114  cl::desc("Force outlining cold blocks from loops."),
115  cl::init(false), cl::Hidden);
116 
117 static cl::opt<bool>
118  PreciseRotationCost("precise-rotation-cost",
119  cl::desc("Model the cost of loop rotation more "
120  "precisely by using profile data."),
121  cl::init(false), cl::Hidden);
122 
123 static cl::opt<bool>
124  ForcePreciseRotationCost("force-precise-rotation-cost",
125  cl::desc("Force the use of precise cost "
126  "loop rotation strategy."),
127  cl::init(false), cl::Hidden);
128 
130  "misfetch-cost",
131  cl::desc("Cost that models the probabilistic risk of an instruction "
132  "misfetch due to a jump comparing to falling through, whose cost "
133  "is zero."),
134  cl::init(1), cl::Hidden);
135 
136 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
137  cl::desc("Cost of jump instructions."),
138  cl::init(1), cl::Hidden);
139 static cl::opt<bool>
140 TailDupPlacement("tail-dup-placement",
141  cl::desc("Perform tail duplication during placement. "
142  "Creates more fallthrough opportunites in "
143  "outline branches."),
144  cl::init(true), cl::Hidden);
145 
146 static cl::opt<bool>
147 BranchFoldPlacement("branch-fold-placement",
148  cl::desc("Perform branch folding during placement. "
149  "Reduces code size."),
150  cl::init(true), cl::Hidden);
151 
152 // Heuristic for tail duplication.
154  "tail-dup-placement-threshold",
155  cl::desc("Instruction cutoff for tail duplication during layout. "
156  "Tail merging during layout is forced to have a threshold "
157  "that won't conflict."), cl::init(2),
158  cl::Hidden);
159 
160 // Heuristic for aggressive tail duplication.
162  "tail-dup-placement-aggressive-threshold",
163  cl::desc("Instruction cutoff for aggressive tail duplication during "
164  "layout. Used at -O3. Tail merging during layout is forced to "
165  "have a threshold that won't conflict."), cl::init(4),
166  cl::Hidden);
167 
168 // Heuristic for tail duplication.
170  "tail-dup-placement-penalty",
171  cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
172  "Copying can increase fallthrough, but it also increases icache "
173  "pressure. This parameter controls the penalty to account for that. "
174  "Percent as integer."),
175  cl::init(2),
176  cl::Hidden);
177 
178 // Heuristic for triangle chains.
180  "triangle-chain-count",
181  cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
182  "triangle tail duplication heuristic to kick in. 0 to disable."),
183  cl::init(2),
184  cl::Hidden);
185 
188 
189 // Internal option used to control BFI display only after MBP pass.
190 // Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
191 // -view-block-layout-with-bfi=
193 
194 // Command line option to specify the name of the function for CFG dump
195 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
197 
198 namespace {
199 
200 class BlockChain;
201 
202 /// Type for our function-wide basic block -> block chain mapping.
203 using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
204 
205 /// A chain of blocks which will be laid out contiguously.
206 ///
207 /// This is the datastructure representing a chain of consecutive blocks that
208 /// are profitable to layout together in order to maximize fallthrough
209 /// probabilities and code locality. We also can use a block chain to represent
210 /// a sequence of basic blocks which have some external (correctness)
211 /// requirement for sequential layout.
212 ///
213 /// Chains can be built around a single basic block and can be merged to grow
214 /// them. They participate in a block-to-chain mapping, which is updated
215 /// automatically as chains are merged together.
216 class BlockChain {
217  /// The sequence of blocks belonging to this chain.
218  ///
219  /// This is the sequence of blocks for a particular chain. These will be laid
220  /// out in-order within the function.
222 
223  /// A handle to the function-wide basic block to block chain mapping.
224  ///
225  /// This is retained in each block chain to simplify the computation of child
226  /// block chains for SCC-formation and iteration. We store the edges to child
227  /// basic blocks, and map them back to their associated chains using this
228  /// structure.
229  BlockToChainMapType &BlockToChain;
230 
231 public:
232  /// Construct a new BlockChain.
233  ///
234  /// This builds a new block chain representing a single basic block in the
235  /// function. It also registers itself as the chain that block participates
236  /// in with the BlockToChain mapping.
237  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
238  : Blocks(1, BB), BlockToChain(BlockToChain) {
239  assert(BB && "Cannot create a chain with a null basic block");
240  BlockToChain[BB] = this;
241  }
242 
243  /// Iterator over blocks within the chain.
246 
247  /// Beginning of blocks within the chain.
248  iterator begin() { return Blocks.begin(); }
249  const_iterator begin() const { return Blocks.begin(); }
250 
251  /// End of blocks within the chain.
252  iterator end() { return Blocks.end(); }
253  const_iterator end() const { return Blocks.end(); }
254 
255  bool remove(MachineBasicBlock* BB) {
256  for(iterator i = begin(); i != end(); ++i) {
257  if (*i == BB) {
258  Blocks.erase(i);
259  return true;
260  }
261  }
262  return false;
263  }
264 
265  /// Merge a block chain into this one.
266  ///
267  /// This routine merges a block chain into this one. It takes care of forming
268  /// a contiguous sequence of basic blocks, updating the edge list, and
269  /// updating the block -> chain mapping. It does not free or tear down the
270  /// old chain, but the old chain's block list is no longer valid.
271  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
272  assert(BB && "Can't merge a null block.");
273  assert(!Blocks.empty() && "Can't merge into an empty chain.");
274 
275  // Fast path in case we don't have a chain already.
276  if (!Chain) {
277  assert(!BlockToChain[BB] &&
278  "Passed chain is null, but BB has entry in BlockToChain.");
279  Blocks.push_back(BB);
280  BlockToChain[BB] = this;
281  return;
282  }
283 
284  assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
285  assert(Chain->begin() != Chain->end());
286 
287  // Update the incoming blocks to point to this chain, and add them to the
288  // chain structure.
289  for (MachineBasicBlock *ChainBB : *Chain) {
290  Blocks.push_back(ChainBB);
291  assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
292  BlockToChain[ChainBB] = this;
293  }
294  }
295 
296 #ifndef NDEBUG
297  /// Dump the blocks in this chain.
298  LLVM_DUMP_METHOD void dump() {
299  for (MachineBasicBlock *MBB : *this)
300  MBB->dump();
301  }
302 #endif // NDEBUG
303 
304  /// Count of predecessors of any block within the chain which have not
305  /// yet been scheduled. In general, we will delay scheduling this chain
306  /// until those predecessors are scheduled (or we find a sufficiently good
307  /// reason to override this heuristic.) Note that when forming loop chains,
308  /// blocks outside the loop are ignored and treated as if they were already
309  /// scheduled.
310  ///
311  /// Note: This field is reinitialized multiple times - once for each loop,
312  /// and then once for the function as a whole.
313  unsigned UnscheduledPredecessors = 0;
314 };
315 
316 class MachineBlockPlacement : public MachineFunctionPass {
317  /// A type for a block filter set.
318  using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
319 
320  /// Pair struct containing basic block and taildup profitability
321  struct BlockAndTailDupResult {
322  MachineBasicBlock *BB;
323  bool ShouldTailDup;
324  };
325 
326  /// Triple struct containing edge weight and the edge.
327  struct WeightedEdge {
328  BlockFrequency Weight;
329  MachineBasicBlock *Src;
330  MachineBasicBlock *Dest;
331  };
332 
333  /// work lists of blocks that are ready to be laid out
336 
337  /// Edges that have already been computed as optimal.
339 
340  /// Machine Function
342 
343  /// A handle to the branch probability pass.
344  const MachineBranchProbabilityInfo *MBPI;
345 
346  /// A handle to the function-wide block frequency pass.
347  std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
348 
349  /// A handle to the loop info.
350  MachineLoopInfo *MLI;
351 
352  /// Preferred loop exit.
353  /// Member variable for convenience. It may be removed by duplication deep
354  /// in the call stack.
355  MachineBasicBlock *PreferredLoopExit;
356 
357  /// A handle to the target's instruction info.
358  const TargetInstrInfo *TII;
359 
360  /// A handle to the target's lowering info.
361  const TargetLoweringBase *TLI;
362 
363  /// A handle to the post dominator tree.
365 
366  /// Duplicator used to duplicate tails during placement.
367  ///
368  /// Placement decisions can open up new tail duplication opportunities, but
369  /// since tail duplication affects placement decisions of later blocks, it
370  /// must be done inline.
371  TailDuplicator TailDup;
372 
373  /// Allocator and owner of BlockChain structures.
374  ///
375  /// We build BlockChains lazily while processing the loop structure of
376  /// a function. To reduce malloc traffic, we allocate them using this
377  /// slab-like allocator, and destroy them after the pass completes. An
378  /// important guarantee is that this allocator produces stable pointers to
379  /// the chains.
381 
382  /// Function wide BasicBlock to BlockChain mapping.
383  ///
384  /// This mapping allows efficiently moving from any given basic block to the
385  /// BlockChain it participates in, if any. We use it to, among other things,
386  /// allow implicitly defining edges between chains as the existing edges
387  /// between basic blocks.
389 
390 #ifndef NDEBUG
391  /// The set of basic blocks that have terminators that cannot be fully
392  /// analyzed. These basic blocks cannot be re-ordered safely by
393  /// MachineBlockPlacement, and we must preserve physical layout of these
394  /// blocks and their successors through the pass.
395  SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
396 #endif
397 
398  /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
399  /// if the count goes to 0, add them to the appropriate work list.
400  void markChainSuccessors(
401  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
402  const BlockFilterSet *BlockFilter = nullptr);
403 
404  /// Decrease the UnscheduledPredecessors count for a single block, and
405  /// if the count goes to 0, add them to the appropriate work list.
406  void markBlockSuccessors(
407  const BlockChain &Chain, const MachineBasicBlock *BB,
408  const MachineBasicBlock *LoopHeaderBB,
409  const BlockFilterSet *BlockFilter = nullptr);
410 
412  collectViableSuccessors(
413  const MachineBasicBlock *BB, const BlockChain &Chain,
414  const BlockFilterSet *BlockFilter,
416  bool shouldPredBlockBeOutlined(
417  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
418  const BlockChain &Chain, const BlockFilterSet *BlockFilter,
419  BranchProbability SuccProb, BranchProbability HotProb);
420  bool repeatedlyTailDuplicateBlock(
422  const MachineBasicBlock *LoopHeaderBB,
423  BlockChain &Chain, BlockFilterSet *BlockFilter,
424  MachineFunction::iterator &PrevUnplacedBlockIt);
425  bool maybeTailDuplicateBlock(
427  BlockChain &Chain, BlockFilterSet *BlockFilter,
428  MachineFunction::iterator &PrevUnplacedBlockIt,
429  bool &DuplicatedToLPred);
430  bool hasBetterLayoutPredecessor(
431  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
432  const BlockChain &SuccChain, BranchProbability SuccProb,
433  BranchProbability RealSuccProb, const BlockChain &Chain,
434  const BlockFilterSet *BlockFilter);
435  BlockAndTailDupResult selectBestSuccessor(
436  const MachineBasicBlock *BB, const BlockChain &Chain,
437  const BlockFilterSet *BlockFilter);
438  MachineBasicBlock *selectBestCandidateBlock(
439  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
440  MachineBasicBlock *getFirstUnplacedBlock(
441  const BlockChain &PlacedChain,
442  MachineFunction::iterator &PrevUnplacedBlockIt,
443  const BlockFilterSet *BlockFilter);
444 
445  /// Add a basic block to the work list if it is appropriate.
446  ///
447  /// If the optional parameter BlockFilter is provided, only MBB
448  /// present in the set will be added to the worklist. If nullptr
449  /// is provided, no filtering occurs.
450  void fillWorkLists(const MachineBasicBlock *MBB,
451  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
452  const BlockFilterSet *BlockFilter);
453 
454  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
455  BlockFilterSet *BlockFilter = nullptr);
456  bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
457  const MachineBasicBlock *OldTop);
458  bool hasViableTopFallthrough(const MachineBasicBlock *Top,
459  const BlockFilterSet &LoopBlockSet);
460  BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
461  const BlockFilterSet &LoopBlockSet);
462  BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
463  const MachineBasicBlock *OldTop,
464  const MachineBasicBlock *ExitBB,
465  const BlockFilterSet &LoopBlockSet);
466  MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
467  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
468  MachineBasicBlock *findBestLoopTop(
469  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
470  MachineBasicBlock *findBestLoopExit(
471  const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
472  BlockFrequency &ExitFreq);
473  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
474  void buildLoopChains(const MachineLoop &L);
475  void rotateLoop(
476  BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
477  BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
478  void rotateLoopWithProfile(
479  BlockChain &LoopChain, const MachineLoop &L,
480  const BlockFilterSet &LoopBlockSet);
481  void buildCFGChains();
482  void optimizeBranches();
483  void alignBlocks();
484  /// Returns true if a block should be tail-duplicated to increase fallthrough
485  /// opportunities.
486  bool shouldTailDuplicate(MachineBasicBlock *BB);
487  /// Check the edge frequencies to see if tail duplication will increase
488  /// fallthroughs.
489  bool isProfitableToTailDup(
490  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
491  BranchProbability QProb,
492  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
493 
494  /// Check for a trellis layout.
495  bool isTrellis(const MachineBasicBlock *BB,
496  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
497  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
498 
499  /// Get the best successor given a trellis layout.
500  BlockAndTailDupResult getBestTrellisSuccessor(
501  const MachineBasicBlock *BB,
502  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
503  BranchProbability AdjustedSumProb, const BlockChain &Chain,
504  const BlockFilterSet *BlockFilter);
505 
506  /// Get the best pair of non-conflicting edges.
507  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
508  const MachineBasicBlock *BB,
510 
511  /// Returns true if a block can tail duplicate into all unplaced
512  /// predecessors. Filters based on loop.
513  bool canTailDuplicateUnplacedPreds(
514  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
515  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
516 
517  /// Find chains of triangles to tail-duplicate where a global analysis works,
518  /// but a local analysis would not find them.
519  void precomputeTriangleChains();
520 
521 public:
522  static char ID; // Pass identification, replacement for typeid
523 
524  MachineBlockPlacement() : MachineFunctionPass(ID) {
526  }
527 
528  bool runOnMachineFunction(MachineFunction &F) override;
529 
530  bool allowTailDupPlacement() const {
531  assert(F);
533  }
534 
535  void getAnalysisUsage(AnalysisUsage &AU) const override {
538  if (TailDupPlacement)
543  }
544 };
545 
546 } // end anonymous namespace
547 
549 
551 
552 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
553  "Branch Probability Basic Block Placement", false, false)
558 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
559  "Branch Probability Basic Block Placement", false, false)
560 
561 #ifndef NDEBUG
562 /// Helper to print the name of a MBB.
563 ///
564 /// Only used by debug logging.
565 static std::string getBlockName(const MachineBasicBlock *BB) {
566  std::string Result;
567  raw_string_ostream OS(Result);
568  OS << printMBBReference(*BB);
569  OS << " ('" << BB->getName() << "')";
570  OS.flush();
571  return Result;
572 }
573 #endif
574 
575 /// Mark a chain's successors as having one fewer preds.
576 ///
577 /// When a chain is being merged into the "placed" chain, this routine will
578 /// quickly walk the successors of each block in the chain and mark them as
579 /// having one fewer active predecessor. It also adds any successors of this
580 /// chain which reach the zero-predecessor state to the appropriate worklist.
581 void MachineBlockPlacement::markChainSuccessors(
582  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
583  const BlockFilterSet *BlockFilter) {
584  // Walk all the blocks in this chain, marking their successors as having
585  // a predecessor placed.
586  for (MachineBasicBlock *MBB : Chain) {
587  markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
588  }
589 }
590 
591 /// Mark a single block's successors as having one fewer preds.
592 ///
593 /// Under normal circumstances, this is only called by markChainSuccessors,
594 /// but if a block that was to be placed is completely tail-duplicated away,
595 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
596 /// for just that block.
597 void MachineBlockPlacement::markBlockSuccessors(
598  const BlockChain &Chain, const MachineBasicBlock *MBB,
599  const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
600  // Add any successors for which this is the only un-placed in-loop
601  // predecessor to the worklist as a viable candidate for CFG-neutral
602  // placement. No subsequent placement of this block will violate the CFG
603  // shape, so we get to use heuristics to choose a favorable placement.
604  for (MachineBasicBlock *Succ : MBB->successors()) {
605  if (BlockFilter && !BlockFilter->count(Succ))
606  continue;
607  BlockChain &SuccChain = *BlockToChain[Succ];
608  // Disregard edges within a fixed chain, or edges to the loop header.
609  if (&Chain == &SuccChain || Succ == LoopHeaderBB)
610  continue;
611 
612  // This is a cross-chain edge that is within the loop, so decrement the
613  // loop predecessor count of the destination chain.
614  if (SuccChain.UnscheduledPredecessors == 0 ||
615  --SuccChain.UnscheduledPredecessors > 0)
616  continue;
617 
618  auto *NewBB = *SuccChain.begin();
619  if (NewBB->isEHPad())
620  EHPadWorkList.push_back(NewBB);
621  else
622  BlockWorkList.push_back(NewBB);
623  }
624 }
625 
626 /// This helper function collects the set of successors of block
627 /// \p BB that are allowed to be its layout successors, and return
628 /// the total branch probability of edges from \p BB to those
629 /// blocks.
630 BranchProbability MachineBlockPlacement::collectViableSuccessors(
631  const MachineBasicBlock *BB, const BlockChain &Chain,
632  const BlockFilterSet *BlockFilter,
634  // Adjust edge probabilities by excluding edges pointing to blocks that is
635  // either not in BlockFilter or is already in the current chain. Consider the
636  // following CFG:
637  //
638  // --->A
639  // | / \
640  // | B C
641  // | \ / \
642  // ----D E
643  //
644  // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
645  // A->C is chosen as a fall-through, D won't be selected as a successor of C
646  // due to CFG constraint (the probability of C->D is not greater than
647  // HotProb to break topo-order). If we exclude E that is not in BlockFilter
648  // when calculating the probability of C->D, D will be selected and we
649  // will get A C D B as the layout of this loop.
650  auto AdjustedSumProb = BranchProbability::getOne();
651  for (MachineBasicBlock *Succ : BB->successors()) {
652  bool SkipSucc = false;
653  if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
654  SkipSucc = true;
655  } else {
656  BlockChain *SuccChain = BlockToChain[Succ];
657  if (SuccChain == &Chain) {
658  SkipSucc = true;
659  } else if (Succ != *SuccChain->begin()) {
660  LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
661  << " -> Mid chain!\n");
662  continue;
663  }
664  }
665  if (SkipSucc)
666  AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
667  else
668  Successors.push_back(Succ);
669  }
670 
671  return AdjustedSumProb;
672 }
673 
674 /// The helper function returns the branch probability that is adjusted
675 /// or normalized over the new total \p AdjustedSumProb.
676 static BranchProbability
678  BranchProbability AdjustedSumProb) {
679  BranchProbability SuccProb;
680  uint32_t SuccProbN = OrigProb.getNumerator();
681  uint32_t SuccProbD = AdjustedSumProb.getNumerator();
682  if (SuccProbN >= SuccProbD)
683  SuccProb = BranchProbability::getOne();
684  else
685  SuccProb = BranchProbability(SuccProbN, SuccProbD);
686 
687  return SuccProb;
688 }
689 
690 /// Check if \p BB has exactly the successors in \p Successors.
691 static bool
694  if (BB.succ_size() != Successors.size())
695  return false;
696  // We don't want to count self-loops
697  if (Successors.count(&BB))
698  return false;
699  for (MachineBasicBlock *Succ : BB.successors())
700  if (!Successors.count(Succ))
701  return false;
702  return true;
703 }
704 
705 /// Check if a block should be tail duplicated to increase fallthrough
706 /// opportunities.
707 /// \p BB Block to check.
708 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
709  // Blocks with single successors don't create additional fallthrough
710  // opportunities. Don't duplicate them. TODO: When conditional exits are
711  // analyzable, allow them to be duplicated.
712  bool IsSimple = TailDup.isSimpleBB(BB);
713 
714  if (BB->succ_size() == 1)
715  return false;
716  return TailDup.shouldTailDuplicate(IsSimple, *BB);
717 }
718 
719 /// Compare 2 BlockFrequency's with a small penalty for \p A.
720 /// In order to be conservative, we apply a X% penalty to account for
721 /// increased icache pressure and static heuristics. For small frequencies
722 /// we use only the numerators to improve accuracy. For simplicity, we assume the
723 /// penalty is less than 100%
724 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
726  uint64_t EntryFreq) {
727  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
728  BlockFrequency Gain = A - B;
729  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
730 }
731 
732 /// Check the edge frequencies to see if tail duplication will increase
733 /// fallthroughs. It only makes sense to call this function when
734 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
735 /// always locally profitable if we would have picked \p Succ without
736 /// considering duplication.
737 bool MachineBlockPlacement::isProfitableToTailDup(
738  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
739  BranchProbability QProb,
740  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
741  // We need to do a probability calculation to make sure this is profitable.
742  // First: does succ have a successor that post-dominates? This affects the
743  // calculation. The 2 relevant cases are:
744  // BB BB
745  // | \Qout | \Qout
746  // P| C |P C
747  // = C' = C'
748  // | /Qin | /Qin
749  // | / | /
750  // Succ Succ
751  // / \ | \ V
752  // U/ =V |U \
753  // / \ = D
754  // D E | /
755  // | /
756  // |/
757  // PDom
758  // '=' : Branch taken for that CFG edge
759  // In the second case, Placing Succ while duplicating it into C prevents the
760  // fallthrough of Succ into either D or PDom, because they now have C as an
761  // unplaced predecessor
762 
763  // Start by figuring out which case we fall into
764  MachineBasicBlock *PDom = nullptr;
766  // Only scan the relevant successors
767  auto AdjustedSuccSumProb =
768  collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
769  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
770  auto BBFreq = MBFI->getBlockFreq(BB);
771  auto SuccFreq = MBFI->getBlockFreq(Succ);
772  BlockFrequency P = BBFreq * PProb;
773  BlockFrequency Qout = BBFreq * QProb;
774  uint64_t EntryFreq = MBFI->getEntryFreq();
775  // If there are no more successors, it is profitable to copy, as it strictly
776  // increases fallthrough.
777  if (SuccSuccs.size() == 0)
778  return greaterWithBias(P, Qout, EntryFreq);
779 
780  auto BestSuccSucc = BranchProbability::getZero();
781  // Find the PDom or the best Succ if no PDom exists.
782  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
783  auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
784  if (Prob > BestSuccSucc)
785  BestSuccSucc = Prob;
786  if (PDom == nullptr)
787  if (MPDT->dominates(SuccSucc, Succ)) {
788  PDom = SuccSucc;
789  break;
790  }
791  }
792  // For the comparisons, we need to know Succ's best incoming edge that isn't
793  // from BB.
794  auto SuccBestPred = BlockFrequency(0);
795  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
796  if (SuccPred == Succ || SuccPred == BB
797  || BlockToChain[SuccPred] == &Chain
798  || (BlockFilter && !BlockFilter->count(SuccPred)))
799  continue;
800  auto Freq = MBFI->getBlockFreq(SuccPred)
801  * MBPI->getEdgeProbability(SuccPred, Succ);
802  if (Freq > SuccBestPred)
803  SuccBestPred = Freq;
804  }
805  // Qin is Succ's best unplaced incoming edge that isn't BB
806  BlockFrequency Qin = SuccBestPred;
807  // If it doesn't have a post-dominating successor, here is the calculation:
808  // BB BB
809  // | \Qout | \
810  // P| C | =
811  // = C' | C
812  // | /Qin | |
813  // | / | C' (+Succ)
814  // Succ Succ /|
815  // / \ | \/ |
816  // U/ =V | == |
817  // / \ | / \|
818  // D E D E
819  // '=' : Branch taken for that CFG edge
820  // Cost in the first case is: P + V
821  // For this calculation, we always assume P > Qout. If Qout > P
822  // The result of this function will be ignored at the caller.
823  // Let F = SuccFreq - Qin
824  // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
825 
826  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
827  BranchProbability UProb = BestSuccSucc;
828  BranchProbability VProb = AdjustedSuccSumProb - UProb;
829  BlockFrequency F = SuccFreq - Qin;
830  BlockFrequency V = SuccFreq * VProb;
831  BlockFrequency QinU = std::min(Qin, F) * UProb;
832  BlockFrequency BaseCost = P + V;
833  BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
834  return greaterWithBias(BaseCost, DupCost, EntryFreq);
835  }
836  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
837  BranchProbability VProb = AdjustedSuccSumProb - UProb;
838  BlockFrequency U = SuccFreq * UProb;
839  BlockFrequency V = SuccFreq * VProb;
840  BlockFrequency F = SuccFreq - Qin;
841  // If there is a post-dominating successor, here is the calculation:
842  // BB BB BB BB
843  // | \Qout | \ | \Qout | \
844  // |P C | = |P C | =
845  // = C' |P C = C' |P C
846  // | /Qin | | | /Qin | |
847  // | / | C' (+Succ) | / | C' (+Succ)
848  // Succ Succ /| Succ Succ /|
849  // | \ V | \/ | | \ V | \/ |
850  // |U \ |U /\ =? |U = |U /\ |
851  // = D = = =?| | D | = =|
852  // | / |/ D | / |/ D
853  // | / | / | = | /
854  // |/ | / |/ | =
855  // Dom Dom Dom Dom
856  // '=' : Branch taken for that CFG edge
857  // The cost for taken branches in the first case is P + U
858  // Let F = SuccFreq - Qin
859  // The cost in the second case (assuming independence), given the layout:
860  // BB, Succ, (C+Succ), D, Dom or the layout:
861  // BB, Succ, D, Dom, (C+Succ)
862  // is Qout + max(F, Qin) * U + min(F, Qin)
863  // compare P + U vs Qout + P * U + Qin.
864  //
865  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
866  //
867  // For the 3rd case, the cost is P + 2 * V
868  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
869  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
870  if (UProb > AdjustedSuccSumProb / 2 &&
871  !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
872  Chain, BlockFilter))
873  // Cases 3 & 4
874  return greaterWithBias(
875  (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
876  EntryFreq);
877  // Cases 1 & 2
878  return greaterWithBias((P + U),
879  (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
880  std::max(Qin, F) * UProb),
881  EntryFreq);
882 }
883 
884 /// Check for a trellis layout. \p BB is the upper part of a trellis if its
885 /// successors form the lower part of a trellis. A successor set S forms the
886 /// lower part of a trellis if all of the predecessors of S are either in S or
887 /// have all of S as successors. We ignore trellises where BB doesn't have 2
888 /// successors because for fewer than 2, it's trivial, and for 3 or greater they
889 /// are very uncommon and complex to compute optimally. Allowing edges within S
890 /// is not strictly a trellis, but the same algorithm works, so we allow it.
891 bool MachineBlockPlacement::isTrellis(
892  const MachineBasicBlock *BB,
893  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
894  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
895  // Technically BB could form a trellis with branching factor higher than 2.
896  // But that's extremely uncommon.
897  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
898  return false;
899 
901  BB->succ_end());
902  // To avoid reviewing the same predecessors twice.
904 
905  for (MachineBasicBlock *Succ : ViableSuccs) {
906  int PredCount = 0;
907  for (auto SuccPred : Succ->predecessors()) {
908  // Allow triangle successors, but don't count them.
909  if (Successors.count(SuccPred)) {
910  // Make sure that it is actually a triangle.
911  for (MachineBasicBlock *CheckSucc : SuccPred->successors())
912  if (!Successors.count(CheckSucc))
913  return false;
914  continue;
915  }
916  const BlockChain *PredChain = BlockToChain[SuccPred];
917  if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
918  PredChain == &Chain || PredChain == BlockToChain[Succ])
919  continue;
920  ++PredCount;
921  // Perform the successor check only once.
922  if (!SeenPreds.insert(SuccPred).second)
923  continue;
924  if (!hasSameSuccessors(*SuccPred, Successors))
925  return false;
926  }
927  // If one of the successors has only BB as a predecessor, it is not a
928  // trellis.
929  if (PredCount < 1)
930  return false;
931  }
932  return true;
933 }
934 
935 /// Pick the highest total weight pair of edges that can both be laid out.
936 /// The edges in \p Edges[0] are assumed to have a different destination than
937 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
938 /// the individual highest weight edges to the 2 different destinations, or in
939 /// case of a conflict, one of them should be replaced with a 2nd best edge.
940 std::pair<MachineBlockPlacement::WeightedEdge,
941  MachineBlockPlacement::WeightedEdge>
942 MachineBlockPlacement::getBestNonConflictingEdges(
943  const MachineBasicBlock *BB,
945  Edges) {
946  // Sort the edges, and then for each successor, find the best incoming
947  // predecessor. If the best incoming predecessors aren't the same,
948  // then that is clearly the best layout. If there is a conflict, one of the
949  // successors will have to fallthrough from the second best predecessor. We
950  // compare which combination is better overall.
951 
952  // Sort for highest frequency.
953  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
954 
955  llvm::stable_sort(Edges[0], Cmp);
956  llvm::stable_sort(Edges[1], Cmp);
957  auto BestA = Edges[0].begin();
958  auto BestB = Edges[1].begin();
959  // Arrange for the correct answer to be in BestA and BestB
960  // If the 2 best edges don't conflict, the answer is already there.
961  if (BestA->Src == BestB->Src) {
962  // Compare the total fallthrough of (Best + Second Best) for both pairs
963  auto SecondBestA = std::next(BestA);
964  auto SecondBestB = std::next(BestB);
965  BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
966  BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
967  if (BestAScore < BestBScore)
968  BestA = SecondBestA;
969  else
970  BestB = SecondBestB;
971  }
972  // Arrange for the BB edge to be in BestA if it exists.
973  if (BestB->Src == BB)
974  std::swap(BestA, BestB);
975  return std::make_pair(*BestA, *BestB);
976 }
977 
978 /// Get the best successor from \p BB based on \p BB being part of a trellis.
979 /// We only handle trellises with 2 successors, so the algorithm is
980 /// straightforward: Find the best pair of edges that don't conflict. We find
981 /// the best incoming edge for each successor in the trellis. If those conflict,
982 /// we consider which of them should be replaced with the second best.
983 /// Upon return the two best edges will be in \p BestEdges. If one of the edges
984 /// comes from \p BB, it will be in \p BestEdges[0]
985 MachineBlockPlacement::BlockAndTailDupResult
986 MachineBlockPlacement::getBestTrellisSuccessor(
987  const MachineBasicBlock *BB,
988  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
989  BranchProbability AdjustedSumProb, const BlockChain &Chain,
990  const BlockFilterSet *BlockFilter) {
991 
992  BlockAndTailDupResult Result = {nullptr, false};
994  BB->succ_end());
995 
996  // We assume size 2 because it's common. For general n, we would have to do
997  // the Hungarian algorithm, but it's not worth the complexity because more
998  // than 2 successors is fairly uncommon, and a trellis even more so.
999  if (Successors.size() != 2 || ViableSuccs.size() != 2)
1000  return Result;
1001 
1002  // Collect the edge frequencies of all edges that form the trellis.
1004  int SuccIndex = 0;
1005  for (auto Succ : ViableSuccs) {
1006  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1007  // Skip any placed predecessors that are not BB
1008  if (SuccPred != BB)
1009  if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
1010  BlockToChain[SuccPred] == &Chain ||
1011  BlockToChain[SuccPred] == BlockToChain[Succ])
1012  continue;
1013  BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1014  MBPI->getEdgeProbability(SuccPred, Succ);
1015  Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1016  }
1017  ++SuccIndex;
1018  }
1019 
1020  // Pick the best combination of 2 edges from all the edges in the trellis.
1021  WeightedEdge BestA, BestB;
1022  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1023 
1024  if (BestA.Src != BB) {
1025  // If we have a trellis, and BB doesn't have the best fallthrough edges,
1026  // we shouldn't choose any successor. We've already looked and there's a
1027  // better fallthrough edge for all the successors.
1028  LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1029  return Result;
1030  }
1031 
1032  // Did we pick the triangle edge? If tail-duplication is profitable, do
1033  // that instead. Otherwise merge the triangle edge now while we know it is
1034  // optimal.
1035  if (BestA.Dest == BestB.Src) {
1036  // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1037  // would be better.
1038  MachineBasicBlock *Succ1 = BestA.Dest;
1039  MachineBasicBlock *Succ2 = BestB.Dest;
1040  // Check to see if tail-duplication would be profitable.
1041  if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1042  canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1043  isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1044  Chain, BlockFilter)) {
1046  MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1047  dbgs() << " Selected: " << getBlockName(Succ2)
1048  << ", probability: " << Succ2Prob
1049  << " (Tail Duplicate)\n");
1050  Result.BB = Succ2;
1051  Result.ShouldTailDup = true;
1052  return Result;
1053  }
1054  }
1055  // We have already computed the optimal edge for the other side of the
1056  // trellis.
1057  ComputedEdges[BestB.Src] = { BestB.Dest, false };
1058 
1059  auto TrellisSucc = BestA.Dest;
1061  MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1062  dbgs() << " Selected: " << getBlockName(TrellisSucc)
1063  << ", probability: " << SuccProb << " (Trellis)\n");
1064  Result.BB = TrellisSucc;
1065  return Result;
1066 }
1067 
1068 /// When the option allowTailDupPlacement() is on, this method checks if the
1069 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1070 /// into all of its unplaced, unfiltered predecessors, that are not BB.
1071 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1072  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1073  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1074  if (!shouldTailDuplicate(Succ))
1075  return false;
1076 
1077  // For CFG checking.
1079  BB->succ_end());
1080  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1081  // Make sure all unplaced and unfiltered predecessors can be
1082  // tail-duplicated into.
1083  // Skip any blocks that are already placed or not in this loop.
1084  if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1085  || BlockToChain[Pred] == &Chain)
1086  continue;
1087  if (!TailDup.canTailDuplicate(Succ, Pred)) {
1088  if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1089  // This will result in a trellis after tail duplication, so we don't
1090  // need to copy Succ into this predecessor. In the presence
1091  // of a trellis tail duplication can continue to be profitable.
1092  // For example:
1093  // A A
1094  // |\ |\
1095  // | \ | \
1096  // | C | C+BB
1097  // | / | |
1098  // |/ | |
1099  // BB => BB |
1100  // |\ |\/|
1101  // | \ |/\|
1102  // | D | D
1103  // | / | /
1104  // |/ |/
1105  // Succ Succ
1106  //
1107  // After BB was duplicated into C, the layout looks like the one on the
1108  // right. BB and C now have the same successors. When considering
1109  // whether Succ can be duplicated into all its unplaced predecessors, we
1110  // ignore C.
1111  // We can do this because C already has a profitable fallthrough, namely
1112  // D. TODO(iteratee): ignore sufficiently cold predecessors for
1113  // duplication and for this test.
1114  //
1115  // This allows trellises to be laid out in 2 separate chains
1116  // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1117  // because it allows the creation of 2 fallthrough paths with links
1118  // between them, and we correctly identify the best layout for these
1119  // CFGs. We want to extend trellises that the user created in addition
1120  // to trellises created by tail-duplication, so we just look for the
1121  // CFG.
1122  continue;
1123  return false;
1124  }
1125  }
1126  return true;
1127 }
1128 
1129 /// Find chains of triangles where we believe it would be profitable to
1130 /// tail-duplicate them all, but a local analysis would not find them.
1131 /// There are 3 ways this can be profitable:
1132 /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1133 /// longer chains)
1134 /// 2) The chains are statically correlated. Branch probabilities have a very
1135 /// U-shaped distribution.
1136 /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1137 /// If the branches in a chain are likely to be from the same side of the
1138 /// distribution as their predecessor, but are independent at runtime, this
1139 /// transformation is profitable. (Because the cost of being wrong is a small
1140 /// fixed cost, unlike the standard triangle layout where the cost of being
1141 /// wrong scales with the # of triangles.)
1142 /// 3) The chains are dynamically correlated. If the probability that a previous
1143 /// branch was taken positively influences whether the next branch will be
1144 /// taken
1145 /// We believe that 2 and 3 are common enough to justify the small margin in 1.
1146 void MachineBlockPlacement::precomputeTriangleChains() {
1147  struct TriangleChain {
1148  std::vector<MachineBasicBlock *> Edges;
1149 
1150  TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1151  : Edges({src, dst}) {}
1152 
1153  void append(MachineBasicBlock *dst) {
1154  assert(getKey()->isSuccessor(dst) &&
1155  "Attempting to append a block that is not a successor.");
1156  Edges.push_back(dst);
1157  }
1158 
1159  unsigned count() const { return Edges.size() - 1; }
1160 
1161  MachineBasicBlock *getKey() const {
1162  return Edges.back();
1163  }
1164  };
1165 
1166  if (TriangleChainCount == 0)
1167  return;
1168 
1169  LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1170  // Map from last block to the chain that contains it. This allows us to extend
1171  // chains as we find new triangles.
1173  for (MachineBasicBlock &BB : *F) {
1174  // If BB doesn't have 2 successors, it doesn't start a triangle.
1175  if (BB.succ_size() != 2)
1176  continue;
1177  MachineBasicBlock *PDom = nullptr;
1178  for (MachineBasicBlock *Succ : BB.successors()) {
1179  if (!MPDT->dominates(Succ, &BB))
1180  continue;
1181  PDom = Succ;
1182  break;
1183  }
1184  // If BB doesn't have a post-dominating successor, it doesn't form a
1185  // triangle.
1186  if (PDom == nullptr)
1187  continue;
1188  // If PDom has a hint that it is low probability, skip this triangle.
1189  if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1190  continue;
1191  // If PDom isn't eligible for duplication, this isn't the kind of triangle
1192  // we're looking for.
1193  if (!shouldTailDuplicate(PDom))
1194  continue;
1195  bool CanTailDuplicate = true;
1196  // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1197  // isn't the kind of triangle we're looking for.
1198  for (MachineBasicBlock* Pred : PDom->predecessors()) {
1199  if (Pred == &BB)
1200  continue;
1201  if (!TailDup.canTailDuplicate(PDom, Pred)) {
1202  CanTailDuplicate = false;
1203  break;
1204  }
1205  }
1206  // If we can't tail-duplicate PDom to its predecessors, then skip this
1207  // triangle.
1208  if (!CanTailDuplicate)
1209  continue;
1210 
1211  // Now we have an interesting triangle. Insert it if it's not part of an
1212  // existing chain.
1213  // Note: This cannot be replaced with a call insert() or emplace() because
1214  // the find key is BB, but the insert/emplace key is PDom.
1215  auto Found = TriangleChainMap.find(&BB);
1216  // If it is, remove the chain from the map, grow it, and put it back in the
1217  // map with the end as the new key.
1218  if (Found != TriangleChainMap.end()) {
1219  TriangleChain Chain = std::move(Found->second);
1220  TriangleChainMap.erase(Found);
1221  Chain.append(PDom);
1222  TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1223  } else {
1224  auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1225  assert(InsertResult.second && "Block seen twice.");
1226  (void)InsertResult;
1227  }
1228  }
1229 
1230  // Iterating over a DenseMap is safe here, because the only thing in the body
1231  // of the loop is inserting into another DenseMap (ComputedEdges).
1232  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1233  for (auto &ChainPair : TriangleChainMap) {
1234  TriangleChain &Chain = ChainPair.second;
1235  // Benchmarking has shown that due to branch correlation duplicating 2 or
1236  // more triangles is profitable, despite the calculations assuming
1237  // independence.
1238  if (Chain.count() < TriangleChainCount)
1239  continue;
1240  MachineBasicBlock *dst = Chain.Edges.back();
1241  Chain.Edges.pop_back();
1242  for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1243  LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1244  << getBlockName(dst)
1245  << " as pre-computed based on triangles.\n");
1246 
1247  auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1248  assert(InsertResult.second && "Block seen twice.");
1249  (void)InsertResult;
1250 
1251  dst = src;
1252  }
1253  }
1254 }
1255 
1256 // When profile is not present, return the StaticLikelyProb.
1257 // When profile is available, we need to handle the triangle-shape CFG.
1259  const MachineBasicBlock *BB) {
1260  if (!BB->getParent()->getFunction().hasProfileData())
1261  return BranchProbability(StaticLikelyProb, 100);
1262  if (BB->succ_size() == 2) {
1263  const MachineBasicBlock *Succ1 = *BB->succ_begin();
1264  const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1265  if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1266  /* See case 1 below for the cost analysis. For BB->Succ to
1267  * be taken with smaller cost, the following needs to hold:
1268  * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1269  * So the threshold T in the calculation below
1270  * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1271  * So T / (1 - T) = 2, Yielding T = 2/3
1272  * Also adding user specified branch bias, we have
1273  * T = (2/3)*(ProfileLikelyProb/50)
1274  * = (2*ProfileLikelyProb)/150)
1275  */
1276  return BranchProbability(2 * ProfileLikelyProb, 150);
1277  }
1278  }
1279  return BranchProbability(ProfileLikelyProb, 100);
1280 }
1281 
1282 /// Checks to see if the layout candidate block \p Succ has a better layout
1283 /// predecessor than \c BB. If yes, returns true.
1284 /// \p SuccProb: The probability adjusted for only remaining blocks.
1285 /// Only used for logging
1286 /// \p RealSuccProb: The un-adjusted probability.
1287 /// \p Chain: The chain that BB belongs to and Succ is being considered for.
1288 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1289 /// considered
1290 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1291  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1292  const BlockChain &SuccChain, BranchProbability SuccProb,
1293  BranchProbability RealSuccProb, const BlockChain &Chain,
1294  const BlockFilterSet *BlockFilter) {
1295 
1296  // There isn't a better layout when there are no unscheduled predecessors.
1297  if (SuccChain.UnscheduledPredecessors == 0)
1298  return false;
1299 
1300  // There are two basic scenarios here:
1301  // -------------------------------------
1302  // Case 1: triangular shape CFG (if-then):
1303  // BB
1304  // | \
1305  // | \
1306  // | Pred
1307  // | /
1308  // Succ
1309  // In this case, we are evaluating whether to select edge -> Succ, e.g.
1310  // set Succ as the layout successor of BB. Picking Succ as BB's
1311  // successor breaks the CFG constraints (FIXME: define these constraints).
1312  // With this layout, Pred BB
1313  // is forced to be outlined, so the overall cost will be cost of the
1314  // branch taken from BB to Pred, plus the cost of back taken branch
1315  // from Pred to Succ, as well as the additional cost associated
1316  // with the needed unconditional jump instruction from Pred To Succ.
1317 
1318  // The cost of the topological order layout is the taken branch cost
1319  // from BB to Succ, so to make BB->Succ a viable candidate, the following
1320  // must hold:
1321  // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1322  // < freq(BB->Succ) * taken_branch_cost.
1323  // Ignoring unconditional jump cost, we get
1324  // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1325  // prob(BB->Succ) > 2 * prob(BB->Pred)
1326  //
1327  // When real profile data is available, we can precisely compute the
1328  // probability threshold that is needed for edge BB->Succ to be considered.
1329  // Without profile data, the heuristic requires the branch bias to be
1330  // a lot larger to make sure the signal is very strong (e.g. 80% default).
1331  // -----------------------------------------------------------------
1332  // Case 2: diamond like CFG (if-then-else):
1333  // S
1334  // / \
1335  // | \
1336  // BB Pred
1337  // \ /
1338  // Succ
1339  // ..
1340  //
1341  // The current block is BB and edge BB->Succ is now being evaluated.
1342  // Note that edge S->BB was previously already selected because
1343  // prob(S->BB) > prob(S->Pred).
1344  // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1345  // choose Pred, we will have a topological ordering as shown on the left
1346  // in the picture below. If we choose Succ, we have the solution as shown
1347  // on the right:
1348  //
1349  // topo-order:
1350  //
1351  // S----- ---S
1352  // | | | |
1353  // ---BB | | BB
1354  // | | | |
1355  // | Pred-- | Succ--
1356  // | | | |
1357  // ---Succ ---Pred--
1358  //
1359  // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1360  // = freq(S->Pred) + freq(S->BB)
1361  //
1362  // If we have profile data (i.e, branch probabilities can be trusted), the
1363  // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1364  // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1365  // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1366  // means the cost of topological order is greater.
1367  // When profile data is not available, however, we need to be more
1368  // conservative. If the branch prediction is wrong, breaking the topo-order
1369  // will actually yield a layout with large cost. For this reason, we need
1370  // strong biased branch at block S with Prob(S->BB) in order to select
1371  // BB->Succ. This is equivalent to looking the CFG backward with backward
1372  // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1373  // profile data).
1374  // --------------------------------------------------------------------------
1375  // Case 3: forked diamond
1376  // S
1377  // / \
1378  // / \
1379  // BB Pred
1380  // | \ / |
1381  // | \ / |
1382  // | X |
1383  // | / \ |
1384  // | / \ |
1385  // S1 S2
1386  //
1387  // The current block is BB and edge BB->S1 is now being evaluated.
1388  // As above S->BB was already selected because
1389  // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1390  //
1391  // topo-order:
1392  //
1393  // S-------| ---S
1394  // | | | |
1395  // ---BB | | BB
1396  // | | | |
1397  // | Pred----| | S1----
1398  // | | | |
1399  // --(S1 or S2) ---Pred--
1400  // |
1401  // S2
1402  //
1403  // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1404  // + min(freq(Pred->S1), freq(Pred->S2))
1405  // Non-topo-order cost:
1406  // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1407  // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1408  // is 0. Then the non topo layout is better when
1409  // freq(S->Pred) < freq(BB->S1).
1410  // This is exactly what is checked below.
1411  // Note there are other shapes that apply (Pred may not be a single block,
1412  // but they all fit this general pattern.)
1414 
1415  // Make sure that a hot successor doesn't have a globally more
1416  // important predecessor.
1417  BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1418  bool BadCFGConflict = false;
1419 
1420  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1421  if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
1422  (BlockFilter && !BlockFilter->count(Pred)) ||
1423  BlockToChain[Pred] == &Chain ||
1424  // This check is redundant except for look ahead. This function is
1425  // called for lookahead by isProfitableToTailDup when BB hasn't been
1426  // placed yet.
1427  (Pred == BB))
1428  continue;
1429  // Do backward checking.
1430  // For all cases above, we need a backward checking to filter out edges that
1431  // are not 'strongly' biased.
1432  // BB Pred
1433  // \ /
1434  // Succ
1435  // We select edge BB->Succ if
1436  // freq(BB->Succ) > freq(Succ) * HotProb
1437  // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1438  // HotProb
1439  // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1440  // Case 1 is covered too, because the first equation reduces to:
1441  // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1442  BlockFrequency PredEdgeFreq =
1443  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1444  if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1445  BadCFGConflict = true;
1446  break;
1447  }
1448  }
1449 
1450  if (BadCFGConflict) {
1451  LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1452  << SuccProb << " (prob) (non-cold CFG conflict)\n");
1453  return true;
1454  }
1455 
1456  return false;
1457 }
1458 
1459 /// Select the best successor for a block.
1460 ///
1461 /// This looks across all successors of a particular block and attempts to
1462 /// select the "best" one to be the layout successor. It only considers direct
1463 /// successors which also pass the block filter. It will attempt to avoid
1464 /// breaking CFG structure, but cave and break such structures in the case of
1465 /// very hot successor edges.
1466 ///
1467 /// \returns The best successor block found, or null if none are viable, along
1468 /// with a boolean indicating if tail duplication is necessary.
1469 MachineBlockPlacement::BlockAndTailDupResult
1470 MachineBlockPlacement::selectBestSuccessor(
1471  const MachineBasicBlock *BB, const BlockChain &Chain,
1472  const BlockFilterSet *BlockFilter) {
1473  const BranchProbability HotProb(StaticLikelyProb, 100);
1474 
1475  BlockAndTailDupResult BestSucc = { nullptr, false };
1476  auto BestProb = BranchProbability::getZero();
1477 
1479  auto AdjustedSumProb =
1480  collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1481 
1482  LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1483  << "\n");
1484 
1485  // if we already precomputed the best successor for BB, return that if still
1486  // applicable.
1487  auto FoundEdge = ComputedEdges.find(BB);
1488  if (FoundEdge != ComputedEdges.end()) {
1489  MachineBasicBlock *Succ = FoundEdge->second.BB;
1490  ComputedEdges.erase(FoundEdge);
1491  BlockChain *SuccChain = BlockToChain[Succ];
1492  if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1493  SuccChain != &Chain && Succ == *SuccChain->begin())
1494  return FoundEdge->second;
1495  }
1496 
1497  // if BB is part of a trellis, Use the trellis to determine the optimal
1498  // fallthrough edges
1499  if (isTrellis(BB, Successors, Chain, BlockFilter))
1500  return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1501  BlockFilter);
1502 
1503  // For blocks with CFG violations, we may be able to lay them out anyway with
1504  // tail-duplication. We keep this vector so we can perform the probability
1505  // calculations the minimum number of times.
1507  DupCandidates;
1508  for (MachineBasicBlock *Succ : Successors) {
1509  auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1510  BranchProbability SuccProb =
1511  getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1512 
1513  BlockChain &SuccChain = *BlockToChain[Succ];
1514  // Skip the edge \c BB->Succ if block \c Succ has a better layout
1515  // predecessor that yields lower global cost.
1516  if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1517  Chain, BlockFilter)) {
1518  // If tail duplication would make Succ profitable, place it.
1519  if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1520  DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
1521  continue;
1522  }
1523 
1524  LLVM_DEBUG(
1525  dbgs() << " Candidate: " << getBlockName(Succ)
1526  << ", probability: " << SuccProb
1527  << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1528  << "\n");
1529 
1530  if (BestSucc.BB && BestProb >= SuccProb) {
1531  LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1532  continue;
1533  }
1534 
1535  LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1536  BestSucc.BB = Succ;
1537  BestProb = SuccProb;
1538  }
1539  // Handle the tail duplication candidates in order of decreasing probability.
1540  // Stop at the first one that is profitable. Also stop if they are less
1541  // profitable than BestSucc. Position is important because we preserve it and
1542  // prefer first best match. Here we aren't comparing in order, so we capture
1543  // the position instead.
1544  llvm::stable_sort(DupCandidates,
1545  [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1546  std::tuple<BranchProbability, MachineBasicBlock *> R) {
1547  return std::get<0>(L) > std::get<0>(R);
1548  });
1549  for (auto &Tup : DupCandidates) {
1550  BranchProbability DupProb;
1551  MachineBasicBlock *Succ;
1552  std::tie(DupProb, Succ) = Tup;
1553  if (DupProb < BestProb)
1554  break;
1555  if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1556  && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1557  LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1558  << ", probability: " << DupProb
1559  << " (Tail Duplicate)\n");
1560  BestSucc.BB = Succ;
1561  BestSucc.ShouldTailDup = true;
1562  break;
1563  }
1564  }
1565 
1566  if (BestSucc.BB)
1567  LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1568 
1569  return BestSucc;
1570 }
1571 
1572 /// Select the best block from a worklist.
1573 ///
1574 /// This looks through the provided worklist as a list of candidate basic
1575 /// blocks and select the most profitable one to place. The definition of
1576 /// profitable only really makes sense in the context of a loop. This returns
1577 /// the most frequently visited block in the worklist, which in the case of
1578 /// a loop, is the one most desirable to be physically close to the rest of the
1579 /// loop body in order to improve i-cache behavior.
1580 ///
1581 /// \returns The best block found, or null if none are viable.
1582 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1583  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1584  // Once we need to walk the worklist looking for a candidate, cleanup the
1585  // worklist of already placed entries.
1586  // FIXME: If this shows up on profiles, it could be folded (at the cost of
1587  // some code complexity) into the loop below.
1588  WorkList.erase(llvm::remove_if(WorkList,
1589  [&](MachineBasicBlock *BB) {
1590  return BlockToChain.lookup(BB) == &Chain;
1591  }),
1592  WorkList.end());
1593 
1594  if (WorkList.empty())
1595  return nullptr;
1596 
1597  bool IsEHPad = WorkList[0]->isEHPad();
1598 
1599  MachineBasicBlock *BestBlock = nullptr;
1600  BlockFrequency BestFreq;
1601  for (MachineBasicBlock *MBB : WorkList) {
1602  assert(MBB->isEHPad() == IsEHPad &&
1603  "EHPad mismatch between block and work list.");
1604 
1605  BlockChain &SuccChain = *BlockToChain[MBB];
1606  if (&SuccChain == &Chain)
1607  continue;
1608 
1609  assert(SuccChain.UnscheduledPredecessors == 0 &&
1610  "Found CFG-violating block");
1611 
1612  BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1613  LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
1614  MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
1615 
1616  // For ehpad, we layout the least probable first as to avoid jumping back
1617  // from least probable landingpads to more probable ones.
1618  //
1619  // FIXME: Using probability is probably (!) not the best way to achieve
1620  // this. We should probably have a more principled approach to layout
1621  // cleanup code.
1622  //
1623  // The goal is to get:
1624  //
1625  // +--------------------------+
1626  // | V
1627  // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1628  //
1629  // Rather than:
1630  //
1631  // +-------------------------------------+
1632  // V |
1633  // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1634  if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1635  continue;
1636 
1637  BestBlock = MBB;
1638  BestFreq = CandidateFreq;
1639  }
1640 
1641  return BestBlock;
1642 }
1643 
1644 /// Retrieve the first unplaced basic block.
1645 ///
1646 /// This routine is called when we are unable to use the CFG to walk through
1647 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1648 /// We walk through the function's blocks in order, starting from the
1649 /// LastUnplacedBlockIt. We update this iterator on each call to avoid
1650 /// re-scanning the entire sequence on repeated calls to this routine.
1651 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1652  const BlockChain &PlacedChain,
1653  MachineFunction::iterator &PrevUnplacedBlockIt,
1654  const BlockFilterSet *BlockFilter) {
1655  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1656  ++I) {
1657  if (BlockFilter && !BlockFilter->count(&*I))
1658  continue;
1659  if (BlockToChain[&*I] != &PlacedChain) {
1660  PrevUnplacedBlockIt = I;
1661  // Now select the head of the chain to which the unplaced block belongs
1662  // as the block to place. This will force the entire chain to be placed,
1663  // and satisfies the requirements of merging chains.
1664  return *BlockToChain[&*I]->begin();
1665  }
1666  }
1667  return nullptr;
1668 }
1669 
1670 void MachineBlockPlacement::fillWorkLists(
1671  const MachineBasicBlock *MBB,
1672  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1673  const BlockFilterSet *BlockFilter = nullptr) {
1674  BlockChain &Chain = *BlockToChain[MBB];
1675  if (!UpdatedPreds.insert(&Chain).second)
1676  return;
1677 
1678  assert(
1679  Chain.UnscheduledPredecessors == 0 &&
1680  "Attempting to place block with unscheduled predecessors in worklist.");
1681  for (MachineBasicBlock *ChainBB : Chain) {
1682  assert(BlockToChain[ChainBB] == &Chain &&
1683  "Block in chain doesn't match BlockToChain map.");
1684  for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1685  if (BlockFilter && !BlockFilter->count(Pred))
1686  continue;
1687  if (BlockToChain[Pred] == &Chain)
1688  continue;
1689  ++Chain.UnscheduledPredecessors;
1690  }
1691  }
1692 
1693  if (Chain.UnscheduledPredecessors != 0)
1694  return;
1695 
1696  MachineBasicBlock *BB = *Chain.begin();
1697  if (BB->isEHPad())
1698  EHPadWorkList.push_back(BB);
1699  else
1700  BlockWorkList.push_back(BB);
1701 }
1702 
1703 void MachineBlockPlacement::buildChain(
1704  const MachineBasicBlock *HeadBB, BlockChain &Chain,
1705  BlockFilterSet *BlockFilter) {
1706  assert(HeadBB && "BB must not be null.\n");
1707  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1708  MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1709 
1710  const MachineBasicBlock *LoopHeaderBB = HeadBB;
1711  markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1712  MachineBasicBlock *BB = *std::prev(Chain.end());
1713  while (true) {
1714  assert(BB && "null block found at end of chain in loop.");
1715  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1716  assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1717 
1718 
1719  // Look for the best viable successor if there is one to place immediately
1720  // after this block.
1721  auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1722  MachineBasicBlock* BestSucc = Result.BB;
1723  bool ShouldTailDup = Result.ShouldTailDup;
1724  if (allowTailDupPlacement())
1725  ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
1726 
1727  // If an immediate successor isn't available, look for the best viable
1728  // block among those we've identified as not violating the loop's CFG at
1729  // this point. This won't be a fallthrough, but it will increase locality.
1730  if (!BestSucc)
1731  BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1732  if (!BestSucc)
1733  BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1734 
1735  if (!BestSucc) {
1736  BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1737  if (!BestSucc)
1738  break;
1739 
1740  LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1741  "layout successor until the CFG reduces\n");
1742  }
1743 
1744  // Placement may have changed tail duplication opportunities.
1745  // Check for that now.
1746  if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1747  // If the chosen successor was duplicated into all its predecessors,
1748  // don't bother laying it out, just go round the loop again with BB as
1749  // the chain end.
1750  if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1751  BlockFilter, PrevUnplacedBlockIt))
1752  continue;
1753  }
1754 
1755  // Place this block, updating the datastructures to reflect its placement.
1756  BlockChain &SuccChain = *BlockToChain[BestSucc];
1757  // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1758  // we selected a successor that didn't fit naturally into the CFG.
1759  SuccChain.UnscheduledPredecessors = 0;
1760  LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1761  << getBlockName(BestSucc) << "\n");
1762  markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1763  Chain.merge(BestSucc, &SuccChain);
1764  BB = *std::prev(Chain.end());
1765  }
1766 
1767  LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1768  << getBlockName(*Chain.begin()) << "\n");
1769 }
1770 
1771 // If bottom of block BB has only one successor OldTop, in most cases it is
1772 // profitable to move it before OldTop, except the following case:
1773 //
1774 // -->OldTop<-
1775 // | . |
1776 // | . |
1777 // | . |
1778 // ---Pred |
1779 // | |
1780 // BB-----
1781 //
1782 // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1783 // layout the other successor below it, so it can't reduce taken branch.
1784 // In this case we keep its original layout.
1785 bool
1786 MachineBlockPlacement::canMoveBottomBlockToTop(
1787  const MachineBasicBlock *BottomBlock,
1788  const MachineBasicBlock *OldTop) {
1789  if (BottomBlock->pred_size() != 1)
1790  return true;
1791  MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1792  if (Pred->succ_size() != 2)
1793  return true;
1794 
1795  MachineBasicBlock *OtherBB = *Pred->succ_begin();
1796  if (OtherBB == BottomBlock)
1797  OtherBB = *Pred->succ_rbegin();
1798  if (OtherBB == OldTop)
1799  return false;
1800 
1801  return true;
1802 }
1803 
1804 // Find out the possible fall through frequence to the top of a loop.
1806 MachineBlockPlacement::TopFallThroughFreq(
1807  const MachineBasicBlock *Top,
1808  const BlockFilterSet &LoopBlockSet) {
1809  BlockFrequency MaxFreq = 0;
1810  for (MachineBasicBlock *Pred : Top->predecessors()) {
1811  BlockChain *PredChain = BlockToChain[Pred];
1812  if (!LoopBlockSet.count(Pred) &&
1813  (!PredChain || Pred == *std::prev(PredChain->end()))) {
1814  // Found a Pred block can be placed before Top.
1815  // Check if Top is the best successor of Pred.
1816  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
1817  bool TopOK = true;
1818  for (MachineBasicBlock *Succ : Pred->successors()) {
1819  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
1820  BlockChain *SuccChain = BlockToChain[Succ];
1821  // Check if Succ can be placed after Pred.
1822  // Succ should not be in any chain, or it is the head of some chain.
1823  if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
1824  (!SuccChain || Succ == *SuccChain->begin())) {
1825  TopOK = false;
1826  break;
1827  }
1828  }
1829  if (TopOK) {
1830  BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1831  MBPI->getEdgeProbability(Pred, Top);
1832  if (EdgeFreq > MaxFreq)
1833  MaxFreq = EdgeFreq;
1834  }
1835  }
1836  }
1837  return MaxFreq;
1838 }
1839 
1840 // Compute the fall through gains when move NewTop before OldTop.
1841 //
1842 // In following diagram, edges marked as "-" are reduced fallthrough, edges
1843 // marked as "+" are increased fallthrough, this function computes
1844 //
1845 // SUM(increased fallthrough) - SUM(decreased fallthrough)
1846 //
1847 // |
1848 // | -
1849 // V
1850 // --->OldTop
1851 // | .
1852 // | .
1853 // +| . +
1854 // | Pred --->
1855 // | |-
1856 // | V
1857 // --- NewTop <---
1858 // |-
1859 // V
1860 //
1862 MachineBlockPlacement::FallThroughGains(
1863  const MachineBasicBlock *NewTop,
1864  const MachineBasicBlock *OldTop,
1865  const MachineBasicBlock *ExitBB,
1866  const BlockFilterSet &LoopBlockSet) {
1867  BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
1868  BlockFrequency FallThrough2Exit = 0;
1869  if (ExitBB)
1870  FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
1871  MBPI->getEdgeProbability(NewTop, ExitBB);
1872  BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
1873  MBPI->getEdgeProbability(NewTop, OldTop);
1874 
1875  // Find the best Pred of NewTop.
1876  MachineBasicBlock *BestPred = nullptr;
1877  BlockFrequency FallThroughFromPred = 0;
1878  for (MachineBasicBlock *Pred : NewTop->predecessors()) {
1879  if (!LoopBlockSet.count(Pred))
1880  continue;
1881  BlockChain *PredChain = BlockToChain[Pred];
1882  if (!PredChain || Pred == *std::prev(PredChain->end())) {
1883  BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1884  MBPI->getEdgeProbability(Pred, NewTop);
1885  if (EdgeFreq > FallThroughFromPred) {
1886  FallThroughFromPred = EdgeFreq;
1887  BestPred = Pred;
1888  }
1889  }
1890  }
1891 
1892  // If NewTop is not placed after Pred, another successor can be placed
1893  // after Pred.
1894  BlockFrequency NewFreq = 0;
1895  if (BestPred) {
1896  for (MachineBasicBlock *Succ : BestPred->successors()) {
1897  if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
1898  continue;
1899  if (ComputedEdges.find(Succ) != ComputedEdges.end())
1900  continue;
1901  BlockChain *SuccChain = BlockToChain[Succ];
1902  if ((SuccChain && (Succ != *SuccChain->begin())) ||
1903  (SuccChain == BlockToChain[BestPred]))
1904  continue;
1905  BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
1906  MBPI->getEdgeProbability(BestPred, Succ);
1907  if (EdgeFreq > NewFreq)
1908  NewFreq = EdgeFreq;
1909  }
1910  BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
1911  MBPI->getEdgeProbability(BestPred, NewTop);
1912  if (NewFreq > OrigEdgeFreq) {
1913  // If NewTop is not the best successor of Pred, then Pred doesn't
1914  // fallthrough to NewTop. So there is no FallThroughFromPred and
1915  // NewFreq.
1916  NewFreq = 0;
1917  FallThroughFromPred = 0;
1918  }
1919  }
1920 
1921  BlockFrequency Result = 0;
1922  BlockFrequency Gains = BackEdgeFreq + NewFreq;
1923  BlockFrequency Lost = FallThrough2Top + FallThrough2Exit +
1924  FallThroughFromPred;
1925  if (Gains > Lost)
1926  Result = Gains - Lost;
1927  return Result;
1928 }
1929 
1930 /// Helper function of findBestLoopTop. Find the best loop top block
1931 /// from predecessors of old top.
1932 ///
1933 /// Look for a block which is strictly better than the old top for laying
1934 /// out before the old top of the loop. This looks for only two patterns:
1935 ///
1936 /// 1. a block has only one successor, the old loop top
1937 ///
1938 /// Because such a block will always result in an unconditional jump,
1939 /// rotating it in front of the old top is always profitable.
1940 ///
1941 /// 2. a block has two successors, one is old top, another is exit
1942 /// and it has more than one predecessors
1943 ///
1944 /// If it is below one of its predecessors P, only P can fall through to
1945 /// it, all other predecessors need a jump to it, and another conditional
1946 /// jump to loop header. If it is moved before loop header, all its
1947 /// predecessors jump to it, then fall through to loop header. So all its
1948 /// predecessors except P can reduce one taken branch.
1949 /// At the same time, move it before old top increases the taken branch
1950 /// to loop exit block, so the reduced taken branch will be compared with
1951 /// the increased taken branch to the loop exit block.
1953 MachineBlockPlacement::findBestLoopTopHelper(
1954  MachineBasicBlock *OldTop,
1955  const MachineLoop &L,
1956  const BlockFilterSet &LoopBlockSet) {
1957  // Check that the header hasn't been fused with a preheader block due to
1958  // crazy branches. If it has, we need to start with the header at the top to
1959  // prevent pulling the preheader into the loop body.
1960  BlockChain &HeaderChain = *BlockToChain[OldTop];
1961  if (!LoopBlockSet.count(*HeaderChain.begin()))
1962  return OldTop;
1963 
1964  LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
1965  << "\n");
1966 
1967  BlockFrequency BestGains = 0;
1968  MachineBasicBlock *BestPred = nullptr;
1969  for (MachineBasicBlock *Pred : OldTop->predecessors()) {
1970  if (!LoopBlockSet.count(Pred))
1971  continue;
1972  if (Pred == L.getHeader())
1973  continue;
1974  LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
1975  << Pred->succ_size() << " successors, ";
1976  MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
1977  if (Pred->succ_size() > 2)
1978  continue;
1979 
1980  MachineBasicBlock *OtherBB = nullptr;
1981  if (Pred->succ_size() == 2) {
1982  OtherBB = *Pred->succ_begin();
1983  if (OtherBB == OldTop)
1984  OtherBB = *Pred->succ_rbegin();
1985  }
1986 
1987  if (!canMoveBottomBlockToTop(Pred, OldTop))
1988  continue;
1989 
1990  BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
1991  LoopBlockSet);
1992  if ((Gains > 0) && (Gains > BestGains ||
1993  ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
1994  BestPred = Pred;
1995  BestGains = Gains;
1996  }
1997  }
1998 
1999  // If no direct predecessor is fine, just use the loop header.
2000  if (!BestPred) {
2001  LLVM_DEBUG(dbgs() << " final top unchanged\n");
2002  return OldTop;
2003  }
2004 
2005  // Walk backwards through any straight line of predecessors.
2006  while (BestPred->pred_size() == 1 &&
2007  (*BestPred->pred_begin())->succ_size() == 1 &&
2008  *BestPred->pred_begin() != L.getHeader())
2009  BestPred = *BestPred->pred_begin();
2010 
2011  LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2012  return BestPred;
2013 }
2014 
2015 /// Find the best loop top block for layout.
2016 ///
2017 /// This function iteratively calls findBestLoopTopHelper, until no new better
2018 /// BB can be found.
2020 MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2021  const BlockFilterSet &LoopBlockSet) {
2022  // Placing the latch block before the header may introduce an extra branch
2023  // that skips this block the first time the loop is executed, which we want
2024  // to avoid when optimising for size.
2025  // FIXME: in theory there is a case that does not introduce a new branch,
2026  // i.e. when the layout predecessor does not fallthrough to the loop header.
2027  // In practice this never happens though: there always seems to be a preheader
2028  // that can fallthrough and that is also placed before the header.
2029  if (F->getFunction().hasOptSize())
2030  return L.getHeader();
2031 
2032  MachineBasicBlock *OldTop = nullptr;
2033  MachineBasicBlock *NewTop = L.getHeader();
2034  while (NewTop != OldTop) {
2035  OldTop = NewTop;
2036  NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2037  if (NewTop != OldTop)
2038  ComputedEdges[NewTop] = { OldTop, false };
2039  }
2040  return NewTop;
2041 }
2042 
2043 /// Find the best loop exiting block for layout.
2044 ///
2045 /// This routine implements the logic to analyze the loop looking for the best
2046 /// block to layout at the top of the loop. Typically this is done to maximize
2047 /// fallthrough opportunities.
2049 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2050  const BlockFilterSet &LoopBlockSet,
2051  BlockFrequency &ExitFreq) {
2052  // We don't want to layout the loop linearly in all cases. If the loop header
2053  // is just a normal basic block in the loop, we want to look for what block
2054  // within the loop is the best one to layout at the top. However, if the loop
2055  // header has be pre-merged into a chain due to predecessors not having
2056  // analyzable branches, *and* the predecessor it is merged with is *not* part
2057  // of the loop, rotating the header into the middle of the loop will create
2058  // a non-contiguous range of blocks which is Very Bad. So start with the
2059  // header and only rotate if safe.
2060  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2061  if (!LoopBlockSet.count(*HeaderChain.begin()))
2062  return nullptr;
2063 
2064  BlockFrequency BestExitEdgeFreq;
2065  unsigned BestExitLoopDepth = 0;
2066  MachineBasicBlock *ExitingBB = nullptr;
2067  // If there are exits to outer loops, loop rotation can severely limit
2068  // fallthrough opportunities unless it selects such an exit. Keep a set of
2069  // blocks where rotating to exit with that block will reach an outer loop.
2070  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2071 
2072  LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2073  << getBlockName(L.getHeader()) << "\n");
2074  for (MachineBasicBlock *MBB : L.getBlocks()) {
2075  BlockChain &Chain = *BlockToChain[MBB];
2076  // Ensure that this block is at the end of a chain; otherwise it could be
2077  // mid-way through an inner loop or a successor of an unanalyzable branch.
2078  if (MBB != *std::prev(Chain.end()))
2079  continue;
2080 
2081  // Now walk the successors. We need to establish whether this has a viable
2082  // exiting successor and whether it has a viable non-exiting successor.
2083  // We store the old exiting state and restore it if a viable looping
2084  // successor isn't found.
2085  MachineBasicBlock *OldExitingBB = ExitingBB;
2086  BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2087  bool HasLoopingSucc = false;
2088  for (MachineBasicBlock *Succ : MBB->successors()) {
2089  if (Succ->isEHPad())
2090  continue;
2091  if (Succ == MBB)
2092  continue;
2093  BlockChain &SuccChain = *BlockToChain[Succ];
2094  // Don't split chains, either this chain or the successor's chain.
2095  if (&Chain == &SuccChain) {
2096  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2097  << getBlockName(Succ) << " (chain conflict)\n");
2098  continue;
2099  }
2100 
2101  auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2102  if (LoopBlockSet.count(Succ)) {
2103  LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2104  << getBlockName(Succ) << " (" << SuccProb << ")\n");
2105  HasLoopingSucc = true;
2106  continue;
2107  }
2108 
2109  unsigned SuccLoopDepth = 0;
2110  if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2111  SuccLoopDepth = ExitLoop->getLoopDepth();
2112  if (ExitLoop->contains(&L))
2113  BlocksExitingToOuterLoop.insert(MBB);
2114  }
2115 
2116  BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2117  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2118  << getBlockName(Succ) << " [L:" << SuccLoopDepth
2119  << "] (";
2120  MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
2121  // Note that we bias this toward an existing layout successor to retain
2122  // incoming order in the absence of better information. The exit must have
2123  // a frequency higher than the current exit before we consider breaking
2124  // the layout.
2125  BranchProbability Bias(100 - ExitBlockBias, 100);
2126  if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2127  ExitEdgeFreq > BestExitEdgeFreq ||
2128  (MBB->isLayoutSuccessor(Succ) &&
2129  !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2130  BestExitEdgeFreq = ExitEdgeFreq;
2131  ExitingBB = MBB;
2132  }
2133  }
2134 
2135  if (!HasLoopingSucc) {
2136  // Restore the old exiting state, no viable looping successor was found.
2137  ExitingBB = OldExitingBB;
2138  BestExitEdgeFreq = OldBestExitEdgeFreq;
2139  }
2140  }
2141  // Without a candidate exiting block or with only a single block in the
2142  // loop, just use the loop header to layout the loop.
2143  if (!ExitingBB) {
2144  LLVM_DEBUG(
2145  dbgs() << " No other candidate exit blocks, using loop header\n");
2146  return nullptr;
2147  }
2148  if (L.getNumBlocks() == 1) {
2149  LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2150  return nullptr;
2151  }
2152 
2153  // Also, if we have exit blocks which lead to outer loops but didn't select
2154  // one of them as the exiting block we are rotating toward, disable loop
2155  // rotation altogether.
2156  if (!BlocksExitingToOuterLoop.empty() &&
2157  !BlocksExitingToOuterLoop.count(ExitingBB))
2158  return nullptr;
2159 
2160  LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2161  << "\n");
2162  ExitFreq = BestExitEdgeFreq;
2163  return ExitingBB;
2164 }
2165 
2166 /// Check if there is a fallthrough to loop header Top.
2167 ///
2168 /// 1. Look for a Pred that can be layout before Top.
2169 /// 2. Check if Top is the most possible successor of Pred.
2170 bool
2171 MachineBlockPlacement::hasViableTopFallthrough(
2172  const MachineBasicBlock *Top,
2173  const BlockFilterSet &LoopBlockSet) {
2174  for (MachineBasicBlock *Pred : Top->predecessors()) {
2175  BlockChain *PredChain = BlockToChain[Pred];
2176  if (!LoopBlockSet.count(Pred) &&
2177  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2178  // Found a Pred block can be placed before Top.
2179  // Check if Top is the best successor of Pred.
2180  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2181  bool TopOK = true;
2182  for (MachineBasicBlock *Succ : Pred->successors()) {
2183  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2184  BlockChain *SuccChain = BlockToChain[Succ];
2185  // Check if Succ can be placed after Pred.
2186  // Succ should not be in any chain, or it is the head of some chain.
2187  if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2188  TopOK = false;
2189  break;
2190  }
2191  }
2192  if (TopOK)
2193  return true;
2194  }
2195  }
2196  return false;
2197 }
2198 
2199 /// Attempt to rotate an exiting block to the bottom of the loop.
2200 ///
2201 /// Once we have built a chain, try to rotate it to line up the hot exit block
2202 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2203 /// branches. For example, if the loop has fallthrough into its header and out
2204 /// of its bottom already, don't rotate it.
2205 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2206  const MachineBasicBlock *ExitingBB,
2207  BlockFrequency ExitFreq,
2208  const BlockFilterSet &LoopBlockSet) {
2209  if (!ExitingBB)
2210  return;
2211 
2212  MachineBasicBlock *Top = *LoopChain.begin();
2213  MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2214 
2215  // If ExitingBB is already the last one in a chain then nothing to do.
2216  if (Bottom == ExitingBB)
2217  return;
2218 
2219  bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2220 
2221  // If the header has viable fallthrough, check whether the current loop
2222  // bottom is a viable exiting block. If so, bail out as rotating will
2223  // introduce an unnecessary branch.
2224  if (ViableTopFallthrough) {
2225  for (MachineBasicBlock *Succ : Bottom->successors()) {
2226  BlockChain *SuccChain = BlockToChain[Succ];
2227  if (!LoopBlockSet.count(Succ) &&
2228  (!SuccChain || Succ == *SuccChain->begin()))
2229  return;
2230  }
2231 
2232  // Rotate will destroy the top fallthrough, we need to ensure the new exit
2233  // frequency is larger than top fallthrough.
2234  BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2235  if (FallThrough2Top >= ExitFreq)
2236  return;
2237  }
2238 
2239  BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2240  if (ExitIt == LoopChain.end())
2241  return;
2242 
2243  // Rotating a loop exit to the bottom when there is a fallthrough to top
2244  // trades the entry fallthrough for an exit fallthrough.
2245  // If there is no bottom->top edge, but the chosen exit block does have
2246  // a fallthrough, we break that fallthrough for nothing in return.
2247 
2248  // Let's consider an example. We have a built chain of basic blocks
2249  // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2250  // By doing a rotation we get
2251  // Bk+1, ..., Bn, B1, ..., Bk
2252  // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2253  // If we had a fallthrough Bk -> Bk+1 it is broken now.
2254  // It might be compensated by fallthrough Bn -> B1.
2255  // So we have a condition to avoid creation of extra branch by loop rotation.
2256  // All below must be true to avoid loop rotation:
2257  // If there is a fallthrough to top (B1)
2258  // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2259  // There is no fallthrough from bottom (Bn) to top (B1).
2260  // Please note that there is no exit fallthrough from Bn because we checked it
2261  // above.
2262  if (ViableTopFallthrough) {
2263  assert(std::next(ExitIt) != LoopChain.end() &&
2264  "Exit should not be last BB");
2265  MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2266  if (ExitingBB->isSuccessor(NextBlockInChain))
2267  if (!Bottom->isSuccessor(Top))
2268  return;
2269  }
2270 
2271  LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2272  << " at bottom\n");
2273  std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2274 }
2275 
2276 /// Attempt to rotate a loop based on profile data to reduce branch cost.
2277 ///
2278 /// With profile data, we can determine the cost in terms of missed fall through
2279 /// opportunities when rotating a loop chain and select the best rotation.
2280 /// Basically, there are three kinds of cost to consider for each rotation:
2281 /// 1. The possibly missed fall through edge (if it exists) from BB out of
2282 /// the loop to the loop header.
2283 /// 2. The possibly missed fall through edges (if they exist) from the loop
2284 /// exits to BB out of the loop.
2285 /// 3. The missed fall through edge (if it exists) from the last BB to the
2286 /// first BB in the loop chain.
2287 /// Therefore, the cost for a given rotation is the sum of costs listed above.
2288 /// We select the best rotation with the smallest cost.
2289 void MachineBlockPlacement::rotateLoopWithProfile(
2290  BlockChain &LoopChain, const MachineLoop &L,
2291  const BlockFilterSet &LoopBlockSet) {
2292  auto RotationPos = LoopChain.end();
2293 
2294  BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
2295 
2296  // A utility lambda that scales up a block frequency by dividing it by a
2297  // branch probability which is the reciprocal of the scale.
2298  auto ScaleBlockFrequency = [](BlockFrequency Freq,
2299  unsigned Scale) -> BlockFrequency {
2300  if (Scale == 0)
2301  return 0;
2302  // Use operator / between BlockFrequency and BranchProbability to implement
2303  // saturating multiplication.
2304  return Freq / BranchProbability(1, Scale);
2305  };
2306 
2307  // Compute the cost of the missed fall-through edge to the loop header if the
2308  // chain head is not the loop header. As we only consider natural loops with
2309  // single header, this computation can be done only once.
2310  BlockFrequency HeaderFallThroughCost(0);
2311  MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2312  for (auto *Pred : ChainHeaderBB->predecessors()) {
2313  BlockChain *PredChain = BlockToChain[Pred];
2314  if (!LoopBlockSet.count(Pred) &&
2315  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2316  auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2317  MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2318  auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2319  // If the predecessor has only an unconditional jump to the header, we
2320  // need to consider the cost of this jump.
2321  if (Pred->succ_size() == 1)
2322  FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2323  HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2324  }
2325  }
2326 
2327  // Here we collect all exit blocks in the loop, and for each exit we find out
2328  // its hottest exit edge. For each loop rotation, we define the loop exit cost
2329  // as the sum of frequencies of exit edges we collect here, excluding the exit
2330  // edge from the tail of the loop chain.
2332  for (auto BB : LoopChain) {
2333  auto LargestExitEdgeProb = BranchProbability::getZero();
2334  for (auto *Succ : BB->successors()) {
2335  BlockChain *SuccChain = BlockToChain[Succ];
2336  if (!LoopBlockSet.count(Succ) &&
2337  (!SuccChain || Succ == *SuccChain->begin())) {
2338  auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2339  LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2340  }
2341  }
2342  if (LargestExitEdgeProb > BranchProbability::getZero()) {
2343  auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2344  ExitsWithFreq.emplace_back(BB, ExitFreq);
2345  }
2346  }
2347 
2348  // In this loop we iterate every block in the loop chain and calculate the
2349  // cost assuming the block is the head of the loop chain. When the loop ends,
2350  // we should have found the best candidate as the loop chain's head.
2351  for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2352  EndIter = LoopChain.end();
2353  Iter != EndIter; Iter++, TailIter++) {
2354  // TailIter is used to track the tail of the loop chain if the block we are
2355  // checking (pointed by Iter) is the head of the chain.
2356  if (TailIter == LoopChain.end())
2357  TailIter = LoopChain.begin();
2358 
2359  auto TailBB = *TailIter;
2360 
2361  // Calculate the cost by putting this BB to the top.
2362  BlockFrequency Cost = 0;
2363 
2364  // If the current BB is the loop header, we need to take into account the
2365  // cost of the missed fall through edge from outside of the loop to the
2366  // header.
2367  if (Iter != LoopChain.begin())
2368  Cost += HeaderFallThroughCost;
2369 
2370  // Collect the loop exit cost by summing up frequencies of all exit edges
2371  // except the one from the chain tail.
2372  for (auto &ExitWithFreq : ExitsWithFreq)
2373  if (TailBB != ExitWithFreq.first)
2374  Cost += ExitWithFreq.second;
2375 
2376  // The cost of breaking the once fall-through edge from the tail to the top
2377  // of the loop chain. Here we need to consider three cases:
2378  // 1. If the tail node has only one successor, then we will get an
2379  // additional jmp instruction. So the cost here is (MisfetchCost +
2380  // JumpInstCost) * tail node frequency.
2381  // 2. If the tail node has two successors, then we may still get an
2382  // additional jmp instruction if the layout successor after the loop
2383  // chain is not its CFG successor. Note that the more frequently executed
2384  // jmp instruction will be put ahead of the other one. Assume the
2385  // frequency of those two branches are x and y, where x is the frequency
2386  // of the edge to the chain head, then the cost will be
2387  // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2388  // 3. If the tail node has more than two successors (this rarely happens),
2389  // we won't consider any additional cost.
2390  if (TailBB->isSuccessor(*Iter)) {
2391  auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2392  if (TailBB->succ_size() == 1)
2393  Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
2395  else if (TailBB->succ_size() == 2) {
2396  auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2397  auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2398  auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2399  ? TailBBFreq * TailToHeadProb.getCompl()
2400  : TailToHeadFreq;
2401  Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2402  ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2403  }
2404  }
2405 
2406  LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2407  << getBlockName(*Iter)
2408  << " to the top: " << Cost.getFrequency() << "\n");
2409 
2410  if (Cost < SmallestRotationCost) {
2411  SmallestRotationCost = Cost;
2412  RotationPos = Iter;
2413  }
2414  }
2415 
2416  if (RotationPos != LoopChain.end()) {
2417  LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2418  << " to the top\n");
2419  std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2420  }
2421 }
2422 
2423 /// Collect blocks in the given loop that are to be placed.
2424 ///
2425 /// When profile data is available, exclude cold blocks from the returned set;
2426 /// otherwise, collect all blocks in the loop.
2428 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2429  BlockFilterSet LoopBlockSet;
2430 
2431  // Filter cold blocks off from LoopBlockSet when profile data is available.
2432  // Collect the sum of frequencies of incoming edges to the loop header from
2433  // outside. If we treat the loop as a super block, this is the frequency of
2434  // the loop. Then for each block in the loop, we calculate the ratio between
2435  // its frequency and the frequency of the loop block. When it is too small,
2436  // don't add it to the loop chain. If there are outer loops, then this block
2437  // will be merged into the first outer loop chain for which this block is not
2438  // cold anymore. This needs precise profile data and we only do this when
2439  // profile data is available.
2441  BlockFrequency LoopFreq(0);
2442  for (auto LoopPred : L.getHeader()->predecessors())
2443  if (!L.contains(LoopPred))
2444  LoopFreq += MBFI->getBlockFreq(LoopPred) *
2445  MBPI->getEdgeProbability(LoopPred, L.getHeader());
2446 
2447  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2448  auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2449  if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2450  continue;
2451  LoopBlockSet.insert(LoopBB);
2452  }
2453  } else
2454  LoopBlockSet.insert(L.block_begin(), L.block_end());
2455 
2456  return LoopBlockSet;
2457 }
2458 
2459 /// Forms basic block chains from the natural loop structures.
2460 ///
2461 /// These chains are designed to preserve the existing *structure* of the code
2462 /// as much as possible. We can then stitch the chains together in a way which
2463 /// both preserves the topological structure and minimizes taken conditional
2464 /// branches.
2465 void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2466  // First recurse through any nested loops, building chains for those inner
2467  // loops.
2468  for (const MachineLoop *InnerLoop : L)
2469  buildLoopChains(*InnerLoop);
2470 
2471  assert(BlockWorkList.empty() &&
2472  "BlockWorkList not empty when starting to build loop chains.");
2473  assert(EHPadWorkList.empty() &&
2474  "EHPadWorkList not empty when starting to build loop chains.");
2475  BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2476 
2477  // Check if we have profile data for this function. If yes, we will rotate
2478  // this loop by modeling costs more precisely which requires the profile data
2479  // for better layout.
2480  bool RotateLoopWithProfile =
2483 
2484  // First check to see if there is an obviously preferable top block for the
2485  // loop. This will default to the header, but may end up as one of the
2486  // predecessors to the header if there is one which will result in strictly
2487  // fewer branches in the loop body.
2488  MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2489 
2490  // If we selected just the header for the loop top, look for a potentially
2491  // profitable exit block in the event that rotating the loop can eliminate
2492  // branches by placing an exit edge at the bottom.
2493  //
2494  // Loops are processed innermost to uttermost, make sure we clear
2495  // PreferredLoopExit before processing a new loop.
2496  PreferredLoopExit = nullptr;
2497  BlockFrequency ExitFreq;
2498  if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2499  PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2500 
2501  BlockChain &LoopChain = *BlockToChain[LoopTop];
2502 
2503  // FIXME: This is a really lame way of walking the chains in the loop: we
2504  // walk the blocks, and use a set to prevent visiting a particular chain
2505  // twice.
2506  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2507  assert(LoopChain.UnscheduledPredecessors == 0 &&
2508  "LoopChain should not have unscheduled predecessors.");
2509  UpdatedPreds.insert(&LoopChain);
2510 
2511  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2512  fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2513 
2514  buildChain(LoopTop, LoopChain, &LoopBlockSet);
2515 
2516  if (RotateLoopWithProfile)
2517  rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2518  else
2519  rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2520 
2521  LLVM_DEBUG({
2522  // Crash at the end so we get all of the debugging output first.
2523  bool BadLoop = false;
2524  if (LoopChain.UnscheduledPredecessors) {
2525  BadLoop = true;
2526  dbgs() << "Loop chain contains a block without its preds placed!\n"
2527  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2528  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2529  }
2530  for (MachineBasicBlock *ChainBB : LoopChain) {
2531  dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2532  if (!LoopBlockSet.remove(ChainBB)) {
2533  // We don't mark the loop as bad here because there are real situations
2534  // where this can occur. For example, with an unanalyzable fallthrough
2535  // from a loop block to a non-loop block or vice versa.
2536  dbgs() << "Loop chain contains a block not contained by the loop!\n"
2537  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2538  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2539  << " Bad block: " << getBlockName(ChainBB) << "\n";
2540  }
2541  }
2542 
2543  if (!LoopBlockSet.empty()) {
2544  BadLoop = true;
2545  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2546  dbgs() << "Loop contains blocks never placed into a chain!\n"
2547  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2548  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2549  << " Bad block: " << getBlockName(LoopBB) << "\n";
2550  }
2551  assert(!BadLoop && "Detected problems with the placement of this loop.");
2552  });
2553 
2554  BlockWorkList.clear();
2555  EHPadWorkList.clear();
2556 }
2557 
2558 void MachineBlockPlacement::buildCFGChains() {
2559  // Ensure that every BB in the function has an associated chain to simplify
2560  // the assumptions of the remaining algorithm.
2561  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2562  for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2563  ++FI) {
2564  MachineBasicBlock *BB = &*FI;
2565  BlockChain *Chain =
2566  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2567  // Also, merge any blocks which we cannot reason about and must preserve
2568  // the exact fallthrough behavior for.
2569  while (true) {
2570  Cond.clear();
2571  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2572  if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2573  break;
2574 
2575  MachineFunction::iterator NextFI = std::next(FI);
2576  MachineBasicBlock *NextBB = &*NextFI;
2577  // Ensure that the layout successor is a viable block, as we know that
2578  // fallthrough is a possibility.
2579  assert(NextFI != FE && "Can't fallthrough past the last block.");
2580  LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2581  << getBlockName(BB) << " -> " << getBlockName(NextBB)
2582  << "\n");
2583  Chain->merge(NextBB, nullptr);
2584 #ifndef NDEBUG
2585  BlocksWithUnanalyzableExits.insert(&*BB);
2586 #endif
2587  FI = NextFI;
2588  BB = NextBB;
2589  }
2590  }
2591 
2592  // Build any loop-based chains.
2593  PreferredLoopExit = nullptr;
2594  for (MachineLoop *L : *MLI)
2595  buildLoopChains(*L);
2596 
2597  assert(BlockWorkList.empty() &&
2598  "BlockWorkList should be empty before building final chain.");
2599  assert(EHPadWorkList.empty() &&
2600  "EHPadWorkList should be empty before building final chain.");
2601 
2602  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2603  for (MachineBasicBlock &MBB : *F)
2604  fillWorkLists(&MBB, UpdatedPreds);
2605 
2606  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2607  buildChain(&F->front(), FunctionChain);
2608 
2609 #ifndef NDEBUG
2610  using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2611 #endif
2612  LLVM_DEBUG({
2613  // Crash at the end so we get all of the debugging output first.
2614  bool BadFunc = false;
2615  FunctionBlockSetType FunctionBlockSet;
2616  for (MachineBasicBlock &MBB : *F)
2617  FunctionBlockSet.insert(&MBB);
2618 
2619  for (MachineBasicBlock *ChainBB : FunctionChain)
2620  if (!FunctionBlockSet.erase(ChainBB)) {
2621  BadFunc = true;
2622  dbgs() << "Function chain contains a block not in the function!\n"
2623  << " Bad block: " << getBlockName(ChainBB) << "\n";
2624  }
2625 
2626  if (!FunctionBlockSet.empty()) {
2627  BadFunc = true;
2628  for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2629  dbgs() << "Function contains blocks never placed into a chain!\n"
2630  << " Bad block: " << getBlockName(RemainingBB) << "\n";
2631  }
2632  assert(!BadFunc && "Detected problems with the block placement.");
2633  });
2634 
2635  // Splice the blocks into place.
2636  MachineFunction::iterator InsertPos = F->begin();
2637  LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2638  for (MachineBasicBlock *ChainBB : FunctionChain) {
2639  LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2640  : " ... ")
2641  << getBlockName(ChainBB) << "\n");
2642  if (InsertPos != MachineFunction::iterator(ChainBB))
2643  F->splice(InsertPos, ChainBB);
2644  else
2645  ++InsertPos;
2646 
2647  // Update the terminator of the previous block.
2648  if (ChainBB == *FunctionChain.begin())
2649  continue;
2650  MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2651 
2652  // FIXME: It would be awesome of updateTerminator would just return rather
2653  // than assert when the branch cannot be analyzed in order to remove this
2654  // boiler plate.
2655  Cond.clear();
2656  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2657 
2658 #ifndef NDEBUG
2659  if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2660  // Given the exact block placement we chose, we may actually not _need_ to
2661  // be able to edit PrevBB's terminator sequence, but not being _able_ to
2662  // do that at this point is a bug.
2663  assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2664  !PrevBB->canFallThrough()) &&
2665  "Unexpected block with un-analyzable fallthrough!");
2666  Cond.clear();
2667  TBB = FBB = nullptr;
2668  }
2669 #endif
2670 
2671  // The "PrevBB" is not yet updated to reflect current code layout, so,
2672  // o. it may fall-through to a block without explicit "goto" instruction
2673  // before layout, and no longer fall-through it after layout; or
2674  // o. just opposite.
2675  //
2676  // analyzeBranch() may return erroneous value for FBB when these two
2677  // situations take place. For the first scenario FBB is mistakenly set NULL;
2678  // for the 2nd scenario, the FBB, which is expected to be NULL, is
2679  // mistakenly pointing to "*BI".
2680  // Thus, if the future change needs to use FBB before the layout is set, it
2681  // has to correct FBB first by using the code similar to the following:
2682  //
2683  // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2684  // PrevBB->updateTerminator();
2685  // Cond.clear();
2686  // TBB = FBB = nullptr;
2687  // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2688  // // FIXME: This should never take place.
2689  // TBB = FBB = nullptr;
2690  // }
2691  // }
2692  if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
2693  PrevBB->updateTerminator();
2694  }
2695 
2696  // Fixup the last block.
2697  Cond.clear();
2698  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2699  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond))
2700  F->back().updateTerminator();
2701 
2702  BlockWorkList.clear();
2703  EHPadWorkList.clear();
2704 }
2705 
2706 void MachineBlockPlacement::optimizeBranches() {
2707  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2708  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2709 
2710  // Now that all the basic blocks in the chain have the proper layout,
2711  // make a final call to AnalyzeBranch with AllowModify set.
2712  // Indeed, the target may be able to optimize the branches in a way we
2713  // cannot because all branches may not be analyzable.
2714  // E.g., the target may be able to remove an unconditional branch to
2715  // a fallthrough when it occurs after predicated terminators.
2717  for (MachineBasicBlock *ChainBB : FunctionChain) {
2718  Cond.clear();
2719  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2720  if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2721  // If PrevBB has a two-way branch, try to re-order the branches
2722  // such that we branch to the successor with higher probability first.
2723  if (TBB && !Cond.empty() && FBB &&
2724  MBPI->getEdgeProbability(ChainBB, FBB) >
2725  MBPI->getEdgeProbability(ChainBB, TBB) &&
2726  !TII->reverseBranchCondition(Cond)) {
2727  LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2728  << getBlockName(ChainBB) << "\n");
2729  LLVM_DEBUG(dbgs() << " Edge probability: "
2730  << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2731  << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2732  DebugLoc dl; // FIXME: this is nowhere
2733  TII->removeBranch(*ChainBB);
2734  TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2735  ChainBB->updateTerminator();
2736  } else if (Cond.empty() && TBB && ChainBB != TBB && !TBB->empty() &&
2737  !TBB->canFallThrough()) {
2738  // When ChainBB is unconditional branch to the TBB, and TBB has no
2739  // fallthrough predecessor and fallthrough successor, try to merge
2740  // ChainBB and TBB. This is legal under the one of following conditions:
2741  // 1. ChainBB is empty except for an unconditional branch.
2742  // 2. TBB has only one predecessor.
2744  if (((TBB == &*F->begin()) || !std::prev(I)->canFallThrough()) &&
2745  (TailDup.isSimpleBB(ChainBB) || (TBB->pred_size() == 1))) {
2746  TII->removeBranch(*ChainBB);
2747  ChainBB->removeSuccessor(TBB);
2748 
2749  // Update the CFG.
2750  while (!TBB->pred_empty()) {
2751  MachineBasicBlock *Pred = *(TBB->pred_end() - 1);
2752  Pred->ReplaceUsesOfBlockWith(TBB, ChainBB);
2753  }
2754 
2755  while (!TBB->succ_empty()) {
2756  MachineBasicBlock *Succ = *(TBB->succ_end() - 1);
2757  ChainBB->addSuccessor(Succ, MBPI->getEdgeProbability(TBB, Succ));
2758  TBB->removeSuccessor(Succ);
2759  }
2760 
2761  // Move all the instructions of TBB to ChainBB.
2762  ChainBB->splice(ChainBB->end(), TBB, TBB->begin(), TBB->end());
2763  EmptyBB.push_back(TBB);
2764 
2765  // If TBB was the target of a jump table, update jump tables to go to
2766  // the ChainBB instead.
2767  if (MachineJumpTableInfo *MJTI = F->getJumpTableInfo())
2768  MJTI->ReplaceMBBInJumpTables(TBB, ChainBB);
2769  }
2770  }
2771  }
2772  }
2773 
2774  for (auto BB: EmptyBB) {
2775  MLI->removeBlock(BB);
2776  FunctionChain.remove(BB);
2777  BlockToChain.erase(BB);
2778  F->erase(BB);
2779  }
2780 }
2781 
2782 void MachineBlockPlacement::alignBlocks() {
2783  // Walk through the backedges of the function now that we have fully laid out
2784  // the basic blocks and align the destination of each backedge. We don't rely
2785  // exclusively on the loop info here so that we can align backedges in
2786  // unnatural CFGs and backedges that were introduced purely because of the
2787  // loop rotations done during this layout pass.
2788  if (F->getFunction().hasMinSize() ||
2789  (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
2790  return;
2791  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2792  if (FunctionChain.begin() == FunctionChain.end())
2793  return; // Empty chain.
2794 
2795  const BranchProbability ColdProb(1, 5); // 20%
2796  BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2797  BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2798  for (MachineBasicBlock *ChainBB : FunctionChain) {
2799  if (ChainBB == *FunctionChain.begin())
2800  continue;
2801 
2802  // Don't align non-looping basic blocks. These are unlikely to execute
2803  // enough times to matter in practice. Note that we'll still handle
2804  // unnatural CFGs inside of a natural outer loop (the common case) and
2805  // rotated loops.
2806  MachineLoop *L = MLI->getLoopFor(ChainBB);
2807  if (!L)
2808  continue;
2809 
2810  const llvm::Align Align = TLI->getPrefLoopAlignment(L);
2811  if (Align == 1)
2812  continue; // Don't care about loop alignment.
2813 
2814  // If the block is cold relative to the function entry don't waste space
2815  // aligning it.
2816  BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2817  if (Freq < WeightedEntryFreq)
2818  continue;
2819 
2820  // If the block is cold relative to its loop header, don't align it
2821  // regardless of what edges into the block exist.
2822  MachineBasicBlock *LoopHeader = L->getHeader();
2823  BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2824  if (Freq < (LoopHeaderFreq * ColdProb))
2825  continue;
2826 
2827  // Check for the existence of a non-layout predecessor which would benefit
2828  // from aligning this block.
2829  MachineBasicBlock *LayoutPred =
2830  &*std::prev(MachineFunction::iterator(ChainBB));
2831 
2832  // Force alignment if all the predecessors are jumps. We already checked
2833  // that the block isn't cold above.
2834  if (!LayoutPred->isSuccessor(ChainBB)) {
2835  ChainBB->setLogAlignment(Log2(Align));
2836  continue;
2837  }
2838 
2839  // Align this block if the layout predecessor's edge into this block is
2840  // cold relative to the block. When this is true, other predecessors make up
2841  // all of the hot entries into the block and thus alignment is likely to be
2842  // important.
2843  BranchProbability LayoutProb =
2844  MBPI->getEdgeProbability(LayoutPred, ChainBB);
2845  BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2846  if (LayoutEdgeFreq <= (Freq * ColdProb))
2847  ChainBB->setLogAlignment(Log2(Align));
2848  }
2849 }
2850 
2851 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
2852 /// it was duplicated into its chain predecessor and removed.
2853 /// \p BB - Basic block that may be duplicated.
2854 ///
2855 /// \p LPred - Chosen layout predecessor of \p BB.
2856 /// Updated to be the chain end if LPred is removed.
2857 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2858 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2859 /// Used to identify which blocks to update predecessor
2860 /// counts.
2861 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2862 /// chosen in the given order due to unnatural CFG
2863 /// only needed if \p BB is removed and
2864 /// \p PrevUnplacedBlockIt pointed to \p BB.
2865 /// @return true if \p BB was removed.
2866 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
2867  MachineBasicBlock *BB, MachineBasicBlock *&LPred,
2868  const MachineBasicBlock *LoopHeaderBB,
2869  BlockChain &Chain, BlockFilterSet *BlockFilter,
2870  MachineFunction::iterator &PrevUnplacedBlockIt) {
2871  bool Removed, DuplicatedToLPred;
2872  bool DuplicatedToOriginalLPred;
2873  Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
2874  PrevUnplacedBlockIt,
2875  DuplicatedToLPred);
2876  if (!Removed)
2877  return false;
2878  DuplicatedToOriginalLPred = DuplicatedToLPred;
2879  // Iteratively try to duplicate again. It can happen that a block that is
2880  // duplicated into is still small enough to be duplicated again.
2881  // No need to call markBlockSuccessors in this case, as the blocks being
2882  // duplicated from here on are already scheduled.
2883  // Note that DuplicatedToLPred always implies Removed.
2884  while (DuplicatedToLPred) {
2885  assert(Removed && "Block must have been removed to be duplicated into its "
2886  "layout predecessor.");
2887  MachineBasicBlock *DupBB, *DupPred;
2888  // The removal callback causes Chain.end() to be updated when a block is
2889  // removed. On the first pass through the loop, the chain end should be the
2890  // same as it was on function entry. On subsequent passes, because we are
2891  // duplicating the block at the end of the chain, if it is removed the
2892  // chain will have shrunk by one block.
2893  BlockChain::iterator ChainEnd = Chain.end();
2894  DupBB = *(--ChainEnd);
2895  // Now try to duplicate again.
2896  if (ChainEnd == Chain.begin())
2897  break;
2898  DupPred = *std::prev(ChainEnd);
2899  Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
2900  PrevUnplacedBlockIt,
2901  DuplicatedToLPred);
2902  }
2903  // If BB was duplicated into LPred, it is now scheduled. But because it was
2904  // removed, markChainSuccessors won't be called for its chain. Instead we
2905  // call markBlockSuccessors for LPred to achieve the same effect. This must go
2906  // at the end because repeating the tail duplication can increase the number
2907  // of unscheduled predecessors.
2908  LPred = *std::prev(Chain.end());
2909  if (DuplicatedToOriginalLPred)
2910  markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
2911  return true;
2912 }
2913 
2914 /// Tail duplicate \p BB into (some) predecessors if profitable.
2915 /// \p BB - Basic block that may be duplicated
2916 /// \p LPred - Chosen layout predecessor of \p BB
2917 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2918 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2919 /// Used to identify which blocks to update predecessor
2920 /// counts.
2921 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2922 /// chosen in the given order due to unnatural CFG
2923 /// only needed if \p BB is removed and
2924 /// \p PrevUnplacedBlockIt pointed to \p BB.
2925 /// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will
2926 /// only be true if the block was removed.
2927 /// \return - True if the block was duplicated into all preds and removed.
2928 bool MachineBlockPlacement::maybeTailDuplicateBlock(
2930  BlockChain &Chain, BlockFilterSet *BlockFilter,
2931  MachineFunction::iterator &PrevUnplacedBlockIt,
2932  bool &DuplicatedToLPred) {
2933  DuplicatedToLPred = false;
2934  if (!shouldTailDuplicate(BB))
2935  return false;
2936 
2937  LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
2938  << "\n");
2939 
2940  // This has to be a callback because none of it can be done after
2941  // BB is deleted.
2942  bool Removed = false;
2943  auto RemovalCallback =
2944  [&](MachineBasicBlock *RemBB) {
2945  // Signal to outer function
2946  Removed = true;
2947 
2948  // Conservative default.
2949  bool InWorkList = true;
2950  // Remove from the Chain and Chain Map
2951  if (BlockToChain.count(RemBB)) {
2952  BlockChain *Chain = BlockToChain[RemBB];
2953  InWorkList = Chain->UnscheduledPredecessors == 0;
2954  Chain->remove(RemBB);
2955  BlockToChain.erase(RemBB);
2956  }
2957 
2958  // Handle the unplaced block iterator
2959  if (&(*PrevUnplacedBlockIt) == RemBB) {
2960  PrevUnplacedBlockIt++;
2961  }
2962 
2963  // Handle the Work Lists
2964  if (InWorkList) {
2965  SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
2966  if (RemBB->isEHPad())
2967  RemoveList = EHPadWorkList;
2968  RemoveList.erase(
2969  llvm::remove_if(RemoveList,
2970  [RemBB](MachineBasicBlock *BB) {
2971  return BB == RemBB;
2972  }),
2973  RemoveList.end());
2974  }
2975 
2976  // Handle the filter set
2977  if (BlockFilter) {
2978  BlockFilter->remove(RemBB);
2979  }
2980 
2981  // Remove the block from loop info.
2982  MLI->removeBlock(RemBB);
2983  if (RemBB == PreferredLoopExit)
2984  PreferredLoopExit = nullptr;
2985 
2986  LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
2987  << getBlockName(RemBB) << "\n");
2988  };
2989  auto RemovalCallbackRef =
2990  function_ref<void(MachineBasicBlock*)>(RemovalCallback);
2991 
2992  SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
2993  bool IsSimple = TailDup.isSimpleBB(BB);
2994  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
2995  &DuplicatedPreds, &RemovalCallbackRef);
2996 
2997  // Update UnscheduledPredecessors to reflect tail-duplication.
2998  DuplicatedToLPred = false;
2999  for (MachineBasicBlock *Pred : DuplicatedPreds) {
3000  // We're only looking for unscheduled predecessors that match the filter.
3001  BlockChain* PredChain = BlockToChain[Pred];
3002  if (Pred == LPred)
3003  DuplicatedToLPred = true;
3004  if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
3005  || PredChain == &Chain)
3006  continue;
3007  for (MachineBasicBlock *NewSucc : Pred->successors()) {
3008  if (BlockFilter && !BlockFilter->count(NewSucc))
3009  continue;
3010  BlockChain *NewChain = BlockToChain[NewSucc];
3011  if (NewChain != &Chain && NewChain != PredChain)
3012  NewChain->UnscheduledPredecessors++;
3013  }
3014  }
3015  return Removed;
3016 }
3017 
3018 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3019  if (skipFunction(MF.getFunction()))
3020  return false;
3021 
3022  // Check for single-block functions and skip them.
3023  if (std::next(MF.begin()) == MF.end())
3024  return false;
3025 
3026  F = &MF;
3027  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3028  MBFI = std::make_unique<BranchFolder::MBFIWrapper>(
3029  getAnalysis<MachineBlockFrequencyInfo>());
3030  MLI = &getAnalysis<MachineLoopInfo>();
3031  TII = MF.getSubtarget().getInstrInfo();
3032  TLI = MF.getSubtarget().getTargetLowering();
3033  MPDT = nullptr;
3034 
3035  // Initialize PreferredLoopExit to nullptr here since it may never be set if
3036  // there are no MachineLoops.
3037  PreferredLoopExit = nullptr;
3038 
3039  assert(BlockToChain.empty() &&
3040  "BlockToChain map should be empty before starting placement.");
3041  assert(ComputedEdges.empty() &&
3042  "Computed Edge map should be empty before starting placement.");
3043 
3044  unsigned TailDupSize = TailDupPlacementThreshold;
3045  // If only the aggressive threshold is explicitly set, use it.
3046  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3047  TailDupPlacementThreshold.getNumOccurrences() == 0)
3049 
3050  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
3051  // For aggressive optimization, we can adjust some thresholds to be less
3052  // conservative.
3053  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
3054  // At O3 we should be more willing to copy blocks for tail duplication. This
3055  // increases size pressure, so we only do it at O3
3056  // Do this unless only the regular threshold is explicitly set.
3057  if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3058  TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3060  }
3061 
3062  if (allowTailDupPlacement()) {
3063  MPDT = &getAnalysis<MachinePostDominatorTree>();
3064  if (MF.getFunction().hasOptSize())
3065  TailDupSize = 1;
3066  bool PreRegAlloc = false;
3067  TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize);
3068  precomputeTriangleChains();
3069  }
3070 
3071  buildCFGChains();
3072 
3073  // Changing the layout can create new tail merging opportunities.
3074  // TailMerge can create jump into if branches that make CFG irreducible for
3075  // HW that requires structured CFG.
3076  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3077  PassConfig->getEnableTailMerge() &&
3079  // No tail merging opportunities if the block number is less than four.
3080  if (MF.size() > 3 && EnableTailMerge) {
3081  unsigned TailMergeSize = TailDupSize + 1;
3082  BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
3083  *MBPI, TailMergeSize);
3084 
3085  if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
3086  getAnalysisIfAvailable<MachineModuleInfo>(), MLI,
3087  /*AfterPlacement=*/true)) {
3088  // Redo the layout if tail merging creates/removes/moves blocks.
3089  BlockToChain.clear();
3090  ComputedEdges.clear();
3091  // Must redo the post-dominator tree if blocks were changed.
3092  if (MPDT)
3093  MPDT->runOnMachineFunction(MF);
3094  ChainAllocator.DestroyAll();
3095  buildCFGChains();
3096  }
3097  }
3098 
3099  // optimizeBranches() may change the blocks, but we haven't updated the
3100  // post-dominator tree. Because the post-dominator tree won't be used after
3101  // this function and this pass don't preserve the post-dominator tree.
3102  optimizeBranches();
3103  alignBlocks();
3104 
3105  BlockToChain.clear();
3106  ComputedEdges.clear();
3107  ChainAllocator.DestroyAll();
3108 
3109  if (AlignAllBlock)
3110  // Align all of the blocks in the function to a specific alignment.
3111  for (MachineBasicBlock &MBB : MF)
3113  else if (AlignAllNonFallThruBlocks) {
3114  // Align all of the blocks that have no fall-through predecessors to a
3115  // specific alignment.
3116  for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
3117  auto LayoutPred = std::prev(MBI);
3118  if (!LayoutPred->isSuccessor(&*MBI))
3119  MBI->setLogAlignment(AlignAllNonFallThruBlocks);
3120  }
3121  }
3122  if (ViewBlockLayoutWithBFI != GVDT_None &&
3123  (ViewBlockFreqFuncName.empty() ||
3124  F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
3125  MBFI->view("MBP." + MF.getName(), false);
3126  }
3127 
3128 
3129  // We always return true as we have no way to track whether the final order
3130  // differs from the original order.
3131  return true;
3132 }
3133 
3134 namespace {
3135 
3136 /// A pass to compute block placement statistics.
3137 ///
3138 /// A separate pass to compute interesting statistics for evaluating block
3139 /// placement. This is separate from the actual placement pass so that they can
3140 /// be computed in the absence of any placement transformations or when using
3141 /// alternative placement strategies.
3142 class MachineBlockPlacementStats : public MachineFunctionPass {
3143  /// A handle to the branch probability pass.
3144  const MachineBranchProbabilityInfo *MBPI;
3145 
3146  /// A handle to the function-wide block frequency pass.
3147  const MachineBlockFrequencyInfo *MBFI;
3148 
3149 public:
3150  static char ID; // Pass identification, replacement for typeid
3151 
3152  MachineBlockPlacementStats() : MachineFunctionPass(ID) {
3154  }
3155 
3156  bool runOnMachineFunction(MachineFunction &F) override;
3157 
3158  void getAnalysisUsage(AnalysisUsage &AU) const override {
3161  AU.setPreservesAll();
3163  }
3164 };
3165 
3166 } // end anonymous namespace
3167 
3169 
3171 
3172 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
3173  "Basic Block Placement Stats", false, false)
3176 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
3177  "Basic Block Placement Stats", false, false)
3178 
3179 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
3180  // Check for single-block functions and skip them.
3181  if (std::next(F.begin()) == F.end())
3182  return false;
3183 
3184  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3185  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
3186 
3187  for (MachineBasicBlock &MBB : F) {
3188  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3189  Statistic &NumBranches =
3190  (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3191  Statistic &BranchTakenFreq =
3192  (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3193  for (MachineBasicBlock *Succ : MBB.successors()) {
3194  // Skip if this successor is a fallthrough.
3195  if (MBB.isLayoutSuccessor(Succ))
3196  continue;
3197 
3198  BlockFrequency EdgeFreq =
3199  BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3200  ++NumBranches;
3201  BranchTakenFreq += EdgeFreq.getFrequency();
3202  }
3203  }
3204 
3205  return false;
3206 }
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:143
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:641
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
BranchProbability getCompl() const
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:320
This class represents lattice values for constants.
Definition: AllocatorList.h:23
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:476
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:622
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
unsigned size() const
virtual llvm::Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
bool requiresStructuredCFG() const
virtual const TargetLowering * getTargetLowering() const
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
virtual unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const
Insert branch code into the end of the specified MachineBasicBlock.
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLExtras.h:104
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:33
F(f)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static BranchProbability getOne()
virtual unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const
Remove the branching code at the end of the specific MBB.
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
CodeGenOpt::Level getOptLevel() const
This file defines the MallocAllocator and BumpPtrAllocator interfaces.
iterator_range< succ_iterator > successors()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:343
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:195
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
void ReplaceUsesOfBlockWith(MachineBasicBlock *Old, MachineBasicBlock *New)
Given a machine basic block that branched to &#39;Old&#39;, change the code and CFG so that it branches to &#39;N...
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:455
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, MachineModuleInfo *mmi, MachineLoopInfo *mli=nullptr, bool AfterPlacement=false)
Perhaps branch folding, tail merging and other CFG optimizations on the given function.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:41
void removeBlock(MachineBasicBlock *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
block placement Basic Block Placement Stats
Target-Independent Code Generator Pass Configuration Options.
BlockT * getHeader() const
Definition: LoopInfo.h:105
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:261
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:462
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they&#39;re not in a MachineFuncti...
virtual const TargetInstrInfo * getInstrInfo() const
cl::opt< std::string > ViewBlockFreqFuncName
auto count(R &&Range, const E &Element) -> typename std::iterator_traits< decltype(adl_begin(Range))>::difference_type
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1231
TargetInstrInfo - Interface to description of machine instruction set.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
cl::opt< unsigned > ProfileLikelyProb
bool getEnableTailMerge() const
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
#define P(N)
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock *> *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
cl::opt< GVDAGType > ViewBlockLayoutWithBFI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:290
void setLogAlignment(unsigned A)
Set alignment of the basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
succ_reverse_iterator succ_rbegin()
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
Represent the analysis usage information of a pass.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:381
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, uint64_t EntryFreq)
Compare 2 BlockFrequency&#39;s with a small penalty for A.
iterator_range< pred_iterator > predecessors()
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:308
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:490
auto remove_if(R &&Range, UnaryPredicate P) -> decltype(adl_begin(Range))
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1205
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e...
iterator erase(const_iterator CI)
Definition: SmallVector.h:434
const MachineBasicBlock & front() const
size_t size() const
Definition: SmallVector.h:52
void initializeMachineBlockPlacementPass(PassRegistry &)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1186
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:40
Branch Probability Basic Block Placement
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
size_type size() const
Definition: SmallPtrSet.h:92
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:115
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:297
Iterator for intrusive lists based on ilist_node.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
loop rotate
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
void updateTerminator()
Update the terminator instructions in block to account for changes to the layout. ...
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_END(MachineBlockPlacement
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:441
unsigned pred_size() const
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:940
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
void setPreservesAll()
Set by analyses that do not transform their input at all.
typename SuperClass::iterator iterator
Definition: SmallVector.h:319
unsigned succ_size() const
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
cl::opt< unsigned > StaticLikelyProb
LLVM_NODISCARD bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:174
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:168
unsigned succ_size(const Instruction *I)
Definition: CFG.h:256
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
static uint64_t getMaxFrequency()
Returns the maximum possible frequency, the saturation value.
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:154
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock *> &Successors)
Check if BB has exactly the successors in Successors.
#define I(x, y, z)
Definition: MD5.cpp:58
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:619
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
block_iterator block_end() const
Definition: LoopInfo.h:160
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:145
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:185
virtual bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const
Reverses the branch condition of the specified condition list, returning false on success and true if...
void erase(iterator MBBI)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void stable_sort(R &&Range)
Definition: STLExtras.h:1289
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:503
#define DEBUG_TYPE
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
static BranchProbability getZero()
Utility class to perform tail duplication.
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
#define LLVM_DEBUG(X)
Definition: Debug.h:122
for(unsigned i=Desc.getNumOperands(), e=OldMI.getNumOperands();i !=e;++i)
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
block_iterator block_begin() const
Definition: LoopInfo.h:159
uint32_t getNumerator() const
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
This file describes how to lower LLVM code to machine code.